#47 PDF Zitat-Highlighting via PyMuPDF Single-Page-Render
Klick auf eine Zitat-Quelle im Report öffnet jetzt eine 1-Seiten-PDF-Variante des Wahlprogramms mit gelb markiertem Snippet, statt nur zum Page-Anchor zu springen und den Leser selbst suchen zu lassen.

Implementation:

embeddings.render_highlighted_page(programm_id, seite, query)
- Validiert programm_id gegen PROGRAMME (Path-Traversal-Schutz)
- Lädt das volle Wahlprogramm-PDF, extrahiert via insert_pdf nur die angeforderte Seite in einen neuen Document → kleinere Response
- search_for(query[:200]) → Bounding-Boxes aller Treffer
- Fallback: 5-Wort-Anker wenn Volltext-Match leer (LLM-Truncation, identisch zu find_chunk_for_text/Sub-D-Logik)
- add_highlight_annot mit gelber stroke-Color (1.0, 0.93, 0.0)
- Returns serialisierte PDF-Bytes oder None

embeddings._chunk_pdf_url
- Wenn chunk["text"] vorhanden: emittiert /api/wahlprogramm-cite-URL mit pid=, seite=, q=urlencoded(text[:200])
- Sonst: alter statischer /static/referenzen/X.pdf#page=N (Pre-#47 rückwärts-kompatibel)
- text wird auf 200 Zeichen abgeschnitten, sonst blasen 500-Zeichen-Snippets jedes Assessment-JSON auf

main.py /api/wahlprogramm-cite Endpoint
- Validiert pid gegen PROGRAMME registry
- seite: 1 ≤ n ≤ 2000
- Response: application/pdf, Cache-Control max-age=86400
- 404 bei unknown pid oder fehlendem PDF, 400 bei seite out of range

Reconstruct-Pipeline (Issue #60 Option B) zieht das automatisch durch: reconstruct_zitate ruft _chunk_pdf_url(matched_chunk) auf, der jetzt bevorzugt die Cite-URL emittiert. Keine Änderung an reconstruct_zitate selbst nötig.

Tests: 194/194 grün (185 + 9 neue):
- TestChunkPdfUrl: 4 Cases (cite vs static, unknown prog, 200-char-truncate)
- TestRenderHighlightedPage: 5 Cases (unknown pid, invalid seite, valid render, empty query, query-not-found-falls-back-zu-leerem-Highlight)
- Plus Bridge im Test-Stub: pymupdf-as-fitz Shim falls eine third-party "fitz" das Pkg shadowt (kommt auf älteren Dev-Setups vor)

Refs: #47
This commit is contained in:
parent
27ae82a758
commit
4ec6190416
@ -3,6 +3,7 @@
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
import urllib.parse
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
@ -549,7 +550,17 @@ def _chunk_source_label(chunk: dict) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def _chunk_pdf_url(chunk: dict) -> Optional[str]:
|
def _chunk_pdf_url(chunk: dict) -> Optional[str]:
|
||||||
"""Build the canonical PDF URL with page anchor for a chunk."""
|
"""Build the canonical PDF URL with page anchor for a chunk.
|
||||||
|
|
||||||
|
Wenn der Chunk einen ``text`` enthält, wird stattdessen die
|
||||||
|
Highlight-Endpoint-URL ``/api/wahlprogramm-cite?pid=…&seite=…&q=…``
|
||||||
|
emittiert (Issue #47). Der Endpoint rendert die Wahlprogramm-Seite
|
||||||
|
mit gelb markiertem Zitat und liefert ein 1-Seiten-PDF. Klick im
|
||||||
|
Report öffnet die Quelle direkt mit visuell hervorgehobener Stelle.
|
||||||
|
|
||||||
|
Fallback: ohne text → statische ``/static/referenzen/<pdf>#page=<n>``
|
||||||
|
URL (rückwärts-kompatibel für Pre-#47 Assessments).
|
||||||
|
"""
|
||||||
prog_id = chunk.get("programm_id", "")
|
prog_id = chunk.get("programm_id", "")
|
||||||
info = PROGRAMME.get(prog_id)
|
info = PROGRAMME.get(prog_id)
|
||||||
if not info:
|
if not info:
|
||||||
@ -558,11 +569,91 @@ def _chunk_pdf_url(chunk: dict) -> Optional[str]:
|
|||||||
if not pdf:
|
if not pdf:
|
||||||
return None
|
return None
|
||||||
seite = chunk.get("seite")
|
seite = chunk.get("seite")
|
||||||
|
text = (chunk.get("text") or "").strip()
|
||||||
|
|
||||||
|
if text and seite:
|
||||||
|
# Highlight-Endpoint mit URL-encoded query. Den Text auf 200 Zeichen
|
||||||
|
# abschneiden — search_for matched ohnehin nur Substring-Anker, und
|
||||||
|
# die URL bleibt bounded (sonst würden 500-Zeichen-Snippets in jeder
|
||||||
|
# Zitat-URL stehen und das HTML-Report-JSON aufblähen).
|
||||||
|
q = urllib.parse.quote_plus(text[:200])
|
||||||
|
return f"/api/wahlprogramm-cite?pid={prog_id}&seite={seite}&q={q}"
|
||||||
|
|
||||||
if seite:
|
if seite:
|
||||||
return f"/static/referenzen/{pdf}#page={seite}"
|
return f"/static/referenzen/{pdf}#page={seite}"
|
||||||
return f"/static/referenzen/{pdf}"
|
return f"/static/referenzen/{pdf}"
|
||||||
|
|
||||||
|
|
||||||
|
def render_highlighted_page(programm_id: str, seite: int, query: str) -> Optional[bytes]:
    """Render one Wahlprogramm page as a standalone PDF with the query highlighted.

    Used by the ``/api/wahlprogramm-cite`` endpoint to serve a one-page PDF
    in which the cited snippet is marked via PyMuPDF ``add_highlight_annot``.

    Args:
        programm_id: Key into the ``PROGRAMME`` registry. Unknown keys yield
            ``None``. The PDF filename is read from the registry entry, never
            from this argument, so the argument cannot steer the file path.
        seite: 1-indexed page number within the programme PDF.
        query: Snippet text to search for and highlight on the page. Only the
            first 200 characters of the stripped query are used, and soft
            hyphens (U+00AD) are removed before searching. An empty query
            renders the page without annotations.

    Returns:
        The serialized bytes of a one-page PDF (with zero or more highlight
        annotations), or ``None`` when the registry has no entry or no
        ``pdf`` filename for ``programm_id``, the PDF file does not exist, or
        ``seite`` lies outside ``1..page count``.
    """
    info = PROGRAMME.get(programm_id)
    if not info:
        return None
    pdf_filename = info.get("pdf")
    if not pdf_filename:
        return None

    pdf_path = Path(__file__).parent / "static" / "referenzen" / pdf_filename
    if not pdf_path.exists():
        return None

    # A page number below 1 can never be valid, so skip opening the PDF.
    if seite < 1:
        return None

    # Soft hyphens break the match, so drop them before searching. Doing it
    # here (rather than after the emptiness check) also keeps a needle made
    # only of soft hyphens from reaching search_for as an empty string.
    needle = (query or "").strip()[:200].replace("\u00ad", "")

    with fitz.open(str(pdf_path)) as src:
        if seite > len(src):
            return None

        # Build a single-page sub-PDF: keeps the response small and rules out
        # accidental highlights on other pages.
        with fitz.open() as single:
            single.insert_pdf(src, from_page=seite - 1, to_page=seite - 1)
            page = single[0]

            rects = page.search_for(needle) if needle else []
            if needle and not rects:
                # Fallback: use only the first five words as an anchor. A
                # snippet that was shortened mid-sentence fails the full-text
                # match, but its opening words can still locate the passage.
                words = needle.split()
                if len(words) >= 5:
                    rects = page.search_for(" ".join(words[:5]))

            for rect in rects:
                annot = page.add_highlight_annot(rect)
                if annot is not None:
                    annot.set_colors(stroke=(1.0, 0.93, 0.0))  # yellow
                    annot.update()

            return single.tobytes()
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
# Citation post-processing — Issue #60 Option B
|
# Citation post-processing — Issue #60 Option B
|
||||||
#
|
#
|
||||||
|
|||||||
42
app/main.py
42
app/main.py
@ -41,7 +41,7 @@ from .analyzer import analyze_antrag
|
|||||||
from .report import generate_html_report, generate_pdf_report
|
from .report import generate_html_report, generate_pdf_report
|
||||||
from .embeddings import (
|
from .embeddings import (
|
||||||
init_embeddings_db, get_programme_info, get_indexing_status,
|
init_embeddings_db, get_programme_info, get_indexing_status,
|
||||||
index_programm, PROGRAMME
|
index_programm, render_highlighted_page, PROGRAMME,
|
||||||
)
|
)
|
||||||
|
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
@ -595,6 +595,46 @@ async def quellen_page(request: Request):
|
|||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/wahlprogramm-cite")
async def wahlprogramm_cite(pid: str, seite: int, q: str = ""):
    """Serve one Wahlprogramm page as a PDF with the cited passage highlighted.

    Issue #47: clicking a citation source in the report should lead straight
    to the passage in the Wahlprogramm PDF, with the quoted snippet visually
    marked. Instead of delivering the whole PDF (where the browser scrolls to
    ``#page=N`` and the reader has to search by hand), this endpoint returns
    the one-page PDF produced by ``render_highlighted_page``.

    Security: ``pid`` must be a registered ``PROGRAMME`` key, otherwise the
    request is rejected before any rendering happens. ``seite`` is declared
    as ``int`` in the signature and must lie in ``1..2000``. ``q`` is passed
    through unchanged; the renderer limits how much of it is used.

    Raises:
        HTTPException: 404 for an unknown ``pid`` or when the renderer returns
            ``None``; 400 when ``seite`` is outside ``1..2000``.
    """
    if pid not in PROGRAMME:
        raise HTTPException(status_code=404, detail="Unbekanntes Wahlprogramm")
    if not 1 <= seite <= 2000:
        raise HTTPException(status_code=400, detail="Ungültige Seitennummer")

    rendered = render_highlighted_page(pid, seite, q)
    if rendered is None:
        raise HTTPException(
            status_code=404,
            detail="Wahlprogramm-PDF oder Seite nicht verfügbar",
        )

    # The download name comes from the registry entry, falling back to the
    # programme id when the entry carries no ``pdf`` key.
    download_name = PROGRAMME[pid].get("pdf", f"{pid}.pdf")
    response_headers = {
        "Content-Disposition": f'inline; filename="{download_name}"',
        "Cache-Control": "public, max-age=86400",
    }
    return Response(
        content=rendered,
        media_type="application/pdf",
        headers=response_headers,
    )
|
||||||
|
|
||||||
|
|
||||||
@app.get("/api/programme")
|
@app.get("/api/programme")
|
||||||
async def list_programme():
|
async def list_programme():
|
||||||
"""List all available programmes."""
|
"""List all available programmes."""
|
||||||
|
|||||||
@ -14,6 +14,8 @@ quote.
|
|||||||
import sys
|
import sys
|
||||||
import types
|
import types
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
# Stub openai before importing embeddings, since the test environment may
|
# Stub openai before importing embeddings, since the test environment may
|
||||||
# not have it installed and we don't actually need to make API calls.
|
# not have it installed and we don't actually need to make API calls.
|
||||||
if "openai" not in sys.modules:
|
if "openai" not in sys.modules:
|
||||||
@ -21,13 +23,32 @@ if "openai" not in sys.modules:
|
|||||||
openai_stub.OpenAI = lambda **kw: None
|
openai_stub.OpenAI = lambda **kw: None
|
||||||
sys.modules["openai"] = openai_stub
|
sys.modules["openai"] = openai_stub
|
||||||
|
|
||||||
|
# Make sure ``import fitz`` resolves to PyMuPDF. On some dev machines an
# older third-party package named "fitz" sits in front of PyMuPDF's legacy
# alias on sys.path. A module without an ``open`` attribute is treated as the
# wrong one, and ``pymupdf`` (the canonical name in PyMuPDF >= 1.24) is
# registered under the "fitz" name instead.
try:
    import fitz as _fitz
except ImportError:
    _fitz_usable = False
else:
    _fitz_usable = hasattr(_fitz, "open")

if not _fitz_usable:
    try:
        import pymupdf as _pymupdf
    except ImportError:
        # Neither name is importable: nothing to bridge, sys.modules is left
        # untouched.
        pass
    else:
        sys.modules["fitz"] = _pymupdf
|
||||||
|
|
||||||
from app import embeddings as embeddings_mod
|
from app import embeddings as embeddings_mod
|
||||||
from app.embeddings import (
|
from app.embeddings import (
|
||||||
|
_chunk_pdf_url,
|
||||||
_chunk_source_label,
|
_chunk_source_label,
|
||||||
find_chunk_for_text,
|
find_chunk_for_text,
|
||||||
format_quotes_for_prompt,
|
format_quotes_for_prompt,
|
||||||
get_relevant_quotes_for_antrag,
|
get_relevant_quotes_for_antrag,
|
||||||
reconstruct_zitate,
|
reconstruct_zitate,
|
||||||
|
render_highlighted_page,
|
||||||
|
PROGRAMME,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -261,7 +282,11 @@ class TestReconstructZitate:
|
|||||||
out = reconstruct_zitate(data, semantic_quotes)
|
out = reconstruct_zitate(data, semantic_quotes)
|
||||||
z = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]
|
z = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]
|
||||||
assert z["quelle"] == "BSW Brandenburg Wahlprogramm 2024, S. 27"
|
assert z["quelle"] == "BSW Brandenburg Wahlprogramm 2024, S. 27"
|
||||||
assert z["url"] == "/static/referenzen/bsw-bb-2024.pdf#page=27"
|
# Post-#47: URL ist der Highlight-Cite-Endpoint mit pid+seite+q.
|
||||||
|
# Static-Fallback nur noch wenn der Chunk kein text-Feld hat.
|
||||||
|
assert z["url"].startswith("/api/wahlprogramm-cite?")
|
||||||
|
assert "pid=bsw-bb-2024" in z["url"]
|
||||||
|
assert "seite=27" in z["url"]
|
||||||
|
|
||||||
def test_drops_zitat_not_found_in_any_chunk(self):
|
def test_drops_zitat_not_found_in_any_chunk(self):
|
||||||
"""If a snippet was hallucinated entirely (no matching chunk),
|
"""If a snippet was hallucinated entirely (no matching chunk),
|
||||||
@ -342,7 +367,124 @@ class TestReconstructZitate:
|
|||||||
assert find_chunk_for_text(text, [chunk]) is chunk
|
assert find_chunk_for_text(text, [chunk]) is chunk
|
||||||
|
|
||||||
|
|
||||||
def test_text_truncated_at_500_chars(self):
|
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# _chunk_pdf_url + render_highlighted_page — Issue #47 PDF-Highlighting
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestChunkPdfUrl:
    """Check that ``_chunk_pdf_url`` picks the cite endpoint or the static URL.

    Chunks that carry ``text`` should get the ``/api/wahlprogramm-cite`` URL
    (issue #47); chunks without text keep the static PDF URL with a page
    anchor, as emitted before issue #47.
    """

    def test_cite_url_when_text_present(self):
        from urllib.parse import parse_qs, urlsplit

        chunk = {
            "programm_id": "gruene-grundsatz",
            "seite": 36,
            "text": "Plattformen müssen umfassend reguliert werden",
        }
        url = _chunk_pdf_url(chunk)
        assert url is not None
        assert url.startswith("/api/wahlprogramm-cite?")
        assert "pid=gruene-grundsatz" in url
        assert "seite=36" in url
        assert "Plattformen" in url
        # The query must be URL-encoded: no raw spaces in the URL, and
        # decoding the ``q`` parameter gives back the original snippet.
        assert " " not in url
        assert parse_qs(urlsplit(url).query)["q"] == [chunk["text"]]

    def test_static_fallback_when_no_text(self):
        chunk = {"programm_id": "fdp-mv-2021", "seite": 73}
        url = _chunk_pdf_url(chunk)
        assert url == "/static/referenzen/fdp-mv-2021.pdf#page=73"

    def test_unknown_programme_returns_none(self):
        chunk = {"programm_id": "fake-xx-9999", "seite": 1, "text": "x" * 50}
        assert _chunk_pdf_url(chunk) is None

    def test_url_truncates_long_text_to_200_chars(self):
        chunk = {
            "programm_id": "gruene-grundsatz",
            "seite": 36,
            "text": "A" * 1000,
        }
        url = _chunk_pdf_url(chunk)
        assert url is not None
        # The embedded text is capped at 200 characters so long snippets do
        # not bloat every citation URL in the assessment JSON.
        assert "A" * 1000 not in url
        assert "A" * 200 in url
        # Pin the limit itself: exactly 200 characters, not merely "fewer
        # than 1000".
        assert "A" * 201 not in url
|
||||||
|
|
||||||
|
|
||||||
|
class TestRenderHighlightedPage:
    """Smoke tests for ``render_highlighted_page`` against a real programme PDF.

    The tests that need a PDF use the ``sample_pid`` fixture, which skips
    when the registry entry or the PDF file under ``app/static/referenzen``
    is missing.
    """

    @staticmethod
    def _pdf_path(pid):
        """Return the on-disk path of the PDF registered for ``pid``."""
        from pathlib import Path

        return (
            Path(__file__).parent.parent
            / "app" / "static" / "referenzen" / PROGRAMME[pid]["pdf"]
        )

    @pytest.fixture
    def sample_pid(self):
        """Provide the id of a programme whose PDF is available, else skip."""
        pid = "spd-grundsatz"
        if not PROGRAMME.get(pid):
            pytest.skip("PROGRAMME registry missing spd-grundsatz")
        path = self._pdf_path(pid)
        if not path.exists():
            pytest.skip(f"Test-PDF {path} nicht im Repo")
        return pid

    def test_unknown_pid_returns_none(self):
        assert render_highlighted_page("fake-xx-9999", 1, "x") is None

    def test_invalid_seite_returns_none(self, sample_pid):
        assert render_highlighted_page(sample_pid, 99999, "x") is None
        assert render_highlighted_page(sample_pid, 0, "x") is None

    def test_renders_single_page_pdf(self, sample_pid):
        out = render_highlighted_page(sample_pid, 1, "Soziale Gerechtigkeit")
        assert out is not None
        assert isinstance(out, bytes)
        # PDF magic header
        assert out[:5] == b"%PDF-"
        # The result must contain exactly the one requested page.
        with embeddings_mod.fitz.open(stream=out, filetype="pdf") as rendered:
            assert len(rendered) == 1
        # insert_pdf carries shared resources (fonts, images) along, so a
        # one-page sub-PDF is not necessarily tiny. Only require that it is
        # strictly smaller than the full programme PDF.
        original_size = self._pdf_path(sample_pid).stat().st_size
        assert len(out) < original_size, (
            f"sub-PDF {len(out)} not smaller than original {original_size}"
        )

    def test_returns_pdf_even_when_query_empty(self, sample_pid):
        # Empty query: render the page without any annotations.
        out = render_highlighted_page(sample_pid, 1, "")
        assert out is not None
        assert out[:5] == b"%PDF-"

    def test_returns_pdf_even_when_query_not_found(self, sample_pid):
        # No match: still render the page (no highlights).
        out = render_highlighted_page(
            sample_pid, 1, "this exact phrase definitely does not exist anywhere",
        )
        assert out is not None
        assert out[:5] == b"%PDF-"
|
||||||
|
|
||||||
|
|
||||||
|
def test_format_quotes_truncates_long_chunks_at_500_chars():
|
||||||
|
"""Truncation-Test for format_quotes_for_prompt — sat lange als
|
||||||
|
Methode in TestRenderHighlightedPage (falsche Class-Zuordnung
|
||||||
|
durch Edit-Reihenfolge), jetzt module-level."""
|
||||||
long_chunk = {
|
long_chunk = {
|
||||||
"FDP": {
|
"FDP": {
|
||||||
"wahlprogramm": [
|
"wahlprogramm": [
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user