diff --git a/app/embeddings.py b/app/embeddings.py index 5a95c4d..920f764 100644 --- a/app/embeddings.py +++ b/app/embeddings.py @@ -797,20 +797,18 @@ def find_chunk_for_text(text: str, chunks: list[dict]) -> Optional[dict]: def reconstruct_zitate(data: dict, semantic_quotes: dict) -> dict: - """Replace LLM-emitted quelle/url with canonical chunk values; drop unbacked. + """Verify and reconstruct LLM-emitted zitate against retrieved chunks. - Walks over ``data['wahlprogrammScores'][i][kind]['zitate']`` (the raw - LLM-output dict, not the Pydantic model). For each Zitat: + For each Zitat: + * **verified** (substring/4-word-anchor match): overwrite quelle/url + with canonical chunk values, set ``verified: true``. + * **unverified** (no match found): keep the Zitat but set + ``verified: false``. The UI shows it with a different style so the + user knows it's an LLM-Paraphrase, not a wörtliches Zitat. - * Locate the chunk whose text contains the snippet (or a 5-word anchor - from it). Search across **all** retrieved chunks regardless of party, - so cross-mixes between Q-IDs become invisible to the persisted output. - * If found: overwrite ``quelle`` and ``url`` with values derived from - the matching chunk's ``programm_id`` + ``seite``. The LLM is no longer - trusted for these fields. - * If not found: drop the Zitat entirely. - - Returns the same ``data`` dict (mutated in place) for chaining. + This replaces the old drop-on-no-match behavior (ADR 0001 Option B) + with a more honest approach: paraphrased citations are still valuable + context, they just need to be marked as such. """ if not semantic_quotes: return data @@ -830,12 +828,16 @@ def reconstruct_zitate(data: dict, semantic_quotes: dict) -> dict: for z in zitate: text = z.get("text", "") matched = find_chunk_for_text(text, all_chunks) - if matched is None: - continue - z["quelle"] = _chunk_source_label(matched) - url = _chunk_pdf_url(matched) - if url: - z["url"] = url + if matched is not None: + z["quelle"] = _chunk_source_label(matched) + url = _chunk_pdf_url(matched) + if url: + z["url"] = url + z["verified"] = True + else: + # Kein Match — Zitat behalten aber als unverified markieren. + # Die LLM-emittierte quelle/url bleibt (best effort). + z["verified"] = False cleaned.append(z) blk["zitate"] = cleaned return data diff --git a/app/models.py b/app/models.py index 8a46dee..bd6397c 100644 --- a/app/models.py +++ b/app/models.py @@ -45,6 +45,7 @@ class Zitat(BaseModel): text: str quelle: str url: Optional[str] = None + verified: Optional[bool] = None # True=wörtlich im Chunk, False=paraphrasiert, None=pre-#97 class ProgrammScore(BaseModel): diff --git a/app/templates/index.html b/app/templates/index.html index 5e07d72..0f4f919 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -1619,14 +1619,22 @@ const wahlprogrammHtml = (item.wahlprogrammScores || []).map(wp => { // Zitate formatieren mit klickbaren Links + Highlighting - const zitateHtml = (wp.wahlprogramm?.zitate || []).map(z => ` -