diff --git a/app/analyzer.py b/app/analyzer.py index 431af6c..e45da65 100644 --- a/app/analyzer.py +++ b/app/analyzer.py @@ -16,7 +16,12 @@ from .wahlprogramme import ( format_quote_for_prompt, WAHLPROGRAMM_KONTEXT_FILES, ) -from .embeddings import get_relevant_quotes_for_antrag, format_quotes_for_prompt, EMBEDDINGS_DB +from .embeddings import ( + get_relevant_quotes_for_antrag, + format_quotes_for_prompt, + reconstruct_zitate, + EMBEDDINGS_DB, +) logger = logging.getLogger(__name__) @@ -240,6 +245,7 @@ async def analyze_antrag(text: str, bundesland: str = "NRW", model: str = "qwen- # Suche relevante Zitate via semantische Suche (Embeddings) quotes_context = "" + semantic_quotes: dict = {} if EMBEDDINGS_DB.exists(): try: semantic_quotes = get_relevant_quotes_for_antrag( @@ -321,6 +327,14 @@ Ausgabe als reines JSON ohne Markdown-Codeblöcke.""" try: # Parse JSON data = json.loads(content) + # Issue #60 Option B — server-side reconstruction of citation + # quelle/url from the actually retrieved chunks, before Pydantic + # validation. The LLM is no longer trusted for the citation source + # label; we replace it with the canonical _chunk_source_label of + # the chunk whose text actually contains the cited snippet, and + # drop any zitat that can't be located in any retrieved chunk. + if semantic_quotes: + data = reconstruct_zitate(data, semantic_quotes) # Convert to Assessment model return Assessment.model_validate(data) except json.JSONDecodeError as e: diff --git a/app/embeddings.py b/app/embeddings.py index cd67e17..695cfed 100644 --- a/app/embeddings.py +++ b/app/embeddings.py @@ -1,6 +1,7 @@ """Semantic search for Wahlprogramme and Parteiprogramme using Qwen embeddings.""" import json +import re import sqlite3 from pathlib import Path from typing import Optional @@ -547,6 +548,136 @@ def _chunk_source_label(chunk: dict) -> str: return f"{name}, S. 
def _chunk_pdf_url(chunk: dict) -> Optional[str]:
    """Build the canonical PDF URL (with page anchor, if known) for a chunk.

    Args:
        chunk: Retrieved chunk dict; ``programm_id`` and ``seite`` are read.

    Returns:
        ``/static/referenzen/<pdf>#page=<seite>`` when the chunk has a truthy
        ``seite``, ``/static/referenzen/<pdf>`` when it does not, or ``None``
        when the programme is unknown or has no ``pdf`` entry.
    """
    # NOTE(review): PROGRAMME is defined elsewhere in this module; presumably
    # a dict mapping programm_id -> metadata dict with a "pdf" key — confirm.
    info = PROGRAMME.get(chunk.get("programm_id", ""))
    if not info:
        return None
    pdf = info.get("pdf")
    if not pdf:
        return None
    base = f"/static/referenzen/{pdf}"
    seite = chunk.get("seite")
    return f"{base}#page={seite}" if seite else base


# ─────────────────────────────────────────────────────────────────────────────
# Citation post-processing — Issue #60 Option B
#
# Pre-#60 the LLM was free to fabricate `quelle`/`url` strings even when the
# `text` was a real snippet from a retrieved chunk. The A+C fix made the
# prompt stricter, but BB 8/673 (post-deploy) showed the LLM still
# cross-mixed: it copied text from chunk Qn but wrote the page from chunk Qm
# into the `quelle` field.
#
# The structural fix is to take quelle/url generation away from the LLM
# entirely. After the LLM responds, we walk over every Zitat and try to
# locate its `text` (substring or 5-word anchor) in any of the chunks the
# LLM was actually shown. If we find a match, we *overwrite* quelle and url
# with the canonical values from that chunk. If we don't find a match, the
# Zitat is dropped — it cannot be backed by retrieved evidence.
# ─────────────────────────────────────────────────────────────────────────────


_RE_WHITESPACE = re.compile(r"\s+")
_RE_HYPHEN_BREAK = re.compile(r"(\w)-\s+(\w)")
_RE_TRUNCATION = re.compile(r"^\s*\.{2,}|\.{2,}\s*$")

# Snippets shorter than this (after normalization) are too weak to bind.
_MIN_NEEDLE_CHARS = 20
# Length, in words, of the sliding anchor used by the fallback match stage.
_ANCHOR_WORDS = 5


def _normalize_for_match(text: str) -> str:
    """Lowercase, collapse whitespace, bridge hyphenated line breaks.

    Steps, in order: lowercase; strip a leading/trailing run of two or more
    dots (truncation markers); remove soft hyphens (U+00AD); collapse every
    whitespace run to one space and trim; repeatedly join ``x- y`` into
    ``xy`` until the string stops changing.

    Per the original author's note this mirrors the matcher in
    tests/integration/test_citations_substring.py (Sub-D), so the steps must
    not be changed here without changing them there.

    Args:
        text: Raw snippet or chunk text; ``None``/empty is treated as "".

    Returns:
        The normalized string (possibly empty).
    """
    s = (text or "").lower()
    s = _RE_TRUNCATION.sub("", s)
    s = s.replace("\u00ad", "")  # soft hyphen
    s = _RE_WHITESPACE.sub(" ", s).strip()
    # One substitution pass cannot bridge overlapping breaks ("a- b- c"),
    # so iterate to a fixed point.
    prev = None
    while prev != s:
        prev = s
        s = _RE_HYPHEN_BREAK.sub(r"\1\2", s)
    return s


def _normalize_chunks(chunks: list[dict]) -> list[tuple[dict, str]]:
    """Pair every chunk with its normalized ``text`` (computed once)."""
    return [(c, _normalize_for_match(c.get("text", ""))) for c in chunks or []]


def _find_in_normalized(
    needle: str, chunks_norm: list[tuple[dict, str]]
) -> Optional[dict]:
    """Two-stage lookup of an already-normalized needle in normalized chunks.

    Stage 1 returns the first chunk containing the whole needle. Stage 2
    slides a ``_ANCHOR_WORDS``-word window over the needle and returns the
    first chunk containing the earliest matching window.
    """
    if len(needle) < _MIN_NEEDLE_CHARS:
        return None
    for chunk, haystack in chunks_norm:
        if needle in haystack:
            return chunk
    words = needle.split()
    # range() is empty when the needle has fewer than _ANCHOR_WORDS words.
    for start in range(len(words) - _ANCHOR_WORDS + 1):
        anchor = " ".join(words[start:start + _ANCHOR_WORDS])
        for chunk, haystack in chunks_norm:
            if anchor in haystack:
                return chunk
    return None


def find_chunk_for_text(text: str, chunks: list[dict]) -> Optional[dict]:
    """Locate the retrieved chunk that a Zitat snippet was copied from.

    Two-stage match (documented in the original as identical to Sub-D):
      1. **Strict substring** — full needle as substring of any chunk.
      2. **5-word anchor** — any 5 consecutive words of the needle as
         substring of any chunk.

    Both sides are compared after ``_normalize_for_match``.

    Args:
        text: Snippet emitted by the LLM.
        chunks: Retrieved chunk dicts (each with a ``text`` key); ``None``
            or an empty list yields ``None``.

    Returns:
        The matching chunk dict (the same object that was passed in), or
        ``None`` if the normalized snippet is shorter than 20 characters or
        nothing matches.
    """
    needle = _normalize_for_match(text)
    if len(needle) < _MIN_NEEDLE_CHARS:
        # Skip normalizing the chunks for a needle that can never match.
        return None
    return _find_in_normalized(needle, _normalize_chunks(chunks))


def reconstruct_zitate(data: dict, semantic_quotes: dict) -> dict:
    """Replace LLM-emitted quelle/url with canonical chunk values; drop unbacked.

    Walks over ``data['wahlprogrammScores'][i][kind]['zitate']`` (the raw
    LLM-output dict, not the Pydantic model) for ``kind`` in
    ``wahlprogramm`` / ``parteiprogramm``. For each Zitat:

    * Locate the chunk whose text contains the snippet (or a 5-word anchor
      from it). The search runs across **all** retrieved chunks regardless
      of party, so cross-mixes between Q-IDs are corrected.
    * If found: overwrite ``quelle`` with the chunk's canonical source label
      and ``url`` with its canonical PDF URL. If the chunk has no canonical
      URL, any LLM-supplied ``url`` is removed so that a fabricated link is
      never persisted next to a canonical ``quelle``.
    * If not found, or if the Zitat is not a dict with a string ``text``:
      drop the Zitat entirely.

    Because this runs on unvalidated LLM JSON, structurally unexpected
    containers (non-dict ``data``, score entries or blocks; non-list
    ``wahlprogrammScores`` or ``zitate``) are left untouched instead of
    raising here.

    Args:
        data: Parsed LLM response.
        semantic_quotes: Mapping of party -> ``{"wahlprogramm": [...],
            "parteiprogramm": [...]}`` holding the retrieved chunks.

    Returns:
        The same ``data`` object (mutated in place) for chaining.
    """
    if not semantic_quotes or not isinstance(data, dict):
        return data

    all_chunks: list[dict] = []
    for per_party in semantic_quotes.values():
        if not isinstance(per_party, dict):
            continue
        # ``or []`` also covers keys that are present but set to None.
        all_chunks.extend(per_party.get("wahlprogramm") or [])
        all_chunks.extend(per_party.get("parteiprogramm") or [])
    if not all_chunks:
        return data

    scores = data.get("wahlprogrammScores")
    if not isinstance(scores, list):
        return data

    # Normalize the haystacks once, not once per Zitat.
    chunks_norm = _normalize_chunks(all_chunks)

    for fs in scores:
        if not isinstance(fs, dict):
            continue
        for kind in ("wahlprogramm", "parteiprogramm"):
            blk = fs.get(kind)
            if not isinstance(blk, dict) or not blk:
                continue
            zitate = blk.get("zitate")
            if zitate is not None and not isinstance(zitate, list):
                continue
            cleaned = []
            for z in zitate or []:
                if not isinstance(z, dict):
                    continue
                text = z.get("text")
                if not isinstance(text, str):
                    continue
                matched = _find_in_normalized(
                    _normalize_for_match(text), chunks_norm
                )
                if matched is None:
                    continue
                z["quelle"] = _chunk_source_label(matched)
                url = _chunk_pdf_url(matched)
                if url:
                    z["url"] = url
                else:
                    # No canonical URL: do not keep an LLM-invented one.
                    z.pop("url", None)
                cleaned.append(z)
            blk["zitate"] = cleaned
    return data
+ + Background: BB 8/673 (Sub-D run after the A+C deploy) showed the LLM + copying real text from one chunk but writing the page number from a + different chunk into ``quelle``. The ENUM-anchor in the prompt is only + a soft hint; this post-processing step is the structural binding. + """ + + def _make_chunk(self, programm_id: str, seite: int, text: str) -> dict: + return { + "programm_id": programm_id, + "partei": programm_id.split("-")[0].upper(), + "typ": "wahlprogramm", + "seite": seite, + "text": text, + "similarity": 0.7, + } + + def test_overwrites_wrong_seite_with_real_chunk_seite(self): + """The BB 8/673 case: LLM cites text from S.27 chunk but writes + S.4 in quelle. After reconstruct_zitate the quelle must point to + the real S.27 chunk.""" + real_chunk = self._make_chunk( + "bsw-bb-2024", 27, + "wertschätzung für lehrerinnen und lehrer abbau von arbeitsüberlastung", + ) + wrong_chunk = self._make_chunk( + "bsw-bb-2024", 4, + "in brandenburg weniger als 14 euro in der stunde verdient", + ) + semantic_quotes = { + "BSW": {"wahlprogramm": [wrong_chunk, real_chunk], "parteiprogramm": []}, + } + data = { + "wahlprogrammScores": [{ + "fraktion": "BSW", + "wahlprogramm": { + "score": 7, + "begründung": "...", + "zitate": [{ + "text": "Wertschätzung für Lehrerinnen und Lehrer Abbau von Arbeitsüberlastung", + "quelle": "BSW Brandenburg Wahlprogramm 2024, S. 4", # WRONG + "url": "/static/referenzen/bsw-bb-2024.pdf#page=4", + }], + }, + "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []}, + }], + } + out = reconstruct_zitate(data, semantic_quotes) + z = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0] + assert z["quelle"] == "BSW Brandenburg Wahlprogramm 2024, S. 
27" + assert z["url"] == "/static/referenzen/bsw-bb-2024.pdf#page=27" + + def test_drops_zitat_not_found_in_any_chunk(self): + """If a snippet was hallucinated entirely (no matching chunk), + the zitat must be removed rather than persisted.""" + chunk = self._make_chunk( + "spd-lsa-2021", 41, + "die stärkung einer geschlechtersensiblen berufsorientierung", + ) + semantic_quotes = { + "SPD": {"wahlprogramm": [chunk], "parteiprogramm": []}, + } + data = { + "wahlprogrammScores": [{ + "fraktion": "SPD", + "wahlprogramm": { + "score": 7, + "begründung": "...", + "zitate": [ + {"text": "Wir Sozialdemokratinnen ächten Rechtsextremismus seit 1863", + "quelle": "SPD Sachsen-Anhalt 2021, S. 37"}, + {"text": "die Stärkung einer geschlechtersensiblen Berufsorientierung", + "quelle": "SPD Sachsen-Anhalt 2021, S. 41"}, + ], + }, + "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []}, + }], + } + out = reconstruct_zitate(data, semantic_quotes) + zitate = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"] + assert len(zitate) == 1 + assert "geschlechtersensiblen" in zitate[0]["text"] + + def test_empty_semantic_quotes_is_noop(self): + data = {"wahlprogrammScores": [{ + "fraktion": "X", + "wahlprogramm": {"score": 5, "begründung": "x", + "zitate": [{"text": "abc def ghi jkl mno pqr", "quelle": "X"}]}, + "parteiprogramm": {"score": 0, "begründung": "x", "zitate": []}, + }]} + out = reconstruct_zitate(data, {}) + # No chunks → no postprocessing applied; data passes through unchanged + assert out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]["quelle"] == "X" + + def test_anchor_match_when_full_substring_misses(self): + """LLM may slightly truncate a snippet — 5-word-anchor still binds.""" + chunk = self._make_chunk( + "cdu-nrw-2022", 24, + "wir wollen interprofessionelle netzwerkstrukturen für kinderschutz fördern dazu werden wir stellen schaffen", + ) + semantic_quotes = {"CDU": {"wahlprogramm": [chunk], "parteiprogramm": []}} + data = {"wahlprogrammScores": 
[{ + "fraktion": "CDU", + "wahlprogramm": { + "score": 8, "begründung": "...", + "zitate": [{ + "text": "Wir wollen interprofessionelle Netzwerkstrukturen für Kinderschutz fördern", + "quelle": "CDU NRW Wahlprogramm 2022, S. 999", # wrong page + }], + }, + "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []}, + }]} + out = reconstruct_zitate(data, semantic_quotes) + z = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0] + assert z["quelle"] == "CDU NRW Wahlprogramm 2022, S. 24" + + def test_find_chunk_for_text_short_needle_returns_none(self): + chunk = self._make_chunk("x", 1, "egal was hier steht") + assert find_chunk_for_text("ja", [chunk]) is None + + def test_find_chunk_for_text_handles_soft_hyphen(self): + chunk = self._make_chunk( + "bsw-bb-2024", 27, + "handys und tablets wertschätzung für lehrerinnen und lehrer", + ) + # LLM-emitted text with the soft hyphen \xad mid-word, as PyMuPDF + # would extract from a PDF line break. + text = "Handys und Tablets. Wertschätzung für Lehrerinnen und Lehrer" + assert find_chunk_for_text(text, [chunk]) is chunk + + def test_text_truncated_at_500_chars(self): long_chunk = { "FDP": {