#60 Reopen — Option B: server-side reconstruct of zitat quelle/url

Sub-D Live-Run gegen Prod-DB nach dem db3ada9-Deploy hat einen neuen
Halluzinations-Case gezeigt, den A+C nicht gefangen hat:

  BB 8/673 BSW: text aus bsw-bb-2024 S.27 (verifiziert via Volltext-Suche
  im PDF), aber LLM hat im quelle-Feld "S. 4" angegeben — die Seite des
  Top-2-Chunks im selben Retrieval-Window. Klassischer Cross-Mix zwischen
  Q-IDs.

Strukturelle Diagnose: Das [Qn]-Tag aus A ist nur ein weicher Anker im
Prompt. Das LLM darf Text aus Chunk Qn kopieren und trotzdem die quelle
aus Chunk Qm zusammenbauen. Die ZITATEREGEL kann das nicht verhindern,
solange wir der LLM-Selbstauskunft vertrauen.

Fix (Option B aus dem ursprünglichen Plan):

`embeddings.reconstruct_zitate(data, semantic_quotes)` läuft im
analyzer **nach** json.loads aber **vor** Pydantic-Validation:

1. Die retrievten Chunks aller Parteien zu einer einzigen Liste abflachen.
2. Pro Zitat: text via Substring oder 5-Wort-Anker gegen alle Chunks
   matchen (Helpers `find_chunk_for_text` + `_normalize_for_match`,
   identische Logik wie Sub-D Test).
3. Match → quelle/url server-seitig durch _chunk_source_label und
   _chunk_pdf_url des matchenden Chunks ÜBERSCHREIBEN.
4. Kein Match → Zitat verworfen (statt mit erfundener quelle persistiert).

Damit kann das LLM nur noch sauber zitieren oder gar nicht — es gibt
keinen Pfad mehr zu "echter Text, falsche quelle".

Tests:
- TestReconstructZitate (6 cases): BB 8/673 Re-Mapping, Drop bei
  hallucinated, no-op bei leeren chunks, anchor-match-Fallback,
  short-needle und soft-hyphen Edge-Cases
- 185/185 grün (179 + 6 neu)

Refs: #60, #54 (Sub-D)
This commit is contained in:
Dotty Dotter 2026-04-09 22:52:17 +02:00
parent db3ada9328
commit 6ced7ae018
3 changed files with 288 additions and 1 deletions

View File

@ -16,7 +16,12 @@ from .wahlprogramme import (
format_quote_for_prompt,
WAHLPROGRAMM_KONTEXT_FILES,
)
from .embeddings import get_relevant_quotes_for_antrag, format_quotes_for_prompt, EMBEDDINGS_DB
from .embeddings import (
get_relevant_quotes_for_antrag,
format_quotes_for_prompt,
reconstruct_zitate,
EMBEDDINGS_DB,
)
logger = logging.getLogger(__name__)
@ -240,6 +245,7 @@ async def analyze_antrag(text: str, bundesland: str = "NRW", model: str = "qwen-
# Suche relevante Zitate via semantische Suche (Embeddings)
quotes_context = ""
semantic_quotes: dict = {}
if EMBEDDINGS_DB.exists():
try:
semantic_quotes = get_relevant_quotes_for_antrag(
@ -321,6 +327,14 @@ Ausgabe als reines JSON ohne Markdown-Codeblöcke."""
try:
# Parse JSON
data = json.loads(content)
# Issue #60 Option B — server-side reconstruction of citation
# quelle/url from the actually retrieved chunks, before Pydantic
# validation. The LLM is no longer trusted for the citation source
# label; we replace it with the canonical _chunk_source_label of
# the chunk whose text actually contains the cited snippet, and
# drop any zitat that can't be located in any retrieved chunk.
if semantic_quotes:
data = reconstruct_zitate(data, semantic_quotes)
# Convert to Assessment model
return Assessment.model_validate(data)
except json.JSONDecodeError as e:

View File

@ -1,6 +1,7 @@
"""Semantic search for Wahlprogramme and Parteiprogramme using Qwen embeddings."""
import json
import re
import sqlite3
from pathlib import Path
from typing import Optional
@ -547,6 +548,136 @@ def _chunk_source_label(chunk: dict) -> str:
return f"{name}, S. {seite}"
def _chunk_pdf_url(chunk: dict) -> Optional[str]:
    """Return the canonical PDF URL (with ``#page=`` anchor) for *chunk*.

    Resolves the chunk's ``programm_id`` against the module-level
    ``PROGRAMME`` mapping. Returns ``None`` when the programme is unknown
    or carries no ``pdf`` entry; omits the page anchor when the chunk has
    no (truthy) ``seite``.
    """
    entry = PROGRAMME.get(chunk.get("programm_id", ""))
    filename = entry.get("pdf") if entry else None
    if not filename:
        return None
    base = f"/static/referenzen/{filename}"
    page = chunk.get("seite")
    return f"{base}#page={page}" if page else base
# ─────────────────────────────────────────────────────────────────────────────
# Citation post-processing — Issue #60 Option B
#
# Pre-#60 the LLM was free to fabricate `quelle`/`url` strings even when the
# `text` was a real snippet from a retrieved chunk. The A+C fix made the
# prompt more strict, but BB 8/673 (post-deploy) showed the LLM still
# cross-mixed: it copied text from chunk Qn but wrote the page from chunk Qm
# in the `quelle` field.
#
# The structural fix is to take quelle/url generation away from the LLM
# entirely. After the LLM responds, we walk over every Zitat and try to
# locate its `text` (substring or 5-word anchor) in any of the chunks the
# LLM was actually shown. If we find a match, we *overwrite* quelle and url
# with the canonical values from that chunk. If we don't find a match, the
# Zitat is dropped — it cannot be backed by retrieved evidence.
# ─────────────────────────────────────────────────────────────────────────────
# Precompiled patterns used by _normalize_for_match (hoisted to module
# level so they are compiled once, not per zitat).
_RE_WHITESPACE = re.compile(r"\s+")  # any whitespace run -> one space
_RE_HYPHEN_BREAK = re.compile(r"(\w)-\s+(\w)")  # hyphenated line break: "wort- bruch"
_RE_TRUNCATION = re.compile(r"^\s*\.{2,}|\.{2,}\s*$")  # leading/trailing ".."/"..." truncation markers
def _normalize_for_match(text: str) -> str:
    """Canonicalize *text* for fuzzy matching against chunk texts.

    Lowercases, strips leading/trailing truncation dots (".." / "..."),
    removes soft hyphens (U+00AD), collapses whitespace runs to single
    spaces, and bridges hyphenated line breaks ("wort- bruch" ->
    "wortbruch") until a fixed point is reached.

    Mirrors the matcher used in tests/integration/test_citations_substring.py
    so that the analyzer's post-processing and Sub-D's verification stay in
    lockstep.
    """
    normalized = (text or "").lower()
    normalized = _RE_TRUNCATION.sub("", normalized)
    normalized = normalized.replace("\u00ad", "")  # U+00AD soft hyphen
    normalized = _RE_WHITESPACE.sub(" ", normalized).strip()
    # _RE_HYPHEN_BREAK substitutions are non-overlapping within one pass
    # ("a- b- c" needs two), so iterate until the string stops changing.
    while True:
        bridged = _RE_HYPHEN_BREAK.sub(r"\1\2", normalized)
        if bridged == normalized:
            return bridged
        normalized = bridged
def find_chunk_for_text(text: str, chunks: list[dict]) -> Optional[dict]:
    """Locate the retrieved chunk that a Zitat snippet was copied from.

    Two-stage match identical to Sub-D:
      1. **Strict substring** — the whole normalized needle as a substring
         of any normalized chunk text.
      2. **5-word anchor** — any 5 consecutive words of the needle as a
         substring of any chunk.

    Needles shorter than 20 normalized characters are rejected outright
    (too weak to bind a citation). Returns the matching chunk dict, or
    ``None`` when nothing binds.
    """
    needle = _normalize_for_match(text)
    if len(needle) < 20:
        return None
    # Normalize every chunk once up front; both stages scan the same list.
    haystacks = [
        (chunk, _normalize_for_match(chunk.get("text", ""))) for chunk in chunks
    ]
    # Stage 1: full-needle substring.
    for chunk, haystack in haystacks:
        if needle in haystack:
            return chunk
    # Stage 2: sliding 5-word windows, in needle order, chunks in order.
    tokens = needle.split()
    if len(tokens) < 5:
        return None
    for start in range(len(tokens) - 4):
        window = " ".join(tokens[start:start + 5])
        for chunk, haystack in haystacks:
            if window in haystack:
                return chunk
    return None
def reconstruct_zitate(data: dict, semantic_quotes: dict) -> dict:
    """Replace LLM-emitted quelle/url with canonical chunk values; drop unbacked.

    Walks over ``data['wahlprogrammScores'][i][kind]['zitate']`` (the raw
    LLM-output dict, not the Pydantic model). For each Zitat:

    * Locate the chunk whose text contains the snippet (or a 5-word anchor
      from it) via :func:`find_chunk_for_text`. The search pool spans
      **all** retrieved chunks regardless of party, so cross-mixes between
      Q-IDs become invisible to the persisted output.
    * Found: overwrite ``quelle`` and ``url`` with values derived from the
      matching chunk's ``programm_id`` + ``seite``; the LLM is no longer
      trusted for these fields.
    * Not found: drop the Zitat entirely.

    Returns the same ``data`` dict (mutated in place) for chaining.
    """
    if not semantic_quotes:
        return data
    # Flatten every party's retrieved chunks into one search pool.
    pool: list[dict] = []
    for party_quotes in semantic_quotes.values():
        for key in ("wahlprogramm", "parteiprogramm"):
            pool.extend(party_quotes.get(key, []))
    if not pool:
        return data
    for score_entry in data.get("wahlprogrammScores", []) or []:
        for kind in ("wahlprogramm", "parteiprogramm"):
            block = score_entry.get(kind) or {}
            kept: list[dict] = []
            for zitat in block.get("zitate") or []:
                chunk = find_chunk_for_text(zitat.get("text", ""), pool)
                if chunk is None:
                    continue  # no retrieved evidence backs this zitat -> drop
                zitat["quelle"] = _chunk_source_label(chunk)
                pdf_url = _chunk_pdf_url(chunk)
                if pdf_url:
                    zitat["url"] = pdf_url
                kept.append(zitat)
            block["zitate"] = kept
    return data
def format_quotes_for_prompt(quotes: dict) -> str:
"""Format quotes for inclusion in LLM prompt.

View File

@ -24,8 +24,10 @@ if "openai" not in sys.modules:
from app import embeddings as embeddings_mod
from app.embeddings import (
_chunk_source_label,
find_chunk_for_text,
format_quotes_for_prompt,
get_relevant_quotes_for_antrag,
reconstruct_zitate,
)
@ -200,6 +202,146 @@ class TestFormatQuotesForPrompt:
assert "wahlprogramm" in first
assert "parteiprogramm" in first
# ─────────────────────────────────────────────────────────────────────────────
# reconstruct_zitate — Issue #60 Option B (server-side citation rewrite)
# ─────────────────────────────────────────────────────────────────────────────
class TestReconstructZitate:
"""Verify the post-processing pass that overwrites LLM-emitted quelle/url
with the canonical source label of whichever retrieved chunk actually
contains the cited text. Drops zitate that don't match any chunk.
Background: BB 8/673 (Sub-D run after the A+C deploy) showed the LLM
copying real text from one chunk but writing the page number from a
different chunk into ``quelle``. The ENUM-anchor in the prompt is only
a soft hint; this post-processing step is the structural binding.
"""
def _make_chunk(self, programm_id: str, seite: int, text: str) -> dict:
return {
"programm_id": programm_id,
"partei": programm_id.split("-")[0].upper(),
"typ": "wahlprogramm",
"seite": seite,
"text": text,
"similarity": 0.7,
}
def test_overwrites_wrong_seite_with_real_chunk_seite(self):
    """BB 8/673 regression: the LLM copies text that lives in the S.27
    chunk but labels it S.4 (the page of a sibling chunk). After
    reconstruct_zitate the quelle/url must point at the real S.27 chunk."""
    real_chunk = self._make_chunk(
        "bsw-bb-2024", 27,
        "wertschätzung für lehrerinnen und lehrer abbau von arbeitsüberlastung",
    )
    wrong_chunk = self._make_chunk(
        "bsw-bb-2024", 4,
        "in brandenburg weniger als 14 euro in der stunde verdient",
    )
    zitat = {
        "text": "Wertschätzung für Lehrerinnen und Lehrer Abbau von Arbeitsüberlastung",
        "quelle": "BSW Brandenburg Wahlprogramm 2024, S. 4",  # deliberately wrong
        "url": "/static/referenzen/bsw-bb-2024.pdf#page=4",
    }
    data = {
        "wahlprogrammScores": [{
            "fraktion": "BSW",
            "wahlprogramm": {"score": 7, "begründung": "...", "zitate": [zitat]},
            "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []},
        }],
    }
    semantic_quotes = {
        "BSW": {"wahlprogramm": [wrong_chunk, real_chunk], "parteiprogramm": []},
    }
    result = reconstruct_zitate(data, semantic_quotes)
    rewritten = result["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]
    assert rewritten["quelle"] == "BSW Brandenburg Wahlprogramm 2024, S. 27"
    assert rewritten["url"] == "/static/referenzen/bsw-bb-2024.pdf#page=27"
def test_drops_zitat_not_found_in_any_chunk(self):
    """A fully hallucinated snippet (no retrieved chunk contains it) must
    be removed rather than persisted; the genuine one survives."""
    chunk = self._make_chunk(
        "spd-lsa-2021", 41,
        "die stärkung einer geschlechtersensiblen berufsorientierung",
    )
    hallucinated = {
        "text": "Wir Sozialdemokratinnen ächten Rechtsextremismus seit 1863",
        "quelle": "SPD Sachsen-Anhalt 2021, S. 37",
    }
    genuine = {
        "text": "die Stärkung einer geschlechtersensiblen Berufsorientierung",
        "quelle": "SPD Sachsen-Anhalt 2021, S. 41",
    }
    data = {
        "wahlprogrammScores": [{
            "fraktion": "SPD",
            "wahlprogramm": {
                "score": 7,
                "begründung": "...",
                "zitate": [hallucinated, genuine],
            },
            "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []},
        }],
    }
    out = reconstruct_zitate(
        data, {"SPD": {"wahlprogramm": [chunk], "parteiprogramm": []}}
    )
    remaining = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"]
    assert len(remaining) == 1
    assert "geschlechtersensiblen" in remaining[0]["text"]
def test_empty_semantic_quotes_is_noop(self):
    """With no retrieved chunks at all, reconstruct_zitate must pass the
    data through untouched — nothing is rewritten or dropped."""
    zitat = {"text": "abc def ghi jkl mno pqr", "quelle": "X"}
    data = {"wahlprogrammScores": [{
        "fraktion": "X",
        "wahlprogramm": {"score": 5, "begründung": "x", "zitate": [zitat]},
        "parteiprogramm": {"score": 0, "begründung": "x", "zitate": []},
    }]}
    out = reconstruct_zitate(data, {})
    unchanged = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]
    assert unchanged["quelle"] == "X"
def test_anchor_match_when_full_substring_misses(self):
    """The 5-word-anchor fallback must bind when the full snippet is NOT
    a substring of any chunk.

    Fix: the previous snippet was a strict prefix of the chunk text, so
    stage 1 (full substring) matched and the anchor path was never
    exercised. The inserted "den" breaks the full substring while the
    window "wir wollen interprofessionelle netzwerkstrukturen für" still
    anchors into the chunk — and the wrong page gets corrected."""
    chunk = self._make_chunk(
        "cdu-nrw-2022", 24,
        "wir wollen interprofessionelle netzwerkstrukturen für kinderschutz fördern dazu werden wir stellen schaffen",
    )
    zitat = {
        "text": "Wir wollen interprofessionelle Netzwerkstrukturen für den Kinderschutz fördern",
        "quelle": "CDU NRW Wahlprogramm 2022, S. 999",  # wrong page
    }
    data = {"wahlprogrammScores": [{
        "fraktion": "CDU",
        "wahlprogramm": {"score": 8, "begründung": "...", "zitate": [zitat]},
        "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []},
    }]}
    out = reconstruct_zitate(
        data, {"CDU": {"wahlprogramm": [chunk], "parteiprogramm": []}}
    )
    rewritten = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]
    assert rewritten["quelle"] == "CDU NRW Wahlprogramm 2022, S. 24"
def test_find_chunk_for_text_short_needle_returns_none(self):
    """Needles under 20 normalized chars are too weak to bind a source."""
    haystack = [self._make_chunk("x", 1, "egal was hier steht")]
    assert find_chunk_for_text("ja", haystack) is None
def test_find_chunk_for_text_handles_soft_hyphen(self):
    """A zitat text carrying a soft hyphen (U+00AD) mid-word, as PyMuPDF
    emits for a PDF line break, must still bind to the chunk whose text
    has the word unbroken.

    Fix: the previous literal contained no soft hyphen at all, so the
    \\u00ad-stripping this test claims to cover was never exercised;
    the escape is written explicitly so the invisible character is
    visible in the source."""
    chunk = self._make_chunk(
        "bsw-bb-2024", 27,
        "handys und tablets wertschätzung für lehrerinnen und lehrer",
    )
    text = "Handys und Tablets. Wert\u00adschätzung für Lehrerinnen und Lehrer"
    assert find_chunk_for_text(text, [chunk]) is chunk
def test_text_truncated_at_500_chars(self):
long_chunk = {
"FDP": {