#60 Reopen — Option B: server-side reconstruct of zitat quelle/url

Sub-D Live-Run gegen Prod-DB nach dem db3ada9-Deploy hat einen neuen
Halluzinations-Case gezeigt, den A+C nicht gefangen hat:

  BB 8/673 BSW: text aus bsw-bb-2024 S.27 (verifiziert via Volltext-Suche
  im PDF), aber LLM hat im quelle-Feld "S. 4" angegeben — die Seite des
  Top-2-Chunks im selben Retrieval-Window. Klassischer Cross-Mix zwischen
  Q-IDs.

Strukturelle Diagnose: Das [Qn]-Tag aus A ist nur ein weicher Anker im
Prompt. Das LLM darf Text aus Chunk Qn kopieren und trotzdem die quelle
aus Chunk Qm zusammenbauen. Die ZITATEREGEL kann das nicht verhindern,
solange wir der LLM-Selbstauskunft vertrauen.

Fix (Option B aus dem ursprünglichen Plan):

`embeddings.reconstruct_zitate(data, semantic_quotes)` läuft im
analyzer **nach** json.loads aber **vor** Pydantic-Validation:

1. Die retrievten Chunks aller Parteien zu einer einzigen Liste abflachen.
2. Pro Zitat: text via Substring oder 5-Wort-Anker gegen alle Chunks
   matchen (Helpers `find_chunk_for_text` + `_normalize_for_match`,
   identische Logik wie Sub-D Test).
3. Match → quelle/url server-seitig durch _chunk_source_label und
   _chunk_pdf_url des matchenden Chunks ÜBERSCHREIBEN.
4. Kein Match → Zitat verworfen (statt mit erfundener quelle persistiert).

Damit kann das LLM nur noch sauber zitieren oder gar nicht — es gibt
keinen Pfad mehr zu "echter Text, falsche quelle".

Tests:
- TestReconstructZitate (6 cases): BB 8/673 Re-Mapping, Drop bei
  hallucinated, no-op bei leeren chunks, anchor-match-Fallback,
  short-needle und soft-hyphen Edge-Cases
- 185/185 grün (179 + 6 neu)

Refs: #60, #54 (Sub-D)
This commit is contained in:
Dotty Dotter 2026-04-09 22:52:17 +02:00
parent db3ada9328
commit 6ced7ae018
3 changed files with 288 additions and 1 deletions

View File

@ -16,7 +16,12 @@ from .wahlprogramme import (
format_quote_for_prompt,
WAHLPROGRAMM_KONTEXT_FILES,
)
from .embeddings import get_relevant_quotes_for_antrag, format_quotes_for_prompt, EMBEDDINGS_DB
from .embeddings import (
get_relevant_quotes_for_antrag,
format_quotes_for_prompt,
reconstruct_zitate,
EMBEDDINGS_DB,
)
logger = logging.getLogger(__name__)
@ -240,6 +245,7 @@ async def analyze_antrag(text: str, bundesland: str = "NRW", model: str = "qwen-
# Suche relevante Zitate via semantische Suche (Embeddings)
quotes_context = ""
semantic_quotes: dict = {}
if EMBEDDINGS_DB.exists():
try:
semantic_quotes = get_relevant_quotes_for_antrag(
@ -321,6 +327,14 @@ Ausgabe als reines JSON ohne Markdown-Codeblöcke."""
try:
# Parse JSON
data = json.loads(content)
# Issue #60 Option B — server-side reconstruction of citation
# quelle/url from the actually retrieved chunks, before Pydantic
# validation. The LLM is no longer trusted for the citation source
# label; we replace it with the canonical _chunk_source_label of
# the chunk whose text actually contains the cited snippet, and
# drop any zitat that can't be located in any retrieved chunk.
if semantic_quotes:
data = reconstruct_zitate(data, semantic_quotes)
# Convert to Assessment model
return Assessment.model_validate(data)
except json.JSONDecodeError as e:

View File

@ -1,6 +1,7 @@
"""Semantic search for Wahlprogramme and Parteiprogramme using Qwen embeddings."""
import json
import re
import sqlite3
from pathlib import Path
from typing import Optional
@ -547,6 +548,136 @@ def _chunk_source_label(chunk: dict) -> str:
return f"{name}, S. {seite}"
def _chunk_pdf_url(chunk: dict) -> Optional[str]:
    """Return the canonical PDF URL (with ``#page=`` anchor) for *chunk*.

    Resolves the chunk's ``programm_id`` against the module-level
    ``PROGRAMME`` mapping. Returns ``None`` when the programme is unknown
    or carries no ``pdf`` entry; omits the page anchor when the chunk has
    no (truthy) ``seite``.
    """
    entry = PROGRAMME.get(chunk.get("programm_id", ""))
    filename = entry.get("pdf") if entry else None
    if not filename:
        return None
    base = f"/static/referenzen/{filename}"
    page = chunk.get("seite")
    return f"{base}#page={page}" if page else base
# ─────────────────────────────────────────────────────────────────────────────
# Citation post-processing — Issue #60 Option B
#
# Pre-#60 the LLM was free to fabricate `quelle`/`url` strings even when the
# `text` was a real snippet from a retrieved chunk. The A+C fix made the
# prompt more strict, but BB 8/673 (post-deploy) showed the LLM still
# cross-mixed: it copied text from chunk Qn but wrote the page from chunk Qm
# in the `quelle` field.
#
# The structural fix is to take quelle/url generation away from the LLM
# entirely. After the LLM responds, we walk over every Zitat and try to
# locate its `text` (substring or 5-word anchor) in any of the chunks the
# LLM was actually shown. If we find a match, we *overwrite* quelle and url
# with the canonical values from that chunk. If we don't find a match, the
# Zitat is dropped — it cannot be backed by retrieved evidence.
# ─────────────────────────────────────────────────────────────────────────────
# Precompiled patterns used by _normalize_for_match (hoisted to module
# level so they are compiled once, not per zitat).
_RE_WHITESPACE = re.compile(r"\s+")  # any whitespace run -> one space
_RE_HYPHEN_BREAK = re.compile(r"(\w)-\s+(\w)")  # hyphenated line break: "wort- bruch"
_RE_TRUNCATION = re.compile(r"^\s*\.{2,}|\.{2,}\s*$")  # leading/trailing ".."/"..." truncation markers
def _normalize_for_match(text: str) -> str:
    """Canonicalize *text* for fuzzy matching against chunk texts.

    Lowercases, strips leading/trailing truncation dots (".." / "..."),
    removes soft hyphens (U+00AD), collapses whitespace runs to single
    spaces, and bridges hyphenated line breaks ("wort- bruch" ->
    "wortbruch") until a fixed point is reached.

    Mirrors the matcher used in tests/integration/test_citations_substring.py
    so that the analyzer's post-processing and Sub-D's verification stay in
    lockstep.
    """
    normalized = (text or "").lower()
    normalized = _RE_TRUNCATION.sub("", normalized)
    normalized = normalized.replace("\u00ad", "")  # U+00AD soft hyphen
    normalized = _RE_WHITESPACE.sub(" ", normalized).strip()
    # _RE_HYPHEN_BREAK substitutions are non-overlapping within one pass
    # ("a- b- c" needs two), so iterate until the string stops changing.
    while True:
        bridged = _RE_HYPHEN_BREAK.sub(r"\1\2", normalized)
        if bridged == normalized:
            return bridged
        normalized = bridged
def find_chunk_for_text(text: str, chunks: list[dict]) -> Optional[dict]:
    """Locate the retrieved chunk that a Zitat snippet was copied from.

    Two-stage match identical to Sub-D:
      1. **Strict substring** — the whole normalized needle as a substring
         of any normalized chunk text.
      2. **5-word anchor** — any 5 consecutive words of the needle as a
         substring of any chunk.

    Needles shorter than 20 normalized characters are rejected outright
    (too weak to bind a citation). Returns the matching chunk dict, or
    ``None`` when nothing binds.
    """
    needle = _normalize_for_match(text)
    if len(needle) < 20:
        return None
    # Normalize every chunk once up front; both stages scan the same list.
    haystacks = [
        (chunk, _normalize_for_match(chunk.get("text", ""))) for chunk in chunks
    ]
    # Stage 1: full-needle substring.
    for chunk, haystack in haystacks:
        if needle in haystack:
            return chunk
    # Stage 2: sliding 5-word windows, in needle order, chunks in order.
    tokens = needle.split()
    if len(tokens) < 5:
        return None
    for start in range(len(tokens) - 4):
        window = " ".join(tokens[start:start + 5])
        for chunk, haystack in haystacks:
            if window in haystack:
                return chunk
    return None
def reconstruct_zitate(data: dict, semantic_quotes: dict) -> dict:
    """Replace LLM-emitted quelle/url with canonical chunk values; drop unbacked.

    Walks over ``data['wahlprogrammScores'][i][kind]['zitate']`` (the raw
    LLM-output dict, not the Pydantic model). For each Zitat:

    * Locate the chunk whose text contains the snippet (or a 5-word anchor
      from it) via :func:`find_chunk_for_text`. The search pool spans
      **all** retrieved chunks regardless of party, so cross-mixes between
      Q-IDs become invisible to the persisted output.
    * Found: overwrite ``quelle`` and ``url`` with values derived from the
      matching chunk's ``programm_id`` + ``seite``; the LLM is no longer
      trusted for these fields.
    * Not found: drop the Zitat entirely.

    Returns the same ``data`` dict (mutated in place) for chaining.
    """
    if not semantic_quotes:
        return data
    # Flatten every party's retrieved chunks into one search pool.
    pool: list[dict] = []
    for party_quotes in semantic_quotes.values():
        for key in ("wahlprogramm", "parteiprogramm"):
            pool.extend(party_quotes.get(key, []))
    if not pool:
        return data
    for score_entry in data.get("wahlprogrammScores", []) or []:
        for kind in ("wahlprogramm", "parteiprogramm"):
            block = score_entry.get(kind) or {}
            kept: list[dict] = []
            for zitat in block.get("zitate") or []:
                chunk = find_chunk_for_text(zitat.get("text", ""), pool)
                if chunk is None:
                    continue  # no retrieved evidence backs this zitat -> drop
                zitat["quelle"] = _chunk_source_label(chunk)
                pdf_url = _chunk_pdf_url(chunk)
                if pdf_url:
                    zitat["url"] = pdf_url
                kept.append(zitat)
            block["zitate"] = kept
    return data
def format_quotes_for_prompt(quotes: dict) -> str:
"""Format quotes for inclusion in LLM prompt.

View File

@ -24,8 +24,10 @@ if "openai" not in sys.modules:
from app import embeddings as embeddings_mod
from app.embeddings import (
_chunk_source_label,
find_chunk_for_text,
format_quotes_for_prompt,
get_relevant_quotes_for_antrag,
reconstruct_zitate,
)
@ -200,6 +202,146 @@ class TestFormatQuotesForPrompt:
assert "wahlprogramm" in first
assert "parteiprogramm" in first
# ─────────────────────────────────────────────────────────────────────────────
# reconstruct_zitate — Issue #60 Option B (server-side citation rewrite)
# ─────────────────────────────────────────────────────────────────────────────
class TestReconstructZitate:
"""Verify the post-processing pass that overwrites LLM-emitted quelle/url
with the canonical source label of whichever retrieved chunk actually
contains the cited text. Drops zitate that don't match any chunk.
Background: BB 8/673 (Sub-D run after the A+C deploy) showed the LLM
copying real text from one chunk but writing the page number from a
different chunk into ``quelle``. The ENUM-anchor in the prompt is only
a soft hint; this post-processing step is the structural binding.
"""
def _make_chunk(self, programm_id: str, seite: int, text: str) -> dict:
return {
"programm_id": programm_id,
"partei": programm_id.split("-")[0].upper(),
"typ": "wahlprogramm",
"seite": seite,
"text": text,
"similarity": 0.7,
}
def test_overwrites_wrong_seite_with_real_chunk_seite(self):
    """BB 8/673 regression: the LLM copies text that lives in the S.27
    chunk but labels it S.4 (the page of a sibling chunk). After
    reconstruct_zitate the quelle/url must point at the real S.27 chunk."""
    real_chunk = self._make_chunk(
        "bsw-bb-2024", 27,
        "wertschätzung für lehrerinnen und lehrer abbau von arbeitsüberlastung",
    )
    wrong_chunk = self._make_chunk(
        "bsw-bb-2024", 4,
        "in brandenburg weniger als 14 euro in der stunde verdient",
    )
    zitat = {
        "text": "Wertschätzung für Lehrerinnen und Lehrer Abbau von Arbeitsüberlastung",
        "quelle": "BSW Brandenburg Wahlprogramm 2024, S. 4",  # deliberately wrong
        "url": "/static/referenzen/bsw-bb-2024.pdf#page=4",
    }
    data = {
        "wahlprogrammScores": [{
            "fraktion": "BSW",
            "wahlprogramm": {"score": 7, "begründung": "...", "zitate": [zitat]},
            "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []},
        }],
    }
    semantic_quotes = {
        "BSW": {"wahlprogramm": [wrong_chunk, real_chunk], "parteiprogramm": []},
    }
    result = reconstruct_zitate(data, semantic_quotes)
    rewritten = result["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]
    assert rewritten["quelle"] == "BSW Brandenburg Wahlprogramm 2024, S. 27"
    assert rewritten["url"] == "/static/referenzen/bsw-bb-2024.pdf#page=27"
def test_drops_zitat_not_found_in_any_chunk(self):
    """A fully hallucinated snippet (no retrieved chunk contains it) must
    be removed rather than persisted; the genuine one survives."""
    chunk = self._make_chunk(
        "spd-lsa-2021", 41,
        "die stärkung einer geschlechtersensiblen berufsorientierung",
    )
    hallucinated = {
        "text": "Wir Sozialdemokratinnen ächten Rechtsextremismus seit 1863",
        "quelle": "SPD Sachsen-Anhalt 2021, S. 37",
    }
    genuine = {
        "text": "die Stärkung einer geschlechtersensiblen Berufsorientierung",
        "quelle": "SPD Sachsen-Anhalt 2021, S. 41",
    }
    data = {
        "wahlprogrammScores": [{
            "fraktion": "SPD",
            "wahlprogramm": {
                "score": 7,
                "begründung": "...",
                "zitate": [hallucinated, genuine],
            },
            "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []},
        }],
    }
    out = reconstruct_zitate(
        data, {"SPD": {"wahlprogramm": [chunk], "parteiprogramm": []}}
    )
    remaining = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"]
    assert len(remaining) == 1
    assert "geschlechtersensiblen" in remaining[0]["text"]
def test_empty_semantic_quotes_is_noop(self):
    """With no retrieved chunks at all, reconstruct_zitate must pass the
    data through untouched — nothing is rewritten or dropped."""
    zitat = {"text": "abc def ghi jkl mno pqr", "quelle": "X"}
    data = {"wahlprogrammScores": [{
        "fraktion": "X",
        "wahlprogramm": {"score": 5, "begründung": "x", "zitate": [zitat]},
        "parteiprogramm": {"score": 0, "begründung": "x", "zitate": []},
    }]}
    out = reconstruct_zitate(data, {})
    unchanged = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]
    assert unchanged["quelle"] == "X"
def test_anchor_match_when_full_substring_misses(self):
    """The 5-word-anchor fallback must bind when the full snippet is NOT
    a substring of any chunk.

    Fix: the previous snippet was a strict prefix of the chunk text, so
    stage 1 (full substring) matched and the anchor path was never
    exercised. The inserted "den" breaks the full substring while the
    window "wir wollen interprofessionelle netzwerkstrukturen für" still
    anchors into the chunk — and the wrong page gets corrected."""
    chunk = self._make_chunk(
        "cdu-nrw-2022", 24,
        "wir wollen interprofessionelle netzwerkstrukturen für kinderschutz fördern dazu werden wir stellen schaffen",
    )
    zitat = {
        "text": "Wir wollen interprofessionelle Netzwerkstrukturen für den Kinderschutz fördern",
        "quelle": "CDU NRW Wahlprogramm 2022, S. 999",  # wrong page
    }
    data = {"wahlprogrammScores": [{
        "fraktion": "CDU",
        "wahlprogramm": {"score": 8, "begründung": "...", "zitate": [zitat]},
        "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []},
    }]}
    out = reconstruct_zitate(
        data, {"CDU": {"wahlprogramm": [chunk], "parteiprogramm": []}}
    )
    rewritten = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]
    assert rewritten["quelle"] == "CDU NRW Wahlprogramm 2022, S. 24"
def test_find_chunk_for_text_short_needle_returns_none(self):
    """Needles under 20 normalized chars are too weak to bind a source."""
    haystack = [self._make_chunk("x", 1, "egal was hier steht")]
    assert find_chunk_for_text("ja", haystack) is None
def test_find_chunk_for_text_handles_soft_hyphen(self):
    """A zitat text carrying a soft hyphen (U+00AD) mid-word, as PyMuPDF
    emits for a PDF line break, must still bind to the chunk whose text
    has the word unbroken.

    Fix: the previous literal contained no soft hyphen at all, so the
    \\u00ad-stripping this test claims to cover was never exercised;
    the escape is written explicitly so the invisible character is
    visible in the source."""
    chunk = self._make_chunk(
        "bsw-bb-2024", 27,
        "handys und tablets wertschätzung für lehrerinnen und lehrer",
    )
    text = "Handys und Tablets. Wert\u00adschätzung für Lehrerinnen und Lehrer"
    assert find_chunk_for_text(text, [chunk]) is chunk
def test_text_truncated_at_500_chars(self):
long_chunk = {
"FDP": {