Sub-D Live-Run gegen Prod-DB nach dem db3ada9-Deploy hat einen neuen Halluzinations-Case gezeigt, den A+C nicht gefangen hat: BB 8/673 BSW: text aus bsw-bb-2024 S.27 (verifiziert via Volltext-Suche im PDF), aber LLM hat im quelle-Feld "S. 4" angegeben — die Seite des Top-2-Chunks im selben Retrieval-Window. Klassischer Cross-Mix zwischen Q-IDs. Strukturelle Diagnose: Das [Qn]-Tag aus A ist nur ein weicher Anker im Prompt. Das LLM darf Text aus Chunk Qn kopieren und trotzdem die quelle aus Chunk Qm zusammenbauen. Die ZITATEREGEL kann das nicht verhindern, solange wir der LLM-Selbstauskunft vertrauen. Fix (Option B aus dem ursprünglichen Plan): `embeddings.reconstruct_zitate(data, semantic_quotes)` läuft im analyzer **nach** json.loads aber **vor** Pydantic-Validation: 1. Die retrievten Chunks aller Parteien werden zu einer einzigen Liste abgeflacht. 2. Pro Zitat: text via Substring oder 5-Wort-Anker gegen alle Chunks matchen (Helpers `find_chunk_for_text` + `_normalize_for_match`, identische Logik wie Sub-D Test). 3. Match → quelle/url server-seitig durch _chunk_source_label und _chunk_pdf_url des matchenden Chunks ÜBERSCHREIBEN. 4. Kein Match → Zitat verworfen (statt mit erfundener quelle persistiert). Damit kann der LLM nur noch sauber zitieren oder gar nicht — es gibt keinen Pfad mehr zu "echter Text, falsche quelle". Tests: - TestReconstructZitate (6 cases): BB 8/673 Re-Mapping, Drop bei hallucinated, no-op bei leeren chunks, anchor-match-Fallback, short-needle und soft-hyphen Edge-Cases - 185/185 grün (179 + 6 neu) Refs: #60, #54 (Sub-D)
363 lines
16 KiB
Python
"""Tests for embeddings.py prompt formatting.

Reproduces the LLM hallucination bug from the 2026-04-08 session
(commits 1b5fd96 + bc7f4a6): the original ``format_quotes_for_prompt``
rendered each chunk as ``- S. X: "text"`` without any reference to the
programme name. As a result the LLM hallucinated familiar source labels
("FDP NRW Wahlprogramm 2022") for chunks that actually came from MV/BE,
because that was the strongest training-set prior for budget-policy
citations.

Fix: prepend the fully-qualified PROGRAMME[programm_id]["name"] to each
quote.
"""

import sys
import types

# Stub openai before importing embeddings, since the test environment may
# not have it installed and we don't actually need to make API calls.
if "openai" not in sys.modules:
    openai_stub = types.ModuleType("openai")
    # embeddings.py only instantiates the client at import time; a no-op
    # factory is enough because these tests never hit the API.
    openai_stub.OpenAI = lambda **kw: None
    sys.modules["openai"] = openai_stub

from app import embeddings as embeddings_mod
from app.embeddings import (
    _chunk_source_label,
    find_chunk_for_text,
    format_quotes_for_prompt,
    get_relevant_quotes_for_antrag,
    reconstruct_zitate,
)
|
# ─────────────────────────────────────────────────────────────────────────────
# _chunk_source_label — fully-qualified programme name + page
# ─────────────────────────────────────────────────────────────────────────────


class TestChunkSourceLabel:
    """_chunk_source_label must render a fully-qualified programme label
    plus page number, and degrade gracefully (no crash) for unknown
    programme ids or a missing ``seite`` field."""

    def test_known_programme_id(self):
        chunk = {"programm_id": "fdp-mv-2021", "seite": 73, "text": "..."}
        label = _chunk_source_label(chunk)
        assert "FDP Mecklenburg-Vorpommern" in label
        assert "S. 73" in label

    def test_known_programme_id_for_be(self):
        chunk = {"programm_id": "spd-be-2023", "seite": 24, "text": "..."}
        label = _chunk_source_label(chunk)
        assert "SPD Berlin" in label
        assert "2021" in label  # the BE-2023.pdf files contain 2021er programmes
        assert "S. 24" in label

    def test_unknown_programme_id_falls_back_to_id(self):
        chunk = {"programm_id": "fake-xx-9999", "seite": 1, "text": "..."}
        label = _chunk_source_label(chunk)
        # Should not crash, should at least include the id and the page
        assert "fake-xx-9999" in label
        assert "S. 1" in label

    def test_missing_seite_uses_questionmark(self):
        chunk = {"programm_id": "cdu-mv-2021", "text": "..."}
        label = _chunk_source_label(chunk)
        assert "?" in label
|
# ─────────────────────────────────────────────────────────────────────────────
# format_quotes_for_prompt — every chunk must carry programme identification
# ─────────────────────────────────────────────────────────────────────────────

# Fixture data in the shape get_relevant_quotes_for_antrag returns:
# {party: {"wahlprogramm": [chunk, ...], "parteiprogramm": [chunk, ...]}}.
# Three chunks total (FDP wahl + FDP grundsatz + SPD wahl), all from MV,
# so the NRW-hallucination regression tests can assert "NRW" never appears.
EXAMPLE_QUOTES = {
    "FDP": {
        "wahlprogramm": [
            {
                "programm_id": "fdp-mv-2021",
                "partei": "FDP",
                "typ": "wahlprogramm",
                "seite": 73,
                "text": "Die Grundsätze von Wirtschaftlichkeit und Sparsamkeit",
                "similarity": 0.63,
            },
        ],
        "parteiprogramm": [
            {
                "programm_id": "fdp-grundsatz",
                "partei": "FDP",
                "typ": "parteiprogramm",
                "seite": 93,
                "text": "Liberale Marktwirtschaft erfordert solide Haushalte",
                "similarity": 0.60,
            },
        ],
    },
    "SPD": {
        "wahlprogramm": [
            {
                "programm_id": "spd-mv-2021",
                "partei": "SPD",
                "typ": "wahlprogramm",
                "seite": 22,
                "text": "Verkehrswende weg vom motorisierten Individualverkehr",
                "similarity": 0.58,
            },
        ],
    },
}
|
class TestFormatQuotesForPrompt:
    """Prompt rendering must identify every chunk's source programme and
    carry the [Qn] ENUM anchors plus the ZITATEREGEL header (issue #60)."""

    def test_empty_input_returns_empty_string(self):
        assert format_quotes_for_prompt({}) == ""

    def test_renders_party_headings(self):
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        assert "### FDP" in out
        assert "### SPD" in out

    def test_every_chunk_has_programme_name(self):
        """Regression: pre-fix this used "S. X:" only, no programme name —
        the LLM then hallucinated NRW-2022 sources from training data."""
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        # Each of the three chunks must reference its source programme
        assert "FDP Mecklenburg-Vorpommern" in out
        assert "FDP Grundsatzprogramm" in out
        assert "SPD Mecklenburg-Vorpommern" in out

    def test_contains_strict_citation_instruction(self):
        """The prompt header must explicitly forbid hallucinated sources."""
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        assert "wörtlich" in out.lower()

    def test_chunks_get_enum_ids(self):
        """Issue #60 fix: each chunk must be tagged with a stable [Qn] id
        so the LLM can be forced to anchor every citation in a specific
        retrieved chunk instead of inventing snippets from training data.
        """
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        # 2 wahlprogramm chunks + 1 grundsatz chunk = 3 IDs total
        assert "[Q1]" in out
        assert "[Q2]" in out
        assert "[Q3]" in out
        assert "[Q4]" not in out  # only 3 chunks in EXAMPLE_QUOTES

    def test_zitateregel_mentions_enum_anchor(self):
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        # The prompt header must mention the ENUM anchor mechanism so
        # the LLM understands what [Qn] means.
        assert "[Q" in out
        assert "ZITATEREGEL" in out

    def test_no_nrw_2022_appears_unless_chunks_are_actually_nrw(self):
        """Sanity: a pure MV+SPD chunk set must not mention NRW anywhere."""
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        assert "NRW" not in out
        assert "Nordrhein-Westfalen" not in out

    def test_renders_separate_blocks_for_wahl_and_parteiprogramm(self):
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        assert "**Wahlprogramm:**" in out
        assert "**Grundsatzprogramm:**" in out

    def test_get_relevant_quotes_for_antrag_populates_results(self, monkeypatch):
        """Regression for the partei_upper NameError (Phase B / #55 / eb045d0):

        The dict-write line still referenced ``partei_upper`` after the
        rest of the function had been renamed to ``partei_lookup``. The
        result was that ``get_relevant_quotes_for_antrag`` raised
        ``NameError`` on every call, was silently swallowed by the
        ``except Exception`` in ``analyzer.run_analysis``, and silently
        downgraded *every* assessment to keyword search — which then
        caused the LLM hallucinations tracked in #60.

        Test strategy: monkeypatch ``find_relevant_chunks`` so we don't
        need real embeddings, then call the wrapper and assert it
        actually returns a populated dict instead of crashing.
        """
        def fake_find_relevant_chunks(query, parteien=None, typ=None,
                                      bundesland=None, top_k=3,
                                      min_similarity=0.5):
            return [{
                "programm_id": "gruene-nrw-2022",
                "partei": parteien[0] if parteien else "GRÜNE",
                "typ": typ or "wahlprogramm",
                "seite": 58,
                "text": "Wahlalter ab 16",
                "similarity": 0.7,
            }]

        monkeypatch.setattr(embeddings_mod, "find_relevant_chunks",
                            fake_find_relevant_chunks)

        result = get_relevant_quotes_for_antrag(
            antrag_text="Wahlalter ab 16",
            fraktionen=["GRÜNE"],
            bundesland="NRW",
            top_k_per_partei=2,
        )
        assert result, "Expected a non-empty result dict, got empty"
        # The keys are canonical party names; either GRÜNE itself or
        # whatever the canonical mapper returns for it.
        assert any("GR" in k.upper() for k in result.keys())
        # And the structure must be the {wahlprogramm, parteiprogramm} dict
        first = next(iter(result.values()))
        assert "wahlprogramm" in first
        assert "parteiprogramm" in first
|
# ─────────────────────────────────────────────────────────────────────────────
# reconstruct_zitate — Issue #60 Option B (server-side citation rewrite)
# ─────────────────────────────────────────────────────────────────────────────


class TestReconstructZitate:
    """Verify the post-processing pass that overwrites LLM-emitted quelle/url
    with the canonical source label of whichever retrieved chunk actually
    contains the cited text. Drops zitate that don't match any chunk.

    Background: BB 8/673 (Sub-D run after the A+C deploy) showed the LLM
    copying real text from one chunk but writing the page number from a
    different chunk into ``quelle``. The ENUM-anchor in the prompt is only
    a soft hint; this post-processing step is the structural binding.
    """

    def _make_chunk(self, programm_id: str, seite: int, text: str) -> dict:
        # Minimal chunk dict in the shape the retrieval layer produces.
        return {
            "programm_id": programm_id,
            "partei": programm_id.split("-")[0].upper(),
            "typ": "wahlprogramm",
            "seite": seite,
            "text": text,
            "similarity": 0.7,
        }

    def test_overwrites_wrong_seite_with_real_chunk_seite(self):
        """The BB 8/673 case: LLM cites text from S.27 chunk but writes
        S.4 in quelle. After reconstruct_zitate the quelle must point to
        the real S.27 chunk."""
        real_chunk = self._make_chunk(
            "bsw-bb-2024", 27,
            "wertschätzung für lehrerinnen und lehrer abbau von arbeitsüberlastung",
        )
        wrong_chunk = self._make_chunk(
            "bsw-bb-2024", 4,
            "in brandenburg weniger als 14 euro in der stunde verdient",
        )
        semantic_quotes = {
            "BSW": {"wahlprogramm": [wrong_chunk, real_chunk], "parteiprogramm": []},
        }
        data = {
            "wahlprogrammScores": [{
                "fraktion": "BSW",
                "wahlprogramm": {
                    "score": 7,
                    "begründung": "...",
                    "zitate": [{
                        "text": "Wertschätzung für Lehrerinnen und Lehrer Abbau von Arbeitsüberlastung",
                        "quelle": "BSW Brandenburg Wahlprogramm 2024, S. 4",  # WRONG
                        "url": "/static/referenzen/bsw-bb-2024.pdf#page=4",
                    }],
                },
                "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []},
            }],
        }
        out = reconstruct_zitate(data, semantic_quotes)
        z = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]
        assert z["quelle"] == "BSW Brandenburg Wahlprogramm 2024, S. 27"
        assert z["url"] == "/static/referenzen/bsw-bb-2024.pdf#page=27"

    def test_drops_zitat_not_found_in_any_chunk(self):
        """If a snippet was hallucinated entirely (no matching chunk),
        the zitat must be removed rather than persisted."""
        chunk = self._make_chunk(
            "spd-lsa-2021", 41,
            "die stärkung einer geschlechtersensiblen berufsorientierung",
        )
        semantic_quotes = {
            "SPD": {"wahlprogramm": [chunk], "parteiprogramm": []},
        }
        data = {
            "wahlprogrammScores": [{
                "fraktion": "SPD",
                "wahlprogramm": {
                    "score": 7,
                    "begründung": "...",
                    "zitate": [
                        {"text": "Wir Sozialdemokratinnen ächten Rechtsextremismus seit 1863",
                         "quelle": "SPD Sachsen-Anhalt 2021, S. 37"},
                        {"text": "die Stärkung einer geschlechtersensiblen Berufsorientierung",
                         "quelle": "SPD Sachsen-Anhalt 2021, S. 41"},
                    ],
                },
                "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []},
            }],
        }
        out = reconstruct_zitate(data, semantic_quotes)
        zitate = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"]
        assert len(zitate) == 1
        assert "geschlechtersensiblen" in zitate[0]["text"]

    def test_empty_semantic_quotes_is_noop(self):
        data = {"wahlprogrammScores": [{
            "fraktion": "X",
            "wahlprogramm": {"score": 5, "begründung": "x",
                             "zitate": [{"text": "abc def ghi jkl mno pqr", "quelle": "X"}]},
            "parteiprogramm": {"score": 0, "begründung": "x", "zitate": []},
        }]}
        out = reconstruct_zitate(data, {})
        # No chunks → no postprocessing applied; data passes through unchanged
        assert out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]["quelle"] == "X"

    def test_anchor_match_when_full_substring_misses(self):
        """LLM may slightly truncate a snippet — 5-word-anchor still binds."""
        chunk = self._make_chunk(
            "cdu-nrw-2022", 24,
            "wir wollen interprofessionelle netzwerkstrukturen für kinderschutz fördern dazu werden wir stellen schaffen",
        )
        semantic_quotes = {"CDU": {"wahlprogramm": [chunk], "parteiprogramm": []}}
        data = {"wahlprogrammScores": [{
            "fraktion": "CDU",
            "wahlprogramm": {
                "score": 8, "begründung": "...",
                "zitate": [{
                    "text": "Wir wollen interprofessionelle Netzwerkstrukturen für Kinderschutz fördern",
                    "quelle": "CDU NRW Wahlprogramm 2022, S. 999",  # wrong page
                }],
            },
            "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []},
        }]}
        out = reconstruct_zitate(data, semantic_quotes)
        z = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]
        assert z["quelle"] == "CDU NRW Wahlprogramm 2022, S. 24"

    def test_find_chunk_for_text_short_needle_returns_none(self):
        # Needles too short to be meaningful must never bind to a chunk.
        chunk = self._make_chunk("x", 1, "egal was hier steht")
        assert find_chunk_for_text("ja", [chunk]) is None

    def test_find_chunk_for_text_handles_soft_hyphen(self):
        chunk = self._make_chunk(
            "bsw-bb-2024", 27,
            "handys und tablets wertschätzung für lehrerinnen und lehrer",
        )
        # LLM-emitted text with the soft hyphen \xad mid-word, as PyMuPDF
        # would extract from a PDF line break. NOTE(review): the U+00AD
        # must actually be present in the needle — without it this test
        # only exercises punctuation/case normalization.
        text = "Handys und Tablets. Wert\xadschätzung für Lehrerinnen und Lehrer"
        assert find_chunk_for_text(text, [chunk]) is chunk
|
def test_text_truncated_at_500_chars(self):
|
|
long_chunk = {
|
|
"FDP": {
|
|
"wahlprogramm": [
|
|
{
|
|
"programm_id": "fdp-mv-2021",
|
|
"seite": 1,
|
|
"text": "A" * 1000, # 1000 chars → should be truncated
|
|
"similarity": 0.7,
|
|
}
|
|
],
|
|
}
|
|
}
|
|
out = format_quotes_for_prompt(long_chunk)
|
|
# Truncation marker
|
|
assert "..." in out
|
|
# Original chunk text 1000 chars not present in full
|
|
assert "A" * 1000 not in out
|