gwoe-antragspruefer/tests/test_embeddings.py
Dotty Dotter db3ada9328 #60 Fix A+C: ENUM-basiertes Zitieren + top_k 2→5
Strukturelle Lösung für die LLM-Halluzinations-Cases aus #60:

A — ENUM-Anker
- format_quotes_for_prompt nummeriert jeden retrievten Chunk als [Q1], [Q2], …
- Neue ZITATEREGEL im Prompt erzwingt vier Bedingungen:
  1. Jedes Zitat MUSS auf genau einen [Qn]-Chunk verweisen
  2. Der text-String MUSS eine wörtliche, zusammenhängende Passage von
     min. 5 Wörtern aus genau diesem Chunk sein
  3. Die quelle MUSS exakt das Source-Label des gewählten Chunks sein
  4. Wenn kein Chunk passt: leeres zitate-Array — lieber 0 als erfunden
- analyzer.py:get_system_prompt: Wichtige-Regeln-Block zieht denselben
  Mechanismus nach, damit das LLM den [Qn]-Anker auch im System-Prompt
  sieht und nicht nur im User-Prompt.

C — Recall-Boost
- analyzer.py:run_analysis: top_k_per_partei 2 → 5. In den drei Cases
  aus #60 lagen die "richtigen" Seiten (S.36, S.37) bisher außerhalb
  des Top-3-Windows; mit Top-5 erhöht sich die Wahrscheinlichkeit, dass
  sie überhaupt im Kontext landen.

Hintergrund — die Halluzinationen waren KEIN Embedding-Bug:
Die retrievten Chunks für Case 1 enthielten S.58 (richtige Seite, falscher
Snippet) — das LLM hat den Snippet aus seinem Trainingswissen über
GRÜNE-Wahlprogramme rekonstruiert statt aus dem retrievten Chunk-Text zu
zitieren. Cases 2/3 hatten die zitierten Seiten gar nicht im Top-3-Window —
das LLM hat sowohl Seite als auch Snippet halluziniert. ENUM-Anker
verhindert beides strukturell, weil ein nicht-existenter [Qn] sofort
als Cheating sichtbar wäre.

Tests:
- test_chunks_get_enum_ids
- test_zitateregel_mentions_enum_anchor
- 179/179 grün

Refs: #60, #54 (Sub-D), #50 (Umbrella E2E)
2026-04-09 22:21:39 +02:00

221 lines
9.2 KiB
Python

"""Tests for embeddings.py prompt formatting.
Reproduces the LLM-Halluzinations-Bug from the 2026-04-08 session
(commits 1b5fd96 + bc7f4a6): the original ``format_quotes_for_prompt``
rendered each chunk as ``- S. X: "text"`` without any reference to the
programme name. As a result the LLM hallucinated familiar source labels
("FDP NRW Wahlprogramm 2022") for chunks that actually came from MV/BE,
because that was the strongest training-set prior for budget-policy
citations.
Fix: prepend the fully-qualified PROGRAMME[programm_id]["name"] to each
quote.
"""
import sys
import types
# Stub openai before importing embeddings, since the test environment may
# not have it installed and we don't actually need to make API calls.
if "openai" not in sys.modules:
    # Install a minimal stand-in so that importing app.embeddings never
    # fails in environments without the real SDK.  The code under test
    # only needs the OpenAI constructor to exist; a no-op factory is
    # enough because no API call is ever made from these tests.
    openai_stub = types.ModuleType("openai")
    openai_stub.OpenAI = lambda **kwargs: None
    sys.modules["openai"] = openai_stub
from app import embeddings as embeddings_mod
from app.embeddings import (
_chunk_source_label,
format_quotes_for_prompt,
get_relevant_quotes_for_antrag,
)
# ─────────────────────────────────────────────────────────────────────────────
# _chunk_source_label — fully-qualified programme name + page
# ─────────────────────────────────────────────────────────────────────────────
class TestChunkSourceLabel:
    """Label rendering: fully-qualified programme name plus page number."""

    def test_known_programme_id(self):
        sample = {"programm_id": "fdp-mv-2021", "seite": 73, "text": "..."}
        rendered = _chunk_source_label(sample)
        assert "FDP Mecklenburg-Vorpommern" in rendered
        assert "S. 73" in rendered

    def test_known_programme_id_for_be(self):
        sample = {"programm_id": "spd-be-2023", "seite": 24, "text": "..."}
        rendered = _chunk_source_label(sample)
        assert "SPD Berlin" in rendered
        # The BE-2023.pdf files actually contain the 2021 programmes,
        # so the label must carry the real programme year.
        assert "2021" in rendered
        assert "S. 24" in rendered

    def test_unknown_programme_id_falls_back_to_id(self):
        sample = {"programm_id": "fake-xx-9999", "seite": 1, "text": "..."}
        rendered = _chunk_source_label(sample)
        # Must not crash on an unmapped id; at minimum the raw id and
        # the page number have to show up in the label.
        assert "fake-xx-9999" in rendered
        assert "S. 1" in rendered

    def test_missing_seite_uses_questionmark(self):
        sample = {"programm_id": "cdu-mv-2021", "text": "..."}
        rendered = _chunk_source_label(sample)
        assert "?" in rendered
# ─────────────────────────────────────────────────────────────────────────────
# format_quotes_for_prompt — every chunk must carry programme identification
# ─────────────────────────────────────────────────────────────────────────────
def _chunk(programm_id, partei, typ, seite, text, similarity):
    """Build one retrieval-chunk dict in the shape the retriever returns."""
    return {
        "programm_id": programm_id,
        "partei": partei,
        "typ": typ,
        "seite": seite,
        "text": text,
        "similarity": similarity,
    }


# Shared fixture: two FDP chunks (wahlprogramm + grundsatz) and one SPD
# chunk — three chunks total, which the ENUM-id tests below rely on.
EXAMPLE_QUOTES = {
    "FDP": {
        "wahlprogramm": [
            _chunk("fdp-mv-2021", "FDP", "wahlprogramm", 73,
                   "Die Grundsätze von Wirtschaftlichkeit und Sparsamkeit",
                   0.63),
        ],
        "parteiprogramm": [
            _chunk("fdp-grundsatz", "FDP", "parteiprogramm", 93,
                   "Liberale Marktwirtschaft erfordert solide Haushalte",
                   0.60),
        ],
    },
    "SPD": {
        "wahlprogramm": [
            _chunk("spd-mv-2021", "SPD", "wahlprogramm", 22,
                   "Verkehrswende weg vom motorisierten Individualverkehr",
                   0.58),
        ],
    },
}
class TestFormatQuotesForPrompt:
    """Prompt rendering: every chunk must be identifiable and quotable."""

    def test_empty_input_returns_empty_string(self):
        assert format_quotes_for_prompt({}) == ""

    def test_renders_party_headings(self):
        rendered = format_quotes_for_prompt(EXAMPLE_QUOTES)
        assert "### FDP" in rendered
        assert "### SPD" in rendered

    def test_every_chunk_has_programme_name(self):
        """Regression: before the fix each chunk rendered as bare "S. X:"
        only, and the LLM hallucinated NRW-2022 sources from training data."""
        rendered = format_quotes_for_prompt(EXAMPLE_QUOTES)
        # All three fixture chunks must name their source programme.
        assert "FDP Mecklenburg-Vorpommern" in rendered
        assert "FDP Grundsatzprogramm" in rendered
        assert "SPD Mecklenburg-Vorpommern" in rendered

    def test_contains_strict_citation_instruction(self):
        """The prompt header must explicitly demand verbatim quoting."""
        rendered = format_quotes_for_prompt(EXAMPLE_QUOTES)
        assert "wörtlich" in rendered.lower()

    def test_chunks_get_enum_ids(self):
        """Issue #60 fix: every chunk carries a stable [Qn] id so the LLM
        can be forced to anchor each citation in one specific retrieved
        chunk instead of inventing snippets from training data.
        """
        rendered = format_quotes_for_prompt(EXAMPLE_QUOTES)
        # 2 wahlprogramm chunks + 1 grundsatz chunk = 3 ids in total.
        for tag in ("[Q1]", "[Q2]", "[Q3]"):
            assert tag in rendered
        # Only 3 chunks exist in EXAMPLE_QUOTES, so no fourth id.
        assert "[Q4]" not in rendered

    def test_zitateregel_mentions_enum_anchor(self):
        rendered = format_quotes_for_prompt(EXAMPLE_QUOTES)
        # The header must explain the ENUM anchor so the LLM knows
        # what a [Qn] tag means.
        assert "[Q" in rendered
        assert "ZITATEREGEL" in rendered

    def test_no_nrw_2022_appears_unless_chunks_are_actually_nrw(self):
        """Sanity: a pure MV+SPD chunk set must not mention NRW anywhere."""
        rendered = format_quotes_for_prompt(EXAMPLE_QUOTES)
        assert "NRW" not in rendered
        assert "Nordrhein-Westfalen" not in rendered

    def test_renders_separate_blocks_for_wahl_and_parteiprogramm(self):
        rendered = format_quotes_for_prompt(EXAMPLE_QUOTES)
        assert "**Wahlprogramm:**" in rendered
        assert "**Grundsatzprogramm:**" in rendered

    def test_get_relevant_quotes_for_antrag_populates_results(self, monkeypatch):
        """Regression for the partei_upper NameError (Phase B / #55 / eb045d0):
        the dict-write line still referenced ``partei_upper`` after the
        rest of the function was renamed to ``partei_lookup``, so
        ``get_relevant_quotes_for_antrag`` raised ``NameError`` on every
        call, was silently swallowed by the ``except Exception`` in
        ``analyzer.run_analysis``, and downgraded *every* assessment to
        keyword search — the root cause behind the #60 hallucinations.

        Strategy: monkeypatch ``find_relevant_chunks`` (no real embeddings
        needed), call the wrapper, and assert it returns a populated dict
        instead of crashing.
        """
        def stubbed_chunks(query, parteien=None, typ=None,
                           bundesland=None, top_k=3,
                           min_similarity=0.5):
            # One canned chunk, echoing back party/typ like the real thing.
            return [{
                "programm_id": "gruene-nrw-2022",
                "partei": parteien[0] if parteien else "GRÜNE",
                "typ": typ or "wahlprogramm",
                "seite": 58,
                "text": "Wahlalter ab 16",
                "similarity": 0.7,
            }]

        monkeypatch.setattr(embeddings_mod, "find_relevant_chunks",
                            stubbed_chunks)
        quotes = get_relevant_quotes_for_antrag(
            antrag_text="Wahlalter ab 16",
            fraktionen=["GRÜNE"],
            bundesland="NRW",
            top_k_per_partei=2,
        )
        assert quotes, "Expected a non-empty result dict, got empty"
        # Keys are canonical party names — accept GRÜNE itself or
        # whatever the canonical mapper turns it into.
        assert any("GR" in key.upper() for key in quotes)
        # Values must have the {wahlprogramm, parteiprogramm} shape.
        per_party = next(iter(quotes.values()))
        assert "wahlprogramm" in per_party
        assert "parteiprogramm" in per_party

    def test_text_truncated_at_500_chars(self):
        oversized = {
            "FDP": {
                "wahlprogramm": [
                    {
                        "programm_id": "fdp-mv-2021",
                        "seite": 1,
                        # 1000 chars — must be cut down by the formatter.
                        "text": "A" * 1000,
                        "similarity": 0.7,
                    }
                ],
            }
        }
        rendered = format_quotes_for_prompt(oversized)
        # Truncation marker must appear ...
        assert "..." in rendered
        # ... and the full 1000-char text must not survive intact.
        assert "A" * 1000 not in rendered