"""Tests for embeddings.py prompt formatting. Reproduces the LLM-Halluzinations-Bug from the 2026-04-08 session (commits 1b5fd96 + bc7f4a6): the original ``format_quotes_for_prompt`` rendered each chunk as ``- S. X: "text"`` without any reference to the programme name. As a result the LLM hallucinated familiar source labels ("FDP NRW Wahlprogramm 2022") for chunks that actually came from MV/BE, because that was the strongest training-set prior for budget-policy citations. Fix: prepend the fully-qualified PROGRAMME[programm_id]["name"] to each quote. """ import sys import types import pytest # Stub openai before importing embeddings, since the test environment may # not have it installed and we don't actually need to make API calls. if "openai" not in sys.modules: openai_stub = types.ModuleType("openai") openai_stub.OpenAI = lambda **kw: None sys.modules["openai"] = openai_stub # On dev machines an older third-party "fitz" package may shadow PyMuPDF's # legacy import alias — verify the loaded module actually has ``open`` and # fall back to ``pymupdf`` (the canonical name in PyMuPDF ≥ 1.24) when the # wrong "fitz" is in front of pymupdf on sys.path. 
try:
    import fitz as _fitz

    if not hasattr(_fitz, "open"):
        import pymupdf as _pymupdf

        sys.modules["fitz"] = _pymupdf
except ImportError:
    try:
        import pymupdf as _pymupdf

        sys.modules["fitz"] = _pymupdf
    except ImportError:
        pass  # render tests will skip via fixture below

from app import embeddings as embeddings_mod
from app.embeddings import (
    _chunk_pdf_url,
    _chunk_source_label,
    find_chunk_for_text,
    format_quotes_for_prompt,
    get_relevant_quotes_for_antrag,
    reconstruct_zitate,
    render_highlighted_page,
    PROGRAMME,
)


# ─────────────────────────────────────────────────────────────────────────────
# _chunk_source_label — fully-qualified programme name + page
# ─────────────────────────────────────────────────────────────────────────────


class TestChunkSourceLabel:
    def test_known_programme_id(self):
        chunk = {"programm_id": "fdp-mv-2021", "seite": 73, "text": "..."}
        label = _chunk_source_label(chunk)
        assert "FDP Mecklenburg-Vorpommern" in label
        assert "S. 73" in label

    def test_known_programme_id_for_be(self):
        chunk = {"programm_id": "spd-be-2023", "seite": 24, "text": "..."}
        label = _chunk_source_label(chunk)
        assert "SPD Berlin" in label
        assert "2021" in label  # the BE-2023.pdf files contain 2021er programmes
        assert "S. 24" in label

    def test_unknown_programme_id_falls_back_to_id(self):
        chunk = {"programm_id": "fake-xx-9999", "seite": 1, "text": "..."}
        label = _chunk_source_label(chunk)
        # Should not crash, should at least include the id and the page
        assert "fake-xx-9999" in label
        assert "S. 1" in label

    def test_missing_seite_uses_questionmark(self):
        chunk = {"programm_id": "cdu-mv-2021", "text": "..."}
        label = _chunk_source_label(chunk)
        assert "?" in label


# ─────────────────────────────────────────────────────────────────────────────
# format_quotes_for_prompt — every chunk must carry programme identification
# ─────────────────────────────────────────────────────────────────────────────

EXAMPLE_QUOTES = {
    "FDP": {
        "wahlprogramm": [
            {
                "programm_id": "fdp-mv-2021",
                "partei": "FDP",
                "typ": "wahlprogramm",
                "seite": 73,
                "text": "Die Grundsätze von Wirtschaftlichkeit und Sparsamkeit",
                "similarity": 0.63,
            },
        ],
        "parteiprogramm": [
            {
                "programm_id": "fdp-grundsatz",
                "partei": "FDP",
                "typ": "parteiprogramm",
                "seite": 93,
                "text": "Liberale Marktwirtschaft erfordert solide Haushalte",
                "similarity": 0.60,
            },
        ],
    },
    "SPD": {
        "wahlprogramm": [
            {
                "programm_id": "spd-mv-2021",
                "partei": "SPD",
                "typ": "wahlprogramm",
                "seite": 22,
                "text": "Verkehrswende weg vom motorisierten Individualverkehr",
                "similarity": 0.58,
            },
        ],
    },
}


class TestFormatQuotesForPrompt:
    def test_empty_input_returns_empty_string(self):
        assert format_quotes_for_prompt({}) == ""

    def test_renders_party_headings(self):
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        assert "### FDP" in out
        assert "### SPD" in out

    def test_every_chunk_has_programme_name(self):
        """Regression: pre-fix this used "S. X:" only, no programme name —
        the LLM then hallucinated NRW-2022 sources from training data."""
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        # Each of the three chunks must reference its source programme
        assert "FDP Mecklenburg-Vorpommern" in out
        assert "FDP Grundsatzprogramm" in out
        assert "SPD Mecklenburg-Vorpommern" in out

    def test_contains_strict_citation_instruction(self):
        """The prompt header must explicitly forbid hallucinated sources."""
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        assert "wörtlich" in out.lower()

    def test_chunks_get_enum_ids(self):
        """Issue #60 fix: each chunk must be tagged with a stable [Qn] id
        so the LLM can be forced to anchor every citation in a specific
        retrieved chunk instead of inventing snippets from training data.
        """
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        # 2 wahlprogramm chunks + 1 grundsatz chunk = 3 IDs total
        assert "[Q1]" in out
        assert "[Q2]" in out
        assert "[Q3]" in out
        assert "[Q4]" not in out  # only 3 chunks in EXAMPLE_QUOTES

    def test_zitateregel_mentions_enum_anchor(self):
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        # The prompt header must mention the ENUM anchor mechanism so
        # the LLM understands what [Qn] means.
        assert "[Q" in out
        assert "ZITATEREGEL" in out

    def test_no_nrw_2022_appears_unless_chunks_are_actually_nrw(self):
        """Sanity: a pure MV+SPD chunk set must not mention NRW anywhere."""
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        assert "NRW" not in out
        assert "Nordrhein-Westfalen" not in out

    def test_renders_separate_blocks_for_wahl_and_parteiprogramm(self):
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        assert "**Wahlprogramm:**" in out
        assert "**Grundsatzprogramm:**" in out

    def test_get_relevant_quotes_for_antrag_populates_results(self, monkeypatch):
        """Regression for the partei_upper NameError (Phase B / #55 / eb045d0):

        The dict-write line still referenced ``partei_upper`` after the rest
        of the function had been renamed to ``partei_lookup``. The result was
        that ``get_relevant_quotes_for_antrag`` raised ``NameError`` on every
        call, was silently swallowed by the ``except Exception`` in
        ``analyzer.run_analysis``, and silently downgraded *every* assessment
        to keyword search — which then caused the LLM hallucinations tracked
        in #60.

        Test strategy: monkeypatch ``find_relevant_chunks`` so we don't need
        real embeddings, then call the wrapper and assert it actually returns
        a populated dict instead of crashing.
        """

        def fake_find_relevant_chunks(query, parteien=None, typ=None,
                                      bundesland=None, top_k=3,
                                      min_similarity=0.5):
            return [{
                "programm_id": "gruene-nrw-2022",
                "partei": parteien[0] if parteien else "GRÜNE",
                "typ": typ or "wahlprogramm",
                "seite": 58,
                "text": "Wahlalter ab 16",
                "similarity": 0.7,
            }]

        monkeypatch.setattr(embeddings_mod, "find_relevant_chunks",
                            fake_find_relevant_chunks)
        result = get_relevant_quotes_for_antrag(
            antrag_text="Wahlalter ab 16",
            fraktionen=["GRÜNE"],
            bundesland="NRW",
            top_k_per_partei=2,
        )
        assert result, "Expected a non-empty result dict, got empty"
        # The keys are canonical party names; either GRÜNE itself or
        # whatever the canonical mapper returns for it.
        assert any("GR" in k.upper() for k in result.keys())
        # And the structure must be the {wahlprogramm, parteiprogramm} dict
        first = next(iter(result.values()))
        assert "wahlprogramm" in first
        assert "parteiprogramm" in first


# ─────────────────────────────────────────────────────────────────────────────
# reconstruct_zitate — Issue #60 Option B (server-side citation rewrite)
# ─────────────────────────────────────────────────────────────────────────────


class TestReconstructZitate:
    """Verify the post-processing pass that overwrites LLM-emitted quelle/url
    with the canonical source label of whichever retrieved chunk actually
    contains the cited text. Zitate that don't match any chunk are kept but
    flagged (see the hybrid-approach test below).

    Background: BB 8/673 (Sub-D run after the A+C deploy) showed the LLM
    copying real text from one chunk but writing the page number from a
    different chunk into ``quelle``. The ENUM-anchor in the prompt is only a
    soft hint; this post-processing step is the structural binding.
    """

    def _make_chunk(self, programm_id: str, seite: int, text: str) -> dict:
        # Minimal chunk dict in the shape produced by find_relevant_chunks.
        return {
            "programm_id": programm_id,
            "partei": programm_id.split("-")[0].upper(),
            "typ": "wahlprogramm",
            "seite": seite,
            "text": text,
            "similarity": 0.7,
        }

    def test_overwrites_wrong_seite_with_real_chunk_seite(self):
        """The BB 8/673 case: LLM cites text from S.27 chunk but writes S.4
        in quelle. After reconstruct_zitate the quelle must point to the
        real S.27 chunk."""
        real_chunk = self._make_chunk(
            "bsw-bb-2024",
            27,
            "wertschätzung für lehrerinnen und lehrer abbau von arbeitsüberlastung",
        )
        wrong_chunk = self._make_chunk(
            "bsw-bb-2024",
            4,
            "in brandenburg weniger als 14 euro in der stunde verdient",
        )
        semantic_quotes = {
            "BSW": {"wahlprogramm": [wrong_chunk, real_chunk], "parteiprogramm": []},
        }
        data = {
            "wahlprogrammScores": [{
                "fraktion": "BSW",
                "wahlprogramm": {
                    "score": 7,
                    "begründung": "...",
                    "zitate": [{
                        "text": "Wertschätzung für Lehrerinnen und Lehrer Abbau von Arbeitsüberlastung",
                        "quelle": "BSW Brandenburg Wahlprogramm 2024, S. 4",  # WRONG
                        "url": "/static/referenzen/bsw-bb-2024.pdf#page=4",
                    }],
                },
                "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []},
            }],
        }
        out = reconstruct_zitate(data, semantic_quotes)
        z = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]
        assert z["quelle"] == "BSW Brandenburg Wahlprogramm 2024, S. 27"
        # Post-#47: the URL is the highlight-cite endpoint with pid+seite+q.
        # Static fallback only remains for chunks without a text field.
        assert z["url"].startswith("/api/wahlprogramm-cite?")
        assert "pid=bsw-bb-2024" in z["url"]
        assert "seite=27" in z["url"]

    def test_marks_zitat_not_found_in_any_chunk_unverified(self):
        """If a snippet was hallucinated entirely (no matching chunk), the
        zitat is kept but flagged ``verified: False`` (hybrid approach).

        NOTE(review): renamed from ``test_drops_zitat_not_found_in_any_chunk``
        — the old name/docstring claimed the zitat "must be removed", which
        contradicted the assertions below that verify keep-and-flag.
        """
        chunk = self._make_chunk(
            "spd-lsa-2021",
            41,
            "die stärkung einer geschlechtersensiblen berufsorientierung",
        )
        semantic_quotes = {
            "SPD": {"wahlprogramm": [chunk], "parteiprogramm": []},
        }
        data = {
            "wahlprogrammScores": [{
                "fraktion": "SPD",
                "wahlprogramm": {
                    "score": 7,
                    "begründung": "...",
                    "zitate": [
                        {"text": "Wir Sozialdemokratinnen ächten Rechtsextremismus seit 1863",
                         "quelle": "SPD Sachsen-Anhalt 2021, S. 37"},
                        {"text": "die Stärkung einer geschlechtersensiblen Berufsorientierung",
                         "quelle": "SPD Sachsen-Anhalt 2021, S. 41"},
                    ],
                },
                "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []},
            }],
        }
        out = reconstruct_zitate(data, semantic_quotes)
        zitate = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"]
        # Both zitate survive — the non-matching one is marked unverified
        # instead of being dropped (hybrid approach).
        assert len(zitate) == 2
        # The hallucinated zitat is unverified
        halluziniert = [z for z in zitate if "Rechtsextremismus" in z["text"]]
        assert halluziniert[0]["verified"] is False
        # The genuine zitat is verified
        echt = [z for z in zitate if "geschlechtersensiblen" in z["text"]]
        assert echt[0]["verified"] is True

    def test_empty_semantic_quotes_is_noop(self):
        data = {"wahlprogrammScores": [{
            "fraktion": "X",
            "wahlprogramm": {"score": 5, "begründung": "x",
                             "zitate": [{"text": "abc def ghi jkl mno pqr", "quelle": "X"}]},
            "parteiprogramm": {"score": 0, "begründung": "x", "zitate": []},
        }]}
        out = reconstruct_zitate(data, {})
        # No chunks → no postprocessing applied; data passes through unchanged
        assert out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]["quelle"] == "X"

    def test_anchor_match_when_full_substring_misses(self):
        """LLM may slightly truncate a snippet — 5-word-anchor still binds."""
        chunk = self._make_chunk(
            "cdu-nrw-2022",
            24,
            "wir wollen interprofessionelle netzwerkstrukturen für kinderschutz fördern dazu werden wir stellen schaffen",
        )
        semantic_quotes = {"CDU": {"wahlprogramm": [chunk], "parteiprogramm": []}}
        data = {"wahlprogrammScores": [{
            "fraktion": "CDU",
            "wahlprogramm": {
                "score": 8,
                "begründung": "...",
                "zitate": [{
                    "text": "Wir wollen interprofessionelle Netzwerkstrukturen für Kinderschutz fördern",
                    "quelle": "CDU NRW Wahlprogramm 2022, S. 999",  # wrong page
                }],
            },
            "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []},
        }]}
        out = reconstruct_zitate(data, semantic_quotes)
        z = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]
        assert z["quelle"] == "CDU NRW Wahlprogramm 2022, S. 24"

    def test_find_chunk_for_text_short_needle_returns_none(self):
        chunk = self._make_chunk("x", 1, "egal was hier steht")
        assert find_chunk_for_text("ja", [chunk]) is None

    def test_find_chunk_for_text_handles_soft_hyphen(self):
        chunk = self._make_chunk(
            "bsw-bb-2024",
            27,
            "handys und tablets wertschätzung für lehrerinnen und lehrer",
        )
        # LLM-emitted text with the soft hyphen \xad mid-word, as PyMuPDF
        # would extract from a PDF line break. Written as an explicit
        # \u00ad escape so the invisible character cannot silently get lost
        # in an edit — without it this test passed trivially.
        text = "Handys und Tablets. Wert\u00adschätzung für Lehrerinnen und Lehrer"
        assert find_chunk_for_text(text, [chunk]) is chunk


# ─────────────────────────────────────────────────────────────────────────────
# _chunk_pdf_url + render_highlighted_page — Issue #47 PDF-Highlighting
# ─────────────────────────────────────────────────────────────────────────────


class TestChunkPdfUrl:
    """Verify the URL builder switches between the cite-endpoint (when
    chunk text is present) and the static fallback (Pre-#47 chunks).
    """

    def test_cite_url_when_text_present(self):
        chunk = {
            "programm_id": "gruene-grundsatz",
            "seite": 36,
            "text": "Plattformen müssen umfassend reguliert werden",
        }
        url = _chunk_pdf_url(chunk)
        assert url is not None
        assert url.startswith("/api/wahlprogramm-cite?")
        assert "pid=gruene-grundsatz" in url
        assert "seite=36" in url
        # URL-encoded query (urlencode/quote_plus uses + for space)
        assert "Plattformen" in url

    def test_static_fallback_when_no_text(self):
        chunk = {"programm_id": "fdp-mv-2021", "seite": 73}
        url = _chunk_pdf_url(chunk)
        assert url == "/static/referenzen/fdp-mv-2021.pdf#page=73"

    def test_unknown_programme_returns_none(self):
        chunk = {"programm_id": "fake-xx-9999", "seite": 1, "text": "x" * 50}
        assert _chunk_pdf_url(chunk) is None

    def test_url_truncates_long_text_to_200_chars(self):
        chunk = {
            "programm_id": "gruene-grundsatz",
            "seite": 36,
            "text": "A" * 1000,
        }
        url = _chunk_pdf_url(chunk)
        assert url is not None
        # The embedded text length is capped at 200 chars — otherwise
        # 500-char snippets bloat the assessment JSON.
        # The `q=` parameter must not contain 1000 'A's.
        assert "A" * 1000 not in url
        assert "A" * 200 in url


class TestRenderHighlightedPage:
    """Smoke test against a real election-programme PDF from the
    referenzen directory. Confirms that PyMuPDF produces a PDF with a
    highlight annotation. Skipped when the test PDF is not present in
    the repo.
    """

    @pytest.fixture
    def sample_pid(self):
        # We take a small, reliably present entry from PROGRAMME.
        # spd-grundsatz has been indexed since day 1 and is committed in the repo.
        from pathlib import Path
        from app.embeddings import PROGRAMME
        pid = "spd-grundsatz"
        info = PROGRAMME.get(pid)
        if not info:
            pytest.skip("PROGRAMME registry missing spd-grundsatz")
        path = Path(__file__).parent.parent / "app" / "static" / "referenzen" / info["pdf"]
        if not path.exists():
            pytest.skip(f"Test-PDF {path} nicht im Repo")
        return pid

    def test_unknown_pid_returns_none(self):
        pdf_bytes, page, highlighted = render_highlighted_page("fake-xx-9999", 1, "x")
        assert pdf_bytes is None

    def test_invalid_seite_returns_none(self, sample_pid):
        pdf_bytes, _, _ = render_highlighted_page(sample_pid, 99999, "x")
        assert pdf_bytes is None
        pdf_bytes2, _, _ = render_highlighted_page(sample_pid, 0, "x")
        assert pdf_bytes2 is None

    def test_renders_full_pdf_with_highlight(self, sample_pid):
        pdf_bytes, found_page, highlighted = render_highlighted_page(sample_pid, 1, "Soziale Gerechtigkeit")
        assert pdf_bytes is not None
        assert isinstance(pdf_bytes, bytes)
        assert pdf_bytes[:5] == b"%PDF-"
        assert found_page >= 1
        assert highlighted is True

    def test_returns_pdf_even_when_query_empty(self, sample_pid):
        pdf_bytes, _, highlighted = render_highlighted_page(sample_pid, 1, "")
        assert pdf_bytes is not None
        assert pdf_bytes[:5] == b"%PDF-"
        assert highlighted is False

    def test_returns_pdf_when_query_not_found_flagged_unhighlighted(self, sample_pid):
        pdf_bytes, _, highlighted = render_highlighted_page(
            sample_pid,
            1,
            "this exact phrase definitely does not exist anywhere",
        )
        assert pdf_bytes is not None
        assert pdf_bytes[:5] == b"%PDF-"
        assert highlighted is False


def test_format_quotes_truncates_long_chunks_at_500_chars():
    """Truncation test for format_quotes_for_prompt — this long sat as a
    method on TestRenderHighlightedPage (wrong class assignment caused by
    edit ordering), now module-level."""
    long_chunk = {
        "FDP": {
            "wahlprogramm": [
                {
                    "programm_id": "fdp-mv-2021",
                    "seite": 1,
                    "text": "A" * 1000,  # 1000 chars → should be truncated
                    "similarity": 0.7,
                }
            ],
        }
    }
    out = format_quotes_for_prompt(long_chunk)
    # Truncation marker
    assert "..." in out
    # Original chunk text 1000 chars not present in full
    assert "A" * 1000 not in out