"""Tests for embeddings.py prompt formatting. Reproduces the LLM-Halluzinations-Bug from the 2026-04-08 session (commits 1b5fd96 + bc7f4a6): the original ``format_quotes_for_prompt`` rendered each chunk as ``- S. X: "text"`` without any reference to the programme name. As a result the LLM hallucinated familiar source labels ("FDP NRW Wahlprogramm 2022") for chunks that actually came from MV/BE, because that was the strongest training-set prior for budget-policy citations. Fix: prepend the fully-qualified PROGRAMME[programm_id]["name"] to each quote. """ import sys import types # Stub openai before importing embeddings, since the test environment may # not have it installed and we don't actually need to make API calls. if "openai" not in sys.modules: openai_stub = types.ModuleType("openai") openai_stub.OpenAI = lambda **kw: None sys.modules["openai"] = openai_stub from app import embeddings as embeddings_mod from app.embeddings import ( _chunk_source_label, format_quotes_for_prompt, get_relevant_quotes_for_antrag, ) # ───────────────────────────────────────────────────────────────────────────── # _chunk_source_label — fully-qualified programme name + page # ───────────────────────────────────────────────────────────────────────────── class TestChunkSourceLabel: def test_known_programme_id(self): chunk = {"programm_id": "fdp-mv-2021", "seite": 73, "text": "..."} label = _chunk_source_label(chunk) assert "FDP Mecklenburg-Vorpommern" in label assert "S. 73" in label def test_known_programme_id_for_be(self): chunk = {"programm_id": "spd-be-2023", "seite": 24, "text": "..."} label = _chunk_source_label(chunk) assert "SPD Berlin" in label assert "2021" in label # the BE-2023.pdf files contain 2021er programmes assert "S. 24" in label def test_unknown_programme_id_falls_back_to_id(self): chunk = {"programm_id": "fake-xx-9999", "seite": 1, "text": "..."} label = _chunk_source_label(chunk) # Should not crash, should at least include the id and the page assert "fake-xx-9999" in label assert "S. 1" in label def test_missing_seite_uses_questionmark(self): chunk = {"programm_id": "cdu-mv-2021", "text": "..."} label = _chunk_source_label(chunk) assert "?" in label # ───────────────────────────────────────────────────────────────────────────── # format_quotes_for_prompt — every chunk must carry programme identification # ───────────────────────────────────────────────────────────────────────────── EXAMPLE_QUOTES = { "FDP": { "wahlprogramm": [ { "programm_id": "fdp-mv-2021", "partei": "FDP", "typ": "wahlprogramm", "seite": 73, "text": "Die Grundsätze von Wirtschaftlichkeit und Sparsamkeit", "similarity": 0.63, }, ], "parteiprogramm": [ { "programm_id": "fdp-grundsatz", "partei": "FDP", "typ": "parteiprogramm", "seite": 93, "text": "Liberale Marktwirtschaft erfordert solide Haushalte", "similarity": 0.60, }, ], }, "SPD": { "wahlprogramm": [ { "programm_id": "spd-mv-2021", "partei": "SPD", "typ": "wahlprogramm", "seite": 22, "text": "Verkehrswende weg vom motorisierten Individualverkehr", "similarity": 0.58, }, ], }, } class TestFormatQuotesForPrompt: def test_empty_input_returns_empty_string(self): assert format_quotes_for_prompt({}) == "" def test_renders_party_headings(self): out = format_quotes_for_prompt(EXAMPLE_QUOTES) assert "### FDP" in out assert "### SPD" in out def test_every_chunk_has_programme_name(self): """Regression: pre-fix this used "S. X:" only, no programme name — the LLM then hallucinated NRW-2022 sources from training data.""" out = format_quotes_for_prompt(EXAMPLE_QUOTES) # Each of the three chunks must reference its source programme assert "FDP Mecklenburg-Vorpommern" in out assert "FDP Grundsatzprogramm" in out assert "SPD Mecklenburg-Vorpommern" in out def test_contains_strict_citation_instruction(self): """The prompt header must explicitly forbid hallucinated sources.""" out = format_quotes_for_prompt(EXAMPLE_QUOTES) assert "wörtlich" in out.lower() def test_chunks_get_enum_ids(self): """Issue #60 fix: each chunk must be tagged with a stable [Qn] id so the LLM can be forced to anchor every citation in a specific retrieved chunk instead of inventing snippets from training data. """ out = format_quotes_for_prompt(EXAMPLE_QUOTES) # 2 wahlprogramm chunks + 1 grundsatz chunk = 3 IDs total assert "[Q1]" in out assert "[Q2]" in out assert "[Q3]" in out assert "[Q4]" not in out # only 3 chunks in EXAMPLE_QUOTES def test_zitateregel_mentions_enum_anchor(self): out = format_quotes_for_prompt(EXAMPLE_QUOTES) # The prompt header must mention the ENUM anchor mechanism so # the LLM understands what [Qn] means. assert "[Q" in out assert "ZITATEREGEL" in out def test_no_nrw_2022_appears_unless_chunks_are_actually_nrw(self): """Sanity: a pure MV+SPD chunk set must not mention NRW anywhere.""" out = format_quotes_for_prompt(EXAMPLE_QUOTES) assert "NRW" not in out assert "Nordrhein-Westfalen" not in out def test_renders_separate_blocks_for_wahl_and_parteiprogramm(self): out = format_quotes_for_prompt(EXAMPLE_QUOTES) assert "**Wahlprogramm:**" in out assert "**Grundsatzprogramm:**" in out def test_get_relevant_quotes_for_antrag_populates_results(self, monkeypatch): """Regression for the partei_upper NameError (Phase B / #55 / eb045d0): The dict-write line still referenced ``partei_upper`` after the rest of the function had been renamed to ``partei_lookup``. The result was that ``get_relevant_quotes_for_antrag`` raised ``NameError`` on every call, was silently swallowed by the ``except Exception`` in ``analyzer.run_analysis``, and silently downgraded *every* assessment to keyword search — which then caused the LLM hallucinations tracked in #60. Test strategy: monkeypatch ``find_relevant_chunks`` so we don't need real embeddings, then call the wrapper and assert it actually returns a populated dict instead of crashing. """ def fake_find_relevant_chunks(query, parteien=None, typ=None, bundesland=None, top_k=3, min_similarity=0.5): return [{ "programm_id": "gruene-nrw-2022", "partei": parteien[0] if parteien else "GRÜNE", "typ": typ or "wahlprogramm", "seite": 58, "text": "Wahlalter ab 16", "similarity": 0.7, }] monkeypatch.setattr(embeddings_mod, "find_relevant_chunks", fake_find_relevant_chunks) result = get_relevant_quotes_for_antrag( antrag_text="Wahlalter ab 16", fraktionen=["GRÜNE"], bundesland="NRW", top_k_per_partei=2, ) assert result, "Expected a non-empty result dict, got empty" # The keys are canonical party names; either GRÜNE itself or # whatever the canonical mapper returns for it. assert any("GR" in k.upper() for k in result.keys()) # And the structure must be the {wahlprogramm, parteiprogramm} dict first = next(iter(result.values())) assert "wahlprogramm" in first assert "parteiprogramm" in first def test_text_truncated_at_500_chars(self): long_chunk = { "FDP": { "wahlprogramm": [ { "programm_id": "fdp-mv-2021", "seite": 1, "text": "A" * 1000, # 1000 chars → should be truncated "similarity": 0.7, } ], } } out = format_quotes_for_prompt(long_chunk) # Truncation marker assert "..." in out # Original chunk text 1000 chars not present in full assert "A" * 1000 not in out