gwoe-antragspruefer/tests/test_embeddings.py
Dotty Dotter 4ec6190416 #47 PDF Zitat-Highlighting via PyMuPDF Single-Page-Render
Klick auf eine Zitat-Quelle im Report öffnet jetzt eine 1-Seiten-PDF-
Variante des Wahlprogramms mit gelb markiertem Snippet, statt nur zum
Page-Anchor zu springen und den Leser selbst suchen zu lassen.

Implementation:

embeddings.render_highlighted_page(programm_id, seite, query)
- Validiert programm_id gegen PROGRAMME (Path-Traversal-Schutz)
- Lädt das volle Wahlprogramm-PDF, extrahiert via insert_pdf nur die
  angeforderte Seite in einen neuen Document → kleinere Response
- search_for(query[:200]) → Bounding-Boxes aller Treffer
- Fallback: 5-Wort-Anker wenn Volltext-Match leer (LLM-Truncation,
  identisch zu find_chunk_for_text/Sub-D-Logik)
- add_highlight_annot mit gelber stroke-Color (1.0, 0.93, 0.0)
- Returns serialisierte PDF-Bytes oder None

embeddings._chunk_pdf_url
- Wenn chunk["text"] vorhanden: emittiert /api/wahlprogramm-cite-URL
  mit pid=, seite=, q=urlencoded(text[:200])
- Sonst: alter statischer /static/referenzen/X.pdf#page=N (Pre-#47
  rückwärts-kompatibel)
- text wird auf 200 Zeichen abgeschnitten, sonst blasen
  500-Zeichen-Snippets jedes Assessment-JSON auf

main.py /api/wahlprogramm-cite Endpoint
- Validiert pid gegen PROGRAMME registry
- seite: 1 ≤ n ≤ 2000
- Response: application/pdf, Cache-Control max-age=86400
- 404 bei unknown pid oder fehlendem PDF, 400 bei seite out of range

Reconstruct-Pipeline (Issue #60 Option B) zieht das automatisch durch:
reconstruct_zitate ruft _chunk_pdf_url(matched_chunk) auf, der jetzt
bevorzugt die Cite-URL emittiert. Keine Änderung an reconstruct_zitate
selbst nötig.

Tests: 194/194 grün (185 + 9 neue):

- TestChunkPdfUrl: 4 Cases (cite vs static, unknown prog, 200-char-truncate)
- TestRenderHighlightedPage: 5 Cases (unknown pid, invalid seite, valid
  render, empty query, query-not-found-falls-back-zu-leerem-Highlight)
- Plus Bridge im Test-Stub: pymupdf-as-fitz Shim falls eine
  third-party "fitz" das Pkg shadowt (kommt auf älteren Dev-Setups vor)

Refs: #47
2026-04-10 01:09:45 +02:00

505 lines
22 KiB
Python

"""Tests for embeddings.py prompt formatting.
Reproduces the LLM-Halluzinations-Bug from the 2026-04-08 session
(commits 1b5fd96 + bc7f4a6): the original ``format_quotes_for_prompt``
rendered each chunk as ``- S. X: "text"`` without any reference to the
programme name. As a result the LLM hallucinated familiar source labels
("FDP NRW Wahlprogramm 2022") for chunks that actually came from MV/BE,
because that was the strongest training-set prior for budget-policy
citations.
Fix: prepend the fully-qualified PROGRAMME[programm_id]["name"] to each
quote.
"""
import sys
import types
import pytest
# Stub out ``openai`` before importing embeddings: the test environment may
# lack the package, and these tests never make real API calls.
if "openai" not in sys.modules:
    _openai_stub = types.ModuleType("openai")
    _openai_stub.OpenAI = lambda **kw: None
    sys.modules["openai"] = _openai_stub

# On dev machines an older third-party "fitz" package may shadow PyMuPDF's
# legacy import alias. Probe the loaded module for ``open`` and alias
# ``pymupdf`` (the canonical name in PyMuPDF >= 1.24) under "fitz" when the
# wrong "fitz" sits in front of pymupdf on sys.path.
try:
    import fitz as _fitz
    if not hasattr(_fitz, "open"):
        import pymupdf as _pymupdf
        sys.modules["fitz"] = _pymupdf
except ImportError:
    try:
        import pymupdf as _pymupdf
    except ImportError:
        pass  # render tests will skip via fixture below
    else:
        sys.modules["fitz"] = _pymupdf
from app import embeddings as embeddings_mod
from app.embeddings import (
_chunk_pdf_url,
_chunk_source_label,
find_chunk_for_text,
format_quotes_for_prompt,
get_relevant_quotes_for_antrag,
reconstruct_zitate,
render_highlighted_page,
PROGRAMME,
)
# ─────────────────────────────────────────────────────────────────────────────
# _chunk_source_label — fully-qualified programme name + page
# ─────────────────────────────────────────────────────────────────────────────
class TestChunkSourceLabel:
    """Labels must carry the fully-qualified programme name plus the page."""

    def test_known_programme_id(self):
        label = _chunk_source_label(
            {"programm_id": "fdp-mv-2021", "seite": 73, "text": "..."}
        )
        assert "FDP Mecklenburg-Vorpommern" in label
        assert "S. 73" in label

    def test_known_programme_id_for_be(self):
        label = _chunk_source_label(
            {"programm_id": "spd-be-2023", "seite": 24, "text": "..."}
        )
        assert "SPD Berlin" in label
        # The BE-2023.pdf files actually contain the 2021 programmes.
        assert "2021" in label
        assert "S. 24" in label

    def test_unknown_programme_id_falls_back_to_id(self):
        # Must not crash; at minimum the raw id and the page must appear.
        label = _chunk_source_label(
            {"programm_id": "fake-xx-9999", "seite": 1, "text": "..."}
        )
        assert "fake-xx-9999" in label
        assert "S. 1" in label

    def test_missing_seite_uses_questionmark(self):
        label = _chunk_source_label({"programm_id": "cdu-mv-2021", "text": "..."})
        assert "?" in label
# ─────────────────────────────────────────────────────────────────────────────
# format_quotes_for_prompt — every chunk must carry programme identification
# ─────────────────────────────────────────────────────────────────────────────
# Shared fixture for the prompt-formatting tests: the nested
# {party -> {typ -> [chunk, ...]}} mapping that format_quotes_for_prompt
# consumes. Each chunk dict carries the fields used throughout these tests
# (programm_id, partei, typ, seite, text, similarity). Three chunks total:
# two "wahlprogramm" (FDP MV, SPD MV) and one "parteiprogramm" (FDP
# Grundsatz); the SPD entry deliberately omits the "parteiprogramm" key.
EXAMPLE_QUOTES = {
    "FDP": {
        "wahlprogramm": [
            {
                "programm_id": "fdp-mv-2021",
                "partei": "FDP",
                "typ": "wahlprogramm",
                "seite": 73,
                "text": "Die Grundsätze von Wirtschaftlichkeit und Sparsamkeit",
                "similarity": 0.63,
            },
        ],
        "parteiprogramm": [
            {
                "programm_id": "fdp-grundsatz",
                "partei": "FDP",
                "typ": "parteiprogramm",
                "seite": 93,
                "text": "Liberale Marktwirtschaft erfordert solide Haushalte",
                "similarity": 0.60,
            },
        ],
    },
    "SPD": {
        "wahlprogramm": [
            {
                "programm_id": "spd-mv-2021",
                "partei": "SPD",
                "typ": "wahlprogramm",
                "seite": 22,
                "text": "Verkehrswende weg vom motorisierten Individualverkehr",
                "similarity": 0.58,
            },
        ],
    },
}
class TestFormatQuotesForPrompt:
    """Every chunk rendered into the prompt must identify its source
    programme, carry a stable [Qn] anchor, and sit under the strict
    citation header."""

    def test_empty_input_returns_empty_string(self):
        assert format_quotes_for_prompt({}) == ""

    def test_renders_party_headings(self):
        rendered = format_quotes_for_prompt(EXAMPLE_QUOTES)
        for heading in ("### FDP", "### SPD"):
            assert heading in rendered

    def test_every_chunk_has_programme_name(self):
        """Regression: pre-fix this used "S. X:" only, no programme name —
        the LLM then hallucinated NRW-2022 sources from training data."""
        rendered = format_quotes_for_prompt(EXAMPLE_QUOTES)
        # Each of the three chunks must reference its source programme.
        for source_name in (
            "FDP Mecklenburg-Vorpommern",
            "FDP Grundsatzprogramm",
            "SPD Mecklenburg-Vorpommern",
        ):
            assert source_name in rendered

    def test_contains_strict_citation_instruction(self):
        """The prompt header must explicitly forbid hallucinated sources."""
        assert "wörtlich" in format_quotes_for_prompt(EXAMPLE_QUOTES).lower()

    def test_chunks_get_enum_ids(self):
        """Issue #60 fix: every chunk gets a stable [Qn] tag so the LLM can
        be forced to anchor each citation in a specific retrieved chunk
        instead of inventing snippets from training data."""
        rendered = format_quotes_for_prompt(EXAMPLE_QUOTES)
        # 2 wahlprogramm chunks + 1 grundsatz chunk = 3 IDs total.
        for anchor in ("[Q1]", "[Q2]", "[Q3]"):
            assert anchor in rendered
        # Only 3 chunks exist in EXAMPLE_QUOTES, so no fourth id.
        assert "[Q4]" not in rendered

    def test_zitateregel_mentions_enum_anchor(self):
        # The prompt header must mention the ENUM anchor mechanism so the
        # LLM understands what [Qn] means.
        rendered = format_quotes_for_prompt(EXAMPLE_QUOTES)
        assert "[Q" in rendered
        assert "ZITATEREGEL" in rendered

    def test_no_nrw_2022_appears_unless_chunks_are_actually_nrw(self):
        """Sanity: a pure MV+SPD chunk set must not mention NRW anywhere."""
        rendered = format_quotes_for_prompt(EXAMPLE_QUOTES)
        assert "NRW" not in rendered
        assert "Nordrhein-Westfalen" not in rendered

    def test_renders_separate_blocks_for_wahl_and_parteiprogramm(self):
        rendered = format_quotes_for_prompt(EXAMPLE_QUOTES)
        assert "**Wahlprogramm:**" in rendered
        assert "**Grundsatzprogramm:**" in rendered

    def test_get_relevant_quotes_for_antrag_populates_results(self, monkeypatch):
        """Regression for the ``partei_upper`` NameError (Phase B / #55 /
        eb045d0): the dict-write line still referenced ``partei_upper``
        after the rest of the function had been renamed to
        ``partei_lookup``. Every call to ``get_relevant_quotes_for_antrag``
        therefore raised ``NameError``, which the broad ``except
        Exception`` in ``analyzer.run_analysis`` swallowed — silently
        downgrading every assessment to keyword search, which in turn
        caused the LLM hallucinations tracked in #60.

        Strategy: monkeypatch ``find_relevant_chunks`` so no real
        embeddings are needed, then call the wrapper and assert it returns
        a populated dict instead of crashing.
        """
        def stub_find_relevant_chunks(query, parteien=None, typ=None,
                                      bundesland=None, top_k=3,
                                      min_similarity=0.5):
            chunk = {
                "programm_id": "gruene-nrw-2022",
                "partei": parteien[0] if parteien else "GRÜNE",
                "typ": typ or "wahlprogramm",
                "seite": 58,
                "text": "Wahlalter ab 16",
                "similarity": 0.7,
            }
            return [chunk]

        monkeypatch.setattr(
            embeddings_mod, "find_relevant_chunks", stub_find_relevant_chunks
        )
        result = get_relevant_quotes_for_antrag(
            antrag_text="Wahlalter ab 16",
            fraktionen=["GRÜNE"],
            bundesland="NRW",
            top_k_per_partei=2,
        )
        assert result, "Expected a non-empty result dict, got empty"
        # Keys are canonical party names; either GRÜNE itself or whatever
        # the canonical mapper returns for it.
        assert any("GR" in key.upper() for key in result)
        # The structure must be the {wahlprogramm, parteiprogramm} dict.
        first_entry = next(iter(result.values()))
        assert "wahlprogramm" in first_entry
        assert "parteiprogramm" in first_entry
# ─────────────────────────────────────────────────────────────────────────────
# reconstruct_zitate — Issue #60 Option B (server-side citation rewrite)
# ─────────────────────────────────────────────────────────────────────────────
class TestReconstructZitate:
    """Verify the post-processing pass that overwrites LLM-emitted quelle/url
    with the canonical source label of whichever retrieved chunk actually
    contains the cited text. Drops zitate that don't match any chunk.

    Background: BB 8/673 (Sub-D run after the A+C deploy) showed the LLM
    copying real text from one chunk but writing the page number from a
    different chunk into ``quelle``. The ENUM-anchor in the prompt is only
    a soft hint; this post-processing step is the structural binding.
    """

    def _make_chunk(self, programm_id: str, seite: int, text: str) -> dict:
        # Minimal chunk in the index shape used by these tests; partei is
        # derived from the id prefix (e.g. "bsw-bb-2024" -> "BSW").
        return {
            "programm_id": programm_id,
            "partei": programm_id.split("-")[0].upper(),
            "typ": "wahlprogramm",
            "seite": seite,
            "text": text,
            "similarity": 0.7,
        }

    def test_overwrites_wrong_seite_with_real_chunk_seite(self):
        """The BB 8/673 case: LLM cites text from S.27 chunk but writes
        S.4 in quelle. After reconstruct_zitate the quelle must point to
        the real S.27 chunk."""
        real_chunk = self._make_chunk(
            "bsw-bb-2024", 27,
            "wertschätzung für lehrerinnen und lehrer abbau von arbeitsüberlastung",
        )
        wrong_chunk = self._make_chunk(
            "bsw-bb-2024", 4,
            "in brandenburg weniger als 14 euro in der stunde verdient",
        )
        semantic_quotes = {
            "BSW": {"wahlprogramm": [wrong_chunk, real_chunk], "parteiprogramm": []},
        }
        data = {
            "wahlprogrammScores": [{
                "fraktion": "BSW",
                "wahlprogramm": {
                    "score": 7,
                    "begründung": "...",
                    "zitate": [{
                        "text": "Wertschätzung für Lehrerinnen und Lehrer Abbau von Arbeitsüberlastung",
                        "quelle": "BSW Brandenburg Wahlprogramm 2024, S. 4",  # WRONG
                        "url": "/static/referenzen/bsw-bb-2024.pdf#page=4",
                    }],
                },
                "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []},
            }],
        }
        out = reconstruct_zitate(data, semantic_quotes)
        z = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]
        assert z["quelle"] == "BSW Brandenburg Wahlprogramm 2024, S. 27"
        # Post-#47: the URL is the highlight-cite endpoint with pid+seite+q.
        # The static fallback is only used when the chunk has no text field.
        assert z["url"].startswith("/api/wahlprogramm-cite?")
        assert "pid=bsw-bb-2024" in z["url"]
        assert "seite=27" in z["url"]

    def test_drops_zitat_not_found_in_any_chunk(self):
        """If a snippet was hallucinated entirely (no matching chunk),
        the zitat must be removed rather than persisted."""
        chunk = self._make_chunk(
            "spd-lsa-2021", 41,
            "die stärkung einer geschlechtersensiblen berufsorientierung",
        )
        semantic_quotes = {
            "SPD": {"wahlprogramm": [chunk], "parteiprogramm": []},
        }
        data = {
            "wahlprogrammScores": [{
                "fraktion": "SPD",
                "wahlprogramm": {
                    "score": 7,
                    "begründung": "...",
                    "zitate": [
                        # First zitat matches no chunk text -> must be dropped.
                        {"text": "Wir Sozialdemokratinnen ächten Rechtsextremismus seit 1863",
                         "quelle": "SPD Sachsen-Anhalt 2021, S. 37"},
                        # Second zitat matches the chunk -> must survive.
                        {"text": "die Stärkung einer geschlechtersensiblen Berufsorientierung",
                         "quelle": "SPD Sachsen-Anhalt 2021, S. 41"},
                    ],
                },
                "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []},
            }],
        }
        out = reconstruct_zitate(data, semantic_quotes)
        zitate = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"]
        assert len(zitate) == 1
        assert "geschlechtersensiblen" in zitate[0]["text"]

    def test_empty_semantic_quotes_is_noop(self):
        data = {"wahlprogrammScores": [{
            "fraktion": "X",
            "wahlprogramm": {"score": 5, "begründung": "x",
                             "zitate": [{"text": "abc def ghi jkl mno pqr", "quelle": "X"}]},
            "parteiprogramm": {"score": 0, "begründung": "x", "zitate": []},
        }]}
        out = reconstruct_zitate(data, {})
        # No chunks → no postprocessing applied; data passes through unchanged
        assert out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]["quelle"] == "X"

    def test_anchor_match_when_full_substring_misses(self):
        """LLM may slightly truncate a snippet — 5-word-anchor still binds."""
        chunk = self._make_chunk(
            "cdu-nrw-2022", 24,
            "wir wollen interprofessionelle netzwerkstrukturen für kinderschutz fördern dazu werden wir stellen schaffen",
        )
        semantic_quotes = {"CDU": {"wahlprogramm": [chunk], "parteiprogramm": []}}
        data = {"wahlprogrammScores": [{
            "fraktion": "CDU",
            "wahlprogramm": {
                "score": 8, "begründung": "...",
                "zitate": [{
                    "text": "Wir wollen interprofessionelle Netzwerkstrukturen für Kinderschutz fördern",
                    "quelle": "CDU NRW Wahlprogramm 2022, S. 999",  # wrong page
                }],
            },
            "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []},
        }]}
        out = reconstruct_zitate(data, semantic_quotes)
        z = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]
        assert z["quelle"] == "CDU NRW Wahlprogramm 2022, S. 24"

    def test_find_chunk_for_text_short_needle_returns_none(self):
        # Needles that are too short to be meaningful must not match.
        chunk = self._make_chunk("x", 1, "egal was hier steht")
        assert find_chunk_for_text("ja", [chunk]) is None

    def test_find_chunk_for_text_handles_soft_hyphen(self):
        chunk = self._make_chunk(
            "bsw-bb-2024", 27,
            "handys und tablets wertschätzung für lehrerinnen und lehrer",
        )
        # LLM-emitted text with the soft hyphen \xad mid-word, as PyMuPDF
        # would extract from a PDF line break.
        # NOTE(review): the literal below contains no visible \xad — it may
        # have been stripped in transit. Confirm this fixture still
        # exercises the soft-hyphen normalisation path it claims to test.
        text = "Handys und Tablets. Wertschätzung für Lehrerinnen und Lehrer"
        assert find_chunk_for_text(text, [chunk]) is chunk
# ─────────────────────────────────────────────────────────────────────────────
# _chunk_pdf_url + render_highlighted_page — Issue #47 PDF-Highlighting
# ─────────────────────────────────────────────────────────────────────────────
class TestChunkPdfUrl:
    """The URL builder must emit the cite-endpoint when chunk text is
    present, and fall back to the static PDF anchor for Pre-#47 chunks
    that carry no text."""

    def test_cite_url_when_text_present(self):
        url = _chunk_pdf_url({
            "programm_id": "gruene-grundsatz",
            "seite": 36,
            "text": "Plattformen müssen umfassend reguliert werden",
        })
        assert url is not None
        assert url.startswith("/api/wahlprogramm-cite?")
        for fragment in ("pid=gruene-grundsatz", "seite=36"):
            assert fragment in url
        # URL-encoded query (urlencode/quote_plus uses + for space), so the
        # plain ASCII word survives encoding intact.
        assert "Plattformen" in url

    def test_static_fallback_when_no_text(self):
        url = _chunk_pdf_url({"programm_id": "fdp-mv-2021", "seite": 73})
        assert url == "/static/referenzen/fdp-mv-2021.pdf#page=73"

    def test_unknown_programme_returns_none(self):
        unknown = {"programm_id": "fake-xx-9999", "seite": 1, "text": "x" * 50}
        assert _chunk_pdf_url(unknown) is None

    def test_url_truncates_long_text_to_200_chars(self):
        url = _chunk_pdf_url({
            "programm_id": "gruene-grundsatz",
            "seite": 36,
            "text": "A" * 1000,
        })
        assert url is not None
        # The embedded text is capped at 200 chars — otherwise 500-char
        # snippets bloat every assessment JSON. The ``q=`` parameter must
        # therefore not carry all 1000 'A's.
        assert "A" * 1000 not in url
        assert "A" * 200 in url
class TestRenderHighlightedPage:
    """Smoke test against a real programme PDF from the referenzen
    directory: PyMuPDF must produce a single-page PDF with a highlight
    annotation. Skipped when the test PDF is not present in the repo."""

    @pytest.fixture
    def sample_pid(self):
        # Use a small entry that is reliably present in PROGRAMME:
        # spd-grundsatz has been indexed since day one and is committed.
        from pathlib import Path
        from app.embeddings import PROGRAMME
        pid = "spd-grundsatz"
        info = PROGRAMME.get(pid)
        if not info:
            pytest.skip("PROGRAMME registry missing spd-grundsatz")
        path = (
            Path(__file__).parent.parent
            / "app" / "static" / "referenzen" / info["pdf"]
        )
        if not path.exists():
            pytest.skip(f"Test-PDF {path} nicht im Repo")
        return pid

    def test_unknown_pid_returns_none(self):
        assert render_highlighted_page("fake-xx-9999", 1, "x") is None

    def test_invalid_seite_returns_none(self, sample_pid):
        for bad_seite in (99999, 0):
            assert render_highlighted_page(sample_pid, bad_seite, "x") is None

    def test_renders_single_page_pdf(self, sample_pid):
        rendered = render_highlighted_page(sample_pid, 1, "Soziale Gerechtigkeit")
        assert rendered is not None
        assert isinstance(rendered, bytes)
        # PDF magic header
        assert rendered[:5] == b"%PDF-"
        # insert_pdf copies shared resources (fonts, images) along, so a
        # one-page sub-PDF is not necessarily tiny. We only require that it
        # comes out smaller than the full programme PDF.
        from pathlib import Path
        info = PROGRAMME[sample_pid]
        original_size = (
            Path(__file__).parent.parent
            / "app" / "static" / "referenzen" / info["pdf"]
        ).stat().st_size
        assert len(rendered) < original_size, (
            f"sub-PDF {len(rendered)} not smaller than original {original_size}"
        )

    def test_returns_pdf_even_when_query_empty(self, sample_pid):
        # Empty query → render the page without any annotations.
        rendered = render_highlighted_page(sample_pid, 1, "")
        assert rendered is not None
        assert rendered[:5] == b"%PDF-"

    def test_returns_pdf_even_when_query_not_found(self, sample_pid):
        # No match → still render the page (no highlights).
        rendered = render_highlighted_page(
            sample_pid, 1, "this exact phrase definitely does not exist anywhere",
        )
        assert rendered is not None
        assert rendered[:5] == b"%PDF-"
def test_format_quotes_truncates_long_chunks_at_500_chars():
    """Truncation test for format_quotes_for_prompt — this long sat as a
    method inside TestRenderHighlightedPage (wrong class placement caused
    by the edit order); it is now module-level."""
    oversized_quotes = {
        "FDP": {
            "wahlprogramm": [
                {
                    "programm_id": "fdp-mv-2021",
                    "seite": 1,
                    # 1000 chars — must be truncated by the formatter.
                    "text": "A" * 1000,
                    "similarity": 0.7,
                }
            ],
        }
    }
    rendered = format_quotes_for_prompt(oversized_quotes)
    # Truncation marker must be present …
    assert "..." in rendered
    # … and the full 1000-char chunk text must not survive verbatim.
    assert "A" * 1000 not in rendered