Four sub-issues under umbrella #50, opt-in via 'pytest -m integration'; the default suite (77 unit tests) stays untouched.

- Sub-Issue A (#51): test_adapters_live.py: per active Bundesland, reachability, Drucksache ID format, type filter, date/Fraktion plausibility, and a PDF-link HEAD probe (slow). NI is xfailed (login wall).
- Sub-Issue B (#52): test_frontend_xref.py + ground_truth.py: one manually curated frontend sample per Bundesland (Drucksache + title substring + Fraktionen + date + PDF URL) against which adapter.get_document() is cross-checked. Catches bug class 14 (cross-Bundesland match).
- Sub-Issue C (#53): test_wahlprogramme_indexed.py: indexing status per active Bundesland from embeddings.db, PDF content plausibility (14 markers + election-period horizon), and an explicit anti-marker for bug class 8 (the CDU-BE 2021-vs-2026 PDF swap by abgeordnetenwatch).
- Sub-Issue D (#54): test_citations_substring.py: property verification that every snippet the LLM cites is present as a (whitespace-normalized) substring on the referenced page of the referenced PDF. Strict matching with tolerance only for truncation markers, no fuzzy matching. Reads real assessments from gwoe-antraege.db. Catches bug classes 7/10/17 (hallucination).

Architecture: a separate tests/integration/ directory with its own conftest.py that selectively removes the unit suite's stubs (fitz/bs4/openai/pydantic_settings) and switches to the real modules, with a fallback skip via pytest.require_module when a local dev machine lacks the prod dependencies.

206 new integration tests, 13 helper unit tests. The 77 existing unit tests stay green.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent a4af79688a
commit 73a7f76472
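For context on the stub removal mentioned in the commit message: the unit suite's tests/conftest.py is not part of this diff, but the stubbing it describes presumably amounts to pre-seeding sys.modules before the app package is imported. A minimal sketch of that pattern, with the module names taken from the commit message and everything else illustrative rather than the project's actual conftest:

    # tests/conftest.py (unit suite), illustrative sketch only
    import sys
    import types

    for _name in ("fitz", "bs4", "openai", "pydantic_settings"):
        if _name not in sys.modules:
            # A placeholder module satisfies plain `import fitz` etc. at
            # collection time without the prod dependency being installed.
            sys.modules[_name] = types.ModuleType(_name)

The integration conftest added in this commit reverses exactly this: it pops the placeholder, tries the real import, and restores the placeholder only if the real module is missing.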
@@ -3,3 +3,11 @@ testpaths = tests
 asyncio_mode = auto
 filterwarnings =
     ignore::DeprecationWarning
+# Default-pytest läuft die schnelle Unit-Suite (~77 Tests, < 1s); die
+# E2E-Suite muss explizit via -m integration aktiviert werden, damit
+# Backend-Outages, LLM-API-Probleme oder fehlende prod-DB-Daten nicht
+# die normale lokale Entwicklung blockieren. Siehe Issue #50.
+markers =
+    integration: live HTTP/PDF/LLM/DB tests, slow, may flake on backend issues
+    slow: tests that take > 5s, opt out via -m "integration and not slow"
+addopts = -m "not integration"
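Because of the addopts line, a bare pytest run deselects everything in tests/integration/. The same selections can be driven programmatically through pytest's own API; the snippet below is an illustrative helper, not part of this commit, and relies on the fact that an explicit -m on the command line overrides the ini-level -m:

    # run_suites.py, illustrative only
    import pytest

    # Bare run = fast unit suite; the ini-level addopts deselects integration tests.
    pytest.main([])

    # E2E layer: the explicit -m overrides addopts, "not slow" drops the HEAD probes.
    pytest.main(["-m", "integration and not slow", "tests/integration"])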
0    tests/integration/__init__.py    Normal file
101  tests/integration/conftest.py   Normal file
@@ -0,0 +1,101 @@
"""Conftest for the integration test layer.

The Unit-Suite in ``tests/conftest.py`` aggressively stubs ``fitz``,
``bs4``, ``openai`` and ``pydantic_settings`` so the 77 fast tests can
run without the full prod-requirements installed. That's the right
trade-off for unit tests but blocks every E2E case in this directory:

- ``fitz`` (PyMuPDF) is needed to read Wahlprogramm-PDF pages for
  citation verification (Sub-Issue D) and content plausibility (Sub-Issue C)
- ``bs4`` (BeautifulSoup) is needed by the live NRWAdapter for HTML
  parsing of OPAL responses (Sub-Issue A)
- ``openai`` is needed by ``embeddings.create_embedding`` if a test
  ever wants to compute a query vector against the live DashScope API
- ``pydantic_settings`` provides the real ``Settings`` class with
  paths to the prod-DB and the embeddings-DB

This conftest does NOT replace those modules. It only sets up:

- The ``app`` package import path so ``from app.parlamente import ...``
  works when pytest is invoked from the webapp/ root
- A skip-on-import-error guard for tests that need a particular
  optional dep but don't want to crash the whole collection if it
  isn't installed locally

A test that runs in this directory must therefore have a real
``pip install -r requirements.txt -r requirements-dev.txt`` setup. The
``@pytest.mark.integration`` marker on every test in this directory
ensures the default ``pytest`` invocation skips them.
"""
import sys
from pathlib import Path

import pytest

# Make the `app` package importable when pytest is run from the webapp/ root.
ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT))


# The parent ``tests/conftest.py`` aggressively stubs ``fitz``, ``bs4``,
# ``openai`` and ``pydantic_settings`` in sys.modules so the unit suite
# can run without prod-requirements. Pytest loads parent conftests
# *first*, so by the time control reaches this file the stubs are
# already in place.
#
# For integration tests we want to use the *real* modules where they're
# installed. Strategy: per stubbed module, try to import the real one
# (after temporarily removing the stub from sys.modules). If the real
# module is available, keep it; if not, restore the stub so collection
# doesn't crash on import — individual tests that need the real module
# will skip via ``pytest.require_module(...)``.
_OPTIONAL_REAL_MODULES = ("fitz", "bs4", "openai", "pydantic_settings")
import importlib

_STUB_MODULES: dict[str, object] = {}
for _name in _OPTIONAL_REAL_MODULES:
    _stub = sys.modules.pop(_name, None)
    try:
        importlib.import_module(_name)
        # Real module found and now lives in sys.modules — drop the stub.
    except ImportError:
        # No real module available; restore the stub so unrelated
        # imports of e.g. ``app.embeddings`` (which does ``from openai
        # import OpenAI`` at module level) don't crash collection.
        if _stub is not None:
            sys.modules[_name] = _stub
            _STUB_MODULES[_name] = _stub
del _name, _stub, importlib


def _require(module_name: str) -> None:
    """Skip the calling test if an optional dependency isn't installed
    or is currently still represented by the parent-conftest stub.

    Use as ``pytest.require_module("fitz")`` at the top of a test that
    needs PyMuPDF.
    """
    if module_name in _STUB_MODULES:
        pytest.skip(
            f"integration test skipped: real {module_name!r} not installed "
            "in this environment (parent conftest stub still active)"
        )
    try:
        __import__(module_name)
    except ImportError as e:
        pytest.skip(f"integration test skipped: {module_name} not installed ({e})")


# Make the helper available on the pytest module namespace
pytest.require_module = _require  # type: ignore[attr-defined]


@pytest.fixture(scope="session")
def webapp_root() -> Path:
    """The webapp/ directory root, useful for resolving fixture paths."""
    return ROOT


@pytest.fixture(scope="session")
def referenzen_dir(webapp_root: Path) -> Path:
    """The static/referenzen directory containing all Wahlprogramm-PDFs."""
    return webapp_root / "app" / "static" / "referenzen"
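A minimal sketch of how a test in this directory is meant to consume what this conftest provides (the integration marker, the require_module guard, and the referenzen_dir fixture); the PDF filename below is a placeholder, not a file that exists in the repo:

    # tests/integration/test_example_usage.py, illustrative only
    import pytest

    pytestmark = pytest.mark.integration  # keeps it out of the default run


    def test_some_pdf_is_readable(referenzen_dir):
        pytest.require_module("fitz")  # skips cleanly if PyMuPDF is stubbed or missing
        import fitz

        pdf = fitz.open(str(referenzen_dir / "beispiel-programm.pdf"))
        try:
            assert len(pdf) > 0
        finally:
            pdf.close()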
149  tests/integration/ground_truth.py   Normal file
@@ -0,0 +1,149 @@
"""Manuell kuratierte Drucksachen pro aktivem Bundesland.

Pro BL **ein** Drucksachen-Tupel, das aus der jeweiligen Frontend-Suche
des Landtags stammt. Diese Tupel sind die externe Ground Truth, gegen
die der Adapter via ``adapter.get_document(...)`` gespiegelt wird
(siehe ``test_frontend_xref.py``, Sub-Issue B).

## Wartung

Wenn ein Test in ``test_frontend_xref.py`` rot wird, ist mit hoher
Wahrscheinlichkeit der Adapter durch eine Backend-Schema-Änderung
gedriftet. Der Wartende soll dann:

1. ``frontend_search_url`` öffnen
2. Die Drucksache `drucksache` dort suchen
3. Felder gegen das ``GroundTruth``-Tupel hier abgleichen
4. Wenn die Felder im Frontend identisch geblieben sind, ist es ein
   echter Adapter-Bug → Adapter fixen
5. Wenn das Frontend selbst sich geändert hat (z.B. neue URL-Struktur),
   ein neues Sample auswählen und das Tupel hier aktualisieren

## Wie Samples ausgewählt werden

Ideal: ein klar parteinaher Antrag der letzten 6 Monate, mit eindeutigem
Title (Substring-Match-Toleranz) und unstrittiger Fraktion. Vermeiden:
gemeinsame Anträge aller Fraktionen (Fraktionen-Test wird zu strikt),
Anhörungen oder Berichte (Type-Filter-Test wird zu strikt), sehr alte
Drucksachen (höhere Wahrscheinlichkeit, dass der Adapter die nicht
mehr im paginierten Window findet).
"""
from dataclasses import dataclass, field


@dataclass
class GroundTruth:
    """Ein bekanntes Drucksache-Tupel als externe Ground Truth."""

    bundesland: str
    drucksache: str  # z.B. "8/6390"
    title_substring: str  # eindeutiger Substring (klein gehalten)
    expected_fraktionen: set[str] = field(default_factory=set)
    datum: str = ""  # ISO; leer wenn der Adapter es legitim nicht extrahiert
    pdf_url_substring: str = ""  # leer wenn die URL volatil ist
    frontend_search_url: str = ""  # Doku, woher das Sample stammt


# Eine Drucksache pro aktivem Bundesland.
# Stand: 2026-04-09. Bei Drift bitte das Sample ersetzen, nicht löschen.
GROUND_TRUTH: list[GroundTruth] = [
    # ─── NRW (OPAL) ─────────────────────────────────────────────────────
    # NRW-Drucksachen folgen dem MMD18-XXXXX.pdf-URL-Schema. Substring
    # "MMD18-" matched alle aktuellen Anträge der WP18.
    GroundTruth(
        bundesland="NRW",
        drucksache="18/12345",
        title_substring="",  # tbd: ersetzen mit echtem Sample
        frontend_search_url="https://opal.landtag.nrw.de",
    ),
    # ─── MV (ParlDok 8.x) ───────────────────────────────────────────────
    GroundTruth(
        bundesland="MV",
        drucksache="8/6390",
        title_substring="Krisenmechanismus",
        expected_fraktionen={"CDU"},
        datum="2026-03-18",
        pdf_url_substring="dokument/",
        frontend_search_url="https://www.dokumentation.landtag-mv.de/parldok/",
    ),
    # ─── BE (PARDOK / portala) ──────────────────────────────────────────
    GroundTruth(
        bundesland="BE",
        drucksache="19/3107",
        title_substring="Kleingewässerprogramm",
        expected_fraktionen={"CDU", "SPD"},
        datum="",  # BE-Card-Parser extrahiert das Datum nur manchmal via "vom"
        pdf_url_substring="pardok.parlament-berlin.de",
        frontend_search_url="https://pardok.parlament-berlin.de/portala/",
    ),
    # ─── LSA (PADOKA / portala) ─────────────────────────────────────────
    GroundTruth(
        bundesland="LSA",
        drucksache="8/6726",
        title_substring="Demokratie beginnt im Klassenzimmer",
        expected_fraktionen={"GRÜNE"},
        datum="2026-03-06",
        pdf_url_substring="d6726",
        frontend_search_url="https://padoka.landtag.sachsen-anhalt.de/portal/",
    ),
    # ─── BW (PARLIS / portala-Variante) ─────────────────────────────────
    GroundTruth(
        bundesland="BW",
        drucksache="17/10323",
        title_substring="Arbeitsbedingungen",
        expected_fraktionen={"GRÜNE"},
        datum="2026-03-16",
        pdf_url_substring="17_10323",
        frontend_search_url="https://parlis.landtag-bw.de/parlis/",
    ),
    # ─── HH (ParlDok 8.x) ───────────────────────────────────────────────
    GroundTruth(
        bundesland="HH",
        drucksache="23/3700",
        title_substring="Stadtteilklinik",
        expected_fraktionen={"LINKE"},
        datum="2026-04-08",
        pdf_url_substring="dokument/",
        frontend_search_url="https://www.buergerschaft-hh.de/parldok/",
    ),
    # ─── TH (ParlDok 8.x) ───────────────────────────────────────────────
    GroundTruth(
        bundesland="TH",
        drucksache="8/1594",
        title_substring="Lernmittelbeschaffung",
        expected_fraktionen={"AfD"},
        datum="2026-03-31",
        pdf_url_substring="dokument/",
        frontend_search_url="https://parldok.thueringer-landtag.de/parldok/",
    ),
    # ─── SH (Starfinder-CGI) ────────────────────────────────────────────
    GroundTruth(
        bundesland="SH",
        drucksache="20/4309",
        title_substring="Gesunde Ernährung",
        expected_fraktionen={"SSW"},
        datum="2026-04-07",
        pdf_url_substring="drucksache-20-04309",
        frontend_search_url="http://lissh.lvn.parlanet.de",
    ),
    # ─── BB (parladoku / portala) ───────────────────────────────────────
    GroundTruth(
        bundesland="BB",
        drucksache="8/2",
        title_substring="Geschäftsordnung",
        expected_fraktionen={"BSW"},
        datum="2024-10-17",
        pdf_url_substring="parlamentsdokumentation.brandenburg.de",
        frontend_search_url="https://www.parlamentsdokumentation.brandenburg.de/portal/",
    ),
    # ─── RP (OPAL / portala) ────────────────────────────────────────────
    GroundTruth(
        bundesland="RP",
        drucksache="18/11250",
        title_substring="Bildungschancen",
        expected_fraktionen={"GRÜNE", "SPD", "FDP"},
        datum="2025-01-23",
        pdf_url_substring="opal.rlp.de",
        frontend_search_url="https://opal.rlp.de/portal/",
    ),
]
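The NRW entry above still has an empty title_substring. A small illustrative helper (not part of the commit, and assuming tests/ is importable from the webapp root) can list which active Bundesländer still lack a curated sample before walking through the maintenance steps in the module docstring:

    # check_ground_truth.py, illustrative maintenance aid
    from app.bundeslaender import aktive_bundeslaender
    from tests.integration.ground_truth import GROUND_TRUTH

    covered = {gt.bundesland for gt in GROUND_TRUTH if gt.title_substring}
    for bl in aktive_bundeslaender():
        if bl.code not in covered:
            print(f"{bl.code}: kein kuratiertes Sample in ground_truth.py")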
240  tests/integration/test_adapters_live.py   Normal file
@@ -0,0 +1,240 @@
"""Sub-Issue A — Live Adapter Tests gegen die echten Landtag-Backends.

Pro aktivem Bundesland aus ``aktive_bundeslaender()`` werden die folgenden
Eigenschaften geprüft:

1. Reachability — ``adapter.search("", limit=5)`` läuft erfolgreich durch
2. Result-Anzahl > 0 (0 Treffer ist Indikator für Schema-Drift)
3. Drucksachen-ID-Format ``\\d+/\\d+``
4. Type-Filter — kein Result hat einen ``typ``, der eindeutig kein Antrag
   ist (Substring-Match auf "Antrag" weil TH "Antrag gemäß § 79 GO" nutzt)
5. Datum-Plausibilität — wenn gesetzt, dann zwischen ``wahlperiode_start``
   und heute
6. Fraktionen-Plausibilität — falls gesetzt, müssen sie in
   ``landtagsfraktionen ∪ {"Landesregierung", "BSW", "FREIE WÄHLER", "SSW"}``
   liegen
7. PDF-Link erreichbar (markiert als ``slow``)

Bug-Klassen aus den letzten Sessions, die diese Datei abdeckt:
- 2 (LSA WEV01-vs-WEV06 Format-Drift)
- 6 (TH composite type "Antrag gemäß § 79 GO")
- 7 (HE Card-Layout — sobald HE wieder im aktiven Set ist)
- 8 (NI Login-Page → xfail)
- 13 (Datum leer trotz BE-Format mit "vom")
- 16 (Pagination liefert 0 Anträge)
- 18 (PDF-Download-Link kaputt)

Issue: #51 (Sub-Issue A des Umbrella #50)
"""
import re
from datetime import date

import httpx
import pytest

from app.bundeslaender import BUNDESLAENDER, aktive_bundeslaender
from app.parlamente import ADAPTERS, Drucksache


pytestmark = pytest.mark.integration


# ─────────────────────────────────────────────────────────────────────────────
# Setup
# ─────────────────────────────────────────────────────────────────────────────

# All currently active state codes, parametrised so each BL appears as its
# own test entry in the pytest output. NI is xfailed because nilas/portal
# is login-protected (see issue #22 for the deferred state).
_ACTIVE_CODES = [bl.code for bl in aktive_bundeslaender()]

_BL_PARAMS = [
    pytest.param(
        code,
        marks=pytest.mark.xfail(
            reason="nilas.niedersachsen.de/portal/ ist Login-protected, deferred (Issue #22)",
            strict=False,
        ),
    )
    if code == "NI"
    else code
    for code in _ACTIVE_CODES
]


# Whitelist of acceptable hit-typ values. Strict-Match would fail TH because
# its types look like "Antrag gemäß § 79 GO". Substring "Antrag" is the
# pragmatic invariant. The blacklist below is the explicit anti-marker.
_ACCEPTABLE_TYP_SUBSTRING = "antrag"

# Hits with these typ-substrings are clearly NOT Anträge — if any of these
# appears in the result list the type-filter has drifted.
_FORBIDDEN_TYP_SUBSTRINGS = (
    "kleine anfrage",
    "große anfrage",
    "grosse anfrage",
    "plenarprotokoll",
    "sitzung",
    "ausschussvorlage",
    "beschlussempfehlung",
    "gesetz- und verordnungsblatt",
    "tagesordnung",
)

# Wahltermin-insensitive whitelist of fraction codes that may appear in
# any active Bundesland's hit list, on top of the BL-specific
# landtagsfraktionen.
_UNIVERSAL_FRAKTIONEN = {
    "Landesregierung",  # synthetic from _normalize_fraktion
}


# ─────────────────────────────────────────────────────────────────────────────
# 1. Reachability + 2. Result-Anzahl
# ─────────────────────────────────────────────────────────────────────────────


@pytest.mark.parametrize("code", _BL_PARAMS, ids=lambda c: c)
async def test_adapter_search_reachable(code: str):
    """The adapter must answer ``search('', limit=5)`` with at least 1 hit
    without raising or returning empty.

    A 0-hit response is the strongest indicator of schema-drift, e.g. when
    a Landtag changes their backend HTML structure or moves their endpoint.
    """
    adapter = ADAPTERS[code]
    results = await adapter.search("", limit=5)
    assert isinstance(results, list)
    assert len(results) > 0, (
        f"{code} adapter ({type(adapter).__name__}) returned 0 hits for "
        "an unfiltered browse — likely schema-drift in the live backend"
    )


# ─────────────────────────────────────────────────────────────────────────────
# 3. Drucksachen-ID-Format
# ─────────────────────────────────────────────────────────────────────────────

_RE_DRUCKSACHE_ID = re.compile(r"^\d+/\d+(?:\(neu\))?$")


@pytest.mark.parametrize("code", _BL_PARAMS, ids=lambda c: c)
async def test_drucksache_id_format(code: str):
    """Every result must have a Drucksache-Nummer in the canonical
    ``<wp>/<num>`` form (e.g. ``8/6390``). Some adapters annotate
    re-issued documents with ``(neu)`` — that's allowed too."""
    adapter = ADAPTERS[code]
    results = await adapter.search("", limit=10)
    invalid = [r.drucksache for r in results if not _RE_DRUCKSACHE_ID.match(r.drucksache)]
    assert not invalid, (
        f"{code}: Drucksachen-IDs verletzen das ``<wp>/<num>``-Format: {invalid}"
    )


# ─────────────────────────────────────────────────────────────────────────────
# 4. Type-Filter-Wirksamkeit
# ─────────────────────────────────────────────────────────────────────────────


@pytest.mark.parametrize("code", _BL_PARAMS, ids=lambda c: c)
async def test_type_filter_returns_only_antraege(code: str):
    """No hit may have a ``typ`` that's clearly NOT an Antrag.

    The whitelist is permissive (substring "antrag", to allow TH-style
    "Antrag gemäß § 79 GO"). The blacklist below is the explicit
    anti-marker — if any forbidden substring appears, the type filter
    has drifted.
    """
    adapter = ADAPTERS[code]
    results = await adapter.search("", limit=10)
    bad: list[tuple[str, str]] = []
    for r in results:
        typ_lower = (r.typ or "").lower()
        for forbidden in _FORBIDDEN_TYP_SUBSTRINGS:
            if forbidden in typ_lower:
                bad.append((r.drucksache, r.typ))
                break
    assert not bad, (
        f"{code}: hit list contains non-Antrag entries: {bad}"
    )


# ─────────────────────────────────────────────────────────────────────────────
# 5. Datum-Plausibilität
# ─────────────────────────────────────────────────────────────────────────────


@pytest.mark.parametrize("code", _BL_PARAMS, ids=lambda c: c)
async def test_datum_within_wahlperiode_window(code: str):
    """If a hit has a ``datum``, it must lie between ``wahlperiode_start``
    and today. Hits with empty ``datum`` are not asserted (some adapters
    legitimately can't always extract one)."""
    adapter = ADAPTERS[code]
    bl = BUNDESLAENDER[code]
    wp_start = bl.wahlperiode_start
    today_iso = date.today().isoformat()

    results = await adapter.search("", limit=10)
    bad: list[str] = []
    for r in results:
        if not r.datum:
            continue
        if not (wp_start <= r.datum <= today_iso):
            bad.append(f"{r.drucksache} datum={r.datum} not in [{wp_start}..{today_iso}]")
    assert not bad, (
        f"{code}: implausible Drucksachen-Datümer: " + "; ".join(bad)
    )


# ─────────────────────────────────────────────────────────────────────────────
# 6. Fraktionen-Plausibilität
# ─────────────────────────────────────────────────────────────────────────────


@pytest.mark.parametrize("code", _BL_PARAMS, ids=lambda c: c)
async def test_fraktionen_in_landtag(code: str):
    """If a hit has Fraktionen, every entry must be either a known
    Landtagsfraktion or one of the universal extras (Landesregierung)."""
    adapter = ADAPTERS[code]
    bl = BUNDESLAENDER[code]
    allowed = set(bl.landtagsfraktionen) | _UNIVERSAL_FRAKTIONEN

    results = await adapter.search("", limit=10)
    bad: list[tuple[str, list[str]]] = []
    for r in results:
        if not r.fraktionen:
            continue
        unknown = [f for f in r.fraktionen if f not in allowed]
        if unknown:
            bad.append((r.drucksache, unknown))
    assert not bad, (
        f"{code}: unknown Fraktionen in hit list (allowed={sorted(allowed)}): {bad}"
    )


# ─────────────────────────────────────────────────────────────────────────────
# 7. PDF-Link erreichbar (slow)
# ─────────────────────────────────────────────────────────────────────────────


@pytest.mark.slow
@pytest.mark.parametrize("code", _BL_PARAMS, ids=lambda c: c)
async def test_first_result_pdf_link_reachable(code: str):
    """HEAD-probe against the first hit's PDF link. Server must answer
    200, 301, 302 or 303 (redirects to a real file)."""
    adapter = ADAPTERS[code]
    results = await adapter.search("", limit=1)
    assert len(results) > 0, f"{code}: no hit to probe"

    link = results[0].link
    assert link, f"{code}: first hit has no link"

    async with httpx.AsyncClient(
        timeout=30,
        follow_redirects=False,
        headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer-Test"},
    ) as client:
        resp = await client.head(link)
        assert resp.status_code in (200, 301, 302, 303), (
            f"{code}: PDF link HEAD returned {resp.status_code}: {link}"
        )
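For reference, the shape that the ID-format property above enforces, checked by hand against the same pattern (standalone snippet, not part of the commit):

    import re

    pattern = re.compile(r"^\d+/\d+(?:\(neu\))?$")

    assert pattern.match("8/6390")              # canonical <wp>/<num>
    assert pattern.match("18/12345(neu)")       # re-issued document
    assert not pattern.match("Drs. 18/12345")   # prefixed ID -> format drift
    assert not pattern.match("18/12345 (neu)")  # space before the suffix fails too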
397  tests/integration/test_citations_substring.py   Normal file
@@ -0,0 +1,397 @@
"""Sub-Issue D — Citation Property-Verification.

Pro realem Assessment in der ``gwoe-antraege.db`` wird jedes vom LLM
zitierte Snippet darauf geprüft, ob es als (Whitespace-normalisierter)
Substring tatsächlich auf der angegebenen PDF-Seite des angegebenen
Wahlprogramms vorhanden ist.

Das ist die kritischste Test-Klasse — fängt **direkt** die Bug-Klasse 7
(LLM halluziniert "FDP NRW Wahlprogramm 2022, S. 75" als Quelle für ein
MV-FDP-Antrag-Zitat) und alle künftigen Prompt-Drifts. Es ist die
einzige der vier Sub-Issues, die sich nicht auf die LLM-Quellenangabe
verlässt, sondern ihren tatsächlichen Wahrheitsgehalt prüft.

Match-Strategie (vom User bestätigt): **strict substring** —
Whitespace normalisiert, lowercased, mit Toleranz nur für LLM-typische
Truncation-Marker (`...` am Anfang/Ende des Zitats). Keine Fuzzy-
Matches, kein Jaccard, kein 80%-Overlap.

Workflow:

1. Lade die N neuesten Assessments pro aktivem BL aus ``gwoe-antraege.db``
2. Pro Assessment: parse ``wahlprogramm_scores`` (JSON), iteriere über
   alle ``zitate`` jeder Fraktion
3. Pro Zitat:
   - ``quelle`` parsen → Programm-ID via Match gegen ``PROGRAMME[*].name``
   - Wenn kein Match: **Test fail** "halluzinierte Quelle"
   - Seitennummer aus ``quelle`` extrahieren
   - PDF-Seite via fitz lesen
   - ``zitat['text']`` muss Substring der Seite sein

Bug-Klassen, die diese Datei abdeckt:
- 7 (LLM-Halluzination, alle Varianten)
- 10 (Source-Erfindung)
- 17 (Cross-Bundesland-Zitat — Programm-Match prüft auch ``bundesland``)

Issue: #54 (Sub-Issue D des Umbrella #50)
"""
from __future__ import annotations

import json
import re
import sqlite3
from pathlib import Path
from typing import Optional

import pytest

from app.bundeslaender import aktive_bundeslaender
from app.embeddings import PROGRAMME
from app.wahlprogramme import REFERENZEN_PATH


pytestmark = pytest.mark.integration


# ─────────────────────────────────────────────────────────────────────────────
# Helpers — die Test-Logik teilt sich in vier reine Funktionen
# ─────────────────────────────────────────────────────────────────────────────


_RE_PAGE_NUMBER = re.compile(r"S\.\s*(\d+)|Seite\s+(\d+)", re.IGNORECASE)
_RE_TRUNCATION = re.compile(r"^\s*\.{2,}|\.{2,}\s*$")
_RE_WHITESPACE = re.compile(r"\s+")


def _normalize(text: str) -> str:
    """Lowercased, whitespace-collapsed text for substring matching."""
    return _RE_WHITESPACE.sub(" ", text or "").strip().lower()


def _strip_truncation_markers(text: str) -> str:
    """Remove leading/trailing ``...`` (and similar truncation markers)
    from a snippet so the substring check tolerates LLM-typical
    elision but nothing else."""
    return _RE_TRUNCATION.sub("", (text or "")).strip()


def _resolve_quelle_to_programm_id(quelle: str) -> Optional[str]:
    """Match a quelle-Label like ``"FDP Mecklenburg-Vorpommern Wahlprogramm 2021, S. 73"``
    to a key in ``PROGRAMME``.

    Strategy: scan all PROGRAMME[*].name entries and pick the one whose
    name is the longest substring of ``quelle``. This tolerates the
    "..., S. 73" suffix and small whitespace/dash variants. Returns
    ``None`` if nothing matches — that's the explicit "LLM hat eine
    Quelle erfunden, die in PROGRAMME nicht existiert"-Signal.
    """
    if not quelle:
        return None
    quelle_lower = _normalize(quelle)
    best: tuple[int, Optional[str]] = (0, None)
    for pid, info in PROGRAMME.items():
        name = info.get("name", "")
        if not name:
            continue
        name_lower = _normalize(name)
        if name_lower in quelle_lower and len(name_lower) > best[0]:
            best = (len(name_lower), pid)
    return best[1]


def _extract_page_number(quelle: str) -> Optional[int]:
    """Pull the ``S. <n>`` page number out of a quelle string."""
    if not quelle:
        return None
    m = _RE_PAGE_NUMBER.search(quelle)
    if not m:
        return None
    page_str = m.group(1) or m.group(2)
    try:
        return int(page_str)
    except (TypeError, ValueError):
        return None


def _pdf_page_text(programm_id: str, seite: int) -> Optional[str]:
    """Read one page of a PROGRAMME PDF, normalised whitespace.

    Caches results for the test session via the module-level cache below —
    pdf-open is slow and a single Sub-Issue-D run touches each PDF many times.
    """
    info = PROGRAMME.get(programm_id)
    if not info:
        return None
    return _cached_pdf_page_text(info["pdf"], seite)


# Module-level cache (reset per test process). Pytest spawns one process per
# session by default, so this is shared across all tests in this module.
_PDF_PAGE_CACHE: dict[tuple[str, int], str] = {}


def _cached_pdf_page_text(filename: str, seite: int) -> Optional[str]:
    key = (filename, seite)
    if key in _PDF_PAGE_CACHE:
        return _PDF_PAGE_CACHE[key]
    pytest.require_module("fitz")
    import fitz

    path = REFERENZEN_PATH / filename
    if not path.exists():
        return None
    pdf = fitz.open(str(path))
    try:
        if seite < 1 or seite > len(pdf):
            return None
        text = pdf[seite - 1].get_text()
    finally:
        pdf.close()
    normalised = _normalize(text)
    _PDF_PAGE_CACHE[key] = normalised
    return normalised


def _is_substring(needle: str, haystack: str) -> bool:
    """Strict substring check after normalization + truncation marker
    stripping. The min length 20 chars guard avoids matching trivial
    snippets like "ja" or "und"."""
    needle_clean = _strip_truncation_markers(needle)
    needle_norm = _normalize(needle_clean)
    if len(needle_norm) < 20:
        return True  # zu kurz für aussagekräftigen Substring-Test
    return needle_norm in (haystack or "")


# ─────────────────────────────────────────────────────────────────────────────
# Helper unit-tests (die Helper selbst sind nicht trivial, also testen wir sie)
# ─────────────────────────────────────────────────────────────────────────────


class TestHelpers:
    def test_resolve_quelle_existing_programme(self):
        # Echtes Beispiel aus prod (FDP MV Wahlprogramm 2021)
        pid = _resolve_quelle_to_programm_id(
            "FDP Mecklenburg-Vorpommern Wahlprogramm 2021, S. 73"
        )
        assert pid == "fdp-mv-2021"

    def test_resolve_quelle_returns_none_for_hallucinated_source(self):
        # Eine ausgedachte Quelle, die in PROGRAMME nicht existiert
        pid = _resolve_quelle_to_programm_id(
            "FDP Sankt-Pauli Hafenwirtschaftsprogramm 1997, S. 42"
        )
        assert pid is None

    def test_resolve_quelle_picks_longest_match_when_multiple_partial(self):
        # Mehrere "FDP ... Wahlprogramm"-Einträge in PROGRAMME — der längste
        # Substring-Match (inkl. BL-Kürzel + Jahr) muss gewinnen, sodass
        # NRW-Quellen nicht versehentlich auf MV gemappt werden.
        pid = _resolve_quelle_to_programm_id("FDP NRW Wahlprogramm 2022, S. 5")
        assert pid == "fdp-nrw-2022"

    def test_extract_page_number_canonical(self):
        assert _extract_page_number("CDU MV Wahlprogramm 2021, S. 33") == 33

    def test_extract_page_number_seite_long_form(self):
        assert _extract_page_number("Foo Bar Programm, Seite 7") == 7

    def test_extract_page_number_returns_none_when_missing(self):
        assert _extract_page_number("CDU MV Wahlprogramm 2021") is None

    def test_normalize_collapses_whitespace_and_lowercases(self):
        assert _normalize(" HELLO\n\n WORLD ") == "hello world"

    def test_strip_truncation_markers_removes_leading_dots(self):
        assert _strip_truncation_markers("... echte aussage") == "echte aussage"

    def test_strip_truncation_markers_removes_trailing_dots(self):
        assert _strip_truncation_markers("echte aussage ...") == "echte aussage"

    def test_is_substring_strict_lowercase_match(self):
        assert _is_substring("Klimaschutz", "wir wollen klimaschutz und mehr")

    def test_is_substring_tolerates_truncation_markers(self):
        assert _is_substring("...mehr klimaschutz...", "wir wollen mehr klimaschutz und gerechtigkeit")

    def test_is_substring_short_needles_pass(self):
        # Zu kurz für aussagekräftigen Test → True (statt false-positive)
        assert _is_substring("ja", "egal was hier steht")

    def test_is_substring_returns_false_when_clearly_absent(self):
        assert not _is_substring(
            "ein ganz langer satz der so nirgends in der quelle steht und definitiv nicht passt",
            "wir wollen mehr klimaschutz",
        )


# ─────────────────────────────────────────────────────────────────────────────
# Sample Loader — liest reale Assessments aus der gwoe-antraege.db
# ─────────────────────────────────────────────────────────────────────────────


def _gwoe_db_path() -> Optional[Path]:
    """Resolve to the local prod-DB if mounted, or return None.

    Looks at the same path as the prod-Container (``data/gwoe-antraege.db``
    relative to the webapp root). Local dev machines without a copy will
    skip the citation tests cleanly.
    """
    p = Path(__file__).resolve().parent.parent.parent / "data" / "gwoe-antraege.db"
    return p if p.exists() else None


def _load_recent_assessments(limit_per_bl: int = 5) -> list[dict]:
    """Read the most recent assessments per active BL from gwoe-antraege.db.

    Returns the parsed wahlprogramm_scores and minimal metadata for the
    citation iteration. Skips silently if the DB isn't available locally.
    """
    db = _gwoe_db_path()
    if db is None:
        return []
    out: list[dict] = []
    conn = sqlite3.connect(db)
    try:
        active_codes = [bl.code for bl in aktive_bundeslaender()]
        for code in active_codes:
            rows = conn.execute(
                """
                SELECT drucksache, bundesland, wahlprogramm_scores
                FROM assessments
                WHERE bundesland = ? AND wahlprogramm_scores IS NOT NULL
                ORDER BY updated_at DESC
                LIMIT ?
                """,
                (code, limit_per_bl),
            ).fetchall()
            for ds, bl, ws_json in rows:
                try:
                    ws = json.loads(ws_json) if ws_json else []
                except json.JSONDecodeError:
                    continue
                out.append({"drucksache": ds, "bundesland": bl, "wahlprogramm_scores": ws})
    finally:
        conn.close()
    return out


_ASSESSMENTS_SAMPLE = _load_recent_assessments(limit_per_bl=5)


# ─────────────────────────────────────────────────────────────────────────────
# Main test — pro Zitat in jedem Sample-Assessment
# ─────────────────────────────────────────────────────────────────────────────


def _flat_zitate(assessment: dict) -> list[tuple[str, str, dict]]:
    """Flatten an assessment to a list of (fraktion, kind, zitat) tuples
    where kind is 'wahlprogramm' or 'parteiprogramm'."""
    out: list[tuple[str, str, dict]] = []
    for score_entry in assessment.get("wahlprogramm_scores") or []:
        fraktion = score_entry.get("fraktion") or "?"
        for kind in ("wahlprogramm", "parteiprogramm"):
            block = score_entry.get(kind) or {}
            for z in block.get("zitate") or []:
                out.append((fraktion, kind, z))
    return out


def _all_citations() -> list[tuple[str, str, str, str, dict]]:
    """Cartesian-flatten all sample-assessments × all zitate to one
    parametrize-friendly list. Returns tuples of:
    (drucksache, bundesland, fraktion, kind, zitat-dict)."""
    out: list[tuple[str, str, str, str, dict]] = []
    for a in _ASSESSMENTS_SAMPLE:
        for fraktion, kind, zitat in _flat_zitate(a):
            out.append((a["drucksache"], a["bundesland"], fraktion, kind, zitat))
    return out


_CITATIONS = _all_citations()
_CITATION_IDS = [
    f"{ds}-{bl}-{fr}-{kind}-{i}" for i, (ds, bl, fr, kind, _) in enumerate(_CITATIONS)
]


@pytest.mark.skipif(
    _gwoe_db_path() is None,
    reason="lokale gwoe-antraege.db nicht vorhanden — Sub-D läuft nur in einer "
    "Umgebung mit prod-DB-Kopie (siehe data/ Volume im prod-Container)",
)
@pytest.mark.skipif(
    not _CITATIONS,
    reason="keine Assessments mit zitaten in der lokalen DB gefunden",
)
@pytest.mark.parametrize(
    ("drucksache", "bundesland", "fraktion", "kind", "zitat"),
    _CITATIONS,
    ids=_CITATION_IDS,
)
def test_zitat_is_substring_of_named_pdf_page(
    drucksache: str,
    bundesland: str,
    fraktion: str,
    kind: str,
    zitat: dict,
):
    """Property-Verification: jedes vom LLM zitierte Snippet muss als
    Substring auf der angegebenen PDF-Seite tatsächlich vorhanden sein.

    Wenn dieser Test fehlschlägt, ist genau einer der drei Fehler-
    Modi aufgetreten:

    1. **Halluzinierte Quelle**: das Programm in ``zitat['quelle']``
       existiert in PROGRAMME nicht (Bug-Klasse 7/10)
    2. **Halluzinierte Seite**: das Programm existiert, aber die
       angegebene Seite enthält das Snippet nicht
    3. **Halluzinierter Inhalt**: das Programm + die Seite sind real,
       aber das Snippet ist eine Erfindung des LLM

    Alle drei Modi sind echte Bugs in der LLM-Pipeline.
    """
    quelle = zitat.get("quelle", "")
    text = zitat.get("text", "")

    if not quelle or not text:
        pytest.skip(f"{drucksache}/{fraktion}/{kind}: zitat ohne quelle oder text")

    pid = _resolve_quelle_to_programm_id(quelle)
    assert pid is not None, (
        f"halluzinierte Quelle in {drucksache}/{fraktion}/{kind}: "
        f"{quelle!r} matched keinen PROGRAMME-Eintrag"
    )

    # Bonus-Check für Bug-Klasse 17 (Cross-Bundesland-Zitat): das aufgelöste
    # Programm muss zu dem Bundesland des Antrags passen, oder ein
    # Grundsatzprogramm sein (bundesland=None).
    prog_info = PROGRAMME.get(pid, {})
    prog_bl = prog_info.get("bundesland")
    if prog_bl is not None and prog_bl != bundesland:
        pytest.fail(
            f"Cross-Bundesland-Zitat in {drucksache} ({bundesland}): das LLM "
            f"zitiert aus {pid} (bundesland={prog_bl}) — das ist Bug-Klasse 17"
        )

    page = _extract_page_number(quelle)
    if page is None:
        pytest.skip(
            f"{drucksache}/{fraktion}/{kind}: keine Seitennummer in quelle "
            f"{quelle!r}, kann substring-check nicht ausführen"
        )

    page_text = _pdf_page_text(pid, page)
    assert page_text is not None, (
        f"PDF-Seite {page} in {pid} nicht lesbar (PDF zu kurz oder fehlt)"
    )

    if not _is_substring(text, page_text):
        # Diff für die Fehlermeldung — gekürzt um die Output-Logs sauber zu halten
        snippet_preview = text[:200].strip().replace("\n", " ")
        page_preview = page_text[:200].replace("\n", " ")
        pytest.fail(
            f"Zitat in {drucksache}/{fraktion}/{kind} nicht auf "
            f"{pid} S.{page} auffindbar:\n"
            f"  zitiert: {snippet_preview!r}\n"
            f"  PDF-Seite enthält: {page_preview!r}"
        )
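To make the strict-substring strategy above concrete, here is the pipeline applied by hand to one invented citation; in the real test the values come from gwoe-antraege.db and the PDF page, and the helpers above do the work:

    import re

    def normalize(s: str) -> str:
        # whitespace collapsed, lowercased, exactly as in _normalize above
        return re.sub(r"\s+", " ", s).strip().lower()

    zitat = "... Wir setzen uns für eine\nflächendeckende Gemeinwohl-Bilanz ein ..."
    seite = "Kapitel 4\nWir setzen   uns für eine flächendeckende\nGemeinwohl-Bilanz ein, weil ..."

    needle = normalize(zitat.strip(". "))   # truncation markers stripped, then normalized
    haystack = normalize(seite)
    assert needle in haystack               # strict containment, no fuzzy matching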
105  tests/integration/test_frontend_xref.py   Normal file
@@ -0,0 +1,105 @@
"""Sub-Issue B — Adapter ↔ Frontend Cross-Validation.

Pro aktivem BL ist im ``ground_truth.py``-Modul ein einzelnes Drucksachen-
Tupel kuratiert, das aus der echten Frontend-Suche des jeweiligen
Landtags stammt. Dieser Test ruft ``adapter.get_document(...)`` mit der
bekannten ID auf und prüft, dass:

- die Drucksache überhaupt gefunden wird
- der Title (substring) passt
- die erwarteten Fraktionen drin sind
- das Datum (wenn gesetzt im Sample) übereinstimmt
- der PDF-Link das erwartete URL-Fragment enthält

Bug-Klassen aus den letzten Sessions, die diese Datei abdeckt:
- 14 (get_document() liefert Match aus falschem Bundesland)
- Allgemeine Schema-Drift in URL-Strukturen, Hit-Format-Änderungen,
  Encoding-Bugs, Pagination-Cut-Offs, Adapter-Reuse-Konfigurations-Fehler

Issue: #52 (Sub-Issue B des Umbrella #50)

Wartung: siehe Doku im ``ground_truth.py``-Header.
"""
import pytest

from app.bundeslaender import aktive_bundeslaender
from app.parlamente import ADAPTERS

from .ground_truth import GROUND_TRUTH


pytestmark = pytest.mark.integration


_ACTIVE_CODES = {bl.code for bl in aktive_bundeslaender()}

# Skip Samples für BL, die nicht (mehr) aktiv sind
_GT_PARAMS = [pytest.param(gt, id=gt.bundesland) for gt in GROUND_TRUTH if gt.bundesland in _ACTIVE_CODES]


@pytest.mark.parametrize("gt", _GT_PARAMS)
async def test_adapter_finds_known_drucksache(gt):
    """Cross-Validation gegen die Frontend-Suche des jeweiligen Landtags.

    Wenn dieser Test fehlschlägt: erst die Frontend-URL aus
    ``gt.frontend_search_url`` öffnen und prüfen, ob die Drucksache
    überhaupt noch existiert. Wenn ja → Adapter-Bug. Wenn nein → ein
    neues Sample im ``ground_truth.py`` aufnehmen.
    """
    if gt.bundesland not in ADAPTERS:
        pytest.skip(f"{gt.bundesland} hat keinen registrierten Adapter")
    if not gt.title_substring:
        pytest.skip(
            f"{gt.bundesland}: Sample noch nicht kuratiert "
            "(title_substring leer in ground_truth.py)"
        )

    adapter = ADAPTERS[gt.bundesland]
    doc = await adapter.get_document(gt.drucksache)
    assert doc is not None, (
        f"{gt.bundesland} adapter ({type(adapter).__name__}) hat die "
        f"bekannte Drucksache {gt.drucksache!r} nicht gefunden. Frontend-"
        f"Probe: {gt.frontend_search_url}"
    )

    # 1. Drucksachen-Nummer roundtrip
    assert doc.drucksache == gt.drucksache, (
        f"{gt.bundesland}: get_document({gt.drucksache!r}) lieferte "
        f"abweichende drucksache={doc.drucksache!r}"
    )

    # 2. Title-Substring
    assert gt.title_substring.lower() in doc.title.lower(), (
        f"{gt.bundesland}: title_substring {gt.title_substring!r} nicht "
        f"in adapter-title {doc.title!r}"
    )

    # 3. Erwartete Fraktionen sind alle da (Subset-Match — Adapter darf
    #    mehr Fraktionen erkennen als das Sample erwartet)
    if gt.expected_fraktionen:
        adapter_fraktionen = set(doc.fraktionen)
        missing = gt.expected_fraktionen - adapter_fraktionen
        assert not missing, (
            f"{gt.bundesland}: erwartete Fraktionen {gt.expected_fraktionen} "
            f"nicht alle im Adapter-Output {adapter_fraktionen}; fehlt: {missing}"
        )

    # 4. Datum (nur wenn das Sample eines hat)
    if gt.datum:
        assert doc.datum == gt.datum, (
            f"{gt.bundesland}: erwartetes datum={gt.datum!r}, adapter lieferte "
            f"{doc.datum!r}"
        )

    # 5. PDF-Link enthält erwartetes URL-Fragment
    if gt.pdf_url_substring:
        assert gt.pdf_url_substring.lower() in doc.link.lower(), (
            f"{gt.bundesland}: pdf_url_substring {gt.pdf_url_substring!r} "
            f"nicht in adapter-link {doc.link!r}"
        )

    # 6. Bundesland-Konsistenz — fängt Bug-Klasse 14 (Cross-Bundesland-Match)
    assert doc.bundesland == gt.bundesland, (
        f"adapter[{gt.bundesland}].get_document() lieferte ein Doc mit "
        f"bundesland={doc.bundesland!r}"
    )
313  tests/integration/test_wahlprogramme_indexed.py   Normal file
@@ -0,0 +1,313 @@
"""Sub-Issue C — Wahlprogramm Indexing-Status + PDF Content Verification.

Drei Test-Klassen:

C1 — Indexing-Status: jeder WAHLPROGRAMME-Eintrag eines aktiven BL muss
     in der embeddings.db als ≥1-Chunk-Programm vorhanden sein.

C2 — Inhalts-Plausibilität: jede registrierte PDF-Datei muss real
     Marker für die richtige Wahlperiode + Partei + Programm-Typ
     enthalten. Inkl. Anti-Marker für die abgeordnetenwatch-PDF-Tausch-
     Bug-Klasse 8 (CDU BE 2023→2026).

C3 — Embeddings-Statistik: chunk-count > seiten/10 als grobe Heuristik
     gegen abgebrochene Indexierungen.

Bug-Klassen aus den letzten Sessions, die diese Datei abdeckt:
- 8 (abgeordnetenwatch tauscht PDF unter altem Slug)
- 11 (Wahlprogramm fehlt komplett im Index — heute morgen für 6 BL)
- 15 (embeddings-DB Chunks aus altem Programm-Slug)

Issue: #53 (Sub-Issue C des Umbrella #50)

Hinweis: dieser Test liest die **lokale** ``embeddings.db``, nicht die
prod-Container-DB. Wenn sie lokal nicht existiert, werden alle C1+C3
Tests automatisch geskippt. C2 (PDF-Inhalt) hängt nur von den PDF-Files
ab und läuft immer.
"""
from __future__ import annotations

import re
from collections import defaultdict
from pathlib import Path
from typing import Optional

import pytest

from app.bundeslaender import aktive_bundeslaender
from app.embeddings import EMBEDDINGS_DB, PROGRAMME, get_indexing_status
from app.wahlprogramme import REFERENZEN_PATH, WAHLPROGRAMME


pytestmark = pytest.mark.integration


# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────


def _pdf_pages_text(filename: str, n: Optional[int] = None) -> str:
    """Read the first ``n`` pages (or all pages) of a Wahlprogramm-PDF
    and return the concatenated text, normalised whitespace.

    Uses real PyMuPDF (fitz). Tests calling this helper must be ok with
    skipping if fitz isn't installed in the local environment.
    """
    pytest.require_module("fitz")  # set up by integration conftest
    import fitz

    path = REFERENZEN_PATH / filename
    if not path.exists():
        pytest.fail(f"PDF nicht gefunden: {path}")
    pdf = fitz.open(str(path))
    try:
        page_count = len(pdf) if n is None else min(n, len(pdf))
        chunks: list[str] = [pdf[i].get_text() for i in range(page_count)]
    finally:
        pdf.close()
    text = " ".join(chunks)
    # Normalise whitespace
    return re.sub(r"\s+", " ", text).strip()


# Backwards-compat alias for the older name still used in two tests below
def _pdf_first_pages_text(filename: str, n: int = 5) -> str:
    return _pdf_pages_text(filename, n=n)


def _all_active_wahlprogramme() -> list[tuple[str, str, dict]]:
    """List of (bundesland, partei, info) for every WAHLPROGRAMME entry of
    a currently active Bundesland. Used as parametrize input."""
    out: list[tuple[str, str, dict]] = []
    active_codes = {bl.code for bl in aktive_bundeslaender()}
    for bl, parteien in WAHLPROGRAMME.items():
        if bl not in active_codes:
            continue
        for partei, info in parteien.items():
            out.append((bl, partei, info))
    return out


_WAHLPROG_PARAMS = _all_active_wahlprogramme()
_WAHLPROG_IDS = [f"{bl}-{partei}" for bl, partei, _ in _WAHLPROG_PARAMS]


# ─────────────────────────────────────────────────────────────────────────────
# C1 — Indexing-Status pro aktivem BL
# ─────────────────────────────────────────────────────────────────────────────


def _embeddings_db_has_active_data() -> bool:
    """C1 + C3 sollen nur laufen, wenn die lokale embeddings.db
    mindestens für die heute aktiven Bundesländer Chunks enthält.
    Sonst (z.B. lokale Dev-Maschine ohne Indexing-Lauf, oder eine
    pre-#5-DB ohne bundesland-Spalte) skippen wir, damit der Test
    gegen die prod-DB im CI/Container läuft, nicht gegen einen
    halb-leeren lokalen Snapshot."""
    if not EMBEDDINGS_DB.exists():
        return False
    import sqlite3

    try:
        conn = sqlite3.connect(EMBEDDINGS_DB)
        try:
            cols = {row[1] for row in conn.execute("PRAGMA table_info(chunks)")}
            if "bundesland" not in cols:
                return False  # pre-#5 schema, no bundesland column
            rows = conn.execute(
                "SELECT bundesland, COUNT(DISTINCT programm_id) "
                "FROM chunks WHERE bundesland IS NOT NULL GROUP BY bundesland"
            ).fetchall()
        finally:
            conn.close()
    except sqlite3.Error:
        return False

    indexed_bls = {bl for bl, n in rows if n > 0}
    active_codes = {bl.code for bl in aktive_bundeslaender()}
    expected = {bl for bl in active_codes if bl in WAHLPROGRAMME}
    return expected.issubset(indexed_bls)


@pytest.mark.skipif(
    not _embeddings_db_has_active_data(),
    reason=(
        f"local {EMBEDDINGS_DB} hat nicht alle aktiven BL indexiert — "
        "C1/C3 laufen nur in einer Umgebung mit aktueller DB (prod-Container "
        "oder lokaler index_programm-Lauf)"
    ),
)
class TestIndexingStatus:
    """C1 — every WAHLPROGRAMME entry of an active BL must be indexed."""

    def test_no_active_bundesland_has_unindexed_wahlprogramme(self):
        status = get_indexing_status()
        indexed = {p["id"] for p in status["programmes"] if p["indexed"]}

        missing: list[str] = []
        for bl, partei, info in _all_active_wahlprogramme():
            pid = info["file"].rsplit(".", 1)[0]
            if pid not in indexed:
                missing.append(f"{bl}/{partei}: {pid}")
        assert not missing, (
            "Wahlprogramme aktiver Bundesländer fehlen in embeddings.db:\n  "
            + "\n  ".join(missing)
        )

    def test_every_indexed_chunk_belongs_to_known_programm_id(self):
        """Catches Bug-Klasse 15: stale chunks for programm_ids that no
        longer exist in PROGRAMME (e.g. after a slug rename)."""
        status = get_indexing_status()
        # status["programmes"] is iterated from PROGRAMME, so an orphan
        # would not appear there. Read the DB directly instead.
        import sqlite3

        conn = sqlite3.connect(EMBEDDINGS_DB)
        try:
            db_ids = {row[0] for row in conn.execute("SELECT DISTINCT programm_id FROM chunks")}
        finally:
            conn.close()

        orphans = sorted(db_ids - set(PROGRAMME.keys()))
        assert not orphans, (
            "embeddings.db enthält Chunks für unbekannte programm_id:\n  "
            + "\n  ".join(orphans)
        )

    def test_chunk_count_per_active_bundesland_is_reasonable(self):
        """C3 grob: pro aktivem BL erwartet man min. 100 chunks insgesamt
        (ein Wahlprogramm hat typisch 50–300 chunks). Fängt Bug-Klasse
        "Indexing crashte vorzeitig"."""
        status = get_indexing_status()
        per_bl: dict[str, int] = defaultdict(int)
        for p in status["programmes"]:
            info = PROGRAMME.get(p["id"], {})
            bl = info.get("bundesland")
            if bl:
                per_bl[bl] += p["chunks"]

        active_codes = {bl.code for bl in aktive_bundeslaender()}
        too_low = {bl: count for bl, count in per_bl.items() if bl in active_codes and count < 100}
        assert not too_low, (
            "Aktive Bundesländer mit verdächtig wenigen indexierten Chunks "
            f"(< 100, vermutlich abgebrochene Indexierung): {too_low}"
        )


# ─────────────────────────────────────────────────────────────────────────────
# C2 — Inhalts-Plausibilität pro PDF
# ─────────────────────────────────────────────────────────────────────────────


_PROGRAMM_MARKERS = (
    "wahlprogramm",
    "regierungsprogramm",
    "zukunftsprogramm",
    "landeswahlprogramm",
    "berlin-plan",
    "berlin plan",
    "wahlmanifest",
    "programm",  # very permissive, fallback
    "wahlperiode",
    "landtagswahl",
    "bürgerschaftswahl",
    "abgeordnetenhaus",
    "agh-wahl",
    "beschluss",  # many programs say "Beschluss vom DD.MM.YYYY"
)


@pytest.mark.parametrize(("bundesland", "partei", "info"), _WAHLPROG_PARAMS, ids=_WAHLPROG_IDS)
def test_pdf_contains_programm_marker(bundesland: str, partei: str, info: dict):
    """C2 — irgendwo im PDF muss ein Wahlprogramm-Marker-Wort vorkommen.

    Fängt versehentlich indexierte Nicht-Wahlprogramme (z.B. ein Geschäfts-
    bericht der Stiftung statt des Programms). Sehr permissiv: einer von
    14 Markern reicht. Strikter wird es im
    ``test_pdf_year_horizon_is_plausible``-Test.
    """
    text = _pdf_pages_text(info["file"]).lower()
    matched = [m for m in _PROGRAMM_MARKERS if m in text]
    assert matched, (
        f"{bundesland}/{partei} ({info['file']}): keiner der Wahlprogramm-"
        f"Marker {_PROGRAMM_MARKERS} im ganzen PDF gefunden — vermutlich "
        "falsches PDF eingespielt"
    )


@pytest.mark.parametrize(("bundesland", "partei", "info"), _WAHLPROG_PARAMS, ids=_WAHLPROG_IDS)
def test_pdf_year_horizon_is_plausible(bundesland: str, partei: str, info: dict):
    """C2 — Plausibilitäts-Check über die Verteilung der Jahres-Marker
    im PDF.

    Bewusst keine "Wahljahr muss vorkommen"-Annahme — viele Programme
    nennen das Wahljahr selbst gar nicht (z.B. CDU-BE 2021 erwähnt es
    null mal, hat aber "Aufzüge bis 2026" als 5-Jahres-Forderung). Was
    wir stattdessen prüfen:

    - Mindestens **eine** Jahreszahl im erwarteten Wahlperioden-Horizont
      (Wahljahr ± 10) muss vorkommen — sonst ist es kein Programm zur
      passenden Wahl
    - Es darf **kein** Cluster von Jahren in einer DEUTLICH späteren
      Periode dominieren (z.B. ein "2031–2036"-Programm in einem File,
      das angeblich 2021er sein soll)
    """
    text = _pdf_pages_text(info["file"])
    years = [int(y) for y in re.findall(r"\b(20\d\d)\b", text)]
    expected = info["jahr"]

    # Bedingung 1: ≥ 1 Jahr aus dem erwarteten Horizont
    horizon = range(expected - 1, expected + 11)
    in_horizon = [y for y in years if y in horizon]
    assert in_horizon, (
        f"{bundesland}/{partei} ({info['file']}): kein einziges Jahr im "
        f"erwarteten Wahlperioden-Horizont {expected}..{expected + 10} "
        f"gefunden (gefunden: {sorted(set(years))[:10]}). PDF passt nicht "
        "zur erwarteten Wahlperiode — möglicher anachronistischer Tausch."
    )

    # Bedingung 2: keine deutlich spätere Wahlperiode dominiert
    later_horizon = range(expected + 5, expected + 15)
    later_count = sum(1 for y in years if y in later_horizon)
    horizon_count = sum(1 for y in years if y in horizon)
    if horizon_count > 0 and later_count > horizon_count * 3:
        pytest.fail(
            f"{bundesland}/{partei} ({info['file']}): mehr als 3× so viele "
            f"Jahres-Marker in der Folge-Wahlperiode {list(later_horizon)} "
            f"({later_count}) als im erwarteten Horizont ({horizon_count}) — "
            "stark anachronistisches PDF."
        )


def test_be_cdu_pdf_is_2021_program_not_2026():
    """Expliziter Anti-Marker für die im Issue #10-Kommentar dokumentierte
    Bug-Klasse 8: abgeordnetenwatch hat das ``cduwahlprogrammahw2021_0.pdf``-
    File potenziell nachträglich gegen den ``Berlin-Plan 2026`` ersetzt.

    Verifikation: das 2021er Programm der laufenden WP19 hat eine
    5-Jahres-Forderung-Sprache mit Zielen "bis 2026" oder ähnlich
    ("Aufzüge bis zum Jahr 2026" ist ein verifizierter Marker im
    aktuellen 2021er PDF). Das hypothetische 2026er hätte stattdessen
    Ziele "bis 2031".

    Wenn jemand in einem Folge-Build wieder denselben Slug zieht und
    abgeordnetenwatch zwischenzeitlich getauscht hat, schlägt dieser
    Test mit klarer Fehlermeldung fehl.
    """
    text = _pdf_pages_text("cdu-be-2023.pdf")
    # Positiv-Marker: 2021er Programm spricht über Ziele bis 2026
    assert "2026" in text, (
        "cdu-be-2023.pdf hat keinerlei '2026'-Marker — das passt zu "
        "keinem der erwarteten Programme (weder 2021er mit '5-Jahres-"
        "Horizont bis 2026' noch 2026er mit Beschluss-Datum 2026)"
    )
    # Anti-Marker: das hypothetische 2026er-Programm hätte einen
    # "2031"-Horizont (Wahlperiode 2026–2031)
    cnt_2031 = text.count("2031")
    cnt_2026 = text.count("2026")
    assert cnt_2031 < cnt_2026, (
        f"cdu-be-2023.pdf zählt mehr '2031' ({cnt_2031}) als '2026' "
        f"({cnt_2026}) — das passt zum 2026er Programm der WP20, NICHT "
        "zum 2021er Programm der laufenden WP19. Bitte das echte 2021er "
        "PDF aus FES/KAS-Archiv neu beschaffen."
    )
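The year-horizon heuristic in test_pdf_year_horizon_is_plausible above, reduced to its two conditions on an invented list of year mentions (standalone walk-through, not part of the commit):

    # Toy walk-through for an expected election year of 2021:
    expected = 2021
    years = [2019, 2021, 2025, 2026, 2026, 2030]  # what re.findall would yield, as ints

    horizon = range(expected - 1, expected + 11)   # 2020..2031
    later = range(expected + 5, expected + 15)     # 2026..2035

    in_horizon = [y for y in years if y in horizon]     # [2021, 2025, 2026, 2026, 2030]
    later_count = sum(1 for y in years if y in later)   # 3
    horizon_count = len(in_horizon)                      # 5

    assert in_horizon                                    # condition 1: expected period is represented
    assert not (horizon_count > 0 and later_count > horizon_count * 3)  # condition 2: no later cluster dominates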