gwoe-antragspruefer/tests/integration/test_wahlprogramme_indexed.py

"""Sub-Issue C — Wahlprogramm Indexing-Status + PDF Content Verification.

Drei Test-Klassen:

C1 — Indexing-Status: jedes WAHLPROGRAMME-Eintrag eines aktiven BL muss
     in der embeddings.db als ≥1-Chunk-Programm vorhanden sein.

C2 — Inhalts-Plausibilität: jede registrierte PDF-Datei muss real (im
     gesamten PDF-Text) Marker für den Programm-Typ und Jahreszahlen im
     Horizont der richtigen Wahlperiode enthalten. Inkl. Anti-Marker für
     die abgeordnetenwatch-PDF-Tausch-Bug-Klasse 8 (CDU BE 2023→2026).

C3 — Embeddings-Statistik: mindestens 100 Chunks pro aktivem Bundesland
     als grobe Heuristik gegen abgebrochene Indexierungen.

Bug-Klassen aus den letzten Sessions, die diese Datei abdeckt:
- 8 (abgeordnetenwatch tauscht PDF unter altem Slug)
- 11 (Wahlprogramm fehlt komplett im Index — heute morgen für 6 BL)
- 15 (embeddings-DB Chunks aus altem Programm-Slug)

Issue: #53 (Sub-Issue C des Umbrella #50)

Hinweis: dieser Test liest die **lokale** ``embeddings.db``, nicht die
prod-Container-DB. Wenn sie lokal nicht existiert oder nicht alle aktiven
Bundesländer abdeckt, werden alle C1+C3-Tests automatisch übersprungen
(skip). C2 (PDF-Inhalt) hängt nur von den PDF-Files ab und läuft immer.
"""
from __future__ import annotations

import re
from collections import defaultdict
from pathlib import Path
from typing import Optional

import pytest

from app.bundeslaender import aktive_bundeslaender
from app.embeddings import EMBEDDINGS_DB, PROGRAMME, get_indexing_status
from app.wahlprogramme import REFERENZEN_PATH, WAHLPROGRAMME


pytestmark = pytest.mark.integration


# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────


def _pdf_pages_text(filename: str, n: Optional[int] = None) -> str:
    """Extract whitespace-normalised text from a Wahlprogramm PDF.

    Args:
        filename: File name relative to ``REFERENZEN_PATH``.
        n: Number of leading pages to read; ``None`` reads every page. Values
            larger than the page count are clamped to the page count.

    Returns:
        The text of the selected pages joined by single spaces, with every
        whitespace run collapsed to one space and the ends stripped.

    The calling test fails via ``pytest.fail`` when the file does not exist.
    ``pytest.require_module`` is a hook provided by the integration conftest
    (presumably skipping the test when PyMuPDF is missing — confirm there).
    """
    pytest.require_module("fitz")  # set up by integration conftest
    import fitz

    pdf_path = REFERENZEN_PATH / filename
    if not pdf_path.exists():
        pytest.fail(f"PDF nicht gefunden: {pdf_path}")

    document = fitz.open(str(pdf_path))
    try:
        total_pages = len(document)
        limit = total_pages if n is None else min(n, total_pages)
        page_texts: list[str] = []
        for index in range(limit):
            page_texts.append(document[index].get_text())
    finally:
        # Always release the file handle, even if text extraction raises.
        document.close()

    joined = " ".join(page_texts)
    return re.sub(r"\s+", " ", joined).strip()


# Backwards-compat alias for the older helper name.
# NOTE(review): no caller of this alias is visible in this file — confirm
# whether it is still needed before keeping it around.
def _pdf_first_pages_text(filename: str, n: int = 5) -> str:
    """Return the normalised text of the first ``n`` pages (default 5)."""
    return _pdf_pages_text(filename, n)


def _all_active_wahlprogramme() -> list[tuple[str, str, dict]]:
    """Collect ``(bundesland, partei, info)`` for all active Bundesländer.

    Only WAHLPROGRAMME entries whose Bundesland code is reported by
    ``aktive_bundeslaender()`` are included. The result is used as
    parametrize input for the PDF tests.
    """
    active_codes = {land.code for land in aktive_bundeslaender()}
    return [
        (code, partei, info)
        for code, parteien in WAHLPROGRAMME.items()
        if code in active_codes
        for partei, info in parteien.items()
    ]


# Built once at import (collection) time so both lists can be handed to
# ``pytest.mark.parametrize``; each id has the form "<bundesland>-<partei>".
_WAHLPROG_PARAMS = _all_active_wahlprogramme()
_WAHLPROG_IDS = [f"{land}-{party}" for land, party, _info in _WAHLPROG_PARAMS]


# ─────────────────────────────────────────────────────────────────────────────
# C1 — Indexing-Status pro aktivem BL
# ─────────────────────────────────────────────────────────────────────────────


def _embeddings_db_has_active_data() -> bool:
    """Tell whether the local embeddings.db is complete enough for C1 + C3.

    Returns ``True`` only if the database file exists, its ``chunks`` table
    has a ``bundesland`` column, and every active Bundesland that has an
    entry in WAHLPROGRAMME owns at least one indexed programme. Any
    ``sqlite3.Error`` while inspecting the database yields ``False``.

    A ``False`` result makes the C1/C3 tests skip, so they do not run
    against a half-empty local snapshot (e.g. a dev machine without an
    indexing run, or a pre-#5 database without the bundesland column).
    """
    import sqlite3

    if not EMBEDDINGS_DB.exists():
        return False

    try:
        connection = sqlite3.connect(EMBEDDINGS_DB)
        try:
            column_names = {
                column[1] for column in connection.execute("PRAGMA table_info(chunks)")
            }
            if "bundesland" not in column_names:
                # pre-#5 schema, no bundesland column
                return False
            programmes_per_land = connection.execute(
                "SELECT bundesland, COUNT(DISTINCT programm_id) "
                "FROM chunks WHERE bundesland IS NOT NULL GROUP BY bundesland"
            ).fetchall()
        finally:
            connection.close()
    except sqlite3.Error:
        return False

    indexed_codes = {code for code, programme_count in programmes_per_land if programme_count > 0}
    active_codes = {land.code for land in aktive_bundeslaender()}
    # Only active Bundesländer that actually have registered programmes
    # are required to show up in the index.
    return all(code in indexed_codes for code in active_codes if code in WAHLPROGRAMME)


@pytest.mark.skipif(
    not _embeddings_db_has_active_data(),
    reason=(
        f"local {EMBEDDINGS_DB} hat nicht alle aktiven BL indexiert — "
        "C1/C3 laufen nur in einer Umgebung mit aktueller DB (prod-Container "
        "oder lokaler index_programm-Lauf)"
    ),
)
class TestIndexingStatus:
    """C1/C3 — indexing checks against the local ``embeddings.db``.

    The whole class is skipped when ``_embeddings_db_has_active_data()``
    reports that the local database does not cover every active Bundesland.
    """

    def test_no_active_bundesland_has_unindexed_wahlprogramme(self):
        """C1 — every WAHLPROGRAMME entry of an active BL must be indexed.

        The programme id is the registered file name without its extension.
        """
        status = get_indexing_status()
        indexed = {p["id"] for p in status["programmes"] if p["indexed"]}

        missing: list[str] = []
        for bl, partei, info in _all_active_wahlprogramme():
            pid = info["file"].rsplit(".", 1)[0]
            if pid not in indexed:
                missing.append(f"{bl}/{partei}: {pid}")
        assert not missing, (
            "Wahlprogramme aktiver Bundesländer fehlen in embeddings.db:\n  "
            + "\n  ".join(missing)
        )

    def test_every_indexed_chunk_belongs_to_known_programm_id(self):
        """Catches Bug-Klasse 15: stale chunks for programm_ids that no
        longer exist in PROGRAMME (e.g. after a slug rename)."""
        # The indexing status is iterated from PROGRAMME, so an orphan would
        # not appear there. Read the DB directly instead.
        import sqlite3

        conn = sqlite3.connect(EMBEDDINGS_DB)
        try:
            db_ids = {row[0] for row in conn.execute("SELECT DISTINCT programm_id FROM chunks")}
        finally:
            conn.close()

        # str() keeps a NULL programm_id (None) reportable as an orphan
        # instead of crashing sorted()/join() with a TypeError.
        orphans = sorted(str(pid) for pid in db_ids - set(PROGRAMME.keys()))
        assert not orphans, (
            "embeddings.db enthält Chunks für unbekannte programm_id:\n  "
            + "\n  ".join(orphans)
        )

    def test_chunk_count_per_active_bundesland_is_reasonable(self):
        """C3 (coarse): each active BL is expected to have at least 100
        chunks in total (a Wahlprogramm typically has 50–300 chunks).
        Catches the "indexing crashed early" bug class."""
        status = get_indexing_status()
        per_bl: dict[str, int] = defaultdict(int)
        for p in status["programmes"]:
            info = PROGRAMME.get(p["id"], {})
            bl = info.get("bundesland")
            if bl:
                per_bl[bl] += p["chunks"]

        active_codes = {bl.code for bl in aktive_bundeslaender()}
        # Check every active BL that either contributed chunks or has
        # registered Wahlprogramme. A BL that never shows up in per_bl counts
        # as 0 chunks instead of being silently ignored.
        to_check = {bl for bl in active_codes if bl in per_bl or bl in WAHLPROGRAMME}
        too_low = {bl: per_bl.get(bl, 0) for bl in to_check if per_bl.get(bl, 0) < 100}
        assert not too_low, (
            "Aktive Bundesländer mit verdächtig wenigen indexierten Chunks "
            f"(< 100, vermutlich abgebrochene Indexierung): {too_low}"
        )


# ─────────────────────────────────────────────────────────────────────────────
# C2 — Inhalts-Plausibilität pro PDF
# ─────────────────────────────────────────────────────────────────────────────


# Lower-case marker words; a PDF counts as a Wahlprogramm if at least one of
# them occurs anywhere in its lower-cased text. The tuple is echoed verbatim
# in the assertion message of ``test_pdf_contains_programm_marker``.
_PROGRAMM_MARKERS = (
    "wahlprogramm",
    "regierungsprogramm",
    "zukunftsprogramm",
    "landeswahlprogramm",
    "berlin-plan",
    "berlin plan",
    "wahlmanifest",
    "programm",  # very permissive, fallback
    "wahlperiode",
    "landtagswahl",
    "bürgerschaftswahl",
    "abgeordnetenhaus",
    "agh-wahl",
    "beschluss",  # many programs say "Beschluss vom DD.MM.YYYY"
)


@pytest.mark.parametrize(("bundesland", "partei", "info"), _WAHLPROG_PARAMS, ids=_WAHLPROG_IDS)
def test_pdf_contains_programm_marker(bundesland: str, partei: str, info: dict):
    """C2 — a Wahlprogramm marker word must occur somewhere in the PDF.

    Catches accidentally registered non-programme documents (e.g. a
    foundation's annual report instead of the programme). Deliberately
    permissive: a single hit from ``_PROGRAMM_MARKERS`` in the lower-cased
    full text is enough. ``test_pdf_year_horizon_is_plausible`` is the
    stricter check.
    """
    haystack = _pdf_pages_text(info["file"]).lower()
    assert any(marker in haystack for marker in _PROGRAMM_MARKERS), (
        f"{bundesland}/{partei} ({info['file']}): keiner der Wahlprogramm-"
        f"Marker {_PROGRAMM_MARKERS} im ganzen PDF gefunden — vermutlich "
        "falsches PDF eingespielt"
    )


@pytest.mark.parametrize(("bundesland", "partei", "info"), _WAHLPROG_PARAMS, ids=_WAHLPROG_IDS)
def test_pdf_year_horizon_is_plausible(bundesland: str, partei: str, info: dict):
    """C2 — plausibility check on the distribution of year markers in the PDF.

    Deliberately does not require the election year itself to appear: many
    programmes never mention it (e.g. CDU-BE 2021 mentions it zero times
    but has "Aufzüge bis 2026" as a five-year demand). Instead:

    - At least **one** four-digit year within the expected horizon
      (election year - 1 up to election year + 10, inclusive) must occur,
      otherwise the PDF is not a programme for the matching election.
    - Years from a clearly later window (election year + 5 up to + 14) must
      not outnumber the horizon years by more than a factor of three (e.g.
      a "2031–2036" programme in a file registered as a 2021 one).

    ``info["jahr"]`` is used as the expected election year and
    ``info["file"]`` as the PDF file name.
    """
    text = _pdf_pages_text(info["file"])
    years = [int(y) for y in re.findall(r"\b(20\d\d)\b", text)]
    expected = info["jahr"]

    # Condition 1: at least one year from the expected horizon.
    horizon = range(expected - 1, expected + 11)
    in_horizon = [y for y in years if y in horizon]
    assert in_horizon, (
        f"{bundesland}/{partei} ({info['file']}): kein einziges Jahr im "
        # Report the bounds that are actually checked (lower bound is
        # expected - 1, not expected).
        f"erwarteten Wahlperioden-Horizont {horizon.start}..{horizon.stop - 1} "
        f"gefunden (gefunden: {sorted(set(years))[:10]}). PDF passt nicht "
        "zur erwarteten Wahlperiode — möglicher anachronistischer Tausch."
    )

    # Condition 2: no clearly later election period dominates.
    # NOTE(review): later_horizon overlaps horizon for expected+5..expected+10,
    # so years in that overlap count on both sides of the comparison —
    # confirm this is intended before tightening the heuristic.
    later_horizon = range(expected + 5, expected + 15)
    later_count = sum(1 for y in years if y in later_horizon)
    # Non-zero here: the assertion above guarantees a non-empty in_horizon.
    horizon_count = len(in_horizon)
    if later_count > horizon_count * 3:
        pytest.fail(
            f"{bundesland}/{partei} ({info['file']}): mehr als 3× so viele "
            f"Jahres-Marker in der Folge-Wahlperiode {list(later_horizon)} "
            f"({later_count}) als im erwarteten Horizont ({horizon_count}) — "
            "stark anachronistisches PDF."
        )


def test_be_cdu_pdf_is_2021_program_not_2026():
    """Explicit anti-marker for bug class 8 (documented in the issue #10
    comment): abgeordnetenwatch may have replaced the
    ``cduwahlprogrammahw2021_0.pdf`` file with the ``Berlin-Plan 2026``
    after the fact.

    Verification idea: the 2021 programme of the current WP19 phrases its
    five-year demands with targets "bis 2026" or similar ("Aufzüge bis zum
    Jahr 2026" is a verified marker in the current 2021 PDF). A hypothetical
    2026 programme would carry targets "bis 2031" instead.

    If a later build pulls the same slug again after abgeordnetenwatch has
    swapped the file, this test fails with a clear message.
    """
    pdf_text = _pdf_pages_text("cdu-be-2023.pdf")
    hits_2026 = pdf_text.count("2026")
    hits_2031 = pdf_text.count("2031")

    # Positive marker: the 2021 programme talks about targets up to 2026.
    assert hits_2026 > 0, (
        "cdu-be-2023.pdf hat keinerlei '2026'-Marker — das passt zu "
        "keinem der erwarteten Programme (weder 2021er mit '5-Jahres-"
        "Horizont bis 2026' noch 2026er mit Beschluss-Datum 2026)"
    )
    # Anti-marker: the hypothetical 2026 programme would have a "2031"
    # horizon (election period 2026–2031).
    assert hits_2026 > hits_2031, (
        f"cdu-be-2023.pdf zählt mehr '2031' ({hits_2031}) als '2026' "
        f"({hits_2026}) — das passt zum 2026er Programm der WP20, NICHT "
        "zum 2021er Programm der laufenden WP19. Bitte das echte 2021er "
        "PDF aus FES/KAS-Archiv neu beschaffen."
    )
-												Add E2E functional acceptance test suite (#50, #51, #52, #53, #54)

Vier Sub-Issues unter Umbrella #50 — opt-in via 'pytest -m integration',
Default-Suite (77 Unit-Tests) bleibt unberührt.

- Sub-Issue A (#51): test_adapters_live.py — pro aktivem BL Reachability,
  Drucksache-ID-Format, Type-Filter, Datum-/Fraktion-Plausibilität,
  PDF-Link-HEAD-Probe (slow). NI als xfail (Login-Wall).
- Sub-Issue B (#52): test_frontend_xref.py + ground_truth.py — pro BL
  ein manuell kuratiertes Frontend-Sample (Drucksache + Title-Substring +
  Fraktionen + Datum + PDF-URL), gegen das adapter.get_document() gespiegelt
  wird. Fängt Bug-Klasse 14 (Cross-Bundesland-Match).
- Sub-Issue C (#53): test_wahlprogramme_indexed.py — Indexing-Status pro
  aktivem BL aus embeddings.db, PDF-Inhalts-Plausibilität (14 Marker +
  Wahlperioden-Horizont), expliziter Anti-Marker für Bug-Klasse 8
  (CDU-BE 2021 vs 2026 PDF-Tausch durch abgeordnetenwatch).
- Sub-Issue D (#54): test_citations_substring.py — Property-Verification:
  jedes vom LLM zitierte Snippet muss als (whitespace-normalisierter)
  Substring auf der angegebenen PDF-Seite vorhanden sein. Strict-Match
  mit Truncation-Marker-Toleranz, kein Fuzzy. Liest reale Assessments
  aus gwoe-antraege.db. Fängt Bug-Klassen 7/10/17 (Halluzination).

Architektur: separates tests/integration/ Verzeichnis mit eigenem
conftest.py, das die Stubs der Unit-Suite (fitz/bs4/openai/pydantic_settings)
gezielt entfernt und auf echte Module umstellt — mit Fallback-Skip via
pytest.require_module wenn lokale Dev-Maschine die Prod-Deps nicht hat.

206 neue Integration-Tests, 13 Helper-Unit-Tests. 77 Unit-Tests bleiben grün.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 10:00:20 +02:00
+								"""Sub-Issue C — Wahlprogramm Indexing-Status + PDF Content Verification.
 								Drei Test-Klassen:
 								C1 — Indexing-Status: jedes WAHLPROGRAMME-Eintrag eines aktiven BL muss
 								     in der embeddings.db als ≥1-Chunk-Programm vorhanden sein.
 								C2 — Inhalts-Plausibilität: jede registrierte PDF-Datei muss real auf der
 								     ersten Seite Marker für die richtige Wahlperiode + Partei + Programm-
 								     Typ enthalten. Inkl. Anti-Marker für die abgeordnetenwatch-PDF-Tausch-
 								     Bug-Klasse 8 (CDU BE 2023→2026).
 								C3 — Embeddings-Statistik: chunk-count > seiten/10 als grobe Heuristik
 								     gegen abgebrochene Indexierungen.
 								Bug-Klassen aus den letzten Sessions, die diese Datei abdeckt:
 								- 8 (abgeordnetenwatch tauscht PDF unter altem Slug)
 								- 11 (Wahlprogramm fehlt komplett im Index — heute morgen für 6 BL)
 								- 15 (embeddings-DB Chunks aus altem Programm-Slug)
 								Issue: #53 (Sub-Issue C des Umbrella #50)
 								Hinweis: dieser Test liest die **lokale** ``embeddings.db``, nicht die
 								prod-Container-DB. Wenn sie lokal nicht existiert, werden alle C1+C3
 								Tests automatisch xfailed. C2 (PDF-Inhalt) hängt nur von den PDF-Files
 								ab und läuft immer.
 								"""
 								from __future__ import annotations
 								import re
 								from collections import defaultdict
 								from pathlib import Path
 								from typing import Optional
 								import pytest
 								from app.bundeslaender import aktive_bundeslaender
 								from app.embeddings import EMBEDDINGS_DB, PROGRAMME, get_indexing_status
 								from app.wahlprogramme import REFERENZEN_PATH, WAHLPROGRAMME
 								pytestmark = pytest.mark.integration
 								# ─────────────────────────────────────────────────────────────────────────────
 								# Helpers
 								# ─────────────────────────────────────────────────────────────────────────────
 								def _pdf_pages_text(filename: str, n: Optional[int] = None) -> str:
 								    """Read the first ``n`` pages (or all pages) of a Wahlprogramm-PDF
 								    and return the concatenated text, normalised whitespace.
 								    Uses real PyMuPDF (fitz). Tests calling this helper must be ok with
 								    skipping if fitz isn't installed in the local environment.
 								    """
 								    pytest.require_module("fitz")  # set up by integration conftest
 								    import fitz
 								    path = REFERENZEN_PATH / filename
 								    if not path.exists():
 								        pytest.fail(f"PDF nicht gefunden: {path}")
 								    pdf = fitz.open(str(path))
 								    try:
 								        page_count = len(pdf) if n is None else min(n, len(pdf))
 								        chunks: list[str] = [pdf[i].get_text() for i in range(page_count)]
 								    finally:
 								        pdf.close()
 								    text = " ".join(chunks)
 								    # Normalise whitespace
 								    return re.sub(r"\s+", " ", text).strip()
 								# Backwards-compat alias for the older name still used in two tests below
 								def _pdf_first_pages_text(filename: str, n: int = 5) -> str:
 								    return _pdf_pages_text(filename, n=n)
 								def _all_active_wahlprogramme() -> list[tuple[str, str, dict]]:
 								    """List of (bundesland, partei, info) for every WAHLPROGRAMME entry of
 								    a currently active Bundesland. Used as parametrize input."""
 								    out: list[tuple[str, str, dict]] = []
 								    active_codes = {bl.code for bl in aktive_bundeslaender()}
 								    for bl, parteien in WAHLPROGRAMME.items():
 								        if bl not in active_codes:
 								            continue
 								        for partei, info in parteien.items():
 								            out.append((bl, partei, info))
 								    return out
 								_WAHLPROG_PARAMS = _all_active_wahlprogramme()
 								_WAHLPROG_IDS = [f"{bl}-{partei}" for bl, partei, _ in _WAHLPROG_PARAMS]
 								# ─────────────────────────────────────────────────────────────────────────────
 								# C1 — Indexing-Status pro aktivem BL
 								# ─────────────────────────────────────────────────────────────────────────────
 								def _embeddings_db_has_active_data() -> bool:
 								    """C1 + C3 sollen nur laufen, wenn die lokale embeddings.db
 								    mindestens für die heute aktiven Bundesländer Chunks enthält.
 								    Sonst (z.B. lokale Dev-Maschine ohne Indexing-Lauf, oder eine
 								    pre-#5-DB ohne bundesland-Spalte) skippen wir, damit der Test
 								    gegen die prod-DB im CI/Container läuft, nicht gegen einen
 								    halb-leeren lokalen Snapshot."""
 								    if not EMBEDDINGS_DB.exists():
 								        return False
 								    import sqlite3
 								    try:
 								        conn = sqlite3.connect(EMBEDDINGS_DB)
 								        try:
 								            cols = {row[1] for row in conn.execute("PRAGMA table_info(chunks)")}
 								            if "bundesland" not in cols:
 								                return False  # pre-#5 schema, no bundesland column
 								            rows = conn.execute(
 								                "SELECT bundesland, COUNT(DISTINCT programm_id) "
 								                "FROM chunks WHERE bundesland IS NOT NULL GROUP BY bundesland"
 								            ).fetchall()
 								        finally:
 								            conn.close()
 								    except sqlite3.Error:
 								        return False
 								    indexed_bls = {bl for bl, n in rows if n > 0}
 								    active_codes = {bl.code for bl in aktive_bundeslaender()}
 								    expected = {bl for bl in active_codes if bl in WAHLPROGRAMME}
 								    return expected.issubset(indexed_bls)
 								@pytest.mark.skipif(
 								    not _embeddings_db_has_active_data(),
 								    reason=(
 								        f"local {EMBEDDINGS_DB} hat nicht alle aktiven BL indexiert — "
 								        "C1/C3 laufen nur in einer Umgebung mit aktueller DB (prod-Container "
 								        "oder lokaler index_programm-Lauf)"
 								    ),
 								)
 								class TestIndexingStatus:
 								    """C1 — every WAHLPROGRAMME entry of an active BL must be indexed."""
 								    def test_no_active_bundesland_has_unindexed_wahlprogramme(self):
 								        status = get_indexing_status()
 								        indexed = {p["id"] for p in status["programmes"] if p["indexed"]}
 								        missing: list[str] = []
 								        for bl, partei, info in _all_active_wahlprogramme():
 								            pid = info["file"].rsplit(".", 1)[0]
 								            if pid not in indexed:
 								                missing.append(f"{bl}/{partei}: {pid}")
 								        assert not missing, (
 								            "Wahlprogramme aktiver Bundesländer fehlen in embeddings.db:\n  "
 								            + "\n  ".join(missing)
 								        )
 								    def test_every_indexed_chunk_belongs_to_known_programm_id(self):
 								        """Catches Bug-Klasse 15: stale chunks for programm_ids that no
 								        longer exist in PROGRAMME (e.g. after a slug rename)."""
 								        status = get_indexing_status()
 								        # status["programmes"] is iterated from PROGRAMME, so an orphan
 								        # would not appear there. Read the DB directly instead.
 								        import sqlite3
 								        conn = sqlite3.connect(EMBEDDINGS_DB)
 								        try:
 								            db_ids = {row[0] for row in conn.execute("SELECT DISTINCT programm_id FROM chunks")}
 								        finally:
 								            conn.close()
 								        orphans = sorted(db_ids - set(PROGRAMME.keys()))
 								        assert not orphans, (
 								            "embeddings.db enthält Chunks für unbekannte programm_id:\n  "
 								            + "\n  ".join(orphans)
 								        )
 								    def test_chunk_count_per_active_bundesland_is_reasonable(self):
 								        """C3 grob: pro aktivem BL erwartet man min. 100 chunks insgesamt
 								        (ein Wahlprogramm hat typisch 50–300 chunks). Fängt Bug-Klasse
 								        "Indexing crashte vorzeitig"."""
 								        status = get_indexing_status()
 								        per_bl: dict[str, int] = defaultdict(int)
 								        for p in status["programmes"]:
 								            info = PROGRAMME.get(p["id"], {})
 								            bl = info.get("bundesland")
 								            if bl:
 								                per_bl[bl] += p["chunks"]
 								        active_codes = {bl.code for bl in aktive_bundeslaender()}
 								        too_low = {bl: count for bl, count in per_bl.items() if bl in active_codes and count < 100}
 								        assert not too_low, (
 								            "Aktive Bundesländer mit verdächtig wenigen indexierten Chunks "
 								            f"(< 100, vermutlich abgebrochene Indexierung): {too_low}"
 								        )
 								# ─────────────────────────────────────────────────────────────────────────────
 								# C2 — Inhalts-Plausibilität pro PDF
 								# ─────────────────────────────────────────────────────────────────────────────
 								_PROGRAMM_MARKERS = (
 								    "wahlprogramm",
 								    "regierungsprogramm",
 								    "zukunftsprogramm",
 								    "landeswahlprogramm",
 								    "berlin-plan",
 								    "berlin plan",
 								    "wahlmanifest",
 								    "programm",  # very permissive, fallback
 								    "wahlperiode",
 								    "landtagswahl",
 								    "bürgerschaftswahl",
 								    "abgeordnetenhaus",
 								    "agh-wahl",
 								    "beschluss",  # many programs say "Beschluss vom DD.MM.YYYY"
 								)
 								@pytest.mark.parametrize(("bundesland", "partei", "info"), _WAHLPROG_PARAMS, ids=_WAHLPROG_IDS)
 								def test_pdf_contains_programm_marker(bundesland: str, partei: str, info: dict):
 								    """C2 — irgendwo im PDF muss ein Wahlprogramm-Marker-Wort vorkommen.
 								    Fängt versehentlich indexierte Nicht-Wahlprogramme (z.B. ein Geschäfts-
 								    bericht der Stiftung statt des Programms). Sehr permissiv: einer von
 								    14 Markern reicht. Strikter wird es im ``contains_wahljahr``-Test.
 								    """
 								    text = _pdf_pages_text(info["file"]).lower()
 								    matched = [m for m in _PROGRAMM_MARKERS if m in text]
 								    assert matched, (
 								        f"{bundesland}/{partei} ({info['file']}): keiner der Wahlprogramm-"
 								        f"Marker {_PROGRAMM_MARKERS} im ganzen PDF gefunden — vermutlich "
 								        "falsches PDF eingespielt"
 								    )
 								@pytest.mark.parametrize(("bundesland", "partei", "info"), _WAHLPROG_PARAMS, ids=_WAHLPROG_IDS)
 								def test_pdf_year_horizon_is_plausible(bundesland: str, partei: str, info: dict):
 								    """C2 — Plausibilitäts-Check über die Verteilung der Jahres-Marker
 								    im PDF.
 								    Bewusst keine "Wahljahr muss vorkommen"-Annahme — viele Programme
 								    nennen das Wahljahr selbst gar nicht (z.B. CDU-BE 2021 erwähnt es
 								    null mal, hat aber "Aufzüge bis 2026" als 5-Jahres-Forderung). Was
 								    wir stattdessen prüfen:
 								    - Mindestens **eine** Jahreszahl im erwarteten Wahlperioden-Horizont
 								      (Wahljahr ± 10) muss vorkommen — sonst ist es kein Programm zur
 								      passenden Wahl
 								    - Es darf **kein** Cluster von Jahren in einer DEUTLICH späteren
 								      Periode dominieren (z.B. ein "2031–2036"-Programm in einem File,
 								      das angeblich 2021er sein soll)
 								    """
 								    text = _pdf_pages_text(info["file"])
 								    years = [int(y) for y in re.findall(r"\b(20\d\d)\b", text)]
 								    expected = info["jahr"]
 								    # Bedingung 1: ≥ 1 Jahr aus dem erwarteten Horizont
 								    horizon = range(expected - 1, expected + 11)
 								    in_horizon = [y for y in years if y in horizon]
 								    assert in_horizon, (
 								        f"{bundesland}/{partei} ({info['file']}): kein einziges Jahr im "
 								        f"erwarteten Wahlperioden-Horizont {expected}..{expected + 10} "
 								        f"gefunden (gefunden: {sorted(set(years))[:10]}). PDF passt nicht "
 								        "zur erwarteten Wahlperiode — möglicher anachronistischer Tausch."
 								    )
 								    # Bedingung 2: keine deutlich spätere Wahlperiode dominiert
 								    later_horizon = range(expected + 5, expected + 15)
 								    later_count = sum(1 for y in years if y in later_horizon)
 								    horizon_count = sum(1 for y in years if y in horizon)
 								    if horizon_count > 0 and later_count > horizon_count * 3:
 								        pytest.fail(
 								            f"{bundesland}/{partei} ({info['file']}): mehr als 3× so viele "
 								            f"Jahres-Marker in der Folge-Wahlperiode {list(later_horizon)} "
 								            f"({later_count}) als im erwarteten Horizont ({horizon_count}) — "
 								            "stark anachronistisches PDF."
 								        )
 								def test_be_cdu_pdf_is_2021_program_not_2026():
 								    """Expliziter Anti-Marker für die im Issue #10-Kommentar dokumentierte
 								    Bug-Klasse 8: abgeordnetenwatch hat das ``cduwahlprogrammahw2021_0.pdf``-
 								    File potenziell nachträglich gegen den ``Berlin-Plan 2026`` ersetzt.
 								    Verifikation: das 2021er Programm der laufenden WP19 hat eine
 								    5-Jahres-Forderung-Sprache mit Zielen "bis 2026" oder ähnlich
 								    ("Aufzüge bis zum Jahr 2026" ist ein verifizierter Marker im
 								    aktuellen 2021er PDF). Das hypothetische 2026er hätte stattdessen
 								    Ziele "bis 2031".
 								    Wenn jemand in einem Folge-Build wieder denselben Slug zieht und
 								    abgeordnetenwatch zwischenzeitlich getauscht hat, schlägt dieser
 								    Test mit klarer Fehlermeldung fehl.
 								    """
 								    text = _pdf_pages_text("cdu-be-2023.pdf")
 								    # Positiv-Marker: 2021er Programm spricht über Ziele bis 2026
 								    assert "2026" in text, (
 								        "cdu-be-2023.pdf hat keinerlei '2026'-Marker — das passt zu "
 								        "keinem der erwarteten Programme (weder 2021er mit '5-Jahres-"
 								        "Horizont bis 2026' noch 2026er mit Beschluss-Datum 2026)"
 								    )
 								    # Anti-Marker: das hypothetische 2026er-Programm hätte einen
 								    # "2031"-Horizont (Wahlperiode 2026–2031)
 								    cnt_2031 = text.count("2031")
 								    cnt_2026 = text.count("2026")
 								    assert cnt_2031 < cnt_2026, (
 								        f"cdu-be-2023.pdf zählt mehr '2031' ({cnt_2031}) als '2026' "
 								        f"({cnt_2026}) — das passt zum 2026er Programm der WP20, NICHT "
 								        "zum 2021er Programm der laufenden WP19. Bitte das echte 2021er "
 								        "PDF aus FES/KAS-Archiv neu beschaffen."
 								    )