# gwoe-antragspruefer/tests/integration/test_wahlprogramme_indexed.py

"""Sub-Issue C — Wahlprogramm Indexing-Status + PDF Content Verification.

Drei Test-Klassen:

C1 — Indexing-Status: jedes WAHLPROGRAMME-Eintrag eines aktiven BL muss
     in der embeddings.db als ≥1-Chunk-Programm vorhanden sein.

C2 — Inhalts-Plausibilität: jede registrierte PDF-Datei muss irgendwo im
     Volltext einen Programm-Typ-Marker sowie Jahreszahlen im Horizont der
     erwarteten Wahlperiode enthalten (ein Partei-Marker wird derzeit nicht
     geprüft). Inkl. Anti-Marker für die abgeordnetenwatch-PDF-Tausch-
     Bug-Klasse 8 (CDU BE 2023→2026).

C3 — Embeddings-Statistik: pro aktivem BL mindestens 100 Chunks insgesamt
     als grobe Heuristik gegen abgebrochene Indexierungen.

Bug-Klassen aus den letzten Sessions, die diese Datei abdeckt:
- 8 (abgeordnetenwatch tauscht PDF unter altem Slug)
- 11 (Wahlprogramm fehlt komplett im Index — heute morgen für 6 BL)
- 15 (embeddings-DB Chunks aus altem Programm-Slug)

Issue: #53 (Sub-Issue C des Umbrella #50)

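Hinweis: dieser Test liest die **lokale** ``embeddings.db``, nicht die
prod-Container-DB. Wenn sie lokal nicht existiert, noch keine
``bundesland``-Spalte hat oder nicht alle aktiven BL abdeckt, werden alle
C1+C3 Tests automatisch geskippt (``skipif``). C2 (PDF-Inhalt) hängt nur
von den PDF-Files (und PyMuPDF) ab und läuft unabhängig von der DB.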
"""
from __future__ import annotations

import re
from collections import defaultdict
from pathlib import Path
from typing import Optional

import pytest

from app.bundeslaender import aktive_bundeslaender
from app.embeddings import EMBEDDINGS_DB, PROGRAMME, get_indexing_status
from app.wahlprogramme import REFERENZEN_PATH, WAHLPROGRAMME


pytestmark = pytest.mark.integration


# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────


def _pdf_pages_text(filename: str, n: Optional[int] = None) -> str:
    """Return the whitespace-normalised text of a Wahlprogramm PDF.

    Args:
        filename: File name of the PDF, resolved relative to
            ``REFERENZEN_PATH``.
        n: Maximum number of leading pages to read. ``None`` (the default)
            reads every page; a value above the page count is clamped.

    Returns:
        The text of the selected pages joined with single spaces, every run
        of whitespace collapsed to one space, leading/trailing space removed.

    The calling test is failed via ``pytest.fail`` when the file is missing.
    """
    # NOTE(review): ``require_module`` is not part of stock pytest; per the
    # original comment it is set up by the integration conftest — presumably
    # it skips when PyMuPDF is unavailable, confirm there.
    pytest.require_module("fitz")  # set up by integration conftest
    import fitz

    pdf_path = REFERENZEN_PATH / filename
    if not pdf_path.exists():
        pytest.fail(f"PDF nicht gefunden: {pdf_path}")

    document = fitz.open(str(pdf_path))
    try:
        total_pages = len(document)
        limit = total_pages if n is None else min(n, total_pages)
        page_texts: list[str] = []
        for index in range(limit):
            page_texts.append(document[index].get_text())
    finally:
        # Always release the file handle, even if text extraction raises.
        document.close()

    # Collapse newlines/tabs/multiple blanks so substring checks are stable.
    return re.sub(r"\s+", " ", " ".join(page_texts)).strip()


# Legacy name kept as a thin wrapper around ``_pdf_pages_text``.
# NOTE(review): no caller of this alias is visible in this module — confirm
# nothing else relies on it before removing.
def _pdf_first_pages_text(filename: str, n: int = 5) -> str:
    """Return the normalised text of the first ``n`` pages (default: 5)."""
    first_pages = _pdf_pages_text(filename, n=n)
    return first_pages


def _all_active_wahlprogramme() -> list[tuple[str, str, dict]]:
    """Collect ``(bundesland, partei, info)`` for every WAHLPROGRAMME entry
    whose Bundesland is currently active.

    The triples keep the iteration order of ``WAHLPROGRAMME`` and serve as
    parametrize input for the PDF tests.
    """
    active = {land.code for land in aktive_bundeslaender()}
    return [
        (code, party, details)
        for code, parties in WAHLPROGRAMME.items()
        if code in active
        for party, details in parties.items()
    ]


# Parametrize inputs, computed once at import (collection) time.
_WAHLPROG_PARAMS = _all_active_wahlprogramme()
# Human-readable test ids of the form "<bundesland>-<partei>".
_WAHLPROG_IDS = [f"{code}-{party}" for code, party, _info in _WAHLPROG_PARAMS]


# ─────────────────────────────────────────────────────────────────────────────
# C1 — Indexing-Status pro aktivem BL
# ─────────────────────────────────────────────────────────────────────────────


def _embeddings_db_has_active_data() -> bool:
    """Tell whether the local embeddings.db covers every active Bundesland.

    Used as the skip gate for the C1/C3 tests. Returns ``False`` when

    - the database file does not exist,
    - the ``chunks`` table has no ``bundesland`` column (older schema),
    - any SQLite error occurs while inspecting the database, or
    - at least one active Bundesland that has WAHLPROGRAMME entries has no
      indexed programme in the database.

    Otherwise returns ``True``.
    """
    if not EMBEDDINGS_DB.exists():
        return False
    import sqlite3

    try:
        connection = sqlite3.connect(EMBEDDINGS_DB)
        try:
            # PRAGMA table_info rows carry the column name at index 1.
            column_names = {
                column[1] for column in connection.execute("PRAGMA table_info(chunks)")
            }
            if "bundesland" not in column_names:
                return False  # schema without a bundesland column
            programme_counts = connection.execute(
                "SELECT bundesland, COUNT(DISTINCT programm_id) "
                "FROM chunks WHERE bundesland IS NOT NULL GROUP BY bundesland"
            ).fetchall()
        finally:
            connection.close()
    except sqlite3.Error:
        return False

    indexed = {code for code, programmes in programme_counts if programmes > 0}
    # Only active Bundesländer that actually have registered programmes count.
    required = {
        land.code for land in aktive_bundeslaender() if land.code in WAHLPROGRAMME
    }
    return required <= indexed


@pytest.mark.skipif(
    not _embeddings_db_has_active_data(),
    reason=(
        f"local {EMBEDDINGS_DB} hat nicht alle aktiven BL indexiert — "
        "C1/C3 laufen nur in einer Umgebung mit aktueller DB (prod-Container "
        "oder lokaler index_programm-Lauf)"
    ),
)
class TestIndexingStatus:
    """C1/C3 — indexing checks against the local ``embeddings.db``.

    The whole class is skipped when ``_embeddings_db_has_active_data()``
    reports that the local database does not cover every active Bundesland.
    """

    def test_no_active_bundesland_has_unindexed_wahlprogramme(self):
        """C1: every WAHLPROGRAMME entry of an active BL must be indexed.

        The programme id is the registered file name without its extension;
        it must appear among the programmes that ``get_indexing_status()``
        reports as indexed.
        """
        status = get_indexing_status()
        indexed = {p["id"] for p in status["programmes"] if p["indexed"]}

        missing: list[str] = []
        for bl, partei, info in _all_active_wahlprogramme():
            pid = info["file"].rsplit(".", 1)[0]
            if pid not in indexed:
                missing.append(f"{bl}/{partei}: {pid}")
        assert not missing, (
            "Wahlprogramme aktiver Bundesländer fehlen in embeddings.db:\n  "
            + "\n  ".join(missing)
        )

    def test_every_indexed_chunk_belongs_to_known_programm_id(self):
        """Catches Bug-Klasse 15: stale chunks for programm_ids that no
        longer exist in PROGRAMME (e.g. after a slug rename)."""
        # get_indexing_status() is deliberately not used here: per the
        # original note its "programmes" list is iterated from PROGRAMME, so
        # an orphan would never appear there. Read the DB directly instead.
        import sqlite3

        conn = sqlite3.connect(EMBEDDINGS_DB)
        try:
            db_ids = {row[0] for row in conn.execute("SELECT DISTINCT programm_id FROM chunks")}
        finally:
            conn.close()

        orphans = sorted(db_ids - set(PROGRAMME.keys()))
        assert not orphans, (
            "embeddings.db enthält Chunks für unbekannte programm_id:\n  "
            + "\n  ".join(orphans)
        )

    def test_chunk_count_per_active_bundesland_is_reasonable(self):
        """C3 (coarse): each active BL must total at least 100 chunks.

        Per the original author's note a single Wahlprogramm typically
        yields 50–300 chunks, so a lower total hints at an indexing run that
        aborted early. Only Bundesländer that appear via a ``bundesland``
        entry in PROGRAMME are summed and checked.
        """
        status = get_indexing_status()
        per_bl: dict[str, int] = defaultdict(int)
        for p in status["programmes"]:
            info = PROGRAMME.get(p["id"], {})
            bl = info.get("bundesland")
            if bl:
                per_bl[bl] += p["chunks"]

        active_codes = {bl.code for bl in aktive_bundeslaender()}
        too_low = {bl: count for bl, count in per_bl.items() if bl in active_codes and count < 100}
        assert not too_low, (
            "Aktive Bundesländer mit verdächtig wenigen indexierten Chunks "
            f"(< 100, vermutlich abgebrochene Indexierung): {too_low}"
        )


# ─────────────────────────────────────────────────────────────────────────────
# C2 — Inhalts-Plausibilität pro PDF
# ─────────────────────────────────────────────────────────────────────────────


# Lower-case marker substrings for ``test_pdf_contains_programm_marker``:
# the test passes as soon as any one of them occurs in the lower-cased PDF
# text. The tuple is also echoed verbatim in that test's failure message.
_PROGRAMM_MARKERS = (
    "wahlprogramm",
    "regierungsprogramm",
    "zukunftsprogramm",
    "landeswahlprogramm",
    "berlin-plan",
    "berlin plan",
    "wahlmanifest",
    "programm",  # very permissive fallback; substring of every "…programm" marker above
    "wahlperiode",
    "landtagswahl",
    "bürgerschaftswahl",
    "abgeordnetenhaus",
    "agh-wahl",
    "beschluss",  # many programs say "Beschluss vom DD.MM.YYYY"
)


@pytest.mark.parametrize(("bundesland", "partei", "info"), _WAHLPROG_PARAMS, ids=_WAHLPROG_IDS)
def test_pdf_contains_programm_marker(bundesland: str, partei: str, info: dict):
    """C2 — the PDF must contain at least one Wahlprogramm marker word.

    Guards against a non-programme document being registered by mistake
    (e.g. an annual report instead of the programme). Deliberately
    permissive: a single hit from ``_PROGRAMM_MARKERS`` anywhere in the
    lower-cased full text is enough. The stricter period check lives in
    ``test_pdf_year_horizon_is_plausible``.
    """
    haystack = _pdf_pages_text(info["file"]).lower()
    hits = [marker for marker in _PROGRAMM_MARKERS if marker in haystack]
    assert hits, (
        f"{bundesland}/{partei} ({info['file']}): keiner der Wahlprogramm-"
        f"Marker {_PROGRAMM_MARKERS} im ganzen PDF gefunden — vermutlich "
        "falsches PDF eingespielt"
    )


@pytest.mark.parametrize(("bundesland", "partei", "info"), _WAHLPROG_PARAMS, ids=_WAHLPROG_IDS)
def test_pdf_year_horizon_is_plausible(bundesland: str, partei: str, info: dict):
    """C2 — plausibility check on the distribution of year numbers in the PDF.

    There is deliberately no "the election year itself must appear" rule:
    per the original note many programmes never mention it (e.g. CDU-BE 2021
    does not, but demands "Aufzüge bis 2026" as a five-year goal). Instead:

    1. At least one 20xx year inside the expected horizon — election year
       minus 1 through election year plus 10, inclusive — must occur.
       Otherwise the PDF does not belong to the expected election.
    2. Years from a clearly later window (election year plus 5 through plus
       14) must not outnumber the horizon years by more than 3:1. The two
       windows overlap in plus 5..plus 10, so those years count on both sides.

    ``info["jahr"]`` supplies the expected election year and
    ``info["file"]`` the PDF file name.
    """
    text = _pdf_pages_text(info["file"])
    years = [int(y) for y in re.findall(r"\b(20\d\d)\b", text)]
    expected = info["jahr"]

    # Condition 1: at least one year from the expected horizon.
    horizon = range(expected - 1, expected + 11)
    in_horizon = [y for y in years if y in horizon]
    # The message derives its bounds from the range itself so it cannot drift
    # from the window that is actually checked.
    assert in_horizon, (
        f"{bundesland}/{partei} ({info['file']}): kein einziges Jahr im "
        f"erwarteten Wahlperioden-Horizont {horizon[0]}..{horizon[-1]} "
        f"gefunden (gefunden: {sorted(set(years))[:10]}). PDF passt nicht "
        "zur erwarteten Wahlperiode — möglicher anachronistischer Tausch."
    )

    # Condition 2: no clearly later election period dominates.
    later_horizon = range(expected + 5, expected + 15)
    later_count = sum(1 for y in years if y in later_horizon)
    # Non-zero here: the assertion above guarantees at least one horizon hit.
    horizon_count = len(in_horizon)
    if later_count > horizon_count * 3:
        pytest.fail(
            f"{bundesland}/{partei} ({info['file']}): mehr als 3× so viele "
            f"Jahres-Marker in der Folge-Wahlperiode {list(later_horizon)} "
            f"({later_count}) als im erwarteten Horizont ({horizon_count}) — "
            "stark anachronistisches PDF."
        )


def test_be_cdu_pdf_is_2021_program_not_2026():
    """Explicit anti-marker for Bug-Klasse 8 (documented in the issue #10
    comment): abgeordnetenwatch may have replaced the
    ``cduwahlprogrammahw2021_0.pdf`` file with the ``Berlin-Plan 2026``
    after the fact.

    Per the original note, the 2021 programme of the running WP19 states
    five-year goals "bis 2026" ("Aufzüge bis zum Jahr 2026" was verified in
    the current PDF), whereas a hypothetical 2026 programme would target
    "bis 2031". The test therefore requires

    - at least one occurrence of "2026" in the full text, and
    - strictly fewer occurrences of "2031" than of "2026" (a tie fails too).

    If a later build pulls the same slug after a swap upstream, this test
    fails with an explicit message.
    """
    full_text = _pdf_pages_text("cdu-be-2023.pdf")
    count_2026 = full_text.count("2026")
    count_2031 = full_text.count("2031")

    # Positive marker: the 2021 programme talks about goals up to 2026.
    assert count_2026 > 0, (
        "cdu-be-2023.pdf hat keinerlei '2026'-Marker — das passt zu "
        "keinem der erwarteten Programme (weder 2021er mit '5-Jahres-"
        "Horizont bis 2026' noch 2026er mit Beschluss-Datum 2026)"
    )
    # Anti-marker: a 2026 programme would be dominated by the "2031" horizon
    # (Wahlperiode 2026–2031).
    assert count_2026 > count_2031, (
        f"cdu-be-2023.pdf zählt mehr '2031' ({count_2031}) als '2026' "
        f"({count_2026}) — das passt zum 2026er Programm der WP20, NICHT "
        "zum 2021er Programm der laufenden WP19. Bitte das echte 2021er "
        "PDF aus FES/KAS-Archiv neu beschaffen."
    )