gwoe-antragspruefer/app/embeddings.py

"""Semantic search for Wahlprogramme and Parteiprogramme using Qwen embeddings."""

import json
import re
import sqlite3
import urllib.parse
from pathlib import Path
from typing import Optional

import fitz  # PyMuPDF
from openai import OpenAI

from .config import settings

# Embedding model
EMBEDDING_MODEL = "text-embedding-v3"
EMBEDDING_DIMENSIONS = 1024

# Database path
EMBEDDINGS_DB = settings.data_dir / "embeddings.db"

# Programme definitions
PROGRAMME = {
    # Wahlprogramme NRW 2022
    "spd-nrw-2022": {
        "name": "SPD NRW Wahlprogramm 2022",
        "typ": "wahlprogramm",
        "partei": "SPD",
        "bundesland": "NRW",
        "pdf": "spd-nrw-2022.pdf",
    },
    "cdu-nrw-2022": {
        "name": "CDU NRW Wahlprogramm 2022",
        "typ": "wahlprogramm",
        "partei": "CDU",
        "bundesland": "NRW",
        "pdf": "cdu-nrw-2022.pdf",
    },
    "gruene-nrw-2022": {
        "name": "Grüne NRW Wahlprogramm 2022",
        "typ": "wahlprogramm",
        "partei": "GRÜNE",
        "bundesland": "NRW",
        "pdf": "gruene-nrw-2022.pdf",
    },
    "fdp-nrw-2022": {
        "name": "FDP NRW Wahlprogramm 2022",
        "typ": "wahlprogramm",
        "partei": "FDP",
        "bundesland": "NRW",
        "pdf": "fdp-nrw-2022.pdf",
    },
    "afd-nrw-2022": {
        "name": "AfD NRW Wahlprogramm 2022",
        "typ": "wahlprogramm",
        "partei": "AfD",
        "bundesland": "NRW",
        "pdf": "afd-nrw-2022.pdf",
    },
    # Sachsen-Anhalt (LTW 2021)
    "cdu-lsa-2021": {
        "name": "CDU Sachsen-Anhalt Regierungsprogramm 2021",
        "typ": "wahlprogramm",
        "partei": "CDU",
        "bundesland": "LSA",
        "pdf": "cdu-lsa-2021.pdf",
    },
    "spd-lsa-2021": {
        "name": "SPD Sachsen-Anhalt Wahlprogramm 2021",
        "typ": "wahlprogramm",
        "partei": "SPD",
        "bundesland": "LSA",
        "pdf": "spd-lsa-2021.pdf",
    },
    "gruene-lsa-2021": {
        "name": "Grüne Sachsen-Anhalt Wahlprogramm 2021",
        "typ": "wahlprogramm",
        "partei": "GRÜNE",
        "bundesland": "LSA",
        "pdf": "gruene-lsa-2021.pdf",
    },
    "fdp-lsa-2021": {
        "name": "FDP Sachsen-Anhalt Wahlprogramm 2021",
        "typ": "wahlprogramm",
        "partei": "FDP",
        "bundesland": "LSA",
        "pdf": "fdp-lsa-2021.pdf",
    },
    "afd-lsa-2021": {
        "name": "AfD Sachsen-Anhalt Wahlprogramm 2021",
        "typ": "wahlprogramm",
        "partei": "AfD",
        "bundesland": "LSA",
        "pdf": "afd-lsa-2021.pdf",
    },
    "linke-lsa-2021": {
        "name": "DIE LINKE Sachsen-Anhalt Wahlprogramm 2021",
        "typ": "wahlprogramm",
        "partei": "LINKE",
        "bundesland": "LSA",
        "pdf": "linke-lsa-2021.pdf",
    },
    # Mecklenburg-Vorpommern (LTW 26.09.2021, WP 8) — Issue #4
    "cdu-mv-2021": {
        "name": "CDU Mecklenburg-Vorpommern Wahlprogramm 2021",
        "typ": "wahlprogramm",
        "partei": "CDU",
        "bundesland": "MV",
        "pdf": "cdu-mv-2021.pdf",
    },
    "spd-mv-2021": {
        "name": "SPD Mecklenburg-Vorpommern Regierungsprogramm 2021",
        "typ": "wahlprogramm",
        "partei": "SPD",
        "bundesland": "MV",
        "pdf": "spd-mv-2021.pdf",
    },
    "gruene-mv-2021": {
        "name": "Grüne Mecklenburg-Vorpommern Wahlprogramm 2021",
        "typ": "wahlprogramm",
        "partei": "GRÜNE",
        "bundesland": "MV",
        "pdf": "gruene-mv-2021.pdf",
    },
    "fdp-mv-2021": {
        "name": "FDP Mecklenburg-Vorpommern Wahlprogramm 2021",
        "typ": "wahlprogramm",
        "partei": "FDP",
        "bundesland": "MV",
        "pdf": "fdp-mv-2021.pdf",
    },
    "afd-mv-2021": {
        "name": "AfD Mecklenburg-Vorpommern Landeswahlprogramm 2021",
        "typ": "wahlprogramm",
        "partei": "AfD",
        "bundesland": "MV",
        "pdf": "afd-mv-2021.pdf",
    },
    "linke-mv-2021": {
        "name": "DIE LINKE Mecklenburg-Vorpommern Zukunftsprogramm 2021",
        "typ": "wahlprogramm",
        "partei": "LINKE",
        "bundesland": "MV",
        "pdf": "linke-mv-2021.pdf",
    },
    # Berlin (AGH-Wahl 26.09.2021, Wiederholung 12.02.2023, WP 19) —
    # Issue #10. Programme stammen aus dem Wahlkampf 2021 — die
    # Wiederholungswahl 2023 nutzte dieselben Programme.
    "cdu-be-2023": {
        "name": "CDU Berlin Berlin-Plan 2021",
        "typ": "wahlprogramm",
        "partei": "CDU",
        "bundesland": "BE",
        "pdf": "cdu-be-2023.pdf",
    },
    "spd-be-2023": {
        "name": "SPD Berlin Wahlprogramm AGH 2021",
        "typ": "wahlprogramm",
        "partei": "SPD",
        "bundesland": "BE",
        "pdf": "spd-be-2023.pdf",
    },
    "gruene-be-2023": {
        "name": "Grüne Berlin Landeswahlprogramm 2021",
        "typ": "wahlprogramm",
        "partei": "GRÜNE",
        "bundesland": "BE",
        "pdf": "gruene-be-2023.pdf",
    },
    "linke-be-2023": {
        "name": "DIE LINKE Berlin Wahlprogramm 2021",
        "typ": "wahlprogramm",
        "partei": "LINKE",
        "bundesland": "BE",
        "pdf": "linke-be-2023.pdf",
    },
    "afd-be-2023": {
        "name": "AfD Berlin Wahlprogramm AGH 2021",
        "typ": "wahlprogramm",
        "partei": "AfD",
        "bundesland": "BE",
        "pdf": "afd-be-2023.pdf",
    },
    # Thüringen — LTW 01.09.2024, WP 8 (Issue #37)
    "cdu-th-2024": {"name": "CDU Thüringen Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "TH", "pdf": "cdu-th-2024.pdf"},
    "afd-th-2024": {"name": "AfD Thüringen Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "TH", "pdf": "afd-th-2024.pdf"},
    "linke-th-2024": {"name": "DIE LINKE Thüringen Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "LINKE", "bundesland": "TH", "pdf": "linke-th-2024.pdf"},
    "bsw-th-2024": {"name": "BSW Thüringen Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "BSW", "bundesland": "TH", "pdf": "bsw-th-2024.pdf"},
    "spd-th-2024": {"name": "SPD Thüringen Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "TH", "pdf": "spd-th-2024.pdf"},
    # Brandenburg — LTW 22.09.2024, WP 8 (Issue #39)
    "spd-bb-2024": {"name": "SPD Brandenburg Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "BB", "pdf": "spd-bb-2024.pdf"},
    "afd-bb-2024": {"name": "AfD Brandenburg Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "BB", "pdf": "afd-bb-2024.pdf"},
    "cdu-bb-2024": {"name": "CDU Brandenburg Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "BB", "pdf": "cdu-bb-2024.pdf"},
    "bsw-bb-2024": {"name": "BSW Brandenburg Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "BSW", "bundesland": "BB", "pdf": "bsw-bb-2024.pdf"},
    # Hamburg — Bürgerschaftswahl 02.03.2025, WP 23 (Issue #40)
    "spd-hh-2025": {"name": "SPD Hamburg Wahlprogramm 2025", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "HH", "pdf": "spd-hh-2025.pdf"},
    "cdu-hh-2025": {"name": "CDU Hamburg Wahlprogramm 2025", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "HH", "pdf": "cdu-hh-2025.pdf"},
    "gruene-hh-2025": {"name": "Grüne Hamburg Regierungsprogramm 2025", "typ": "wahlprogramm", "partei": "GRÜNE", "bundesland": "HH", "pdf": "gruene-hh-2025.pdf"},
    "linke-hh-2025": {"name": "DIE LINKE Hamburg Wahlprogramm 2025", "typ": "wahlprogramm", "partei": "LINKE", "bundesland": "HH", "pdf": "linke-hh-2025.pdf"},
    "afd-hh-2025": {"name": "AfD Hamburg Wahlprogramm 2025", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "HH", "pdf": "afd-hh-2025.pdf"},
    # Schleswig-Holstein — LTW 08.05.2022, WP 20 (Issue #32)
    "cdu-sh-2022": {"name": "CDU Schleswig-Holstein Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "SH", "pdf": "cdu-sh-2022.pdf"},
    "spd-sh-2022": {"name": "SPD Schleswig-Holstein Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "SH", "pdf": "spd-sh-2022.pdf"},
    "gruene-sh-2022": {"name": "Grüne Schleswig-Holstein Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "GRÜNE", "bundesland": "SH", "pdf": "gruene-sh-2022.pdf"},
    "fdp-sh-2022": {"name": "FDP Schleswig-Holstein Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "FDP", "bundesland": "SH", "pdf": "fdp-sh-2022.pdf"},
    "ssw-sh-2022": {"name": "SSW Schleswig-Holstein Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "SSW", "bundesland": "SH", "pdf": "ssw-sh-2022.pdf"},
    # Baden-Württemberg — LTW 14.03.2021, WP 17 (Issue #41)
    "gruene-bw-2021": {"name": "Grüne Baden-Württemberg Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "GRÜNE", "bundesland": "BW", "pdf": "gruene-bw-2021.pdf"},
    "cdu-bw-2021": {"name": "CDU Baden-Württemberg Regierungsprogramm 2021", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "BW", "pdf": "cdu-bw-2021.pdf"},
    "afd-bw-2021": {"name": "AfD Baden-Württemberg Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "BW", "pdf": "afd-bw-2021.pdf"},
    "spd-bw-2021": {"name": "SPD Baden-Württemberg Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "BW", "pdf": "spd-bw-2021.pdf"},
    "fdp-bw-2021": {"name": "FDP Baden-Württemberg Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "FDP", "bundesland": "BW", "pdf": "fdp-bw-2021.pdf"},
    # Rheinland-Pfalz — LTW 14.03.2021, WP 18 (Issue #42)
    "spd-rp-2021": {"name": "SPD Rheinland-Pfalz Regierungsprogramm 2021", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "RP", "pdf": "spd-rp-2021.pdf"},
    "cdu-rp-2021": {"name": "CDU Rheinland-Pfalz Regierungsprogramm 2021", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "RP", "pdf": "cdu-rp-2021.pdf"},
    "afd-rp-2021": {"name": "AfD Rheinland-Pfalz Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "RP", "pdf": "afd-rp-2021.pdf"},
    "gruene-rp-2021": {"name": "Grüne Rheinland-Pfalz Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "GRÜNE", "bundesland": "RP", "pdf": "gruene-rp-2021.pdf"},
    "fw-rp-2021": {"name": "FREIE WÄHLER Rheinland-Pfalz Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "FREIE WÄHLER", "bundesland": "RP", "pdf": "fw-rp-2021.pdf"},
    "fdp-rp-2021": {"name": "FDP Rheinland-Pfalz Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "FDP", "bundesland": "RP", "pdf": "fdp-rp-2021.pdf"},
    # Grundsatzprogramme (Bund)
    "spd-grundsatz": {
        "name": "SPD Grundsatzprogramm 2007",
        "typ": "parteiprogramm",
        "partei": "SPD",
        "pdf": "spd-grundsatzprogramm.pdf",
    },
    "cdu-grundsatz": {
        "name": "CDU Grundsatzprogramm 2007",
        "typ": "parteiprogramm",
        "partei": "CDU",
        "pdf": "cdu-grundsatzprogramm.pdf",
    },
    "gruene-grundsatz": {
        "name": "Grüne Grundsatzprogramm 2020",
        "typ": "parteiprogramm",
        "partei": "GRÜNE",
        "pdf": "gruene-grundsatzprogramm.pdf",
    },
    "fdp-grundsatz": {
        "name": "FDP Grundsatzprogramm 2012",
        "typ": "parteiprogramm",
        "partei": "FDP",
        "pdf": "fdp-grundsatzprogramm.pdf",
    },
}


def init_embeddings_db():
    """Initialize the embeddings database.

    Includes a forward-only migration step (Issue #5): adds the
    ``bundesland`` column if missing and backfills existing rows from the
    ``PROGRAMME`` registry. Grundsatzprogramme (federal level) keep
    ``bundesland = NULL``; the ``find_relevant_chunks`` query treats NULL
    as "matches any state".
    """
    conn = sqlite3.connect(EMBEDDINGS_DB)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS chunks (
            id INTEGER PRIMARY KEY,
            programm_id TEXT NOT NULL,
            partei TEXT NOT NULL,
            typ TEXT NOT NULL,
            seite INTEGER,
            text TEXT NOT NULL,
            embedding BLOB NOT NULL,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)
    conn.execute("CREATE INDEX IF NOT EXISTS idx_chunks_partei ON chunks(partei)")
    conn.execute("CREATE INDEX IF NOT EXISTS idx_chunks_typ ON chunks(typ)")

    # Migration: bundesland-Spalte ergänzen, falls Tabelle aus Pre-#5-Zeit
    cols = {row[1] for row in conn.execute("PRAGMA table_info(chunks)").fetchall()}
    if "bundesland" not in cols:
        conn.execute("ALTER TABLE chunks ADD COLUMN bundesland TEXT")
        conn.execute("CREATE INDEX IF NOT EXISTS idx_chunks_bundesland ON chunks(bundesland)")

    # Backfill: Bundesland aus PROGRAMME-Registry für bestehende Zeilen
    # nachtragen. Grundsatzprogramme bleiben NULL.
    for prog_id, info in PROGRAMME.items():
        bl = info.get("bundesland")
        if bl is not None:
            conn.execute(
                "UPDATE chunks SET bundesland = ? WHERE programm_id = ? AND bundesland IS NULL",
                (bl, prog_id),
            )

    conn.commit()
    conn.close()


def get_client() -> OpenAI:
    """Get DashScope client."""
    return OpenAI(
        api_key=settings.dashscope_api_key,
        base_url=settings.dashscope_base_url,
    )


def create_embedding(text: str) -> list[float]:
    """Create embedding for text using Qwen."""
    client = get_client()
    response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=text,
        dimensions=EMBEDDING_DIMENSIONS,
    )
    return response.data[0].embedding


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """Split text into overlapping chunks by words."""
    words = text.split()
    chunks = []

    i = 0
    while i < len(words):
        chunk_words = words[i:i + chunk_size]
        chunk = " ".join(chunk_words)
        if chunk.strip():
            chunks.append(chunk)
        i += chunk_size - overlap

    return chunks


def extract_text_with_pages(pdf_path: Path) -> list[tuple[int, str]]:
    """Extract text from PDF with page numbers."""
    doc = fitz.open(pdf_path)
    pages = []

    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text()
        if text.strip():
            pages.append((page_num + 1, text))

    doc.close()
    return pages


def index_programm(programm_id: str, pdf_dir: Path) -> int:
    """Index a single program PDF into embeddings database."""
    if programm_id not in PROGRAMME:
        raise ValueError(f"Unknown program: {programm_id}")

    info = PROGRAMME[programm_id]
    pdf_path = pdf_dir / info["pdf"]

    if not pdf_path.exists():
        print(f"PDF not found: {pdf_path}")
        return 0

    conn = sqlite3.connect(EMBEDDINGS_DB)

    # Remove existing chunks for this program
    conn.execute("DELETE FROM chunks WHERE programm_id = ?", (programm_id,))

    # Extract and chunk
    pages = extract_text_with_pages(pdf_path)
    total_chunks = 0

    for page_num, page_text in pages:
        chunks = chunk_text(page_text, chunk_size=400, overlap=50)

        for chunk_text_content in chunks:
            if len(chunk_text_content.split()) < 20:  # Skip tiny chunks
                continue

            try:
                embedding = create_embedding(chunk_text_content)
                embedding_blob = json.dumps(embedding).encode()

                conn.execute("""
                    INSERT INTO chunks (programm_id, partei, typ, seite, text, embedding, bundesland)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                """, (
                    programm_id,
                    info["partei"],
                    info["typ"],
                    page_num,
                    chunk_text_content,
                    embedding_blob,
                    info.get("bundesland"),  # NULL für Grundsatzprogramme
                ))
                total_chunks += 1
            except Exception as e:
                print(f"Error embedding chunk: {e}")
                continue

    conn.commit()
    conn.close()

    print(f"Indexed {total_chunks} chunks from {programm_id}")
    return total_chunks


def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Calculate cosine similarity between two vectors."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x * x for x in a) ** 0.5
    norm_b = sum(x * x for x in b) ** 0.5
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)


def find_relevant_chunks(
    query: str,
    parteien: list[str] = None,
    typ: str = None,
    bundesland: str = None,
    top_k: int = 3,
    min_similarity: float = 0.5,
) -> list[dict]:
    """Find most relevant chunks for a query.

    Args:
        bundesland: Wenn gesetzt, werden nur Chunks dieses Bundeslands ODER
            globale Chunks (bundesland IS NULL, z.B. Grundsatzprogramme)
            berücksichtigt. Wenn None, kein Filter.
    """

    query_embedding = create_embedding(query)

    conn = sqlite3.connect(EMBEDDINGS_DB)
    conn.row_factory = sqlite3.Row

    # Build query
    sql = "SELECT * FROM chunks WHERE 1=1"
    params = []

    if parteien:
        placeholders = ",".join("?" * len(parteien))
        sql += f" AND partei IN ({placeholders})"
        params.extend(parteien)

    if typ:
        sql += " AND typ = ?"
        params.append(typ)

    if bundesland:
        # Bundesland-spezifische ODER globale Chunks (Grundsatzprogramme).
        sql += " AND (bundesland = ? OR bundesland IS NULL)"
        params.append(bundesland)

    rows = conn.execute(sql, params).fetchall()
    conn.close()

    # Calculate similarities
    results = []
    for row in rows:
        chunk_embedding = json.loads(row["embedding"])
        similarity = cosine_similarity(query_embedding, chunk_embedding)

        if similarity >= min_similarity:
            results.append({
                "programm_id": row["programm_id"],
                "partei": row["partei"],
                "typ": row["typ"],
                "seite": row["seite"],
                "text": row["text"],
                "similarity": similarity,
            })

    # Sort by similarity and return top_k
    results.sort(key=lambda x: x["similarity"], reverse=True)
    return results[:top_k]


def get_relevant_quotes_for_antrag(
    antrag_text: str,
    fraktionen: list[str],
    bundesland: str,
    top_k_per_partei: int = 2,
) -> dict[str, list[dict]]:
    """Get relevant quotes from Wahl- and Parteiprogramme for an Antrag.

    Args:
        bundesland: Pflicht. Bestimmt, welche Wahlprogramme durchsucht werden
            und welche Regierungsfraktionen zusätzlich zu den Antragstellern
            einbezogen werden.
    """
    # Lokaler Import vermeidet Zirkularität: bundeslaender.py importiert nichts
    # aus diesem Modul, aber der saubere Trennstrich bleibt erhalten.
    from .bundeslaender import BUNDESLAENDER

    if bundesland not in BUNDESLAENDER:
        raise ValueError(f"Unbekanntes Bundesland: {bundesland}")

    regierungsfraktionen = BUNDESLAENDER[bundesland].regierungsfraktionen
    parteien_to_search = list(dict.fromkeys(fraktionen + regierungsfraktionen))  # dedupe, Reihenfolge stabil

    results = {}

    from .parteien import normalize_partei

    for partei in parteien_to_search:
        # Kanonischer Lookup-Key über den zentralen Mapper (#55). Ersetzt
        # den alten Hack ``partei.upper() if partei != "GRÜNE" else "GRÜNE"``,
        # der nur die Schreibweisen-Drift in einer einzigen Partei
        # abgefangen hat. Wenn der Mapper nichts findet, fallen wir auf
        # den Originalstring zurück — die DB-Lookup-Schicht macht ohnehin
        # eigene Case-insensitive-Vergleiche.
        canonical = normalize_partei(partei, bundesland=bundesland)
        partei_lookup = canonical or partei

        # Wahlprogramm — bundesland-gefiltert
        wahl_chunks = find_relevant_chunks(
            antrag_text,
            parteien=[partei_lookup],
            typ="wahlprogramm",
            bundesland=bundesland,
            top_k=top_k_per_partei,
            min_similarity=0.45,
        )

        # Parteiprogramm (Grundsatz, federal — bundesland=NULL matched implizit)
        partei_chunks = find_relevant_chunks(
            antrag_text,
            parteien=[partei_lookup],
            typ="parteiprogramm",
            bundesland=bundesland,
            top_k=top_k_per_partei,
            min_similarity=0.45,
        )

        if wahl_chunks or partei_chunks:
            results[partei_lookup] = {
                "wahlprogramm": wahl_chunks,
                "parteiprogramm": partei_chunks,
            }

    return results


def _chunk_source_label(chunk: dict) -> str:
    """Build a fully-qualified source label like 'FDP MV Wahlprogramm 2021, S. 73'.

    Without the programme name + Bundesland in the prompt, the LLM
    halluzinates familiar sources from its training (typically NRW 2022)
    even when the retrieved chunks all come from a different state.
    """
    prog_id = chunk.get("programm_id", "")
    info = PROGRAMME.get(prog_id, {})
    name = info.get("name") or prog_id
    seite = chunk.get("seite", "?")
    return f"{name}, S. {seite}"


def _chunk_pdf_url(chunk: dict) -> Optional[str]:
    """Build the canonical PDF URL with page anchor for a chunk.

    Wenn der Chunk einen ``text`` enthält, wird stattdessen die
    Highlight-Endpoint-URL ``/api/wahlprogramm-cite?pid=…&seite=…&q=…``
    emittiert (Issue #47). Der Endpoint rendert die Wahlprogramm-Seite
    mit gelb markiertem Zitat und liefert ein 1-Seiten-PDF. Klick im
    Report öffnet die Quelle direkt mit visuell hervorgehobener Stelle.

    Fallback: ohne text → statische ``/static/referenzen/<pdf>#page=<n>``
    URL (rückwärts-kompatibel für Pre-#47 Assessments).
    """
    prog_id = chunk.get("programm_id", "")
    info = PROGRAMME.get(prog_id)
    if not info:
        return None
    pdf = info.get("pdf")
    if not pdf:
        return None
    seite = chunk.get("seite")
    text = (chunk.get("text") or "").strip()

    if text and seite:
        # Highlight-Endpoint mit URL-encoded query. Den Text auf 200 Zeichen
        # abschneiden — search_for matched ohnehin nur Substring-Anker, und
        # die URL bleibt bounded (sonst würden 500-Zeichen-Snippets in jeder
        # Zitat-URL stehen und das HTML-Report-JSON aufblähen).
        q = urllib.parse.quote_plus(text[:200])
        return f"/api/wahlprogramm-cite?pid={prog_id}&seite={seite}&q={q}"

    if seite:
        return f"/static/referenzen/{pdf}#page={seite}"
    return f"/static/referenzen/{pdf}"


def render_highlighted_page(programm_id: str, seite: int, query: str) -> Optional[bytes]:
    """Render a single Wahlprogramm-page with yellow highlights for a query.

    Used by the ``/api/wahlprogramm-cite`` endpoint to serve a one-page
    PDF where the cited snippet is visually highlighted via PyMuPDF
    ``add_highlight_annot``. Returns the serialized PDF bytes, or None
    if the programme/page can't be resolved.

    Args:
        programm_id: Key into PROGRAMME registry — validated by caller.
        seite: 1-indexed page number within the programme PDF.
        query: Snippet text to search and highlight on the page. Long
            queries are truncated to the first 200 characters before the
            search; PyMuPDF's ``search_for`` falls over on huge needles
            anyway and a short anchor is what we want for the visual hit.
    """
    info = PROGRAMME.get(programm_id)
    if not info:
        return None
    pdf_filename = info.get("pdf")
    if not pdf_filename:
        return None

    referenzen = Path(__file__).parent / "static" / "referenzen"
    pdf_path = referenzen / pdf_filename
    if not pdf_path.exists():
        return None

    needle = (query or "").strip()[:200]

    src = fitz.open(str(pdf_path))
    try:
        if seite < 1 or seite > len(src):
            return None

        # Single-page Sub-PDF erzeugen — hält den Response klein und
        # schließt versehentliche Cross-Page-Highlights aus.
        new = fitz.open()
        try:
            new.insert_pdf(src, from_page=seite - 1, to_page=seite - 1)
            page = new[0]

            if needle:
                # PyMuPDF ist tolerant gegen Whitespace, aber Soft-Hyphen
                # bricht den Match — analog zu _normalize_for_match
                # entfernen wir \xad vor dem search_for.
                clean = needle.replace("\u00ad", "")
                rects = page.search_for(clean)
                if not rects:
                    # Fallback: nur die ersten 5 Wörter als Anker — analog
                    # zu find_chunk_for_text. Wenn der LLM den Snippet
                    # mid-sentence gekürzt hat, bricht der Volltext-Match,
                    # aber 5-Wort-Sequenz findet die Stelle trotzdem.
                    words = clean.split()
                    if len(words) >= 5:
                        anchor = " ".join(words[:5])
                        rects = page.search_for(anchor)
                for rect in rects:
                    annot = page.add_highlight_annot(rect)
                    if annot is not None:
                        annot.set_colors(stroke=(1.0, 0.93, 0.0))  # gelb
                        annot.update()

            return new.tobytes()
        finally:
            new.close()
    finally:
        src.close()


# ─────────────────────────────────────────────────────────────────────────────
# Citation post-processing — Issue #60 Option B
#
# Pre-#60 the LLM was free to fabricate `quelle`/`url` strings even when the
# `text` was a real snippet from a retrieved chunk. The A+C fix made the
# prompt more strict, but BB 8/673 (post-deploy) showed the LLM still
# cross-mixed: it copied text from chunk Qn but wrote the page from chunk Qm
# in the `quelle` field.
#
# The structural fix is to take quelle/url generation away from the LLM
# entirely. After the LLM responds, we walk over every Zitat and try to
# locate its `text` (substring or 5-word anchor) in any of the chunks the
# LLM was actually shown. If we find a match, we *overwrite* quelle and url
# with the canonical values from that chunk. If we don't find a match, the
# Zitat is dropped — it cannot be backed by retrieved evidence.
# ─────────────────────────────────────────────────────────────────────────────


_RE_WHITESPACE = re.compile(r"\s+")
_RE_HYPHEN_BREAK = re.compile(r"(\w)-\s+(\w)")
_RE_TRUNCATION = re.compile(r"^\s*\.{2,}|\.{2,}\s*$")


def _normalize_for_match(text: str) -> str:
    """Lowercase, collapse whitespace, bridge soft-hyphen line breaks.

    Mirrors the matcher used in tests/integration/test_citations_substring.py
    so that the analyzer's post-processing and Sub-D's verification stay in
    lockstep.
    """
    s = (text or "").lower()
    s = _RE_TRUNCATION.sub("", s)
    s = s.replace("\u00ad", "")  # soft hyphen
    s = _RE_WHITESPACE.sub(" ", s).strip()
    prev = None
    while prev != s:
        prev = s
        s = _RE_HYPHEN_BREAK.sub(r"\1\2", s)
    return s


def find_chunk_for_text(text: str, chunks: list[dict]) -> Optional[dict]:
    """Locate the retrieved chunk that a Zitat snippet was copied from.

    Two-stage match identical to Sub-D:
      1. **Strict substring** — full needle as substring of any chunk.
      2. **5-word anchor** — any 5 consecutive words of the needle as
         substring of any chunk.

    Snippets shorter than 20 characters are rejected (too weak to bind).
    Returns the matching chunk dict, or None.
    """
    needle = _normalize_for_match(text)
    if len(needle) < 20:
        return None
    chunks_norm = [(c, _normalize_for_match(c.get("text", ""))) for c in chunks]
    for c, norm in chunks_norm:
        if needle in norm:
            return c
    words = needle.split()
    if len(words) < 5:
        return None
    for i in range(len(words) - 4):
        anchor = " ".join(words[i:i + 5])
        for c, norm in chunks_norm:
            if anchor in norm:
                return c
    return None


def reconstruct_zitate(data: dict, semantic_quotes: dict) -> dict:
    """Replace LLM-emitted quelle/url with canonical chunk values; drop unbacked.

    Walks over ``data['wahlprogrammScores'][i][kind]['zitate']`` (the raw
    LLM-output dict, not the Pydantic model). For each Zitat:

    * Locate the chunk whose text contains the snippet (or a 5-word anchor
      from it). Search across **all** retrieved chunks regardless of party,
      so cross-mixes between Q-IDs become invisible to the persisted output.
    * If found: overwrite ``quelle`` and ``url`` with values derived from
      the matching chunk's ``programm_id`` + ``seite``. The LLM is no longer
      trusted for these fields.
    * If not found: drop the Zitat entirely.

    Returns the same ``data`` dict (mutated in place) for chaining.
    """
    if not semantic_quotes:
        return data

    all_chunks: list[dict] = []
    for d in semantic_quotes.values():
        all_chunks.extend(d.get("wahlprogramm", []))
        all_chunks.extend(d.get("parteiprogramm", []))
    if not all_chunks:
        return data

    for fs in data.get("wahlprogrammScores", []) or []:
        for kind in ("wahlprogramm", "parteiprogramm"):
            blk = fs.get(kind) or {}
            zitate = blk.get("zitate") or []
            cleaned = []
            for z in zitate:
                text = z.get("text", "")
                matched = find_chunk_for_text(text, all_chunks)
                if matched is None:
                    continue
                z["quelle"] = _chunk_source_label(matched)
                url = _chunk_pdf_url(matched)
                if url:
                    z["url"] = url
                cleaned.append(z)
            blk["zitate"] = cleaned
    return data


def format_quotes_for_prompt(quotes: dict) -> str:
    """Format quotes for inclusion in LLM prompt.

    Each chunk gets a stable ENUM-ID ([Q1], [Q2], …) and the prompt
    instructs the LLM to anchor every citation in one of those IDs and
    to copy the snippet **verbatim** from the cited chunk. This is the
    structural fix for Issue #60: pre-#60 the LLM was free to invent
    snippets under real source labels because nothing in the prompt
    bound a citation to a specific retrieved chunk.

    Each quote is annotated with the fully-qualified source (programme
    name + page) so the LLM cannot fall back on training-set defaults
    when constructing its citations.
    """
    if not quotes:
        return ""

    lines = ["\n## Relevante Passagen aus Wahl- und Parteiprogrammen\n"]
    lines.append(
        "**ZITATEREGEL** — verbindlich für alle Zitate in `wahlprogramm`/"
        "`parteiprogramm`-Blöcken:\n"
        "1. Jedes Zitat MUSS auf genau einen der unten aufgelisteten "
        "Chunks verweisen (Format `[Q1]`, `[Q2]`, …).\n"
        "2. Der `text`-String MUSS eine **wörtliche, zusammenhängende** "
        "Passage von mindestens 5 Wörtern aus genau diesem Chunk sein — "
        "keine Paraphrasen, keine Zusammenfassungen, keine "
        "Cross-References aus dem Gedächtnis.\n"
        "3. Der `quelle`-String MUSS exakt das Source-Label des "
        "gewählten Chunks sein (Programm-Name + Seitenzahl, wie unten "
        "ausgeschrieben).\n"
        "4. Wenn kein Chunk wirklich passt: lass das Zitat-Array leer. "
        "Lieber 0 Zitate als ein erfundenes Zitat.\n"
    )

    counter = 0
    for partei, data in quotes.items():
        lines.append(f"\n### {partei}\n")

        if data.get("wahlprogramm"):
            lines.append("**Wahlprogramm:**")
            for chunk in data["wahlprogramm"]:
                counter += 1
                text = chunk["text"][:500] + "..." if len(chunk["text"]) > 500 else chunk["text"]
                lines.append(f'- [Q{counter}] {_chunk_source_label(chunk)}: "{text}"')

        if data.get("parteiprogramm"):
            lines.append("\n**Grundsatzprogramm:**")
            for chunk in data["parteiprogramm"]:
                counter += 1
                text = chunk["text"][:500] + "..." if len(chunk["text"]) > 500 else chunk["text"]
                lines.append(f'- [Q{counter}] {_chunk_source_label(chunk)}: "{text}"')

    return "\n".join(lines)


def get_programme_info() -> list[dict]:
    """Get list of all indexed programmes with metadata."""
    info_list = []

    for prog_id, info in PROGRAMME.items():
        info_list.append({
            "id": prog_id,
            "name": info["name"],
            "typ": info["typ"],
            "partei": info["partei"],
            "bundesland": info.get("bundesland"),
            "pdf": info["pdf"],
            "pdf_url": f"/static/referenzen/{info['pdf']}",
        })

    return info_list


def get_indexing_status() -> dict:
    """Get status of indexed programmes."""
    if not EMBEDDINGS_DB.exists():
        return {"indexed": 0, "programmes": []}

    conn = sqlite3.connect(EMBEDDINGS_DB)

    # Count chunks per program
    rows = conn.execute("""
        SELECT programm_id, COUNT(*) as chunks
        FROM chunks
        GROUP BY programm_id
    """).fetchall()

    conn.close()

    indexed = {row[0]: row[1] for row in rows}

    programmes = []
    for prog_id, info in PROGRAMME.items():
        programmes.append({
            "id": prog_id,
            "name": info["name"],
            "partei": info["partei"],
            "chunks": indexed.get(prog_id, 0),
            "indexed": prog_id in indexed,
        })

    return {
        "indexed": len(indexed),
        "total": len(PROGRAMME),
        "programmes": programmes,
    }