"""Semantic search for Wahlprogramme and Parteiprogramme using Qwen embeddings.""" import json import logging import re logger = logging.getLogger(__name__) import sqlite3 import urllib.parse from pathlib import Path from typing import Optional import fitz # PyMuPDF from openai import OpenAI from .config import settings # Embedding model EMBEDDING_MODEL = "text-embedding-v3" EMBEDDING_DIMENSIONS = 1024 # Database path EMBEDDINGS_DB = settings.data_dir / "embeddings.db" # Programme definitions PROGRAMME = { # Wahlprogramme NRW 2022 "spd-nrw-2022": { "name": "SPD NRW Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "NRW", "pdf": "spd-nrw-2022.pdf", }, "cdu-nrw-2022": { "name": "CDU NRW Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "NRW", "pdf": "cdu-nrw-2022.pdf", }, "gruene-nrw-2022": { "name": "Grüne NRW Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "GRÜNE", "bundesland": "NRW", "pdf": "gruene-nrw-2022.pdf", }, "fdp-nrw-2022": { "name": "FDP NRW Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "FDP", "bundesland": "NRW", "pdf": "fdp-nrw-2022.pdf", }, "afd-nrw-2022": { "name": "AfD NRW Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "NRW", "pdf": "afd-nrw-2022.pdf", }, # Sachsen-Anhalt (LTW 2021) "cdu-lsa-2021": { "name": "CDU Sachsen-Anhalt Regierungsprogramm 2021", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "LSA", "pdf": "cdu-lsa-2021.pdf", }, "spd-lsa-2021": { "name": "SPD Sachsen-Anhalt Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "LSA", "pdf": "spd-lsa-2021.pdf", }, "gruene-lsa-2021": { "name": "Grüne Sachsen-Anhalt Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "GRÜNE", "bundesland": "LSA", "pdf": "gruene-lsa-2021.pdf", }, "fdp-lsa-2021": { "name": "FDP Sachsen-Anhalt Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "FDP", "bundesland": "LSA", "pdf": "fdp-lsa-2021.pdf", }, "afd-lsa-2021": { "name": "AfD Sachsen-Anhalt Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "LSA", "pdf": "afd-lsa-2021.pdf", }, "linke-lsa-2021": { "name": "DIE LINKE Sachsen-Anhalt Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "LINKE", "bundesland": "LSA", "pdf": "linke-lsa-2021.pdf", }, # Mecklenburg-Vorpommern (LTW 26.09.2021, WP 8) — Issue #4 "cdu-mv-2021": { "name": "CDU Mecklenburg-Vorpommern Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "MV", "pdf": "cdu-mv-2021.pdf", }, "spd-mv-2021": { "name": "SPD Mecklenburg-Vorpommern Regierungsprogramm 2021", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "MV", "pdf": "spd-mv-2021.pdf", }, "gruene-mv-2021": { "name": "Grüne Mecklenburg-Vorpommern Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "GRÜNE", "bundesland": "MV", "pdf": "gruene-mv-2021.pdf", }, "fdp-mv-2021": { "name": "FDP Mecklenburg-Vorpommern Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "FDP", "bundesland": "MV", "pdf": "fdp-mv-2021.pdf", }, "afd-mv-2021": { "name": "AfD Mecklenburg-Vorpommern Landeswahlprogramm 2021", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "MV", "pdf": "afd-mv-2021.pdf", }, "linke-mv-2021": { "name": "DIE LINKE Mecklenburg-Vorpommern Zukunftsprogramm 2021", "typ": "wahlprogramm", "partei": "LINKE", "bundesland": "MV", "pdf": "linke-mv-2021.pdf", }, # Berlin (AGH-Wahl 26.09.2021, Wiederholung 12.02.2023, WP 19) — # Issue #10. Programme stammen aus dem Wahlkampf 2021 — die # Wiederholungswahl 2023 nutzte dieselben Programme. "cdu-be-2023": { "name": "CDU Berlin Berlin-Plan 2021", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "BE", "pdf": "cdu-be-2023.pdf", }, "spd-be-2023": { "name": "SPD Berlin Wahlprogramm AGH 2021", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "BE", "pdf": "spd-be-2023.pdf", }, "gruene-be-2023": { "name": "Grüne Berlin Landeswahlprogramm 2021", "typ": "wahlprogramm", "partei": "GRÜNE", "bundesland": "BE", "pdf": "gruene-be-2023.pdf", }, "linke-be-2023": { "name": "DIE LINKE Berlin Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "LINKE", "bundesland": "BE", "pdf": "linke-be-2023.pdf", }, "afd-be-2023": { "name": "AfD Berlin Wahlprogramm AGH 2021", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "BE", "pdf": "afd-be-2023.pdf", }, # Thüringen — LTW 01.09.2024, WP 8 (Issue #37) "cdu-th-2024": {"name": "CDU Thüringen Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "TH", "pdf": "cdu-th-2024.pdf"}, "afd-th-2024": {"name": "AfD Thüringen Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "TH", "pdf": "afd-th-2024.pdf"}, "linke-th-2024": {"name": "DIE LINKE Thüringen Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "LINKE", "bundesland": "TH", "pdf": "linke-th-2024.pdf"}, "bsw-th-2024": {"name": "BSW Thüringen Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "BSW", "bundesland": "TH", "pdf": "bsw-th-2024.pdf"}, "spd-th-2024": {"name": "SPD Thüringen Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "TH", "pdf": "spd-th-2024.pdf"}, # Brandenburg — LTW 22.09.2024, WP 8 (Issue #39) "spd-bb-2024": {"name": "SPD Brandenburg Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "BB", "pdf": "spd-bb-2024.pdf"}, "afd-bb-2024": {"name": "AfD Brandenburg Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "BB", "pdf": "afd-bb-2024.pdf"}, "cdu-bb-2024": {"name": "CDU Brandenburg Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "BB", "pdf": "cdu-bb-2024.pdf"}, "bsw-bb-2024": {"name": "BSW Brandenburg Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "BSW", "bundesland": "BB", "pdf": "bsw-bb-2024.pdf"}, # Hamburg — Bürgerschaftswahl 02.03.2025, WP 23 (Issue #40) "spd-hh-2025": {"name": "SPD Hamburg Wahlprogramm 2025", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "HH", "pdf": "spd-hh-2025.pdf"}, "cdu-hh-2025": {"name": "CDU Hamburg Wahlprogramm 2025", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "HH", "pdf": "cdu-hh-2025.pdf"}, "gruene-hh-2025": {"name": "Grüne Hamburg Regierungsprogramm 2025", "typ": "wahlprogramm", "partei": "GRÜNE", "bundesland": "HH", "pdf": "gruene-hh-2025.pdf"}, "linke-hh-2025": {"name": "DIE LINKE Hamburg Wahlprogramm 2025", "typ": "wahlprogramm", "partei": "LINKE", "bundesland": "HH", "pdf": "linke-hh-2025.pdf"}, "afd-hh-2025": {"name": "AfD Hamburg Wahlprogramm 2025", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "HH", "pdf": "afd-hh-2025.pdf"}, # Schleswig-Holstein — LTW 08.05.2022, WP 20 (Issue #32) "cdu-sh-2022": {"name": "CDU Schleswig-Holstein Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "SH", "pdf": "cdu-sh-2022.pdf"}, "spd-sh-2022": {"name": "SPD Schleswig-Holstein Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "SH", "pdf": "spd-sh-2022.pdf"}, "gruene-sh-2022": {"name": "Grüne Schleswig-Holstein Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "GRÜNE", "bundesland": "SH", "pdf": "gruene-sh-2022.pdf"}, "fdp-sh-2022": {"name": "FDP Schleswig-Holstein Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "FDP", "bundesland": "SH", "pdf": "fdp-sh-2022.pdf"}, "ssw-sh-2022": {"name": "SSW Schleswig-Holstein Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "SSW", "bundesland": "SH", "pdf": "ssw-sh-2022.pdf"}, # Baden-Württemberg — LTW 14.03.2021, WP 17 (Issue #41) "gruene-bw-2021": {"name": "Grüne Baden-Württemberg Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "GRÜNE", "bundesland": "BW", "pdf": "gruene-bw-2021.pdf"}, "cdu-bw-2021": {"name": "CDU Baden-Württemberg Regierungsprogramm 2021", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "BW", "pdf": "cdu-bw-2021.pdf"}, "afd-bw-2021": {"name": "AfD Baden-Württemberg Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "BW", "pdf": "afd-bw-2021.pdf"}, "spd-bw-2021": {"name": "SPD Baden-Württemberg Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "BW", "pdf": "spd-bw-2021.pdf"}, "fdp-bw-2021": {"name": "FDP Baden-Württemberg Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "FDP", "bundesland": "BW", "pdf": "fdp-bw-2021.pdf"}, # Rheinland-Pfalz — LTW 14.03.2021, WP 18 (Issue #42) "spd-rp-2021": {"name": "SPD Rheinland-Pfalz Regierungsprogramm 2021", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "RP", "pdf": "spd-rp-2021.pdf"}, "cdu-rp-2021": {"name": "CDU Rheinland-Pfalz Regierungsprogramm 2021", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "RP", "pdf": "cdu-rp-2021.pdf"}, "afd-rp-2021": {"name": "AfD Rheinland-Pfalz Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "RP", "pdf": "afd-rp-2021.pdf"}, "gruene-rp-2021": {"name": "Grüne Rheinland-Pfalz Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "GRÜNE", "bundesland": "RP", "pdf": "gruene-rp-2021.pdf"}, "fw-rp-2021": {"name": "FREIE WÄHLER Rheinland-Pfalz Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "FREIE WÄHLER", "bundesland": "RP", "pdf": "fw-rp-2021.pdf"}, "fdp-rp-2021": {"name": "FDP Rheinland-Pfalz Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "FDP", "bundesland": "RP", "pdf": "fdp-rp-2021.pdf"}, # Grundsatzprogramme (Bund) "spd-grundsatz": { "name": "SPD Grundsatzprogramm 2007", "typ": "parteiprogramm", "partei": "SPD", "pdf": "spd-grundsatzprogramm.pdf", }, "cdu-grundsatz": { "name": "CDU Grundsatzprogramm 2007", "typ": "parteiprogramm", "partei": "CDU", "pdf": "cdu-grundsatzprogramm.pdf", }, "gruene-grundsatz": { "name": "Grüne Grundsatzprogramm 2020", "typ": "parteiprogramm", "partei": "GRÜNE", "pdf": "gruene-grundsatzprogramm.pdf", }, "fdp-grundsatz": { "name": "FDP Grundsatzprogramm 2012", "typ": "parteiprogramm", "partei": "FDP", "pdf": "fdp-grundsatzprogramm.pdf", }, "afd-grundsatz": { "name": "AfD Grundsatzprogramm 2016", "typ": "parteiprogramm", "partei": "AfD", "pdf": "afd-grundsatzprogramm.pdf", }, "linke-grundsatz": { "name": "DIE LINKE Erfurter Programm 2011", "typ": "parteiprogramm", "partei": "LINKE", "pdf": "linke-grundsatzprogramm.pdf", }, } def init_embeddings_db(): """Initialize the embeddings database. Includes a forward-only migration step (Issue #5): adds the ``bundesland`` column if missing and backfills existing rows from the ``PROGRAMME`` registry. Grundsatzprogramme (federal level) keep ``bundesland = NULL``; the ``find_relevant_chunks`` query treats NULL as "matches any state". """ conn = sqlite3.connect(EMBEDDINGS_DB) conn.execute(""" CREATE TABLE IF NOT EXISTS chunks ( id INTEGER PRIMARY KEY, programm_id TEXT NOT NULL, partei TEXT NOT NULL, typ TEXT NOT NULL, seite INTEGER, text TEXT NOT NULL, embedding BLOB NOT NULL, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ) """) conn.execute("CREATE INDEX IF NOT EXISTS idx_chunks_partei ON chunks(partei)") conn.execute("CREATE INDEX IF NOT EXISTS idx_chunks_typ ON chunks(typ)") # Migration: bundesland-Spalte ergänzen, falls Tabelle aus Pre-#5-Zeit cols = {row[1] for row in conn.execute("PRAGMA table_info(chunks)").fetchall()} if "bundesland" not in cols: conn.execute("ALTER TABLE chunks ADD COLUMN bundesland TEXT") conn.execute("CREATE INDEX IF NOT EXISTS idx_chunks_bundesland ON chunks(bundesland)") # Backfill: Bundesland aus PROGRAMME-Registry für bestehende Zeilen # nachtragen. Grundsatzprogramme bleiben NULL. for prog_id, info in PROGRAMME.items(): bl = info.get("bundesland") if bl is not None: conn.execute( "UPDATE chunks SET bundesland = ? WHERE programm_id = ? AND bundesland IS NULL", (bl, prog_id), ) conn.commit() conn.close() def get_client() -> OpenAI: """Get DashScope client.""" return OpenAI( api_key=settings.dashscope_api_key, base_url=settings.dashscope_base_url, ) def create_embedding(text: str) -> list[float]: """Create embedding for text using Qwen.""" client = get_client() response = client.embeddings.create( model=EMBEDDING_MODEL, input=text, dimensions=EMBEDDING_DIMENSIONS, ) return response.data[0].embedding def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]: """Split text into overlapping chunks by words.""" words = text.split() chunks = [] i = 0 while i < len(words): chunk_words = words[i:i + chunk_size] chunk = " ".join(chunk_words) if chunk.strip(): chunks.append(chunk) i += chunk_size - overlap return chunks def extract_text_with_pages(pdf_path: Path) -> list[tuple[int, str]]: """Extract text from PDF with page numbers.""" doc = fitz.open(pdf_path) pages = [] for page_num in range(len(doc)): page = doc[page_num] text = page.get_text() if text.strip(): pages.append((page_num + 1, text)) doc.close() return pages def index_programm(programm_id: str, pdf_dir: Path) -> int: """Index a single program PDF into embeddings database.""" if programm_id not in PROGRAMME: raise ValueError(f"Unknown program: {programm_id}") info = PROGRAMME[programm_id] pdf_path = pdf_dir / info["pdf"] if not pdf_path.exists(): logger.warning("PDF not found: %s", pdf_path) return 0 conn = sqlite3.connect(EMBEDDINGS_DB) # Remove existing chunks for this program conn.execute("DELETE FROM chunks WHERE programm_id = ?", (programm_id,)) # Extract and chunk pages = extract_text_with_pages(pdf_path) total_chunks = 0 for page_num, page_text in pages: chunks = chunk_text(page_text, chunk_size=400, overlap=50) for chunk_text_content in chunks: if len(chunk_text_content.split()) < 20: # Skip tiny chunks continue try: embedding = create_embedding(chunk_text_content) embedding_blob = json.dumps(embedding).encode() conn.execute(""" INSERT INTO chunks (programm_id, partei, typ, seite, text, embedding, bundesland) VALUES (?, ?, ?, ?, ?, ?, ?) """, ( programm_id, info["partei"], info["typ"], page_num, chunk_text_content, embedding_blob, info.get("bundesland"), # NULL für Grundsatzprogramme )) total_chunks += 1 except Exception as e: logger.exception("Error embedding chunk") continue conn.commit() conn.close() logger.info("Indexed %d chunks from %s", total_chunks, programm_id) return total_chunks def cosine_similarity(a: list[float], b: list[float]) -> float: """Calculate cosine similarity between two vectors.""" dot = sum(x * y for x, y in zip(a, b)) norm_a = sum(x * x for x in a) ** 0.5 norm_b = sum(x * x for x in b) ** 0.5 if norm_a == 0 or norm_b == 0: return 0.0 return dot / (norm_a * norm_b) def find_relevant_chunks( query: str, parteien: list[str] = None, typ: str = None, bundesland: str = None, top_k: int = 3, min_similarity: float = 0.5, ) -> list[dict]: """Find most relevant chunks for a query. Args: bundesland: Wenn gesetzt, werden nur Chunks dieses Bundeslands ODER globale Chunks (bundesland IS NULL, z.B. Grundsatzprogramme) berücksichtigt. Wenn None, kein Filter. """ query_embedding = create_embedding(query) conn = sqlite3.connect(EMBEDDINGS_DB) conn.row_factory = sqlite3.Row # Build query sql = "SELECT * FROM chunks WHERE 1=1" params = [] if parteien: placeholders = ",".join("?" * len(parteien)) sql += f" AND partei IN ({placeholders})" params.extend(parteien) if typ: sql += " AND typ = ?" params.append(typ) if bundesland: # Bundesland-spezifische ODER globale Chunks (Grundsatzprogramme). sql += " AND (bundesland = ? OR bundesland IS NULL)" params.append(bundesland) rows = conn.execute(sql, params).fetchall() conn.close() # Calculate similarities results = [] for row in rows: chunk_embedding = json.loads(row["embedding"]) similarity = cosine_similarity(query_embedding, chunk_embedding) if similarity >= min_similarity: results.append({ "programm_id": row["programm_id"], "partei": row["partei"], "typ": row["typ"], "seite": row["seite"], "text": row["text"], "similarity": similarity, }) # Sort by similarity and return top_k results.sort(key=lambda x: x["similarity"], reverse=True) return results[:top_k] def get_relevant_quotes_for_antrag( antrag_text: str, fraktionen: list[str], bundesland: str, top_k_per_partei: int = 2, ) -> dict[str, list[dict]]: """Get relevant quotes from Wahl- and Parteiprogramme for an Antrag. Args: bundesland: Pflicht. Bestimmt, welche Wahlprogramme durchsucht werden und welche Regierungsfraktionen zusätzlich zu den Antragstellern einbezogen werden. """ # Lokaler Import vermeidet Zirkularität: bundeslaender.py importiert nichts # aus diesem Modul, aber der saubere Trennstrich bleibt erhalten. from .bundeslaender import BUNDESLAENDER if bundesland not in BUNDESLAENDER: raise ValueError(f"Unbekanntes Bundesland: {bundesland}") regierungsfraktionen = BUNDESLAENDER[bundesland].regierungsfraktionen parteien_to_search = list(dict.fromkeys(fraktionen + regierungsfraktionen)) # dedupe, Reihenfolge stabil results = {} from .parteien import normalize_partei for partei in parteien_to_search: # Kanonischer Lookup-Key über den zentralen Mapper (#55). Ersetzt # den alten Hack ``partei.upper() if partei != "GRÜNE" else "GRÜNE"``, # der nur die Schreibweisen-Drift in einer einzigen Partei # abgefangen hat. Wenn der Mapper nichts findet, fallen wir auf # den Originalstring zurück — die DB-Lookup-Schicht macht ohnehin # eigene Case-insensitive-Vergleiche. canonical = normalize_partei(partei, bundesland=bundesland) partei_lookup = canonical or partei # Wahlprogramm — bundesland-gefiltert wahl_chunks = find_relevant_chunks( antrag_text, parteien=[partei_lookup], typ="wahlprogramm", bundesland=bundesland, top_k=top_k_per_partei, min_similarity=0.45, ) # Parteiprogramm (Grundsatz, federal — bundesland=NULL matched implizit) partei_chunks = find_relevant_chunks( antrag_text, parteien=[partei_lookup], typ="parteiprogramm", bundesland=bundesland, top_k=top_k_per_partei, min_similarity=0.45, ) if wahl_chunks or partei_chunks: results[partei_lookup] = { "wahlprogramm": wahl_chunks, "parteiprogramm": partei_chunks, } return results def _chunk_source_label(chunk: dict) -> str: """Build a fully-qualified source label like 'FDP MV Wahlprogramm 2021, S. 73'. Without the programme name + Bundesland in the prompt, the LLM halluzinates familiar sources from its training (typically NRW 2022) even when the retrieved chunks all come from a different state. """ prog_id = chunk.get("programm_id", "") info = PROGRAMME.get(prog_id, {}) name = info.get("name") or prog_id seite = chunk.get("seite", "?") return f"{name}, S. {seite}" def _chunk_pdf_url(chunk: dict) -> Optional[str]: """Build the canonical PDF URL with page anchor for a chunk. Wenn der Chunk einen ``text`` enthält, wird stattdessen die Highlight-Endpoint-URL ``/api/wahlprogramm-cite?pid=…&seite=…&q=…`` emittiert (Issue #47). Der Endpoint rendert die Wahlprogramm-Seite mit gelb markiertem Zitat und liefert ein 1-Seiten-PDF. Klick im Report öffnet die Quelle direkt mit visuell hervorgehobener Stelle. Fallback: ohne text → statische ``/static/referenzen/#page=`` URL (rückwärts-kompatibel für Pre-#47 Assessments). """ prog_id = chunk.get("programm_id", "") info = PROGRAMME.get(prog_id) if not info: return None pdf = info.get("pdf") if not pdf: return None seite = chunk.get("seite") text = (chunk.get("text") or "").strip() if text and seite: # Highlight-Endpoint mit URL-encoded query. Den Text auf 200 Zeichen # abschneiden — search_for matched ohnehin nur Substring-Anker, und # die URL bleibt bounded (sonst würden 500-Zeichen-Snippets in jeder # Zitat-URL stehen und das HTML-Report-JSON aufblähen). q = urllib.parse.quote_plus(text[:200]) return f"/api/wahlprogramm-cite?pid={prog_id}&seite={seite}&q={q}" if seite: return f"/static/referenzen/{pdf}#page={seite}" return f"/static/referenzen/{pdf}" def render_highlighted_page(programm_id: str, seite: int, query: str) -> Optional[bytes]: """Render a single Wahlprogramm-page with yellow highlights for a query. Used by the ``/api/wahlprogramm-cite`` endpoint to serve a one-page PDF where the cited snippet is visually highlighted via PyMuPDF ``add_highlight_annot``. Returns the serialized PDF bytes, or None if the programme/page can't be resolved. Returns a tuple ``(pdf_bytes, found_page, highlighted)`` where ``found_page`` is the 1-indexed page number and ``highlighted`` is True if the text was found and annotated. Returns ``(None, 0, False)`` if the programme/page can't be resolved. Args: programm_id: Key into PROGRAMME registry — validated by caller. seite: 1-indexed page number within the programme PDF. query: Snippet text to search and highlight on the page. Long queries are truncated to the first 200 characters before the search; PyMuPDF's ``search_for`` falls over on huge needles anyway and a short anchor is what we want for the visual hit. """ info = PROGRAMME.get(programm_id) if not info: return None, 0, False pdf_filename = info.get("pdf") if not pdf_filename: return None, 0, False referenzen = Path(__file__).parent / "static" / "referenzen" pdf_path = referenzen / pdf_filename if not pdf_path.exists(): return None, 0, False needle = (query or "").strip()[:200] src = fitz.open(str(pdf_path)) try: if seite < 1 or seite > len(src): return None, 0, False # Suche den Needle auf der angegebenen Seite. Falls dort nichts # gefunden wird (Pre-#60-Assessments haben oft falsche Seiten- # nummern), durchsuchen wir ALLE Seiten und nehmen die erste # mit einem Treffer — so funktioniert Highlighting auch bei # halluzinierten Seitenzahlen retroaktiv. target_page_idx = seite - 1 rects = [] if needle: clean = needle.replace("\u00ad", "") words = clean.split() anchor = " ".join(words[:5]) if len(words) >= 5 else clean # Versuch 1: angegebene Seite, Volltext rects = src[target_page_idx].search_for(clean) # Versuch 2: angegebene Seite, 5-Wort-Anker if not rects: rects = src[target_page_idx].search_for(anchor) # Versuch 3: alle Seiten durchsuchen if not rects: for i in range(len(src)): rects = src[i].search_for(anchor) if rects: target_page_idx = i break # Volles PDF mit Highlight-Annotation. Der Browser öffnet das # vollständige Wahlprogramm; das Frontend hängt #page=N an die URL. page = src[target_page_idx] if needle and rects: for rect in rects: annot = page.add_highlight_annot(rect) if annot is not None: annot.set_colors(stroke=(1.0, 0.93, 0.0)) # gelb annot.update() highlighted = bool(needle and rects) return src.tobytes(), target_page_idx + 1, highlighted finally: src.close() # ───────────────────────────────────────────────────────────────────────────── # Citation post-processing — Issue #60 Option B # # Pre-#60 the LLM was free to fabricate `quelle`/`url` strings even when the # `text` was a real snippet from a retrieved chunk. The A+C fix made the # prompt more strict, but BB 8/673 (post-deploy) showed the LLM still # cross-mixed: it copied text from chunk Qn but wrote the page from chunk Qm # in the `quelle` field. # # The structural fix is to take quelle/url generation away from the LLM # entirely. After the LLM responds, we walk over every Zitat and try to # locate its `text` (substring or 5-word anchor) in any of the chunks the # LLM was actually shown. If we find a match, we *overwrite* quelle and url # with the canonical values from that chunk. If we don't find a match, the # Zitat is dropped — it cannot be backed by retrieved evidence. # ───────────────────────────────────────────────────────────────────────────── _RE_WHITESPACE = re.compile(r"\s+") _RE_HYPHEN_BREAK = re.compile(r"(\w)-\s+(\w)") _RE_TRUNCATION = re.compile(r"^\s*\.{2,}|\.{2,}\s*$") def _normalize_for_match(text: str) -> str: """Lowercase, collapse whitespace, bridge soft-hyphen line breaks. Mirrors the matcher used in tests/integration/test_citations_substring.py so that the analyzer's post-processing and Sub-D's verification stay in lockstep. """ s = (text or "").lower() s = _RE_TRUNCATION.sub("", s) s = s.replace("\u00ad", "") # soft hyphen s = _RE_WHITESPACE.sub(" ", s).strip() prev = None while prev != s: prev = s s = _RE_HYPHEN_BREAK.sub(r"\1\2", s) return s def find_chunk_for_text(text: str, chunks: list[dict]) -> Optional[dict]: """Locate the retrieved chunk that a Zitat snippet was copied from. Two-stage match identical to Sub-D: 1. **Strict substring** — full needle as substring of any chunk. 2. **5-word anchor** — any 5 consecutive words of the needle as substring of any chunk. Snippets shorter than 20 characters are rejected (too weak to bind). Returns the matching chunk dict, or None. """ needle = _normalize_for_match(text) if len(needle) < 20: return None chunks_norm = [(c, _normalize_for_match(c.get("text", ""))) for c in chunks] for c, norm in chunks_norm: if needle in norm: return c words = needle.split() if len(words) < 5: return None for i in range(len(words) - 4): anchor = " ".join(words[i:i + 5]) for c, norm in chunks_norm: if anchor in norm: return c return None def reconstruct_zitate(data: dict, semantic_quotes: dict) -> dict: """Replace LLM-emitted quelle/url with canonical chunk values; drop unbacked. Walks over ``data['wahlprogrammScores'][i][kind]['zitate']`` (the raw LLM-output dict, not the Pydantic model). For each Zitat: * Locate the chunk whose text contains the snippet (or a 5-word anchor from it). Search across **all** retrieved chunks regardless of party, so cross-mixes between Q-IDs become invisible to the persisted output. * If found: overwrite ``quelle`` and ``url`` with values derived from the matching chunk's ``programm_id`` + ``seite``. The LLM is no longer trusted for these fields. * If not found: drop the Zitat entirely. Returns the same ``data`` dict (mutated in place) for chaining. """ if not semantic_quotes: return data all_chunks: list[dict] = [] for d in semantic_quotes.values(): all_chunks.extend(d.get("wahlprogramm", [])) all_chunks.extend(d.get("parteiprogramm", [])) if not all_chunks: return data for fs in data.get("wahlprogrammScores", []) or []: for kind in ("wahlprogramm", "parteiprogramm"): blk = fs.get(kind) or {} zitate = blk.get("zitate") or [] cleaned = [] for z in zitate: text = z.get("text", "") matched = find_chunk_for_text(text, all_chunks) if matched is None: continue z["quelle"] = _chunk_source_label(matched) url = _chunk_pdf_url(matched) if url: z["url"] = url cleaned.append(z) blk["zitate"] = cleaned return data def format_quotes_for_prompt( quotes: dict, searched_parties: Optional[list[str]] = None, ) -> str: """Format quotes for inclusion in LLM prompt. Each chunk gets a stable ENUM-ID ([Q1], [Q2], …) and the prompt instructs the LLM to anchor every citation in one of those IDs and to copy the snippet **verbatim** from the cited chunk. This is the structural fix for Issue #60: pre-#60 the LLM was free to invent snippets under real source labels because nothing in the prompt bound a citation to a specific retrieved chunk. Each quote is annotated with the fully-qualified source (programme name + page) so the LLM cannot fall back on training-set defaults when constructing its citations. Issue #63 erweitert: wenn ``searched_parties`` übergeben wird, werden Parteien, für die **kein** Chunk retrievt wurde, im Prompt explizit als "keine Quellen im Index" markiert. Das LLM wird angewiesen, für diese Parteien ``score: null`` zu setzen statt aus dem Trainingswissen zu raten. """ if not quotes and not searched_parties: return "" lines = ["\n## Relevante Passagen aus Wahl- und Parteiprogrammen\n"] lines.append( "**ZITATEREGEL** — verbindlich für alle Zitate in `wahlprogramm`/" "`parteiprogramm`-Blöcken:\n" "1. Jedes Zitat MUSS auf genau einen der unten aufgelisteten " "Chunks verweisen (Format `[Q1]`, `[Q2]`, …).\n" "2. Der `text`-String MUSS eine **wörtliche, zusammenhängende** " "Passage von mindestens 5 Wörtern aus genau diesem Chunk sein — " "keine Paraphrasen, keine Zusammenfassungen, keine " "Cross-References aus dem Gedächtnis.\n" "3. Der `quelle`-String MUSS exakt das Source-Label des " "gewählten Chunks sein (Programm-Name + Seitenzahl, wie unten " "ausgeschrieben).\n" "4. Wenn kein Chunk wirklich passt: lass das Zitat-Array leer. " "Lieber 0 Zitate als ein erfundenes Zitat.\n" "5. **Wenn für eine Fraktion unten KEINE QUELLEN VORHANDEN " "steht**: setze `score: 0` für `wahlprogramm` UND " "`parteiprogramm` dieser Fraktion und schreibe in die " "`begründung`: 'Keine Quellen im Index — Bewertung nicht " "möglich.' Erfinde KEINEN Score aus dem Trainingswissen.\n" ) counter = 0 for partei, data in quotes.items(): lines.append(f"\n### {partei}\n") if data.get("wahlprogramm"): lines.append("**Wahlprogramm:**") for chunk in data["wahlprogramm"]: counter += 1 text = chunk["text"][:500] + "..." if len(chunk["text"]) > 500 else chunk["text"] lines.append(f'- [Q{counter}] {_chunk_source_label(chunk)}: "{text}"') if data.get("parteiprogramm"): lines.append("\n**Grundsatzprogramm:**") for chunk in data["parteiprogramm"]: counter += 1 text = chunk["text"][:500] + "..." if len(chunk["text"]) > 500 else chunk["text"] lines.append(f'- [Q{counter}] {_chunk_source_label(chunk)}: "{text}"') # Issue #63: Parteien ohne jegliche retrievte Chunks explizit markieren, # damit das LLM nicht aus Trainingswissen halluziniert. if searched_parties: parties_with_chunks = set(quotes.keys()) missing = [p for p in searched_parties if p not in parties_with_chunks] if missing: lines.append("\n### KEINE QUELLEN VORHANDEN\n") lines.append( "Für folgende Fraktionen sind weder Wahl- noch " "Grundsatzprogramm-Passagen im Index vorhanden. " "Bewerte sie mit `score: 0` und `zitate: []`:\n" ) for p in missing: lines.append(f"- **{p}**: KEINE QUELLEN — score 0, keine Zitate.") return "\n".join(lines) def get_programme_info() -> list[dict]: """Get list of all indexed programmes with metadata.""" info_list = [] for prog_id, info in PROGRAMME.items(): info_list.append({ "id": prog_id, "name": info["name"], "typ": info["typ"], "partei": info["partei"], "bundesland": info.get("bundesland"), "pdf": info["pdf"], "pdf_url": f"/static/referenzen/{info['pdf']}", }) return info_list def get_indexing_status() -> dict: """Get status of indexed programmes.""" if not EMBEDDINGS_DB.exists(): return {"indexed": 0, "programmes": []} conn = sqlite3.connect(EMBEDDINGS_DB) # Count chunks per program rows = conn.execute(""" SELECT programm_id, COUNT(*) as chunks FROM chunks GROUP BY programm_id """).fetchall() conn.close() indexed = {row[0]: row[1] for row in rows} programmes = [] for prog_id, info in PROGRAMME.items(): programmes.append({ "id": prog_id, "name": info["name"], "partei": info["partei"], "chunks": indexed.get(prog_id, 0), "indexed": prog_id in indexed, }) return { "indexed": len(indexed), "total": len(PROGRAMME), "programmes": programmes, }