Sub-D Live-Run gegen Prod-DB nach dem db3ada9-Deploy hat einen neuen Halluzinations-Case gezeigt, den A+C nicht gefangen hat: BB 8/673 BSW: text aus bsw-bb-2024 S.27 (verifiziert via Volltext-Suche im PDF), aber LLM hat im quelle-Feld "S. 4" angegeben — die Seite des Top-2-Chunks im selben Retrieval-Window. Klassischer Cross-Mix zwischen Q-IDs. Strukturelle Diagnose: Das [Qn]-Tag aus A ist nur ein weicher Anker im Prompt. Das LLM darf Text aus Chunk Qn kopieren und trotzdem die quelle aus Chunk Qm zusammenbauen. Die ZITATEREGEL kann das nicht verhindern, solange wir der LLM-Selbstauskunft vertrauen. Fix (Option B aus dem ursprünglichen Plan): `embeddings.reconstruct_zitate(data, semantic_quotes)` läuft im analyzer **nach** json.loads aber **vor** Pydantic-Validation: 1. Flachen die retrievten Chunks aller Parteien zu einer einzigen Liste. 2. Pro Zitat: text via Substring oder 5-Wort-Anker gegen alle Chunks matchen (Helpers `find_chunk_for_text` + `_normalize_for_match`, identische Logik wie Sub-D Test). 3. Match → quelle/url server-seitig durch _chunk_source_label und _chunk_pdf_url des matchenden Chunks ÜBERSCHREIBEN. 4. Kein Match → Zitat verworfen (statt mit erfundener quelle persistiert). Damit kann der LLM nur noch sauber zitieren oder gar nicht — es gibt keinen Pfad mehr zu "echter Text, falsche quelle". Tests: - TestReconstructZitate (5 cases): BB 8/673 Re-Mapping, Drop bei hallucinated, no-op bei leeren chunks, anchor-match-Fallback, short-needle und soft-hyphen Edge-Cases - 185/185 grün (179 + 6 neu) Refs: #60, #54 (Sub-D)
787 lines
30 KiB
Python
787 lines
30 KiB
Python
"""Semantic search for Wahlprogramme and Parteiprogramme using Qwen embeddings."""
|
|
|
|
import json
|
|
import re
|
|
import sqlite3
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import fitz # PyMuPDF
|
|
from openai import OpenAI
|
|
|
|
from .config import settings
|
|
|
|
# Embedding model configuration (DashScope / Qwen, OpenAI-compatible API).
EMBEDDING_MODEL = "text-embedding-v3"
EMBEDDING_DIMENSIONS = 1024


# SQLite database holding one row per embedded text chunk
# (created/migrated by init_embeddings_db below).
EMBEDDINGS_DB = settings.data_dir / "embeddings.db"
|
|
|
|
# Registry of every known programme PDF, keyed by a stable programme id.
# Entry fields: display name, typ ("wahlprogramm" | "parteiprogramm"),
# canonical party label, Bundesland code (absent for federal
# Grundsatzprogramme — lookups use .get()), and the PDF filename under
# static/referenzen/.
PROGRAMME = {
    # Wahlprogramme NRW 2022
    "spd-nrw-2022": {"name": "SPD NRW Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "NRW", "pdf": "spd-nrw-2022.pdf"},
    "cdu-nrw-2022": {"name": "CDU NRW Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "NRW", "pdf": "cdu-nrw-2022.pdf"},
    "gruene-nrw-2022": {"name": "Grüne NRW Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "GRÜNE", "bundesland": "NRW", "pdf": "gruene-nrw-2022.pdf"},
    "fdp-nrw-2022": {"name": "FDP NRW Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "FDP", "bundesland": "NRW", "pdf": "fdp-nrw-2022.pdf"},
    "afd-nrw-2022": {"name": "AfD NRW Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "NRW", "pdf": "afd-nrw-2022.pdf"},
    # Sachsen-Anhalt (LTW 2021)
    "cdu-lsa-2021": {"name": "CDU Sachsen-Anhalt Regierungsprogramm 2021", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "LSA", "pdf": "cdu-lsa-2021.pdf"},
    "spd-lsa-2021": {"name": "SPD Sachsen-Anhalt Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "LSA", "pdf": "spd-lsa-2021.pdf"},
    "gruene-lsa-2021": {"name": "Grüne Sachsen-Anhalt Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "GRÜNE", "bundesland": "LSA", "pdf": "gruene-lsa-2021.pdf"},
    "fdp-lsa-2021": {"name": "FDP Sachsen-Anhalt Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "FDP", "bundesland": "LSA", "pdf": "fdp-lsa-2021.pdf"},
    "afd-lsa-2021": {"name": "AfD Sachsen-Anhalt Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "LSA", "pdf": "afd-lsa-2021.pdf"},
    "linke-lsa-2021": {"name": "DIE LINKE Sachsen-Anhalt Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "LINKE", "bundesland": "LSA", "pdf": "linke-lsa-2021.pdf"},
    # Mecklenburg-Vorpommern (LTW 26.09.2021, WP 8) — Issue #4
    "cdu-mv-2021": {"name": "CDU Mecklenburg-Vorpommern Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "MV", "pdf": "cdu-mv-2021.pdf"},
    "spd-mv-2021": {"name": "SPD Mecklenburg-Vorpommern Regierungsprogramm 2021", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "MV", "pdf": "spd-mv-2021.pdf"},
    "gruene-mv-2021": {"name": "Grüne Mecklenburg-Vorpommern Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "GRÜNE", "bundesland": "MV", "pdf": "gruene-mv-2021.pdf"},
    "fdp-mv-2021": {"name": "FDP Mecklenburg-Vorpommern Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "FDP", "bundesland": "MV", "pdf": "fdp-mv-2021.pdf"},
    "afd-mv-2021": {"name": "AfD Mecklenburg-Vorpommern Landeswahlprogramm 2021", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "MV", "pdf": "afd-mv-2021.pdf"},
    "linke-mv-2021": {"name": "DIE LINKE Mecklenburg-Vorpommern Zukunftsprogramm 2021", "typ": "wahlprogramm", "partei": "LINKE", "bundesland": "MV", "pdf": "linke-mv-2021.pdf"},
    # Berlin (AGH election 26.09.2021, repeat election 12.02.2023, WP 19) —
    # Issue #10. The programmes stem from the 2021 campaign — the 2023
    # repeat election reused the same programmes.
    "cdu-be-2023": {"name": "CDU Berlin Berlin-Plan 2021", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "BE", "pdf": "cdu-be-2023.pdf"},
    "spd-be-2023": {"name": "SPD Berlin Wahlprogramm AGH 2021", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "BE", "pdf": "spd-be-2023.pdf"},
    "gruene-be-2023": {"name": "Grüne Berlin Landeswahlprogramm 2021", "typ": "wahlprogramm", "partei": "GRÜNE", "bundesland": "BE", "pdf": "gruene-be-2023.pdf"},
    "linke-be-2023": {"name": "DIE LINKE Berlin Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "LINKE", "bundesland": "BE", "pdf": "linke-be-2023.pdf"},
    "afd-be-2023": {"name": "AfD Berlin Wahlprogramm AGH 2021", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "BE", "pdf": "afd-be-2023.pdf"},
    # Thüringen — LTW 01.09.2024, WP 8 (Issue #37)
    "cdu-th-2024": {"name": "CDU Thüringen Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "TH", "pdf": "cdu-th-2024.pdf"},
    "afd-th-2024": {"name": "AfD Thüringen Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "TH", "pdf": "afd-th-2024.pdf"},
    "linke-th-2024": {"name": "DIE LINKE Thüringen Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "LINKE", "bundesland": "TH", "pdf": "linke-th-2024.pdf"},
    "bsw-th-2024": {"name": "BSW Thüringen Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "BSW", "bundesland": "TH", "pdf": "bsw-th-2024.pdf"},
    "spd-th-2024": {"name": "SPD Thüringen Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "TH", "pdf": "spd-th-2024.pdf"},
    # Brandenburg — LTW 22.09.2024, WP 8 (Issue #39)
    "spd-bb-2024": {"name": "SPD Brandenburg Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "BB", "pdf": "spd-bb-2024.pdf"},
    "afd-bb-2024": {"name": "AfD Brandenburg Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "BB", "pdf": "afd-bb-2024.pdf"},
    "cdu-bb-2024": {"name": "CDU Brandenburg Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "BB", "pdf": "cdu-bb-2024.pdf"},
    "bsw-bb-2024": {"name": "BSW Brandenburg Wahlprogramm 2024", "typ": "wahlprogramm", "partei": "BSW", "bundesland": "BB", "pdf": "bsw-bb-2024.pdf"},
    # Hamburg — Bürgerschaftswahl 02.03.2025, WP 23 (Issue #40)
    "spd-hh-2025": {"name": "SPD Hamburg Wahlprogramm 2025", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "HH", "pdf": "spd-hh-2025.pdf"},
    "cdu-hh-2025": {"name": "CDU Hamburg Wahlprogramm 2025", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "HH", "pdf": "cdu-hh-2025.pdf"},
    "gruene-hh-2025": {"name": "Grüne Hamburg Regierungsprogramm 2025", "typ": "wahlprogramm", "partei": "GRÜNE", "bundesland": "HH", "pdf": "gruene-hh-2025.pdf"},
    "linke-hh-2025": {"name": "DIE LINKE Hamburg Wahlprogramm 2025", "typ": "wahlprogramm", "partei": "LINKE", "bundesland": "HH", "pdf": "linke-hh-2025.pdf"},
    "afd-hh-2025": {"name": "AfD Hamburg Wahlprogramm 2025", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "HH", "pdf": "afd-hh-2025.pdf"},
    # Schleswig-Holstein — LTW 08.05.2022, WP 20 (Issue #32)
    "cdu-sh-2022": {"name": "CDU Schleswig-Holstein Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "SH", "pdf": "cdu-sh-2022.pdf"},
    "spd-sh-2022": {"name": "SPD Schleswig-Holstein Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "SH", "pdf": "spd-sh-2022.pdf"},
    "gruene-sh-2022": {"name": "Grüne Schleswig-Holstein Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "GRÜNE", "bundesland": "SH", "pdf": "gruene-sh-2022.pdf"},
    "fdp-sh-2022": {"name": "FDP Schleswig-Holstein Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "FDP", "bundesland": "SH", "pdf": "fdp-sh-2022.pdf"},
    "ssw-sh-2022": {"name": "SSW Schleswig-Holstein Wahlprogramm 2022", "typ": "wahlprogramm", "partei": "SSW", "bundesland": "SH", "pdf": "ssw-sh-2022.pdf"},
    # Baden-Württemberg — LTW 14.03.2021, WP 17 (Issue #41)
    "gruene-bw-2021": {"name": "Grüne Baden-Württemberg Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "GRÜNE", "bundesland": "BW", "pdf": "gruene-bw-2021.pdf"},
    "cdu-bw-2021": {"name": "CDU Baden-Württemberg Regierungsprogramm 2021", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "BW", "pdf": "cdu-bw-2021.pdf"},
    "afd-bw-2021": {"name": "AfD Baden-Württemberg Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "BW", "pdf": "afd-bw-2021.pdf"},
    "spd-bw-2021": {"name": "SPD Baden-Württemberg Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "BW", "pdf": "spd-bw-2021.pdf"},
    "fdp-bw-2021": {"name": "FDP Baden-Württemberg Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "FDP", "bundesland": "BW", "pdf": "fdp-bw-2021.pdf"},
    # Rheinland-Pfalz — LTW 14.03.2021, WP 18 (Issue #42)
    "spd-rp-2021": {"name": "SPD Rheinland-Pfalz Regierungsprogramm 2021", "typ": "wahlprogramm", "partei": "SPD", "bundesland": "RP", "pdf": "spd-rp-2021.pdf"},
    "cdu-rp-2021": {"name": "CDU Rheinland-Pfalz Regierungsprogramm 2021", "typ": "wahlprogramm", "partei": "CDU", "bundesland": "RP", "pdf": "cdu-rp-2021.pdf"},
    "afd-rp-2021": {"name": "AfD Rheinland-Pfalz Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "AfD", "bundesland": "RP", "pdf": "afd-rp-2021.pdf"},
    "gruene-rp-2021": {"name": "Grüne Rheinland-Pfalz Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "GRÜNE", "bundesland": "RP", "pdf": "gruene-rp-2021.pdf"},
    "fw-rp-2021": {"name": "FREIE WÄHLER Rheinland-Pfalz Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "FREIE WÄHLER", "bundesland": "RP", "pdf": "fw-rp-2021.pdf"},
    "fdp-rp-2021": {"name": "FDP Rheinland-Pfalz Wahlprogramm 2021", "typ": "wahlprogramm", "partei": "FDP", "bundesland": "RP", "pdf": "fdp-rp-2021.pdf"},
    # Grundsatzprogramme (federal level — no "bundesland" key on purpose)
    "spd-grundsatz": {"name": "SPD Grundsatzprogramm 2007", "typ": "parteiprogramm", "partei": "SPD", "pdf": "spd-grundsatzprogramm.pdf"},
    "cdu-grundsatz": {"name": "CDU Grundsatzprogramm 2007", "typ": "parteiprogramm", "partei": "CDU", "pdf": "cdu-grundsatzprogramm.pdf"},
    "gruene-grundsatz": {"name": "Grüne Grundsatzprogramm 2020", "typ": "parteiprogramm", "partei": "GRÜNE", "pdf": "gruene-grundsatzprogramm.pdf"},
    "fdp-grundsatz": {"name": "FDP Grundsatzprogramm 2012", "typ": "parteiprogramm", "partei": "FDP", "pdf": "fdp-grundsatzprogramm.pdf"},
}
|
|
|
|
|
|
def init_embeddings_db():
    """Initialize the embeddings database.

    Creates the ``chunks`` table and its indexes if they do not exist.
    Includes a forward-only migration step (Issue #5): adds the
    ``bundesland`` column if missing and backfills existing rows from the
    ``PROGRAMME`` registry. Grundsatzprogramme (federal level) keep
    ``bundesland = NULL``; the ``find_relevant_chunks`` query treats NULL
    as "matches any state".
    """
    conn = sqlite3.connect(EMBEDDINGS_DB)
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS chunks (
                id INTEGER PRIMARY KEY,
                programm_id TEXT NOT NULL,
                partei TEXT NOT NULL,
                typ TEXT NOT NULL,
                seite INTEGER,
                text TEXT NOT NULL,
                embedding BLOB NOT NULL,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        conn.execute("CREATE INDEX IF NOT EXISTS idx_chunks_partei ON chunks(partei)")
        conn.execute("CREATE INDEX IF NOT EXISTS idx_chunks_typ ON chunks(typ)")

        # Migration: add the bundesland column if the table predates Issue #5.
        cols = {row[1] for row in conn.execute("PRAGMA table_info(chunks)").fetchall()}
        if "bundesland" not in cols:
            conn.execute("ALTER TABLE chunks ADD COLUMN bundesland TEXT")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_chunks_bundesland ON chunks(bundesland)")

            # Backfill bundesland for pre-existing rows from the PROGRAMME
            # registry; Grundsatzprogramme stay NULL. The "bundesland IS NULL"
            # guard makes the UPDATE idempotent.
            for prog_id, info in PROGRAMME.items():
                bl = info.get("bundesland")
                if bl is not None:
                    conn.execute(
                        "UPDATE chunks SET bundesland = ? WHERE programm_id = ? AND bundesland IS NULL",
                        (bl, prog_id),
                    )

        conn.commit()
    finally:
        # Fix: previously the connection leaked when any statement raised.
        conn.close()
|
|
|
|
|
|
def get_client() -> OpenAI:
    """Return an OpenAI-compatible client pointed at the DashScope endpoint."""
    return OpenAI(
        api_key=settings.dashscope_api_key,
        base_url=settings.dashscope_base_url,
    )
|
|
|
|
|
|
def create_embedding(text: str) -> list[float]:
    """Embed *text* with the configured Qwen model and return the vector."""
    response = get_client().embeddings.create(
        model=EMBEDDING_MODEL,
        input=text,
        dimensions=EMBEDDING_DIMENSIONS,
    )
    # The API returns one embedding object per input; we sent exactly one.
    return response.data[0].embedding
|
|
|
|
|
|
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: Input text; split on any whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks.
            Must be smaller than ``chunk_size``.

    Returns:
        Chunk strings (words re-joined with single spaces). Empty input
        yields an empty list; the final chunk may be shorter.

    Raises:
        ValueError: If ``chunk_size <= 0`` or ``overlap >= chunk_size``.
            Previously such parameters made the window advance by zero or
            a negative amount — a silent infinite loop.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})")

    words = text.split()
    step = chunk_size - overlap  # guaranteed positive by the checks above
    chunks = []
    for start in range(0, len(words), step):
        chunk = " ".join(words[start:start + chunk_size])
        if chunk.strip():
            chunks.append(chunk)
    return chunks
|
|
|
|
|
|
def extract_text_with_pages(pdf_path: Path) -> list[tuple[int, str]]:
    """Return (1-based page number, page text) pairs for all non-empty pages."""
    document = fitz.open(pdf_path)
    pages: list[tuple[int, str]] = []
    for index in range(len(document)):
        content = document[index].get_text()
        # Pages that are empty after stripping (images only, blanks) are skipped.
        if content.strip():
            pages.append((index + 1, content))
    document.close()
    return pages
|
|
|
|
|
|
def index_programm(programm_id: str, pdf_dir: Path) -> int:
    """(Re-)index a single programme PDF into the embeddings database.

    Existing chunks for the programme are deleted first, so re-running is
    idempotent. Pages are chunked into ~400-word windows with 50-word
    overlap; chunks under 20 words are skipped as noise.

    Args:
        programm_id: Key into the ``PROGRAMME`` registry.
        pdf_dir: Directory containing the registered PDF file.

    Returns:
        Number of chunks successfully embedded and stored (0 if the PDF
        file is missing).

    Raises:
        ValueError: If ``programm_id`` is not registered in ``PROGRAMME``.
    """
    if programm_id not in PROGRAMME:
        raise ValueError(f"Unknown program: {programm_id}")

    info = PROGRAMME[programm_id]
    pdf_path = pdf_dir / info["pdf"]

    if not pdf_path.exists():
        print(f"PDF not found: {pdf_path}")
        return 0

    conn = sqlite3.connect(EMBEDDINGS_DB)
    try:
        # Remove existing chunks so re-indexing never duplicates rows.
        conn.execute("DELETE FROM chunks WHERE programm_id = ?", (programm_id,))

        # Extract and chunk
        pages = extract_text_with_pages(pdf_path)
        total_chunks = 0

        for page_num, page_text in pages:
            chunks = chunk_text(page_text, chunk_size=400, overlap=50)

            for chunk_text_content in chunks:
                if len(chunk_text_content.split()) < 20:  # Skip tiny chunks
                    continue

                try:
                    embedding = create_embedding(chunk_text_content)
                    embedding_blob = json.dumps(embedding).encode()

                    conn.execute("""
                        INSERT INTO chunks (programm_id, partei, typ, seite, text, embedding, bundesland)
                        VALUES (?, ?, ?, ?, ?, ?, ?)
                    """, (
                        programm_id,
                        info["partei"],
                        info["typ"],
                        page_num,
                        chunk_text_content,
                        embedding_blob,
                        info.get("bundesland"),  # NULL for Grundsatzprogramme
                    ))
                    total_chunks += 1
                except Exception as e:
                    # Deliberate best-effort: a single failed embedding call
                    # must not abort the whole indexing run.
                    print(f"Error embedding chunk: {e}")
                    continue

        conn.commit()
    finally:
        # Fix: previously the connection leaked if extraction or SQL raised
        # before the final close().
        conn.close()

    print(f"Indexed {total_chunks} chunks from {programm_id}")
    return total_chunks
|
|
|
|
|
|
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Cosine similarity of two vectors; 0.0 if either has zero magnitude."""
    dot_product = 0.0
    for x, y in zip(a, b):
        dot_product += x * y

    sq_a = 0.0
    for x in a:
        sq_a += x * x
    sq_b = 0.0
    for y in b:
        sq_b += y * y

    # A zero vector has no direction — define similarity as 0 rather than
    # dividing by zero.
    if sq_a == 0 or sq_b == 0:
        return 0.0
    return dot_product / ((sq_a ** 0.5) * (sq_b ** 0.5))
|
|
|
|
|
|
def find_relevant_chunks(
    query: str,
    parteien: Optional[list[str]] = None,
    typ: Optional[str] = None,
    bundesland: Optional[str] = None,
    top_k: int = 3,
    min_similarity: float = 0.5,
) -> list[dict]:
    """Find the ``top_k`` chunks most similar to *query*.

    Fixes the implicit-Optional annotations (``list[str] = None`` etc. —
    invalid per PEP 484) and closes the DB connection even when the query
    raises.

    Args:
        query: Free-text query; embedded once and compared against all
            candidate chunks.
        parteien: Optional party filter (exact match on stored labels).
        typ: Optional filter: "wahlprogramm" or "parteiprogramm".
        bundesland: If set, only chunks of that state OR global chunks
            (bundesland IS NULL, e.g. Grundsatzprogramme) are considered.
            If None, no filter.
        top_k: Maximum number of results returned.
        min_similarity: Chunks below this cosine similarity are discarded.

    Returns:
        Chunk dicts (programm_id, partei, typ, seite, text, similarity)
        sorted by descending similarity.
    """
    query_embedding = create_embedding(query)

    conn = sqlite3.connect(EMBEDDINGS_DB)
    try:
        conn.row_factory = sqlite3.Row

        # Build the filter dynamically; values always go through SQL
        # placeholders, never string interpolation.
        sql = "SELECT * FROM chunks WHERE 1=1"
        params: list = []

        if parteien:
            placeholders = ",".join("?" * len(parteien))
            sql += f" AND partei IN ({placeholders})"
            params.extend(parteien)

        if typ:
            sql += " AND typ = ?"
            params.append(typ)

        if bundesland:
            # State-specific OR global chunks (Grundsatzprogramme).
            sql += " AND (bundesland = ? OR bundesland IS NULL)"
            params.append(bundesland)

        rows = conn.execute(sql, params).fetchall()
    finally:
        # Fix: previously the connection leaked if the query raised.
        conn.close()

    # Score every candidate in Python; the chunk count per state/party is
    # small enough that brute-force cosine similarity is fine.
    results = []
    for row in rows:
        chunk_embedding = json.loads(row["embedding"])
        similarity = cosine_similarity(query_embedding, chunk_embedding)

        if similarity >= min_similarity:
            results.append({
                "programm_id": row["programm_id"],
                "partei": row["partei"],
                "typ": row["typ"],
                "seite": row["seite"],
                "text": row["text"],
                "similarity": similarity,
            })

    # Sort by similarity and return top_k
    results.sort(key=lambda x: x["similarity"], reverse=True)
    return results[:top_k]
|
|
|
|
|
|
def get_relevant_quotes_for_antrag(
    antrag_text: str,
    fraktionen: list[str],
    bundesland: str,
    top_k_per_partei: int = 2,
) -> dict[str, dict[str, list[dict]]]:
    """Get relevant quotes from Wahl- und Parteiprogramme for an Antrag.

    Args:
        antrag_text: Full Antrag text used as the semantic query.
        fraktionen: Parties that tabled the Antrag.
        bundesland: Required. Determines which Wahlprogramme are searched
            and which governing fractions are included in addition to the
            movers.
        top_k_per_partei: Max chunks per party and programme type.

    Returns:
        Mapping of party label to
        ``{"wahlprogramm": [...], "parteiprogramm": [...]}``; parties with
        no hits in either programme type are omitted. (Fixed return
        annotation — the old ``dict[str, list[dict]]`` did not match this
        actual shape.)

    Raises:
        ValueError: If ``bundesland`` is not a known state.
    """
    # Local imports avoid circularity: bundeslaender.py imports nothing
    # from this module, but the clean separation is preserved.
    from .bundeslaender import BUNDESLAENDER
    from .parteien import normalize_partei

    if bundesland not in BUNDESLAENDER:
        raise ValueError(f"Unbekanntes Bundesland: {bundesland}")

    regierungsfraktionen = BUNDESLAENDER[bundesland].regierungsfraktionen
    # Dedupe while keeping a stable order: movers first, then government.
    parteien_to_search = list(dict.fromkeys(fraktionen + regierungsfraktionen))

    results: dict[str, dict[str, list[dict]]] = {}

    for partei in parteien_to_search:
        # Canonical lookup key via the central mapper (#55). Replaces the
        # old hack ``partei.upper() if partei != "GRÜNE" else "GRÜNE"``,
        # which only caught spelling drift for a single party. If the
        # mapper finds nothing we fall back to the original string — the
        # DB lookup layer does its own case-insensitive comparisons anyway.
        canonical = normalize_partei(partei, bundesland=bundesland)
        partei_lookup = canonical or partei

        # Wahlprogramm — filtered by state.
        wahl_chunks = find_relevant_chunks(
            antrag_text,
            parteien=[partei_lookup],
            typ="wahlprogramm",
            bundesland=bundesland,
            top_k=top_k_per_partei,
            min_similarity=0.45,
        )

        # Parteiprogramm (Grundsatz, federal — bundesland=NULL matches implicitly).
        partei_chunks = find_relevant_chunks(
            antrag_text,
            parteien=[partei_lookup],
            typ="parteiprogramm",
            bundesland=bundesland,
            top_k=top_k_per_partei,
            min_similarity=0.45,
        )

        if wahl_chunks or partei_chunks:
            results[partei_lookup] = {
                "wahlprogramm": wahl_chunks,
                "parteiprogramm": partei_chunks,
            }

    return results
|
|
|
|
|
|
def _chunk_source_label(chunk: dict) -> str:
    """Build a fully-qualified source label like 'FDP MV Wahlprogramm 2021, S. 73'.

    Without the programme name + Bundesland in the prompt, the LLM
    hallucinates familiar sources from its training (typically NRW 2022)
    even when the retrieved chunks all come from a different state.
    """
    programm_id = chunk.get("programm_id", "")
    registry_entry = PROGRAMME.get(programm_id, {})
    # Fall back to the raw id for programmes missing from the registry.
    display_name = registry_entry.get("name") or programm_id
    page = chunk.get("seite", "?")
    return f"{display_name}, S. {page}"
|
|
|
|
|
|
def _chunk_pdf_url(chunk: dict) -> Optional[str]:
    """Build the canonical PDF URL (with page anchor, when known) for a chunk."""
    registry_entry = PROGRAMME.get(chunk.get("programm_id", ""))
    if not registry_entry:
        return None
    pdf_name = registry_entry.get("pdf")
    if not pdf_name:
        return None
    base_url = f"/static/referenzen/{pdf_name}"
    page = chunk.get("seite")
    return f"{base_url}#page={page}" if page else base_url
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# Citation post-processing — Issue #60 Option B
|
|
#
|
|
# Pre-#60 the LLM was free to fabricate `quelle`/`url` strings even when the
|
|
# `text` was a real snippet from a retrieved chunk. The A+C fix made the
|
|
# prompt more strict, but BB 8/673 (post-deploy) showed the LLM still
|
|
# cross-mixed: it copied text from chunk Qn but wrote the page from chunk Qm
|
|
# in the `quelle` field.
|
|
#
|
|
# The structural fix is to take quelle/url generation away from the LLM
|
|
# entirely. After the LLM responds, we walk over every Zitat and try to
|
|
# locate its `text` (substring or 5-word anchor) in any of the chunks the
|
|
# LLM was actually shown. If we find a match, we *overwrite* quelle and url
|
|
# with the canonical values from that chunk. If we don't find a match, the
|
|
# Zitat is dropped — it cannot be backed by retrieved evidence.
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
_RE_WHITESPACE = re.compile(r"\s+")
|
|
_RE_HYPHEN_BREAK = re.compile(r"(\w)-\s+(\w)")
|
|
_RE_TRUNCATION = re.compile(r"^\s*\.{2,}|\.{2,}\s*$")
|
|
|
|
|
|
def _normalize_for_match(text: str) -> str:
|
|
"""Lowercase, collapse whitespace, bridge soft-hyphen line breaks.
|
|
|
|
Mirrors the matcher used in tests/integration/test_citations_substring.py
|
|
so that the analyzer's post-processing and Sub-D's verification stay in
|
|
lockstep.
|
|
"""
|
|
s = (text or "").lower()
|
|
s = _RE_TRUNCATION.sub("", s)
|
|
s = s.replace("\u00ad", "") # soft hyphen
|
|
s = _RE_WHITESPACE.sub(" ", s).strip()
|
|
prev = None
|
|
while prev != s:
|
|
prev = s
|
|
s = _RE_HYPHEN_BREAK.sub(r"\1\2", s)
|
|
return s
|
|
|
|
|
|
def find_chunk_for_text(text: str, chunks: list[dict]) -> Optional[dict]:
|
|
"""Locate the retrieved chunk that a Zitat snippet was copied from.
|
|
|
|
Two-stage match identical to Sub-D:
|
|
1. **Strict substring** — full needle as substring of any chunk.
|
|
2. **5-word anchor** — any 5 consecutive words of the needle as
|
|
substring of any chunk.
|
|
|
|
Snippets shorter than 20 characters are rejected (too weak to bind).
|
|
Returns the matching chunk dict, or None.
|
|
"""
|
|
needle = _normalize_for_match(text)
|
|
if len(needle) < 20:
|
|
return None
|
|
chunks_norm = [(c, _normalize_for_match(c.get("text", ""))) for c in chunks]
|
|
for c, norm in chunks_norm:
|
|
if needle in norm:
|
|
return c
|
|
words = needle.split()
|
|
if len(words) < 5:
|
|
return None
|
|
for i in range(len(words) - 4):
|
|
anchor = " ".join(words[i:i + 5])
|
|
for c, norm in chunks_norm:
|
|
if anchor in norm:
|
|
return c
|
|
return None
|
|
|
|
|
|
def reconstruct_zitate(data: dict, semantic_quotes: dict) -> dict:
    """Replace LLM-emitted quelle/url with canonical chunk values; drop unbacked.

    Walks over ``data['wahlprogrammScores'][i][kind]['zitate']`` (the raw
    LLM-output dict, not the Pydantic model). For each Zitat:

    * Locate the chunk whose text contains the snippet (or a 5-word anchor
      from it). Search across **all** retrieved chunks regardless of party,
      so cross-mixes between Q-IDs become invisible to the persisted output.
    * If found: overwrite ``quelle`` and ``url`` with values derived from
      the matching chunk's ``programm_id`` + ``seite``. The LLM is no longer
      trusted for these fields.
    * If not found: drop the Zitat entirely.

    Returns the same ``data`` dict (mutated in place) for chaining.
    """
    if not semantic_quotes:
        return data

    # Flatten the retrieved chunks of all parties into one search pool.
    all_chunks: list[dict] = []
    for per_party in semantic_quotes.values():
        for kind in ("wahlprogramm", "parteiprogramm"):
            all_chunks.extend(per_party.get(kind, []))
    if not all_chunks:
        return data

    for score_block in data.get("wahlprogrammScores", []) or []:
        for kind in ("wahlprogramm", "parteiprogramm"):
            section = score_block.get(kind) or {}
            kept: list[dict] = []
            for zitat in section.get("zitate") or []:
                backing = find_chunk_for_text(zitat.get("text", ""), all_chunks)
                if backing is None:
                    # No retrieved chunk backs this snippet — drop it.
                    continue
                # Server-side canonical source; the LLM's own values are ignored.
                zitat["quelle"] = _chunk_source_label(backing)
                canonical_url = _chunk_pdf_url(backing)
                if canonical_url:
                    zitat["url"] = canonical_url
                kept.append(zitat)
            section["zitate"] = kept
    return data
|
|
|
|
|
|
def format_quotes_for_prompt(quotes: dict) -> str:
    """Format quotes for inclusion in LLM prompt.

    Each chunk gets a stable ENUM-ID ([Q1], [Q2], …) and the prompt
    instructs the LLM to anchor every citation in one of those IDs and
    to copy the snippet **verbatim** from the cited chunk. This is the
    structural fix for Issue #60: pre-#60 the LLM was free to invent
    snippets under real source labels because nothing in the prompt
    bound a citation to a specific retrieved chunk.

    Each quote is annotated with the fully-qualified source (programme
    name + page) so the LLM cannot fall back on training-set defaults
    when constructing its citations.
    """
    # Empty retrieval → contribute nothing to the prompt.
    if not quotes:
        return ""

    lines = ["\n## Relevante Passagen aus Wahl- und Parteiprogrammen\n"]
    # Binding citation rules (German, matching the rest of the prompt).
    lines.append(
        "**ZITATEREGEL** — verbindlich für alle Zitate in `wahlprogramm`/"
        "`parteiprogramm`-Blöcken:\n"
        "1. Jedes Zitat MUSS auf genau einen der unten aufgelisteten "
        "Chunks verweisen (Format `[Q1]`, `[Q2]`, …).\n"
        "2. Der `text`-String MUSS eine **wörtliche, zusammenhängende** "
        "Passage von mindestens 5 Wörtern aus genau diesem Chunk sein — "
        "keine Paraphrasen, keine Zusammenfassungen, keine "
        "Cross-References aus dem Gedächtnis.\n"
        "3. Der `quelle`-String MUSS exakt das Source-Label des "
        "gewählten Chunks sein (Programm-Name + Seitenzahl, wie unten "
        "ausgeschrieben).\n"
        "4. Wenn kein Chunk wirklich passt: lass das Zitat-Array leer. "
        "Lieber 0 Zitate als ein erfundenes Zitat.\n"
    )

    # Q-IDs are global across all parties and both programme kinds, so a
    # citation can never be ambiguous between two chunks.
    counter = 0
    for partei, data in quotes.items():
        lines.append(f"\n### {partei}\n")

        if data.get("wahlprogramm"):
            lines.append("**Wahlprogramm:**")
            for chunk in data["wahlprogramm"]:
                counter += 1
                # Cap each quoted chunk at 500 chars to bound prompt size.
                text = chunk["text"][:500] + "..." if len(chunk["text"]) > 500 else chunk["text"]
                lines.append(f'- [Q{counter}] {_chunk_source_label(chunk)}: "{text}"')

        if data.get("parteiprogramm"):
            lines.append("\n**Grundsatzprogramm:**")
            for chunk in data["parteiprogramm"]:
                counter += 1
                # Same 500-char cap as above.
                text = chunk["text"][:500] + "..." if len(chunk["text"]) > 500 else chunk["text"]
                lines.append(f'- [Q{counter}] {_chunk_source_label(chunk)}: "{text}"')

    return "\n".join(lines)
|
|
|
|
|
|
def get_programme_info() -> list[dict]:
    """Return metadata for every registered programme, including its PDF URL."""
    return [
        {
            "id": prog_id,
            "name": entry["name"],
            "typ": entry["typ"],
            "partei": entry["partei"],
            "bundesland": entry.get("bundesland"),
            "pdf": entry["pdf"],
            "pdf_url": f"/static/referenzen/{entry['pdf']}",
        }
        for prog_id, entry in PROGRAMME.items()
    ]
|
|
|
|
|
|
def get_indexing_status() -> dict:
    """Report how many registered programmes have chunks in the database.

    Returns:
        Dict with ``indexed`` (number of programmes that have chunks),
        ``total`` (registry size) and ``programmes`` (per-programme
        id/name/partei/chunks/indexed). Fix: the early-return branch for a
        missing DB file now also carries ``total``, so callers always see
        the same shape.
    """
    if not EMBEDDINGS_DB.exists():
        # Same shape as the normal return (previously "total" was missing here).
        return {"indexed": 0, "total": len(PROGRAMME), "programmes": []}

    conn = sqlite3.connect(EMBEDDINGS_DB)
    try:
        rows = conn.execute("""
            SELECT programm_id, COUNT(*) as chunks
            FROM chunks
            GROUP BY programm_id
        """).fetchall()
    finally:
        # Fix: close the connection even when the query raises.
        conn.close()

    chunk_counts = {programm_id: count for programm_id, count in rows}

    programmes = [
        {
            "id": prog_id,
            "name": entry["name"],
            "partei": entry["partei"],
            "chunks": chunk_counts.get(prog_id, 0),
            "indexed": prog_id in chunk_counts,
        }
        for prog_id, entry in PROGRAMME.items()
    ]

    return {
        "indexed": len(chunk_counts),
        "total": len(PROGRAMME),
        "programmes": programmes,
    }
|