From 1b5fd96e1649997f47fbc4bcf94506499d96bf51 Mon Sep 17 00:00:00 2001
From: Dotty Dotter
Date: Wed, 8 Apr 2026 11:24:31 +0200
Subject: [PATCH] Embeddings prompt: include programme name in chunk citations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

format_quotes_for_prompt previously rendered each retrieved chunk as just
"S. X: text", giving the LLM no way to know which Bundesland or
Wahlprogramm the passage came from. Result: even when the embedding search
correctly returned MV-only chunks, the LLM hallucinated familiar source
labels from its training set (typically "FDP NRW Wahlprogramm 2022, S. 75")
because that was its strongest prior for budget/transparency policy
citations.

Fix: prepend the fully-qualified PROGRAMME[programm_id]["name"] to each
quote and explicitly instruct the model to use these labels verbatim.

Discovered while smoke-testing MV after indexing the new MV+BE programmes
— embedding retrieval was clean (sim ~0.6 chunks all from fdp-mv-2021),
only the prompt serialisation was lossy.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 app/embeddings.py | 40 ++++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/app/embeddings.py b/app/embeddings.py
index 91dfbc6..8ca13b8 100644
--- a/app/embeddings.py
+++ b/app/embeddings.py
@@ -488,28 +488,52 @@ def get_relevant_quotes_for_antrag(
     return results
 
 
+def _chunk_source_label(chunk: dict) -> str:
+    """Build a fully-qualified source label like 'FDP MV Wahlprogramm 2021, S. 73'.
+
+    Without the programme name + Bundesland in the prompt, the LLM
+    halluzinates familiar sources from its training (typically NRW 2022)
+    even when the retrieved chunks all come from a different state.
+    """
+    prog_id = chunk.get("programm_id", "")
+    info = PROGRAMME.get(prog_id, {})
+    name = info.get("name") or prog_id
+    seite = chunk.get("seite", "?")
+    return f"{name}, S. {seite}"
+
+
 def format_quotes_for_prompt(quotes: dict) -> str:
-    """Format quotes for inclusion in LLM prompt."""
+    """Format quotes for inclusion in LLM prompt.
+
+    Each quote is annotated with the fully-qualified source (programme
+    name + page) so the LLM cannot fall back on training-set defaults
+    when constructing its citations.
+    """
     if not quotes:
         return ""
-    
+
     lines = ["\n## Relevante Passagen aus Wahl- und Parteiprogrammen\n"]
-    
+    lines.append(
+        "Verwende **ausschließlich** die hier gelisteten Quellenangaben "
+        "(Programm-Name + Seite) wörtlich in deinen Zitaten — erfinde "
+        "keine Quellen aus dem Gedächtnis.\n"
+    )
+
     for partei, data in quotes.items():
         lines.append(f"\n### {partei}\n")
-    
+
         if data.get("wahlprogramm"):
             lines.append("**Wahlprogramm:**")
             for chunk in data["wahlprogramm"]:
                 text = chunk["text"][:500] + "..." if len(chunk["text"]) > 500 else chunk["text"]
-                lines.append(f'- S. {chunk["seite"]}: "{text}"')
-    
+                lines.append(f'- {_chunk_source_label(chunk)}: "{text}"')
+
         if data.get("parteiprogramm"):
             lines.append("\n**Grundsatzprogramm:**")
             for chunk in data["parteiprogramm"]:
                 text = chunk["text"][:500] + "..." if len(chunk["text"]) > 500 else chunk["text"]
-                lines.append(f'- S. {chunk["seite"]}: "{text}"')
-    
+                lines.append(f'- {_chunk_source_label(chunk)}: "{text}"')
+
     return "\n".join(lines)
 
 