Embeddings prompt: include programme name in chunk citations
format_quotes_for_prompt previously rendered each retrieved chunk as just "S. X: text", giving the LLM no way to know which Bundesland or Wahlprogramm the passage came from. Result: even when the embedding search correctly returned MV-only chunks, the LLM hallucinated familiar source labels from its training set (typically "FDP NRW Wahlprogramm 2022, S. 75"), because that was its strongest prior for budget/transparency policy citations.

Fix: prepend the fully-qualified PROGRAMME[programm_id]["name"] to each quote and explicitly instruct the model to use these labels verbatim.

Discovered while smoke-testing MV after indexing the new MV+BE programmes: embedding retrieval was clean (sim ~0.6, chunks all from fdp-mv-2021); only the prompt serialisation was lossy.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent 8992cffc64
commit 1b5fd96e16
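For illustration, a minimal before/after sketch of the chunk label. The PROGRAMME entry and chunk contents here are made up, but the field names match what the new helper reads:

    # Stand-in for the module-level PROGRAMME mapping and one retrieved chunk
    # (hypothetical values; real chunks come from the embedding search).
    PROGRAMME = {"fdp-mv-2021": {"name": "FDP MV Wahlprogramm 2021"}}
    chunk = {"programm_id": "fdp-mv-2021", "seite": 73, "text": "..."}

    # Before: only the page number reached the prompt.
    old_label = f'S. {chunk["seite"]}'  # -> 'S. 73'

    # After: the fully-qualified programme name pins down Bundesland and year.
    info = PROGRAMME.get(chunk["programm_id"], {})
    name = info.get("name") or chunk["programm_id"]
    new_label = f'{name}, S. {chunk["seite"]}'  # -> 'FDP MV Wahlprogramm 2021, S. 73'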
@@ -488,28 +488,52 @@ def get_relevant_quotes_for_antrag(
     return results
 
 
+def _chunk_source_label(chunk: dict) -> str:
+    """Build a fully-qualified source label like 'FDP MV Wahlprogramm 2021, S. 73'.
+
+    Without the programme name + Bundesland in the prompt, the LLM
+    hallucinates familiar sources from its training (typically NRW 2022)
+    even when the retrieved chunks all come from a different state.
+    """
+    prog_id = chunk.get("programm_id", "")
+    info = PROGRAMME.get(prog_id, {})
+    name = info.get("name") or prog_id
+    seite = chunk.get("seite", "?")
+    return f"{name}, S. {seite}"
+
+
 def format_quotes_for_prompt(quotes: dict) -> str:
-    """Format quotes for inclusion in LLM prompt."""
+    """Format quotes for inclusion in LLM prompt.
+
+    Each quote is annotated with the fully-qualified source (programme
+    name + page) so the LLM cannot fall back on training-set defaults
+    when constructing its citations.
+    """
     if not quotes:
         return ""
 
     lines = ["\n## Relevante Passagen aus Wahl- und Parteiprogrammen\n"]
+    lines.append(
+        "Verwende **ausschließlich** die hier gelisteten Quellenangaben "
+        "(Programm-Name + Seite) wörtlich in deinen Zitaten — erfinde "
+        "keine Quellen aus dem Gedächtnis.\n"
+    )
     for partei, data in quotes.items():
         lines.append(f"\n### {partei}\n")
 
         if data.get("wahlprogramm"):
             lines.append("**Wahlprogramm:**")
             for chunk in data["wahlprogramm"]:
                 text = chunk["text"][:500] + "..." if len(chunk["text"]) > 500 else chunk["text"]
-                lines.append(f'- S. {chunk["seite"]}: "{text}"')
+                lines.append(f'- {_chunk_source_label(chunk)}: "{text}"')
 
         if data.get("parteiprogramm"):
             lines.append("\n**Grundsatzprogramm:**")
             for chunk in data["parteiprogramm"]:
                 text = chunk["text"][:500] + "..." if len(chunk["text"]) > 500 else chunk["text"]
-                lines.append(f'- S. {chunk["seite"]}: "{text}"')
+                lines.append(f'- {_chunk_source_label(chunk)}: "{text}"')
 
     return "\n".join(lines)
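A usage sketch of the new serialisation path, assuming PROGRAMME maps "fdp-mv-2021" to the name shown above; the quotes dict below is a hand-built placeholder for what get_relevant_quotes_for_antrag returns:

    # Placeholder input; in production this dict comes from get_relevant_quotes_for_antrag().
    quotes = {
        "FDP": {
            "wahlprogramm": [
                {"programm_id": "fdp-mv-2021", "seite": 73, "text": "Transparenter Landeshaushalt ..."},
            ],
            "parteiprogramm": [],
        },
    }
    prompt_block = format_quotes_for_prompt(quotes)
    # The chunk line is now rendered as
    #   - FDP MV Wahlprogramm 2021, S. 73: "Transparenter Landeshaushalt ..."
    # instead of the previous, ambiguous
    #   - S. 73: "Transparenter Landeshaushalt ..."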