From 1b5fd96e1649997f47fbc4bcf94506499d96bf51 Mon Sep 17 00:00:00 2001
From: Dotty Dotter
Date: Wed, 8 Apr 2026 11:24:31 +0200
Subject: [PATCH] Embeddings prompt: include programme name in chunk citations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

format_quotes_for_prompt previously rendered each retrieved chunk as just
"S. X: text", giving the LLM no way to know which Bundesland or
Wahlprogramm the passage came from. Result: even when the embedding search
correctly returned MV-only chunks, the LLM hallucinated familiar source
labels from its training set (typically "FDP NRW Wahlprogramm 2022, S. 75")
because that was its strongest prior for budget/transparency policy
citations.

Fix: prepend the fully-qualified PROGRAMME[programm_id]["name"] to each
quote and explicitly instruct the model to use these labels verbatim.

Discovered while smoke-testing MV after indexing the new MV+BE programmes
— embedding retrieval was clean (sim ~0.6 chunks all from fdp-mv-2021),
only the prompt serialisation was lossy.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 app/embeddings.py | 40 ++++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/app/embeddings.py b/app/embeddings.py
index 91dfbc6..8ca13b8 100644
--- a/app/embeddings.py
+++ b/app/embeddings.py
@@ -488,28 +488,52 @@ def get_relevant_quotes_for_antrag(
     return results
 
 
+def _chunk_source_label(chunk: dict) -> str:
+    """Build a fully-qualified source label like 'FDP MV Wahlprogramm 2021, S. 73'.
+
+    Without the programme name + Bundesland in the prompt, the LLM
+    halluzinates familiar sources from its training (typically NRW 2022)
+    even when the retrieved chunks all come from a different state.
+    """
+    prog_id = chunk.get("programm_id", "")
+    info = PROGRAMME.get(prog_id, {})
+    name = info.get("name") or prog_id
+    seite = chunk.get("seite", "?")
+    return f"{name}, S. {seite}"
+
+
 def format_quotes_for_prompt(quotes: dict) -> str:
-    """Format quotes for inclusion in LLM prompt."""
+    """Format quotes for inclusion in LLM prompt.
+
+    Each quote is annotated with the fully-qualified source (programme
+    name + page) so the LLM cannot fall back on training-set defaults
+    when constructing its citations.
+    """
     if not quotes:
         return ""
-    
+
     lines = ["\n## Relevante Passagen aus Wahl- und Parteiprogrammen\n"]
-    
+    lines.append(
+        "Verwende **ausschließlich** die hier gelisteten Quellenangaben "
+        "(Programm-Name + Seite) wörtlich in deinen Zitaten — erfinde "
+        "keine Quellen aus dem Gedächtnis.\n"
+    )
+
     for partei, data in quotes.items():
         lines.append(f"\n### {partei}\n")
-    
+
         if data.get("wahlprogramm"):
             lines.append("**Wahlprogramm:**")
             for chunk in data["wahlprogramm"]:
                 text = chunk["text"][:500] + "..." if len(chunk["text"]) > 500 else chunk["text"]
-                lines.append(f'- S. {chunk["seite"]}: "{text}"')
-    
+                lines.append(f'- {_chunk_source_label(chunk)}: "{text}"')
+
         if data.get("parteiprogramm"):
             lines.append("\n**Grundsatzprogramm:**")
             for chunk in data["parteiprogramm"]:
                 text = chunk["text"][:500] + "..." if len(chunk["text"]) > 500 else chunk["text"]
-                lines.append(f'- S. {chunk["seite"]}: "{text}"')
-    
+                lines.append(f'- {_chunk_source_label(chunk)}: "{text}"')
+
     return "\n".join(lines)
 
 