Embeddings prompt: include programme name in chunk citations

format_quotes_for_prompt previously rendered each retrieved chunk
as just "S. X: text", giving the LLM no way to know which
Bundesland or Wahlprogramm the passage came from. Result: even
when the embedding search correctly returned MV-only chunks, the
LLM hallucinated familiar source labels from its training set
(typically "FDP NRW Wahlprogramm 2022, S. 75") because that was
its strongest prior for budget/transparency policy citations.

Fix: prepend the fully-qualified PROGRAMME[programm_id]["name"]
to each quote and explicitly instruct the model to use these
labels verbatim. Discovered while smoke-testing MV after
indexing the new MV+BE programmes — embedding retrieval was
clean (similarity ~0.6, all chunks from fdp-mv-2021); only the
prompt serialisation was lossy.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dotty Dotter 2026-04-08 11:24:31 +02:00
parent 8992cffc64
commit 1b5fd96e16

View File

@ -488,12 +488,36 @@ def get_relevant_quotes_for_antrag(
return results return results
def _chunk_source_label(chunk: dict) -> str:
"""Build a fully-qualified source label like 'FDP MV Wahlprogramm 2021, S. 73'.
Without the programme name + Bundesland in the prompt, the LLM
halluzinates familiar sources from its training (typically NRW 2022)
even when the retrieved chunks all come from a different state.
"""
prog_id = chunk.get("programm_id", "")
info = PROGRAMME.get(prog_id, {})
name = info.get("name") or prog_id
seite = chunk.get("seite", "?")
return f"{name}, S. {seite}"
def format_quotes_for_prompt(quotes: dict) -> str: def format_quotes_for_prompt(quotes: dict) -> str:
"""Format quotes for inclusion in LLM prompt.""" """Format quotes for inclusion in LLM prompt.
Each quote is annotated with the fully-qualified source (programme
name + page) so the LLM cannot fall back on training-set defaults
when constructing its citations.
"""
if not quotes: if not quotes:
return "" return ""
lines = ["\n## Relevante Passagen aus Wahl- und Parteiprogrammen\n"] lines = ["\n## Relevante Passagen aus Wahl- und Parteiprogrammen\n"]
lines.append(
"Verwende **ausschließlich** die hier gelisteten Quellenangaben "
"(Programm-Name + Seite) wörtlich in deinen Zitaten — erfinde "
"keine Quellen aus dem Gedächtnis.\n"
)
for partei, data in quotes.items(): for partei, data in quotes.items():
lines.append(f"\n### {partei}\n") lines.append(f"\n### {partei}\n")
@ -502,13 +526,13 @@ def format_quotes_for_prompt(quotes: dict) -> str:
lines.append("**Wahlprogramm:**") lines.append("**Wahlprogramm:**")
for chunk in data["wahlprogramm"]: for chunk in data["wahlprogramm"]:
text = chunk["text"][:500] + "..." if len(chunk["text"]) > 500 else chunk["text"] text = chunk["text"][:500] + "..." if len(chunk["text"]) > 500 else chunk["text"]
lines.append(f'- S. {chunk["seite"]}: "{text}"') lines.append(f'- {_chunk_source_label(chunk)}: "{text}"')
if data.get("parteiprogramm"): if data.get("parteiprogramm"):
lines.append("\n**Grundsatzprogramm:**") lines.append("\n**Grundsatzprogramm:**")
for chunk in data["parteiprogramm"]: for chunk in data["parteiprogramm"]:
text = chunk["text"][:500] + "..." if len(chunk["text"]) > 500 else chunk["text"] text = chunk["text"][:500] + "..." if len(chunk["text"]) > 500 else chunk["text"]
lines.append(f'- S. {chunk["seite"]}: "{text}"') lines.append(f'- {_chunk_source_label(chunk)}: "{text}"')
return "\n".join(lines) return "\n".join(lines)