Embeddings prompt: include programme name in chunk citations
format_quotes_for_prompt previously rendered each retrieved chunk as just "S. X: text", giving the LLM no way to know which Bundesland or Wahlprogramm the passage came from. Result: even when the embedding search correctly returned MV-only chunks, the LLM hallucinated familiar source labels from its training set (typically "FDP NRW Wahlprogramm 2022, S. 75"), because that was its strongest prior for budget/transparency policy citations.

Fix: prepend the fully-qualified PROGRAMME[programm_id]["name"] to each quote and explicitly instruct the model to use these labels verbatim.

Discovered while smoke-testing MV after indexing the new MV+BE programmes: embedding retrieval was clean (sim ~0.6, chunks all from fdp-mv-2021); only the prompt serialisation was lossy.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent 8992cffc64
commit 1b5fd96e16
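For illustration, a minimal before/after sketch of the chunk label. The PROGRAMME entry and chunk contents here are made up, but the field names match what the new helper reads:

    # Stand-in for the module-level PROGRAMME mapping and one retrieved chunk
    # (hypothetical values; real chunks come from the embedding search).
    PROGRAMME = {"fdp-mv-2021": {"name": "FDP MV Wahlprogramm 2021"}}
    chunk = {"programm_id": "fdp-mv-2021", "seite": 73, "text": "..."}

    # Before: only the page number reached the prompt.
    old_label = f'S. {chunk["seite"]}'  # -> 'S. 73'

    # After: the fully-qualified programme name pins down Bundesland and year.
    info = PROGRAMME.get(chunk["programm_id"], {})
    name = info.get("name") or chunk["programm_id"]
    new_label = f'{name}, S. {chunk["seite"]}'  # -> 'FDP MV Wahlprogramm 2021, S. 73'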
@@ -488,28 +488,52 @@ def get_relevant_quotes_for_antrag(
     return results
 
 
+def _chunk_source_label(chunk: dict) -> str:
+    """Build a fully-qualified source label like 'FDP MV Wahlprogramm 2021, S. 73'.
+
+    Without the programme name + Bundesland in the prompt, the LLM
+    hallucinates familiar sources from its training (typically NRW 2022)
+    even when the retrieved chunks all come from a different state.
+    """
+    prog_id = chunk.get("programm_id", "")
+    info = PROGRAMME.get(prog_id, {})
+    name = info.get("name") or prog_id
+    seite = chunk.get("seite", "?")
+    return f"{name}, S. {seite}"
+
+
 def format_quotes_for_prompt(quotes: dict) -> str:
-    """Format quotes for inclusion in LLM prompt."""
+    """Format quotes for inclusion in LLM prompt.
+
+    Each quote is annotated with the fully-qualified source (programme
+    name + page) so the LLM cannot fall back on training-set defaults
+    when constructing its citations.
+    """
     if not quotes:
         return ""
 
     lines = ["\n## Relevante Passagen aus Wahl- und Parteiprogrammen\n"]
+    lines.append(
+        "Verwende **ausschließlich** die hier gelisteten Quellenangaben "
+        "(Programm-Name + Seite) wörtlich in deinen Zitaten — erfinde "
+        "keine Quellen aus dem Gedächtnis.\n"
+    )
     for partei, data in quotes.items():
         lines.append(f"\n### {partei}\n")
 
         if data.get("wahlprogramm"):
             lines.append("**Wahlprogramm:**")
             for chunk in data["wahlprogramm"]:
                 text = chunk["text"][:500] + "..." if len(chunk["text"]) > 500 else chunk["text"]
-                lines.append(f'- S. {chunk["seite"]}: "{text}"')
+                lines.append(f'- {_chunk_source_label(chunk)}: "{text}"')
 
         if data.get("parteiprogramm"):
             lines.append("\n**Grundsatzprogramm:**")
             for chunk in data["parteiprogramm"]:
                 text = chunk["text"][:500] + "..." if len(chunk["text"]) > 500 else chunk["text"]
-                lines.append(f'- S. {chunk["seite"]}: "{text}"')
+                lines.append(f'- {_chunk_source_label(chunk)}: "{text}"')
 
     return "\n".join(lines)
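A usage sketch of the new serialisation path, assuming PROGRAMME maps "fdp-mv-2021" to the name shown above; the quotes dict below is a hand-built placeholder for what get_relevant_quotes_for_antrag returns:

    # Placeholder input; in production this dict comes from get_relevant_quotes_for_antrag().
    quotes = {
        "FDP": {
            "wahlprogramm": [
                {"programm_id": "fdp-mv-2021", "seite": 73, "text": "Transparenter Landeshaushalt ..."},
            ],
            "parteiprogramm": [],
        },
    }
    prompt_block = format_quotes_for_prompt(quotes)
    # The chunk line is now rendered as
    #   - FDP MV Wahlprogramm 2021, S. 73: "Transparenter Landeshaushalt ..."
    # instead of the previous, ambiguous
    #   - S. 73: "Transparenter Landeshaushalt ..."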