# Maximum number of characters of a chunk's text quoted verbatim in the prompt;
# longer texts are cut at this length and suffixed with "...".
_MAX_QUOTE_CHARS = 500

# (key inside a party's quote dict, heading emitted before that section).
# Order matters: the election programme is listed before the party programme.
_QUOTE_SECTIONS = (
    ("wahlprogramm", "**Wahlprogramm:**"),
    ("parteiprogramm", "\n**Grundsatzprogramm:**"),
)


def _chunk_source_label(chunk: dict) -> str:
    """Build a qualified source label like 'FDP MV Wahlprogramm 2021, S. 73'.

    The programme name is looked up in the module-level ``PROGRAMME``
    registry via the chunk's ``programm_id``; if the registry has no name
    for that id, the raw id is used instead. Without the programme name in
    the prompt, the LLM hallucinates familiar sources from its training
    (typically NRW 2022) even when the retrieved chunks all come from a
    different state.

    Args:
        chunk: Retrieved chunk; ``programm_id`` and ``seite`` are optional.

    Returns:
        ``"<name>, S. <seite>"``, or just ``"S. <seite>"`` when the chunk
        carries no programme id. A missing page is rendered as ``"?"``.
    """
    seite = chunk.get("seite", "?")
    prog_id = chunk.get("programm_id") or ""
    if not prog_id:
        # No programme known: emit the bare page reference instead of a
        # malformed label such as ", S. 73" or "None, S. 73".
        return f"S. {seite}"

    info = PROGRAMME.get(prog_id) or {}
    name = info.get("name") or prog_id
    return f"{name}, S. {seite}"


def _format_quote_line(chunk: dict) -> str:
    """Render one chunk as a markdown bullet: source label plus quoted text."""
    text = chunk["text"]
    if len(text) > _MAX_QUOTE_CHARS:
        text = text[:_MAX_QUOTE_CHARS] + "..."
    return f'- {_chunk_source_label(chunk)}: "{text}"'


def format_quotes_for_prompt(quotes: dict) -> str:
    """Format quotes for inclusion in an LLM prompt.

    Each quote is annotated with its source label (programme name + page)
    so the LLM cannot fall back on training-set defaults when constructing
    its citations.

    Args:
        quotes: Mapping of party name to a dict that may contain the keys
            ``"wahlprogramm"`` and ``"parteiprogramm"``, each a list of
            chunk dicts with at least a ``"text"`` entry.

    Returns:
        A markdown section for the prompt, or ``""`` if ``quotes`` is empty.

    Raises:
        KeyError: If a chunk has no ``"text"`` entry.
    """
    if not quotes:
        return ""

    lines = [
        "\n## Relevante Passagen aus Wahl- und Parteiprogrammen\n",
        "Verwende **ausschließlich** die hier gelisteten Quellenangaben "
        "(Programm-Name + Seite) wörtlich in deinen Zitaten — erfinde "
        "keine Quellen aus dem Gedächtnis.\n",
    ]

    for partei, data in quotes.items():
        lines.append(f"\n### {partei}\n")

        for key, heading in _QUOTE_SECTIONS:
            chunks = data.get(key)
            if not chunks:
                # Skip the heading entirely when a section has no quotes.
                continue
            lines.append(heading)
            lines.extend(_format_quote_line(chunk) for chunk in chunks)

    return "\n".join(lines)