Klick auf eine Zitat-Quelle im Report öffnet jetzt eine 1-Seiten-PDF- Variante des Wahlprogramms mit gelb markiertem Snippet, statt nur zum Page-Anchor zu springen und den Leser selbst suchen zu lassen. Implementation: embeddings.render_highlighted_page(programm_id, seite, query) - Validiert programm_id gegen PROGRAMME (Path-Traversal-Schutz) - Lädt das volle Wahlprogramm-PDF, extrahiert via insert_pdf nur die angeforderte Seite in einen neuen Document → kleinere Response - search_for(query[:200]) → Bounding-Boxes aller Treffer - Fallback: 5-Wort-Anker wenn Volltext-Match leer (LLM-Truncation, identisch zu find_chunk_for_text/Sub-D-Logik) - add_highlight_annot mit gelber stroke-Color (1.0, 0.93, 0.0) - Returns serialisierte PDF-Bytes oder None embeddings._chunk_pdf_url - Wenn chunk["text"] vorhanden: emittiert /api/wahlprogramm-cite-URL mit pid=, seite=, q=urlencoded(text[:200]) - Sonst: alter statischer /static/referenzen/X.pdf#page=N (Pre-#47 rückwärts-kompatibel) - text wird auf 200 Zeichen abgeschnitten, sonst blasen 500-Zeichen-Snippets jedes Assessment-JSON auf main.py /api/wahlprogramm-cite Endpoint - Validiert pid gegen PROGRAMME registry - seite: 1 ≤ n ≤ 2000 - Response: application/pdf, Cache-Control max-age=86400 - 404 bei unknown pid oder fehlendem PDF, 400 bei seite out of range Reconstruct-Pipeline (Issue #60 Option B) zieht das automatisch durch: reconstruct_zitate ruft _chunk_pdf_url(matched_chunk) auf, der jetzt bevorzugt die Cite-URL emittiert. Keine Änderung an reconstruct_zitate selbst nötig. Tests: 194/194 grün (185 + 9 neue): - TestChunkPdfUrl: 4 Cases (cite vs static, unknown prog, 200-char-truncate) - TestRenderHighlightedPage: 5 Cases (unknown pid, invalid seite, valid render, empty query, query-not-found-falls-back-zu-leerem-Highlight) - Plus Bridge im Test-Stub: pymupdf-as-fitz Shim falls eine third-party "fitz" das Pkg shadowt (kommt auf älteren Dev-Setups vor) Refs: #47
505 lines
22 KiB
Python
505 lines
22 KiB
Python
"""Tests for embeddings.py prompt formatting.
|
|
|
|
Reproduces the LLM-Halluzinations-Bug from the 2026-04-08 session
|
|
(commits 1b5fd96 + bc7f4a6): the original ``format_quotes_for_prompt``
|
|
rendered each chunk as ``- S. X: "text"`` without any reference to the
|
|
programme name. As a result the LLM hallucinated familiar source labels
|
|
("FDP NRW Wahlprogramm 2022") for chunks that actually came from MV/BE,
|
|
because that was the strongest training-set prior for budget-policy
|
|
citations.
|
|
|
|
Fix: prepend the fully-qualified PROGRAMME[programm_id]["name"] to each
|
|
quote.
|
|
"""
|
|
import sys
|
|
import types
|
|
|
|
import pytest
|
|
|
|
# Stand in for the real ``openai`` package when it has not been imported:
# ``app.embeddings`` imports it at module load, but these tests never make
# API calls, so a module whose ``OpenAI`` factory returns None is enough.
if "openai" not in sys.modules:
    _openai_stub = types.ModuleType("openai")

    def _fake_openai_client(**_kwargs):
        # Accepts and ignores any constructor keywords (api_key, ...).
        return None

    _openai_stub.OpenAI = _fake_openai_client
    sys.modules["openai"] = _openai_stub
|
|
|
|
# On some dev machines an unrelated third-party "fitz" package shadows
# PyMuPDF's legacy import alias.  Probe whichever module actually loads
# and, when it is not the real PyMuPDF (no ``open``), register ``pymupdf``
# (the canonical distribution name in PyMuPDF >= 1.24) under "fitz".


def _fitz_is_usable() -> bool:
    """Return True when ``import fitz`` yields a module exposing ``open``."""
    try:
        import fitz as _probe
    except ImportError:
        return False
    return hasattr(_probe, "open")


if not _fitz_is_usable():
    try:
        import pymupdf as _pymupdf
        sys.modules["fitz"] = _pymupdf
    except ImportError:
        pass  # neither package available — render tests skip via fixture below
|
|
|
|
from app import embeddings as embeddings_mod
|
|
from app.embeddings import (
|
|
_chunk_pdf_url,
|
|
_chunk_source_label,
|
|
find_chunk_for_text,
|
|
format_quotes_for_prompt,
|
|
get_relevant_quotes_for_antrag,
|
|
reconstruct_zitate,
|
|
render_highlighted_page,
|
|
PROGRAMME,
|
|
)
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# _chunk_source_label — fully-qualified programme name + page
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
class TestChunkSourceLabel:
    """``_chunk_source_label`` must emit the fully-qualified programme name
    plus a page reference, so prompt chunks are never ambiguous."""

    def test_known_programme_id(self):
        label = _chunk_source_label(
            {"programm_id": "fdp-mv-2021", "seite": 73, "text": "..."}
        )
        assert "FDP Mecklenburg-Vorpommern" in label
        assert "S. 73" in label

    def test_known_programme_id_for_be(self):
        label = _chunk_source_label(
            {"programm_id": "spd-be-2023", "seite": 24, "text": "..."}
        )
        assert "SPD Berlin" in label
        # The BE-2023.pdf files actually contain the 2021 programmes.
        assert "2021" in label
        assert "S. 24" in label

    def test_unknown_programme_id_falls_back_to_id(self):
        label = _chunk_source_label(
            {"programm_id": "fake-xx-9999", "seite": 1, "text": "..."}
        )
        # An unknown id must not crash; at minimum the raw id and the
        # page still appear in the label.
        assert "fake-xx-9999" in label
        assert "S. 1" in label

    def test_missing_seite_uses_questionmark(self):
        label = _chunk_source_label({"programm_id": "cdu-mv-2021", "text": "..."})
        assert "?" in label
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# format_quotes_for_prompt — every chunk must carry programme identification
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
# Shared fixture: one FDP Wahlprogramm chunk, one FDP Grundsatzprogramm
# chunk and one SPD Wahlprogramm chunk — all MV sources, deliberately no
# NRW material (see test_no_nrw_2022_appears_unless_chunks_are_actually_nrw).
EXAMPLE_QUOTES = {
    "FDP": {
        "wahlprogramm": [
            {
                "programm_id": "fdp-mv-2021",
                "partei": "FDP",
                "typ": "wahlprogramm",
                "seite": 73,
                "text": "Die Grundsätze von Wirtschaftlichkeit und Sparsamkeit",
                "similarity": 0.63,
            },
        ],
        "parteiprogramm": [
            {
                "programm_id": "fdp-grundsatz",
                "partei": "FDP",
                "typ": "parteiprogramm",
                "seite": 93,
                "text": "Liberale Marktwirtschaft erfordert solide Haushalte",
                "similarity": 0.60,
            },
        ],
    },
    "SPD": {
        "wahlprogramm": [
            {
                "programm_id": "spd-mv-2021",
                "partei": "SPD",
                "typ": "wahlprogramm",
                "seite": 22,
                "text": "Verkehrswende weg vom motorisierten Individualverkehr",
                "similarity": 0.58,
            },
        ],
        # NOTE: intentionally no "parteiprogramm" key — formatting must
        # tolerate a party with only one programme type.
    },
}
|
|
|
|
|
|
class TestFormatQuotesForPrompt:
    """Prompt rendering for retrieved chunks.

    Covers the hallucination fix (a programme name on every chunk), the
    [Qn] ENUM anchors from issue #60, and the ``partei_upper`` NameError
    regression in ``get_relevant_quotes_for_antrag``.
    """

    def test_empty_input_returns_empty_string(self):
        assert format_quotes_for_prompt({}) == ""

    def test_renders_party_headings(self):
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        assert "### FDP" in out
        assert "### SPD" in out

    def test_every_chunk_has_programme_name(self):
        """Regression: pre-fix this used "S. X:" only, no programme name —
        the LLM then hallucinated NRW-2022 sources from training data."""
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        # Each of the three chunks must reference its source programme
        assert "FDP Mecklenburg-Vorpommern" in out
        assert "FDP Grundsatzprogramm" in out
        assert "SPD Mecklenburg-Vorpommern" in out

    def test_contains_strict_citation_instruction(self):
        """The prompt header must explicitly forbid hallucinated sources."""
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        # "wörtlich" (= verbatim) is the key word of the instruction text.
        assert "wörtlich" in out.lower()

    def test_chunks_get_enum_ids(self):
        """Issue #60 fix: each chunk must be tagged with a stable [Qn] id
        so the LLM can be forced to anchor every citation in a specific
        retrieved chunk instead of inventing snippets from training data.
        """
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        # 2 wahlprogramm chunks + 1 grundsatz chunk = 3 IDs total
        assert "[Q1]" in out
        assert "[Q2]" in out
        assert "[Q3]" in out
        assert "[Q4]" not in out  # only 3 chunks in EXAMPLE_QUOTES

    def test_zitateregel_mentions_enum_anchor(self):
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        # The prompt header must mention the ENUM anchor mechanism so
        # the LLM understands what [Qn] means.
        assert "[Q" in out
        assert "ZITATEREGEL" in out

    def test_no_nrw_2022_appears_unless_chunks_are_actually_nrw(self):
        """Sanity: a pure MV+SPD chunk set must not mention NRW anywhere."""
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        assert "NRW" not in out
        assert "Nordrhein-Westfalen" not in out

    def test_renders_separate_blocks_for_wahl_and_parteiprogramm(self):
        out = format_quotes_for_prompt(EXAMPLE_QUOTES)
        assert "**Wahlprogramm:**" in out
        assert "**Grundsatzprogramm:**" in out

    def test_get_relevant_quotes_for_antrag_populates_results(self, monkeypatch):
        """Regression for the partei_upper NameError (Phase B / #55 / eb045d0):

        The dict-write line still referenced ``partei_upper`` after the
        rest of the function had been renamed to ``partei_lookup``. The
        result was that ``get_relevant_quotes_for_antrag`` raised
        ``NameError`` on every call, was silently swallowed by the
        ``except Exception`` in ``analyzer.run_analysis``, and silently
        downgraded *every* assessment to keyword search — which then
        caused the LLM hallucinations tracked in #60.

        Test strategy: monkeypatch ``find_relevant_chunks`` so we don't
        need real embeddings, then call the wrapper and assert it
        actually returns a populated dict instead of crashing.
        """
        # Stand-in for the embeddings search: one deterministic chunk that
        # echoes back whatever party/typ the wrapper asked for.
        def fake_find_relevant_chunks(query, parteien=None, typ=None,
                                      bundesland=None, top_k=3,
                                      min_similarity=0.5):
            return [{
                "programm_id": "gruene-nrw-2022",
                "partei": parteien[0] if parteien else "GRÜNE",
                "typ": typ or "wahlprogramm",
                "seite": 58,
                "text": "Wahlalter ab 16",
                "similarity": 0.7,
            }]

        # Patch on the module object so the wrapper's internal lookup
        # resolves to the fake (module-attribute access, not a closure).
        monkeypatch.setattr(embeddings_mod, "find_relevant_chunks",
                            fake_find_relevant_chunks)

        result = get_relevant_quotes_for_antrag(
            antrag_text="Wahlalter ab 16",
            fraktionen=["GRÜNE"],
            bundesland="NRW",
            top_k_per_partei=2,
        )
        assert result, "Expected a non-empty result dict, got empty"
        # The keys are canonical party names; either GRÜNE itself or
        # whatever the canonical mapper returns for it.
        assert any("GR" in k.upper() for k in result.keys())
        # And the structure must be the {wahlprogramm, parteiprogramm} dict
        first = next(iter(result.values()))
        assert "wahlprogramm" in first
        assert "parteiprogramm" in first
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# reconstruct_zitate — Issue #60 Option B (server-side citation rewrite)
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestReconstructZitate:
    """Verify the post-processing pass that overwrites LLM-emitted quelle/url
    with the canonical source label of whichever retrieved chunk actually
    contains the cited text. Drops zitate that don't match any chunk.

    Background: BB 8/673 (Sub-D run after the A+C deploy) showed the LLM
    copying real text from one chunk but writing the page number from a
    different chunk into ``quelle``. The ENUM-anchor in the prompt is only
    a soft hint; this post-processing step is the structural binding.
    """

    def _make_chunk(self, programm_id: str, seite: int, text: str) -> dict:
        # Minimal chunk dict in the shape find_relevant_chunks returns.
        return {
            "programm_id": programm_id,
            "partei": programm_id.split("-")[0].upper(),
            "typ": "wahlprogramm",
            "seite": seite,
            "text": text,
            "similarity": 0.7,
        }

    def test_overwrites_wrong_seite_with_real_chunk_seite(self):
        """The BB 8/673 case: LLM cites text from S.27 chunk but writes
        S.4 in quelle. After reconstruct_zitate the quelle must point to
        the real S.27 chunk."""
        real_chunk = self._make_chunk(
            "bsw-bb-2024", 27,
            "wertschätzung für lehrerinnen und lehrer abbau von arbeitsüberlastung",
        )
        wrong_chunk = self._make_chunk(
            "bsw-bb-2024", 4,
            "in brandenburg weniger als 14 euro in der stunde verdient",
        )
        semantic_quotes = {
            "BSW": {"wahlprogramm": [wrong_chunk, real_chunk], "parteiprogramm": []},
        }
        data = {
            "wahlprogrammScores": [{
                "fraktion": "BSW",
                "wahlprogramm": {
                    "score": 7,
                    "begründung": "...",
                    "zitate": [{
                        "text": "Wertschätzung für Lehrerinnen und Lehrer Abbau von Arbeitsüberlastung",
                        "quelle": "BSW Brandenburg Wahlprogramm 2024, S. 4",  # WRONG
                        "url": "/static/referenzen/bsw-bb-2024.pdf#page=4",
                    }],
                },
                "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []},
            }],
        }
        out = reconstruct_zitate(data, semantic_quotes)
        z = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]
        assert z["quelle"] == "BSW Brandenburg Wahlprogramm 2024, S. 27"
        # Post-#47: the URL is the highlight-cite endpoint with pid+seite+q.
        # The static fallback only remains when the chunk has no text field.
        assert z["url"].startswith("/api/wahlprogramm-cite?")
        assert "pid=bsw-bb-2024" in z["url"]
        assert "seite=27" in z["url"]

    def test_drops_zitat_not_found_in_any_chunk(self):
        """If a snippet was hallucinated entirely (no matching chunk),
        the zitat must be removed rather than persisted."""
        chunk = self._make_chunk(
            "spd-lsa-2021", 41,
            "die stärkung einer geschlechtersensiblen berufsorientierung",
        )
        semantic_quotes = {
            "SPD": {"wahlprogramm": [chunk], "parteiprogramm": []},
        }
        data = {
            "wahlprogrammScores": [{
                "fraktion": "SPD",
                "wahlprogramm": {
                    "score": 7,
                    "begründung": "...",
                    "zitate": [
                        # First zitat matches no chunk → must be dropped.
                        {"text": "Wir Sozialdemokratinnen ächten Rechtsextremismus seit 1863",
                         "quelle": "SPD Sachsen-Anhalt 2021, S. 37"},
                        # Second zitat is a real chunk substring → kept.
                        {"text": "die Stärkung einer geschlechtersensiblen Berufsorientierung",
                         "quelle": "SPD Sachsen-Anhalt 2021, S. 41"},
                    ],
                },
                "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []},
            }],
        }
        out = reconstruct_zitate(data, semantic_quotes)
        zitate = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"]
        assert len(zitate) == 1
        assert "geschlechtersensiblen" in zitate[0]["text"]

    def test_empty_semantic_quotes_is_noop(self):
        data = {"wahlprogrammScores": [{
            "fraktion": "X",
            "wahlprogramm": {"score": 5, "begründung": "x",
                             "zitate": [{"text": "abc def ghi jkl mno pqr", "quelle": "X"}]},
            "parteiprogramm": {"score": 0, "begründung": "x", "zitate": []},
        }]}
        out = reconstruct_zitate(data, {})
        # No chunks → no postprocessing applied; data passes through unchanged
        assert out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]["quelle"] == "X"

    def test_anchor_match_when_full_substring_misses(self):
        """LLM may slightly truncate a snippet — 5-word-anchor still binds."""
        chunk = self._make_chunk(
            "cdu-nrw-2022", 24,
            "wir wollen interprofessionelle netzwerkstrukturen für kinderschutz fördern dazu werden wir stellen schaffen",
        )
        semantic_quotes = {"CDU": {"wahlprogramm": [chunk], "parteiprogramm": []}}
        data = {"wahlprogrammScores": [{
            "fraktion": "CDU",
            "wahlprogramm": {
                "score": 8, "begründung": "...",
                "zitate": [{
                    "text": "Wir wollen interprofessionelle Netzwerkstrukturen für Kinderschutz fördern",
                    "quelle": "CDU NRW Wahlprogramm 2022, S. 999",  # wrong page
                }],
            },
            "parteiprogramm": {"score": 0, "begründung": "...", "zitate": []},
        }]}
        out = reconstruct_zitate(data, semantic_quotes)
        z = out["wahlprogrammScores"][0]["wahlprogramm"]["zitate"][0]
        assert z["quelle"] == "CDU NRW Wahlprogramm 2022, S. 24"

    def test_find_chunk_for_text_short_needle_returns_none(self):
        # Needles that are too short to be meaningful must never match.
        chunk = self._make_chunk("x", 1, "egal was hier steht")
        assert find_chunk_for_text("ja", [chunk]) is None

    def test_find_chunk_for_text_handles_soft_hyphen(self):
        chunk = self._make_chunk(
            "bsw-bb-2024", 27,
            "handys und tablets wertschätzung für lehrerinnen und lehrer",
        )
        # LLM-emitted text with the soft hyphen \xad mid-word, as PyMuPDF
        # would extract from a PDF line break.
        # NOTE(review): no \xad is visible in the literal below — it may
        # have been lost in a copy/paste; confirm the byte is present.
        text = "Handys und Tablets. Wertschätzung für Lehrerinnen und Lehrer"
        assert find_chunk_for_text(text, [chunk]) is chunk
|
|
|
|
|
|
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# _chunk_pdf_url + render_highlighted_page — Issue #47 PDF-Highlighting
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestChunkPdfUrl:
    """Verify the URL builder switches between the cite-endpoint (when
    chunk text is present) and the static fallback (Pre-#47 chunks).
    """

    def test_cite_url_when_text_present(self):
        chunk = {
            "programm_id": "gruene-grundsatz",
            "seite": 36,
            "text": "Plattformen müssen umfassend reguliert werden",
        }
        url = _chunk_pdf_url(chunk)
        assert url is not None
        assert url.startswith("/api/wahlprogramm-cite?")
        assert "pid=gruene-grundsatz" in url
        assert "seite=36" in url
        # URL-encoded query (urlencode/quote_plus uses + for space)
        assert "Plattformen" in url

    def test_static_fallback_when_no_text(self):
        # Pre-#47 chunk without a text field → old static page anchor.
        chunk = {"programm_id": "fdp-mv-2021", "seite": 73}
        url = _chunk_pdf_url(chunk)
        assert url == "/static/referenzen/fdp-mv-2021.pdf#page=73"

    def test_unknown_programme_returns_none(self):
        # Unknown programme ids must not produce any URL (path-traversal
        # protection lives in the PROGRAMME registry lookup).
        chunk = {"programm_id": "fake-xx-9999", "seite": 1, "text": "x" * 50}
        assert _chunk_pdf_url(chunk) is None

    def test_url_truncates_long_text_to_200_chars(self):
        chunk = {
            "programm_id": "gruene-grundsatz",
            "seite": 36,
            "text": "A" * 1000,
        }
        url = _chunk_pdf_url(chunk)
        assert url is not None
        # The embedded text is capped at 200 chars — otherwise 500-char
        # snippets bloat every assessment JSON.  Pin the cap exactly:
        # the previous ``"A" * 1000 not in url`` check also passed for a
        # wrong truncation at e.g. 500 or 999 chars.
        assert "A" * 201 not in url
        assert "A" * 200 in url
|
|
|
|
|
|
class TestRenderHighlightedPage:
    """Smoke test against a real Wahlprogramm PDF from the referenzen
    directory. Confirms that PyMuPDF produces a one-page PDF with a
    highlight annotation. Skipped when the test PDF is not present in
    the repo.
    """

    @pytest.fixture
    def sample_pid(self):
        # Use a small, reliably-present entry from PROGRAMME:
        # spd-grundsatz has been indexed since day 1 and its PDF is
        # committed to the repo.
        from pathlib import Path
        from app.embeddings import PROGRAMME
        pid = "spd-grundsatz"
        info = PROGRAMME.get(pid)
        if not info:
            pytest.skip("PROGRAMME registry missing spd-grundsatz")
        path = Path(__file__).parent.parent / "app" / "static" / "referenzen" / info["pdf"]
        if not path.exists():
            pytest.skip(f"Test-PDF {path} nicht im Repo")
        return pid

    def test_unknown_pid_returns_none(self):
        assert render_highlighted_page("fake-xx-9999", 1, "x") is None

    def test_invalid_seite_returns_none(self, sample_pid):
        # Both out-of-range directions: past the end and below page 1.
        assert render_highlighted_page(sample_pid, 99999, "x") is None
        assert render_highlighted_page(sample_pid, 0, "x") is None

    def test_renders_single_page_pdf(self, sample_pid):
        out = render_highlighted_page(sample_pid, 1, "Soziale Gerechtigkeit")
        assert out is not None
        assert isinstance(out, bytes)
        # PDF magic header
        assert out[:5] == b"%PDF-"
        # insert_pdf copies shared resources (fonts, images) along, so a
        # one-page sub-PDF is not necessarily tiny.  We therefore only
        # assert that it is strictly smaller than the original programme
        # PDF (the assertion is "< original", not the <50% the earlier
        # comment suggested).
        from pathlib import Path
        info = PROGRAMME[sample_pid]
        original_size = (
            Path(__file__).parent.parent / "app" / "static" / "referenzen" / info["pdf"]
        ).stat().st_size
        assert len(out) < original_size, (
            f"sub-PDF {len(out)} not smaller than original {original_size}"
        )

    def test_returns_pdf_even_when_query_empty(self, sample_pid):
        # Empty query → render the page without any annotations
        out = render_highlighted_page(sample_pid, 1, "")
        assert out is not None
        assert out[:5] == b"%PDF-"

    def test_returns_pdf_even_when_query_not_found(self, sample_pid):
        # No match → still render the page (no highlights)
        out = render_highlighted_page(
            sample_pid, 1, "this exact phrase definitely does not exist anywhere",
        )
        assert out is not None
        assert out[:5] == b"%PDF-"
|
|
|
|
|
|
def test_format_quotes_truncates_long_chunks_at_500_chars():
    """format_quotes_for_prompt must cap each chunk's text at 500 chars.

    (History: this test used to sit as a method on
    TestRenderHighlightedPage — wrong class placement caused by edit
    ordering — and is module-level now.)
    """
    long_chunk = {
        "FDP": {
            "wahlprogramm": [
                {
                    "programm_id": "fdp-mv-2021",
                    "seite": 1,
                    "text": "A" * 1000,  # 1000 chars → must be truncated
                    "similarity": 0.7,
                }
            ],
        }
    }
    out = format_quotes_for_prompt(long_chunk)
    # Truncation marker appears ...
    assert "..." in out
    # ... and the cap really is 500: the previous ``"A" * 1000 not in out``
    # check also passed for a wrong truncation at e.g. 999 chars.
    assert "A" * 501 not in out
|