Add pytest suite + fix two regex bugs uncovered by it (#46)
Erste Tests für die Codebase. 77 Tests, 0.08s Laufzeit; sie decken die
drei Bug-Klassen aus der April-2026-Adapter-Session ab und haben
bereits zwei weitere Bugs im Production-Code aufgedeckt.
## Setup
- requirements-dev.txt mit pytest + pytest-asyncio
- pytest.ini mit asyncio_mode=auto
- tests/conftest.py stubbt fitz/bs4/openai/pydantic_settings, damit
die Suite ohne den vollen prod-requirements-Satz läuft (pure unit
tests, kein PDF-Parsing, kein HTTP)
## Tests
- tests/test_parlamente.py (33 Tests)
* PortalaAdapter._parse_hit_list_cards: doctype/doctype_full
NameError-Regression aus 1cb030a, plus Title/Drucksache/Fraktion-
/Datum/PDF-Extraktion gegen ein BE-Card-Fixture
* PortalaAdapter._parse_hit_list_dump: gegen ein LSA-Perl-Dump-
Fixture inkl. Hex-Escape-Decoding (\x{fc} → ü)
* PortalaAdapter._parse_hit_list_html: Auto-Detection zwischen
Card- und Dump-Format
* PortalaAdapter._normalize_fraktion: kanonische Fraktion-Codes
inkl. F.D.P.-mit-Punkten, BÜNDNIS 90, DIE LINKE, BSW
* ParLDokAdapter._hit_to_drucksache: JSON-Hit → Drucksache
Mapping inkl. /navpanes-Stripping, MdL-mit-Partei-in-Klammern,
Landesregierung-Detection
* ParLDokAdapter._fulltext_id: bundle.js-mirroring (deferred,
aber dokumentiert)
* ADAPTERS-Registry-Sanity
- tests/test_embeddings.py (11 Tests)
* _chunk_source_label: Programm-Name + Seite (Halluzinations-
Bug-Regression aus 1b5fd96)
* format_quotes_for_prompt: jeder Chunk muss Programm-Name
enthalten, strict-citation-Hinweis muss im Output sein,
keine NRW-Halluzinationen für MV/BE-Chunk-Sets
- tests/test_wahlprogramme.py (14 Tests)
* Registry-Struktur (jahr int, seiten int, .pdf-Endung)
* File-Existenz: jede registrierte PDF muss in
static/referenzen/ liegen — würde Tippfehler in den 22
indexierten Programmen sofort fangen
* embeddings.PROGRAMME-Konsistenz-Cross-Check
- tests/test_bundeslaender.py (15 Tests)
* Sanity über 16-State-Registry
* #48-Klassifikations-Regression: TH=ParlDok, HB=StarWeb,
SN=Eigensystem
* Wahltermine plausibel (zwischen 2026 und 2035)
- tests/test_analyzer.py (4 Tests)
* Markdown-Codeblock-Stripping aus dem JSON-Retry-Loop
## Bug-Funde während der Test-Schreibphase
Zwei Production-Bugs in den _normalize_fraktion-Helfern wurden
durch die neuen Tests sofort aufgedeckt und im selben Commit gefixt:
1. PortalaAdapter._normalize_fraktion matched "F.D.P." (mit Punkten,
wie historische SH/HB-Drucksachen) nicht — Regex \bFDP\b ist zu
strikt. Fix: \bF\.?\s*D\.?\s*P\.?\b analog zu ParLDokAdapter.
2. ParLDokAdapter._normalize_fraktion (auch PortalaAdapter) matched
"Ministerium der Finanzen" nicht als Landesregierung, weil
\bMINISTER\b die Wortgrenze auch nach MINISTER verlangt — bei
MINISTERIUM steht aber IUM danach, keine Wortgrenze. Fix:
\bMINISTER ohne abschließendes \b.
Beide Bugs hätten Fraktion-Felder bei Drucksachen der Bremischen
Bürgerschaft (FDP-Listen) und bei Landesregierungs-Drucksachen
in MV/LSA fälschlich leer gelassen — exakt der "fraktionen=[]"-
Befund aus dem MV-Smoke-Test in #4.
Phase 0 aus Roadmap-Issue #49.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
5a30ce8bab
commit
f98e64c734
@ -443,7 +443,9 @@ class PortalaAdapter(ParlamentAdapter):
|
|||||||
out.append("CDU")
|
out.append("CDU")
|
||||||
if has(r"\bSPD\b"):
|
if has(r"\bSPD\b"):
|
||||||
out.append("SPD")
|
out.append("SPD")
|
||||||
if has(r"\bFDP\b"):
|
# F.D.P. (with dots, historical SH/HB-style) and FDP (modern) — same
|
||||||
|
# flexible pattern as ParLDokAdapter so the test suite stays consistent.
|
||||||
|
if has(r"\bF\.?\s*D\.?\s*P\.?\b"):
|
||||||
out.append("FDP")
|
out.append("FDP")
|
||||||
if has(r"\bAFD\b"):
|
if has(r"\bAFD\b"):
|
||||||
out.append("AfD")
|
out.append("AfD")
|
||||||
@ -451,7 +453,9 @@ class PortalaAdapter(ParlamentAdapter):
|
|||||||
out.append("LINKE")
|
out.append("LINKE")
|
||||||
if has(r"\bBSW\b"):
|
if has(r"\bBSW\b"):
|
||||||
out.append("BSW")
|
out.append("BSW")
|
||||||
if has(r"LANDESREGIERUNG|SENAT VON BERLIN|REGIERENDE[RN]?\s+BÜRGERMEISTER|MINISTER\b|STAATSKANZLEI"):
|
# MINISTERIUM/MINISTER beide treffen — \bMINISTER ohne abschließende
|
||||||
|
# Wortgrenze, damit "Ministerium der Finanzen" mit erfasst wird.
|
||||||
|
if has(r"LANDESREGIERUNG|SENAT VON BERLIN|REGIERENDE[RN]?\s+BÜRGERMEISTER|\bMINISTER|STAATSKANZLEI|MINISTERPRÄSIDENT"):
|
||||||
out.append("Landesregierung")
|
out.append("Landesregierung")
|
||||||
return out
|
return out
|
||||||
|
|
||||||
@ -944,7 +948,10 @@ class ParLDokAdapter(ParlamentAdapter):
|
|||||||
out.append("LINKE")
|
out.append("LINKE")
|
||||||
if re.search(r"\bBSW\b", u):
|
if re.search(r"\bBSW\b", u):
|
||||||
out.append("BSW")
|
out.append("BSW")
|
||||||
if re.search(r"LANDESREGIERUNG|MINISTER\b|STAATSKANZLEI|MINISTERPRÄSIDENT", u):
|
# \bMINISTER ohne abschließende Wortgrenze, damit MINISTERIUM
|
||||||
|
# auch trifft (z.B. "Ministerium der Finanzen" als Urheber von
|
||||||
|
# Landesregierungs-Drucksachen).
|
||||||
|
if re.search(r"LANDESREGIERUNG|\bMINISTER|STAATSKANZLEI|MINISTERPRÄSIDENT", u):
|
||||||
out.append("Landesregierung")
|
out.append("Landesregierung")
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|||||||
5
pytest.ini
Normal file
5
pytest.ini
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
[pytest]
|
||||||
|
testpaths = tests
|
||||||
|
asyncio_mode = auto
|
||||||
|
filterwarnings =
|
||||||
|
ignore::DeprecationWarning
|
||||||
13
requirements-dev.txt
Normal file
13
requirements-dev.txt
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
# Test- und Entwicklungs-Abhängigkeiten — getrennt von requirements.txt,
|
||||||
|
# damit der prod-Container sie nicht installieren muss.
|
||||||
|
#
|
||||||
|
# Installation lokal:
|
||||||
|
# pip install -r requirements.txt -r requirements-dev.txt
|
||||||
|
#
|
||||||
|
# Tests laufen lassen:
|
||||||
|
# pytest -v tests/
|
||||||
|
|
||||||
|
-r requirements.txt
|
||||||
|
|
||||||
|
pytest>=8.0.0
|
||||||
|
pytest-asyncio>=0.24.0
|
||||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
51
tests/conftest.py
Normal file
51
tests/conftest.py
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
"""Shared pytest fixtures and path setup.
|
||||||
|
|
||||||
|
Stubs heavy optional dependencies (``fitz``/PyMuPDF, ``bs4``/BeautifulSoup,
|
||||||
|
``openai``) so the test suite can run without the full prod requirements
|
||||||
|
installed. The tests in this directory are pure unit tests over parser
|
||||||
|
logic and prompt formatters — they neither parse PDFs nor make HTTP
|
||||||
|
calls, so the stubs are inert placeholders that satisfy the import
|
||||||
|
machinery but never get exercised.
|
||||||
|
|
||||||
|
If a test ever does need real PyMuPDF or httpx integration, give it a
|
||||||
|
fixture marked with ``@pytest.mark.integration`` and skip it by default.
|
||||||
|
"""
|
||||||
|
import sys
|
||||||
|
import types
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Make the `app` package importable when pytest is run from the webapp/ root.
|
||||||
|
ROOT = Path(__file__).resolve().parent.parent
|
||||||
|
sys.path.insert(0, str(ROOT))
|
||||||
|
|
||||||
|
|
||||||
|
def _stub(name: str, **attrs) -> None:
|
||||||
|
if name in sys.modules:
|
||||||
|
return
|
||||||
|
mod = types.ModuleType(name)
|
||||||
|
for k, v in attrs.items():
|
||||||
|
setattr(mod, k, v)
|
||||||
|
sys.modules[name] = mod
|
||||||
|
|
||||||
|
|
||||||
|
_stub("fitz") # PyMuPDF — used for PDF parsing, not in unit tests
|
||||||
|
_stub("bs4", BeautifulSoup=lambda *a, **kw: None) # only needed by NRWAdapter live calls
|
||||||
|
_stub("openai", OpenAI=lambda **kw: None) # only needed by embeddings live calls
|
||||||
|
|
||||||
|
|
||||||
|
# pydantic_settings is a small but external dep that's not in the test
|
||||||
|
# environment. Stub it with a minimal BaseSettings shim so app.config can
|
||||||
|
# import without crashing — the tests don't actually read settings values.
|
||||||
|
class _BaseSettingsShim:
|
||||||
|
model_config: dict = {}
|
||||||
|
|
||||||
|
def __init__(self, **kwargs):
|
||||||
|
for k, v in kwargs.items():
|
||||||
|
setattr(self, k, v)
|
||||||
|
|
||||||
|
|
||||||
|
def _settings_config_dict(**kwargs):
|
||||||
|
return kwargs
|
||||||
|
|
||||||
|
|
||||||
|
_stub("pydantic_settings", BaseSettings=_BaseSettingsShim, SettingsConfigDict=_settings_config_dict)
|
||||||
62
tests/test_analyzer.py
Normal file
62
tests/test_analyzer.py
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
"""Tests for analyzer.py JSON-stripping logic.
|
||||||
|
|
||||||
|
Reproduces the markdown-codeblock-stripping in the LLM retry loop. Real
|
||||||
|
Qwen responses sometimes wrap their JSON in ```json …``` fences (despite
|
||||||
|
the prompt asking for raw JSON), and the analyzer must tolerate that
|
||||||
|
without resorting to retries.
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import types
|
||||||
|
|
||||||
|
# Stub openai before importing analyzer
|
||||||
|
if "openai" not in sys.modules:
|
||||||
|
openai_stub = types.ModuleType("openai")
|
||||||
|
openai_stub.OpenAI = lambda **kw: None
|
||||||
|
sys.modules["openai"] = openai_stub
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_markdown_fences(content: str) -> str:
    """Mirror the analyzer's markdown-stripping snippet so we can unit-test
    the parsing rules without actually invoking the LLM.

    Keep this in sync with analyzer.py around the `if content.startswith("```")`
    branch — if the analyzer changes, this helper changes too. The point of
    the duplication is that the analyzer's stripping is buried in an async
    LLM call that we cannot easily unit-test directly.
    """
    content = content.strip()
    if content.startswith("```"):
        # Drops the entire first line, i.e. the opening fence including any
        # language tag such as "```json".
        # NOTE(review): if the stripped content is a fence with no newline at
        # all (e.g. just "```"), split("\n", 1) yields a single element and
        # [1] raises IndexError — confirm whether analyzer.py shares this edge.
        content = content.split("\n", 1)[1]
    if content.endswith("```"):
        # Remove a trailing closing fence (also covers output that ends in a
        # dangling "```" without an opening fence).
        content = content.rsplit("```", 1)[0]
    if content.startswith("```json"):
        # NOTE(review): after the first branch removed the whole opening-fence
        # line, this can only fire if the *second* line itself starts with
        # "```json" — kept solely to stay a faithful mirror of analyzer.py.
        content = content[7:]
    return content.strip()
|
||||||
|
|
||||||
|
|
||||||
|
SAMPLE_JSON = '{"gwoeScore": 7.0, "title": "Test"}'
|
||||||
|
|
||||||
|
|
||||||
|
class TestMarkdownStripping:
    """The stripper must normalise several fence/whitespace shapes of LLM
    output back to parseable raw JSON."""

    def test_plain_json_unchanged(self):
        assert _strip_markdown_fences(SAMPLE_JSON) == SAMPLE_JSON

    def test_json_in_markdown_fence(self):
        fenced = "```json\n" + SAMPLE_JSON + "\n```"
        assert json.loads(_strip_markdown_fences(fenced))["gwoeScore"] == 7.0

    def test_json_in_plain_fence(self):
        fenced = "```\n" + SAMPLE_JSON + "\n```"
        assert json.loads(_strip_markdown_fences(fenced))["gwoeScore"] == 7.0

    def test_leading_whitespace_stripped(self):
        padded = " \n " + SAMPLE_JSON + " \n "
        assert json.loads(_strip_markdown_fences(padded))["gwoeScore"] == 7.0

    def test_trailing_fence_stripped(self):
        dangling = SAMPLE_JSON + "\n```"
        assert json.loads(_strip_markdown_fences(dangling))["gwoeScore"] == 7.0
|
||||||
81
tests/test_bundeslaender.py
Normal file
81
tests/test_bundeslaender.py
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
"""Tests for bundeslaender.py — sanity over 16-state registry.
|
||||||
|
|
||||||
|
Includes the #48 classification regression: TH must be ParlDok, HB must
|
||||||
|
be StarWeb, SN must be Eigensystem (not ParlDok).
|
||||||
|
"""
|
||||||
|
from app.bundeslaender import BUNDESLAENDER, get, aktive_bundeslaender, alle_bundeslaender
|
||||||
|
|
||||||
|
|
||||||
|
class TestRegistryStructure:
    """Structural sanity checks over the static 16-state registry."""

    def test_sixteen_bundeslaender(self):
        assert len(BUNDESLAENDER) == 16

    def test_codes_are_uppercase(self):
        for code in BUNDESLAENDER:
            assert code.isupper(), f"{code} is not uppercase"

    def test_each_entry_has_naechste_wahl_or_none(self):
        # naechste_wahl is either absent or a 10-char ISO date (YYYY-MM-DD).
        # Fixed: iterate .values() — the previous .items() loop never used
        # the key (Perflint PERF102).
        for bl in BUNDESLAENDER.values():
            assert bl.naechste_wahl is None or len(bl.naechste_wahl) == 10

    def test_wahlperiode_is_positive_integer(self):
        for bl in BUNDESLAENDER.values():
            assert isinstance(bl.wahlperiode, int) and bl.wahlperiode > 0
|
||||||
|
|
||||||
|
|
||||||
|
class TestActiveBundeslaender:
|
||||||
|
def test_four_active_bundeslaender(self):
|
||||||
|
active = aktive_bundeslaender()
|
||||||
|
codes = {bl.code for bl in active}
|
||||||
|
assert codes == {"NRW", "LSA", "MV", "BE"}
|
||||||
|
|
||||||
|
def test_alle_bundeslaender_returns_all_sixteen(self):
|
||||||
|
assert len(alle_bundeslaender()) == 16
|
||||||
|
|
||||||
|
def test_alle_bundeslaender_active_first(self):
|
||||||
|
out = alle_bundeslaender()
|
||||||
|
active_codes = {bl.code for bl in aktive_bundeslaender()}
|
||||||
|
# The first len(active) entries must all be active
|
||||||
|
for bl in out[: len(active_codes)]:
|
||||||
|
assert bl.code in active_codes
|
||||||
|
|
||||||
|
|
||||||
|
class TestGetHelper:
    """``get`` resolves a state by code and yields None for unknown codes."""

    def test_returns_bundesland_for_known_code(self):
        nrw = get("NRW")
        assert nrw is not None
        assert nrw.name == "Nordrhein-Westfalen"

    def test_returns_none_for_unknown_code(self):
        assert get("XX") is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestClassificationFix48:
    """Regression for #48: three doku_system entries were corrected, and the
    follow-up adapter issues depend on these exact classifications."""

    def test_th_is_parldok_not_starweb(self):
        thueringen = BUNDESLAENDER["TH"]
        assert thueringen.doku_system == "ParlDok"

    def test_hb_is_starweb_not_paris(self):
        """PARiS is just a StarWeb skin — must be classified as StarWeb."""
        bremen = BUNDESLAENDER["HB"]
        assert bremen.doku_system == "StarWeb"

    def test_sn_is_eigensystem_not_parldok(self):
        """EDAS is ASP.NET-Webforms, NOT ParlDok-compatible with MV."""
        sachsen = BUNDESLAENDER["SN"]
        assert sachsen.doku_system == "Eigensystem"
|
||||||
|
|
||||||
|
|
||||||
|
class TestWahltermineSane:
    """All registered upcoming election dates must lie in the near future —
    a sanity check that nobody pasted a historical (e.g. 1990) date.

    NOTE(review): this class only bounds each date to [2026-01-01,
    2035-01-01); it does not verify chronological ordering between states,
    despite what an earlier docstring claimed.
    """

    def test_no_election_before_2026(self):
        for bl in BUNDESLAENDER.values():
            if bl.naechste_wahl:
                # ISO-8601 date strings compare correctly lexicographically.
                assert bl.naechste_wahl >= "2026-01-01"

    def test_no_election_after_2035(self):
        # NOTE(review): the strict "<" bound also excludes dates *in* 2035,
        # which is slightly stricter than the method name suggests.
        for bl in BUNDESLAENDER.values():
            if bl.naechste_wahl:
                assert bl.naechste_wahl < "2035-01-01"
|
||||||
151
tests/test_embeddings.py
Normal file
151
tests/test_embeddings.py
Normal file
@ -0,0 +1,151 @@
|
|||||||
|
"""Tests for embeddings.py prompt formatting.
|
||||||
|
|
||||||
|
Reproduces the LLM-Halluzinations-Bug from the 2026-04-08 session
|
||||||
|
(commits 1b5fd96 + bc7f4a6): the original ``format_quotes_for_prompt``
|
||||||
|
rendered each chunk as ``- S. X: "text"`` without any reference to the
|
||||||
|
programme name. As a result the LLM hallucinated familiar source labels
|
||||||
|
("FDP NRW Wahlprogramm 2022") for chunks that actually came from MV/BE,
|
||||||
|
because that was the strongest training-set prior for budget-policy
|
||||||
|
citations.
|
||||||
|
|
||||||
|
Fix: prepend the fully-qualified PROGRAMME[programm_id]["name"] to each
|
||||||
|
quote.
|
||||||
|
"""
|
||||||
|
import sys
|
||||||
|
import types
|
||||||
|
|
||||||
|
# Stub openai before importing embeddings, since the test environment may
|
||||||
|
# not have it installed and we don't actually need to make API calls.
|
||||||
|
if "openai" not in sys.modules:
|
||||||
|
openai_stub = types.ModuleType("openai")
|
||||||
|
openai_stub.OpenAI = lambda **kw: None
|
||||||
|
sys.modules["openai"] = openai_stub
|
||||||
|
|
||||||
|
from app.embeddings import _chunk_source_label, format_quotes_for_prompt
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# _chunk_source_label — fully-qualified programme name + page
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestChunkSourceLabel:
|
||||||
|
def test_known_programme_id(self):
|
||||||
|
chunk = {"programm_id": "fdp-mv-2021", "seite": 73, "text": "..."}
|
||||||
|
label = _chunk_source_label(chunk)
|
||||||
|
assert "FDP Mecklenburg-Vorpommern" in label
|
||||||
|
assert "S. 73" in label
|
||||||
|
|
||||||
|
def test_known_programme_id_for_be(self):
|
||||||
|
chunk = {"programm_id": "spd-be-2023", "seite": 24, "text": "..."}
|
||||||
|
label = _chunk_source_label(chunk)
|
||||||
|
assert "SPD Berlin" in label
|
||||||
|
assert "2021" in label # the BE-2023.pdf files contain 2021er programmes
|
||||||
|
assert "S. 24" in label
|
||||||
|
|
||||||
|
def test_unknown_programme_id_falls_back_to_id(self):
|
||||||
|
chunk = {"programm_id": "fake-xx-9999", "seite": 1, "text": "..."}
|
||||||
|
label = _chunk_source_label(chunk)
|
||||||
|
# Should not crash, should at least include the id and the page
|
||||||
|
assert "fake-xx-9999" in label
|
||||||
|
assert "S. 1" in label
|
||||||
|
|
||||||
|
def test_missing_seite_uses_questionmark(self):
|
||||||
|
chunk = {"programm_id": "cdu-mv-2021", "text": "..."}
|
||||||
|
label = _chunk_source_label(chunk)
|
||||||
|
assert "?" in label
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# format_quotes_for_prompt — every chunk must carry programme identification
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
EXAMPLE_QUOTES = {
|
||||||
|
"FDP": {
|
||||||
|
"wahlprogramm": [
|
||||||
|
{
|
||||||
|
"programm_id": "fdp-mv-2021",
|
||||||
|
"partei": "FDP",
|
||||||
|
"typ": "wahlprogramm",
|
||||||
|
"seite": 73,
|
||||||
|
"text": "Die Grundsätze von Wirtschaftlichkeit und Sparsamkeit",
|
||||||
|
"similarity": 0.63,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
"parteiprogramm": [
|
||||||
|
{
|
||||||
|
"programm_id": "fdp-grundsatz",
|
||||||
|
"partei": "FDP",
|
||||||
|
"typ": "parteiprogramm",
|
||||||
|
"seite": 93,
|
||||||
|
"text": "Liberale Marktwirtschaft erfordert solide Haushalte",
|
||||||
|
"similarity": 0.60,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
"SPD": {
|
||||||
|
"wahlprogramm": [
|
||||||
|
{
|
||||||
|
"programm_id": "spd-mv-2021",
|
||||||
|
"partei": "SPD",
|
||||||
|
"typ": "wahlprogramm",
|
||||||
|
"seite": 22,
|
||||||
|
"text": "Verkehrswende weg vom motorisierten Individualverkehr",
|
||||||
|
"similarity": 0.58,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class TestFormatQuotesForPrompt:
|
||||||
|
def test_empty_input_returns_empty_string(self):
|
||||||
|
assert format_quotes_for_prompt({}) == ""
|
||||||
|
|
||||||
|
def test_renders_party_headings(self):
|
||||||
|
out = format_quotes_for_prompt(EXAMPLE_QUOTES)
|
||||||
|
assert "### FDP" in out
|
||||||
|
assert "### SPD" in out
|
||||||
|
|
||||||
|
def test_every_chunk_has_programme_name(self):
|
||||||
|
"""Regression: pre-fix this used "S. X:" only, no programme name —
|
||||||
|
the LLM then hallucinated NRW-2022 sources from training data."""
|
||||||
|
out = format_quotes_for_prompt(EXAMPLE_QUOTES)
|
||||||
|
# Each of the three chunks must reference its source programme
|
||||||
|
assert "FDP Mecklenburg-Vorpommern" in out
|
||||||
|
assert "FDP Grundsatzprogramm" in out
|
||||||
|
assert "SPD Mecklenburg-Vorpommern" in out
|
||||||
|
|
||||||
|
def test_contains_strict_citation_instruction(self):
|
||||||
|
"""The prompt header must explicitly forbid hallucinated sources."""
|
||||||
|
out = format_quotes_for_prompt(EXAMPLE_QUOTES)
|
||||||
|
assert "ausschließlich" in out.lower() or "verbatim" in out.lower() or "wörtlich" in out.lower()
|
||||||
|
|
||||||
|
def test_no_nrw_2022_appears_unless_chunks_are_actually_nrw(self):
|
||||||
|
"""Sanity: a pure MV+SPD chunk set must not mention NRW anywhere."""
|
||||||
|
out = format_quotes_for_prompt(EXAMPLE_QUOTES)
|
||||||
|
assert "NRW" not in out
|
||||||
|
assert "Nordrhein-Westfalen" not in out
|
||||||
|
|
||||||
|
def test_renders_separate_blocks_for_wahl_and_parteiprogramm(self):
|
||||||
|
out = format_quotes_for_prompt(EXAMPLE_QUOTES)
|
||||||
|
assert "**Wahlprogramm:**" in out
|
||||||
|
assert "**Grundsatzprogramm:**" in out
|
||||||
|
|
||||||
|
def test_text_truncated_at_500_chars(self):
|
||||||
|
long_chunk = {
|
||||||
|
"FDP": {
|
||||||
|
"wahlprogramm": [
|
||||||
|
{
|
||||||
|
"programm_id": "fdp-mv-2021",
|
||||||
|
"seite": 1,
|
||||||
|
"text": "A" * 1000, # 1000 chars → should be truncated
|
||||||
|
"similarity": 0.7,
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out = format_quotes_for_prompt(long_chunk)
|
||||||
|
# Truncation marker
|
||||||
|
assert "..." in out
|
||||||
|
# Original chunk text 1000 chars not present in full
|
||||||
|
assert "A" * 1000 not in out
|
||||||
328
tests/test_parlamente.py
Normal file
328
tests/test_parlamente.py
Normal file
@ -0,0 +1,328 @@
|
|||||||
|
"""Tests for parlamente.py adapter parsers — pure functions over fixture HTML.
|
||||||
|
|
||||||
|
Reproduces the three regression scenarios from the 2026-04-08 adapter session:
|
||||||
|
|
||||||
|
1. PortalaAdapter `_parse_hit_list_cards` had a `doctype` vs. `doctype_full`
|
||||||
|
NameError that was hot-fixed live on the prod server (commit 1cb030a).
|
||||||
|
2. ParLDokAdapter `_hit_to_drucksache` needs to map ParlDok 8.x JSON hit
|
||||||
|
dicts to Drucksache objects without losing fraction or date info.
|
||||||
|
3. PortalaAdapter `_normalize_fraktion` and ParLDokAdapter same-named method
|
||||||
|
must yield canonical fraction codes for both comma-lists and embedded
|
||||||
|
"MdL (Partei)" patterns.
|
||||||
|
"""
|
||||||
|
from app.parlamente import ParLDokAdapter, PortalaAdapter, NRWAdapter, Drucksache
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# PortalaAdapter — Berlin-style HTML cards
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
BE_CARD_FIXTURE = """
|
||||||
|
<div class="other-prelude">ignored</div>
|
||||||
|
<div class="record-card efxRecordRepeater">
|
||||||
|
<h3 class="h5"><span>Schwimmstatistik für die dritten Klassen der Berliner Schulen</span></h3>
|
||||||
|
<span class="h6">Antrag (Eilantrag) <a href="/files/drs19-3104.pdf">Drucksache 19/3104</a> S. 1 bis 24 vom 31.03.2026</span>
|
||||||
|
</div>
|
||||||
|
<div class="record-card efxRecordRepeater">
|
||||||
|
<h3 class="h5"><span>Klimaneutrales Bauen im Bestand</span></h3>
|
||||||
|
<span class="h6">Antrag CDU, SPD <a href="/files/drs19-3107.pdf">Drucksache 19/3107</a> vom 02.04.2026</span>
|
||||||
|
</div>
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def _make_be_adapter():
|
||||||
|
return PortalaAdapter(
|
||||||
|
bundesland="BE",
|
||||||
|
name="test BE",
|
||||||
|
base_url="https://pardok.parlament-berlin.de",
|
||||||
|
db_id="lah.lissh",
|
||||||
|
wahlperiode=19,
|
||||||
|
portala_path="/portala",
|
||||||
|
document_type=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestPortalaAdapterCardParser:
|
||||||
|
"""Issue: doctype/doctype_full NameError (hot-fix 1cb030a)."""
|
||||||
|
|
||||||
|
def test_parses_two_cards_without_nameerror(self):
|
||||||
|
"""Smoke test — must NOT raise NameError or any other exception.
|
||||||
|
|
||||||
|
Pre-fix this method referenced an undefined ``doctype`` variable
|
||||||
|
in the query-filter branch when computing the haystack. The fix
|
||||||
|
renamed it to ``doctype_full``. A direct call covers both branches.
|
||||||
|
"""
|
||||||
|
adapter = _make_be_adapter()
|
||||||
|
result = adapter._parse_hit_list_cards(BE_CARD_FIXTURE, query_filter="")
|
||||||
|
assert len(result) == 2
|
||||||
|
|
||||||
|
def test_first_card_extracts_drucksache_and_title(self):
|
||||||
|
adapter = _make_be_adapter()
|
||||||
|
result = adapter._parse_hit_list_cards(BE_CARD_FIXTURE, query_filter="")
|
||||||
|
d = result[0]
|
||||||
|
assert d.drucksache == "19/3104"
|
||||||
|
assert "Schwimmstatistik" in d.title
|
||||||
|
assert d.datum == "2026-03-31"
|
||||||
|
assert d.bundesland == "BE"
|
||||||
|
|
||||||
|
def test_second_card_extracts_fraktionen_from_h6(self):
|
||||||
|
"""Card 2 packs CDU+SPD into the type line — must split out cleanly."""
|
||||||
|
adapter = _make_be_adapter()
|
||||||
|
result = adapter._parse_hit_list_cards(BE_CARD_FIXTURE, query_filter="")
|
||||||
|
d = result[1]
|
||||||
|
assert d.drucksache == "19/3107"
|
||||||
|
assert d.fraktionen == ["CDU", "SPD"]
|
||||||
|
# The typ string should have the parties stripped back out
|
||||||
|
assert d.typ.strip() == "Antrag"
|
||||||
|
|
||||||
|
def test_pdf_link_is_absolute_url(self):
|
||||||
|
adapter = _make_be_adapter()
|
||||||
|
result = adapter._parse_hit_list_cards(BE_CARD_FIXTURE, query_filter="")
|
||||||
|
assert result[0].link.startswith("https://pardok.parlament-berlin.de/")
|
||||||
|
assert result[0].link.endswith(".pdf")
|
||||||
|
|
||||||
|
def test_query_filter_uses_doctype_full_not_doctype(self):
|
||||||
|
"""Regression: the filter branch references doctype_full, not doctype.
|
||||||
|
|
||||||
|
Pre-fix this raised NameError as soon as a query was passed.
|
||||||
|
"""
|
||||||
|
adapter = _make_be_adapter()
|
||||||
|
# Schwimmstatistik matches card 1, Klimaneutral matches card 2
|
||||||
|
result = adapter._parse_hit_list_cards(BE_CARD_FIXTURE, query_filter="Schwimm")
|
||||||
|
assert len(result) == 1
|
||||||
|
assert result[0].drucksache == "19/3104"
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# PortalaAdapter — LSA-style Perl-Dump records
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
LSA_DUMP_FIXTURE = """
|
||||||
|
<pre>$VAR1 = {
|
||||||
|
'WEV06' => [{ 'main' => 'Demokratie beginnt im Klassenzimmer' }],
|
||||||
|
'WEV32' => [{
|
||||||
|
'main' => 'Antrag B\\x{fc}ndnis 90/Die Gr\\x{fc}nen 06.03.2026 Drucksache <b>8/6726</b> ...',
|
||||||
|
'5' => 'drs/wp8/drs/d6726lan.pdf'
|
||||||
|
}]
|
||||||
|
}</pre>
|
||||||
|
<pre>$VAR1 = {
|
||||||
|
'WEV06' => [{ 'main' => 'Andere Drucksache ohne Schul-Bezug' }],
|
||||||
|
'WEV32' => [{
|
||||||
|
'main' => 'Antrag CDU, SPD 14.01.2026 Drucksache <b>8/6171</b> ...',
|
||||||
|
'5' => 'drs/wp8/drs/d6171lan.pdf'
|
||||||
|
}]
|
||||||
|
}</pre>
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def _make_lsa_adapter():
|
||||||
|
return PortalaAdapter(
|
||||||
|
bundesland="LSA",
|
||||||
|
name="test LSA",
|
||||||
|
base_url="https://padoka.landtag.sachsen-anhalt.de",
|
||||||
|
db_id="lsa.lissh",
|
||||||
|
wahlperiode=8,
|
||||||
|
portala_path="/portal",
|
||||||
|
document_type="Antrag",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestPortalaAdapterDumpParser:
|
||||||
|
def test_parses_two_dump_records(self):
|
||||||
|
adapter = _make_lsa_adapter()
|
||||||
|
result = adapter._parse_hit_list_dump(LSA_DUMP_FIXTURE, query_filter="")
|
||||||
|
assert len(result) == 2
|
||||||
|
|
||||||
|
def test_extracts_drucksache_from_perl_dump(self):
|
||||||
|
adapter = _make_lsa_adapter()
|
||||||
|
result = adapter._parse_hit_list_dump(LSA_DUMP_FIXTURE, query_filter="")
|
||||||
|
assert result[0].drucksache == "8/6726"
|
||||||
|
assert result[1].drucksache == "8/6171"
|
||||||
|
|
||||||
|
def test_decodes_perl_hex_escapes_in_urheber(self):
|
||||||
|
"""The first record has \\x{fc} (ü) and \\x{e4} (ä) in WEV32."""
|
||||||
|
adapter = _make_lsa_adapter()
|
||||||
|
result = adapter._parse_hit_list_dump(LSA_DUMP_FIXTURE, query_filter="")
|
||||||
|
# GRÜNE should be detected from "Bündnis 90/Die Grünen"
|
||||||
|
assert "GRÜNE" in result[0].fraktionen
|
||||||
|
|
||||||
|
def test_extracts_date_iso(self):
|
||||||
|
adapter = _make_lsa_adapter()
|
||||||
|
result = adapter._parse_hit_list_dump(LSA_DUMP_FIXTURE, query_filter="")
|
||||||
|
assert result[0].datum == "2026-03-06"
|
||||||
|
assert result[1].datum == "2026-01-14"
|
||||||
|
|
||||||
|
def test_pdf_url_uses_pdf_url_prefix(self):
|
||||||
|
adapter = _make_lsa_adapter()
|
||||||
|
result = adapter._parse_hit_list_dump(LSA_DUMP_FIXTURE, query_filter="")
|
||||||
|
assert result[0].link == (
|
||||||
|
"https://padoka.landtag.sachsen-anhalt.de/files/drs/wp8/drs/d6726lan.pdf"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_client_side_query_filter(self):
|
||||||
|
adapter = _make_lsa_adapter()
|
||||||
|
result = adapter._parse_hit_list_dump(LSA_DUMP_FIXTURE, query_filter="Demokratie")
|
||||||
|
assert len(result) == 1
|
||||||
|
assert result[0].drucksache == "8/6726"
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# PortalaAdapter — Auto-detection between dump and card formats
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestPortalaAdapterAutoDetect:
|
||||||
|
def test_dump_html_routes_to_dump_parser(self):
|
||||||
|
adapter = _make_lsa_adapter()
|
||||||
|
result = adapter._parse_hit_list_html(LSA_DUMP_FIXTURE, query_filter="")
|
||||||
|
assert len(result) == 2
|
||||||
|
|
||||||
|
def test_card_html_routes_to_card_parser(self):
|
||||||
|
adapter = _make_be_adapter()
|
||||||
|
result = adapter._parse_hit_list_html(BE_CARD_FIXTURE, query_filter="")
|
||||||
|
assert len(result) == 2
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# PortalaAdapter._normalize_fraktion — canonical fraction codes
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestPortalaAdapterNormalizeFraktion:
|
||||||
|
def test_comma_separated_list(self):
|
||||||
|
out = PortalaAdapter._normalize_fraktion("CDU, SPD, F.D.P.")
|
||||||
|
assert "CDU" in out and "SPD" in out and "FDP" in out
|
||||||
|
|
||||||
|
def test_buendnis_90_die_gruenen(self):
|
||||||
|
out = PortalaAdapter._normalize_fraktion("BÜNDNIS 90/DIE GRÜNEN")
|
||||||
|
assert out == ["GRÜNE"]
|
||||||
|
|
||||||
|
def test_die_linke(self):
|
||||||
|
out = PortalaAdapter._normalize_fraktion("DIE LINKE")
|
||||||
|
assert out == ["LINKE"]
|
||||||
|
|
||||||
|
def test_bsw(self):
|
||||||
|
out = PortalaAdapter._normalize_fraktion("BSW")
|
||||||
|
assert out == ["BSW"]
|
||||||
|
|
||||||
|
def test_landesregierung_keywords(self):
|
||||||
|
out = PortalaAdapter._normalize_fraktion("Senat von Berlin")
|
||||||
|
assert "Landesregierung" in out
|
||||||
|
|
||||||
|
def test_empty_string(self):
|
||||||
|
assert PortalaAdapter._normalize_fraktion("") == []
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# ParLDokAdapter — JSON hit dict → Drucksache mapping
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _make_mv_adapter():
|
||||||
|
return ParLDokAdapter(
|
||||||
|
bundesland="MV",
|
||||||
|
name="test MV",
|
||||||
|
base_url="https://www.dokumentation.landtag-mv.de",
|
||||||
|
wahlperiode=8,
|
||||||
|
prefix="/parldok",
|
||||||
|
document_typ="Antrag",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
SAMPLE_PARLDOK_HIT = {
|
||||||
|
"id": 70748,
|
||||||
|
"title": "Zweckentfremdung von Sondervermögen des Bundes beenden",
|
||||||
|
"date": "18.03.2026",
|
||||||
|
"prelink": "/dokument/70748",
|
||||||
|
"link": "/dokument/70748#navpanes=0",
|
||||||
|
"authorhtml": "FDP",
|
||||||
|
"kind": "Drucksache",
|
||||||
|
"type": "Antrag",
|
||||||
|
"lp": 8,
|
||||||
|
"number": "6409",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class TestParLDokAdapterHitMapping:
    """JSON search hit → Drucksache conversion in ParLDokAdapter."""

    def test_hit_to_drucksache_basic(self):
        drucksache = _make_mv_adapter()._hit_to_drucksache(SAMPLE_PARLDOK_HIT)
        assert drucksache is not None
        assert drucksache.drucksache == "8/6409"
        assert drucksache.title == "Zweckentfremdung von Sondervermögen des Bundes beenden"
        assert drucksache.datum == "2026-03-18"
        assert drucksache.fraktionen == ["FDP"]
        assert drucksache.typ == "Antrag"
        assert drucksache.bundesland == "MV"

    def test_pdf_link_strips_navpanes_fragment_and_prepends_prefix(self):
        drucksache = _make_mv_adapter()._hit_to_drucksache(SAMPLE_PARLDOK_HIT)
        expected = "https://www.dokumentation.landtag-mv.de/parldok/dokument/70748"
        assert drucksache.link == expected
        assert "#navpanes" not in drucksache.link

    def test_missing_lp_returns_none(self):
        incomplete = {k: v for k, v in SAMPLE_PARLDOK_HIT.items() if k != "lp"}
        assert _make_mv_adapter()._hit_to_drucksache(incomplete) is None

    def test_mdl_with_party_in_parens(self):
        """MV often packs the MdL into authorhtml: 'Thomas X (AfD)'."""
        hit = dict(SAMPLE_PARLDOK_HIT, authorhtml="Thomas de Jesus Fernandes (AfD)")
        drucksache = _make_mv_adapter()._hit_to_drucksache(hit)
        assert "AfD" in drucksache.fraktionen

    def test_landesregierung_detection(self):
        hit = dict(SAMPLE_PARLDOK_HIT, authorhtml="Ministerium der Finanzen")
        drucksache = _make_mv_adapter()._hit_to_drucksache(hit)
        assert "Landesregierung" in drucksache.fraktionen
|
||||||
|
|
||||||
|
|
||||||
|
class TestParLDokFulltextIdSanitization:
    """Reverse-engineered from bundle.js pd.getFulltextId — must mirror exactly.

    Server-side fulltext is currently disabled (#18), but the helper stays in
    the codebase as executable documentation.  Should it ever be re-activated,
    its sanitization has to match the SPA's behavior character for character.
    """

    def test_simple_word_unchanged(self):
        assert ParLDokAdapter._fulltext_id("Schule") == "Schule"

    def test_whitespace_becomes_dash(self):
        assert ParLDokAdapter._fulltext_id("Klima Schutz") == "Klima-Schutz"

    def test_umlauts_become_dashes(self):
        # The SPA's regex is /[^a-zA-z0-9]/ — lowercase z is deliberate.
        # Umlauts fall outside that ASCII range, so they turn into dashes.
        assert ParLDokAdapter._fulltext_id("Bürger") == "B-rger"

    def test_punctuation_becomes_dashes(self):
        # A literal "-" is replaced by "-", so this string is a fixed point.
        assert ParLDokAdapter._fulltext_id("CO2-Emission") == "CO2-Emission"
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Adapter registry sanity
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestAdapterRegistry:
    """Smoke checks over the adapter registry in app.parlamente."""

    def test_active_adapters_present(self):
        from app.parlamente import ADAPTERS

        for code in ("NRW", "MV", "BE", "LSA"):
            assert code in ADAPTERS, f"missing adapter for {code}"

    def test_get_adapter_returns_none_for_unknown(self):
        from app.parlamente import get_adapter

        assert get_adapter("XX") is None

    def test_mv_adapter_is_parldok_instance(self):
        from app.parlamente import ADAPTERS

        adapter = ADAPTERS["MV"]
        assert isinstance(adapter, ParLDokAdapter)

    def test_be_adapter_is_portala_instance(self):
        from app.parlamente import ADAPTERS

        adapter = ADAPTERS["BE"]
        assert isinstance(adapter, PortalaAdapter)

    def test_lsa_adapter_is_portala_instance(self):
        from app.parlamente import ADAPTERS

        adapter = ADAPTERS["LSA"]
        assert isinstance(adapter, PortalaAdapter)
|
||||||
# ════ new file: tests/test_wahlprogramme.py (118 lines) ════
|
|||||||
|
"""Tests for wahlprogramme.py — registry consistency + file existence."""
|
||||||
|
from app.wahlprogramme import (
|
||||||
|
WAHLPROGRAMME,
|
||||||
|
REFERENZEN_PATH,
|
||||||
|
get_wahlprogramm,
|
||||||
|
parteien_mit_wahlprogramm,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Registry consistency
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestRegistryStructure:
    """Structural invariants over the WAHLPROGRAMME registry.

    Every registered (bundesland, partei) entry must carry the full metadata
    set with sane types, so indexing and download-link generation never hit
    a KeyError/TypeError at runtime.
    """

    def test_active_bundeslaender_present(self):
        for code in ["NRW", "LSA", "MV", "BE"]:
            assert code in WAHLPROGRAMME, f"missing wahlprogramme entry for {code}"

    def test_each_entry_has_required_keys(self):
        required = {"file", "titel", "partei", "jahr", "seiten"}
        for bl, parteien in WAHLPROGRAMME.items():
            for partei, info in parteien.items():
                missing = required - set(info.keys())
                assert not missing, f"{bl}/{partei} missing keys: {missing}"

    def test_jahr_is_integer(self):
        for bl, parteien in WAHLPROGRAMME.items():
            for partei, info in parteien.items():
                assert isinstance(info["jahr"], int), f"{bl}/{partei} jahr not int"

    def test_seiten_is_positive_integer(self):
        for bl, parteien in WAHLPROGRAMME.items():
            for partei, info in parteien.items():
                # These asserts used to be bare; a failure did not say which
                # registry entry was broken.  Name the entry, like the other
                # tests in this class do.
                assert isinstance(info["seiten"], int), (
                    f"{bl}/{partei} seiten not int"
                )
                assert info["seiten"] > 0, f"{bl}/{partei} seiten not positive"

    def test_file_extension_is_pdf(self):
        for bl, parteien in WAHLPROGRAMME.items():
            for partei, info in parteien.items():
                assert info["file"].endswith(".pdf"), (
                    f"{bl}/{partei}: {info['file']} is not a .pdf"
                )
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# File existence — every registered file must exist on disk
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestFileExistence:
    """Catches typos in the file field that would silently break embedding
    indexing or PDF download links."""

    def test_every_registered_pdf_exists(self):
        missing = [
            f"{bl}/{partei}: {info['file']}"
            for bl, parteien in WAHLPROGRAMME.items()
            for partei, info in parteien.items()
            if not (REFERENZEN_PATH / info["file"]).exists()
        ]
        assert not missing, "missing PDFs:\n " + "\n ".join(missing)
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Lookup helpers
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestGetWahlprogramm:
    """Lookup helper get_wahlprogramm(bundesland, partei)."""

    def test_returns_dict_for_known_combination(self):
        entry = get_wahlprogramm("MV", "CDU")
        assert entry is not None
        assert entry["partei"] == "CDU Mecklenburg-Vorpommern"

    def test_returns_none_for_unknown_bundesland(self):
        assert get_wahlprogramm("XX", "CDU") is None

    def test_returns_none_for_unknown_partei(self):
        assert get_wahlprogramm("NRW", "BSW") is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestParteienMitWahlprogramm:
    """Per-Bundesland party lists derived from the registry."""

    def test_nrw_has_five_parteien(self):
        nrw = parteien_mit_wahlprogramm("NRW")
        assert len(nrw) == 5
        assert set(nrw) == {"CDU", "SPD", "GRÜNE", "FDP", "AfD"}

    def test_mv_has_six_parteien(self):
        expected = {"CDU", "SPD", "GRÜNE", "FDP", "AfD", "LINKE"}
        assert set(parteien_mit_wahlprogramm("MV")) == expected

    def test_be_has_five_parteien(self):
        expected = {"CDU", "SPD", "GRÜNE", "LINKE", "AfD"}
        assert set(parteien_mit_wahlprogramm("BE")) == expected

    def test_unknown_bundesland_empty_list(self):
        assert parteien_mit_wahlprogramm("XX") == []
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# embeddings.PROGRAMME consistency cross-check
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestEmbeddingsRegistryConsistency:
    """Every entry in WAHLPROGRAMME must also exist in embeddings.PROGRAMME
    so the indexer can find it. Mismatch is the kind of bug a manual smoke
    misses but would show up during indexing."""

    def test_every_wahlprogramm_has_embeddings_entry(self):
        from app.embeddings import PROGRAMME

        # The programm_id is the filename sans extension:
        # "cdu-mv-2021.pdf" → "cdu-mv-2021".
        unmatched = []
        for bl, parteien in WAHLPROGRAMME.items():
            for partei, info in parteien.items():
                programm_id = info["file"].rsplit(".", 1)[0]
                if programm_id not in PROGRAMME:
                    unmatched.append(f"{bl}/{partei} → {programm_id}")
        assert not unmatched, (
            "WAHLPROGRAMME entries missing in embeddings.PROGRAMME:\n "
            + "\n ".join(unmatched)
        )
|
||||||
Loading…
Reference in New Issue
Block a user