From f98e64c73407f86dd8c6a3352561cc8a957ec99f Mon Sep 17 00:00:00 2001 From: Dotty Dotter Date: Wed, 8 Apr 2026 23:26:06 +0200 Subject: [PATCH] Add pytest suite + fix two regex bugs uncovered by it (#46) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Erste Tests für die Codebase. 77 Tests, 0.08s Laufzeit, decken die drei Bug-Klassen aus der April-2026-Adapter-Session ab plus haben schon zwei weitere Bugs in Production-Code aufgedeckt. ## Setup - requirements-dev.txt mit pytest + pytest-asyncio - pytest.ini mit asyncio_mode=auto - tests/conftest.py stubbt fitz/bs4/openai/pydantic_settings, damit die Suite ohne den vollen prod-requirements-Satz läuft (pure unit tests, kein PDF-Parsing, kein HTTP) ## Tests - tests/test_parlamente.py (33 Tests) * PortalaAdapter._parse_hit_list_cards: doctype/doctype_full NameError-Regression aus 1cb030a, plus Title/Drucksache/Fraktion- /Datum/PDF-Extraktion gegen ein BE-Card-Fixture * PortalaAdapter._parse_hit_list_dump: gegen ein LSA-Perl-Dump- Fixture inkl. Hex-Escape-Decoding (\x{fc} → ü) * PortalaAdapter._parse_hit_list_html: Auto-Detection zwischen Card- und Dump-Format * PortalaAdapter._normalize_fraktion: kanonische Fraktion-Codes inkl. F.D.P.-mit-Punkten, BÜNDNIS 90, DIE LINKE, BSW * ParLDokAdapter._hit_to_drucksache: JSON-Hit → Drucksache Mapping inkl. 
/navpanes-Stripping, MdL-mit-Partei-in-Klammern, Landesregierung-Detection * ParLDokAdapter._fulltext_id: bundle.js-mirroring (deferred, aber dokumentiert) * ADAPTERS-Registry-Sanity - tests/test_embeddings.py (11 Tests) * _chunk_source_label: Programm-Name + Seite (Halluzinations- Bug-Regression aus 1b5fd96) * format_quotes_for_prompt: jeder Chunk muss Programm-Name enthalten, strict-citation-Hinweis muss im Output sein, keine NRW-Halluzinationen für MV/BE-Chunk-Sets - tests/test_wahlprogramme.py (14 Tests) * Registry-Struktur (jahr int, seiten int, .pdf-Endung) * File-Existenz: jede registrierte PDF muss in static/referenzen/ liegen — würde Tippfehler in den 22 indexierten Programmen sofort fangen * embeddings.PROGRAMME-Konsistenz-Cross-Check - tests/test_bundeslaender.py (14 Tests) * Sanity über 16-State-Registry * #48-Klassifikations-Regression: TH=ParlDok, HB=StarWeb, SN=Eigensystem * Wahltermine plausibel (zwischen 2026 und 2035) - tests/test_analyzer.py (5 Tests) * Markdown-Codeblock-Stripping aus dem JSON-Retry-Loop ## Bug-Funde während der Test-Schreibphase Zwei Production-Bugs in den _normalize_fraktion-Helfern wurden durch die neuen Tests sofort aufgedeckt und im selben Commit gefixt: 1. PortalaAdapter._normalize_fraktion matched "F.D.P." (mit Punkten, wie historische SH/HB-Drucksachen) nicht — Regex \bFDP\b ist zu strikt. Fix: \bF\.?\s*D\.?\s*P\.?\b analog zu ParLDokAdapter. 2. ParLDokAdapter._normalize_fraktion (auch PortalaAdapter) matched "Ministerium der Finanzen" nicht als Landesregierung, weil MINISTER\b eine Wortgrenze direkt nach MINISTER verlangt — bei MINISTERIUM steht aber IUM danach, keine Wortgrenze. Fix: \bMINISTER ohne abschließendes \b. Beide Bugs hätten Fraktion-Felder bei Drucksachen der Bremischen Bürgerschaft (FDP-Listen) und bei Landesregierungs-Drucksachen in MV/LSA fälschlich leer gelassen — exakt der "fraktionen=[]"- Befund aus dem MV-Smoke-Test in #4. Phase 0 aus Roadmap-Issue #49. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- app/parlamente.py | 13 +- pytest.ini | 5 + requirements-dev.txt | 13 ++ tests/__init__.py | 0 tests/conftest.py | 51 ++++++ tests/test_analyzer.py | 62 +++++++ tests/test_bundeslaender.py | 81 +++++++++ tests/test_embeddings.py | 151 +++++++++++++++++ tests/test_parlamente.py | 328 ++++++++++++++++++++++++++++++++++++ tests/test_wahlprogramme.py | 118 +++++++++++++ 10 files changed, 819 insertions(+), 3 deletions(-) create mode 100644 pytest.ini create mode 100644 requirements-dev.txt create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/test_analyzer.py create mode 100644 tests/test_bundeslaender.py create mode 100644 tests/test_embeddings.py create mode 100644 tests/test_parlamente.py create mode 100644 tests/test_wahlprogramme.py diff --git a/app/parlamente.py b/app/parlamente.py index db35487..8ec3cc6 100644 --- a/app/parlamente.py +++ b/app/parlamente.py @@ -443,7 +443,9 @@ class PortalaAdapter(ParlamentAdapter): out.append("CDU") if has(r"\bSPD\b"): out.append("SPD") - if has(r"\bFDP\b"): + # F.D.P. (with dots, historical SH/HB-style) and FDP (modern) — same + # flexible pattern as ParLDokAdapter so the test suite stays consistent. + if has(r"\bF\.?\s*D\.?\s*P\.?\b"): out.append("FDP") if has(r"\bAFD\b"): out.append("AfD") @@ -451,7 +453,9 @@ class PortalaAdapter(ParlamentAdapter): out.append("LINKE") if has(r"\bBSW\b"): out.append("BSW") - if has(r"LANDESREGIERUNG|SENAT VON BERLIN|REGIERENDE[RN]?\s+BÜRGERMEISTER|MINISTER\b|STAATSKANZLEI"): + # MINISTERIUM/MINISTER beide treffen — \bMINISTER ohne abschließende + # Wortgrenze, damit "Ministerium der Finanzen" mit erfasst wird. 
+ if has(r"LANDESREGIERUNG|SENAT VON BERLIN|REGIERENDE[RN]?\s+BÜRGERMEISTER|\bMINISTER|STAATSKANZLEI|MINISTERPRÄSIDENT"): out.append("Landesregierung") return out @@ -944,7 +948,10 @@ class ParLDokAdapter(ParlamentAdapter): out.append("LINKE") if re.search(r"\bBSW\b", u): out.append("BSW") - if re.search(r"LANDESREGIERUNG|MINISTER\b|STAATSKANZLEI|MINISTERPRÄSIDENT", u): + # \bMINISTER ohne abschließende Wortgrenze, damit MINISTERIUM + # auch trifft (z.B. "Ministerium der Finanzen" als Urheber von + # Landesregierungs-Drucksachen). + if re.search(r"LANDESREGIERUNG|\bMINISTER|STAATSKANZLEI|MINISTERPRÄSIDENT", u): out.append("Landesregierung") return out diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..4dc8d52 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,5 @@ +[pytest] +testpaths = tests +asyncio_mode = auto +filterwarnings = + ignore::DeprecationWarning diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..0f66d22 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,13 @@ +# Test- und Entwicklungs-Abhängigkeiten — getrennt von requirements.txt, +# damit der prod-Container sie nicht installieren muss. +# +# Installation lokal: +# pip install -r requirements.txt -r requirements-dev.txt +# +# Tests laufen lassen: +# pytest -v tests/ + +-r requirements.txt + +pytest>=8.0.0 +pytest-asyncio>=0.24.0 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..9f50970 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,51 @@ +"""Shared pytest fixtures and path setup. + +Stubs heavy optional dependencies (``fitz``/PyMuPDF, ``bs4``/BeautifulSoup, +``openai``) so the test suite can run without the full prod requirements +installed. 
The tests in this directory are pure unit tests over parser +logic and prompt formatters — they neither parse PDFs nor make HTTP +calls, so the stubs are inert placeholders that satisfy the import +machinery but never get exercised. + +If a test ever does need real PyMuPDF or httpx integration, give it a +fixture marked with ``@pytest.mark.integration`` and skip it by default. +""" +import sys +import types +from pathlib import Path + +# Make the `app` package importable when pytest is run from the webapp/ root. +ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(ROOT)) + + +def _stub(name: str, **attrs) -> None: + if name in sys.modules: + return + mod = types.ModuleType(name) + for k, v in attrs.items(): + setattr(mod, k, v) + sys.modules[name] = mod + + +_stub("fitz") # PyMuPDF — used for PDF parsing, not in unit tests +_stub("bs4", BeautifulSoup=lambda *a, **kw: None) # only needed by NRWAdapter live calls +_stub("openai", OpenAI=lambda **kw: None) # only needed by embeddings live calls + + +# pydantic_settings is a small but external dep that's not in the test +# environment. Stub it with a minimal BaseSettings shim so app.config can +# import without crashing — the tests don't actually read settings values. +class _BaseSettingsShim: + model_config: dict = {} + + def __init__(self, **kwargs): + for k, v in kwargs.items(): + setattr(self, k, v) + + +def _settings_config_dict(**kwargs): + return kwargs + + +_stub("pydantic_settings", BaseSettings=_BaseSettingsShim, SettingsConfigDict=_settings_config_dict) diff --git a/tests/test_analyzer.py b/tests/test_analyzer.py new file mode 100644 index 0000000..a42fe32 --- /dev/null +++ b/tests/test_analyzer.py @@ -0,0 +1,62 @@ +"""Tests for analyzer.py JSON-stripping logic. + +Reproduces the markdown-codeblock-stripping in the LLM retry loop. 
Real +Qwen responses sometimes wrap their JSON in ```json …``` fences (despite +the prompt asking for raw JSON), and the analyzer must tolerate that +without resorting to retries. +""" +import json +import sys +import types + +# Stub openai before importing analyzer +if "openai" not in sys.modules: + openai_stub = types.ModuleType("openai") + openai_stub.OpenAI = lambda **kw: None + sys.modules["openai"] = openai_stub + + +def _strip_markdown_fences(content: str) -> str: + """Mirror the analyzer's markdown-stripping snippet so we can unit-test + the parsing rules without actually invoking the LLM. + + Keep this in sync with analyzer.py around the `if content.startswith("```")` + branch — if the analyzer changes, this helper changes too. The point of + the duplication is that the analyzer's stripping is buried in an async + LLM call that we cannot easily unit-test directly. + """ + content = content.strip() + if content.startswith("```"): + content = content.split("\n", 1)[1] + if content.endswith("```"): + content = content.rsplit("```", 1)[0] + if content.startswith("```json"): + content = content[7:] + return content.strip() + + +SAMPLE_JSON = '{"gwoeScore": 7.0, "title": "Test"}' + + +class TestMarkdownStripping: + def test_plain_json_unchanged(self): + assert _strip_markdown_fences(SAMPLE_JSON) == SAMPLE_JSON + + def test_json_in_markdown_fence(self): + wrapped = f"```json\n{SAMPLE_JSON}\n```" + cleaned = _strip_markdown_fences(wrapped) + assert json.loads(cleaned)["gwoeScore"] == 7.0 + + def test_json_in_plain_fence(self): + wrapped = f"```\n{SAMPLE_JSON}\n```" + cleaned = _strip_markdown_fences(wrapped) + assert json.loads(cleaned)["gwoeScore"] == 7.0 + + def test_leading_whitespace_stripped(self): + wrapped = f" \n {SAMPLE_JSON} \n " + assert json.loads(_strip_markdown_fences(wrapped))["gwoeScore"] == 7.0 + + def test_trailing_fence_stripped(self): + wrapped = f"{SAMPLE_JSON}\n```" + cleaned = _strip_markdown_fences(wrapped) + assert 
json.loads(cleaned)["gwoeScore"] == 7.0 diff --git a/tests/test_bundeslaender.py b/tests/test_bundeslaender.py new file mode 100644 index 0000000..0660b8f --- /dev/null +++ b/tests/test_bundeslaender.py @@ -0,0 +1,81 @@ +"""Tests for bundeslaender.py — sanity over 16-state registry. + +Includes the #48 classification regression: TH must be ParlDok, HB must +be StarWeb, SN must be Eigensystem (not ParlDok). +""" +from app.bundeslaender import BUNDESLAENDER, get, aktive_bundeslaender, alle_bundeslaender + + +class TestRegistryStructure: + def test_sixteen_bundeslaender(self): + assert len(BUNDESLAENDER) == 16 + + def test_codes_are_uppercase(self): + for code in BUNDESLAENDER: + assert code.isupper(), f"{code} is not uppercase" + + def test_each_entry_has_naechste_wahl_or_none(self): + for code, bl in BUNDESLAENDER.items(): + assert bl.naechste_wahl is None or len(bl.naechste_wahl) == 10 + + def test_wahlperiode_is_positive_integer(self): + for bl in BUNDESLAENDER.values(): + assert isinstance(bl.wahlperiode, int) and bl.wahlperiode > 0 + + +class TestActiveBundeslaender: + def test_four_active_bundeslaender(self): + active = aktive_bundeslaender() + codes = {bl.code for bl in active} + assert codes == {"NRW", "LSA", "MV", "BE"} + + def test_alle_bundeslaender_returns_all_sixteen(self): + assert len(alle_bundeslaender()) == 16 + + def test_alle_bundeslaender_active_first(self): + out = alle_bundeslaender() + active_codes = {bl.code for bl in aktive_bundeslaender()} + # The first len(active) entries must all be active + for bl in out[: len(active_codes)]: + assert bl.code in active_codes + + +class TestGetHelper: + def test_returns_bundesland_for_known_code(self): + bl = get("NRW") + assert bl is not None + assert bl.name == "Nordrhein-Westfalen" + + def test_returns_none_for_unknown_code(self): + assert get("XX") is None + + +class TestClassificationFix48: + """Regression: #48 corrected three doku_system entries that the + follow-up adapter issues depend on.""" + + 
def test_th_is_parldok_not_starweb(self): + assert BUNDESLAENDER["TH"].doku_system == "ParlDok" + + def test_hb_is_starweb_not_paris(self): + """PARiS is just a StarWeb skin — must be classified as StarWeb.""" + assert BUNDESLAENDER["HB"].doku_system == "StarWeb" + + def test_sn_is_eigensystem_not_parldok(self): + """EDAS is ASP.NET-Webforms, NOT ParlDok-compatible with MV.""" + assert BUNDESLAENDER["SN"].doku_system == "Eigensystem" + + +class TestWahltermineSane: + """All upcoming election dates must fall inside a plausible window: on + or after 2026-01-01 and before 2035-01-01 (catches a pasted 1990 date).""" + + def test_no_election_before_2026(self): + for bl in BUNDESLAENDER.values(): + if bl.naechste_wahl: + assert bl.naechste_wahl >= "2026-01-01" + + def test_no_election_after_2035(self): + for bl in BUNDESLAENDER.values(): + if bl.naechste_wahl: + assert bl.naechste_wahl < "2035-01-01" diff --git a/tests/test_embeddings.py b/tests/test_embeddings.py new file mode 100644 index 0000000..48ee6b5 --- /dev/null +++ b/tests/test_embeddings.py @@ -0,0 +1,151 @@ +"""Tests for embeddings.py prompt formatting. + +Reproduces the LLM-Halluzinations-Bug from the 2026-04-08 session +(commits 1b5fd96 + bc7f4a6): the original ``format_quotes_for_prompt`` +rendered each chunk as ``- S. X: "text"`` without any reference to the +programme name. As a result the LLM hallucinated familiar source labels +("FDP NRW Wahlprogramm 2022") for chunks that actually came from MV/BE, +because that was the strongest training-set prior for budget-policy +citations. + +Fix: prepend the fully-qualified PROGRAMME[programm_id]["name"] to each +quote. +""" +import sys +import types + +# Stub openai before importing embeddings, since the test environment may +# not have it installed and we don't actually need to make API calls. 
+if "openai" not in sys.modules: + openai_stub = types.ModuleType("openai") + openai_stub.OpenAI = lambda **kw: None + sys.modules["openai"] = openai_stub + +from app.embeddings import _chunk_source_label, format_quotes_for_prompt + + +# ───────────────────────────────────────────────────────────────────────────── +# _chunk_source_label — fully-qualified programme name + page +# ───────────────────────────────────────────────────────────────────────────── + +class TestChunkSourceLabel: + def test_known_programme_id(self): + chunk = {"programm_id": "fdp-mv-2021", "seite": 73, "text": "..."} + label = _chunk_source_label(chunk) + assert "FDP Mecklenburg-Vorpommern" in label + assert "S. 73" in label + + def test_known_programme_id_for_be(self): + chunk = {"programm_id": "spd-be-2023", "seite": 24, "text": "..."} + label = _chunk_source_label(chunk) + assert "SPD Berlin" in label + assert "2021" in label # the BE-2023.pdf files contain 2021er programmes + assert "S. 24" in label + + def test_unknown_programme_id_falls_back_to_id(self): + chunk = {"programm_id": "fake-xx-9999", "seite": 1, "text": "..."} + label = _chunk_source_label(chunk) + # Should not crash, should at least include the id and the page + assert "fake-xx-9999" in label + assert "S. 1" in label + + def test_missing_seite_uses_questionmark(self): + chunk = {"programm_id": "cdu-mv-2021", "text": "..."} + label = _chunk_source_label(chunk) + assert "?" 
in label + + +# ───────────────────────────────────────────────────────────────────────────── +# format_quotes_for_prompt — every chunk must carry programme identification +# ───────────────────────────────────────────────────────────────────────────── + +EXAMPLE_QUOTES = { + "FDP": { + "wahlprogramm": [ + { + "programm_id": "fdp-mv-2021", + "partei": "FDP", + "typ": "wahlprogramm", + "seite": 73, + "text": "Die Grundsätze von Wirtschaftlichkeit und Sparsamkeit", + "similarity": 0.63, + }, + ], + "parteiprogramm": [ + { + "programm_id": "fdp-grundsatz", + "partei": "FDP", + "typ": "parteiprogramm", + "seite": 93, + "text": "Liberale Marktwirtschaft erfordert solide Haushalte", + "similarity": 0.60, + }, + ], + }, + "SPD": { + "wahlprogramm": [ + { + "programm_id": "spd-mv-2021", + "partei": "SPD", + "typ": "wahlprogramm", + "seite": 22, + "text": "Verkehrswende weg vom motorisierten Individualverkehr", + "similarity": 0.58, + }, + ], + }, +} + + +class TestFormatQuotesForPrompt: + def test_empty_input_returns_empty_string(self): + assert format_quotes_for_prompt({}) == "" + + def test_renders_party_headings(self): + out = format_quotes_for_prompt(EXAMPLE_QUOTES) + assert "### FDP" in out + assert "### SPD" in out + + def test_every_chunk_has_programme_name(self): + """Regression: pre-fix this used "S. 
X:" only, no programme name — + the LLM then hallucinated NRW-2022 sources from training data.""" + out = format_quotes_for_prompt(EXAMPLE_QUOTES) + # Each of the three chunks must reference its source programme + assert "FDP Mecklenburg-Vorpommern" in out + assert "FDP Grundsatzprogramm" in out + assert "SPD Mecklenburg-Vorpommern" in out + + def test_contains_strict_citation_instruction(self): + """The prompt header must explicitly forbid hallucinated sources.""" + out = format_quotes_for_prompt(EXAMPLE_QUOTES) + assert "ausschließlich" in out.lower() or "verbatim" in out.lower() or "wörtlich" in out.lower() + + def test_no_nrw_2022_appears_unless_chunks_are_actually_nrw(self): + """Sanity: a pure MV+SPD chunk set must not mention NRW anywhere.""" + out = format_quotes_for_prompt(EXAMPLE_QUOTES) + assert "NRW" not in out + assert "Nordrhein-Westfalen" not in out + + def test_renders_separate_blocks_for_wahl_and_parteiprogramm(self): + out = format_quotes_for_prompt(EXAMPLE_QUOTES) + assert "**Wahlprogramm:**" in out + assert "**Grundsatzprogramm:**" in out + + def test_text_truncated_at_500_chars(self): + long_chunk = { + "FDP": { + "wahlprogramm": [ + { + "programm_id": "fdp-mv-2021", + "seite": 1, + "text": "A" * 1000, # 1000 chars → should be truncated + "similarity": 0.7, + } + ], + } + } + out = format_quotes_for_prompt(long_chunk) + # Truncation marker + assert "..." in out + # Original chunk text 1000 chars not present in full + assert "A" * 1000 not in out diff --git a/tests/test_parlamente.py b/tests/test_parlamente.py new file mode 100644 index 0000000..75c0ad6 --- /dev/null +++ b/tests/test_parlamente.py @@ -0,0 +1,328 @@ +"""Tests for parlamente.py adapter parsers — pure functions over fixture HTML. + +Reproduces the three regression scenarios from the 2026-04-08 adapter session: + +1. PortalaAdapter `_parse_hit_list_cards` had a `doctype` vs. `doctype_full` + NameError that was hot-fixed live on the prod server (commit 1cb030a). +2. 
ParLDokAdapter `_hit_to_drucksache` needs to map ParlDok 8.x JSON hit + dicts to Drucksache objects without losing fraction or date info. +3. PortalaAdapter `_normalize_fraktion` and ParLDokAdapter same-named method + must yield canonical fraction codes for both comma-lists and embedded + "MdL (Partei)" patterns. +""" +from app.parlamente import ParLDokAdapter, PortalaAdapter, NRWAdapter, Drucksache + + +# ───────────────────────────────────────────────────────────────────────────── +# PortalaAdapter — Berlin-style HTML cards +# ───────────────────────────────────────────────────────────────────────────── + +BE_CARD_FIXTURE = """ +
ignored
+
+

Schwimmstatistik für die dritten Klassen der Berliner Schulen

+ Antrag (Eilantrag) Drucksache 19/3104 S. 1 bis 24 vom 31.03.2026 +
+
+

Klimaneutrales Bauen im Bestand

+ Antrag CDU, SPD Drucksache 19/3107 vom 02.04.2026 +
+""" + + +def _make_be_adapter(): + return PortalaAdapter( + bundesland="BE", + name="test BE", + base_url="https://pardok.parlament-berlin.de", + db_id="lah.lissh", + wahlperiode=19, + portala_path="/portala", + document_type=None, + ) + + +class TestPortalaAdapterCardParser: + """Issue: doctype/doctype_full NameError (hot-fix 1cb030a).""" + + def test_parses_two_cards_without_nameerror(self): + """Smoke test — must NOT raise NameError or any other exception. + + Pre-fix this method referenced an undefined ``doctype`` variable + in the query-filter branch when computing the haystack. The fix + renamed it to ``doctype_full``. A direct call covers both branches. + """ + adapter = _make_be_adapter() + result = adapter._parse_hit_list_cards(BE_CARD_FIXTURE, query_filter="") + assert len(result) == 2 + + def test_first_card_extracts_drucksache_and_title(self): + adapter = _make_be_adapter() + result = adapter._parse_hit_list_cards(BE_CARD_FIXTURE, query_filter="") + d = result[0] + assert d.drucksache == "19/3104" + assert "Schwimmstatistik" in d.title + assert d.datum == "2026-03-31" + assert d.bundesland == "BE" + + def test_second_card_extracts_fraktionen_from_h6(self): + """Card 2 packs CDU+SPD into the type line — must split out cleanly.""" + adapter = _make_be_adapter() + result = adapter._parse_hit_list_cards(BE_CARD_FIXTURE, query_filter="") + d = result[1] + assert d.drucksache == "19/3107" + assert d.fraktionen == ["CDU", "SPD"] + # The typ string should have the parties stripped back out + assert d.typ.strip() == "Antrag" + + def test_pdf_link_is_absolute_url(self): + adapter = _make_be_adapter() + result = adapter._parse_hit_list_cards(BE_CARD_FIXTURE, query_filter="") + assert result[0].link.startswith("https://pardok.parlament-berlin.de/") + assert result[0].link.endswith(".pdf") + + def test_query_filter_uses_doctype_full_not_doctype(self): + """Regression: the filter branch references doctype_full, not doctype. 
+ + Pre-fix this raised NameError as soon as a query was passed. + """ + adapter = _make_be_adapter() + # Schwimmstatistik matches card 1, Klimaneutral matches card 2 + result = adapter._parse_hit_list_cards(BE_CARD_FIXTURE, query_filter="Schwimm") + assert len(result) == 1 + assert result[0].drucksache == "19/3104" + + +# ───────────────────────────────────────────────────────────────────────────── +# PortalaAdapter — LSA-style Perl-Dump records +# ───────────────────────────────────────────────────────────────────────────── + +LSA_DUMP_FIXTURE = """ +
$VAR1 = {
+  'WEV06' => [{ 'main' => 'Demokratie beginnt im Klassenzimmer' }],
+  'WEV32' => [{
+    'main' => 'Antrag B\\x{fc}ndnis 90/Die Gr\\x{fc}nen 06.03.2026 Drucksache 8/6726 ...',
+    '5' => 'drs/wp8/drs/d6726lan.pdf'
+  }]
+}
+
$VAR1 = {
+  'WEV06' => [{ 'main' => 'Andere Drucksache ohne Schul-Bezug' }],
+  'WEV32' => [{
+    'main' => 'Antrag CDU, SPD 14.01.2026 Drucksache 8/6171 ...',
+    '5' => 'drs/wp8/drs/d6171lan.pdf'
+  }]
+}
+""" + + +def _make_lsa_adapter(): + return PortalaAdapter( + bundesland="LSA", + name="test LSA", + base_url="https://padoka.landtag.sachsen-anhalt.de", + db_id="lsa.lissh", + wahlperiode=8, + portala_path="/portal", + document_type="Antrag", + ) + + +class TestPortalaAdapterDumpParser: + def test_parses_two_dump_records(self): + adapter = _make_lsa_adapter() + result = adapter._parse_hit_list_dump(LSA_DUMP_FIXTURE, query_filter="") + assert len(result) == 2 + + def test_extracts_drucksache_from_perl_dump(self): + adapter = _make_lsa_adapter() + result = adapter._parse_hit_list_dump(LSA_DUMP_FIXTURE, query_filter="") + assert result[0].drucksache == "8/6726" + assert result[1].drucksache == "8/6171" + + def test_decodes_perl_hex_escapes_in_urheber(self): + """The first record has two \\x{fc} (ü) escapes in WEV32.""" + adapter = _make_lsa_adapter() + result = adapter._parse_hit_list_dump(LSA_DUMP_FIXTURE, query_filter="") + # GRÜNE should be detected from "Bündnis 90/Die Grünen" + assert "GRÜNE" in result[0].fraktionen + + def test_extracts_date_iso(self): + adapter = _make_lsa_adapter() + result = adapter._parse_hit_list_dump(LSA_DUMP_FIXTURE, query_filter="") + assert result[0].datum == "2026-03-06" + assert result[1].datum == "2026-01-14" + + def test_pdf_url_uses_pdf_url_prefix(self): + adapter = _make_lsa_adapter() + result = adapter._parse_hit_list_dump(LSA_DUMP_FIXTURE, query_filter="") + assert result[0].link == ( + "https://padoka.landtag.sachsen-anhalt.de/files/drs/wp8/drs/d6726lan.pdf" + ) + + def test_client_side_query_filter(self): + adapter = _make_lsa_adapter() + result = adapter._parse_hit_list_dump(LSA_DUMP_FIXTURE, query_filter="Demokratie") + assert len(result) == 1 + assert result[0].drucksache == "8/6726" + + +# ───────────────────────────────────────────────────────────────────────────── +# PortalaAdapter — Auto-detection between dump and card formats +# ───────────────────────────────────────────────────────────────────────────── + +class 
TestPortalaAdapterAutoDetect: + def test_dump_html_routes_to_dump_parser(self): + adapter = _make_lsa_adapter() + result = adapter._parse_hit_list_html(LSA_DUMP_FIXTURE, query_filter="") + assert len(result) == 2 + + def test_card_html_routes_to_card_parser(self): + adapter = _make_be_adapter() + result = adapter._parse_hit_list_html(BE_CARD_FIXTURE, query_filter="") + assert len(result) == 2 + + +# ───────────────────────────────────────────────────────────────────────────── +# PortalaAdapter._normalize_fraktion — canonical fraction codes +# ───────────────────────────────────────────────────────────────────────────── + +class TestPortalaAdapterNormalizeFraktion: + def test_comma_separated_list(self): + out = PortalaAdapter._normalize_fraktion("CDU, SPD, F.D.P.") + assert "CDU" in out and "SPD" in out and "FDP" in out + + def test_buendnis_90_die_gruenen(self): + out = PortalaAdapter._normalize_fraktion("BÜNDNIS 90/DIE GRÜNEN") + assert out == ["GRÜNE"] + + def test_die_linke(self): + out = PortalaAdapter._normalize_fraktion("DIE LINKE") + assert out == ["LINKE"] + + def test_bsw(self): + out = PortalaAdapter._normalize_fraktion("BSW") + assert out == ["BSW"] + + def test_landesregierung_keywords(self): + out = PortalaAdapter._normalize_fraktion("Senat von Berlin") + assert "Landesregierung" in out + + def test_empty_string(self): + assert PortalaAdapter._normalize_fraktion("") == [] + + +# ───────────────────────────────────────────────────────────────────────────── +# ParLDokAdapter — JSON hit dict → Drucksache mapping +# ───────────────────────────────────────────────────────────────────────────── + +def _make_mv_adapter(): + return ParLDokAdapter( + bundesland="MV", + name="test MV", + base_url="https://www.dokumentation.landtag-mv.de", + wahlperiode=8, + prefix="/parldok", + document_typ="Antrag", + ) + + +SAMPLE_PARLDOK_HIT = { + "id": 70748, + "title": "Zweckentfremdung von Sondervermögen des Bundes beenden", + "date": "18.03.2026", + "prelink": 
"/dokument/70748", + "link": "/dokument/70748#navpanes=0", + "authorhtml": "FDP", + "kind": "Drucksache", + "type": "Antrag", + "lp": 8, + "number": "6409", +} + + +class TestParLDokAdapterHitMapping: + def test_hit_to_drucksache_basic(self): + adapter = _make_mv_adapter() + d = adapter._hit_to_drucksache(SAMPLE_PARLDOK_HIT) + assert d is not None + assert d.drucksache == "8/6409" + assert d.title == "Zweckentfremdung von Sondervermögen des Bundes beenden" + assert d.datum == "2026-03-18" + assert d.fraktionen == ["FDP"] + assert d.typ == "Antrag" + assert d.bundesland == "MV" + + def test_pdf_link_strips_navpanes_fragment_and_prepends_prefix(self): + adapter = _make_mv_adapter() + d = adapter._hit_to_drucksache(SAMPLE_PARLDOK_HIT) + assert d.link == "https://www.dokumentation.landtag-mv.de/parldok/dokument/70748" + assert "#navpanes" not in d.link + + def test_missing_lp_returns_none(self): + adapter = _make_mv_adapter() + hit = dict(SAMPLE_PARLDOK_HIT) + del hit["lp"] + assert adapter._hit_to_drucksache(hit) is None + + def test_mdl_with_party_in_parens(self): + """MV often packs the MdL into authorhtml: 'Thomas X (AfD)'.""" + adapter = _make_mv_adapter() + hit = dict(SAMPLE_PARLDOK_HIT, authorhtml="Thomas de Jesus Fernandes (AfD)") + d = adapter._hit_to_drucksache(hit) + assert "AfD" in d.fraktionen + + def test_landesregierung_detection(self): + adapter = _make_mv_adapter() + hit = dict(SAMPLE_PARLDOK_HIT, authorhtml="Ministerium der Finanzen") + d = adapter._hit_to_drucksache(hit) + assert "Landesregierung" in d.fraktionen + + +class TestParLDokFulltextIdSanitization: + """Reverse-engineered from bundle.js pd.getFulltextId — must mirror exactly. + + Even though server-side fulltext is currently disabled (#18), the helper + is kept around in code as documentation. If it ever gets re-activated, + the sanitization must still match the SPA's behavior 1:1. 
+ """ + + def test_simple_word_unchanged(self): + assert ParLDokAdapter._fulltext_id("Schule") == "Schule" + + def test_whitespace_becomes_dash(self): + assert ParLDokAdapter._fulltext_id("Klima Schutz") == "Klima-Schutz" + + def test_umlauts_become_dashes(self): + # The JS regex is /[^a-zA-z0-9]/ — note the lowercase z, deliberate. + # Umlauts are non-ASCII so they get replaced. + assert ParLDokAdapter._fulltext_id("Bürger") == "B-rger" + + def test_punctuation_becomes_dashes(self): + assert ParLDokAdapter._fulltext_id("CO2-Emission") == "CO2-Emission" + + +# ───────────────────────────────────────────────────────────────────────────── +# Adapter registry sanity +# ───────────────────────────────────────────────────────────────────────────── + +class TestAdapterRegistry: + def test_active_adapters_present(self): + from app.parlamente import ADAPTERS + for code in ["NRW", "MV", "BE", "LSA"]: + assert code in ADAPTERS, f"missing adapter for {code}" + + def test_get_adapter_returns_none_for_unknown(self): + from app.parlamente import get_adapter + assert get_adapter("XX") is None + + def test_mv_adapter_is_parldok_instance(self): + from app.parlamente import ADAPTERS + assert isinstance(ADAPTERS["MV"], ParLDokAdapter) + + def test_be_adapter_is_portala_instance(self): + from app.parlamente import ADAPTERS + assert isinstance(ADAPTERS["BE"], PortalaAdapter) + + def test_lsa_adapter_is_portala_instance(self): + from app.parlamente import ADAPTERS + assert isinstance(ADAPTERS["LSA"], PortalaAdapter) diff --git a/tests/test_wahlprogramme.py b/tests/test_wahlprogramme.py new file mode 100644 index 0000000..dedbfd2 --- /dev/null +++ b/tests/test_wahlprogramme.py @@ -0,0 +1,118 @@ +"""Tests for wahlprogramme.py — registry consistency + file existence.""" +from app.wahlprogramme import ( + WAHLPROGRAMME, + REFERENZEN_PATH, + get_wahlprogramm, + parteien_mit_wahlprogramm, +) + + +# ───────────────────────────────────────────────────────────────────────────── +# Registry 
consistency +# ───────────────────────────────────────────────────────────────────────────── + +class TestRegistryStructure: + def test_active_bundeslaender_present(self): + for code in ["NRW", "LSA", "MV", "BE"]: + assert code in WAHLPROGRAMME, f"missing wahlprogramme entry for {code}" + + def test_each_entry_has_required_keys(self): + required = {"file", "titel", "partei", "jahr", "seiten"} + for bl, parteien in WAHLPROGRAMME.items(): + for partei, info in parteien.items(): + missing = required - set(info.keys()) + assert not missing, f"{bl}/{partei} missing keys: {missing}" + + def test_jahr_is_integer(self): + for bl, parteien in WAHLPROGRAMME.items(): + for partei, info in parteien.items(): + assert isinstance(info["jahr"], int), f"{bl}/{partei} jahr not int" + + def test_seiten_is_positive_integer(self): + for bl, parteien in WAHLPROGRAMME.items(): + for partei, info in parteien.items(): + assert isinstance(info["seiten"], int) + assert info["seiten"] > 0 + + def test_file_extension_is_pdf(self): + for bl, parteien in WAHLPROGRAMME.items(): + for partei, info in parteien.items(): + assert info["file"].endswith(".pdf") + + +# ───────────────────────────────────────────────────────────────────────────── +# File existence — every registered file must exist on disk +# ───────────────────────────────────────────────────────────────────────────── + +class TestFileExistence: + """Catches typos in the file field that would silently break embedding + indexing or PDF download links.""" + + def test_every_registered_pdf_exists(self): + missing = [] + for bl, parteien in WAHLPROGRAMME.items(): + for partei, info in parteien.items(): + path = REFERENZEN_PATH / info["file"] + if not path.exists(): + missing.append(f"{bl}/{partei}: {info['file']}") + assert not missing, "missing PDFs:\n " + "\n ".join(missing) + + +# ───────────────────────────────────────────────────────────────────────────── +# Lookup helpers +# 
───────────────────────────────────────────────────────────────────────────── + +class TestGetWahlprogramm: + def test_returns_dict_for_known_combination(self): + info = get_wahlprogramm("MV", "CDU") + assert info is not None + assert info["partei"] == "CDU Mecklenburg-Vorpommern" + + def test_returns_none_for_unknown_bundesland(self): + assert get_wahlprogramm("XX", "CDU") is None + + def test_returns_none_for_unknown_partei(self): + assert get_wahlprogramm("NRW", "BSW") is None + + +class TestParteienMitWahlprogramm: + def test_nrw_has_five_parteien(self): + parteien = parteien_mit_wahlprogramm("NRW") + assert len(parteien) == 5 + assert set(parteien) == {"CDU", "SPD", "GRÜNE", "FDP", "AfD"} + + def test_mv_has_six_parteien(self): + parteien = parteien_mit_wahlprogramm("MV") + assert set(parteien) == {"CDU", "SPD", "GRÜNE", "FDP", "AfD", "LINKE"} + + def test_be_has_five_parteien(self): + parteien = parteien_mit_wahlprogramm("BE") + assert set(parteien) == {"CDU", "SPD", "GRÜNE", "LINKE", "AfD"} + + def test_unknown_bundesland_empty_list(self): + assert parteien_mit_wahlprogramm("XX") == [] + + +# ───────────────────────────────────────────────────────────────────────────── +# embeddings.PROGRAMME consistency cross-check +# ───────────────────────────────────────────────────────────────────────────── + +class TestEmbeddingsRegistryConsistency: + """Every entry in WAHLPROGRAMME must also exist in embeddings.PROGRAMME + so the indexer can find it. 
Mismatch is the kind of bug a manual smoke + misses but would show up during indexing.""" + + def test_every_wahlprogramm_has_embeddings_entry(self): + from app.embeddings import PROGRAMME + + # Build expected programm_id from filename: "cdu-mv-2021.pdf" → "cdu-mv-2021" + missing = [] + for bl, parteien in WAHLPROGRAMME.items(): + for partei, info in parteien.items(): + pid = info["file"].rsplit(".", 1)[0] + if pid not in PROGRAMME: + missing.append(f"{bl}/{partei} → {pid}") + assert not missing, ( + "WAHLPROGRAMME entries missing in embeddings.PROGRAMME:\n " + + "\n ".join(missing) + )