"""Tests for parlamente.py adapter parsers — pure functions over fixture HTML.

Reproduces the three regression scenarios from the 2026-04-08 adapter session:

1. PortalaAdapter `_parse_hit_list_cards` had a `doctype` vs. `doctype_full`
   NameError that was hot-fixed live on the prod server (commit 1cb030a).
2. ParLDokAdapter `_hit_to_drucksache` needs to map ParlDok 8.x JSON hit
   dicts to Drucksache objects without losing fraction or date info.
3. PortalaAdapter `_normalize_fraktion` and ParLDokAdapter same-named method
   must yield canonical fraction codes for both comma-lists and embedded
   "MdL (Partei)" patterns.
"""

import asyncio
import sys
import types
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from app.parlamente import ParLDokAdapter, PortalaAdapter, NRWAdapter, Drucksache


# ─────────────────────────────────────────────────────────────────────────────
# PortalaAdapter — Berlin-style HTML cards
# ─────────────────────────────────────────────────────────────────────────────

# NOTE(review): this fixture contains no markup and no URL, although the tests
# below expect two parsed cards and a ".pdf" link — it looks like the HTML tags
# were stripped somewhere; confirm against the repository copy.
BE_CARD_FIXTURE = """
ignored

Schwimmstatistik für die dritten Klassen der Berliner Schulen

Antrag (Eilantrag) Drucksache 19/3104 S. 1 bis 24 vom 31.03.2026

Klimaneutrales Bauen im Bestand

Antrag CDU, SPD Drucksache 19/3107 vom 02.04.2026
"""


def _make_be_adapter():
    """Build a PortalaAdapter configured like the Berlin (BE) instance."""
    return PortalaAdapter(
        bundesland="BE",
        name="test BE",
        base_url="https://pardok.parlament-berlin.de",
        db_id="lah.lissh",
        wahlperiode=19,
        portala_path="/portala",
        document_type=None,
    )


class TestPortalaAdapterCardParser:
    """Issue: doctype/doctype_full NameError (hot-fix 1cb030a)."""

    def test_parses_two_cards_without_nameerror(self):
        """Smoke test — must NOT raise NameError or any other exception.

        Pre-fix this method referenced an undefined ``doctype`` variable in
        the query-filter branch when computing the haystack. The fix renamed
        it to ``doctype_full``. A direct call covers both branches.
        """
        adapter = _make_be_adapter()
        result = adapter._parse_hit_list_cards(BE_CARD_FIXTURE, query_filter="")
        assert len(result) == 2

    def test_first_card_extracts_drucksache_and_title(self):
        adapter = _make_be_adapter()
        result = adapter._parse_hit_list_cards(BE_CARD_FIXTURE, query_filter="")
        d = result[0]
        assert d.drucksache == "19/3104"
        assert "Schwimmstatistik" in d.title
        assert d.datum == "2026-03-31"
        assert d.bundesland == "BE"

    def test_second_card_extracts_fraktionen_from_h6(self):
        """Card 2 packs CDU+SPD into the type line — must split out cleanly."""
        adapter = _make_be_adapter()
        result = adapter._parse_hit_list_cards(BE_CARD_FIXTURE, query_filter="")
        d = result[1]
        assert d.drucksache == "19/3107"
        assert d.fraktionen == ["CDU", "SPD"]
        # The typ string should have the parties stripped back out
        assert d.typ.strip() == "Antrag"

    def test_pdf_link_is_absolute_url(self):
        adapter = _make_be_adapter()
        result = adapter._parse_hit_list_cards(BE_CARD_FIXTURE, query_filter="")
        assert result[0].link.startswith("https://pardok.parlament-berlin.de/")
        assert result[0].link.endswith(".pdf")

    def test_query_filter_uses_doctype_full_not_doctype(self):
        """Regression: the filter branch references doctype_full, not doctype.

        Pre-fix this raised NameError as soon as a query was passed.
        """
        adapter = _make_be_adapter()
        # Schwimmstatistik matches card 1, Klimaneutral matches card 2
        result = adapter._parse_hit_list_cards(
            BE_CARD_FIXTURE, query_filter="Schwimm"
        )
        assert len(result) == 1
        assert result[0].drucksache == "19/3104"


# ─────────────────────────────────────────────────────────────────────────────
# PortalaAdapter — LSA-style Perl-Dump records
# ─────────────────────────────────────────────────────────────────────────────

LSA_DUMP_FIXTURE = """
$VAR1 = {
  'WEV06' => [{ 'main' => 'Demokratie beginnt im Klassenzimmer' }],
  'WEV32' => [{
    'main' => 'Antrag B\\x{fc}ndnis 90/Die Gr\\x{fc}nen 06.03.2026 Drucksache 8/6726 ...',
    '5' => 'drs/wp8/drs/d6726lan.pdf'
  }]
}
$VAR1 = {
  'WEV06' => [{ 'main' => 'Andere Drucksache ohne Schul-Bezug' }],
  'WEV32' => [{
    'main' => 'Antrag CDU, SPD 14.01.2026 Drucksache 8/6171 ...',
    '5' => 'drs/wp8/drs/d6171lan.pdf'
  }]
}
"""


def _make_lsa_adapter():
    """Build a PortalaAdapter configured like the Sachsen-Anhalt (LSA) instance."""
    return PortalaAdapter(
        bundesland="LSA",
        name="test LSA",
        base_url="https://padoka.landtag.sachsen-anhalt.de",
        db_id="lsa.lissh",
        wahlperiode=8,
        portala_path="/portal",
        document_type="Antrag",
    )


class TestPortalaAdapterDumpParser:
    """Parsing of the Perl ``$VAR1 = {...}`` dump records in LSA_DUMP_FIXTURE."""

    def test_parses_two_dump_records(self):
        adapter = _make_lsa_adapter()
        result = adapter._parse_hit_list_dump(LSA_DUMP_FIXTURE, query_filter="")
        assert len(result) == 2

    def test_extracts_drucksache_from_perl_dump(self):
        adapter = _make_lsa_adapter()
        result = adapter._parse_hit_list_dump(LSA_DUMP_FIXTURE, query_filter="")
        assert result[0].drucksache == "8/6726"
        assert result[1].drucksache == "8/6171"

    def test_decodes_perl_hex_escapes_in_urheber(self):
        """The first record has \\x{fc} (ü) escapes in WEV32."""
        adapter = _make_lsa_adapter()
        result = adapter._parse_hit_list_dump(LSA_DUMP_FIXTURE, query_filter="")
        # GRÜNE should be detected from "Bündnis 90/Die Grünen"
        assert "GRÜNE" in result[0].fraktionen

    def test_extracts_date_iso(self):
        adapter = _make_lsa_adapter()
        result = adapter._parse_hit_list_dump(LSA_DUMP_FIXTURE, query_filter="")
        assert result[0].datum == "2026-03-06"
        assert result[1].datum == "2026-01-14"

    def test_pdf_url_uses_pdf_url_prefix(self):
        adapter = _make_lsa_adapter()
        result = adapter._parse_hit_list_dump(LSA_DUMP_FIXTURE, query_filter="")
        assert result[0].link == (
            "https://padoka.landtag.sachsen-anhalt.de/files/drs/wp8/drs/d6726lan.pdf"
        )

    def test_client_side_query_filter(self):
        adapter = _make_lsa_adapter()
        result = adapter._parse_hit_list_dump(
            LSA_DUMP_FIXTURE, query_filter="Demokratie"
        )
        assert len(result) == 1
        assert result[0].drucksache == "8/6726"


# ─────────────────────────────────────────────────────────────────────────────
# PortalaAdapter — Auto-detection between dump and card formats
# ─────────────────────────────────────────────────────────────────────────────


class TestPortalaAdapterAutoDetect:
    """``_parse_hit_list_html`` must yield both records for either fixture."""

    def test_dump_html_routes_to_dump_parser(self):
        adapter = _make_lsa_adapter()
        result = adapter._parse_hit_list_html(LSA_DUMP_FIXTURE, query_filter="")
        assert len(result) == 2

    def test_card_html_routes_to_card_parser(self):
        adapter = _make_be_adapter()
        result = adapter._parse_hit_list_html(BE_CARD_FIXTURE, query_filter="")
        assert len(result) == 2


# ─────────────────────────────────────────────────────────────────────────────
# Adapter._normalize_fraktion — round-trip test through a real instance
#
# Since #55 the exhaustive pattern collection lives in tests/test_parteien.py.
# Here we only verify that the adapter shim actually calls the central
# function and passes the Bundesland through correctly.
# ─────────────────────────────────────────────────────────────────────────────


class TestAdapterNormalizeFraktionRoundtrip:
    """Round-trips ``_normalize_fraktion`` through concrete adapter instances."""

    def test_portala_lsa_adapter_instance(self):
        adapter = _make_lsa_adapter()
        assert "CDU" in adapter._normalize_fraktion("CDU")
        assert adapter._normalize_fraktion("BÜNDNIS 90/DIE GRÜNEN") == ["GRÜNE"]

    def test_portala_be_adapter_instance(self):
        adapter = _make_be_adapter()
        out = adapter._normalize_fraktion("Senat von Berlin")
        assert "Landesregierung" in out

    def test_empty_string(self):
        adapter = _make_lsa_adapter()
        assert adapter._normalize_fraktion("") == []

    def test_freie_waehler_disambiguates_by_adapter_bundesland(self):
        # BB adapter → BVB-FW, RP adapter → FREIE WÄHLER. This is the actual
        # added value of #55, round-tripped here via the adapter.
        bb = PortalaAdapter(
            bundesland="BB",
            name="test BB",
            base_url="https://www.parlamentsdokumentation.brandenburg.de",
            db_id="lap.lap8",
            wahlperiode=8,
            portala_path="/portal",
            document_type="Antrag",
        )
        rp = PortalaAdapter(
            bundesland="RP",
            name="test RP",
            base_url="https://opal.rlp.de",
            db_id="rlp.opal",
            wahlperiode=18,
            portala_path="/portal",
            document_type="Antrag",
        )
        assert bb._normalize_fraktion("FREIE WÄHLER") == ["BVB-FW"]
        assert rp._normalize_fraktion("FREIE WÄHLER") == ["FREIE WÄHLER"]


# ─────────────────────────────────────────────────────────────────────────────
# ParLDokAdapter — JSON hit dict → Drucksache mapping
# ─────────────────────────────────────────────────────────────────────────────


def _make_mv_adapter():
    """Build a ParLDokAdapter configured like the Mecklenburg-Vorpommern (MV) instance."""
    return ParLDokAdapter(
        bundesland="MV",
        name="test MV",
        base_url="https://www.dokumentation.landtag-mv.de",
        wahlperiode=8,
        prefix="/parldok",
        document_typ="Antrag",
    )


SAMPLE_PARLDOK_HIT = {
    "id": 70748,
    "title": "Zweckentfremdung von Sondervermögen des Bundes beenden",
    "date": "18.03.2026",
    "prelink": "/dokument/70748",
    "link": "/dokument/70748#navpanes=0",
    "authorhtml": "FDP",
    "kind": "Drucksache",
    "type": "Antrag",
    "lp": 8,
    "number": "6409",
}


class TestParLDokAdapterHitMapping:
    """Mapping of a single ParlDok JSON hit dict onto a Drucksache."""

    def test_hit_to_drucksache_basic(self):
        adapter = _make_mv_adapter()
        d = adapter._hit_to_drucksache(SAMPLE_PARLDOK_HIT)
        assert d is not None
        assert d.drucksache == "8/6409"
        assert d.title == "Zweckentfremdung von Sondervermögen des Bundes beenden"
        assert d.datum == "2026-03-18"
        assert d.fraktionen == ["FDP"]
        assert d.typ == "Antrag"
        assert d.bundesland == "MV"

    def test_pdf_link_strips_navpanes_fragment_and_prepends_prefix(self):
        adapter = _make_mv_adapter()
        d = adapter._hit_to_drucksache(SAMPLE_PARLDOK_HIT)
        assert d.link == (
            "https://www.dokumentation.landtag-mv.de/parldok/dokument/70748"
        )
        assert "#navpanes" not in d.link

    def test_missing_lp_returns_none(self):
        adapter = _make_mv_adapter()
        # Work on a copy so the shared sample dict stays intact for other tests.
        hit = dict(SAMPLE_PARLDOK_HIT)
        del hit["lp"]
        assert adapter._hit_to_drucksache(hit) is None

    def test_mdl_with_party_in_parens(self):
        """MV often packs the MdL into authorhtml: 'Thomas X (AfD)'."""
        adapter = _make_mv_adapter()
        hit = dict(SAMPLE_PARLDOK_HIT, authorhtml="Thomas de Jesus Fernandes (AfD)")
        d = adapter._hit_to_drucksache(hit)
        assert "AfD" in d.fraktionen

    def test_landesregierung_detection(self):
        adapter = _make_mv_adapter()
        hit = dict(SAMPLE_PARLDOK_HIT, authorhtml="Ministerium der Finanzen")
        d = adapter._hit_to_drucksache(hit)
        assert "Landesregierung" in d.fraktionen


class TestParLDokFulltextIdSanitization:
    """Reverse-engineered from bundle.js pd.getFulltextId — must mirror exactly.

    Even though server-side fulltext is currently disabled (#18), the helper
    is kept around in code as documentation. If it ever gets re-activated,
    the sanitization must still match the SPA's behavior 1:1.
    """

    def test_simple_word_unchanged(self):
        assert ParLDokAdapter._fulltext_id("Schule") == "Schule"

    def test_whitespace_becomes_dash(self):
        assert ParLDokAdapter._fulltext_id("Klima Schutz") == "Klima-Schutz"

    def test_umlauts_become_dashes(self):
        # The JS regex is /[^a-zA-z0-9]/ — note the lowercase z, deliberate.
        # Umlauts are non-ASCII so they get replaced.
        assert ParLDokAdapter._fulltext_id("Bürger") == "B-rger"

    def test_punctuation_becomes_dashes(self):
        # The only punctuation in this input is already a dash, so the
        # expected output equals the input.
        assert ParLDokAdapter._fulltext_id("CO2-Emission") == "CO2-Emission"


# ─────────────────────────────────────────────────────────────────────────────
# Adapter registry sanity
# ─────────────────────────────────────────────────────────────────────────────


class TestAdapterRegistry:
    """Sanity checks on the module-level ADAPTERS mapping and get_adapter()."""

    def test_active_adapters_present(self):
        from app.parlamente import ADAPTERS

        for code in ["NRW", "MV", "BE", "LSA"]:
            assert code in ADAPTERS, f"missing adapter for {code}"

    def test_get_adapter_returns_none_for_unknown(self):
        from app.parlamente import get_adapter

        assert get_adapter("XX") is None

    def test_mv_adapter_is_parldok_instance(self):
        from app.parlamente import ADAPTERS

        assert isinstance(ADAPTERS["MV"], ParLDokAdapter)

    def test_be_adapter_is_portala_instance(self):
        from app.parlamente import ADAPTERS

        assert isinstance(ADAPTERS["BE"], PortalaAdapter)

    def test_lsa_adapter_is_portala_instance(self):
        from app.parlamente import ADAPTERS

        assert isinstance(ADAPTERS["LSA"], PortalaAdapter)


# ─────────────────────────────────────────────────────────────────────────────
# Bug #135 — NRW: empty query returns results (monitoring path)
# ─────────────────────────────────────────────────────────────────────────────

# Minimal OPAL HTML fixture with one valid Drucksache result
# NOTE(review): no HTML tags are present in this string — the markup appears to
# have been stripped; confirm against the repository copy.
_NRW_RESULT_HTML = """
  • Klimaschutz im Ruhrgebiet Download Antrag

    Urheber: SPD

  • """


def _make_nrw_doc():
    """Return the sample NRW Drucksache shared by the term-matching tests."""
    return Drucksache(
        drucksache="18/1234",
        title="Klimaschutz",
        fraktionen=["SPD"],
        datum="2026-04-15",
        link="https://example.com/x.pdf",
        bundesland="NRW",
        typ="Antrag",
    )


class TestNRWEmptyQueryMonitoringPath:
    """Regression: search("") must return ≥1 Drucksachen, not 0.

    Pre-fix: OPAL rejects empty dokNum and returns 0 hits. The adapter now
    substitutes the current year so OPAL returns recent documents.
    """

    def _make_mock_responses(self, html=_NRW_RESULT_HTML):
        """Returns two mock httpx.Response objects: initial GET + search POST."""
        initial = MagicMock()
        initial.status_code = 200
        # NOTE(review): only whitespace is left of this body — the original
        # markup seems to have been lost; confirm the intended page content.
        initial.text = '\n    '
        initial.cookies = {}
        search_resp = MagicMock()
        search_resp.status_code = 200
        search_resp.text = html
        return initial, search_resp

    def test_empty_query_uses_year_as_api_query(self):
        """_parse_query("") yields api_query="", but search() substitutes the year."""
        adapter = NRWAdapter()
        # Unpacking three values also pins the shape of the return tuple.
        api_q, _terms, _is_exact = adapter._parse_query("")
        # The adapter substitutes year inside search(), not in _parse_query — so
        # _parse_query itself still returns "". The substitution is tested via
        # _matches_all_terms below.
        assert api_q == ""

    def test_matches_all_terms_with_empty_terms_is_true(self):
        """With filter_terms=[""], every document matches (wildcard semantics)."""
        adapter = NRWAdapter()
        doc = _make_nrw_doc()
        assert adapter._matches_all_terms(doc, [""], is_exact=False) is True

    def test_matches_all_terms_with_wildcard_star_is_true(self):
        """filter_terms=["*"] is treated as match-all."""
        adapter = NRWAdapter()
        doc = _make_nrw_doc()
        assert adapter._matches_all_terms(doc, ["*"], is_exact=False) is True

    def test_matches_all_terms_with_real_term_filters_correctly(self):
        """Normal search terms still filter as before."""
        adapter = NRWAdapter()
        doc = _make_nrw_doc()
        assert adapter._matches_all_terms(doc, ["klimaschutz"], is_exact=False) is True
        assert adapter._matches_all_terms(doc, ["haushalt"], is_exact=False) is False


# ─────────────────────────────────────────────────────────────────────────────
# Bug #135 — SL: timeout must propagate, not be swallowed as []
# ─────────────────────────────────────────────────────────────────────────────


class TestSaarlandTimeoutPropagates:
    """Regression: ReadTimeout in _post_search must not be caught and returned as [].

    Pre-fix: except-block returned [] silently, making monitoring report
    errors='ok' instead of surfacing the failure.
    Post-fix: the except-block re-raises so callers can see the error.
    """

    def test_post_search_propagates_timeout(self):
        """A ReadTimeout from httpx must propagate out of _post_search."""
        import httpx
        from app.parlamente import SaarlandAdapter

        adapter = SaarlandAdapter()

        async def _run():
            mock_client = AsyncMock()
            mock_client.post.side_effect = httpx.ReadTimeout("timeout", request=None)
            await adapter._post_search(mock_client, "Schule")

        with pytest.raises(httpx.ReadTimeout):
            asyncio.run(_run())

    def test_post_search_propagates_connection_error(self):
        """Generic network errors also propagate."""
        import httpx
        from app.parlamente import SaarlandAdapter

        adapter = SaarlandAdapter()

        async def _run():
            mock_client = AsyncMock()
            mock_client.post.side_effect = httpx.ConnectError("refused")
            await adapter._post_search(mock_client, "Schule")

        with pytest.raises(httpx.ConnectError):
            asyncio.run(_run())


# ─────────────────────────────────────────────────────────────────────────────
# Bug #135 — NI: excluded from monitoring scan (login-protected portal)
# ─────────────────────────────────────────────────────────────────────────────


class TestNIMonitoringSkip:
    """NI is in _MONITORING_SKIP because NILAS requires login (#22).

    Unauthenticated requests return login-page HTML that the JSON-comment
    parser misreads as ~50 junk records. Until a valid HAR-Capture is
    available, NI must be excluded from daily_scan().
    """

    def test_ni_in_monitoring_skip_set(self):
        """NI must appear in the _MONITORING_SKIP constant."""
        from app.monitoring import _MONITORING_SKIP

        assert "NI" in _MONITORING_SKIP

    def test_daily_scan_skips_ni(self):
        """daily_scan() must not call the NI adapter at all."""
        # Stub heavy deps if not already present.
        # NOTE(review): the stubs stay registered in sys.modules after this
        # test finishes — confirm that later tests do not need the real modules.
        for mod in ("aiosqlite", "fitz"):
            if mod not in sys.modules:
                sys.modules[mod] = types.ModuleType(mod)

        from app.bundeslaender import Bundesland

        ni_adapter = MagicMock()
        ni_adapter.search = AsyncMock(return_value=[])
        fake_bls = [
            Bundesland(
                code="NI",
                name="NI",
                parlament_name="NI",
                wahlperiode=19,
                wahlperiode_start="2022-01-01",
                naechste_wahl=None,
                regierungsfraktionen=[],
                landtagsfraktionen=[],
                doku_system="Test",
                doku_base_url="http://example.com",
                drucksache_format="19/1234",
                dokukratie_scraper=None,
                aktiv=True,
            )
        ]

        # Imported only after the stubs above are in place.
        import app.monitoring as mon_mod
        import app.database as db_mod
        import app.parlamente as parl_mod

        # patch.object swaps the registry for the duration of the scan and
        # restores the previous value afterwards, even if daily_scan() raises.
        with (
            patch.object(parl_mod, "ADAPTERS", {"NI": ni_adapter}),
            patch("app.monitoring.aktive_bundeslaender", return_value=fake_bls),
            patch.object(db_mod, "upsert_monitoring_scan", new_callable=AsyncMock),
            patch.object(db_mod, "upsert_monitoring_summary", new_callable=AsyncMock),
        ):
            asyncio.run(mon_mod.daily_scan())

        ni_adapter.search.assert_not_called()


# ─────────────────────────────────────────────────────────────────────────────
# Fix #142 — SL: search() must propagate network errors (not swallow as [])
# ─────────────────────────────────────────────────────────────────────────────


def _make_async_context_client():
    """Return an AsyncMock that yields itself when used with ``async with``."""
    client = AsyncMock()
    client.__aenter__ = AsyncMock(return_value=client)
    client.__aexit__ = AsyncMock(return_value=False)
    return client


class TestSaarlandSearchPropagatesErrors:
    """Regression: network errors in SaarlandAdapter.search() must not be
    caught at the search()-level — they must propagate so the monitoring
    layer records them as errors instead of seeing ``seen=0 errors=None``."""

    def test_search_propagates_read_timeout(self):
        """ReadTimeout from _post_search must propagate out of search()."""
        import httpx
        from app.parlamente import SaarlandAdapter

        adapter = SaarlandAdapter()

        async def _run():
            mock_client = _make_async_context_client()
            mock_client.post.side_effect = httpx.ReadTimeout("timeout", request=None)
            with patch.object(adapter, "_make_client", return_value=mock_client):
                await adapter.search("Schule")

        with pytest.raises(httpx.ReadTimeout):
            asyncio.run(_run())

    def test_search_propagates_connect_error(self):
        """ConnectError from _post_search must propagate out of search()."""
        import httpx
        from app.parlamente import SaarlandAdapter

        adapter = SaarlandAdapter()

        async def _run():
            mock_client = _make_async_context_client()
            mock_client.post.side_effect = httpx.ConnectError("refused")
            with patch.object(adapter, "_make_client", return_value=mock_client):
                await adapter.search("Schule")

        with pytest.raises(httpx.ConnectError):
            asyncio.run(_run())

    def test_search_propagates_http_500(self):
        """HTTP 5xx response must NOT be silently turned into empty results
        (regression #142): a 500 from the Umbraco backend used to log+return [],
        hiding it from the monitoring summary."""
        import httpx
        from app.parlamente import SaarlandAdapter

        adapter = SaarlandAdapter()

        async def _run():
            mock_client = _make_async_context_client()
            mock_resp = MagicMock()
            mock_resp.status_code = 500
            mock_resp.text = "Server Error"
            mock_resp.request = MagicMock()
            mock_client.post = AsyncMock(return_value=mock_resp)
            with patch.object(adapter, "_make_client", return_value=mock_client):
                await adapter.search("Schule")

        with pytest.raises(httpx.HTTPStatusError):
            asyncio.run(_run())