Zentrale `app/parteien.py` als Single Source of Truth für die Partei-Auflösung:

- `PARTEIEN`-Tabelle mit kanonischem Key, langem Display-Namen, allen bekannten Aliasen, optionalem `bundesland_scope` und Government-Marker. 14 Einträge (CDU, CSU, SPD, GRÜNE, FDP, LINKE, AfD, BSW, SSW, BiW + die Freie-Wähler-Familie BVB-FW, FW-BAYERN, FW-SL und der generische FREIE WÄHLER-Eintrag).
- `normalize_partei(raw, *, bundesland=None)` für Single-String-Lookups mit Government-Vorrang und FW-Familien-Disambiguierung.
- `extract_fraktionen(text, *, bundesland=None)` als Funnel für die vier alten Adapter-Helper. Kommagetrennte Listen, MdL-mit-Klammerpartei, HTML-Reste — alles fließt durch eine Stelle, mit BL-Scope-Filter (SSW nur in SH, BVB-FW nur in BB, etc.).
- `display_name(canonical, *, long=False)` für UI/PDF — kurze Form bleibt der kanonische Key, lange Form ist "BÜNDNIS 90/DIE GRÜNEN" statt "GRÜNE" etc.

Adapter-Migration in `app/parlamente.py`:

- Vier nahezu identische `_normalize_fraktion()`-Methoden in PortalaAdapter, ParLDokAdapter, StarFinderCGIAdapter, PARLISAdapter durch einen einzeiligen Shim ersetzt, der `extract_fraktionen` mit `self.bundesland` aufruft. ~120 Zeilen Duplikation entfernt.
- `@staticmethod` aufgehoben, weil wir jetzt `self.bundesland` für die FW-Disambiguierung brauchen — alle Aufrufer waren bereits `self._...`, also keine Call-Site-Änderung nötig.

`app/embeddings.py:496` Workaround-Hack entfernt:

- `partei.upper() if partei != "GRÜNE" else "GRÜNE"` durch zentralen `normalize_partei()`-Aufruf ersetzt — der Hack war ein Anzeichen dafür, dass die Partei-Schreibweise irgendwo zwischen Adapter und Embedding-Lookup driften konnte. Mit dem Mapper ist die Schreibweise überall garantiert kanonisch.
Tests:

- Neue `tests/test_parteien.py` mit 52 Cases — Single-Lookup, FW-Disambiguierung (BVB/Bayern/Saarland/RP), Volltext-Extraktion, Government-Marker, Tabellen-Konsistenz.
- `tests/test_parlamente.py` Test-Klasse umgeschrieben: statt der 6 statischen `PortalaAdapter._normalize_fraktion(...)`-Tests jetzt 4 Roundtrip-Tests über echte Adapter-Instanzen, inkl. expliziter BB→BVB-FW vs. RP→FREIE WÄHLER-Verifikation.

157 Unit-Tests grün (105 alt + 52 neu). Backwards-kompatibel — die kanonischen Keys sind exakt die in der DB stehenden Strings, kein Migrations-Schritt nötig.

Refs: #55, #59 (Phase B)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
343 lines
15 KiB
Python
343 lines
15 KiB
Python
"""Tests for parlamente.py adapter parsers — pure functions over fixture HTML.
|
|
|
|
Reproduces the three regression scenarios from the 2026-04-08 adapter session:

1. PortalaAdapter `_parse_hit_list_cards` had a `doctype` vs. `doctype_full`
   NameError that was hot-fixed live on the prod server (commit 1cb030a).
2. ParLDokAdapter `_hit_to_drucksache` needs to map ParlDok 8.x JSON hit
   dicts to Drucksache objects without losing Fraktion (parliamentary group)
   or date info.
3. PortalaAdapter `_normalize_fraktion` and the same-named ParLDokAdapter
   method must yield canonical Fraktion codes for both comma-lists and
   embedded "MdL (Partei)" patterns.
|
|
"""
|
|
from app.parlamente import ParLDokAdapter, PortalaAdapter, NRWAdapter, Drucksache
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# PortalaAdapter — Berlin-style HTML cards
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
# Berlin-style hit-list HTML used by the card-parser tests: one unrelated
# prelude <div> followed by two "record-card" entries. Each card carries its
# title in an <h3 class="h5"> and the type line (document type, optional
# party list, PDF link, Drucksache number, date) in a <span class="h6">.
# The second card packs "CDU, SPD" into that type line.
# NOTE(review): the bare `|` lines inside the literal look like extraction
# artifacts rather than fixture content — confirm against the upstream file.
BE_CARD_FIXTURE = """
|
|
<div class="other-prelude">ignored</div>
|
|
<div class="record-card efxRecordRepeater">
|
|
<h3 class="h5"><span>Schwimmstatistik für die dritten Klassen der Berliner Schulen</span></h3>
|
|
<span class="h6">Antrag (Eilantrag) <a href="/files/drs19-3104.pdf">Drucksache 19/3104</a> S. 1 bis 24 vom 31.03.2026</span>
|
|
</div>
|
|
<div class="record-card efxRecordRepeater">
|
|
<h3 class="h5"><span>Klimaneutrales Bauen im Bestand</span></h3>
|
|
<span class="h6">Antrag CDU, SPD <a href="/files/drs19-3107.pdf">Drucksache 19/3107</a> vom 02.04.2026</span>
|
|
</div>
|
|
"""
|
|
|
|
|
|
def _make_be_adapter():
    """Return a fresh PortalaAdapter set up with the Berlin ("BE") test config.

    No document type is pinned (``document_type=None``).
    """
    berlin_config = {
        "bundesland": "BE",
        "name": "test BE",
        "base_url": "https://pardok.parlament-berlin.de",
        "db_id": "lah.lissh",
        "wahlperiode": 19,
        "portala_path": "/portala",
        "document_type": None,
    }
    return PortalaAdapter(**berlin_config)
|
|
|
|
|
|
class TestPortalaAdapterCardParser:
    """Card-format parsing of the Berlin fixture.

    Guards against the doctype/doctype_full NameError (hot-fix 1cb030a).
    """

    @staticmethod
    def _cards(query_filter=""):
        """Parse BE_CARD_FIXTURE with a fresh Berlin adapter and return the hits."""
        return _make_be_adapter()._parse_hit_list_cards(
            BE_CARD_FIXTURE, query_filter=query_filter
        )

    def test_parses_two_cards_without_nameerror(self):
        """Smoke test: parsing must not raise and must yield both cards.

        Before the fix the parser referenced an undefined ``doctype`` name
        (since renamed to ``doctype_full``) while building the query-filter
        haystack. The filtered call path is exercised separately in
        ``test_query_filter_uses_doctype_full_not_doctype``.
        """
        assert len(self._cards()) == 2

    def test_first_card_extracts_drucksache_and_title(self):
        """Card 1 yields number, title, ISO date and the adapter's Bundesland."""
        first = self._cards()[0]
        assert first.drucksache == "19/3104"
        assert "Schwimmstatistik" in first.title
        assert first.datum == "2026-03-31"
        assert first.bundesland == "BE"

    def test_second_card_extracts_fraktionen_from_h6(self):
        """Card 2 has "CDU, SPD" inside the type line; both must be split out."""
        second = self._cards()[1]
        assert second.drucksache == "19/3107"
        assert second.fraktionen == ["CDU", "SPD"]
        # Once the parties are extracted, only the bare document type remains.
        assert second.typ.strip() == "Antrag"

    def test_pdf_link_is_absolute_url(self):
        """The relative href of card 1 is resolved against the adapter base URL."""
        hits = self._cards()
        assert hits[0].link.startswith("https://pardok.parlament-berlin.de/")
        assert hits[0].link.endswith(".pdf")

    def test_query_filter_uses_doctype_full_not_doctype(self):
        """Regression: a non-empty query filter must not raise NameError.

        Before the fix, passing any query hit the undefined ``doctype`` name.
        """
        # "Schwimm" occurs only in the title of card 1 (card 2 is "Klimaneutrales ...").
        matches = self._cards(query_filter="Schwimm")
        assert len(matches) == 1
        assert matches[0].drucksache == "19/3104"
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# PortalaAdapter — LSA-style Perl-Dump records
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
# LSA-style hit list: two <pre> blocks, each holding one Perl-dump record
# (`$VAR1 = {...}`). In each record 'WEV06' carries the title and 'WEV32'
# carries a 'main' line (type, author/parties, date, Drucksache number) plus
# a relative PDF path under key '5'. The first record spells "Bündnis 90/Die
# Grünen" with \x{fc} hex escapes; `\\x` in this non-raw literal yields a
# single backslash at runtime.
# NOTE(review): the bare `|` lines inside the literal look like extraction
# artifacts rather than fixture content — confirm against the upstream file.
LSA_DUMP_FIXTURE = """
|
|
<pre>$VAR1 = {
|
|
'WEV06' => [{ 'main' => 'Demokratie beginnt im Klassenzimmer' }],
|
|
'WEV32' => [{
|
|
'main' => 'Antrag B\\x{fc}ndnis 90/Die Gr\\x{fc}nen 06.03.2026 Drucksache <b>8/6726</b> ...',
|
|
'5' => 'drs/wp8/drs/d6726lan.pdf'
|
|
}]
|
|
}</pre>
|
|
<pre>$VAR1 = {
|
|
'WEV06' => [{ 'main' => 'Andere Drucksache ohne Schul-Bezug' }],
|
|
'WEV32' => [{
|
|
'main' => 'Antrag CDU, SPD 14.01.2026 Drucksache <b>8/6171</b> ...',
|
|
'5' => 'drs/wp8/drs/d6171lan.pdf'
|
|
}]
|
|
}</pre>
|
|
"""
|
|
|
|
|
|
def _make_lsa_adapter():
    """Return a fresh PortalaAdapter set up with the Sachsen-Anhalt ("LSA") test config.

    The document type is pinned to "Antrag".
    """
    lsa_config = {
        "bundesland": "LSA",
        "name": "test LSA",
        "base_url": "https://padoka.landtag.sachsen-anhalt.de",
        "db_id": "lsa.lissh",
        "wahlperiode": 8,
        "portala_path": "/portal",
        "document_type": "Antrag",
    }
    return PortalaAdapter(**lsa_config)
|
|
|
|
|
|
class TestPortalaAdapterDumpParser:
    """Perl-dump parsing of the LSA fixture via `_parse_hit_list_dump`."""

    @staticmethod
    def _records(query_filter=""):
        """Parse LSA_DUMP_FIXTURE with a fresh LSA adapter and return the hits."""
        return _make_lsa_adapter()._parse_hit_list_dump(
            LSA_DUMP_FIXTURE, query_filter=query_filter
        )

    def test_parses_two_dump_records(self):
        """Both <pre> records of the fixture become a hit."""
        assert len(self._records()) == 2

    def test_extracts_drucksache_from_perl_dump(self):
        """Drucksache numbers are read from the 'main' line, in fixture order."""
        first, second = self._records()[:2]
        assert first.drucksache == "8/6726"
        assert second.drucksache == "8/6171"

    def test_decodes_perl_hex_escapes_in_urheber(self):
        r"""The first record spells the author with \x{fc} (ü) escapes in WEV32."""
        first = self._records()[0]
        # "Bündnis 90/Die Grünen" must be recognised as GRÜNE after decoding.
        assert "GRÜNE" in first.fraktionen

    def test_extracts_date_iso(self):
        """German dd.mm.yyyy dates come back as ISO yyyy-mm-dd strings."""
        hits = self._records()
        assert hits[0].datum == "2026-03-06"
        assert hits[1].datum == "2026-01-14"

    def test_pdf_url_uses_pdf_url_prefix(self):
        """The relative path from key '5' is expected under <base_url>/files/."""
        expected_link = (
            "https://padoka.landtag.sachsen-anhalt.de/files/drs/wp8/drs/d6726lan.pdf"
        )
        assert self._records()[0].link == expected_link

    def test_client_side_query_filter(self):
        """A query filter keeps only the record whose title contains the term."""
        matches = self._records(query_filter="Demokratie")
        assert len(matches) == 1
        assert matches[0].drucksache == "8/6726"
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# PortalaAdapter — Auto-detection between dump and card formats
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
class TestPortalaAdapterAutoDetect:
    """Feeds both fixture formats through the generic `_parse_hit_list_html`.

    Each test only checks that the two records of the fixture come back.
    """

    def test_dump_html_routes_to_dump_parser(self):
        """The Perl-dump fixture yields its two records via the generic entry point."""
        hits = _make_lsa_adapter()._parse_hit_list_html(
            LSA_DUMP_FIXTURE, query_filter=""
        )
        assert len(hits) == 2

    def test_card_html_routes_to_card_parser(self):
        """The card fixture yields its two records via the generic entry point."""
        hits = _make_be_adapter()._parse_hit_list_html(
            BE_CARD_FIXTURE, query_filter=""
        )
        assert len(hits) == 2
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# Adapter._normalize_fraktion — Roundtrip-Test über eine echte Instanz
|
|
#
|
|
# Die ausführliche Pattern-Sammlung lebt nach #55 in tests/test_parteien.py.
|
|
# Hier verifizieren wir nur, dass der Adapter-Shim die zentrale Funktion
|
|
# tatsächlich aufruft und das Bundesland korrekt durchreicht.
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
class TestAdapterNormalizeFraktionRoundtrip:
    """Round-trip checks of `_normalize_fraktion` on real adapter instances.

    These tests only verify that the adapter method returns canonical codes
    and that the result depends on the adapter's ``bundesland``.
    """

    def test_portala_lsa_adapter_instance(self):
        """An LSA adapter maps plain and long-form party names to canonical codes."""
        adapter = _make_lsa_adapter()
        assert "CDU" in adapter._normalize_fraktion("CDU")
        assert adapter._normalize_fraktion("BÜNDNIS 90/DIE GRÜNEN") == ["GRÜNE"]

    def test_portala_be_adapter_instance(self):
        """A Berlin adapter reports "Senat von Berlin" as Landesregierung."""
        adapter = _make_be_adapter()
        out = adapter._normalize_fraktion("Senat von Berlin")
        assert "Landesregierung" in out

    def test_empty_string(self):
        """Empty input yields an empty list."""
        adapter = _make_lsa_adapter()
        assert adapter._normalize_fraktion("") == []

    def test_freie_waehler_disambiguates_by_adapter_bundesland(self):
        """The same "FREIE WÄHLER" string resolves differently for BB and RP."""
        # A BB adapter must resolve to BVB-FW, an RP adapter to FREIE WÄHLER.
        # This disambiguation is the actual added value of #55, exercised here
        # through the adapter. PortalaAdapter comes from the module-level
        # import; no function-local re-import is needed.
        bb = PortalaAdapter(
            bundesland="BB",
            name="test BB",
            base_url="https://www.parlamentsdokumentation.brandenburg.de",
            db_id="lap.lap8",
            wahlperiode=8,
            portala_path="/portal",
            document_type="Antrag",
        )
        rp = PortalaAdapter(
            bundesland="RP",
            name="test RP",
            base_url="https://opal.rlp.de",
            db_id="rlp.opal",
            wahlperiode=18,
            portala_path="/portal",
            document_type="Antrag",
        )
        assert bb._normalize_fraktion("FREIE WÄHLER") == ["BVB-FW"]
        assert rp._normalize_fraktion("FREIE WÄHLER") == ["FREIE WÄHLER"]
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# ParLDokAdapter — JSON hit dict → Drucksache mapping
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
def _make_mv_adapter():
    """Return a fresh ParLDokAdapter set up with the Mecklenburg-Vorpommern ("MV") test config."""
    mv_config = {
        "bundesland": "MV",
        "name": "test MV",
        "base_url": "https://www.dokumentation.landtag-mv.de",
        "wahlperiode": 8,
        "prefix": "/parldok",
        "document_typ": "Antrag",
    }
    return ParLDokAdapter(**mv_config)
|
|
|
|
|
|
# Sample ParlDok JSON hit dict fed to `_hit_to_drucksache` in the tests below.
# Those tests expect "lp" and "number" to combine into Drucksache "8/6409"
# and "authorhtml" to be the source of the Fraktion list.
SAMPLE_PARLDOK_HIT = {
    "id": 70748,
    "title": "Zweckentfremdung von Sondervermögen des Bundes beenden",
    "date": "18.03.2026",
    "prelink": "/dokument/70748",
    "link": "/dokument/70748#navpanes=0",
    "authorhtml": "FDP",
    "kind": "Drucksache",
    "type": "Antrag",
    "lp": 8,
    "number": "6409",
}
|
|
|
|
|
|
class TestParLDokAdapterHitMapping:
    """Mapping of ParlDok JSON hit dicts to Drucksache objects.

    Every test that inspects the mapped object first asserts it is not None.
    ``test_missing_lp_returns_none`` shows that `_hit_to_drucksache` can
    return None, and without the guard such a regression would surface as an
    AttributeError instead of a clear assertion failure.
    """

    def test_hit_to_drucksache_basic(self):
        """All core fields of the sample hit arrive on the Drucksache."""
        adapter = _make_mv_adapter()
        d = adapter._hit_to_drucksache(SAMPLE_PARLDOK_HIT)
        assert d is not None
        assert d.drucksache == "8/6409"
        assert d.title == "Zweckentfremdung von Sondervermögen des Bundes beenden"
        assert d.datum == "2026-03-18"
        assert d.fraktionen == ["FDP"]
        assert d.typ == "Antrag"
        assert d.bundesland == "MV"

    def test_pdf_link_strips_navpanes_fragment_and_prepends_prefix(self):
        """The link is base URL + adapter prefix + path, without the #navpanes fragment."""
        adapter = _make_mv_adapter()
        d = adapter._hit_to_drucksache(SAMPLE_PARLDOK_HIT)
        assert d is not None
        assert d.link == "https://www.dokumentation.landtag-mv.de/parldok/dokument/70748"
        assert "#navpanes" not in d.link

    def test_missing_lp_returns_none(self):
        """A hit without the "lp" key is rejected with None."""
        adapter = _make_mv_adapter()
        hit = dict(SAMPLE_PARLDOK_HIT)  # copy, so the shared sample stays intact
        del hit["lp"]
        assert adapter._hit_to_drucksache(hit) is None

    def test_mdl_with_party_in_parens(self):
        """MV often packs the MdL into authorhtml: 'Thomas X (AfD)'."""
        adapter = _make_mv_adapter()
        hit = dict(SAMPLE_PARLDOK_HIT, authorhtml="Thomas de Jesus Fernandes (AfD)")
        d = adapter._hit_to_drucksache(hit)
        assert d is not None
        assert "AfD" in d.fraktionen

    def test_landesregierung_detection(self):
        """A ministry as author is reported as Landesregierung."""
        adapter = _make_mv_adapter()
        hit = dict(SAMPLE_PARLDOK_HIT, authorhtml="Ministerium der Finanzen")
        d = adapter._hit_to_drucksache(hit)
        assert d is not None
        assert "Landesregierung" in d.fraktionen
|
|
|
|
|
|
class TestParLDokFulltextIdSanitization:
    """Reverse-engineered from bundle.js pd.getFulltextId — must mirror exactly.

    Even though server-side fulltext is currently disabled (#18), the helper
    is kept around in code as documentation. If it ever gets re-activated,
    the sanitization must still match the SPA's behavior 1:1.
    """

    def test_simple_word_unchanged(self):
        """Plain ASCII letters pass through untouched."""
        assert ParLDokAdapter._fulltext_id("Schule") == "Schule"

    def test_whitespace_becomes_dash(self):
        """A single space is replaced by a dash."""
        assert ParLDokAdapter._fulltext_id("Klima Schutz") == "Klima-Schutz"

    def test_umlauts_become_dashes(self):
        """Non-ASCII letters are replaced by a dash."""
        # The JS regex is /[^a-zA-z0-9]/ — note the lowercase z, deliberate.
        # Umlauts are non-ASCII so they get replaced.
        assert ParLDokAdapter._fulltext_id("Bürger") == "B-rger"

    def test_punctuation_becomes_dashes(self):
        """Punctuation outside the kept character ranges is replaced by a dash."""
        # A dash is itself outside the kept ranges and maps onto itself.
        assert ParLDokAdapter._fulltext_id("CO2-Emission") == "CO2-Emission"
        # The dash-only input above cannot tell "replaced" from "left alone",
        # so also use punctuation that must visibly change. Per the regex
        # quoted in test_umlauts_become_dashes, "." lies outside a-z, A-z
        # and 0-9.
        # NOTE(review): this case is inferred from that documented regex —
        # confirm against the `_fulltext_id` implementation.
        assert ParLDokAdapter._fulltext_id("CO2.Emission") == "CO2-Emission"
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# Adapter registry sanity
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
class TestAdapterRegistry:
    """Sanity checks on the `ADAPTERS` registry and `get_adapter` lookup."""

    def test_active_adapters_present(self):
        """Every Bundesland code the tests rely on is registered."""
        from app.parlamente import ADAPTERS

        expected_codes = ("NRW", "MV", "BE", "LSA")
        for code in expected_codes:
            assert code in ADAPTERS, f"missing adapter for {code}"

    def test_get_adapter_returns_none_for_unknown(self):
        """An unregistered code yields None."""
        from app.parlamente import get_adapter

        unknown = get_adapter("XX")
        assert unknown is None

    def test_mv_adapter_is_parldok_instance(self):
        """MV is registered as a ParLDokAdapter."""
        from app.parlamente import ADAPTERS

        mv_adapter = ADAPTERS["MV"]
        assert isinstance(mv_adapter, ParLDokAdapter)

    def test_be_adapter_is_portala_instance(self):
        """BE is registered as a PortalaAdapter."""
        from app.parlamente import ADAPTERS

        be_adapter = ADAPTERS["BE"]
        assert isinstance(be_adapter, PortalaAdapter)

    def test_lsa_adapter_is_portala_instance(self):
        """LSA is registered as a PortalaAdapter."""
        from app.parlamente import ADAPTERS

        lsa_adapter = ADAPTERS["LSA"]
        assert isinstance(lsa_adapter, PortalaAdapter)
|