"""Sub-Issue A — Live Adapter Tests gegen die echten Landtag-Backends. Pro aktivem Bundesland aus ``aktive_bundeslaender()`` werden die folgenden Eigenschaften geprüft: 1. Reachability — ``adapter.search("", limit=5)`` läuft erfolgreich durch 2. Result-Anzahl > 0 (0 Treffer ist Indikator für Schema-Drift) 3. Drucksachen-ID-Format ``\\d+/\\d+`` 4. Type-Filter — kein Result hat einen ``typ``, der eindeutig kein Antrag ist (Substring-Match auf "Antrag" weil TH "Antrag gemäß § 79 GO" nutzt) 5. Datum-Plausibilität — wenn gesetzt, dann zwischen ``wahlperiode_start`` und heute 6. Fraktionen-Plausibilität — falls gesetzt, müssen sie in ``landtagsfraktionen ∪ {"Landesregierung", "BSW", "FREIE WÄHLER", "SSW"}`` liegen 7. PDF-Link erreichbar (markiert als ``slow``) Bug-Klassen aus den letzten Sessions, die diese Datei abdeckt: - 2 (LSA WEV01-vs-WEV06 Format-Drift) - 6 (TH composite type "Antrag gemäß § 79 GO") - 7 (HE Card-Layout — sobald HE wieder im aktiven Set ist) - 8 (NI Login-Page → xfail) - 13 (Datum leer trotz BE-Format mit "vom") - 16 (Pagination liefert 0 Anträge) - 18 (PDF-Download-Link kaputt) Issue: #51 (Sub-Issue A des Umbrella #50) """ import re from datetime import date import httpx import pytest from app.bundeslaender import BUNDESLAENDER, aktive_bundeslaender from app.parlamente import ADAPTERS, Drucksache pytestmark = pytest.mark.integration # ───────────────────────────────────────────────────────────────────────────── # Setup # ───────────────────────────────────────────────────────────────────────────── # All currently active state codes, parametrised so each BL appears as its # own test entry in the pytest output. NI is xfailed because nilas/portal # is login-protected (see issue #22 for the deferred state). 
_ACTIVE_CODES = [land.code for land in aktive_bundeslaender()]

# xfail marker for NI: the portal is login-protected, so a failure is
# expected; strict=False means an unexpected pass is tolerated.
_NI_LOGIN_XFAIL = pytest.mark.xfail(
    reason="nilas.niedersachsen.de/portal/ ist Login-protected, deferred (Issue #22)",
    strict=False,
)


def _as_param(state_code):
    """Return *state_code* as a parametrize entry, xfail-marked for NI."""
    if state_code == "NI":
        return pytest.param(state_code, marks=_NI_LOGIN_XFAIL)
    return state_code


_BL_PARAMS = [_as_param(state_code) for state_code in _ACTIVE_CODES]

# Whitelist of acceptable hit-typ values. A strict match would fail TH because
# its types look like "Antrag gemäß § 79 GO". The substring "antrag" is the
# pragmatic invariant. The blacklist below is the explicit anti-marker.
# NOTE(review): this constant is not referenced by the tests visible in this
# file — only the blacklist is asserted.
_ACCEPTABLE_TYP_SUBSTRING = "antrag"

# Hits with these typ substrings are clearly NOT Anträge — if any of these
# appears in the result list, the type filter has drifted.
_FORBIDDEN_TYP_SUBSTRINGS = (
    "kleine anfrage",
    "große anfrage",
    "grosse anfrage",
    "plenarprotokoll",
    "sitzung",
    "ausschussvorlage",
    "beschlussempfehlung",
    "gesetz- und verordnungsblatt",
    "tagesordnung",
)

# Election-date-independent whitelist of Fraktion names that may appear in
# any active Bundesland's hit list, on top of the BL-specific
# landtagsfraktionen.
_UNIVERSAL_FRAKTIONEN = {
    "Landesregierung",  # synthetic from _normalize_fraktion
}


# ─────────────────────────────────────────────────────────────────────────────
# 1. Reachability + 2. result count
# ─────────────────────────────────────────────────────────────────────────────


@pytest.mark.parametrize("code", _BL_PARAMS, ids=lambda value: value)
async def test_adapter_search_reachable(code: str):
    """An unfiltered ``search('', limit=5)`` must return a non-empty list.

    The call must not raise. Zero hits is treated as the strongest sign of
    schema drift, e.g. when a Landtag changes its backend HTML structure or
    moves its endpoint.
    """
    backend = ADAPTERS[code]
    hits = await backend.search("", limit=5)

    assert isinstance(hits, list)
    assert hits, (
        f"{code} adapter ({type(backend).__name__}) returned 0 hits for "
        "an unfiltered browse — likely schema-drift in the live backend"
    )


# ─────────────────────────────────────────────────────────────────────────────
# 3. Drucksachen-ID format
# ─────────────────────────────────────────────────────────────────────────────

# digits "/" digits, optionally followed by the literal suffix "(neu)".
_RE_DRUCKSACHE_ID = re.compile(r"^\d+/\d+(?:\(neu\))?$")


@pytest.mark.parametrize("code", _BL_PARAMS, ids=lambda value: value)
async def test_drucksache_id_format(code: str):
    """Every hit's Drucksache number must match ``_RE_DRUCKSACHE_ID``.

    The accepted shape is ``digits/digits`` (e.g. ``8/6390``). Some adapters
    annotate re-issued documents with ``(neu)`` — that is allowed too.
    """
    backend = ADAPTERS[code]
    hits = await backend.search("", limit=10)

    offending = []
    for hit in hits:
        if _RE_DRUCKSACHE_ID.match(hit.drucksache) is None:
            offending.append(hit.drucksache)

    assert not offending, (
        f"{code}: Drucksachen-IDs verletzen das ``/``-Format: {offending}"
    )


# ─────────────────────────────────────────────────────────────────────────────
# 4. Type filter effectiveness
# ─────────────────────────────────────────────────────────────────────────────


@pytest.mark.parametrize("code", _BL_PARAMS, ids=lambda value: value)
async def test_type_filter_returns_only_antraege(code: str):
    """No hit may carry a ``typ`` that is clearly NOT an Antrag.

    Only the blacklist is enforced here: a hit is reported once if its
    lower-cased ``typ`` contains any entry of ``_FORBIDDEN_TYP_SUBSTRINGS``.
    A missing ``typ`` is treated as an empty string and therefore passes.
    """
    backend = ADAPTERS[code]
    hits = await backend.search("", limit=10)

    offenders = [
        (hit.drucksache, hit.typ)
        for hit in hits
        if any(
            marker in (hit.typ or "").lower()
            for marker in _FORBIDDEN_TYP_SUBSTRINGS
        )
    ]

    assert not offenders, (
        f"{code}: hit list contains non-Antrag entries: {offenders}"
    )


# ─────────────────────────────────────────────────────────────────────────────
# 5. Date plausibility
# ─────────────────────────────────────────────────────────────────────────────


@pytest.mark.parametrize("code", _BL_PARAMS, ids=lambda value: value)
async def test_datum_within_wahlperiode_window(code: str):
    """A hit's ``datum``, if present, must lie in the Wahlperiode window.

    The window runs from the state's ``wahlperiode_start`` to today's ISO
    date, both inclusive. Hits with an empty ``datum`` are skipped (some
    adapters legitimately can't always extract one).
    """
    backend = ADAPTERS[code]
    window_start = BUNDESLAENDER[code].wahlperiode_start
    # The bounds are compared directly against ``datum`` — presumably both
    # are ISO date strings, so lexicographic order equals date order.
    window_end = date.today().isoformat()

    hits = await backend.search("", limit=10)

    problems = [
        f"{hit.drucksache} datum={hit.datum} not in [{window_start}..{window_end}]"
        for hit in hits
        if hit.datum and not (window_start <= hit.datum <= window_end)
    ]

    assert not problems, (
        f"{code}: implausible Drucksachen-Datümer: " + "; ".join(problems)
    )
# ─────────────────────────────────────────────────────────────────────────────
# 6. Fraktionen plausibility
# ─────────────────────────────────────────────────────────────────────────────


@pytest.mark.parametrize("code", _BL_PARAMS, ids=lambda value: value)
async def test_fraktionen_in_landtag(code: str):
    """Every Fraktion named on a hit must be known for this Bundesland.

    Known means: listed in the state's ``landtagsfraktionen`` or in
    ``_UNIVERSAL_FRAKTIONEN``. Hits without Fraktionen are not checked.
    """
    backend = ADAPTERS[code]
    permitted = set(BUNDESLAENDER[code].landtagsfraktionen) | _UNIVERSAL_FRAKTIONEN

    hits = await backend.search("", limit=10)

    offenders: list[tuple[str, list[str]]] = []
    for hit in hits:
        # An empty/missing fraktionen value yields no unknown names.
        strangers = [
            name for name in (hit.fraktionen or ()) if name not in permitted
        ]
        if strangers:
            offenders.append((hit.drucksache, strangers))

    assert not offenders, (
        f"{code}: unknown Fraktionen in hit list (allowed={sorted(permitted)}): {offenders}"
    )


# ─────────────────────────────────────────────────────────────────────────────
# 7. PDF link reachable (slow)
# ─────────────────────────────────────────────────────────────────────────────


@pytest.mark.slow
@pytest.mark.parametrize("code", _BL_PARAMS, ids=lambda value: value)
async def test_first_result_pdf_link_reachable(code: str):
    """HEAD-probe the first hit's PDF link.

    Redirects are not followed; the server must answer 200, 301, 302 or 303
    (a redirect is taken to point at a real file).
    """
    backend = ADAPTERS[code]
    hits = await backend.search("", limit=1)
    assert len(hits) > 0, f"{code}: no hit to probe"

    pdf_url = hits[0].link
    assert pdf_url, f"{code}: first hit has no link"

    client_options = {
        "timeout": 30,
        "follow_redirects": False,
        "headers": {"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer-Test"},
    }
    async with httpx.AsyncClient(**client_options) as client:
        response = await client.head(pdf_url)

    assert response.status_code in (200, 301, 302, 303), (
        f"{code}: PDF link HEAD returned {response.status_code}: {pdf_url}"
    )