241 lines
11 KiB
Python
241 lines
11 KiB
Python
|
|
"""Sub-Issue A — Live Adapter Tests gegen die echten Landtag-Backends.
|
|||
|
|
|
|||
|
|
Pro aktivem Bundesland aus ``aktive_bundeslaender()`` werden die folgenden
|
|||
|
|
Eigenschaften geprüft:
|
|||
|
|
|
|||
|
|
1. Reachability — ``adapter.search("", limit=5)`` läuft erfolgreich durch
|
|||
|
|
2. Result-Anzahl > 0 (0 Treffer ist Indikator für Schema-Drift)
|
|||
|
|
3. Drucksachen-ID-Format ``\\d+/\\d+`` (optionaler Suffix ``(neu)`` ist erlaubt)
|
|||
|
|
4. Type-Filter — kein Result hat einen ``typ``, der eindeutig kein Antrag
|
|||
|
|
ist (Substring-Match auf "Antrag" weil TH "Antrag gemäß § 79 GO" nutzt)
|
|||
|
|
5. Datum-Plausibilität — wenn gesetzt, dann zwischen ``wahlperiode_start``
|
|||
|
|
und heute
|
|||
|
|
6. Fraktionen-Plausibilität — falls gesetzt, müssen sie in
|
|||
|
|
``landtagsfraktionen ∪ {"Landesregierung"}``
|
|||
|
|
liegen
|
|||
|
|
7. PDF-Link erreichbar (markiert als ``slow``)
|
|||
|
|
|
|||
|
|
Bug-Klassen aus den letzten Sessions, die diese Datei abdeckt:
|
|||
|
|
- 2 (LSA WEV01-vs-WEV06 Format-Drift)
|
|||
|
|
- 6 (TH composite type "Antrag gemäß § 79 GO")
|
|||
|
|
- 7 (HE Card-Layout — sobald HE wieder im aktiven Set ist)
|
|||
|
|
- 8 (NI Login-Page → xfail)
|
|||
|
|
- 13 (Datum leer trotz BE-Format mit "vom")
|
|||
|
|
- 16 (Pagination liefert 0 Anträge)
|
|||
|
|
- 18 (PDF-Download-Link kaputt)
|
|||
|
|
|
|||
|
|
Issue: #51 (Sub-Issue A des Umbrella #50)
|
|||
|
|
"""
|
|||
|
|
import re
|
|||
|
|
from datetime import date
|
|||
|
|
|
|||
|
|
import httpx
|
|||
|
|
import pytest
|
|||
|
|
|
|||
|
|
from app.bundeslaender import BUNDESLAENDER, aktive_bundeslaender
|
|||
|
|
from app.parlamente import ADAPTERS, Drucksache
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Module-level pytest marker: tags every test in this file as ``integration``.
pytestmark = pytest.mark.integration
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|||
|
|
# Setup
|
|||
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
# One parametrize entry per active Bundesland, so every state shows up under
# its own id in the pytest report. NI is wrapped in a non-strict xfail because
# its nilas/portal backend sits behind a login (deferred, see issue #22).
_ACTIVE_CODES = [land.code for land in aktive_bundeslaender()]

# Shared xfail mark for the login-protected NI backend.
_NI_XFAIL = pytest.mark.xfail(
    reason="nilas.niedersachsen.de/portal/ ist Login-protected, deferred (Issue #22)",
    strict=False,
)

_BL_PARAMS = [
    code if code != "NI" else pytest.param(code, marks=_NI_XFAIL)
    for code in _ACTIVE_CODES
]
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Permissive positive marker for a hit's ``typ``. An exact match would reject
# TH, whose types read like "Antrag gemäß § 79 GO", so the lower-cased
# substring "antrag" is the pragmatic invariant.
# NOTE(review): no test visible in this file reads this constant — the type
# test relies on the forbidden list below only.
_ACCEPTABLE_TYP_SUBSTRING = "antrag"

# Lower-cased ``typ`` fragments that clearly identify a non-Antrag document.
# Any hit carrying one of them means the adapter's type filter has drifted.
_FORBIDDEN_TYP_SUBSTRINGS = (
    # parliamentary questions (both spellings of "große")
    "kleine anfrage",
    "große anfrage",
    "grosse anfrage",
    # plenary / committee material
    "plenarprotokoll",
    "sitzung",
    "ausschussvorlage",
    "beschlussempfehlung",
    # gazettes and agendas
    "gesetz- und verordnungsblatt",
    "tagesordnung",
)

# Originator labels accepted for every Bundesland, independent of election
# dates, in addition to that state's own ``landtagsfraktionen``.
# "Landesregierung" is synthetic (produced by _normalize_fraktion).
_UNIVERSAL_FRAKTIONEN = {"Landesregierung"}
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|||
|
|
# 1. Reachability + 2. Result-Anzahl
|
|||
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
|
|||
|
|
@pytest.mark.parametrize("code", _BL_PARAMS, ids=lambda c: c)
async def test_adapter_search_reachable(code: str):
    """Check that an unfiltered ``search('', limit=5)`` succeeds and is non-empty.

    The call must not raise, must return a ``list``, and that list must hold
    at least one hit. An empty answer is treated as the strongest sign of
    schema drift — e.g. the Landtag changed its backend HTML structure or
    moved the endpoint.
    """
    adapter = ADAPTERS[code]
    hits = await adapter.search("", limit=5)

    assert isinstance(hits, list)

    adapter_name = type(adapter).__name__
    # ``hits`` is a list at this point, so truthiness equals "has >= 1 entry".
    assert hits, (
        f"{code} adapter ({adapter_name}) returned 0 hits for an unfiltered "
        "browse — likely schema-drift in the live backend"
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|||
|
|
# 3. Drucksachen-ID-Format
|
|||
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
# Canonical Drucksachen number: "<wp>/<num>", optionally followed by "(neu)".
_RE_DRUCKSACHE_ID = re.compile(r"^\d+/\d+(?:\(neu\))?$")


@pytest.mark.parametrize("code", _BL_PARAMS, ids=lambda c: c)
async def test_drucksache_id_format(code: str):
    """Check that every hit's Drucksachen number has the ``<wp>/<num>`` shape.

    Example of a valid number: ``8/6390``. A trailing ``(neu)``, which some
    adapters use to flag re-issued documents, is accepted as well. An empty
    result list passes trivially.
    """
    hits = await ADAPTERS[code].search("", limit=10)

    invalid = []
    for hit in hits:
        number = hit.drucksache
        if _RE_DRUCKSACHE_ID.match(number) is None:
            invalid.append(number)

    assert not invalid, (
        f"{code}: Drucksachen-IDs verletzen das ``<wp>/<num>``-Format: {invalid}"
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|||
|
|
# 4. Type-Filter-Wirksamkeit
|
|||
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
|
|||
|
|
@pytest.mark.parametrize("code", _BL_PARAMS, ids=lambda c: c)
async def test_type_filter_returns_only_antraege(code: str):
    """Check that no hit carries a ``typ`` that is clearly not an Antrag.

    Only the negative list is enforced: a hit is reported when its lower-cased
    ``typ`` contains any entry of ``_FORBIDDEN_TYP_SUBSTRINGS``. A missing
    ``typ`` is treated as an empty string and therefore never reported.
    Composite types such as TH's "Antrag gemäß § 79 GO" pass because they
    contain none of the forbidden fragments.
    """
    hits = await ADAPTERS[code].search("", limit=10)

    bad: list[tuple[str, str]] = [
        (hit.drucksache, hit.typ)
        for hit in hits
        if any(
            marker in (hit.typ or "").lower()
            for marker in _FORBIDDEN_TYP_SUBSTRINGS
        )
    ]

    assert not bad, f"{code}: hit list contains non-Antrag entries: {bad}"
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|||
|
|
# 5. Datum-Plausibilität
|
|||
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
|
|||
|
|
@pytest.mark.parametrize("code", _BL_PARAMS, ids=lambda c: c)
async def test_datum_within_wahlperiode_window(code: str):
    """Check that every dated hit falls between ``wahlperiode_start`` and today.

    Hits without a ``datum`` are skipped, since some adapters legitimately
    cannot always extract one. The bounds are compared with plain ``<=``
    against today's ISO string.
    NOTE(review): this presumably relies on ``datum`` and
    ``wahlperiode_start`` both being ISO ``YYYY-MM-DD`` strings — confirm.
    """
    adapter = ADAPTERS[code]
    window_start = BUNDESLAENDER[code].wahlperiode_start
    window_end = date.today().isoformat()

    hits = await adapter.search("", limit=10)

    bad: list[str] = [
        f"{hit.drucksache} datum={hit.datum} not in [{window_start}..{window_end}]"
        for hit in hits
        if hit.datum and not (window_start <= hit.datum <= window_end)
    ]

    assert not bad, (
        f"{code}: implausible Drucksachen-Datümer: " + "; ".join(bad)
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|||
|
|
# 6. Fraktionen-Plausibilität
|
|||
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
|
|||
|
|
@pytest.mark.parametrize("code", _BL_PARAMS, ids=lambda c: c)
async def test_fraktionen_in_landtag(code: str):
    """Check that every listed Fraktion is known for this Bundesland.

    Allowed values are the state's ``landtagsfraktionen`` plus the entries of
    ``_UNIVERSAL_FRAKTIONEN``. Hits without any Fraktionen are skipped.
    """
    adapter = ADAPTERS[code]
    allowed = set(BUNDESLAENDER[code].landtagsfraktionen) | _UNIVERSAL_FRAKTIONEN

    hits = await adapter.search("", limit=10)

    bad: list[tuple[str, list[str]]] = []
    for hit in hits:
        # A falsy ``fraktionen`` (None or empty) yields no strangers -> skipped.
        strangers = [name for name in hit.fraktionen or () if name not in allowed]
        if strangers:
            bad.append((hit.drucksache, strangers))

    assert not bad, (
        f"{code}: unknown Fraktionen in hit list (allowed={sorted(allowed)}): {bad}"
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|||
|
|
# 7. PDF-Link erreichbar (slow)
|
|||
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
|
|||
|
|
@pytest.mark.slow
@pytest.mark.parametrize("code", _BL_PARAMS, ids=lambda c: c)
async def test_first_result_pdf_link_reachable(code: str):
    """HEAD-probe the first hit's PDF link and check the status code.

    Redirects are deliberately not followed, so the server must answer with
    200 or with one of the redirect codes 301, 302, 303, 307 or 308. The
    307/308 codes are accepted alongside the older ones because they are
    equally valid ways of pointing at the real file; rejecting them would
    fail the probe for a working link.

    Fails if the adapter returns no hit or if the first hit has no link.
    """
    adapter = ADAPTERS[code]
    results = await adapter.search("", limit=1)
    assert len(results) > 0, f"{code}: no hit to probe"

    link = results[0].link
    assert link, f"{code}: first hit has no link"

    # 200 = direct hit; the rest are redirects we do not follow.
    acceptable_statuses = (200, 301, 302, 303, 307, 308)

    async with httpx.AsyncClient(
        timeout=30,
        follow_redirects=False,
        headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer-Test"},
    ) as client:
        resp = await client.head(link)

    assert resp.status_code in acceptable_statuses, (
        f"{code}: PDF link HEAD returned {resp.status_code}: {link}"
    )
|