diff --git a/app/bundeslaender.py b/app/bundeslaender.py
index db6daa5..a45f8a9 100644
--- a/app/bundeslaender.py
+++ b/app/bundeslaender.py
@@ -80,10 +80,18 @@ BUNDESLAENDER: dict[str, Bundesland] = {
         doku_base_url="https://parlis.landtag-bw.de",
         drucksache_format="17/12345",
         dokukratie_scraper="bw",
+        aktiv=True,
         anmerkung=(
-            "Wahl zum 18. Landtag fand am 08.03.2026 statt; Koalitionsverhandlungen "
-            "GRÜNE+CDU laufen, Kabinett Kretschmann III geschäftsführend. Nach "
-            "Konstituierung des 18. LT ca. Mai 2026 müssen WP und Wahltermin aktualisiert werden."
+            "PARLIS auf parlis.landtag-bw.de läuft auf demselben "
+            "eUI-Backend wie LSA-PADOKA und BE-PARDOK, aber mit drei "
+            "Unterschieden: minimales lines-Schema (l1/l2/l3/l4), "
+            "asynchrones Polling (initial → search_id → poll → "
+            "report_id) und Hit-Records als JSON-in-HTML-Comments. "
+            "Eigene Adapter-Klasse PARLISAdapter (#29). Wahl zum 18. "
+            "Landtag fand am 08.03.2026 statt; Koalitionsverhandlungen "
+            "GRÜNE+CDU laufen, Kabinett Kretschmann III geschäftsführend. "
+            "Nach Konstituierung des 18. LT ca. Mai 2026 müssen WP und "
+            "Wahltermin aktualisiert werden."
         ),
     ),
     "BY": Bundesland(
diff --git a/app/parlamente.py b/app/parlamente.py
index 8ec3cc6..bb37e74 100644
--- a/app/parlamente.py
+++ b/app/parlamente.py
@@ -1256,22 +1256,408 @@ class BayernAdapter(ParlamentAdapter):
         return None
 
 
-class BWAdapter(ParlamentAdapter):
-    """Adapter for Baden-Württemberg Landtag."""
-
-    bundesland = "BW"
-    name = "Landtag Baden-Württemberg"
-    base_url = "https://www.landtag-bw.de"
-
+class PARLISAdapter(ParlamentAdapter):
+    """Adapter for Baden-Württemberg's PARLIS: the eUI/portala variant
+    with async polling and JSON-in-HTML-comment hit records.
+
+    PARLIS at ``parlis.landtag-bw.de`` technically runs on the same
+    eUI backend as LSA-PADOKA and BE-PARDOK, but with three important
+    differences that justify a dedicated class instead of a
+    PortalaAdapter subclass:
+
+    1. **Body schema:** Instead of the portala/LSA-style ``search.lines``
+       with ``2/3/4/10/11/20.x/90.x`` slots, PARLIS uses a much shorter
+       ``l1/l2/l3/l4`` schema (see ``dokukratie/scrapers/portala.query.bw.json``).
+       ``serverrecordname`` is ``"vorgang"`` instead of ``"sr_generic1"``,
+       ``format`` is ``"suchergebnis-vorgang-full"``, and ``sort`` is
+       ``"SORT01/D SORT02/D SORT03"``. There is no ``parsed`` and no
+       ``json`` tree; the server accepts the minimal schema directly.
+
+    2. **Async polling:** Unlike LSA/BE, the initial ``Fulltext/Search``
+       response only carries a ``search_id`` with ``status: "running"``
+       and NO ``report_id``. Only a second ``SearchAndDisplay`` request
+       with ``id: <search_id>`` (and without a ``search`` component)
+       returns the finished ``report_id``. In live tests a single
+       2-second sleep between the calls was enough.
+
+    3. **Hit format:** The ``report.tt.html`` response contains neither
+       Perl dump blocks (LSA) nor Bootstrap card divs (BE), but
+       **JSON records inside HTML comments**::
+
+           <!--{"EWBV22": [{"main": "Drucksache 17/10323"}], ...}-->
+
+       The parser pulls the raw comments out and maps the WMV/EWBV
+       fields onto the ``Drucksache`` dataclass.
+
+    Reverse-engineering sources: ``dokukratie/scrapers/portala.query.bw.json``
+    plus a live HAR against ``parlis.landtag-bw.de`` (issue #29).
+    """
+
+    # Reverse-engineered field map for the JSON records that come embedded
+    # in HTML comments inside report.tt.html responses.
+    #
+    # Records look like ``<!--{...}-->`` and may contain nested HTML
+    # highlight tags inside the JSON values.
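+    #
+    # Illustrative record (shortened; the field names follow the mapping
+    # documented in _hit_record_to_drucksache below):
+    #
+    #   <!--{"EWBV22": [{"main": "Drucksache 17/10323"}],
+    #        "WMV30": [{"main": "Felix Herkens (GRÜNE) u. a."}]}-->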
+    #
+    # The match is non-greedy against the literal closing ``}-->`` because
+    # that delimiter does not appear inside the JSON payload itself.
+    _RE_RECORD = re.compile(r"<!--(\{.*?\})-->", re.DOTALL)
+    _RE_DRUCKSACHE = re.compile(r"Drucksache\s+(\d+/\d+)")
+    _RE_DATUM = re.compile(r"(\d{1,2}\.\d{1,2}\.\d{4})")
+
+    def __init__(
+        self,
+        *,
+        bundesland: str,
+        name: str,
+        base_url: str,
+        wahlperiode: int,
+        prefix: str = "/parlis",
+        document_typ: str = "Antrag",
+        date_window_days: int = 730,
+        poll_attempts: int = 15,
+        poll_interval_seconds: float = 2.0,
+    ) -> None:
+        """Configure a PARLIS adapter for one specific parliament instance.
+
+        Args:
+            bundesland: state code, e.g. ``"BW"``.
+            name: human-readable label.
+            base_url: ``https://parlis.landtag-bw.de`` (no trailing slash).
+            wahlperiode: legislative period; feeds into ``lines.l1``.
+            prefix: app prefix where PARLIS lives; ``/parlis`` for BW.
+            document_typ: feeds into ``lines.l4``. The server interprets
+                this as a German document type label like ``"Antrag"``.
+            date_window_days: look-back window for the search date range;
+                a quick win that bounds the result set before the
+                client-side title filter, same approach as the
+                PortalaAdapter for LSA/BE.
+            poll_attempts: how many times to poll for ``report_id`` before
+                giving up; 15 attempts at 2 s each give a 30 s upper bound.
+            poll_interval_seconds: sleep between poll attempts.
+        """
+        self.bundesland = bundesland
+        self.name = name
+        self.base_url = base_url.rstrip("/")
+        self.prefix = "/" + prefix.strip("/")
+        self.wahlperiode = wahlperiode
+        self.document_typ = document_typ
+        self.date_window_days = date_window_days
+        self.poll_attempts = poll_attempts
+        self.poll_interval_seconds = poll_interval_seconds
+
+    @staticmethod
+    def _datum_de_to_iso(datum_de: str) -> str:
+        """DD.MM.YYYY → YYYY-MM-DD; '' for empty or malformed input."""
+        if not datum_de:
+            return ""
+        try:
+            d, m, y = datum_de.split(".")
+            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
+        except ValueError:
+            return ""
+
+    @staticmethod
+    def _normalize_fraktion(text: str) -> list[str]:
+        """Map a free-text Urheber line to canonical Fraktion codes.
+
+        PARLIS packs the originator into ``EWBV23`` like
+        ``"Antrag Felix Herkens (GRÜNE), Saskia Frank (GRÜNE), ... 16.03.2026"``,
+        i.e. multiple MdLs with their party in parentheses, comma-separated.
+        Same logic as ``ParLDokAdapter._normalize_fraktion`` (#46 fixed
+        the MINISTER/MINISTERIUM regex there too).
+        """
+        if not text:
+            return []
+        u = text.upper()
+        out: list[str] = []
+        if re.search(r"\bBÜNDNIS\s*90\b", u) or re.search(r"\bGR(?:Ü|UE)NE\b", u):
+            out.append("GRÜNE")
+        if re.search(r"\bCDU\b", u):
+            out.append("CDU")
+        if re.search(r"\bSPD\b", u):
+            out.append("SPD")
+        if re.search(r"\bF\.?\s*D\.?\s*P\.?\b", u):
+            out.append("FDP")
+        if re.search(r"\bAFD\b", u):
+            out.append("AfD")
+        if re.search(r"\bLINKE\b", u):
+            out.append("LINKE")
+        if re.search(r"\bBSW\b", u):
+            out.append("BSW")
+        if re.search(r"LANDESREGIERUNG|\bMINISTER|STAATSKANZLEI|MINISTERPRÄSIDENT", u):
+            out.append("Landesregierung")
+        return out
+
+    def _build_initial_body(self, start_date: str, end_date: str) -> dict:
+        """Build the first ``SearchAndDisplay`` body with the search component.
+
+        The schema follows ``dokukratie/scrapers/portala.query.bw.json``
+        verbatim; only the placeholder values are substituted.
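+
+        For example (values illustrative, mirroring what ``search`` passes
+        in), ``wahlperiode=17`` with a window from 2024-03-20 to 2026-03-20
+        yields ``lines = {"l1": "17", "l2": "2024-03-20",
+        "l3": "2026-03-20", "l4": "Antrag"}``.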
+ """ + return { + "action": "SearchAndDisplay", + "report": { + "rhl": "main", + "rhlmode": "add", + "format": "suchergebnis-vorgang-full", + "mime": "html", + "sort": "SORT01/D SORT02/D SORT03", + }, + "search": { + "lines": { + "l1": str(self.wahlperiode), + "l2": start_date, + "l3": end_date, + "l4": self.document_typ, + }, + "serverrecordname": "vorgang", + }, + "sources": ["Star"], + } + + def _build_poll_body(self, search_id: str) -> dict: + """Build the polling body — same action, but with the search_id + instead of a fresh search component.""" + return { + "action": "SearchAndDisplay", + "report": { + "rhl": "main", + "rhlmode": "add", + "format": "suchergebnis-vorgang-full", + "mime": "html", + "sort": "SORT01/D SORT02/D SORT03", + }, + "id": search_id, + "sources": ["Star"], + } + + def _hit_record_to_drucksache(self, record: dict) -> Optional[Drucksache]: + """Map a single JSON-in-comment record to a ``Drucksache``. + + PARLIS-record schema (reverse-engineered, all values are arrays + of ``{"main": ...}`` dicts): + + - ``EWBV22``: "Drucksache 17/10323" + - ``EWBD05``: direct PDF URL + - ``EWBV23``: "Antrag " — single combined line + - ``WMV30``: short Urheber summary ("Felix Herkens (GRÜNE) u. a.") + - ``WMV33``: subject keywords (Schlagworte) + - ``EWBD01``: "Drucksache " + """ + def first(field: str) -> str: + block = record.get(field) + if isinstance(block, list) and block: + return (block[0].get("main") or "").strip() + return "" + + ds_text = first("EWBV22") or first("EWBD01") + m_ds = self._RE_DRUCKSACHE.search(ds_text) + if not m_ds: + return None + drucksache = m_ds.group(1) + + # The "title" we want is the Schlagworte/topic, not the + # Drucksachen-Header. PARLIS keeps the human-readable subject + # in WMV33 (Schlagworte joined by semicolons) — that's the + # closest equivalent to "title" the LSA/BE adapters expose. + # Fallback to the EWBV23 line if WMV33 is empty. + schlagworte = first("WMV33") + # Strip embedded ... 
+        schlagworte_clean = re.sub(r"</?[^>]+>", "", schlagworte).strip()
+        title = schlagworte_clean or first("EWBV23") or f"Drucksache {drucksache}"
+
+        # Date + Urheber out of EWBV23 ("Antrag <Urheber, ...> <Datum>")
+        ewbv23 = first("EWBV23")
+        m_dat = self._RE_DATUM.search(ewbv23)
+        datum_iso = self._datum_de_to_iso(m_dat.group(1) if m_dat else "")
+        urheber_short = first("WMV30")
+        fraktionen = self._normalize_fraktion(urheber_short or ewbv23)
+
+        pdf_url = first("EWBD05")
+
+        return Drucksache(
+            drucksache=drucksache,
+            title=title,
+            fraktionen=fraktionen,
+            datum=datum_iso,
+            link=pdf_url,
+            bundesland=self.bundesland,
+            typ=self.document_typ,
+        )
+
+    async def _initial_search_and_poll(
+        self, client: httpx.AsyncClient, start_date: str, end_date: str,
+    ) -> Optional[str]:
+        """Run the initial search, then poll until the ``report_id`` arrives."""
+        import asyncio
+
+        browse_html = f"{self.base_url}{self.prefix}/browse.tt.html"
+        browse_json = f"{self.base_url}{self.prefix}/browse.tt.json"
+
+        # Step 1: warm up session cookies
+        await client.get(browse_html)
+
+        # Step 2: initial search
+        try:
+            resp = await client.post(
+                browse_json,
+                json=self._build_initial_body(start_date, end_date),
+                headers={"Referer": browse_html},
+            )
+        except Exception:
+            logger.exception("%s initial search request error", self.bundesland)
+            return None
+        if resp.status_code != 200:
+            logger.error("%s initial search HTTP %s", self.bundesland, resp.status_code)
+            return None
+        data = resp.json()
+        if data.get("report_id"):
+            return data["report_id"]
+        search_id = data.get("search_id")
+        if not search_id:
+            logger.error("%s no search_id in initial response: %s", self.bundesland, data)
+            return None
+
+        # Step 3: poll until report_id appears or we run out of attempts
+        for _ in range(self.poll_attempts):
+            await asyncio.sleep(self.poll_interval_seconds)
+            try:
+                resp = await client.post(
+                    browse_json,
+                    json=self._build_poll_body(search_id),
+                    headers={"Referer": browse_html},
+                )
+            except Exception:
+                logger.exception("%s poll request error", self.bundesland)
+                return None
+            if resp.status_code != 200:
+                logger.error("%s poll HTTP %s", self.bundesland, resp.status_code)
+                return None
+            data = resp.json()
+            if data.get("report_id"):
+                return data["report_id"]
+            star = data.get("sources", {}).get("Star", {})
+            if star.get("status") == "stopped":
+                # Search finished without producing a report: empty result
+                return None
+
+        logger.warning("%s gave up polling after %d attempts", self.bundesland, self.poll_attempts)
+        return None
+
+    def _parse_report_html(self, html: str) -> list[Drucksache]:
+        """Extract Drucksachen from a report.tt.html response.
+
+        Records are JSON objects embedded in HTML comments. We pull each
+        comment block via regex, parse it as JSON, and map the WMV/EWBV
+        fields to a Drucksache.
+        """
+        results: list[Drucksache] = []
+        for m in self._RE_RECORD.finditer(html):
+            json_text = m.group(1)
+            try:
+                record = json.loads(json_text)
+            except json.JSONDecodeError:
+                continue
+            doc = self._hit_record_to_drucksache(record)
+            if doc:
+                results.append(doc)
+        return results
+
     async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
-        # TODO: Implement BW search
-        return []
-
+        """Search recent BW Anträge with an optional client-side title filter.
+
+        Server-side full text is not used (#18: uniform behaviour without
+        full text until every adapter supports it). The client-side
+        filter looks at the title (Schlagworte) plus Urheber.
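+
+        Example (hypothetical call)::
+
+            await ADAPTERS["BW"].search("Photovoltaik", limit=5)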
+ """ + from datetime import date, timedelta + + end = date.today() + start = end - timedelta(days=self.date_window_days) + + async with httpx.AsyncClient( + timeout=60, + follow_redirects=True, + headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"}, + ) as client: + try: + report_id = await self._initial_search_and_poll( + client, start.isoformat(), end.isoformat(), + ) + if not report_id: + return [] + + # Pull a generous chunk so the client-side filter has + # enough material to work with. + chunksize = max(limit * 10, 200) if query else max(limit * 2, 50) + report_url = ( + f"{self.base_url}{self.prefix}/report.tt.html" + f"?report_id={report_id}&start=0&chunksize={chunksize}" + ) + resp = await client.get( + report_url, + headers={"Referer": f"{self.base_url}{self.prefix}/browse.tt.html"}, + ) + if resp.status_code != 200: + logger.error("%s report HTTP %s", self.bundesland, resp.status_code) + return [] + + results = self._parse_report_html(resp.text) + except Exception: + logger.exception("%s search error", self.bundesland) + return [] + + # Client-side filter + if query: + terms = [t.lower() for t in query.split() if t] + results = [ + d for d in results + if all(t in f"{d.title} {' '.join(d.fraktionen)}".lower() for t in terms) + ] + return results[:limit] + async def get_document(self, drucksache: str) -> Optional[Drucksache]: + """Look up a single Drucksache by ID via a broad browse.""" + results = await self.search(query="", limit=200) + for doc in results: + if doc.drucksache == drucksache: + return doc return None - + async def download_text(self, drucksache: str) -> Optional[str]: - return None + """Download the PDF for a Drucksache and extract its text.""" + import fitz # PyMuPDF + + doc = await self.get_document(drucksache) + if not doc or not doc.link: + return None + + async with httpx.AsyncClient( + timeout=60, + follow_redirects=True, + headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"}, + ) as client: + try: + resp = await client.get(doc.link) + if resp.status_code != 200: + logger.error( + "%s PDF HTTP %s for %s (%s)", + self.bundesland, resp.status_code, drucksache, doc.link, + ) + return None + pdf = fitz.open(stream=resp.content, filetype="pdf") + text = "" + for page in pdf: + text += page.get_text() + pdf.close() + return text + except Exception: + logger.exception("%s PDF download error for %s", self.bundesland, drucksache) + return None # Registry of adapters @@ -1315,7 +1701,14 @@ ADAPTERS = { document_typ="Antrag", ), "BY": BayernAdapter(), - "BW": BWAdapter(), + "BW": PARLISAdapter( + bundesland="BW", + name="Landtag von Baden-Württemberg (PARLIS)", + base_url="https://parlis.landtag-bw.de", + wahlperiode=17, + prefix="/parlis", + document_typ="Antrag", + ), } diff --git a/tests/test_bundeslaender.py b/tests/test_bundeslaender.py index 0660b8f..f9e53c6 100644 --- a/tests/test_bundeslaender.py +++ b/tests/test_bundeslaender.py @@ -24,10 +24,14 @@ class TestRegistryStructure: class TestActiveBundeslaender: - def test_four_active_bundeslaender(self): - active = aktive_bundeslaender() - codes = {bl.code for bl in active} - assert codes == {"NRW", "LSA", "MV", "BE"} + def test_active_bundeslaender_include_phase_1_set(self): + """At least the original four (NRW, LSA, MV, BE) plus any + Phase-1 additions (BW after #29) must be active. 
 
     def test_alle_bundeslaender_returns_all_sixteen(self):
         assert len(alle_bundeslaender()) == 16