From 2b9c0b29080a474d7d5429a272eeca7b56e08caa Mon Sep 17 00:00:00 2001 From: Dotty Dotter Date: Wed, 8 Apr 2026 08:19:48 +0200 Subject: [PATCH] =?UTF-8?q?Activate=20Mecklenburg-Vorpommern=20(ParlDok)?= =?UTF-8?q?=20=E2=80=94=20search-only=20MVP=20(#4)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- app/bundeslaender.py | 13 +- app/parlamente.py | 412 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 417 insertions(+), 8 deletions(-) diff --git a/app/bundeslaender.py b/app/bundeslaender.py index 374a7cc..74edb42 100644 --- a/app/bundeslaender.py +++ b/app/bundeslaender.py @@ -204,7 +204,18 @@ BUNDESLAENDER: dict[str, Bundesland] = { doku_base_url="https://www.dokumentation.landtag-mv.de", drucksache_format="8/1234", dokukratie_scraper="mv", - anmerkung="Wahltag offiziell auf 20.09.2026 festgelegt.", + aktiv=True, + anmerkung=( + "ParlDok 8.3.5 (J3S GmbH) — moderne SPA, JSON-API unter " + "/parldok/Fulltext/Search. ParLDokAdapter (eigene Implementierung, " + "nicht portala-kompatibel). Die in dokukratie/mv.yml beschriebene " + "Legacy-HTML-Form (parldok/formalkriterien) ist mit dem 8.x-Upgrade " + "deprecated. Suche filtert via facet_lp=10/id=8 server-seitig auf " + "WP8, type=Antrag wird client-seitig gefiltert. Wahlprogramme zur " + "LTW 26.09.2021 sind noch nicht indexiert (Folge-Issue) — Analyse " + "läuft daher mit Grundsatzprogramm-Zitaten als Fallback. Wahltag " + "offiziell auf 20.09.2026 festgelegt." 
+ ), ), "NI": Bundesland( code="NI", diff --git a/app/parlamente.py b/app/parlamente.py index f46dcf9..db56f0d 100644 --- a/app/parlamente.py +++ b/app/parlamente.py @@ -1,5 +1,7 @@ """Parliament search adapters for different German states.""" +import json +import logging import httpx import re from abc import ABC, abstractmethod @@ -7,6 +9,8 @@ from dataclasses import dataclass from typing import Optional from bs4 import BeautifulSoup +logger = logging.getLogger(__name__) + @dataclass class Drucksache: @@ -743,13 +747,13 @@ class PortalaAdapter(ParlamentAdapter): headers={"Referer": browse_html}, ) if resp.status_code != 200: - print(f"{self.bundesland} search HTTP {resp.status_code}") + logger.error("%s search HTTP %s", self.bundesland, resp.status_code) return [] data = resp.json() report_id = data.get("report_id") if not report_id: - print(f"{self.bundesland}: no report_id in response: {data}") + logger.error("%s: no report_id in response: %s", self.bundesland, data) return [] # Step 3: fetch the HTML hit list @@ -761,14 +765,14 @@ class PortalaAdapter(ParlamentAdapter): headers={"Referer": browse_html}, ) if report_resp.status_code != 200: - print(f"{self.bundesland} report HTTP {report_resp.status_code}") + logger.error("%s report HTTP %s", self.bundesland, report_resp.status_code) return [] results = self._parse_hit_list_html(report_resp.text, query_filter=query) return results[:limit] - except Exception as e: - print(f"{self.bundesland} search error: {e}") + except Exception: + logger.exception("%s search error", self.bundesland) return [] async def get_document(self, drucksache: str) -> Optional[Drucksache]: @@ -806,8 +810,394 @@ class PortalaAdapter(ParlamentAdapter): text += page.get_text() pdf.close() return text - except Exception as e: - print(f"{self.bundesland} download error for {drucksache}: {e}") + except Exception: + logger.exception("%s download error for %s", self.bundesland, drucksache) + return None + + +class 
class ParLDokAdapter(ParlamentAdapter):
    """Adapter for ParlDok 8.x parliament documentation systems (J3S GmbH).

    ParlDok is a proprietary parliament documentation product by J3S GmbH
    (https://www.j3s.de). Different from the portala/eUI framework used by
    LSA/BE: ParlDok 8.x is a single-page app whose backend is a JSON API
    rooted at ``{base_url}{prefix}/Fulltext/...``. The legacy ParLDok 5.x
    HTML POST form (``parldok/formalkriterien``) used by dokukratie's MV
    YAML scraper has been deprecated by the LandtagMV upgrade to 8.3.5.

    Confirmed instances using this engine (April 2026):

    - **MV** (Mecklenburg-Vorpommern) — ``dokumentation.landtag-mv.de/parldok``
    - HH, SN, TH all advertise ParlDok in dokukratie but their actual
      versions/themes have not been verified yet.

    Search workflow:

    1. ``GET {base_url}{prefix}/`` to obtain the session cookie. The
       backend rejects POSTs without it.
    2. ``POST {base_url}{prefix}/Fulltext/Search`` with form-encoded
       ``data=<json>`` payload. The JSON carries a ``tags`` array of
       facet selections; each tag is ``{"type": <facet type>,
       "id": <facet value>}``. Reverse-engineered facet type constants
       from the bundle.js (``pd.facet_*``):

       - ``facet_fraction = 2``
       - ``facet_kind = 7`` (Drucksache, Plenarprotokoll, …)
       - ``facet_type = 8`` (Antrag, Gesetzentwurf, Kleine Anfrage, …)
       - ``facet_lp = 10`` (Wahlperiode)

       Response is JSON ``{success, data: <stringified JSON>}`` where the
       inner ``data`` carries ``{count, docs: [{id, title, date,
       authorhtml, kind, type, lp, number, link, ...}], ...}``.

    3. PDF download: ``GET {base_url}{prefix}/dokument/{numeric_id}``.
       Returns ``application/pdf`` directly. The ``link`` field returned
       by the search API already contains the path fragment
       ``/dokument/<id>#navpanes=0`` — strip the fragment and prepend
       the configured ``prefix``.

    Drucksachen-Nummer is reconstructed as ``f"{lp}/{number}"`` from the
    search hit. Full-text search is *not* implemented in this MVP — the
    backend supports it via ``facet_fulltext = 0`` tags but the public
    LP-only filter already returns the relevant Antrag pool. ``query``
    is applied as a client-side title/Urheber filter.
    """

    # Reverse-engineered facet type constants from bundle.js (pd.facet_*).
    FACET_FRACTION = 2
    FACET_KIND = 7
    FACET_TYPE = 8
    FACET_LP = 10

    # ParlDok 8.x caps Length per request at 100 — paginate if needed.
    PAGE_SIZE = 100
    # Safety bound for search(): scan at most 10 pages × 100 = 1000 most
    # recent docs. Anträge are ~3% of all hits in MV, so 1000 raw → ~30
    # Anträge, more than enough for the typical UI request (limit 5..20).
    # Filtered queries that find nothing in the last 1000 docs return
    # empty rather than scan the entire WP.
    MAX_PAGES = 10
    # Deeper bound for get_document(): a by-number lookup must be able to
    # reach *old* Drucksachen, not only the most recent 1000 that the
    # search() bound covers. 100 pages × 100 = 10_000 docs covers the
    # typical 2k–10k Drucksachen per Wahlperiode.
    LOOKUP_MAX_PAGES = 100

    def __init__(
        self,
        *,
        bundesland: str,
        name: str,
        base_url: str,
        wahlperiode: int,
        prefix: str = "/parldok",
        document_typ: str = "Antrag",
    ) -> None:
        """Configure a ParlDok 8.x adapter for one specific parliament.

        Args:
            bundesland: state code, e.g. ``"MV"``.
            name: human-readable label.
            base_url: ``https://...`` host root, no trailing slash.
            wahlperiode: current legislative period — fed into the
                ``facet_lp`` tag of the search payload.
            prefix: app prefix where ParlDok lives. ``/parldok`` for MV.
            document_typ: client-side filter on the ``type`` field of
                each hit ("Antrag", "Gesetzentwurf", …). Set to empty
                string to disable type filtering.
        """
        self.bundesland = bundesland
        self.name = name
        self.base_url = base_url.rstrip("/")
        # Normalize to exactly one leading slash, no trailing slash.
        self.prefix = "/" + prefix.strip("/")
        self.wahlperiode = wahlperiode
        self.document_typ = document_typ

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        """DD.MM.YYYY → YYYY-MM-DD; '' for empty or unparseable input."""
        if not datum_de:
            return ""
        try:
            d, m, y = datum_de.split(".")
            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
        except ValueError:
            # Wrong number of dot-separated parts — treat as "no date".
            return ""

    @staticmethod
    def _normalize_fraktion(authorhtml: str) -> list[str]:
        """Map ParlDok ``authorhtml`` to canonical fraction codes.

        ``authorhtml`` may be a comma-separated list of fractions
        ("CDU, SPD, F.D.P."), a single MdL with party in parens
        ("Thomas de Jesus Fernandes (AfD)") or empty (Landesregierung).
        Returns every fraction code detected, in a fixed canonical order.
        """
        if not authorhtml:
            return []
        u = authorhtml.upper()
        out: list[str] = []
        if re.search(r"\bBÜNDNIS\s*90\b", u) or re.search(r"\bGR(?:Ü|UE)NE\b", u):
            out.append("GRÜNE")
        if re.search(r"\bCDU\b", u):
            out.append("CDU")
        if re.search(r"\bSPD\b", u):
            out.append("SPD")
        # F.D.P. (with dots, historical) and FDP both occur in MV
        if re.search(r"\bF\.?\s*D\.?\s*P\.?\b", u):
            out.append("FDP")
        if re.search(r"\bAFD\b", u):
            out.append("AfD")
        if re.search(r"\bLINKE\b", u) or re.search(r"\bLL/PDS\b", u):
            out.append("LINKE")
        if re.search(r"\bBSW\b", u):
            out.append("BSW")
        if re.search(r"LANDESREGIERUNG|MINISTER\b|STAATSKANZLEI|MINISTERPRÄSIDENT", u):
            out.append("Landesregierung")
        return out

    def _build_search_body(self, *, length: int = 100) -> dict:
        """Build the JSON payload for the initial ``Fulltext/Search`` call.

        Filters by Wahlperiode only. Type/kind filtering happens
        client-side because the facet_type/facet_kind value IDs are
        instance-specific and would require an extra ``Fulltext/Filter``
        round trip to discover. Pagination beyond the first page goes
        through ``Fulltext/Resultpage`` (see ``_result_page``); the
        ``Search`` endpoint itself ignores any non-zero ``Start``.
        """
        return {
            "devicekey": "",
            "max": length,
            "withfilter": False,
            # sort=2 → newest first (date desc); sort=1 is relevance.
            "sort": 2,
            "topk": length,
            "llm": 0,
            "newdocsearch": False,
            "limit": {"Start": 0, "Length": length},
            "tags": [{"type": self.FACET_LP, "id": self.wahlperiode}],
            "updateFilters": [],
        }

    def _hit_to_drucksache(self, hit: dict) -> Optional[Drucksache]:
        """Convert one ParlDok JSON hit to a Drucksache. None if unusable."""
        lp = hit.get("lp")
        number = hit.get("number")
        if not lp or not number:
            return None

        link_field = hit.get("link") or hit.get("prelink") or ""
        # Strip "#navpanes=0" fragment and prepend the prefix.
        path = link_field.split("#", 1)[0]
        pdf_url = f"{self.base_url}{self.prefix}{path}" if path else ""

        return Drucksache(
            drucksache=f"{lp}/{number}",
            title=hit.get("title", ""),
            fraktionen=self._normalize_fraktion(hit.get("authorhtml", "")),
            datum=self._datum_de_to_iso(hit.get("date", "")),
            link=pdf_url,
            bundesland=self.bundesland,
            typ=hit.get("type", "") or hit.get("kind", ""),
        )

    async def _post_json(
        self, client: httpx.AsyncClient, endpoint: str, payload: dict,
    ) -> Optional[dict]:
        """POST a JSON-stringified payload to a ParlDok endpoint.

        ``endpoint`` is the path tail (e.g. ``"Fulltext/Search"`` or
        ``"Fulltext/Resultpage"``). Returns the inner JSON object
        (already parsed from the stringified ``data`` field), or None
        on HTTP error, API-level failure, or any transport exception.
        """
        homepage = f"{self.base_url}{self.prefix}/"
        url = f"{self.base_url}{self.prefix}/{endpoint}"
        try:
            resp = await client.post(
                url,
                data={"data": json.dumps(payload, ensure_ascii=False)},
                headers={
                    "X-Requested-With": "XMLHttpRequest",
                    "Referer": homepage,
                },
            )
            if resp.status_code != 200:
                logger.error(
                    "%s %s HTTP %s",
                    self.bundesland, endpoint, resp.status_code,
                )
                return None
            outer = resp.json()
            if not outer.get("success"):
                logger.error(
                    "%s %s not successful: %s",
                    self.bundesland, endpoint, outer.get("message"),
                )
                return None
            # The useful payload is double-encoded: a JSON string inside
            # the outer JSON envelope.
            return json.loads(outer["data"])
        except Exception:
            logger.exception("%s ParlDok %s error", self.bundesland, endpoint)
            return None

    async def _initial_search(
        self, client: httpx.AsyncClient, *, length: int,
    ) -> tuple[Optional[int], list[dict]]:
        """Run the initial ``Fulltext/Search`` and return ``(queryid, docs)``.

        The ``queryid`` is needed for subsequent ``Fulltext/Resultpage``
        calls. ParlDok ignores any non-zero ``Start`` on this endpoint —
        the first 100 hits are the only ones reachable via ``Search``.
        """
        body = self._build_search_body(length=length)
        inner = await self._post_json(client, "Fulltext/Search", body)
        if not inner:
            return None, []
        return inner.get("queryid"), (inner.get("docs") or [])

    async def _result_page(
        self, client: httpx.AsyncClient, *, queryid: int, start: int, length: int,
    ) -> list[dict]:
        """Fetch a further result page via ``Fulltext/Resultpage``."""
        payload = {
            "devicekey": "",
            "queryid": queryid,
            "limit": {"Start": start, "Length": length},
        }
        inner = await self._post_json(client, "Fulltext/Resultpage", payload)
        if not inner:
            return []
        return inner.get("docs") or []

    def _make_client(self) -> httpx.AsyncClient:
        """Create the HTTP client used for search/lookup requests."""
        return httpx.AsyncClient(
            timeout=30,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        )

    async def _paginated_hits(
        self, client: httpx.AsyncClient, *, max_pages: Optional[int] = None,
    ):
        """Async iterator over raw hit dicts across result pages.

        Yields raw hit dicts in newest-first order. The first batch comes
        from ``Fulltext/Search``, subsequent batches from
        ``Fulltext/Resultpage`` using the queryid the server returned for
        the initial call. Stops when a page comes back empty, undersized,
        or after the page bound is reached.

        Args:
            max_pages: total page bound (including the initial ``Search``
                page). ``None`` uses ``MAX_PAGES`` — the shallow bound
                appropriate for ``search()``. ``get_document()`` passes
                ``LOOKUP_MAX_PAGES`` so old Drucksachen stay reachable.
        """
        bound = self.MAX_PAGES if max_pages is None else max_pages
        queryid, hits = await self._initial_search(client, length=self.PAGE_SIZE)
        for hit in hits:
            yield hit
        # No queryid (API error) or undersized first page → nothing more.
        if not queryid or len(hits) < self.PAGE_SIZE:
            return

        for page in range(1, bound):
            page_hits = await self._result_page(
                client,
                queryid=queryid,
                start=page * self.PAGE_SIZE,
                length=self.PAGE_SIZE,
            )
            if not page_hits:
                return
            for hit in page_hits:
                yield hit
            if len(page_hits) < self.PAGE_SIZE:
                return

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search recent documents of the configured Wahlperiode.

        ``query`` is a client-side filter on title + Urheber. The server
        returns the configured WP sorted newest first; the client keeps
        only ``Antrag``-typed Drucksachen and applies the title filter.

        Pagination: ParlDok caps each ``Fulltext/Search`` response at 100
        rows. Only ~3% of MV hits are real Anträge (most are Kleine
        Anfragen + Protokolle), so we may need several pages to fill
        ``limit``. Scanning is bounded by ``MAX_PAGES``.
        """
        results: list[Drucksache] = []
        query_terms = [t for t in query.lower().split() if t] if query else []
        # ParlDok returns the same Drucksache multiple times when it
        # appears in several Vorgänge/Beratungen — dedupe by lp/number.
        seen: set[str] = set()

        async with self._make_client() as client:
            # Prime the session cookie; the backend rejects POSTs without it.
            await client.get(f"{self.base_url}{self.prefix}/")
            async for hit in self._paginated_hits(client):
                if hit.get("kind") != "Drucksache":
                    continue
                if self.document_typ and hit.get("type") != self.document_typ:
                    continue

                doc = self._hit_to_drucksache(hit)
                if not doc:
                    continue
                if doc.drucksache in seen:
                    continue
                seen.add(doc.drucksache)

                if query_terms:
                    hay = f"{doc.title} {hit.get('authorhtml', '')}".lower()
                    if not all(t in hay for t in query_terms):
                        continue

                results.append(doc)
                if len(results) >= limit:
                    return results

        return results

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up a single Drucksache by ``lp/number`` ID.

        Pragmatic MVP: page through the WP unfiltered until we find a
        match. ParlDok offers a ``facet_number`` (14) facet that would
        let us target the lookup directly, but the facet ID values are
        instance-specific (would require a ``Fulltext/Filter`` discovery
        call). Uses the deep ``LOOKUP_MAX_PAGES`` bound (10k docs) rather
        than search()'s shallow 1000-doc cap, so the typical 2k–10k
        Drucksachen per period are actually reachable.
        """
        wanted_lp, wanted_num = (drucksache.split("/", 1) + [""])[:2]
        if not wanted_num:
            return None

        async with self._make_client() as client:
            # Prime the session cookie; the backend rejects POSTs without it.
            await client.get(f"{self.base_url}{self.prefix}/")
            async for hit in self._paginated_hits(
                client, max_pages=self.LOOKUP_MAX_PAGES,
            ):
                if hit.get("kind") != "Drucksache":
                    continue
                if str(hit.get("lp")) == wanted_lp and str(hit.get("number")) == wanted_num:
                    return self._hit_to_drucksache(hit)
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its text.

        Returns None when the Drucksache is unknown, has no PDF link,
        the download fails, or the PDF cannot be parsed.
        """
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None

        # Separate client: PDFs warrant a longer timeout than API calls.
        async with httpx.AsyncClient(
            timeout=60,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    logger.error(
                        "%s PDF HTTP %s for %s (%s)",
                        self.bundesland, resp.status_code, drucksache, doc.link,
                    )
                    return None
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                text = ""
                for page in pdf:
                    text += page.get_text()
                pdf.close()
                return text
            except Exception:
                logger.exception("%s ParlDok download error for %s", self.bundesland, drucksache)
                return None