Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dotty Dotter 2026-04-08 08:19:48 +02:00
parent 1cb030aab7
commit 2b9c0b2908
2 changed files with 417 additions and 8 deletions

View File

@ -204,7 +204,18 @@ BUNDESLAENDER: dict[str, Bundesland] = {
doku_base_url="https://www.dokumentation.landtag-mv.de", doku_base_url="https://www.dokumentation.landtag-mv.de",
drucksache_format="8/1234", drucksache_format="8/1234",
dokukratie_scraper="mv", dokukratie_scraper="mv",
anmerkung="Wahltag offiziell auf 20.09.2026 festgelegt.", aktiv=True,
anmerkung=(
"ParlDok 8.3.5 (J3S GmbH) — moderne SPA, JSON-API unter "
"/parldok/Fulltext/Search. ParLDokAdapter (eigene Implementierung, "
"nicht portala-kompatibel). Die in dokukratie/mv.yml beschriebene "
"Legacy-HTML-Form (parldok/formalkriterien) ist mit dem 8.x-Upgrade "
"deprecated. Suche filtert via facet_lp=10/id=8 server-seitig auf "
"WP8, type=Antrag wird client-seitig gefiltert. Wahlprogramme zur "
"LTW 26.09.2021 sind noch nicht indexiert (Folge-Issue) — Analyse "
"läuft daher mit Grundsatzprogramm-Zitaten als Fallback. Wahltag "
"offiziell auf 20.09.2026 festgelegt."
),
), ),
"NI": Bundesland( "NI": Bundesland(
code="NI", code="NI",

View File

@ -1,5 +1,7 @@
"""Parliament search adapters for different German states.""" """Parliament search adapters for different German states."""
import json
import logging
import httpx import httpx
import re import re
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
@ -7,6 +9,8 @@ from dataclasses import dataclass
from typing import Optional from typing import Optional
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
@dataclass @dataclass
class Drucksache: class Drucksache:
@ -743,13 +747,13 @@ class PortalaAdapter(ParlamentAdapter):
headers={"Referer": browse_html}, headers={"Referer": browse_html},
) )
if resp.status_code != 200: if resp.status_code != 200:
print(f"{self.bundesland} search HTTP {resp.status_code}") logger.error("%s search HTTP %s", self.bundesland, resp.status_code)
return [] return []
data = resp.json() data = resp.json()
report_id = data.get("report_id") report_id = data.get("report_id")
if not report_id: if not report_id:
print(f"{self.bundesland}: no report_id in response: {data}") logger.error("%s: no report_id in response: %s", self.bundesland, data)
return [] return []
# Step 3: fetch the HTML hit list # Step 3: fetch the HTML hit list
@ -761,14 +765,14 @@ class PortalaAdapter(ParlamentAdapter):
headers={"Referer": browse_html}, headers={"Referer": browse_html},
) )
if report_resp.status_code != 200: if report_resp.status_code != 200:
print(f"{self.bundesland} report HTTP {report_resp.status_code}") logger.error("%s report HTTP %s", self.bundesland, report_resp.status_code)
return [] return []
results = self._parse_hit_list_html(report_resp.text, query_filter=query) results = self._parse_hit_list_html(report_resp.text, query_filter=query)
return results[:limit] return results[:limit]
except Exception as e: except Exception:
print(f"{self.bundesland} search error: {e}") logger.exception("%s search error", self.bundesland)
return [] return []
async def get_document(self, drucksache: str) -> Optional[Drucksache]: async def get_document(self, drucksache: str) -> Optional[Drucksache]:
@ -806,8 +810,394 @@ class PortalaAdapter(ParlamentAdapter):
text += page.get_text() text += page.get_text()
pdf.close() pdf.close()
return text return text
except Exception as e: except Exception:
print(f"{self.bundesland} download error for {drucksache}: {e}") logger.exception("%s download error for %s", self.bundesland, drucksache)
return None
class ParLDokAdapter(ParlamentAdapter):
    """Adapter for ParlDok 8.x parliament documentation systems (J3S GmbH).

    ParlDok is a proprietary parliament documentation product by J3S GmbH
    (https://www.j3s.de). Different from the portala/eUI framework used by
    LSA/BE: ParlDok 8.x is a single-page app whose backend is a JSON API
    rooted at ``{base_url}{prefix}/Fulltext/...``. The legacy ParLDok 5.x
    HTML POST form (``parldok/formalkriterien``) used by dokukratie's MV
    YAML scraper has been deprecated by the LandtagMV upgrade to 8.3.5.

    Confirmed instances using this engine (April 2026):

    - **MV** (Mecklenburg-Vorpommern) ``dokumentation.landtag-mv.de/parldok``
    - HH, SN, TH all advertise ParlDok in dokukratie but their actual
      versions/themes have not been verified yet.

    Search workflow:

    1. ``GET {base_url}{prefix}/`` to obtain the session cookie. The
       backend rejects POSTs without it.
    2. ``POST {base_url}{prefix}/Fulltext/Search`` with form-encoded
       ``data=<json>`` payload. The JSON carries a ``tags`` array of
       facet selections; each tag is ``{"type": <facet_type_int>,
       "id": <facet_value>}``. Reverse-engineered facet type constants
       from the bundle.js (``pd.facet_*``):

       - ``facet_fraction = 2``
       - ``facet_kind = 7`` (Drucksache, Plenarprotokoll, ...)
       - ``facet_type = 8`` (Antrag, Gesetzentwurf, Kleine Anfrage, ...)
       - ``facet_lp = 10`` (Wahlperiode)

       Response is JSON ``{success, data: <stringified JSON>}`` where the
       inner ``data`` carries ``{count, docs: [{id, title, date,
       authorhtml, kind, type, lp, number, link, ...}], ...}``.
    3. PDF download: ``GET {base_url}{prefix}/dokument/{numeric_id}``.
       Returns ``application/pdf`` directly. The ``link`` field returned
       by the search API already contains the path fragment
       ``/dokument/<id>#navpanes=0`` — strip the fragment and prepend
       the configured ``prefix``.

    Drucksachen-Nummer is reconstructed as ``f"{lp}/{number}"`` from the
    search hit. Full-text search is *not* implemented in this MVP — the
    backend supports it via ``facet_fulltext = 0`` tags but the public
    LP-only filter already returns the relevant Antrag pool. ``query``
    is applied as a client-side title/Urheber filter.
    """

    # Reverse-engineered facet type constants from bundle.js (pd.facet_*).
    FACET_FRACTION = 2
    FACET_KIND = 7
    FACET_TYPE = 8
    FACET_LP = 10

    # ParlDok 8.x caps Length per request at 100 — paginate if needed.
    PAGE_SIZE = 100

    # Safety bound: scan at most 10 pages × 100 = 1000 most recent docs.
    # Anträge are ~3% of all hits in MV, so 1000 raw → ~30 Anträge, more
    # than enough for the typical UI request (limit 5..20). Filtered
    # queries that find nothing in the last 1000 docs return empty
    # rather than scan the entire WP.
    MAX_PAGES = 10

    def __init__(
        self,
        *,
        bundesland: str,
        name: str,
        base_url: str,
        wahlperiode: int,
        prefix: str = "/parldok",
        document_typ: str = "Antrag",
    ) -> None:
        """Configure a ParlDok 8.x adapter for one specific parliament.

        Args:
            bundesland: state code, e.g. ``"MV"``.
            name: human-readable label.
            base_url: ``https://...`` host root, no trailing slash.
            wahlperiode: current legislative period fed into the
                ``facet_lp`` tag of the search payload.
            prefix: app prefix where ParlDok lives. ``/parldok`` for MV.
            document_typ: client-side filter on the ``type`` field of
                each hit ("Antrag", "Gesetzentwurf", ...). Set to empty
                string to disable type filtering.
        """
        self.bundesland = bundesland
        self.name = name
        # Normalize both ends so URL assembly can safely concatenate:
        # base_url without trailing slash, prefix always "/<tail>".
        self.base_url = base_url.rstrip("/")
        self.prefix = "/" + prefix.strip("/")
        self.wahlperiode = wahlperiode
        self.document_typ = document_typ

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        """DD.MM.YYYY → YYYY-MM-DD; '' for empty or malformed input."""
        if not datum_de:
            return ""
        try:
            d, m, y = datum_de.split(".")
            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
        except ValueError:
            # Wrong number of dot-separated parts — treat as "no date".
            return ""

    @staticmethod
    def _normalize_fraktion(authorhtml: str) -> list[str]:
        """Map ParlDok ``authorhtml`` to canonical fraction codes.

        ``authorhtml`` may be a comma-separated list of fractions
        ("CDU, SPD, F.D.P."), a single MdL with party in parens
        ("Thomas de Jesus Fernandes (AfD)") or empty (Landesregierung).
        """
        if not authorhtml:
            return []
        u = authorhtml.upper()
        out: list[str] = []
        if re.search(r"\bBÜNDNIS\s*90\b", u) or re.search(r"\bGR(?:Ü|UE)NE\b", u):
            out.append("GRÜNE")
        if re.search(r"\bCDU\b", u):
            out.append("CDU")
        if re.search(r"\bSPD\b", u):
            out.append("SPD")
        # F.D.P. (with dots, historical) and FDP both occur in MV
        if re.search(r"\bF\.?\s*D\.?\s*P\.?\b", u):
            out.append("FDP")
        if re.search(r"\bAFD\b", u):
            out.append("AfD")
        if re.search(r"\bLINKE\b", u) or re.search(r"\bLL/PDS\b", u):
            out.append("LINKE")
        if re.search(r"\bBSW\b", u):
            out.append("BSW")
        if re.search(r"LANDESREGIERUNG|MINISTER\b|STAATSKANZLEI|MINISTERPRÄSIDENT", u):
            out.append("Landesregierung")
        return out

    def _build_search_body(self, *, length: int = 100) -> dict:
        """Build the JSON payload for the initial ``Fulltext/Search`` call.

        Filters by Wahlperiode only. Type/kind filtering happens
        client-side because the facet_type/facet_kind value IDs are
        instance-specific and would require an extra ``Fulltext/Filter``
        round trip to discover. Pagination beyond the first page goes
        through ``Fulltext/Resultpage`` (see ``_result_page``); the
        ``Search`` endpoint itself ignores any non-zero ``Start``.
        """
        return {
            "devicekey": "",
            "max": length,
            "withfilter": False,
            # sort=2 → newest first (date desc); sort=1 is relevance.
            "sort": 2,
            "topk": length,
            "llm": 0,
            "newdocsearch": False,
            "limit": {"Start": 0, "Length": length},
            "tags": [{"type": self.FACET_LP, "id": self.wahlperiode}],
            "updateFilters": [],
        }

    def _hit_to_drucksache(self, hit: dict) -> Optional[Drucksache]:
        """Convert one ParlDok JSON hit to a Drucksache. None if unusable."""
        lp = hit.get("lp")
        number = hit.get("number")
        if not lp or not number:
            return None
        link_field = hit.get("link") or hit.get("prelink") or ""
        # Strip "#navpanes=0" fragment and prepend the prefix.
        path = link_field.split("#", 1)[0]
        pdf_url = f"{self.base_url}{self.prefix}{path}" if path else ""
        return Drucksache(
            drucksache=f"{lp}/{number}",
            title=hit.get("title", ""),
            fraktionen=self._normalize_fraktion(hit.get("authorhtml", "")),
            datum=self._datum_de_to_iso(hit.get("date", "")),
            link=pdf_url,
            bundesland=self.bundesland,
            typ=hit.get("type", "") or hit.get("kind", ""),
        )

    async def _post_json(
        self, client: httpx.AsyncClient, endpoint: str, payload: dict,
    ) -> Optional[dict]:
        """POST a JSON-stringified payload to a ParlDok endpoint.

        ``endpoint`` is the path tail (e.g. ``"Fulltext/Search"`` or
        ``"Fulltext/Resultpage"``). Returns the inner JSON object
        (already parsed from the stringified ``data`` field), or None
        on error.
        """
        homepage = f"{self.base_url}{self.prefix}/"
        url = f"{self.base_url}{self.prefix}/{endpoint}"
        try:
            resp = await client.post(
                url,
                data={"data": json.dumps(payload, ensure_ascii=False)},
                headers={
                    "X-Requested-With": "XMLHttpRequest",
                    "Referer": homepage,
                },
            )
            if resp.status_code != 200:
                logger.error(
                    "%s %s HTTP %s",
                    self.bundesland, endpoint, resp.status_code,
                )
                return None
            outer = resp.json()
            if not outer.get("success"):
                logger.error(
                    "%s %s not successful: %s",
                    self.bundesland, endpoint, outer.get("message"),
                )
                return None
            # Outer envelope carries the payload as a JSON *string*.
            return json.loads(outer["data"])
        except Exception:
            logger.exception("%s ParlDok %s error", self.bundesland, endpoint)
            return None

    async def _initial_search(
        self, client: httpx.AsyncClient, *, length: int,
    ) -> tuple[Optional[int], list[dict]]:
        """Run the initial ``Fulltext/Search`` and return ``(queryid, docs)``.

        The ``queryid`` is needed for subsequent ``Fulltext/Resultpage``
        calls. ParlDok ignores any non-zero ``Start`` on this endpoint —
        the first 100 hits are the only ones reachable via ``Search``.
        """
        body = self._build_search_body(length=length)
        inner = await self._post_json(client, "Fulltext/Search", body)
        if not inner:
            return None, []
        return inner.get("queryid"), (inner.get("docs") or [])

    async def _result_page(
        self, client: httpx.AsyncClient, *, queryid: int, start: int, length: int,
    ) -> list[dict]:
        """Fetch a further result page via ``Fulltext/Resultpage``."""
        payload = {
            "devicekey": "",
            "queryid": queryid,
            "limit": {"Start": start, "Length": length},
        }
        inner = await self._post_json(client, "Fulltext/Resultpage", payload)
        if not inner:
            return []
        return inner.get("docs") or []

    def _make_client(self) -> httpx.AsyncClient:
        """Build an HTTP client with the adapter's standard settings."""
        return httpx.AsyncClient(
            timeout=30,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        )

    async def _paginated_hits(
        self, client: httpx.AsyncClient,
    ):
        """Async iterator over Drucksachen-style hits across all pages.

        Yields raw hit dicts in newest-first order. The first batch comes
        from ``Fulltext/Search``, subsequent batches from
        ``Fulltext/Resultpage`` using the queryid the server returned for
        the initial call. Stops when a page comes back empty, undersized,
        or after ``MAX_PAGES`` iterations.
        """
        queryid, hits = await self._initial_search(client, length=self.PAGE_SIZE)
        for hit in hits:
            yield hit
        if not queryid or len(hits) < self.PAGE_SIZE:
            return
        for page in range(1, self.MAX_PAGES):
            page_hits = await self._result_page(
                client,
                queryid=queryid,
                start=page * self.PAGE_SIZE,
                length=self.PAGE_SIZE,
            )
            if not page_hits:
                return
            for hit in page_hits:
                yield hit
            if len(page_hits) < self.PAGE_SIZE:
                return

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search recent documents of the configured Wahlperiode.

        ``query`` is a client-side filter on title + Urheber. The server
        returns the configured WP sorted newest first; the client keeps
        only ``Antrag``-typed Drucksachen and applies the title filter.

        Pagination: ParlDok caps each ``Fulltext/Search`` response at 100
        rows. Only ~3% of MV hits are real Anträge (most are Kleine
        Anfragen + Protokolle), so we may need several pages to fill
        ``limit``.
        """
        results: list[Drucksache] = []
        query_terms = [t for t in query.lower().split() if t] if query else []
        # ParlDok returns the same Drucksache multiple times when it
        # appears in several Vorgänge/Beratungen — dedupe by lp/number.
        seen: set[str] = set()
        async with self._make_client() as client:
            # Initial GET establishes the session cookie the backend
            # requires before it accepts POSTs.
            await client.get(f"{self.base_url}{self.prefix}/")
            async for hit in self._paginated_hits(client):
                if hit.get("kind") != "Drucksache":
                    continue
                if self.document_typ and hit.get("type") != self.document_typ:
                    continue
                doc = self._hit_to_drucksache(hit)
                if not doc:
                    continue
                if doc.drucksache in seen:
                    continue
                seen.add(doc.drucksache)
                if query_terms:
                    hay = f"{doc.title} {hit.get('authorhtml', '')}".lower()
                    if not all(t in hay for t in query_terms):
                        continue
                results.append(doc)
                if len(results) >= limit:
                    return results
        return results

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up a single Antrag by ``lp/number`` ID.

        Pragmatic MVP: page through the WP unfiltered until we find a
        match. ParlDok offers a ``facet_number`` (14) facet that would
        let us target the lookup directly, but the facet ID values are
        instance-specific (would require a ``Fulltext/Filter`` discovery
        call) and the WP-wide pagination is fast enough for the typical
        2k-10k Drucksachen per period.
        """
        wanted_lp, wanted_num = (drucksache.split("/", 1) + [""])[:2]
        if not wanted_num:
            return None
        async with self._make_client() as client:
            # Session cookie first — see search().
            await client.get(f"{self.base_url}{self.prefix}/")
            async for hit in self._paginated_hits(client):
                if hit.get("kind") != "Drucksache":
                    continue
                if str(hit.get("lp")) == wanted_lp and str(hit.get("number")) == wanted_num:
                    return self._hit_to_drucksache(hit)
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its text."""
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None
        async with httpx.AsyncClient(
            timeout=60,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    logger.error(
                        "%s PDF HTTP %s for %s (%s)",
                        self.bundesland, resp.status_code, drucksache, doc.link,
                    )
                    return None
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                text = ""
                for page in pdf:
                    text += page.get_text()
                pdf.close()
                return text
            except Exception:
                logger.exception("%s ParlDok download error for %s", self.bundesland, drucksache)
                return None
@ -876,6 +1266,14 @@ ADAPTERS = {
date_window_days=180, date_window_days=180,
pdf_url_prefix="/files/", pdf_url_prefix="/files/",
), ),
"MV": ParLDokAdapter(
bundesland="MV",
name="Landtag Mecklenburg-Vorpommern (ParlDok)",
base_url="https://www.dokumentation.landtag-mv.de",
wahlperiode=8,
prefix="/parldok",
document_typ="Antrag",
),
"BY": BayernAdapter(), "BY": BayernAdapter(),
"BW": BWAdapter(), "BW": BWAdapter(),
} }