From 2b9c0b29080a474d7d5429a272eeca7b56e08caa Mon Sep 17 00:00:00 2001 From: Dotty Dotter Date: Wed, 8 Apr 2026 08:19:48 +0200 Subject: [PATCH] =?UTF-8?q?Activate=20Mecklenburg-Vorpommern=20(ParlDok)?= =?UTF-8?q?=20=E2=80=94=20search-only=20MVP=20(#4)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- app/bundeslaender.py | 13 +- app/parlamente.py | 412 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 417 insertions(+), 8 deletions(-) diff --git a/app/bundeslaender.py b/app/bundeslaender.py index 374a7cc..74edb42 100644 --- a/app/bundeslaender.py +++ b/app/bundeslaender.py @@ -204,7 +204,18 @@ BUNDESLAENDER: dict[str, Bundesland] = { doku_base_url="https://www.dokumentation.landtag-mv.de", drucksache_format="8/1234", dokukratie_scraper="mv", - anmerkung="Wahltag offiziell auf 20.09.2026 festgelegt.", + aktiv=True, + anmerkung=( + "ParlDok 8.3.5 (J3S GmbH) — moderne SPA, JSON-API unter " + "/parldok/Fulltext/Search. ParLDokAdapter (eigene Implementierung, " + "nicht portala-kompatibel). Die in dokukratie/mv.yml beschriebene " + "Legacy-HTML-Form (parldok/formalkriterien) ist mit dem 8.x-Upgrade " + "deprecated. Suche filtert via facet_lp=10/id=8 server-seitig auf " + "WP8, type=Antrag wird client-seitig gefiltert. Wahlprogramme zur " + "LTW 26.09.2021 sind noch nicht indexiert (Folge-Issue) — Analyse " + "läuft daher mit Grundsatzprogramm-Zitaten als Fallback. Wahltag " + "offiziell auf 20.09.2026 festgelegt." 
+ ), ), "NI": Bundesland( code="NI", diff --git a/app/parlamente.py b/app/parlamente.py index f46dcf9..db56f0d 100644 --- a/app/parlamente.py +++ b/app/parlamente.py @@ -1,5 +1,7 @@ """Parliament search adapters for different German states.""" +import json +import logging import httpx import re from abc import ABC, abstractmethod @@ -7,6 +9,8 @@ from dataclasses import dataclass from typing import Optional from bs4 import BeautifulSoup +logger = logging.getLogger(__name__) + @dataclass class Drucksache: @@ -743,13 +747,13 @@ class PortalaAdapter(ParlamentAdapter): headers={"Referer": browse_html}, ) if resp.status_code != 200: - print(f"{self.bundesland} search HTTP {resp.status_code}") + logger.error("%s search HTTP %s", self.bundesland, resp.status_code) return [] data = resp.json() report_id = data.get("report_id") if not report_id: - print(f"{self.bundesland}: no report_id in response: {data}") + logger.error("%s: no report_id in response: %s", self.bundesland, data) return [] # Step 3: fetch the HTML hit list @@ -761,14 +765,14 @@ class PortalaAdapter(ParlamentAdapter): headers={"Referer": browse_html}, ) if report_resp.status_code != 200: - print(f"{self.bundesland} report HTTP {report_resp.status_code}") + logger.error("%s report HTTP %s", self.bundesland, report_resp.status_code) return [] results = self._parse_hit_list_html(report_resp.text, query_filter=query) return results[:limit] - except Exception as e: - print(f"{self.bundesland} search error: {e}") + except Exception: + logger.exception("%s search error", self.bundesland) return [] async def get_document(self, drucksache: str) -> Optional[Drucksache]: @@ -806,8 +810,394 @@ class PortalaAdapter(ParlamentAdapter): text += page.get_text() pdf.close() return text - except Exception as e: - print(f"{self.bundesland} download error for {drucksache}: {e}") + except Exception: + logger.exception("%s download error for %s", self.bundesland, drucksache) + return None + + +class 
class ParLDokAdapter(ParlamentAdapter):
    """Adapter for ParlDok 8.x parliament documentation systems (J3S GmbH).

    ParlDok is a proprietary parliament documentation product by J3S GmbH
    (https://www.j3s.de). Different from the portala/eUI framework used by
    LSA/BE: ParlDok 8.x is a single-page app whose backend is a JSON API
    rooted at ``{base_url}{prefix}/Fulltext/...``. The legacy ParLDok 5.x
    HTML POST form (``parldok/formalkriterien``) used by dokukratie's MV
    YAML scraper has been deprecated by the LandtagMV upgrade to 8.3.5.

    Confirmed instances using this engine (April 2026):

    - **MV** (Mecklenburg-Vorpommern) — ``dokumentation.landtag-mv.de/parldok``
    - HH, SN, TH all advertise ParlDok in dokukratie but their actual
      versions/themes have not been verified yet.

    Search workflow:

    1. ``GET {base_url}{prefix}/`` to obtain the session cookie. The
       backend rejects POSTs without it.
    2. ``POST {base_url}{prefix}/Fulltext/Search`` with form-encoded
       ``data=<json>`` payload. The JSON carries a ``tags`` array of
       facet selections; each tag is ``{"type": <facet type>,
       "id": <facet value>}``. Reverse-engineered facet type constants
       from the bundle.js (``pd.facet_*``):

       - ``facet_fraction = 2``
       - ``facet_kind = 7`` (Drucksache, Plenarprotokoll, …)
       - ``facet_type = 8`` (Antrag, Gesetzentwurf, Kleine Anfrage, …)
       - ``facet_lp = 10`` (Wahlperiode)

       Response is JSON ``{success, data: <stringified JSON>}`` where the
       inner ``data`` carries ``{count, docs: [{id, title, date,
       authorhtml, kind, type, lp, number, link, ...}], ...}``.

    3. PDF download: ``GET {base_url}{prefix}/dokument/{numeric_id}``.
       Returns ``application/pdf`` directly. The ``link`` field returned
       by the search API already contains the path fragment
       ``/dokument/<id>#navpanes=0`` — strip the fragment and prepend
       the configured ``prefix``.

    Drucksachen-Nummer is reconstructed as ``f"{lp}/{number}"`` from the
    search hit. Full-text search is *not* implemented in this MVP — the
    backend supports it via ``facet_fulltext = 0`` tags but the public
    LP-only filter already returns the relevant Antrag pool. ``query``
    is applied as a client-side title/Urheber filter.
    """

    # Reverse-engineered facet type constants from bundle.js (pd.facet_*).
    FACET_FRACTION = 2
    FACET_KIND = 7
    FACET_TYPE = 8
    FACET_LP = 10

    # ParlDok 8.x caps Length per request at 100 — paginate if needed.
    PAGE_SIZE = 100
    # Safety bound for search(): scan at most 10 pages × 100 = 1000 most
    # recent docs. Anträge are ~3% of all hits in MV, so 1000 raw → ~30
    # Anträge, more than enough for the typical UI request (limit 5..20).
    # Filtered queries that find nothing in the last 1000 docs return
    # empty rather than scan the entire WP.
    MAX_PAGES = 10
    # Deeper bound for get_document(): a by-number lookup must be able to
    # reach *old* Drucksachen, not only the most recent 1000 that the
    # search() bound covers. 100 pages × 100 = 10_000 docs covers the
    # typical 2k–10k Drucksachen per Wahlperiode.
    LOOKUP_MAX_PAGES = 100

    def __init__(
        self,
        *,
        bundesland: str,
        name: str,
        base_url: str,
        wahlperiode: int,
        prefix: str = "/parldok",
        document_typ: str = "Antrag",
    ) -> None:
        """Configure a ParlDok 8.x adapter for one specific parliament.

        Args:
            bundesland: state code, e.g. ``"MV"``.
            name: human-readable label.
            base_url: ``https://...`` host root, no trailing slash.
            wahlperiode: current legislative period — fed into the
                ``facet_lp`` tag of the search payload.
            prefix: app prefix where ParlDok lives. ``/parldok`` for MV.
            document_typ: client-side filter on the ``type`` field of
                each hit ("Antrag", "Gesetzentwurf", …). Set to empty
                string to disable type filtering.
        """
        self.bundesland = bundesland
        self.name = name
        self.base_url = base_url.rstrip("/")
        # Normalize to exactly one leading slash, no trailing slash.
        self.prefix = "/" + prefix.strip("/")
        self.wahlperiode = wahlperiode
        self.document_typ = document_typ

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        """DD.MM.YYYY → YYYY-MM-DD; '' for empty or unparseable input."""
        if not datum_de:
            return ""
        try:
            d, m, y = datum_de.split(".")
            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
        except ValueError:
            # Wrong number of dot-separated parts — treat as "no date".
            return ""

    @staticmethod
    def _normalize_fraktion(authorhtml: str) -> list[str]:
        """Map ParlDok ``authorhtml`` to canonical fraction codes.

        ``authorhtml`` may be a comma-separated list of fractions
        ("CDU, SPD, F.D.P."), a single MdL with party in parens
        ("Thomas de Jesus Fernandes (AfD)") or empty (Landesregierung).
        Returns every fraction code detected, in a fixed canonical order.
        """
        if not authorhtml:
            return []
        u = authorhtml.upper()
        out: list[str] = []
        if re.search(r"\bBÜNDNIS\s*90\b", u) or re.search(r"\bGR(?:Ü|UE)NE\b", u):
            out.append("GRÜNE")
        if re.search(r"\bCDU\b", u):
            out.append("CDU")
        if re.search(r"\bSPD\b", u):
            out.append("SPD")
        # F.D.P. (with dots, historical) and FDP both occur in MV
        if re.search(r"\bF\.?\s*D\.?\s*P\.?\b", u):
            out.append("FDP")
        if re.search(r"\bAFD\b", u):
            out.append("AfD")
        if re.search(r"\bLINKE\b", u) or re.search(r"\bLL/PDS\b", u):
            out.append("LINKE")
        if re.search(r"\bBSW\b", u):
            out.append("BSW")
        if re.search(r"LANDESREGIERUNG|MINISTER\b|STAATSKANZLEI|MINISTERPRÄSIDENT", u):
            out.append("Landesregierung")
        return out

    def _build_search_body(self, *, length: int = 100) -> dict:
        """Build the JSON payload for the initial ``Fulltext/Search`` call.

        Filters by Wahlperiode only. Type/kind filtering happens
        client-side because the facet_type/facet_kind value IDs are
        instance-specific and would require an extra ``Fulltext/Filter``
        round trip to discover. Pagination beyond the first page goes
        through ``Fulltext/Resultpage`` (see ``_result_page``); the
        ``Search`` endpoint itself ignores any non-zero ``Start``.
        """
        return {
            "devicekey": "",
            "max": length,
            "withfilter": False,
            # sort=2 → newest first (date desc); sort=1 is relevance.
            "sort": 2,
            "topk": length,
            "llm": 0,
            "newdocsearch": False,
            "limit": {"Start": 0, "Length": length},
            "tags": [{"type": self.FACET_LP, "id": self.wahlperiode}],
            "updateFilters": [],
        }

    def _hit_to_drucksache(self, hit: dict) -> Optional[Drucksache]:
        """Convert one ParlDok JSON hit to a Drucksache. None if unusable."""
        lp = hit.get("lp")
        number = hit.get("number")
        if not lp or not number:
            return None

        link_field = hit.get("link") or hit.get("prelink") or ""
        # Strip "#navpanes=0" fragment and prepend the prefix.
        path = link_field.split("#", 1)[0]
        pdf_url = f"{self.base_url}{self.prefix}{path}" if path else ""

        return Drucksache(
            drucksache=f"{lp}/{number}",
            title=hit.get("title", ""),
            fraktionen=self._normalize_fraktion(hit.get("authorhtml", "")),
            datum=self._datum_de_to_iso(hit.get("date", "")),
            link=pdf_url,
            bundesland=self.bundesland,
            typ=hit.get("type", "") or hit.get("kind", ""),
        )

    async def _post_json(
        self, client: httpx.AsyncClient, endpoint: str, payload: dict,
    ) -> Optional[dict]:
        """POST a JSON-stringified payload to a ParlDok endpoint.

        ``endpoint`` is the path tail (e.g. ``"Fulltext/Search"`` or
        ``"Fulltext/Resultpage"``). Returns the inner JSON object
        (already parsed from the stringified ``data`` field), or None
        on HTTP error, API-level failure, or any transport exception.
        """
        homepage = f"{self.base_url}{self.prefix}/"
        url = f"{self.base_url}{self.prefix}/{endpoint}"
        try:
            resp = await client.post(
                url,
                data={"data": json.dumps(payload, ensure_ascii=False)},
                headers={
                    "X-Requested-With": "XMLHttpRequest",
                    "Referer": homepage,
                },
            )
            if resp.status_code != 200:
                logger.error(
                    "%s %s HTTP %s",
                    self.bundesland, endpoint, resp.status_code,
                )
                return None
            outer = resp.json()
            if not outer.get("success"):
                logger.error(
                    "%s %s not successful: %s",
                    self.bundesland, endpoint, outer.get("message"),
                )
                return None
            # The useful payload is double-encoded: a JSON string inside
            # the outer JSON envelope.
            return json.loads(outer["data"])
        except Exception:
            logger.exception("%s ParlDok %s error", self.bundesland, endpoint)
            return None

    async def _initial_search(
        self, client: httpx.AsyncClient, *, length: int,
    ) -> tuple[Optional[int], list[dict]]:
        """Run the initial ``Fulltext/Search`` and return ``(queryid, docs)``.

        The ``queryid`` is needed for subsequent ``Fulltext/Resultpage``
        calls. ParlDok ignores any non-zero ``Start`` on this endpoint —
        the first 100 hits are the only ones reachable via ``Search``.
        """
        body = self._build_search_body(length=length)
        inner = await self._post_json(client, "Fulltext/Search", body)
        if not inner:
            return None, []
        return inner.get("queryid"), (inner.get("docs") or [])

    async def _result_page(
        self, client: httpx.AsyncClient, *, queryid: int, start: int, length: int,
    ) -> list[dict]:
        """Fetch a further result page via ``Fulltext/Resultpage``."""
        payload = {
            "devicekey": "",
            "queryid": queryid,
            "limit": {"Start": start, "Length": length},
        }
        inner = await self._post_json(client, "Fulltext/Resultpage", payload)
        if not inner:
            return []
        return inner.get("docs") or []

    def _make_client(self) -> httpx.AsyncClient:
        """Create the HTTP client used for search/lookup requests."""
        return httpx.AsyncClient(
            timeout=30,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        )

    async def _paginated_hits(
        self, client: httpx.AsyncClient, *, max_pages: Optional[int] = None,
    ):
        """Async iterator over raw hit dicts across result pages.

        Yields raw hit dicts in newest-first order. The first batch comes
        from ``Fulltext/Search``, subsequent batches from
        ``Fulltext/Resultpage`` using the queryid the server returned for
        the initial call. Stops when a page comes back empty, undersized,
        or after the page bound is reached.

        Args:
            max_pages: total page bound (including the initial ``Search``
                page). ``None`` uses ``MAX_PAGES`` — the shallow bound
                appropriate for ``search()``. ``get_document()`` passes
                ``LOOKUP_MAX_PAGES`` so old Drucksachen stay reachable.
        """
        bound = self.MAX_PAGES if max_pages is None else max_pages
        queryid, hits = await self._initial_search(client, length=self.PAGE_SIZE)
        for hit in hits:
            yield hit
        # No queryid (API error) or undersized first page → nothing more.
        if not queryid or len(hits) < self.PAGE_SIZE:
            return

        for page in range(1, bound):
            page_hits = await self._result_page(
                client,
                queryid=queryid,
                start=page * self.PAGE_SIZE,
                length=self.PAGE_SIZE,
            )
            if not page_hits:
                return
            for hit in page_hits:
                yield hit
            if len(page_hits) < self.PAGE_SIZE:
                return

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search recent documents of the configured Wahlperiode.

        ``query`` is a client-side filter on title + Urheber. The server
        returns the configured WP sorted newest first; the client keeps
        only ``Antrag``-typed Drucksachen and applies the title filter.

        Pagination: ParlDok caps each ``Fulltext/Search`` response at 100
        rows. Only ~3% of MV hits are real Anträge (most are Kleine
        Anfragen + Protokolle), so we may need several pages to fill
        ``limit``. Scanning is bounded by ``MAX_PAGES``.
        """
        results: list[Drucksache] = []
        query_terms = [t for t in query.lower().split() if t] if query else []
        # ParlDok returns the same Drucksache multiple times when it
        # appears in several Vorgänge/Beratungen — dedupe by lp/number.
        seen: set[str] = set()

        async with self._make_client() as client:
            # Prime the session cookie; the backend rejects POSTs without it.
            await client.get(f"{self.base_url}{self.prefix}/")
            async for hit in self._paginated_hits(client):
                if hit.get("kind") != "Drucksache":
                    continue
                if self.document_typ and hit.get("type") != self.document_typ:
                    continue

                doc = self._hit_to_drucksache(hit)
                if not doc:
                    continue
                if doc.drucksache in seen:
                    continue
                seen.add(doc.drucksache)

                if query_terms:
                    hay = f"{doc.title} {hit.get('authorhtml', '')}".lower()
                    if not all(t in hay for t in query_terms):
                        continue

                results.append(doc)
                if len(results) >= limit:
                    return results

        return results

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up a single Drucksache by ``lp/number`` ID.

        Pragmatic MVP: page through the WP unfiltered until we find a
        match. ParlDok offers a ``facet_number`` (14) facet that would
        let us target the lookup directly, but the facet ID values are
        instance-specific (would require a ``Fulltext/Filter`` discovery
        call). Uses the deep ``LOOKUP_MAX_PAGES`` bound (10k docs) rather
        than search()'s shallow 1000-doc cap, so the typical 2k–10k
        Drucksachen per period are actually reachable.
        """
        wanted_lp, wanted_num = (drucksache.split("/", 1) + [""])[:2]
        if not wanted_num:
            return None

        async with self._make_client() as client:
            # Prime the session cookie; the backend rejects POSTs without it.
            await client.get(f"{self.base_url}{self.prefix}/")
            async for hit in self._paginated_hits(
                client, max_pages=self.LOOKUP_MAX_PAGES,
            ):
                if hit.get("kind") != "Drucksache":
                    continue
                if str(hit.get("lp")) == wanted_lp and str(hit.get("number")) == wanted_num:
                    return self._hit_to_drucksache(hit)
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its text.

        Returns None when the Drucksache is unknown, has no PDF link,
        the download fails, or the PDF cannot be parsed.
        """
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None

        # Separate client: PDFs warrant a longer timeout than API calls.
        async with httpx.AsyncClient(
            timeout=60,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    logger.error(
                        "%s PDF HTTP %s for %s (%s)",
                        self.bundesland, resp.status_code, drucksache, doc.link,
                    )
                    return None
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                text = ""
                for page in pdf:
                    text += page.get_text()
                pdf.close()
                return text
            except Exception:
                logger.exception("%s ParlDok download error for %s", self.bundesland, drucksache)
                return None