Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dotty Dotter 2026-04-08 08:19:48 +02:00
parent 1cb030aab7
commit 2b9c0b2908
2 changed files with 417 additions and 8 deletions

View File

@ -204,7 +204,18 @@ BUNDESLAENDER: dict[str, Bundesland] = {
doku_base_url="https://www.dokumentation.landtag-mv.de", doku_base_url="https://www.dokumentation.landtag-mv.de",
drucksache_format="8/1234", drucksache_format="8/1234",
dokukratie_scraper="mv", dokukratie_scraper="mv",
anmerkung="Wahltag offiziell auf 20.09.2026 festgelegt.", aktiv=True,
anmerkung=(
"ParlDok 8.3.5 (J3S GmbH) — moderne SPA, JSON-API unter "
"/parldok/Fulltext/Search. ParLDokAdapter (eigene Implementierung, "
"nicht portala-kompatibel). Die in dokukratie/mv.yml beschriebene "
"Legacy-HTML-Form (parldok/formalkriterien) ist mit dem 8.x-Upgrade "
"deprecated. Suche filtert via facet_lp=10/id=8 server-seitig auf "
"WP8, type=Antrag wird client-seitig gefiltert. Wahlprogramme zur "
"LTW 26.09.2021 sind noch nicht indexiert (Folge-Issue) — Analyse "
"läuft daher mit Grundsatzprogramm-Zitaten als Fallback. Wahltag "
"offiziell auf 20.09.2026 festgelegt."
),
), ),
"NI": Bundesland( "NI": Bundesland(
code="NI", code="NI",

View File

@ -1,5 +1,7 @@
"""Parliament search adapters for different German states.""" """Parliament search adapters for different German states."""
import json
import logging
import httpx import httpx
import re import re
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
@ -7,6 +9,8 @@ from dataclasses import dataclass
from typing import Optional from typing import Optional
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
@dataclass @dataclass
class Drucksache: class Drucksache:
@ -743,13 +747,13 @@ class PortalaAdapter(ParlamentAdapter):
headers={"Referer": browse_html}, headers={"Referer": browse_html},
) )
if resp.status_code != 200: if resp.status_code != 200:
print(f"{self.bundesland} search HTTP {resp.status_code}") logger.error("%s search HTTP %s", self.bundesland, resp.status_code)
return [] return []
data = resp.json() data = resp.json()
report_id = data.get("report_id") report_id = data.get("report_id")
if not report_id: if not report_id:
print(f"{self.bundesland}: no report_id in response: {data}") logger.error("%s: no report_id in response: %s", self.bundesland, data)
return [] return []
# Step 3: fetch the HTML hit list # Step 3: fetch the HTML hit list
@ -761,14 +765,14 @@ class PortalaAdapter(ParlamentAdapter):
headers={"Referer": browse_html}, headers={"Referer": browse_html},
) )
if report_resp.status_code != 200: if report_resp.status_code != 200:
print(f"{self.bundesland} report HTTP {report_resp.status_code}") logger.error("%s report HTTP %s", self.bundesland, report_resp.status_code)
return [] return []
results = self._parse_hit_list_html(report_resp.text, query_filter=query) results = self._parse_hit_list_html(report_resp.text, query_filter=query)
return results[:limit] return results[:limit]
except Exception as e: except Exception:
print(f"{self.bundesland} search error: {e}") logger.exception("%s search error", self.bundesland)
return [] return []
async def get_document(self, drucksache: str) -> Optional[Drucksache]: async def get_document(self, drucksache: str) -> Optional[Drucksache]:
@ -806,8 +810,394 @@ class PortalaAdapter(ParlamentAdapter):
text += page.get_text() text += page.get_text()
pdf.close() pdf.close()
return text return text
except Exception as e: except Exception:
print(f"{self.bundesland} download error for {drucksache}: {e}") logger.exception("%s download error for %s", self.bundesland, drucksache)
return None
class ParLDokAdapter(ParlamentAdapter):
    """Adapter for ParlDok 8.x parliament documentation systems (J3S GmbH).

    ParlDok is a proprietary parliament documentation product by J3S GmbH
    (https://www.j3s.de). Different from the portala/eUI framework used by
    LSA/BE: ParlDok 8.x is a single-page app whose backend is a JSON API
    rooted at ``{base_url}{prefix}/Fulltext/...``. The legacy ParLDok 5.x
    HTML POST form (``parldok/formalkriterien``) used by dokukratie's MV
    YAML scraper has been deprecated by the LandtagMV upgrade to 8.3.5.

    Confirmed instances using this engine (April 2026):

    - **MV** (Mecklenburg-Vorpommern) ``dokumentation.landtag-mv.de/parldok``
    - HH, SN, TH all advertise ParlDok in dokukratie but their actual
      versions/themes have not been verified yet.

    Search workflow:

    1. ``GET {base_url}{prefix}/`` to obtain the session cookie. The
       backend rejects POSTs without it.
    2. ``POST {base_url}{prefix}/Fulltext/Search`` with form-encoded
       ``data=<json>`` payload. The JSON carries a ``tags`` array of
       facet selections; each tag is ``{"type": <facet_type_int>,
       "id": <facet_value>}``. Reverse-engineered facet type constants
       from the bundle.js (``pd.facet_*``):

       - ``facet_fraction = 2``
       - ``facet_kind = 7`` (Drucksache, Plenarprotokoll, ...)
       - ``facet_type = 8`` (Antrag, Gesetzentwurf, Kleine Anfrage, ...)
       - ``facet_lp = 10`` (Wahlperiode)

       Response is JSON ``{success, data: <stringified JSON>}`` where the
       inner ``data`` carries ``{count, docs: [{id, title, date,
       authorhtml, kind, type, lp, number, link, ...}], ...}``.
    3. PDF download: ``GET {base_url}{prefix}/dokument/{numeric_id}``.
       Returns ``application/pdf`` directly. The ``link`` field returned
       by the search API already contains the path fragment
       ``/dokument/<id>#navpanes=0`` — strip the fragment and prepend
       the configured ``prefix``.

    Drucksachen-Nummer is reconstructed as ``f"{lp}/{number}"`` from the
    search hit. Full-text search is *not* implemented in this MVP — the
    backend supports it via ``facet_fulltext = 0`` tags but the public
    LP-only filter already returns the relevant Antrag pool. ``query``
    is applied as a client-side title/Urheber filter.
    """

    # Reverse-engineered facet type constants from bundle.js (pd.facet_*).
    FACET_FRACTION = 2
    FACET_KIND = 7
    FACET_TYPE = 8
    FACET_LP = 10

    # ParlDok 8.x caps Length per request at 100 — paginate if needed.
    PAGE_SIZE = 100

    # Safety bound: scan at most 10 pages × 100 = 1000 most recent docs.
    # Anträge are ~3% of all hits in MV, so 1000 raw → ~30 Anträge, more
    # than enough for the typical UI request (limit 5..20). Filtered
    # queries that find nothing in the last 1000 docs return empty
    # rather than scan the entire WP.
    MAX_PAGES = 10

    def __init__(
        self,
        *,
        bundesland: str,
        name: str,
        base_url: str,
        wahlperiode: int,
        prefix: str = "/parldok",
        document_typ: str = "Antrag",
    ) -> None:
        """Configure a ParlDok 8.x adapter for one specific parliament.

        Args:
            bundesland: state code, e.g. ``"MV"``.
            name: human-readable label.
            base_url: ``https://...`` host root, no trailing slash.
            wahlperiode: current legislative period fed into the
                ``facet_lp`` tag of the search payload.
            prefix: app prefix where ParlDok lives. ``/parldok`` for MV.
            document_typ: client-side filter on the ``type`` field of
                each hit ("Antrag", "Gesetzentwurf", ...). Set to empty
                string to disable type filtering.
        """
        self.bundesland = bundesland
        self.name = name
        # Normalize both ends so URL assembly can safely concatenate:
        # base_url without trailing slash, prefix always "/<tail>".
        self.base_url = base_url.rstrip("/")
        self.prefix = "/" + prefix.strip("/")
        self.wahlperiode = wahlperiode
        self.document_typ = document_typ

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        """DD.MM.YYYY → YYYY-MM-DD; '' for empty or malformed input."""
        if not datum_de:
            return ""
        try:
            d, m, y = datum_de.split(".")
            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
        except ValueError:
            # Wrong number of dot-separated parts — treat as "no date".
            return ""

    @staticmethod
    def _normalize_fraktion(authorhtml: str) -> list[str]:
        """Map ParlDok ``authorhtml`` to canonical fraction codes.

        ``authorhtml`` may be a comma-separated list of fractions
        ("CDU, SPD, F.D.P."), a single MdL with party in parens
        ("Thomas de Jesus Fernandes (AfD)") or empty (Landesregierung).
        """
        if not authorhtml:
            return []
        u = authorhtml.upper()
        out: list[str] = []
        if re.search(r"\bBÜNDNIS\s*90\b", u) or re.search(r"\bGR(?:Ü|UE)NE\b", u):
            out.append("GRÜNE")
        if re.search(r"\bCDU\b", u):
            out.append("CDU")
        if re.search(r"\bSPD\b", u):
            out.append("SPD")
        # F.D.P. (with dots, historical) and FDP both occur in MV
        if re.search(r"\bF\.?\s*D\.?\s*P\.?\b", u):
            out.append("FDP")
        if re.search(r"\bAFD\b", u):
            out.append("AfD")
        if re.search(r"\bLINKE\b", u) or re.search(r"\bLL/PDS\b", u):
            out.append("LINKE")
        if re.search(r"\bBSW\b", u):
            out.append("BSW")
        if re.search(r"LANDESREGIERUNG|MINISTER\b|STAATSKANZLEI|MINISTERPRÄSIDENT", u):
            out.append("Landesregierung")
        return out

    def _build_search_body(self, *, length: int = 100) -> dict:
        """Build the JSON payload for the initial ``Fulltext/Search`` call.

        Filters by Wahlperiode only. Type/kind filtering happens
        client-side because the facet_type/facet_kind value IDs are
        instance-specific and would require an extra ``Fulltext/Filter``
        round trip to discover. Pagination beyond the first page goes
        through ``Fulltext/Resultpage`` (see ``_result_page``); the
        ``Search`` endpoint itself ignores any non-zero ``Start``.
        """
        return {
            "devicekey": "",
            "max": length,
            "withfilter": False,
            # sort=2 → newest first (date desc); sort=1 is relevance.
            "sort": 2,
            "topk": length,
            "llm": 0,
            "newdocsearch": False,
            "limit": {"Start": 0, "Length": length},
            "tags": [{"type": self.FACET_LP, "id": self.wahlperiode}],
            "updateFilters": [],
        }

    def _hit_to_drucksache(self, hit: dict) -> Optional[Drucksache]:
        """Convert one ParlDok JSON hit to a Drucksache. None if unusable."""
        lp = hit.get("lp")
        number = hit.get("number")
        if not lp or not number:
            return None
        link_field = hit.get("link") or hit.get("prelink") or ""
        # Strip "#navpanes=0" fragment and prepend the prefix.
        path = link_field.split("#", 1)[0]
        pdf_url = f"{self.base_url}{self.prefix}{path}" if path else ""
        return Drucksache(
            drucksache=f"{lp}/{number}",
            title=hit.get("title", ""),
            fraktionen=self._normalize_fraktion(hit.get("authorhtml", "")),
            datum=self._datum_de_to_iso(hit.get("date", "")),
            link=pdf_url,
            bundesland=self.bundesland,
            typ=hit.get("type", "") or hit.get("kind", ""),
        )

    async def _post_json(
        self, client: httpx.AsyncClient, endpoint: str, payload: dict,
    ) -> Optional[dict]:
        """POST a JSON-stringified payload to a ParlDok endpoint.

        ``endpoint`` is the path tail (e.g. ``"Fulltext/Search"`` or
        ``"Fulltext/Resultpage"``). Returns the inner JSON object
        (already parsed from the stringified ``data`` field), or None
        on error.
        """
        homepage = f"{self.base_url}{self.prefix}/"
        url = f"{self.base_url}{self.prefix}/{endpoint}"
        try:
            resp = await client.post(
                url,
                data={"data": json.dumps(payload, ensure_ascii=False)},
                headers={
                    "X-Requested-With": "XMLHttpRequest",
                    "Referer": homepage,
                },
            )
            if resp.status_code != 200:
                logger.error(
                    "%s %s HTTP %s",
                    self.bundesland, endpoint, resp.status_code,
                )
                return None
            outer = resp.json()
            if not outer.get("success"):
                logger.error(
                    "%s %s not successful: %s",
                    self.bundesland, endpoint, outer.get("message"),
                )
                return None
            # Outer envelope carries the payload as a JSON *string*.
            return json.loads(outer["data"])
        except Exception:
            logger.exception("%s ParlDok %s error", self.bundesland, endpoint)
            return None

    async def _initial_search(
        self, client: httpx.AsyncClient, *, length: int,
    ) -> tuple[Optional[int], list[dict]]:
        """Run the initial ``Fulltext/Search`` and return ``(queryid, docs)``.

        The ``queryid`` is needed for subsequent ``Fulltext/Resultpage``
        calls. ParlDok ignores any non-zero ``Start`` on this endpoint —
        the first 100 hits are the only ones reachable via ``Search``.
        """
        body = self._build_search_body(length=length)
        inner = await self._post_json(client, "Fulltext/Search", body)
        if not inner:
            return None, []
        return inner.get("queryid"), (inner.get("docs") or [])

    async def _result_page(
        self, client: httpx.AsyncClient, *, queryid: int, start: int, length: int,
    ) -> list[dict]:
        """Fetch a further result page via ``Fulltext/Resultpage``."""
        payload = {
            "devicekey": "",
            "queryid": queryid,
            "limit": {"Start": start, "Length": length},
        }
        inner = await self._post_json(client, "Fulltext/Resultpage", payload)
        if not inner:
            return []
        return inner.get("docs") or []

    def _make_client(self) -> httpx.AsyncClient:
        """Build an HTTP client with the adapter's standard settings."""
        return httpx.AsyncClient(
            timeout=30,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        )

    async def _paginated_hits(
        self, client: httpx.AsyncClient,
    ):
        """Async iterator over Drucksachen-style hits across all pages.

        Yields raw hit dicts in newest-first order. The first batch comes
        from ``Fulltext/Search``, subsequent batches from
        ``Fulltext/Resultpage`` using the queryid the server returned for
        the initial call. Stops when a page comes back empty, undersized,
        or after ``MAX_PAGES`` iterations.
        """
        queryid, hits = await self._initial_search(client, length=self.PAGE_SIZE)
        for hit in hits:
            yield hit
        if not queryid or len(hits) < self.PAGE_SIZE:
            return
        for page in range(1, self.MAX_PAGES):
            page_hits = await self._result_page(
                client,
                queryid=queryid,
                start=page * self.PAGE_SIZE,
                length=self.PAGE_SIZE,
            )
            if not page_hits:
                return
            for hit in page_hits:
                yield hit
            if len(page_hits) < self.PAGE_SIZE:
                return

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search recent documents of the configured Wahlperiode.

        ``query`` is a client-side filter on title + Urheber. The server
        returns the configured WP sorted newest first; the client keeps
        only ``Antrag``-typed Drucksachen and applies the title filter.

        Pagination: ParlDok caps each ``Fulltext/Search`` response at 100
        rows. Only ~3% of MV hits are real Anträge (most are Kleine
        Anfragen + Protokolle), so we may need several pages to fill
        ``limit``.
        """
        results: list[Drucksache] = []
        query_terms = [t for t in query.lower().split() if t] if query else []
        # ParlDok returns the same Drucksache multiple times when it
        # appears in several Vorgänge/Beratungen — dedupe by lp/number.
        seen: set[str] = set()
        async with self._make_client() as client:
            # Initial GET establishes the session cookie the backend
            # requires before it accepts POSTs.
            await client.get(f"{self.base_url}{self.prefix}/")
            async for hit in self._paginated_hits(client):
                if hit.get("kind") != "Drucksache":
                    continue
                if self.document_typ and hit.get("type") != self.document_typ:
                    continue
                doc = self._hit_to_drucksache(hit)
                if not doc:
                    continue
                if doc.drucksache in seen:
                    continue
                seen.add(doc.drucksache)
                if query_terms:
                    hay = f"{doc.title} {hit.get('authorhtml', '')}".lower()
                    if not all(t in hay for t in query_terms):
                        continue
                results.append(doc)
                if len(results) >= limit:
                    return results
        return results

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up a single Antrag by ``lp/number`` ID.

        Pragmatic MVP: page through the WP unfiltered until we find a
        match. ParlDok offers a ``facet_number`` (14) facet that would
        let us target the lookup directly, but the facet ID values are
        instance-specific (would require a ``Fulltext/Filter`` discovery
        call) and the WP-wide pagination is fast enough for the typical
        2k-10k Drucksachen per period.
        """
        wanted_lp, wanted_num = (drucksache.split("/", 1) + [""])[:2]
        if not wanted_num:
            return None
        async with self._make_client() as client:
            # Session cookie first — see search().
            await client.get(f"{self.base_url}{self.prefix}/")
            async for hit in self._paginated_hits(client):
                if hit.get("kind") != "Drucksache":
                    continue
                if str(hit.get("lp")) == wanted_lp and str(hit.get("number")) == wanted_num:
                    return self._hit_to_drucksache(hit)
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its text."""
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None
        async with httpx.AsyncClient(
            timeout=60,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    logger.error(
                        "%s PDF HTTP %s for %s (%s)",
                        self.bundesland, resp.status_code, drucksache, doc.link,
                    )
                    return None
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                text = ""
                for page in pdf:
                    text += page.get_text()
                pdf.close()
                return text
            except Exception:
                logger.exception("%s ParlDok download error for %s", self.bundesland, drucksache)
                return None
@ -876,6 +1266,14 @@ ADAPTERS = {
date_window_days=180, date_window_days=180,
pdf_url_prefix="/files/", pdf_url_prefix="/files/",
), ),
"MV": ParLDokAdapter(
bundesland="MV",
name="Landtag Mecklenburg-Vorpommern (ParlDok)",
base_url="https://www.dokumentation.landtag-mv.de",
wahlperiode=8,
prefix="/parldok",
document_typ="Antrag",
),
"BY": BayernAdapter(), "BY": BayernAdapter(),
"BW": BWAdapter(), "BW": BWAdapter(),
} }