Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)
Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.
Notable details:
- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
is no longer reachable via the old form fields — hence a new
adapter rather than reusing the dokukratie scraper.
- Two-stage pagination: Fulltext/Search returns the first 100 hits
+ a queryid; further pages come from Fulltext/Resultpage with
{queryid, limit:{Start,Length}}. The Search endpoint silently
ignores any non-zero Start, so single-stage offset pagination is
not an option.
- Server-side filter via facet_lp (type=10) on the configured WP;
type=Antrag is filtered client-side because the facet_type value
IDs are instance-specific and would require an extra
Fulltext/Filter discovery call. ParlDok also returns the same
Drucksache multiple times when it appears in several
Vorgänge/Beratungen, so search() dedupes by lp/number.
- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
in #4) — analyses run with the federal Grundsatzprogramm fallback,
same as Berlin until #10 lands.
Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1cb030aab7
commit
2b9c0b2908
@ -204,7 +204,18 @@ BUNDESLAENDER: dict[str, Bundesland] = {
|
||||
doku_base_url="https://www.dokumentation.landtag-mv.de",
|
||||
drucksache_format="8/1234",
|
||||
dokukratie_scraper="mv",
|
||||
anmerkung="Wahltag offiziell auf 20.09.2026 festgelegt.",
|
||||
aktiv=True,
|
||||
anmerkung=(
|
||||
"ParlDok 8.3.5 (J3S GmbH) — moderne SPA, JSON-API unter "
|
||||
"/parldok/Fulltext/Search. ParLDokAdapter (eigene Implementierung, "
|
||||
"nicht portala-kompatibel). Die in dokukratie/mv.yml beschriebene "
|
||||
"Legacy-HTML-Form (parldok/formalkriterien) ist mit dem 8.x-Upgrade "
|
||||
"deprecated. Suche filtert via facet_lp=10/id=8 server-seitig auf "
|
||||
"WP8, type=Antrag wird client-seitig gefiltert. Wahlprogramme zur "
|
||||
"LTW 26.09.2021 sind noch nicht indexiert (Folge-Issue) — Analyse "
|
||||
"läuft daher mit Grundsatzprogramm-Zitaten als Fallback. Wahltag "
|
||||
"offiziell auf 20.09.2026 festgelegt."
|
||||
),
|
||||
),
|
||||
"NI": Bundesland(
|
||||
code="NI",
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
"""Parliament search adapters for different German states."""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import httpx
|
||||
import re
|
||||
from abc import ABC, abstractmethod
|
||||
@ -7,6 +9,8 @@ from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Drucksache:
|
||||
@ -743,13 +747,13 @@ class PortalaAdapter(ParlamentAdapter):
|
||||
headers={"Referer": browse_html},
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
print(f"{self.bundesland} search HTTP {resp.status_code}")
|
||||
logger.error("%s search HTTP %s", self.bundesland, resp.status_code)
|
||||
return []
|
||||
|
||||
data = resp.json()
|
||||
report_id = data.get("report_id")
|
||||
if not report_id:
|
||||
print(f"{self.bundesland}: no report_id in response: {data}")
|
||||
logger.error("%s: no report_id in response: %s", self.bundesland, data)
|
||||
return []
|
||||
|
||||
# Step 3: fetch the HTML hit list
|
||||
@ -761,14 +765,14 @@ class PortalaAdapter(ParlamentAdapter):
|
||||
headers={"Referer": browse_html},
|
||||
)
|
||||
if report_resp.status_code != 200:
|
||||
print(f"{self.bundesland} report HTTP {report_resp.status_code}")
|
||||
logger.error("%s report HTTP %s", self.bundesland, report_resp.status_code)
|
||||
return []
|
||||
|
||||
results = self._parse_hit_list_html(report_resp.text, query_filter=query)
|
||||
return results[:limit]
|
||||
|
||||
except Exception as e:
|
||||
print(f"{self.bundesland} search error: {e}")
|
||||
except Exception:
|
||||
logger.exception("%s search error", self.bundesland)
|
||||
return []
|
||||
|
||||
async def get_document(self, drucksache: str) -> Optional[Drucksache]:
|
||||
@ -806,8 +810,394 @@ class PortalaAdapter(ParlamentAdapter):
|
||||
text += page.get_text()
|
||||
pdf.close()
|
||||
return text
|
||||
except Exception as e:
|
||||
print(f"{self.bundesland} download error for {drucksache}: {e}")
|
||||
except Exception:
|
||||
logger.exception("%s download error for %s", self.bundesland, drucksache)
|
||||
return None
|
||||
|
||||
|
||||
class ParLDokAdapter(ParlamentAdapter):
    """Adapter for ParlDok 8.x parliament documentation systems (J3S GmbH).

    ParlDok is a proprietary parliament documentation product by J3S GmbH
    (https://www.j3s.de). Different from the portala/eUI framework used by
    LSA/BE: ParlDok 8.x is a single-page app whose backend is a JSON API
    rooted at ``{base_url}{prefix}/Fulltext/...``. The legacy ParLDok 5.x
    HTML POST form (``parldok/formalkriterien``) used by dokukratie's MV
    YAML scraper has been deprecated by the LandtagMV upgrade to 8.3.5.

    Confirmed instances using this engine (April 2026):

    - **MV** (Mecklenburg-Vorpommern) — ``dokumentation.landtag-mv.de/parldok``
    - HH, SN, TH all advertise ParlDok in dokukratie but their actual
      versions/themes have not been verified yet.

    Search workflow:

    1. ``GET {base_url}{prefix}/`` to obtain the session cookie. The
       backend rejects POSTs without it.
    2. ``POST {base_url}{prefix}/Fulltext/Search`` with form-encoded
       ``data=<json>`` payload. The JSON carries a ``tags`` array of
       facet selections; each tag is ``{"type": <facet_type_int>,
       "id": <facet_value>}``. Reverse-engineered facet type constants
       from the bundle.js (``pd.facet_*``):

       - ``facet_fraction = 2``
       - ``facet_kind = 7`` (Drucksache, Plenarprotokoll, …)
       - ``facet_type = 8`` (Antrag, Gesetzentwurf, Kleine Anfrage, …)
       - ``facet_lp = 10`` (Wahlperiode)

       Response is JSON ``{success, data: <stringified JSON>}`` where the
       inner ``data`` carries ``{count, docs: [{id, title, date,
       authorhtml, kind, type, lp, number, link, ...}], ...}``.

    3. PDF download: ``GET {base_url}{prefix}/dokument/{numeric_id}``.
       Returns ``application/pdf`` directly. The ``link`` field returned
       by the search API already contains the path fragment
       ``/dokument/<id>#navpanes=0`` — strip the fragment and prepend
       the configured ``prefix``.

    Drucksachen-Nummer is reconstructed as ``f"{lp}/{number}"`` from the
    search hit. Full-text search is *not* implemented in this MVP — the
    backend supports it via ``facet_fulltext = 0`` tags but the public
    LP-only filter already returns the relevant Antrag pool. ``query``
    is applied as a client-side title/Urheber filter.
    """

    # Reverse-engineered facet type constants from bundle.js (pd.facet_*).
    FACET_FRACTION = 2
    FACET_KIND = 7
    FACET_TYPE = 8
    FACET_LP = 10

    # ParlDok 8.x caps Length per request at 100 — paginate if needed.
    PAGE_SIZE = 100
    # Safety bound: scan at most 10 pages × 100 = 1000 most recent docs.
    # Anträge are ~3% of all hits in MV, so 1000 raw → ~30 Anträge, more
    # than enough for the typical UI request (limit 5..20). Filtered
    # queries that find nothing in the last 1000 docs return empty
    # rather than scan the entire WP.
    MAX_PAGES = 10

    def __init__(
        self,
        *,
        bundesland: str,
        name: str,
        base_url: str,
        wahlperiode: int,
        prefix: str = "/parldok",
        document_typ: str = "Antrag",
    ) -> None:
        """Configure a ParlDok 8.x adapter for one specific parliament.

        Args:
            bundesland: state code, e.g. ``"MV"``.
            name: human-readable label.
            base_url: ``https://...`` host root, no trailing slash.
            wahlperiode: current legislative period — fed into the
                ``facet_lp`` tag of the search payload.
            prefix: app prefix where ParlDok lives. ``/parldok`` for MV.
            document_typ: client-side filter on the ``type`` field of
                each hit ("Antrag", "Gesetzentwurf", …). Set to empty
                string to disable type filtering.
        """
        self.bundesland = bundesland
        self.name = name
        self.base_url = base_url.rstrip("/")
        self.prefix = "/" + prefix.strip("/")
        self.wahlperiode = wahlperiode
        self.document_typ = document_typ

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        """DD.MM.YYYY → YYYY-MM-DD; '' for empty input."""
        if not datum_de:
            return ""
        try:
            d, m, y = datum_de.split(".")
            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
        except ValueError:
            return ""

    @staticmethod
    def _normalize_fraktion(authorhtml: str) -> list[str]:
        """Map ParlDok ``authorhtml`` to canonical fraction codes.

        ``authorhtml`` may be a comma-separated list of fractions
        ("CDU, SPD, F.D.P."), a single MdL with party in parens
        ("Thomas de Jesus Fernandes (AfD)") or empty (Landesregierung).
        """
        if not authorhtml:
            return []
        u = authorhtml.upper()
        out: list[str] = []
        if re.search(r"\bBÜNDNIS\s*90\b", u) or re.search(r"\bGR(?:Ü|UE)NE\b", u):
            out.append("GRÜNE")
        if re.search(r"\bCDU\b", u):
            out.append("CDU")
        if re.search(r"\bSPD\b", u):
            out.append("SPD")
        # F.D.P. (with dots, historical) and FDP both occur in MV
        if re.search(r"\bF\.?\s*D\.?\s*P\.?\b", u):
            out.append("FDP")
        if re.search(r"\bAFD\b", u):
            out.append("AfD")
        if re.search(r"\bLINKE\b", u) or re.search(r"\bLL/PDS\b", u):
            out.append("LINKE")
        if re.search(r"\bBSW\b", u):
            out.append("BSW")
        if re.search(r"LANDESREGIERUNG|MINISTER\b|STAATSKANZLEI|MINISTERPRÄSIDENT", u):
            out.append("Landesregierung")
        return out

    def _build_search_body(self, *, length: int = 100) -> dict:
        """Build the JSON payload for the initial ``Fulltext/Search`` call.

        Filters by Wahlperiode only. Type/kind filtering happens
        client-side because the facet_type/facet_kind value IDs are
        instance-specific and would require an extra ``Fulltext/Filter``
        round trip to discover. Pagination beyond the first page goes
        through ``Fulltext/Resultpage`` (see ``_result_page``); the
        ``Search`` endpoint itself ignores any non-zero ``Start``.
        """
        return {
            "devicekey": "",
            "max": length,
            "withfilter": False,
            # sort=2 → newest first (date desc); sort=1 is relevance.
            "sort": 2,
            "topk": length,
            "llm": 0,
            "newdocsearch": False,
            "limit": {"Start": 0, "Length": length},
            "tags": [{"type": self.FACET_LP, "id": self.wahlperiode}],
            "updateFilters": [],
        }

    def _hit_to_drucksache(self, hit: dict) -> Optional[Drucksache]:
        """Convert one ParlDok JSON hit to a Drucksache. None if unusable."""
        lp = hit.get("lp")
        number = hit.get("number")
        if not lp or not number:
            return None

        link_field = hit.get("link") or hit.get("prelink") or ""
        # Strip "#navpanes=0" fragment and prepend the prefix.
        path = link_field.split("#", 1)[0]
        pdf_url = f"{self.base_url}{self.prefix}{path}" if path else ""

        return Drucksache(
            drucksache=f"{lp}/{number}",
            title=hit.get("title", ""),
            fraktionen=self._normalize_fraktion(hit.get("authorhtml", "")),
            datum=self._datum_de_to_iso(hit.get("date", "")),
            link=pdf_url,
            bundesland=self.bundesland,
            typ=hit.get("type", "") or hit.get("kind", ""),
        )

    async def _post_json(
        self, client: httpx.AsyncClient, endpoint: str, payload: dict,
    ) -> Optional[dict]:
        """POST a JSON-stringified payload to a ParlDok endpoint.

        ``endpoint`` is the path tail (e.g. ``"Fulltext/Search"`` or
        ``"Fulltext/Resultpage"``). Returns the inner JSON object
        (already parsed from the stringified ``data`` field), or None
        on error.
        """
        homepage = f"{self.base_url}{self.prefix}/"
        url = f"{self.base_url}{self.prefix}/{endpoint}"
        try:
            resp = await client.post(
                url,
                data={"data": json.dumps(payload, ensure_ascii=False)},
                headers={
                    "X-Requested-With": "XMLHttpRequest",
                    "Referer": homepage,
                },
            )
            if resp.status_code != 200:
                logger.error(
                    "%s %s HTTP %s",
                    self.bundesland, endpoint, resp.status_code,
                )
                return None
            outer = resp.json()
            if not outer.get("success"):
                logger.error(
                    "%s %s not successful: %s",
                    self.bundesland, endpoint, outer.get("message"),
                )
                return None
            return json.loads(outer["data"])
        except Exception:
            logger.exception("%s ParlDok %s error", self.bundesland, endpoint)
            return None

    async def _initial_search(
        self, client: httpx.AsyncClient, *, length: int,
    ) -> tuple[Optional[int], list[dict]]:
        """Run the initial ``Fulltext/Search`` and return ``(queryid, docs)``.

        The ``queryid`` is needed for subsequent ``Fulltext/Resultpage``
        calls. ParlDok ignores any non-zero ``Start`` on this endpoint —
        the first 100 hits are the only ones reachable via ``Search``.
        """
        body = self._build_search_body(length=length)
        inner = await self._post_json(client, "Fulltext/Search", body)
        if not inner:
            return None, []
        return inner.get("queryid"), (inner.get("docs") or [])

    async def _result_page(
        self, client: httpx.AsyncClient, *, queryid: int, start: int, length: int,
    ) -> list[dict]:
        """Fetch a further result page via ``Fulltext/Resultpage``."""
        payload = {
            "devicekey": "",
            "queryid": queryid,
            "limit": {"Start": start, "Length": length},
        }
        inner = await self._post_json(client, "Fulltext/Resultpage", payload)
        if not inner:
            return []
        return inner.get("docs") or []

    def _make_client(self) -> httpx.AsyncClient:
        """Build the shared httpx client used for all ParlDok calls."""
        return httpx.AsyncClient(
            timeout=30,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        )

    async def _prime_session(self, client: httpx.AsyncClient) -> None:
        """GET the app homepage once to obtain the session cookie.

        The backend rejects ``Fulltext/*`` POSTs without it; shared by
        ``search()`` and ``get_document()``.
        """
        await client.get(f"{self.base_url}{self.prefix}/")

    async def _paginated_hits(
        self, client: httpx.AsyncClient,
    ):
        """Async iterator over Drucksachen-style hits across all pages.

        Yields raw hit dicts in newest-first order. The first batch comes
        from ``Fulltext/Search``, subsequent batches from
        ``Fulltext/Resultpage`` using the queryid the server returned for
        the initial call. Stops when a page comes back empty, undersized,
        or after ``MAX_PAGES`` iterations.
        """
        queryid, hits = await self._initial_search(client, length=self.PAGE_SIZE)
        for hit in hits:
            yield hit
        # queryid may legitimately be falsy (e.g. 0) — only a missing id
        # stops pagination.
        if queryid is None or len(hits) < self.PAGE_SIZE:
            return

        for page in range(1, self.MAX_PAGES):
            page_hits = await self._result_page(
                client,
                queryid=queryid,
                start=page * self.PAGE_SIZE,
                length=self.PAGE_SIZE,
            )
            if not page_hits:
                return
            for hit in page_hits:
                yield hit
            if len(page_hits) < self.PAGE_SIZE:
                return

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search recent documents of the configured Wahlperiode.

        ``query`` is a client-side filter on title + Urheber. The server
        returns the configured WP sorted newest first; the client keeps
        only ``Antrag``-typed Drucksachen and applies the title filter.

        Pagination: ParlDok caps each ``Fulltext/Search`` response at 100
        rows. Only ~3% of MV hits are real Anträge (most are Kleine
        Anfragen + Protokolle), so we may need several pages to fill
        ``limit``.
        """
        results: list[Drucksache] = []
        query_terms = [t for t in query.lower().split() if t] if query else []
        # ParlDok returns the same Drucksache multiple times when it
        # appears in several Vorgänge/Beratungen — dedupe by lp/number.
        seen: set[str] = set()

        async with self._make_client() as client:
            await self._prime_session(client)
            async for hit in self._paginated_hits(client):
                if hit.get("kind") != "Drucksache":
                    continue
                if self.document_typ and hit.get("type") != self.document_typ:
                    continue

                doc = self._hit_to_drucksache(hit)
                if not doc:
                    continue
                if doc.drucksache in seen:
                    continue
                seen.add(doc.drucksache)

                if query_terms:
                    hay = f"{doc.title} {hit.get('authorhtml', '')}".lower()
                    if not all(t in hay for t in query_terms):
                        continue

                results.append(doc)
                if len(results) >= limit:
                    return results

        return results

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up a single Antrag by ``lp/number`` ID.

        Pragmatic MVP: page through the WP unfiltered until we find a
        match. ParlDok offers a ``facet_number`` (14) facet that would
        let us target the lookup directly, but the facet ID values are
        instance-specific (would require a ``Fulltext/Filter`` discovery
        call) and the WP-wide pagination is fast enough for the typical
        2k–10k Drucksachen per period.
        """
        wanted_lp, wanted_num = (drucksache.split("/", 1) + [""])[:2]
        if not wanted_num:
            return None

        async with self._make_client() as client:
            await self._prime_session(client)
            async for hit in self._paginated_hits(client):
                if hit.get("kind") != "Drucksache":
                    continue
                if str(hit.get("lp")) == wanted_lp and str(hit.get("number")) == wanted_num:
                    return self._hit_to_drucksache(hit)
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its text."""
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None

        async with httpx.AsyncClient(
            timeout=60,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    logger.error(
                        "%s PDF HTTP %s for %s (%s)",
                        self.bundesland, resp.status_code, drucksache, doc.link,
                    )
                    return None
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                # join once instead of quadratic `text += ...` per page
                text = "".join(page.get_text() for page in pdf)
                pdf.close()
                return text
            except Exception:
                logger.exception("%s ParlDok download error for %s", self.bundesland, drucksache)
                return None
||||
|
||||
|
||||
@ -876,6 +1266,14 @@ ADAPTERS = {
|
||||
date_window_days=180,
|
||||
pdf_url_prefix="/files/",
|
||||
),
|
||||
"MV": ParLDokAdapter(
|
||||
bundesland="MV",
|
||||
name="Landtag Mecklenburg-Vorpommern (ParlDok)",
|
||||
base_url="https://www.dokumentation.landtag-mv.de",
|
||||
wahlperiode=8,
|
||||
prefix="/parldok",
|
||||
document_typ="Antrag",
|
||||
),
|
||||
"BY": BayernAdapter(),
|
||||
"BW": BWAdapter(),
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user