Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)
Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.
Notable details:
- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
is no longer reachable via the old form fields — hence a new
adapter rather than reusing the dokukratie scraper.
- Two-stage pagination: Fulltext/Search returns the first 100 hits
+ a queryid; further pages come from Fulltext/Resultpage with
{queryid, limit:{Start,Length}}. The Search endpoint silently
ignores any non-zero Start, so single-stage offset pagination is
not an option.
- Server-side filter via facet_lp (type=10) on the configured WP;
type=Antrag is filtered client-side because the facet_type value
IDs are instance-specific and would require an extra
Fulltext/Filter discovery call. ParlDok also returns the same
Drucksache multiple times when it appears in several
Vorgänge/Beratungen, so search() dedupes by lp/number.
- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
in #4) — analyses run with the federal Grundsatzprogramm fallback,
same as Berlin until #10 lands.
Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1cb030aab7
commit
2b9c0b2908
@ -204,7 +204,18 @@ BUNDESLAENDER: dict[str, Bundesland] = {
|
|||||||
doku_base_url="https://www.dokumentation.landtag-mv.de",
|
doku_base_url="https://www.dokumentation.landtag-mv.de",
|
||||||
drucksache_format="8/1234",
|
drucksache_format="8/1234",
|
||||||
dokukratie_scraper="mv",
|
dokukratie_scraper="mv",
|
||||||
anmerkung="Wahltag offiziell auf 20.09.2026 festgelegt.",
|
aktiv=True,
|
||||||
|
anmerkung=(
|
||||||
|
"ParlDok 8.3.5 (J3S GmbH) — moderne SPA, JSON-API unter "
|
||||||
|
"/parldok/Fulltext/Search. ParLDokAdapter (eigene Implementierung, "
|
||||||
|
"nicht portala-kompatibel). Die in dokukratie/mv.yml beschriebene "
|
||||||
|
"Legacy-HTML-Form (parldok/formalkriterien) ist mit dem 8.x-Upgrade "
|
||||||
|
"deprecated. Suche filtert via facet_lp=10/id=8 server-seitig auf "
|
||||||
|
"WP8, type=Antrag wird client-seitig gefiltert. Wahlprogramme zur "
|
||||||
|
"LTW 26.09.2021 sind noch nicht indexiert (Folge-Issue) — Analyse "
|
||||||
|
"läuft daher mit Grundsatzprogramm-Zitaten als Fallback. Wahltag "
|
||||||
|
"offiziell auf 20.09.2026 festgelegt."
|
||||||
|
),
|
||||||
),
|
),
|
||||||
"NI": Bundesland(
|
"NI": Bundesland(
|
||||||
code="NI",
|
code="NI",
|
||||||
|
|||||||
@ -1,5 +1,7 @@
|
|||||||
"""Parliament search adapters for different German states."""
|
"""Parliament search adapters for different German states."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
import httpx
|
import httpx
|
||||||
import re
|
import re
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
@ -7,6 +9,8 @@ from dataclasses import dataclass
|
|||||||
from typing import Optional
|
from typing import Optional
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Drucksache:
|
class Drucksache:
|
||||||
@ -743,13 +747,13 @@ class PortalaAdapter(ParlamentAdapter):
|
|||||||
headers={"Referer": browse_html},
|
headers={"Referer": browse_html},
|
||||||
)
|
)
|
||||||
if resp.status_code != 200:
|
if resp.status_code != 200:
|
||||||
print(f"{self.bundesland} search HTTP {resp.status_code}")
|
logger.error("%s search HTTP %s", self.bundesland, resp.status_code)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
data = resp.json()
|
data = resp.json()
|
||||||
report_id = data.get("report_id")
|
report_id = data.get("report_id")
|
||||||
if not report_id:
|
if not report_id:
|
||||||
print(f"{self.bundesland}: no report_id in response: {data}")
|
logger.error("%s: no report_id in response: %s", self.bundesland, data)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Step 3: fetch the HTML hit list
|
# Step 3: fetch the HTML hit list
|
||||||
@ -761,14 +765,14 @@ class PortalaAdapter(ParlamentAdapter):
|
|||||||
headers={"Referer": browse_html},
|
headers={"Referer": browse_html},
|
||||||
)
|
)
|
||||||
if report_resp.status_code != 200:
|
if report_resp.status_code != 200:
|
||||||
print(f"{self.bundesland} report HTTP {report_resp.status_code}")
|
logger.error("%s report HTTP %s", self.bundesland, report_resp.status_code)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
results = self._parse_hit_list_html(report_resp.text, query_filter=query)
|
results = self._parse_hit_list_html(report_resp.text, query_filter=query)
|
||||||
return results[:limit]
|
return results[:limit]
|
||||||
|
|
||||||
except Exception as e:
|
except Exception:
|
||||||
print(f"{self.bundesland} search error: {e}")
|
logger.exception("%s search error", self.bundesland)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
async def get_document(self, drucksache: str) -> Optional[Drucksache]:
|
async def get_document(self, drucksache: str) -> Optional[Drucksache]:
|
||||||
@ -806,8 +810,394 @@ class PortalaAdapter(ParlamentAdapter):
|
|||||||
text += page.get_text()
|
text += page.get_text()
|
||||||
pdf.close()
|
pdf.close()
|
||||||
return text
|
return text
|
||||||
except Exception as e:
|
except Exception:
|
||||||
print(f"{self.bundesland} download error for {drucksache}: {e}")
|
logger.exception("%s download error for %s", self.bundesland, drucksache)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class ParLDokAdapter(ParlamentAdapter):
    """Adapter for ParlDok 8.x parliament documentation systems (J3S GmbH).

    ParlDok is a proprietary parliament documentation product by J3S GmbH
    (https://www.j3s.de). Different from the portala/eUI framework used by
    LSA/BE: ParlDok 8.x is a single-page app whose backend is a JSON API
    rooted at ``{base_url}{prefix}/Fulltext/...``. The legacy ParLDok 5.x
    HTML POST form (``parldok/formalkriterien``) used by dokukratie's MV
    YAML scraper has been deprecated by the LandtagMV upgrade to 8.3.5.

    Confirmed instances using this engine (April 2026):

    - **MV** (Mecklenburg-Vorpommern) — ``dokumentation.landtag-mv.de/parldok``
    - HH, SN, TH all advertise ParlDok in dokukratie but their actual
      versions/themes have not been verified yet.

    Search workflow:

    1. ``GET {base_url}{prefix}/`` to obtain the session cookie. The
       backend rejects POSTs without it.
    2. ``POST {base_url}{prefix}/Fulltext/Search`` with form-encoded
       ``data=<json>`` payload. The JSON carries a ``tags`` array of
       facet selections; each tag is ``{"type": <facet_type_int>,
       "id": <facet_value>}``. Reverse-engineered facet type constants
       from the bundle.js (``pd.facet_*``):

       - ``facet_fraction = 2``
       - ``facet_kind = 7`` (Drucksache, Plenarprotokoll, …)
       - ``facet_type = 8`` (Antrag, Gesetzentwurf, Kleine Anfrage, …)
       - ``facet_lp = 10`` (Wahlperiode)

       Response is JSON ``{success, data: <stringified JSON>}`` where the
       inner ``data`` carries ``{count, docs: [{id, title, date,
       authorhtml, kind, type, lp, number, link, ...}], ...}``.

    3. PDF download: ``GET {base_url}{prefix}/dokument/{numeric_id}``.
       Returns ``application/pdf`` directly. The ``link`` field returned
       by the search API already contains the path fragment
       ``/dokument/<id>#navpanes=0`` — strip the fragment and prepend
       the configured ``prefix``.

    Drucksachen-Nummer is reconstructed as ``f"{lp}/{number}"`` from the
    search hit. Full-text search is *not* implemented in this MVP — the
    backend supports it via ``facet_fulltext = 0`` tags but the public
    LP-only filter already returns the relevant Antrag pool. ``query``
    is applied as a client-side title/Urheber filter.
    """

    # Reverse-engineered facet type constants from bundle.js (pd.facet_*).
    FACET_FRACTION = 2
    FACET_KIND = 7
    FACET_TYPE = 8
    FACET_LP = 10

    # ParlDok 8.x caps Length per request at 100 — paginate if needed.
    PAGE_SIZE = 100
    # Safety bound: scan at most 10 pages × 100 = 1000 most recent docs.
    # Anträge are ~3% of all hits in MV, so 1000 raw → ~30 Anträge, more
    # than enough for the typical UI request (limit 5..20). Filtered
    # queries that find nothing in the last 1000 docs return empty
    # rather than scan the entire WP.
    MAX_PAGES = 10

    def __init__(
        self,
        *,
        bundesland: str,
        name: str,
        base_url: str,
        wahlperiode: int,
        prefix: str = "/parldok",
        document_typ: str = "Antrag",
    ) -> None:
        """Configure a ParlDok 8.x adapter for one specific parliament.

        Args:
            bundesland: state code, e.g. ``"MV"``.
            name: human-readable label.
            base_url: ``https://...`` host root, no trailing slash.
            wahlperiode: current legislative period — fed into the
                ``facet_lp`` tag of the search payload.
            prefix: app prefix where ParlDok lives. ``/parldok`` for MV.
            document_typ: client-side filter on the ``type`` field of
                each hit ("Antrag", "Gesetzentwurf", …). Set to empty
                string to disable type filtering.
        """
        self.bundesland = bundesland
        self.name = name
        self.base_url = base_url.rstrip("/")
        self.prefix = "/" + prefix.strip("/")
        self.wahlperiode = wahlperiode
        self.document_typ = document_typ

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        """DD.MM.YYYY → YYYY-MM-DD; '' for empty input."""
        if not datum_de:
            return ""
        try:
            d, m, y = datum_de.split(".")
            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
        except ValueError:
            return ""

    @staticmethod
    def _normalize_fraktion(authorhtml: str) -> list[str]:
        """Map ParlDok ``authorhtml`` to canonical fraction codes.

        ``authorhtml`` may be a comma-separated list of fractions
        ("CDU, SPD, F.D.P."), a single MdL with party in parens
        ("Thomas de Jesus Fernandes (AfD)") or empty (Landesregierung).
        """
        if not authorhtml:
            return []
        u = authorhtml.upper()
        out: list[str] = []
        if re.search(r"\bBÜNDNIS\s*90\b", u) or re.search(r"\bGR(?:Ü|UE)NE\b", u):
            out.append("GRÜNE")
        if re.search(r"\bCDU\b", u):
            out.append("CDU")
        if re.search(r"\bSPD\b", u):
            out.append("SPD")
        # F.D.P. (with dots, historical) and FDP both occur in MV
        if re.search(r"\bF\.?\s*D\.?\s*P\.?\b", u):
            out.append("FDP")
        if re.search(r"\bAFD\b", u):
            out.append("AfD")
        if re.search(r"\bLINKE\b", u) or re.search(r"\bLL/PDS\b", u):
            out.append("LINKE")
        if re.search(r"\bBSW\b", u):
            out.append("BSW")
        if re.search(r"LANDESREGIERUNG|MINISTER\b|STAATSKANZLEI|MINISTERPRÄSIDENT", u):
            out.append("Landesregierung")
        return out

    def _build_search_body(self, *, length: int = 100) -> dict:
        """Build the JSON payload for the initial ``Fulltext/Search`` call.

        Filters by Wahlperiode only. Type/kind filtering happens
        client-side because the facet_type/facet_kind value IDs are
        instance-specific and would require an extra ``Fulltext/Filter``
        round trip to discover. Pagination beyond the first page goes
        through ``Fulltext/Resultpage`` (see ``_result_page``); the
        ``Search`` endpoint itself ignores any non-zero ``Start``.
        """
        return {
            "devicekey": "",
            "max": length,
            "withfilter": False,
            # sort=2 → newest first (date desc); sort=1 is relevance.
            "sort": 2,
            "topk": length,
            "llm": 0,
            "newdocsearch": False,
            "limit": {"Start": 0, "Length": length},
            "tags": [{"type": self.FACET_LP, "id": self.wahlperiode}],
            "updateFilters": [],
        }

    def _hit_to_drucksache(self, hit: dict) -> Optional[Drucksache]:
        """Convert one ParlDok JSON hit to a Drucksache. None if unusable."""
        lp = hit.get("lp")
        number = hit.get("number")
        if not lp or not number:
            return None

        link_field = hit.get("link") or hit.get("prelink") or ""
        # Strip "#navpanes=0" fragment and prepend the prefix.
        path = link_field.split("#", 1)[0]
        pdf_url = f"{self.base_url}{self.prefix}{path}" if path else ""

        return Drucksache(
            drucksache=f"{lp}/{number}",
            title=hit.get("title", ""),
            fraktionen=self._normalize_fraktion(hit.get("authorhtml", "")),
            datum=self._datum_de_to_iso(hit.get("date", "")),
            link=pdf_url,
            bundesland=self.bundesland,
            typ=hit.get("type", "") or hit.get("kind", ""),
        )

    async def _post_json(
        self, client: httpx.AsyncClient, endpoint: str, payload: dict,
    ) -> Optional[dict]:
        """POST a JSON-stringified payload to a ParlDok endpoint.

        ``endpoint`` is the path tail (e.g. ``"Fulltext/Search"`` or
        ``"Fulltext/Resultpage"``). Returns the inner JSON object
        (already parsed from the stringified ``data`` field), or None
        on error.
        """
        homepage = f"{self.base_url}{self.prefix}/"
        url = f"{self.base_url}{self.prefix}/{endpoint}"
        try:
            resp = await client.post(
                url,
                data={"data": json.dumps(payload, ensure_ascii=False)},
                headers={
                    "X-Requested-With": "XMLHttpRequest",
                    "Referer": homepage,
                },
            )
            if resp.status_code != 200:
                logger.error(
                    "%s %s HTTP %s",
                    self.bundesland, endpoint, resp.status_code,
                )
                return None
            outer = resp.json()
            if not outer.get("success"):
                logger.error(
                    "%s %s not successful: %s",
                    self.bundesland, endpoint, outer.get("message"),
                )
                return None
            return json.loads(outer["data"])
        except Exception:
            logger.exception("%s ParlDok %s error", self.bundesland, endpoint)
            return None

    async def _initial_search(
        self, client: httpx.AsyncClient, *, length: int,
    ) -> tuple[Optional[int], list[dict]]:
        """Run the initial ``Fulltext/Search`` and return ``(queryid, docs)``.

        The ``queryid`` is needed for subsequent ``Fulltext/Resultpage``
        calls. ParlDok ignores any non-zero ``Start`` on this endpoint —
        the first 100 hits are the only ones reachable via ``Search``.
        """
        body = self._build_search_body(length=length)
        inner = await self._post_json(client, "Fulltext/Search", body)
        if not inner:
            return None, []
        return inner.get("queryid"), (inner.get("docs") or [])

    async def _result_page(
        self, client: httpx.AsyncClient, *, queryid: int, start: int, length: int,
    ) -> list[dict]:
        """Fetch a further result page via ``Fulltext/Resultpage``."""
        payload = {
            "devicekey": "",
            "queryid": queryid,
            "limit": {"Start": start, "Length": length},
        }
        inner = await self._post_json(client, "Fulltext/Resultpage", payload)
        if not inner:
            return []
        return inner.get("docs") or []

    def _make_client(self) -> httpx.AsyncClient:
        """Fresh HTTP client with the settings all ParlDok calls share."""
        return httpx.AsyncClient(
            timeout=30,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        )

    async def _paginated_hits(
        self, client: httpx.AsyncClient,
    ):
        """Async iterator over Drucksachen-style hits across all pages.

        Yields raw hit dicts in newest-first order. The first batch comes
        from ``Fulltext/Search``, subsequent batches from
        ``Fulltext/Resultpage`` using the queryid the server returned for
        the initial call. Stops when a page comes back empty, undersized,
        or after ``MAX_PAGES`` iterations.
        """
        queryid, hits = await self._initial_search(client, length=self.PAGE_SIZE)
        for hit in hits:
            yield hit
        # `is None` rather than truthiness: a queryid of 0 would be a
        # valid server-issued id and must not stop pagination.
        if queryid is None or len(hits) < self.PAGE_SIZE:
            return

        for page in range(1, self.MAX_PAGES):
            page_hits = await self._result_page(
                client,
                queryid=queryid,
                start=page * self.PAGE_SIZE,
                length=self.PAGE_SIZE,
            )
            if not page_hits:
                return
            for hit in page_hits:
                yield hit
            if len(page_hits) < self.PAGE_SIZE:
                return

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search recent documents of the configured Wahlperiode.

        ``query`` is a client-side filter on title + Urheber. The server
        returns the configured WP sorted newest first; the client keeps
        only ``Antrag``-typed Drucksachen and applies the title filter.

        Pagination: ParlDok caps each ``Fulltext/Search`` response at 100
        rows. Only ~3% of MV hits are real Anträge (most are Kleine
        Anfragen + Protokolle), so we may need several pages to fill
        ``limit``.
        """
        results: list[Drucksache] = []
        query_terms = [t for t in query.lower().split() if t] if query else []
        # ParlDok returns the same Drucksache multiple times when it
        # appears in several Vorgänge/Beratungen — dedupe by lp/number.
        seen: set[str] = set()

        async with self._make_client() as client:
            # Prime the session cookie; the backend rejects cookieless POSTs.
            await client.get(f"{self.base_url}{self.prefix}/")
            async for hit in self._paginated_hits(client):
                if hit.get("kind") != "Drucksache":
                    continue
                if self.document_typ and hit.get("type") != self.document_typ:
                    continue

                doc = self._hit_to_drucksache(hit)
                if not doc:
                    continue
                if doc.drucksache in seen:
                    continue
                seen.add(doc.drucksache)

                if query_terms:
                    hay = f"{doc.title} {hit.get('authorhtml', '')}".lower()
                    if not all(t in hay for t in query_terms):
                        continue

                results.append(doc)
                if len(results) >= limit:
                    return results

        return results

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up a single Antrag by ``lp/number`` ID.

        Pragmatic MVP: page through the WP unfiltered until we find a
        match. ParlDok offers a ``facet_number`` (14) facet that would
        let us target the lookup directly, but the facet ID values are
        instance-specific (would require a ``Fulltext/Filter`` discovery
        call) and the WP-wide pagination is fast enough for the typical
        2k–10k Drucksachen per period.
        """
        wanted_lp, wanted_num = (drucksache.split("/", 1) + [""])[:2]
        if not wanted_num:
            return None

        async with self._make_client() as client:
            # Prime the session cookie; the backend rejects cookieless POSTs.
            await client.get(f"{self.base_url}{self.prefix}/")
            async for hit in self._paginated_hits(client):
                if hit.get("kind") != "Drucksache":
                    continue
                if str(hit.get("lp")) == wanted_lp and str(hit.get("number")) == wanted_num:
                    return self._hit_to_drucksache(hit)
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its text."""
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None

        async with httpx.AsyncClient(
            timeout=60,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    logger.error(
                        "%s PDF HTTP %s for %s (%s)",
                        self.bundesland, resp.status_code, drucksache, doc.link,
                    )
                    return None
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                text = ""
                for page in pdf:
                    text += page.get_text()
                pdf.close()
                return text
            except Exception:
                logger.exception("%s ParlDok download error for %s", self.bundesland, drucksache)
                return None
@ -876,6 +1266,14 @@ ADAPTERS = {
|
|||||||
date_window_days=180,
|
date_window_days=180,
|
||||||
pdf_url_prefix="/files/",
|
pdf_url_prefix="/files/",
|
||||||
),
|
),
|
||||||
|
"MV": ParLDokAdapter(
|
||||||
|
bundesland="MV",
|
||||||
|
name="Landtag Mecklenburg-Vorpommern (ParlDok)",
|
||||||
|
base_url="https://www.dokumentation.landtag-mv.de",
|
||||||
|
wahlperiode=8,
|
||||||
|
prefix="/parldok",
|
||||||
|
document_typ="Antrag",
|
||||||
|
),
|
||||||
"BY": BayernAdapter(),
|
"BY": BayernAdapter(),
|
||||||
"BW": BWAdapter(),
|
"BW": BWAdapter(),
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user