Real server-side fulltext search through the eUI sf-Index requires reverse-engineering the LSA/BE-specific search field (the obvious candidates VOLL, VOLL.main, WEV62 and bare-term-without-sf all return zero hits when probed). Without browser DevTools to capture a real fulltext request, that's a multi-hour project — split out to remain in #13 as a follow-up.

This commit ships the pragmatic interim fix from #11:

- BE date_window_days: 180 → 730. Berlin had a tight default window because PARDOK has ~10x more documents than PADOKA. With the bigger window the client-side title/Urheber filter reaches back across most of WP19 instead of just the last six months.
- chunksize logic in PortalaAdapter.search() inverted from "small when query, big when no query" to the opposite. The query-filtered path now pulls up to max(limit*10, 500) records per page so the title filter has enough material; the unfiltered browse path stays at max(limit*2, 100).
- httpx timeout 30s → 60s. LSA's report.tt.html occasionally takes 30+s on cold start; warm requests are <10s.

Smoke test (local):

- BE "Schule": 15 hits (was 0)
- LSA "Schule": 14 hits (was N/A; same path)

Live verification follows after deploy.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
"""Parliament search adapters for different German states."""
|
||
|
||
import json
|
||
import logging
|
||
import httpx
|
||
import re
|
||
from abc import ABC, abstractmethod
|
||
from dataclasses import dataclass
|
||
from typing import Optional
|
||
from bs4 import BeautifulSoup
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
@dataclass
|
||
class Drucksache:
|
||
"""A parliamentary document."""
|
||
drucksache: str # e.g. "18/8125"
|
||
title: str
|
||
fraktionen: list[str]
|
||
datum: str # ISO date
|
||
link: str # PDF URL
|
||
bundesland: str
|
||
typ: str = "Antrag" # Antrag, Anfrage, Beschlussempfehlung, etc.


class ParlamentAdapter(ABC):
    """Base adapter for searching parliament documents."""

    bundesland: str
    name: str

    @abstractmethod
    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search for documents matching query."""
        pass

    @abstractmethod
    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Get a specific document by ID."""
        pass

    @abstractmethod
    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download and extract text from a document."""
        pass


class NRWAdapter(ParlamentAdapter):
    """Adapter for NRW Landtag (opal.landtag.nrw.de)."""

    bundesland = "NRW"
    name = "Landtag Nordrhein-Westfalen"
    base_url = "https://opal.landtag.nrw.de"
    search_url = "https://opal.landtag.nrw.de/home/dokumente/dokumentensuche/parlamentsdokumente/aktuelle-dokumente.html"

    def _parse_query(self, query: str) -> tuple[str, list[str], bool]:
        """
        Parse search query for AND logic and exact phrases.
        Returns: (search_term_for_api, filter_terms, is_exact)

        Examples:
        - 'Klimaschutz Energie' -> ('Klimaschutz', ['klimaschutz', 'energie'], False)
        - '"Grüner Stahl"' -> ('Grüner Stahl', ['grüner stahl'], True)
        - 'Klimaschutz "erneuerbare Energie"' -> ('Klimaschutz', ['klimaschutz', 'erneuerbare energie'], False)
        """
        query = query.strip()

        # Check for exact phrase (entire query in quotes)
        if query.startswith('"') and query.endswith('"') and query.count('"') == 2:
            exact = query[1:-1].strip()
            return (exact, [exact.lower()], True)

        # Extract quoted phrases and regular terms
        import shlex
        try:
            parts = shlex.split(query)
        except ValueError:
            # Fallback for unbalanced quotes
            parts = query.split()

        if not parts:
            return (query, [query.lower()], False)

        # Use first term for API search, all terms for filtering
        filter_terms = [p.lower() for p in parts]
        return (parts[0], filter_terms, False)

    def _matches_all_terms(self, doc: 'Drucksache', terms: list[str], is_exact: bool) -> bool:
        """Check if document matches all search terms (AND logic)."""
        searchable = f"{doc.title} {doc.drucksache} {' '.join(doc.fraktionen)} {doc.typ}".lower()

        if is_exact:
            # Exact phrase must appear
            return terms[0] in searchable
        else:
            # All terms must appear (AND)
            return all(term in searchable for term in terms)

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search NRW Landtag documents via OPAL portal."""
        results = []

        # Parse query for AND logic
        api_query, filter_terms, is_exact = self._parse_query(query)

        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            try:
                # First, get the page to establish session
                initial = await client.get(self.search_url)
                if initial.status_code != 200:
                    print(f"NRW search initial request failed: {initial.status_code}")
                    return []

                # Parse for webflow token from pagination links
                soup = BeautifulSoup(initial.text, 'html.parser')

                # Find a pagination link to extract the webflow token
                pagination_link = soup.select_one('a[href*="webflowexecution"]')
                webflow_token = ""
                webflow_execution = ""

                if pagination_link:
                    href = pagination_link.get('href', '')
                    # Extract webflowToken and webflowexecution from URL
                    token_match = re.search(r'webflowToken=([^&]*)', href)
                    exec_match = re.search(r'(webflowexecution[^=]+)=([^&]+)', href)
                    if token_match:
                        webflow_token = token_match.group(1)
                    if exec_match:
                        webflow_execution = f"{exec_match.group(1)}={exec_match.group(2)}"

                # Now perform the search with POST
                # Find the form action URL with webflow token
                form = soup.select_one('form#docSearchByItem')
                form_action = self.search_url
                if form and form.get('action'):
                    action = form.get('action')
                    if action.startswith('/'):
                        form_action = f"{self.base_url}{action}"
                    elif action.startswith('http'):
                        form_action = action
                    else:
                        form_action = f"{self.search_url}?{action}"

                # Build form data for "Einfache Suche" (searchByItem form)
                form_data = {
                    '_eventId_sendform': '1',
                    'dokNum': api_query,  # This is the text search field
                    'formId': 'searchByItem',
                    'dokTyp': '',  # All types
                    'wp': '18',  # Wahlperiode 18
                }

                # POST request with form data to the form action URL
                search_resp = await client.post(
                    form_action,
                    data=form_data,
                    cookies=initial.cookies,
                    headers={'Content-Type': 'application/x-www-form-urlencoded'}
                )

                if search_resp.status_code != 200:
                    print(f"NRW search request failed: {search_resp.status_code}")
                    return []

                # Parse results
                soup = BeautifulSoup(search_resp.text, 'html.parser')

                # Find all document result items (li elements containing articles)
                items = soup.select('li:has(article)')

                for item in items[:limit]:
                    try:
                        # Extract drucksache number from first link
                        num_link = item.select_one('a[href*="MMD"]')
                        if not num_link:
                            continue

                        href = num_link.get('href', '')
                        # Extract number: MMD18-12345.pdf -> 18/12345
                        match = re.search(r'MMD(\d+)-(\d+)\.pdf', href)
                        if not match:
                            continue

                        legislatur, nummer = match.groups()
                        drucksache = f"{legislatur}/{nummer}"
                        pdf_url = f"https://www.landtag.nrw.de{href}" if href.startswith('/') else href

                        # Extract title from the title link (class e-document-result-item__title)
                        title_elem = item.select_one('a.e-document-result-item__title')
                        if title_elem:
                            # Get text content, clean it up
                            title = title_elem.get_text(strip=True)
                            # Remove SVG icon text and clean
                            title = re.sub(r'\s*<svg.*', '', title)
                            title = re.sub(r'\s+', ' ', title).strip()
                        else:
                            # Fallback: try to find any longer text
                            title = f"Drucksache {drucksache}"

                        # Clean up common artifacts
                        title = re.sub(r'\s*\(\s*externer Link.*?\)', '', title).strip()

                        # Extract type (Antrag, Kleine Anfrage, etc.)
                        typ_elem = item.select_one('.e-document-result-item__category')
                        typ = typ_elem.get_text(strip=True) if typ_elem else "Drucksache"

                        # Extract date
                        time_elem = item.select_one('time')
                        datum = ""
                        if time_elem:
                            datum_text = time_elem.get_text(strip=True)
                            # Convert DD.MM.YYYY to YYYY-MM-DD
                            date_match = re.match(r'(\d{2})\.(\d{2})\.(\d{4})', datum_text)
                            if date_match:
                                d, m, y = date_match.groups()
                                datum = f"{y}-{m}-{d}"

                        # Extract Urheber (fraktionen) - look for paragraph containing "Urheber:"
                        urheber_text = ""
                        for p in item.select('p'):
                            if 'Urheber:' in p.get_text():
                                urheber_text = p.get_text()
                                break

                        fraktionen = []
                        if urheber_text:
                            # Extract party names (SPD, CDU, GRÜNE, FDP, AfD)
                            for party in ['SPD', 'CDU', 'GRÜNE', 'Grüne', 'FDP', 'AfD']:
                                if party in urheber_text:
                                    fraktionen.append(party.upper() if party.lower() != 'grüne' else 'GRÜNE')

                        doc = Drucksache(
                            drucksache=drucksache,
                            title=title,
                            fraktionen=fraktionen,
                            datum=datum,
                            link=pdf_url,
                            bundesland="NRW",
                            typ=typ,
                        )

                        # Apply AND filter (all terms must match)
                        if self._matches_all_terms(doc, filter_terms, is_exact):
                            results.append(doc)

                    except Exception as e:
                        print(f"Error parsing item: {e}")
                        continue

            except Exception as e:
                print(f"NRW search error: {e}")

        return results

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Get document metadata by drucksache ID (e.g. '18/8125')."""
        # Parse legislatur and number
        match = re.match(r"(\d+)/(\d+)", drucksache)
        if not match:
            return None

        legislatur, nummer = match.groups()
        pdf_url = f"https://www.landtag.nrw.de/portal/WWW/dokumentenarchiv/Dokument/MMD{legislatur}-{nummer}.pdf"

        # Try to fetch and extract basic info
        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            try:
                resp = await client.head(pdf_url)
                if resp.status_code == 200:
                    return Drucksache(
                        drucksache=drucksache,
                        title=f"Drucksache {drucksache}",
                        fraktionen=[],
                        datum="",
                        link=pdf_url,
                        bundesland="NRW",
                    )
            except Exception:
                pass

        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download PDF and extract text."""
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc:
            return None

        async with httpx.AsyncClient(timeout=60, follow_redirects=True) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    return None

                # Extract text with PyMuPDF
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                text = ""
                for page in pdf:
                    text += page.get_text()
                pdf.close()

                return text
            except Exception as e:
                print(f"Error downloading {drucksache}: {e}")
                return None


class PortalaAdapter(ParlamentAdapter):
    """Adapter for portala/eUI-based parliament documentation systems.

    Used by parliaments running the proprietary "esearch" / portala framework
    (originally developed for STAR/StarFinder backends, now wrapped in a
    Single-Page App with Template Toolkit on the server side):

    - **LSA** (Sachsen-Anhalt) — PADOKA at ``padoka.landtag.sachsen-anhalt.de``
      under ``/portal/`` (singular)
    - **BE** (Berlin) — PARDOK at ``pardok.parlament-berlin.de`` under
      ``/portala/`` (with the trailing 'a')

    Both instances share the same JSON action schema; only the base URL,
    the data source ID, the application path prefix and a few minor
    quirks differ — those are constructor parameters so that the same
    class can serve both states (and any future portala-based parliament).

    The search workflow is two-stage:

    1. ``POST {base}{path}/browse.tt.json`` with a complex JSON ``action``
       body that contains an Elasticsearch-style query tree under
       ``search.json``. The server returns a ``report_id`` plus hit count.
    2. ``POST {base}{path}/report.tt.html`` with ``{report_id, start,
       chunksize}`` to fetch the HTML hit list. Each hit carries a Perl
       Data::Dumper block in a ``<pre>`` tag with the canonical metadata.

    The query body schema was reverse-engineered from
    https://github.com/okfde/dokukratie/blob/main/dokukratie/scrapers/portala.query.json
    (GPL-3.0 — only structure/selectors are reused, not Python code).

    Full-text search is **not** implemented in the MVP: the adapter
    returns documents of the current Wahlperiode in the given date
    window, and the search query is applied as a client-side
    title/Urheber filter. The server-side full-text path requires
    state-specific ``sf`` index names that are not yet known.
    """

    def __init__(
        self,
        *,
        bundesland: str,
        name: str,
        base_url: str,
        db_id: str,
        wahlperiode: int,
        portala_path: str = "/portal",
        document_type: Optional[str] = "Antrag",
        pdf_url_prefix: str = "/files/",
        date_window_days: int = 730,
    ) -> None:
        """Configure a portala/eUI adapter for one specific parliament.

        Args:
            bundesland: state code (e.g. ``"LSA"``, ``"BE"``).
            name: human-readable adapter label (used in logs/UI).
            base_url: ``https://...`` of the portal host without trailing slash.
            db_id: data source identifier the eUI server expects in
                ``action.sources``, e.g. ``"lsa.lissh"`` or ``"lah.lissh"``.
            wahlperiode: current legislative period — fed into the WP
                term of the search tree.
            portala_path: path prefix where the portala app lives. ``/portal``
                for LSA, ``/portala`` for Berlin.
            document_type: optional filter applied via ETYPF/DTYPF/DART
                terms. ``"Antrag"`` works for LSA; for instances where
                the index uses different document_type values (e.g. Berlin),
                pass ``None`` to drop the document_type subtree entirely
                — the user can still filter client-side by title.
            pdf_url_prefix: URL fragment between ``base_url`` and the
                relative PDF path returned by the server.
            date_window_days: how many days back ``search()`` looks by
                default.
        """
        self.bundesland = bundesland
        self.name = name
        self.base_url = base_url.rstrip("/")
        self.db_id = db_id
        self.wahlperiode = wahlperiode
        self.portala_path = "/" + portala_path.strip("/")
        self.document_type = document_type
        self.pdf_url_prefix = "/" + pdf_url_prefix.strip("/") + "/"
        self.date_window_days = date_window_days

    # ── LSA-style hit list (Perl Data::Dumper inside <pre> blocks) ──
    # Reverse-engineered "WEV*" record fields:
    #   WEV06.main = title
    #   WEV32.5    = relative PDF path
    #   WEV32.main = "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b> ..."
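    #
    # For orientation, a hypothetical (heavily abridged) dump block that the
    # regexes below would match; the exact field layout on the live system
    # may differ:
    #   <pre>$VAR1 = {
    #     'WEV06' => [ { 'main' => 'Titel des Antrags' } ],
    #     'WEV32' => [ { '5' => 'drs/8-1234.pdf',
    #                    'main' => 'Antrag Fraktion CDU 01.02.2025 Drucksache <b>8/1234</b> ...' } ],
    #   };</pre>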
    _RE_TITLE = re.compile(r"'WEV06'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
    _RE_PDF = re.compile(r"'5'\s*=>\s*'([^']*\.pdf)'")
    _RE_DRUCKSACHE = re.compile(r"Drucksache\s*<b>(\d+/\d+)</b>")
    _RE_URHEBER_DATUM = re.compile(
        r"'WEV32'\s*=>\s*\[\s*\{[^}]*'main'\s*=>\s*[\"']Antrag\s+(.+?)\s+(\d{1,2}\.\d{1,2}\.\d{4})\s+Drucksache",
    )
    _RE_PRE_BLOCK = re.compile(r'<pre>\$VAR1 = (.*?)</pre>', re.DOTALL)

    # ── Berlin-style hit list (production HTML cards, no Perl dump) ──
    # The whole div for one record:
    _RE_BE_RECORD = re.compile(
        r'<div[^>]*class="[^"]*efxRecordRepeater[^"]*"[^>]*data-efx-rec="[^"]*"[^>]*>(.*?)(?=<div[^>]*efxRecordRepeater|<div[^>]*id="efxResultsEnd"|</main>|$)',
        re.DOTALL,
    )
    _RE_BE_TITLE = re.compile(r'<h3[^>]*class="h5[^"]*"[^>]*>\s*<span>([^<]+)</span>')
    _RE_BE_LINK = re.compile(r'<a[^>]*href="([^"]+\.pdf)"[^>]*>')
    # The metadata h6 looks like:
    #   <span class="h6">Antrag (Eilantrag) <a ...>Drucksache 19/3104</a> S. 1 bis 24 vom 31.03.2026</span>
    _RE_BE_DRUCKSACHE = re.compile(r'Drucksache\s+(\d+/\d+)')
    _RE_BE_DATUM = re.compile(r'vom\s+(\d{1,2}\.\d{1,2}\.\d{4})')
    _RE_BE_DOCTYPE = re.compile(r'<span class="h6">\s*([^<&]+?)(?:&nbsp;|<)')

    @staticmethod
    def _decode_perl_hex(s: str) -> str:
        """Decode \\x{abcd} escape sequences from Perl Data::Dumper output."""
        return re.sub(r'\\x\{([0-9a-f]+)\}', lambda m: chr(int(m.group(1), 16)), s)

    @staticmethod
    def _normalize_fraktion(urheber: str) -> list[str]:
        """Map Urheber-String to canonical fraction codes.

        Uses regex word boundaries instead of plain substring matching so
        that comma-separated lists ("CDU, SPD") and the embedded "DIE
        LINKE" are matched reliably.
        """
        u = urheber.upper()
        out: list[str] = []

        def has(pattern: str) -> bool:
            return re.search(pattern, u) is not None

        if has(r"\bBÜNDNIS\s*90\b") or has(r"\bGR(?:Ü|UE)NE\b"):
            out.append("GRÜNE")
        if has(r"\bCDU\b"):
            out.append("CDU")
        if has(r"\bSPD\b"):
            out.append("SPD")
        if has(r"\bFDP\b"):
            out.append("FDP")
        if has(r"\bAFD\b"):
            out.append("AfD")
        if has(r"\bLINKE\b"):
            out.append("LINKE")
        if has(r"\bBSW\b"):
            out.append("BSW")
        if has(r"LANDESREGIERUNG|SENAT VON BERLIN|REGIERENDE[RN]?\s+BÜRGERMEISTER|MINISTER\b|STAATSKANZLEI"):
            out.append("Landesregierung")
        return out

    def _build_search_body(
        self,
        wahlperiode: int,
        start_date: str,
        end_date: str,
    ) -> dict:
        """Build the action JSON body for browse.tt.json.

        The schema is taken from dokukratie's portala.query.json template
        and only differs in the data source and the variable substitutions.
        When ``self.document_type`` is None, the ETYPF/DTYPF/DART subtree
        is dropped — useful for parliaments whose ETYPF index uses
        different value strings than ``"Antrag"``.
        """
        document_type = self.document_type
        date_range_text = f"{start_date} THRU {end_date}"
        date_term = lambda sf, num: {  # noqa: E731 — local helper
            "tn": "trange", "sf": sf, "op": "eq", "num": num,
            "idx": 119, "l": 3,
            "p1": start_date, "t1": start_date,
            "p2": end_date, "t2": end_date,
            "t": date_range_text,
        }

        # Build the search.lines (form-state mirror) and the json tree
        lines: dict = {
            "2": str(wahlperiode),
            "10": start_date,
            "11": end_date,
            "20.1": "alWEBBI",
            "20.2": "alWEBBI",
            "20.3": "alWEBBI",
            "90.1": "AND",
            "90.2": "AND",
            "90.3": "AND",
        }
        if document_type is not None:
            lines["3"] = document_type
            lines["4"] = "D"

        # Top-level AND tree
        top_terms: list = [
            {"tn": "term", "t": str(wahlperiode), "idx": 6, "l": 3,
             "sf": "WP", "op": "eq", "num": 5},
        ]

        if document_type is not None:
            top_terms.append({"tn": "or", "num": 3, "terms": [
                {"tn": "or", "num": 4, "terms": [
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "ETYPF", "op": "eq", "num": 10},
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "ETYP2F", "op": "eq", "num": 11},
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "DTYPF", "op": "eq", "num": 12},
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "DTYP2F", "op": "eq", "num": 13},
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "1VTYPF", "op": "eq", "num": 14},
                ]},
                {"tn": "or", "num": 15, "terms": [
                    {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
                     "sf": "DART", "op": "eq", "num": 16},
                    {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
                     "sf": "DARTS", "op": "eq", "num": 17},
                ]},
            ]})

        top_terms.append({"tn": "or", "num": 18, "terms": [
            {"tn": "or", "num": 19, "terms": [
                date_term("DAT", 20),
                date_term("DDAT", 21),
            ]},
            date_term("SDAT", 22),
        ]})
        top_terms.append({"tn": "term", "t": "DOKDBE", "idx": 156, "l": 1,
                          "sf": "TYP", "op": "eq", "num": 23})

        # Mirror the same shape into the parsed/sref display strings
        if document_type is not None:
            parsed = (
                f"((/WP {wahlperiode}) AND "
                f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) "
                f"AND (/DART,DARTS (\"D\")) AND "
                f"(DAT,DDAT,SDAT= {date_range_text})) AND TYP=DOKDBE"
            )
        else:
            parsed = (
                f"((/WP {wahlperiode}) AND "
                f"(DAT,DDAT,SDAT= {date_range_text})) AND TYP=DOKDBE"
            )

        return {
            "action": "SearchAndDisplay",
            "sources": [self.db_id],
            "report": {
                "rhl": "main",
                "rhlmode": "add",
                "format": "generic1-full",
                "mime": "html",
                "sort": "WEVSO1/D WEVSO2 WEVSO3",
            },
            "search": {
                "lines": lines,
                "serverrecordname": "sr_generic1",
                "parsed": parsed,
                "sref": parsed,
                "json": [{
                    "tn": "and",
                    "num": 1,
                    "terms": top_terms,
                }],
            },
            "dataSet": "1",
        }

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        """Convert DD.MM.YYYY → YYYY-MM-DD; return '' for empty input."""
        if not datum_de:
            return ""
        d, m, y = datum_de.split(".")
        return f"{y}-{m.zfill(2)}-{d.zfill(2)}"

    def _parse_hit_list_html(self, html: str, query_filter: str = "") -> list[Drucksache]:
        """Extract Drucksachen from a report.tt.html response.

        Two formats are supported and auto-detected:

        - **LSA-style:** the records are embedded as Perl Data::Dumper
          dumps inside ``<pre>$VAR1 = …</pre>`` blocks. WEV06 → title,
          WEV32 → metadata + PDF path. Used by Sachsen-Anhalt's PADOKA
          template.
        - **Berlin-style:** standard production HTML cards with
          ``efxRecordRepeater`` divs. Title in an ``<h3 class="h5">``,
          metadata + PDF link in a ``<span class="h6">``. Used by
          Berlin's PARDOK template.
        """
        if self._RE_PRE_BLOCK.search(html):
            return self._parse_hit_list_dump(html, query_filter)
        return self._parse_hit_list_cards(html, query_filter)

    def _parse_hit_list_dump(self, html: str, query_filter: str) -> list[Drucksache]:
        """Parse LSA-style ``<pre>$VAR1 = …</pre>`` Perl-dump records."""
        results: list[Drucksache] = []
        for pre in self._RE_PRE_BLOCK.findall(html):
            m_ds = self._RE_DRUCKSACHE.search(pre)
            if not m_ds:
                continue
            drucksache = m_ds.group(1)

            m_t = self._RE_TITLE.search(pre)
            title = self._decode_perl_hex(m_t.group(1)) if m_t else f"Drucksache {drucksache}"

            m_pdf = self._RE_PDF.search(pre)
            pdf_rel = m_pdf.group(1) if m_pdf else ""
            pdf_url = f"{self.base_url}{self.pdf_url_prefix}{pdf_rel}" if pdf_rel else ""

            m_w32 = self._RE_URHEBER_DATUM.search(pre)
            urheber = self._decode_perl_hex(m_w32.group(1).strip()) if m_w32 else ""
            datum_iso = self._datum_de_to_iso(m_w32.group(2) if m_w32 else "")
            fraktionen = self._normalize_fraktion(urheber) if urheber else []

            doc = Drucksache(
                drucksache=drucksache,
                title=title,
                fraktionen=fraktionen,
                datum=datum_iso,
                link=pdf_url,
                bundesland=self.bundesland,
                typ="Antrag",
            )

            if query_filter:
                hay = f"{title} {urheber}".lower()
                if not all(t in hay for t in query_filter.lower().split()):
                    continue

            results.append(doc)

        return results

    def _parse_hit_list_cards(self, html: str, query_filter: str) -> list[Drucksache]:
        """Parse Berlin-style ``efxRecordRepeater`` HTML-card records.

        Each card contains an ``<h3>`` title, a metadata ``<span class="h6">``
        with the document type, the Drucksachen-Nummer, and the date,
        plus a direct ``<a href="…pdf">`` link to the PDF on the same host.
        """
        results: list[Drucksache] = []

        # Split the HTML on every record-div opener — easier than balancing
        # divs with regex.
        chunks = html.split('class="record')
        # First chunk is the prelude, skip it
        for chunk in chunks[1:]:
            # Each chunk now starts at the record class attribute
            m_t = self._RE_BE_TITLE.search(chunk)
            title = m_t.group(1).strip() if m_t else "Ohne Titel"

            m_ds = self._RE_BE_DRUCKSACHE.search(chunk)
            if not m_ds:
                continue
            drucksache = m_ds.group(1)

            m_pdf = self._RE_BE_LINK.search(chunk)
            pdf_url = ""
            if m_pdf:
                href = m_pdf.group(1)
                if href.startswith("http://") or href.startswith("https://"):
                    pdf_url = href
                elif href.startswith("/"):
                    pdf_url = f"{self.base_url}{href}"
                else:
                    pdf_url = f"{self.base_url}{self.pdf_url_prefix}{href}"

            m_dat = self._RE_BE_DATUM.search(chunk)
            datum_iso = self._datum_de_to_iso(m_dat.group(1) if m_dat else "")

            m_doc = self._RE_BE_DOCTYPE.search(chunk)
            doctype_full = m_doc.group(1).strip() if m_doc else "Drucksache"

            # Berlin often packs the originator(s) into the same h6 line:
            #   "Antrag CDU, SPD" → fraktionen = ["CDU","SPD"], typ = "Antrag"
            #   Senat-Vorlagen carry no fraction, only "Vorlage zur …".
            fraktionen = self._normalize_fraktion(doctype_full)
            # Strip the fraction names back out of the typ string so the UI
            # shows a clean "Antrag" / "Vorlage …" label.
            typ = doctype_full
            if fraktionen:
                # Cut at the first occurrence of any party name
                cuts = [typ.upper().find(f.upper()) for f in fraktionen]
                cuts = [c for c in cuts if c >= 0]
                if cuts:
                    typ = typ[: min(cuts)].rstrip(" ,")

            doc = Drucksache(
                drucksache=drucksache,
                title=title,
                fraktionen=fraktionen,
                datum=datum_iso,
                link=pdf_url,
                bundesland=self.bundesland,
                typ=typ,
            )

            if query_filter:
                hay = f"{title} {doctype_full}".lower()
                if not all(t in hay for t in query_filter.lower().split()):
                    continue

            results.append(doc)

        return results

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search recent documents of the current Wahlperiode.

        ``query`` is applied as a client-side title/Urheber filter; the
        server-side query covers the configured ``date_window_days``
        (default 24 months).
        """
        from datetime import date, timedelta

        end = date.today()
        start = end - timedelta(days=self.date_window_days)
        body = self._build_search_body(
            wahlperiode=self.wahlperiode,
            start_date=start.isoformat(),
            end_date=end.isoformat(),
        )

        browse_html = f"{self.base_url}{self.portala_path}/browse.tt.html"
        browse_json = f"{self.base_url}{self.portala_path}/browse.tt.json"
        report_html = f"{self.base_url}{self.portala_path}/report.tt.html"

        async with httpx.AsyncClient(
            # Bumped from 30s as a #13 quick-win: chunksize=500 requests against
            # the LSA report.tt.html endpoint occasionally take 30+ seconds.
            timeout=60,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                # Step 1: warm up cookies via the browse page
                await client.get(browse_html)

                # Step 2: submit the search action
                resp = await client.post(
                    browse_json,
                    json=body,
                    headers={"Referer": browse_html},
                )
                if resp.status_code != 200:
                    logger.error("%s search HTTP %s", self.bundesland, resp.status_code)
                    return []

                data = resp.json()
                report_id = data.get("report_id")
                if not report_id:
                    logger.error("%s: no report_id in response: %s", self.bundesland, data)
                    return []

                # Step 3: fetch the HTML hit list
                # Take a generous chunk so the client-side title filter
                # still has enough material to work with. Quick-win for #13
                # until the eUI sf-Index for real server-side fulltext is
                # reverse-engineered: bump the unfiltered chunk floor and
                # the query-filtered chunk ceiling.
                chunksize = max(limit * 10, 500) if query else max(limit * 2, 100)
                report_resp = await client.post(
                    report_html,
                    json={"report_id": report_id, "start": 0, "chunksize": chunksize},
                    headers={"Referer": browse_html},
                )
                if report_resp.status_code != 200:
                    logger.error("%s report HTTP %s", self.bundesland, report_resp.status_code)
                    return []

                results = self._parse_hit_list_html(report_resp.text, query_filter=query)
                return results[:limit]

            except Exception:
                logger.exception("%s search error", self.bundesland)
                return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
"""Look up a single document by ID via the search endpoint with a
|
||
document_number filter."""
        # Pragmatic MVP: do a broad search and filter for the requested ID.
        # A targeted single-document fetch would require a different
        # action.search.json structure that we have not reverse-engineered yet.
        results = await self.search(query="", limit=200)
        for doc in results:
            if doc.drucksache == drucksache:
                return doc
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its text."""
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None

        async with httpx.AsyncClient(
            timeout=60,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    return None
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                text = ""
                for page in pdf:
                    text += page.get_text()
                pdf.close()
                return text
            except Exception:
                logger.exception("%s download error for %s", self.bundesland, drucksache)
                return None


class ParLDokAdapter(ParlamentAdapter):
    """Adapter for ParlDok 8.x parliament documentation systems (J3S GmbH).

    ParlDok is a proprietary parliament documentation product by J3S GmbH
    (https://www.j3s.de). Unlike the portala/eUI framework used by LSA/BE,
    ParlDok 8.x is a single-page app whose backend is a JSON API rooted at
    ``{base_url}{prefix}/Fulltext/...``. The legacy ParlDok 5.x HTML POST
    form (``parldok/formalkriterien``) used by dokukratie's MV YAML scraper
    has been deprecated by the LandtagMV upgrade to 8.3.5.

    Confirmed instances using this engine (April 2026):

    - **MV** (Mecklenburg-Vorpommern) — ``dokumentation.landtag-mv.de/parldok``
    - HH, SN, TH all advertise ParlDok in dokukratie but their actual
      versions/themes have not been verified yet.

    Search workflow:

    1. ``GET {base_url}{prefix}/`` to obtain the session cookie. The
       backend rejects POSTs without it.
    2. ``POST {base_url}{prefix}/Fulltext/Search`` with form-encoded
       ``data=<json>`` payload. The JSON carries a ``tags`` array of
       facet selections; each tag is ``{"type": <facet_type_int>,
       "id": <facet_value>}``. Reverse-engineered facet type constants
       from the bundle.js (``pd.facet_*``):

       - ``facet_fraction = 2``
       - ``facet_kind = 7`` (Drucksache, Plenarprotokoll, …)
       - ``facet_type = 8`` (Antrag, Gesetzentwurf, Kleine Anfrage, …)
       - ``facet_lp = 10`` (Wahlperiode)

       Response is JSON ``{success, data: <stringified JSON>}`` where the
       inner ``data`` carries ``{count, docs: [{id, title, date,
       authorhtml, kind, type, lp, number, link, ...}], ...}``.

    3. PDF download: ``GET {base_url}{prefix}/dokument/{numeric_id}``.
       Returns ``application/pdf`` directly. The ``link`` field returned
       by the search API already contains the path fragment
       ``/dokument/<id>#navpanes=0`` — strip the fragment and prepend
       the configured ``prefix``.

    Drucksachen-Nummer is reconstructed as ``f"{lp}/{number}"`` from the
    search hit. When ``query`` is non-empty it is sent to the server as a
    ``facet_fulltext`` (type 0) tag (see ``_build_search_body`` and
    ``search``); type/kind filtering still happens client-side.
    """

    # Reverse-engineered facet type constants from bundle.js (pd.facet_*).
    FACET_FULLTEXT = 0
    FACET_FRACTION = 2
    FACET_KIND = 7
    FACET_TYPE = 8
    FACET_LP = 10

    def __init__(
        self,
        *,
        bundesland: str,
        name: str,
        base_url: str,
        wahlperiode: int,
        prefix: str = "/parldok",
        document_typ: str = "Antrag",
    ) -> None:
        """Configure a ParlDok 8.x adapter for one specific parliament.

        Args:
            bundesland: state code, e.g. ``"MV"``.
            name: human-readable label.
            base_url: ``https://...`` host root, no trailing slash.
            wahlperiode: current legislative period — fed into the
                ``facet_lp`` tag of the search payload.
            prefix: app prefix where ParlDok lives. ``/parldok`` for MV.
            document_typ: client-side filter on the ``type`` field of
                each hit ("Antrag", "Gesetzentwurf", …). Set to empty
                string to disable type filtering.
        """
        self.bundesland = bundesland
        self.name = name
        self.base_url = base_url.rstrip("/")
        self.prefix = "/" + prefix.strip("/")
        self.wahlperiode = wahlperiode
        self.document_typ = document_typ

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        """DD.MM.YYYY → YYYY-MM-DD; '' for empty input."""
        if not datum_de:
            return ""
        try:
            d, m, y = datum_de.split(".")
            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
        except ValueError:
            return ""

    @staticmethod
    def _normalize_fraktion(authorhtml: str) -> list[str]:
        """Map ParlDok ``authorhtml`` to canonical fraction codes.

        ``authorhtml`` may be a comma-separated list of fractions
        ("CDU, SPD, F.D.P."), a single MdL with party in parens
        ("Thomas de Jesus Fernandes (AfD)") or empty (Landesregierung).
        """
        if not authorhtml:
            return []
        u = authorhtml.upper()
        out: list[str] = []
        if re.search(r"\bBÜNDNIS\s*90\b", u) or re.search(r"\bGR(?:Ü|UE)NE\b", u):
            out.append("GRÜNE")
        if re.search(r"\bCDU\b", u):
            out.append("CDU")
        if re.search(r"\bSPD\b", u):
            out.append("SPD")
        # F.D.P. (with dots, historical) and FDP both occur in MV
        if re.search(r"\bF\.?\s*D\.?\s*P\.?\b", u):
            out.append("FDP")
        if re.search(r"\bAFD\b", u):
            out.append("AfD")
        if re.search(r"\bLINKE\b", u) or re.search(r"\bLL/PDS\b", u):
            out.append("LINKE")
        if re.search(r"LANDESREGIERUNG|MINISTER\b|STAATSKANZLEI|MINISTERPRÄSIDENT", u):
            out.append("Landesregierung")
        return out

    @staticmethod
    def _fulltext_id(term: str) -> str:
        """Sanitize a search term to ParlDok's facet ID format.

        Mirrors ``pd.getFulltextId`` from ``bundle.js``: replace every
        non-alphanumeric character with ``-``. The server uses this to
        deduplicate identical search facets.
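
        Example (illustrative): ``"Grüner Stahl"`` becomes ``"Gr-ner-Stahl"``
        (umlauts are non-ASCII and are therefore replaced as well).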
        """
        return re.sub(r"[^a-zA-Z0-9]", "-", term)

    def _build_search_body(self, *, length: int = 100, query: str = "") -> dict:
        """Build the JSON payload for the initial ``Fulltext/Search`` call.

        Filters by Wahlperiode + optional server-side full-text search.
        Type/kind filtering still happens client-side because the
        facet_type/facet_kind value IDs are instance-specific and would
        require an extra ``Fulltext/Filter`` round trip to discover.

        Pagination beyond the first page goes through
        ``Fulltext/Resultpage`` — the ``Search`` endpoint itself
        ignores any non-zero ``Start``.

        The full-text tag schema is reverse-engineered from
        ``pd.addInput`` in ``bundle.js`` and matches the SPA payload
        verbatim::

            {"type": 0, "id": "<sanitized>", "fulltext": "<raw>",
             "label": "<raw>", "field": "Alle"}

        ``field="Alle"`` means "search all indexed fields"
        (``pd.currentFTSearchMode`` default). The server tokenizes
        the term and applies AND-semantics across whitespace.
        """
        tags: list[dict] = [{"type": self.FACET_LP, "id": self.wahlperiode}]
        if query:
            tags.append({
                "type": self.FACET_FULLTEXT,
                "id": self._fulltext_id(query),
                "fulltext": query,
                "label": query,
                "field": "Alle",
            })
        return {
            "devicekey": "",
            "max": length,
            "withfilter": False,
            # sort=2 → newest first (date desc); sort=1 is relevance.
            "sort": 2,
            "topk": length,
            "llm": 0,
            "newdocsearch": False,
            "limit": {"Start": 0, "Length": length},
            "tags": tags,
            "updateFilters": [],
        }

    def _hit_to_drucksache(self, hit: dict) -> Optional[Drucksache]:
        """Convert one ParlDok JSON hit to a Drucksache. None if unusable."""
        lp = hit.get("lp")
        number = hit.get("number")
        if not lp or not number:
            return None

        link_field = hit.get("link") or hit.get("prelink") or ""
        # Strip "#navpanes=0" fragment and prepend the prefix.
        path = link_field.split("#", 1)[0]
        pdf_url = f"{self.base_url}{self.prefix}{path}" if path else ""

        return Drucksache(
            drucksache=f"{lp}/{number}",
            title=hit.get("title", ""),
            fraktionen=self._normalize_fraktion(hit.get("authorhtml", "")),
            datum=self._datum_de_to_iso(hit.get("date", "")),
            link=pdf_url,
            bundesland=self.bundesland,
            typ=hit.get("type", "") or hit.get("kind", ""),
        )

    async def _post_json(
        self, client: httpx.AsyncClient, endpoint: str, payload: dict,
    ) -> Optional[dict]:
        """POST a JSON-stringified payload to a ParlDok endpoint.

        ``endpoint`` is the path tail (e.g. ``"Fulltext/Search"`` or
        ``"Fulltext/Resultpage"``). Returns the inner JSON object
        (already parsed from the stringified ``data`` field), or None
        on error.
        """
        homepage = f"{self.base_url}{self.prefix}/"
        url = f"{self.base_url}{self.prefix}/{endpoint}"
        try:
            resp = await client.post(
                url,
                data={"data": json.dumps(payload, ensure_ascii=False)},
                headers={
                    "X-Requested-With": "XMLHttpRequest",
                    "Referer": homepage,
                },
            )
            if resp.status_code != 200:
                logger.error(
                    "%s %s HTTP %s",
                    self.bundesland, endpoint, resp.status_code,
                )
                return None
            outer = resp.json()
            if not outer.get("success"):
                logger.error(
                    "%s %s not successful: %s",
                    self.bundesland, endpoint, outer.get("message"),
                )
                return None
            return json.loads(outer["data"])
        except Exception:
            logger.exception("%s ParlDok %s error", self.bundesland, endpoint)
            return None

    async def _initial_search(
        self, client: httpx.AsyncClient, *, length: int, query: str = "",
    ) -> tuple[Optional[int], list[dict]]:
        """Run the initial ``Fulltext/Search`` and return ``(queryid, docs)``.

        The ``queryid`` is needed for subsequent ``Fulltext/Resultpage``
        calls. ParlDok ignores any non-zero ``Start`` on this endpoint —
        the first 100 hits are the only ones reachable via ``Search``.

        ``query`` is sent server-side as a ``facet_fulltext`` tag — see
        ``_build_search_body``.
        """
        body = self._build_search_body(length=length, query=query)
        inner = await self._post_json(client, "Fulltext/Search", body)
        if not inner:
            return None, []
        return inner.get("queryid"), (inner.get("docs") or [])

    async def _result_page(
        self, client: httpx.AsyncClient, *, queryid: int, start: int, length: int,
    ) -> list[dict]:
        """Fetch a further result page via ``Fulltext/Resultpage``."""
        payload = {
            "devicekey": "",
            "queryid": queryid,
            "limit": {"Start": start, "Length": length},
        }
        inner = await self._post_json(client, "Fulltext/Resultpage", payload)
        if not inner:
            return []
        return inner.get("docs") or []

    def _make_client(self) -> httpx.AsyncClient:
        return httpx.AsyncClient(
            timeout=30,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        )

    async def _paginated_hits(
        self, client: httpx.AsyncClient, *, query: str = "",
    ):
        """Async iterator over Drucksachen-style hits across all pages.

        Yields raw hit dicts in newest-first order. The first batch comes
        from ``Fulltext/Search``, subsequent batches from
        ``Fulltext/Resultpage`` using the queryid the server returned for
        the initial call. Stops when a page comes back empty, undersized,
        or after ``MAX_PAGES`` iterations.

        ``query`` is forwarded as a server-side full-text filter to
        ``_initial_search``; the resulting ``queryid`` is bound to that
        filter, so subsequent ``Resultpage`` calls automatically inherit
        it without needing to repeat the tag.
        """
        queryid, hits = await self._initial_search(
            client, length=self.PAGE_SIZE, query=query,
        )
        for hit in hits:
            yield hit
        if not queryid or len(hits) < self.PAGE_SIZE:
            return

        for page in range(1, self.MAX_PAGES):
            page_hits = await self._result_page(
                client,
                queryid=queryid,
                start=page * self.PAGE_SIZE,
                length=self.PAGE_SIZE,
            )
            if not page_hits:
                return
            for hit in page_hits:
                yield hit
            if len(page_hits) < self.PAGE_SIZE:
                return

    # ParlDok 8.x caps Length per request at 100 — paginate if needed.
    PAGE_SIZE = 100
    # Safety bound: scan at most 10 pages × 100 = 1000 most recent docs.
    # Anträge are ~3% of all hits in MV, so 1000 raw → ~30 Anträge, more
    # than enough for the typical UI request (limit 5..20). Filtered
    # queries that find nothing in the last 1000 docs return empty
    # rather than scan the entire WP.
    MAX_PAGES = 10

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search recent documents of the configured Wahlperiode.

        Server-side full-text search via the ``facet_fulltext`` tag (#12)
        when ``query`` is non-empty; otherwise pure browse mode. The
        server returns the WP sorted newest-first across all document
        kinds, the client keeps only ``Antrag``-typed Drucksachen and
        dedupes by lp/number (ParlDok returns the same Drucksache
        multiple times when it appears in several Vorgänge/Beratungen).

        Pagination: ParlDok caps each response at 100 rows; further
        pages come from ``Fulltext/Resultpage`` bound to the
        server-assigned ``queryid``.
        """
        results: list[Drucksache] = []
        seen: set[str] = set()

        async with self._make_client() as client:
            await client.get(f"{self.base_url}{self.prefix}/")
            async for hit in self._paginated_hits(client, query=query):
                if hit.get("kind") != "Drucksache":
                    continue
                if self.document_typ and hit.get("type") != self.document_typ:
                    continue

                doc = self._hit_to_drucksache(hit)
                if not doc:
                    continue
                if doc.drucksache in seen:
                    continue
                seen.add(doc.drucksache)

                results.append(doc)
                if len(results) >= limit:
                    return results

        return results

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up a single Antrag by ``lp/number`` ID.

        Pragmatic MVP: page through the WP unfiltered until we find a
        match. ParlDok offers a ``facet_number`` (14) facet that would
        let us target the lookup directly, but the facet ID values are
        instance-specific (would require a ``Fulltext/Filter`` discovery
        call) and the WP-wide pagination is fast enough for the typical
        2k–10k Drucksachen per period.
        """
        wanted_lp, wanted_num = (drucksache.split("/", 1) + [""])[:2]
        if not wanted_num:
            return None

        async with self._make_client() as client:
            await client.get(f"{self.base_url}{self.prefix}/")
            async for hit in self._paginated_hits(client):
                if hit.get("kind") != "Drucksache":
                    continue
                if str(hit.get("lp")) == wanted_lp and str(hit.get("number")) == wanted_num:
                    return self._hit_to_drucksache(hit)
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its text."""
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None

        async with httpx.AsyncClient(
            timeout=60,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    logger.error(
                        "%s PDF HTTP %s for %s (%s)",
                        self.bundesland, resp.status_code, drucksache, doc.link,
                    )
                    return None
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                text = ""
                for page in pdf:
                    text += page.get_text()
                pdf.close()
                return text
            except Exception:
                logger.exception("%s ParlDok download error for %s", self.bundesland, drucksache)
                return None


class BayernAdapter(ParlamentAdapter):
    """Adapter for Bayerischer Landtag."""

    bundesland = "BY"
    name = "Bayerischer Landtag"
    base_url = "https://www.bayern.landtag.de"

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        # TODO: Implement Bayern search
        return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        # TODO: Implement
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        return None


class BWAdapter(ParlamentAdapter):
    """Adapter for Baden-Württemberg Landtag."""

    bundesland = "BW"
    name = "Landtag Baden-Württemberg"
    base_url = "https://www.landtag-bw.de"

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        # TODO: Implement BW search
        return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        return None


# Registry of adapters
ADAPTERS = {
    "NRW": NRWAdapter(),
    "LSA": PortalaAdapter(
        bundesland="LSA",
        name="Landtag von Sachsen-Anhalt (PADOKA)",
        base_url="https://padoka.landtag.sachsen-anhalt.de",
        db_id="lsa.lissh",
        wahlperiode=8,
        portala_path="/portal",
        document_type="Antrag",
        pdf_url_prefix="/files/",
    ),
    "BE": PortalaAdapter(
        bundesland="BE",
        name="Abgeordnetenhaus von Berlin (PARDOK)",
        base_url="https://pardok.parlament-berlin.de",
        db_id="lah.lissh",
        wahlperiode=19,
        portala_path="/portala",
        # Berlin's ETYPF index uses different value strings — drop the
        # document_type subtree, fall back to client-side title filter.
        document_type=None,
        # Quick-win for #13: pulled the date window from the original
        # 180-day MVP up to 730 days so client-side title-filter searches
        # ("Schule" etc.) reach back across more of the WP19 corpus until
        # the eUI fulltext-sf is reverse-engineered. The chunksize bump
        # in PortalaAdapter.search() means the per-request payload stays
        # bounded.
        date_window_days=730,
        pdf_url_prefix="/files/",
    ),
    "MV": ParLDokAdapter(
        bundesland="MV",
        name="Landtag Mecklenburg-Vorpommern (ParlDok)",
        base_url="https://www.dokumentation.landtag-mv.de",
        wahlperiode=8,
        prefix="/parldok",
        document_typ="Antrag",
    ),
    "BY": BayernAdapter(),
    "BW": BWAdapter(),
}


def get_adapter(bundesland: str) -> Optional[ParlamentAdapter]:
    """Get adapter for a bundesland."""
    return ADAPTERS.get(bundesland)


async def search_all(query: str, bundesland: str = "NRW", limit: int = 20) -> list[Drucksache]:
    """Search parliament documents in a specific state."""
    adapter = get_adapter(bundesland)
    if not adapter:
        return []
    return await adapter.search(query, limit)
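

# Example usage (sketch of the intended call pattern; the surrounding project
# wiring is assumed, only the names defined above are used):
#
#     import asyncio
#
#     docs = asyncio.run(search_all("Schule", bundesland="BE", limit=5))
#     for doc in docs:
#         print(doc.drucksache, doc.datum, doc.typ, doc.title)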