"""Parliament search adapters for different German states."""
|
|
|
|
import httpx
|
|
import re
|
|
from abc import ABC, abstractmethod
|
|
from dataclasses import dataclass
|
|
from typing import Optional
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
@dataclass
|
|
class Drucksache:
|
|
"""A parliamentary document."""
|
|
drucksache: str # e.g. "18/8125"
|
|
title: str
|
|
fraktionen: list[str]
|
|
datum: str # ISO date
|
|
link: str # PDF URL
|
|
bundesland: str
|
|
typ: str = "Antrag" # Antrag, Anfrage, Beschlussempfehlung, etc.
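
# A populated record, as e.g. the NRW adapter emits it (illustrative
# values, not a real search hit; the link follows the MMD pattern used
# by NRWAdapter.get_document below):
#
#   Drucksache(
#       drucksache="18/8125",
#       title="Beispiel-Antrag",
#       fraktionen=["GRÜNE", "SPD"],
#       datum="2024-03-14",
#       link="https://www.landtag.nrw.de/portal/WWW/dokumentenarchiv/Dokument/MMD18-8125.pdf",
#       bundesland="NRW",
#       typ="Antrag",
#   )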


class ParlamentAdapter(ABC):
    """Base adapter for searching parliament documents."""

    bundesland: str
    name: str

    @abstractmethod
    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search for documents matching query."""

    @abstractmethod
    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Get a specific document by ID."""

    @abstractmethod
    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download and extract text from a document."""


class NRWAdapter(ParlamentAdapter):
    """Adapter for NRW Landtag (opal.landtag.nrw.de)."""

    bundesland = "NRW"
    name = "Landtag Nordrhein-Westfalen"
    base_url = "https://opal.landtag.nrw.de"
    search_url = "https://opal.landtag.nrw.de/home/dokumente/dokumentensuche/parlamentsdokumente/aktuelle-dokumente.html"

    def _parse_query(self, query: str) -> tuple[str, list[str], bool]:
        """
        Parse search query for AND logic and exact phrases.

        Returns: (search_term_for_api, filter_terms, is_exact)

        Examples:
        - 'Klimaschutz Energie' -> ('Klimaschutz', ['klimaschutz', 'energie'], False)
        - '"Grüner Stahl"' -> ('Grüner Stahl', ['grüner stahl'], True)
        - 'Klimaschutz "erneuerbare Energie"' -> ('Klimaschutz', ['klimaschutz', 'erneuerbare energie'], False)
        """
        query = query.strip()

        # Check for exact phrase (entire query in quotes)
        if query.startswith('"') and query.endswith('"') and query.count('"') == 2:
            exact = query[1:-1].strip()
            return (exact, [exact.lower()], True)

        # Extract quoted phrases and regular terms
        import shlex
        try:
            parts = shlex.split(query)
        except ValueError:
            # Fallback for unbalanced quotes
            parts = query.split()

        if not parts:
            return (query, [query.lower()], False)

        # Use first term for API search, all terms for filtering
        filter_terms = [p.lower() for p in parts]
        return (parts[0], filter_terms, False)

    def _matches_all_terms(self, doc: Drucksache, terms: list[str], is_exact: bool) -> bool:
        """Check if document matches all search terms (AND logic)."""
        searchable = f"{doc.title} {doc.drucksache} {' '.join(doc.fraktionen)} {doc.typ}".lower()

        if is_exact:
            # Exact phrase must appear
            return terms[0] in searchable
        else:
            # All terms must appear (AND)
            return all(term in searchable for term in terms)

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search NRW Landtag documents via OPAL portal."""
        results = []

        # Parse query for AND logic
        api_query, filter_terms, is_exact = self._parse_query(query)

        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            try:
                # First, get the page to establish a session
                initial = await client.get(self.search_url)
                if initial.status_code != 200:
                    print(f"NRW search initial request failed: {initial.status_code}")
                    return []

                # Parse for webflow token from pagination links
                soup = BeautifulSoup(initial.text, 'html.parser')

                # Find a pagination link to extract the webflow token
                pagination_link = soup.select_one('a[href*="webflowexecution"]')
                webflow_token = ""
                webflow_execution = ""

                if pagination_link:
                    href = pagination_link.get('href', '')
                    # Extract webflowToken and webflowexecution from the URL
                    token_match = re.search(r'webflowToken=([^&]*)', href)
                    exec_match = re.search(r'(webflowexecution[^=]+)=([^&]+)', href)
                    if token_match:
                        webflow_token = token_match.group(1)
                    if exec_match:
                        webflow_execution = f"{exec_match.group(1)}={exec_match.group(2)}"

                # Now perform the search with POST.
                # Find the form action URL with the webflow token.
                form = soup.select_one('form#docSearchByItem')
                form_action = self.search_url
                if form and form.get('action'):
                    action = form.get('action')
                    if action.startswith('/'):
                        form_action = f"{self.base_url}{action}"
                    elif action.startswith('http'):
                        form_action = action
                    else:
                        form_action = f"{self.search_url}?{action}"

                # Build form data for "Einfache Suche" (searchByItem form)
                form_data = {
                    '_eventId_sendform': '1',
                    'dokNum': api_query,  # This is the text search field
                    'formId': 'searchByItem',
                    'dokTyp': '',  # All types
                    'wp': '18',  # Wahlperiode 18
                }

                # POST request with form data to the form action URL
                search_resp = await client.post(
                    form_action,
                    data=form_data,
                    cookies=initial.cookies,
                    headers={'Content-Type': 'application/x-www-form-urlencoded'}
                )

                if search_resp.status_code != 200:
                    print(f"NRW search request failed: {search_resp.status_code}")
                    return []

                # Parse results
                soup = BeautifulSoup(search_resp.text, 'html.parser')

                # Find all document result items (li elements containing articles)
                items = soup.select('li:has(article)')

                for item in items[:limit]:
                    try:
                        # Extract drucksache number from first link
                        num_link = item.select_one('a[href*="MMD"]')
                        if not num_link:
                            continue

                        href = num_link.get('href', '')
                        # Extract number: MMD18-12345.pdf -> 18/12345
                        match = re.search(r'MMD(\d+)-(\d+)\.pdf', href)
                        if not match:
                            continue

                        legislatur, nummer = match.groups()
                        drucksache = f"{legislatur}/{nummer}"
                        pdf_url = f"https://www.landtag.nrw.de{href}" if href.startswith('/') else href

                        # Extract title from the title link (class e-document-result-item__title)
                        title_elem = item.select_one('a.e-document-result-item__title')
                        if title_elem:
                            # Get text content, clean it up
                            title = title_elem.get_text(strip=True)
                            # Remove SVG icon text and clean
                            title = re.sub(r'\s*<svg.*', '', title)
                            title = re.sub(r'\s+', ' ', title).strip()
                        else:
                            # Fallback: generic placeholder title
                            title = f"Drucksache {drucksache}"

                        # Clean up common artifacts
                        title = re.sub(r'\s*\(\s*externer Link.*?\)', '', title).strip()

                        # Extract type (Antrag, Kleine Anfrage, etc.)
                        typ_elem = item.select_one('.e-document-result-item__category')
                        typ = typ_elem.get_text(strip=True) if typ_elem else "Drucksache"

                        # Extract date
                        time_elem = item.select_one('time')
                        datum = ""
                        if time_elem:
                            datum_text = time_elem.get_text(strip=True)
                            # Convert DD.MM.YYYY to YYYY-MM-DD
                            date_match = re.match(r'(\d{2})\.(\d{2})\.(\d{4})', datum_text)
                            if date_match:
                                d, m, y = date_match.groups()
                                datum = f"{y}-{m}-{d}"

                        # Extract Urheber (Fraktionen) - look for a paragraph containing "Urheber:"
                        urheber_text = ""
                        for p in item.select('p'):
                            if 'Urheber:' in p.get_text():
                                urheber_text = p.get_text()
                                break

                        fraktionen = []
                        if urheber_text:
                            # Extract party names, keeping the canonical
                            # spelling (e.g. "AfD" must not become "AFD")
                            for needle, canonical in [
                                ('SPD', 'SPD'),
                                ('CDU', 'CDU'),
                                ('GRÜNE', 'GRÜNE'),
                                ('Grüne', 'GRÜNE'),
                                ('FDP', 'FDP'),
                                ('AfD', 'AfD'),
                            ]:
                                if needle in urheber_text and canonical not in fraktionen:
                                    fraktionen.append(canonical)

                        doc = Drucksache(
                            drucksache=drucksache,
                            title=title,
                            fraktionen=fraktionen,
                            datum=datum,
                            link=pdf_url,
                            bundesland="NRW",
                            typ=typ,
                        )

                        # Apply AND filter (all terms must match)
                        if self._matches_all_terms(doc, filter_terms, is_exact):
                            results.append(doc)

                    except Exception as e:
                        print(f"Error parsing item: {e}")
                        continue

            except Exception as e:
                print(f"NRW search error: {e}")

        return results

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Get document metadata by drucksache ID (e.g. '18/8125')."""
        # Parse legislatur and number
        match = re.match(r"(\d+)/(\d+)", drucksache)
        if not match:
            return None

        legislatur, nummer = match.groups()
        pdf_url = f"https://www.landtag.nrw.de/portal/WWW/dokumentenarchiv/Dokument/MMD{legislatur}-{nummer}.pdf"

        # Try to fetch and extract basic info
        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            try:
                resp = await client.head(pdf_url)
                if resp.status_code == 200:
                    return Drucksache(
                        drucksache=drucksache,
                        title=f"Drucksache {drucksache}",
                        fraktionen=[],
                        datum="",
                        link=pdf_url,
                        bundesland="NRW",
                    )
            except Exception:
                # A failed HEAD just means "not found" for our purposes
                pass

        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download PDF and extract text."""
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc:
            return None

        async with httpx.AsyncClient(timeout=60, follow_redirects=True) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    return None

                # Extract text with PyMuPDF
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                text = ""
                for page in pdf:
                    text += page.get_text()
                pdf.close()

                return text
            except Exception as e:
                print(f"Error downloading {drucksache}: {e}")
                return None


class PortalaAdapter(ParlamentAdapter):
    """Adapter for portala/eUI-based parliament documentation systems.

    Used by parliaments running the proprietary "esearch" / portala framework
    (originally developed for STAR/StarFinder backends, now wrapped in a
    Single-Page App with Template Toolkit on the server side):

    - **LSA** (Sachsen-Anhalt) — PADOKA at ``padoka.landtag.sachsen-anhalt.de``
      under ``/portal/`` (singular)
    - **BE** (Berlin) — PARDOK at ``pardok.parlament-berlin.de`` under
      ``/portala/`` (with the trailing 'a')

    Both instances share the same JSON action schema; only the base URL,
    the data source ID, the application path prefix and a few minor
    quirks differ — those are constructor parameters so that the same
    class can serve both states (and any future portala-based parliament).

    The search workflow is two-stage:

    1. ``POST {base}{path}/browse.tt.json`` with a complex JSON ``action``
       body that contains an Elasticsearch-style query tree under
       ``search.json``. The server returns a ``report_id`` plus hit count.
    2. ``POST {base}{path}/report.tt.html`` with ``{report_id, start,
       chunksize}`` to fetch the HTML hit list. Each hit carries a Perl
       Data::Dumper block in a ``<pre>`` tag with the canonical metadata.

    The query body schema was reverse-engineered from
    https://github.com/okfde/dokukratie/blob/main/dokukratie/scrapers/portala.query.json
    (GPL-3.0 — only structure/selectors are reused, not Python code).

    Full-text search is **not** implemented in the MVP: the adapter
    returns documents of the current Wahlperiode in the given date
    window, and the search query is applied as a client-side
    title/Urheber filter. The server-side full-text path requires
    state-specific ``sf`` index names that are not yet known.
    """

    def __init__(
        self,
        *,
        bundesland: str,
        name: str,
        base_url: str,
        db_id: str,
        wahlperiode: int,
        portala_path: str = "/portal",
        document_type: Optional[str] = "Antrag",
        pdf_url_prefix: str = "/files/",
        date_window_days: int = 730,
    ) -> None:
        """Configure a portala/eUI adapter for one specific parliament.

        Args:
            bundesland: state code (e.g. ``"LSA"``, ``"BE"``).
            name: human-readable adapter label (used in logs/UI).
            base_url: ``https://...`` of the portal host without trailing slash.
            db_id: data source identifier the eUI server expects in
                ``action.sources``, e.g. ``"lsa.lissh"`` or ``"lah.lissh"``.
            wahlperiode: current legislative period — fed into the WP
                term of the search tree.
            portala_path: path prefix where the portala app lives. ``/portal``
                for LSA, ``/portala`` for Berlin.
            document_type: optional filter applied via ETYPF/DTYPF/DART
                terms. ``"Antrag"`` works for LSA; for instances where
                the index uses different document_type values (e.g. Berlin),
                pass ``None`` to drop the document_type subtree entirely
                — the user can still filter client-side by title.
            pdf_url_prefix: URL fragment between ``base_url`` and the
                relative PDF path returned by the server.
            date_window_days: how many days back ``search()`` looks by
                default.
        """
        self.bundesland = bundesland
        self.name = name
        self.base_url = base_url.rstrip("/")
        self.db_id = db_id
        self.wahlperiode = wahlperiode
        self.portala_path = "/" + portala_path.strip("/")
        self.document_type = document_type
        self.pdf_url_prefix = "/" + pdf_url_prefix.strip("/") + "/"
        self.date_window_days = date_window_days

    # ── LSA-style hit list (Perl Data::Dumper inside <pre> blocks) ──
    # Reverse-engineered "WEV*" record fields:
    #   WEV06.main = title
    #   WEV32.5    = relative PDF path
    #   WEV32.main = "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b> ..."
    _RE_TITLE = re.compile(r"'WEV06'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
    _RE_PDF = re.compile(r"'5'\s*=>\s*'([^']*\.pdf)'")
    _RE_DRUCKSACHE = re.compile(r"Drucksache\s*<b>(\d+/\d+)</b>")
    _RE_URHEBER_DATUM = re.compile(
        r"'WEV32'\s*=>\s*\[\s*\{[^}]*'main'\s*=>\s*[\"']Antrag\s+(.+?)\s+(\d{1,2}\.\d{1,2}\.\d{4})\s+Drucksache",
    )
    _RE_PRE_BLOCK = re.compile(r'<pre>\$VAR1 = (.*?)</pre>', re.DOTALL)

    # ── Berlin-style hit list (production HTML cards, no Perl dump) ──
    # The whole div for one record (kept for reference; the parser below
    # splits on the class attribute instead of balancing divs):
    _RE_BE_RECORD = re.compile(
        r'<div[^>]*class="[^"]*efxRecordRepeater[^"]*"[^>]*data-efx-rec="[^"]*"[^>]*>(.*?)(?=<div[^>]*efxRecordRepeater|<div[^>]*id="efxResultsEnd"|</main>|$)',
        re.DOTALL,
    )
    _RE_BE_TITLE = re.compile(r'<h3[^>]*class="h5[^"]*"[^>]*>\s*<span>([^<]+)</span>')
    _RE_BE_LINK = re.compile(r'<a[^>]*href="([^"]+\.pdf)"[^>]*>')
    # The metadata h6 looks like:
    #   <span class="h6">Antrag (Eilantrag) <a ...>Drucksache 19/3104</a> S. 1 bis 24 vom 31.03.2026</span>
    _RE_BE_DRUCKSACHE = re.compile(r'Drucksache\s+(\d+/\d+)')
    _RE_BE_DATUM = re.compile(r'vom\s+(\d{1,2}\.\d{1,2}\.\d{4})')
    _RE_BE_DOCTYPE = re.compile(r'<span class="h6">\s*([^<&]+?)(?: |<)')

    @staticmethod
    def _decode_perl_hex(s: str) -> str:
        """Decode \\x{abcd} escape sequences from Perl Data::Dumper output."""
        return re.sub(r'\\x\{([0-9a-f]+)\}', lambda m: chr(int(m.group(1), 16)), s)
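
    # For example, the dump fragment "Entb\x{fc}rokratisierung" decodes to
    # "Entbürokratisierung" (0xfc is 'ü'); plain ASCII passes through
    # unchanged.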

    @staticmethod
    def _normalize_fraktion(urheber: str) -> list[str]:
        """Map an Urheber string to canonical fraction codes.

        Uses regex word boundaries instead of plain substring matching so
        that comma-separated lists ("CDU, SPD") and the embedded "DIE
        LINKE" are matched reliably.
        """
        u = urheber.upper()
        out: list[str] = []

        def has(pattern: str) -> bool:
            return re.search(pattern, u) is not None

        if has(r"\bBÜNDNIS\s*90\b") or has(r"\bGR(?:Ü|UE)NE\b"):
            out.append("GRÜNE")
        if has(r"\bCDU\b"):
            out.append("CDU")
        if has(r"\bSPD\b"):
            out.append("SPD")
        if has(r"\bFDP\b"):
            out.append("FDP")
        if has(r"\bAFD\b"):
            out.append("AfD")
        if has(r"\bLINKE\b"):
            out.append("LINKE")
        if has(r"\bBSW\b"):
            out.append("BSW")
        if has(r"LANDESREGIERUNG|SENAT VON BERLIN|REGIERENDE[RN]?\s+BÜRGERMEISTER|MINISTER\b|STAATSKANZLEI"):
            out.append("Landesregierung")
        return out
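
    # Example: _normalize_fraktion("CDU, SPD und BÜNDNIS 90/DIE GRÜNEN")
    # returns ["GRÜNE", "CDU", "SPD"]; the checks run in a fixed order,
    # so GRÜNE always comes first regardless of the input order.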

    def _build_search_body(
        self,
        wahlperiode: int,
        start_date: str,
        end_date: str,
    ) -> dict:
        """Build the action JSON body for browse.tt.json.

        The schema is taken from dokukratie's portala.query.json template
        and only differs in the data source and the variable substitutions.
        When ``self.document_type`` is None, the ETYPF/DTYPF/DART subtree
        is dropped — useful for parliaments whose ETYPF index uses
        different value strings than ``"Antrag"``.
        """
        document_type = self.document_type
        date_range_text = f"{start_date} THRU {end_date}"
        date_term = lambda sf, num: {  # noqa: E731 — local helper
            "tn": "trange", "sf": sf, "op": "eq", "num": num,
            "idx": 119, "l": 3,
            "p1": start_date, "t1": start_date,
            "p2": end_date, "t2": end_date,
            "t": date_range_text,
        }

        # Build the search.lines (form-state mirror) and the json tree
        lines: dict = {
            "2": str(wahlperiode),
            "10": start_date,
            "11": end_date,
            "20.1": "alWEBBI",
            "20.2": "alWEBBI",
            "20.3": "alWEBBI",
            "90.1": "AND",
            "90.2": "AND",
            "90.3": "AND",
        }
        if document_type is not None:
            lines["3"] = document_type
            lines["4"] = "D"

        # Top-level AND tree
        top_terms: list = [
            {"tn": "term", "t": str(wahlperiode), "idx": 6, "l": 3,
             "sf": "WP", "op": "eq", "num": 5},
        ]

        if document_type is not None:
            top_terms.append({"tn": "or", "num": 3, "terms": [
                {"tn": "or", "num": 4, "terms": [
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "ETYPF", "op": "eq", "num": 10},
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "ETYP2F", "op": "eq", "num": 11},
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "DTYPF", "op": "eq", "num": 12},
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "DTYP2F", "op": "eq", "num": 13},
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "1VTYPF", "op": "eq", "num": 14},
                ]},
                {"tn": "or", "num": 15, "terms": [
                    {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
                     "sf": "DART", "op": "eq", "num": 16},
                    {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
                     "sf": "DARTS", "op": "eq", "num": 17},
                ]},
            ]})

        top_terms.append({"tn": "or", "num": 18, "terms": [
            {"tn": "or", "num": 19, "terms": [
                date_term("DAT", 20),
                date_term("DDAT", 21),
            ]},
            date_term("SDAT", 22),
        ]})
        top_terms.append({"tn": "term", "t": "DOKDBE", "idx": 156, "l": 1,
                          "sf": "TYP", "op": "eq", "num": 23})

        # Mirror the same shape into the parsed/sref display strings
        if document_type is not None:
            parsed = (
                f"((/WP {wahlperiode}) AND "
                f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) "
                f"AND (/DART,DARTS (\"D\")) AND "
                f"(DAT,DDAT,SDAT= {date_range_text})) AND TYP=DOKDBE"
            )
        else:
            parsed = (
                f"((/WP {wahlperiode}) AND "
                f"(DAT,DDAT,SDAT= {date_range_text})) AND TYP=DOKDBE"
            )

        return {
            "action": "SearchAndDisplay",
            "sources": [self.db_id],
            "report": {
                "rhl": "main",
                "rhlmode": "add",
                "format": "generic1-full",
                "mime": "html",
                "sort": "WEVSO1/D WEVSO2 WEVSO3",
            },
            "search": {
                "lines": lines,
                "serverrecordname": "sr_generic1",
                "parsed": parsed,
                "sref": parsed,
                "json": [{
                    "tn": "and",
                    "num": 1,
                    "terms": top_terms,
                }],
            },
            "dataSet": "1",
        }
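
    # With wahlperiode=8, start_date="2024-01-01", end_date="2025-12-31"
    # and document_type="Antrag", the mirrored display string comes out as
    #
    #   ((/WP 8) AND (/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF ("Antrag")) AND
    #   (/DART,DARTS ("D")) AND (DAT,DDAT,SDAT= 2024-01-01 THRU 2025-12-31))
    #   AND TYP=DOKDBE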

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        """Convert DD.MM.YYYY → YYYY-MM-DD; return '' for empty input."""
        if not datum_de:
            return ""
        d, m, y = datum_de.split(".")
        return f"{y}-{m.zfill(2)}-{d.zfill(2)}"

    def _parse_hit_list_html(self, html: str, query_filter: str = "") -> list[Drucksache]:
        """Extract Drucksachen from a report.tt.html response.

        Two formats are supported and auto-detected:

        - **LSA-style:** the records are embedded as Perl Data::Dumper
          dumps inside ``<pre>$VAR1 = …</pre>`` blocks. WEV06 → title,
          WEV32 → metadata + PDF path. Used by Sachsen-Anhalt's PADOKA
          template.
        - **Berlin-style:** standard production HTML cards with
          ``efxRecordRepeater`` divs. Title in an ``<h3 class="h5">``,
          metadata + PDF link in a ``<span class="h6">``. Used by
          Berlin's PARDOK template.
        """
        if self._RE_PRE_BLOCK.search(html):
            return self._parse_hit_list_dump(html, query_filter)
        return self._parse_hit_list_cards(html, query_filter)

    def _parse_hit_list_dump(self, html: str, query_filter: str) -> list[Drucksache]:
        """Parse LSA-style ``<pre>$VAR1 = …</pre>`` Perl-dump records."""
        results: list[Drucksache] = []
        for pre in self._RE_PRE_BLOCK.findall(html):
            m_ds = self._RE_DRUCKSACHE.search(pre)
            if not m_ds:
                continue
            drucksache = m_ds.group(1)

            m_t = self._RE_TITLE.search(pre)
            title = self._decode_perl_hex(m_t.group(1)) if m_t else f"Drucksache {drucksache}"

            m_pdf = self._RE_PDF.search(pre)
            pdf_rel = m_pdf.group(1) if m_pdf else ""
            pdf_url = f"{self.base_url}{self.pdf_url_prefix}{pdf_rel}" if pdf_rel else ""

            m_w32 = self._RE_URHEBER_DATUM.search(pre)
            urheber = self._decode_perl_hex(m_w32.group(1).strip()) if m_w32 else ""
            datum_iso = self._datum_de_to_iso(m_w32.group(2) if m_w32 else "")
            fraktionen = self._normalize_fraktion(urheber) if urheber else []

            doc = Drucksache(
                drucksache=drucksache,
                title=title,
                fraktionen=fraktionen,
                datum=datum_iso,
                link=pdf_url,
                bundesland=self.bundesland,
                typ="Antrag",
            )

            if query_filter:
                hay = f"{title} {urheber}".lower()
                if not all(t in hay for t in query_filter.lower().split()):
                    continue

            results.append(doc)

        return results

    def _parse_hit_list_cards(self, html: str, query_filter: str) -> list[Drucksache]:
        """Parse Berlin-style ``efxRecordRepeater`` HTML-card records.

        Each card contains an ``<h3>`` title, a metadata ``<span class="h6">``
        with the document type, the Drucksachen-Nummer, and the date,
        plus a direct ``<a href="…pdf">`` link to the PDF on the same host.
        """
        results: list[Drucksache] = []

        # Split the HTML on every record-div opener — easier than balancing
        # divs with regex.
        chunks = html.split('class="record')
        # The first chunk is the prelude, skip it
        for chunk in chunks[1:]:
            # Each chunk now starts at the record class attribute
            m_t = self._RE_BE_TITLE.search(chunk)
            title = m_t.group(1).strip() if m_t else "Ohne Titel"

            m_ds = self._RE_BE_DRUCKSACHE.search(chunk)
            if not m_ds:
                continue
            drucksache = m_ds.group(1)

            m_pdf = self._RE_BE_LINK.search(chunk)
            pdf_url = ""
            if m_pdf:
                href = m_pdf.group(1)
                if href.startswith("http://") or href.startswith("https://"):
                    pdf_url = href
                elif href.startswith("/"):
                    pdf_url = f"{self.base_url}{href}"
                else:
                    pdf_url = f"{self.base_url}{self.pdf_url_prefix}{href}"

            m_dat = self._RE_BE_DATUM.search(chunk)
            datum_iso = self._datum_de_to_iso(m_dat.group(1) if m_dat else "")

            m_doc = self._RE_BE_DOCTYPE.search(chunk)
            doctype_full = m_doc.group(1).strip() if m_doc else "Drucksache"

            # Berlin often packs the originator(s) into the same h6 line:
            # "Antrag CDU, SPD" → fraktionen = ["CDU","SPD"], typ = "Antrag".
            # Senat-Vorlagen carry no fraction, only "Vorlage zur …".
            fraktionen = self._normalize_fraktion(doctype_full)
            # Strip the fraction names back out of the typ string so the UI
            # shows a clean "Antrag" / "Vorlage …" label.
            typ = doctype_full
            if fraktionen:
                # Cut at the first occurrence of any party name
                cuts = [typ.upper().find(f.upper()) for f in fraktionen]
                cuts = [c for c in cuts if c >= 0]
                if cuts:
                    typ = typ[: min(cuts)].rstrip(" ,")

            doc = Drucksache(
                drucksache=drucksache,
                title=title,
                fraktionen=fraktionen,
                datum=datum_iso,
                link=pdf_url,
                bundesland=self.bundesland,
                typ=typ,
            )

            if query_filter:
                hay = f"{title} {doctype_full}".lower()
                if not all(t in hay for t in query_filter.lower().split()):
                    continue

            results.append(doc)

        return results

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search recent documents of the current Wahlperiode.

        ``query`` is applied as a client-side title/Urheber filter; the
        server-side query covers the configured ``date_window_days``
        (default 24 months).
        """
        from datetime import date, timedelta

        end = date.today()
        start = end - timedelta(days=self.date_window_days)
        body = self._build_search_body(
            wahlperiode=self.wahlperiode,
            start_date=start.isoformat(),
            end_date=end.isoformat(),
        )

        browse_html = f"{self.base_url}{self.portala_path}/browse.tt.html"
        browse_json = f"{self.base_url}{self.portala_path}/browse.tt.json"
        report_html = f"{self.base_url}{self.portala_path}/report.tt.html"

        async with httpx.AsyncClient(
            timeout=30,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                # Step 1: warm up cookies via the browse page
                await client.get(browse_html)

                # Step 2: submit the search action
                resp = await client.post(
                    browse_json,
                    json=body,
                    headers={"Referer": browse_html},
                )
                if resp.status_code != 200:
                    print(f"{self.bundesland} search HTTP {resp.status_code}")
                    return []

                data = resp.json()
                report_id = data.get("report_id")
                if not report_id:
                    print(f"{self.bundesland}: no report_id in response: {data}")
                    return []

                # Step 3: fetch the HTML hit list.
                # Take a generous chunk so the client-side filter still has
                # enough candidates to work with.
                chunksize = 100 if query else limit
                report_resp = await client.post(
                    report_html,
                    json={"report_id": report_id, "start": 0, "chunksize": chunksize},
                    headers={"Referer": browse_html},
                )
                if report_resp.status_code != 200:
                    print(f"{self.bundesland} report HTTP {report_resp.status_code}")
                    return []

                results = self._parse_hit_list_html(report_resp.text, query_filter=query)
                return results[:limit]

            except Exception as e:
                print(f"{self.bundesland} search error: {e}")
                return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up a single document by scanning a broad search result.

        Pragmatic MVP: run a wide, unfiltered search and pick out the
        requested ID client-side. A targeted single-document fetch would
        require a different action.search.json structure that we have
        not reverse-engineered yet.
        """
        results = await self.search(query="", limit=200)
        for doc in results:
            if doc.drucksache == drucksache:
                return doc
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its text."""
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None

        async with httpx.AsyncClient(
            timeout=60,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    return None
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                text = ""
                for page in pdf:
                    text += page.get_text()
                pdf.close()
                return text
            except Exception as e:
                print(f"{self.bundesland} download error for {drucksache}: {e}")
                return None


class BayernAdapter(ParlamentAdapter):
    """Adapter for Bayerischer Landtag."""

    bundesland = "BY"
    name = "Bayerischer Landtag"
    base_url = "https://www.bayern.landtag.de"

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        # TODO: Implement Bayern search
        return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        # TODO: Implement
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        return None


class BWAdapter(ParlamentAdapter):
    """Adapter for Baden-Württemberg Landtag."""

    bundesland = "BW"
    name = "Landtag Baden-Württemberg"
    base_url = "https://www.landtag-bw.de"

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        # TODO: Implement BW search
        return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        return None


# Registry of adapters
ADAPTERS = {
    "NRW": NRWAdapter(),
    "LSA": PortalaAdapter(
        bundesland="LSA",
        name="Landtag von Sachsen-Anhalt (PADOKA)",
        base_url="https://padoka.landtag.sachsen-anhalt.de",
        db_id="lsa.lissh",
        wahlperiode=8,
        portala_path="/portal",
        document_type="Antrag",
        pdf_url_prefix="/files/",
    ),
    "BE": PortalaAdapter(
        bundesland="BE",
        name="Abgeordnetenhaus von Berlin (PARDOK)",
        base_url="https://pardok.parlament-berlin.de",
        db_id="lah.lissh",
        wahlperiode=19,
        portala_path="/portala",
        # Berlin's ETYPF index uses different value strings — drop the
        # document_type subtree, fall back to client-side title filter.
        document_type=None,
        # Tighter date window: BE has ~10x more documents than LSA, so a
        # narrower window keeps the per-request payload bounded.
        date_window_days=180,
        pdf_url_prefix="/files/",
    ),
    "BY": BayernAdapter(),
    "BW": BWAdapter(),
}


def get_adapter(bundesland: str) -> Optional[ParlamentAdapter]:
    """Get adapter for a bundesland."""
    return ADAPTERS.get(bundesland)


async def search_all(query: str, bundesland: str = "NRW", limit: int = 20) -> list[Drucksache]:
    """Search parliament documents in a specific state."""
    adapter = get_adapter(bundesland)
    if not adapter:
        return []
    return await adapter.search(query, limit)
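

if __name__ == "__main__":
    # Ad-hoc smoke test, not part of the public API: run one search and
    # print the hits. The query term and state are arbitrary examples.
    import asyncio

    async def _demo() -> None:
        docs = await search_all("Klimaschutz", bundesland="NRW", limit=5)
        for d in docs:
            print(f"{d.drucksache}  {d.datum}  {d.typ}: {d.title}")

    asyncio.run(_demo())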