PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live Bundesland
(after NRW and LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
  - portala_path: "/portal" for LSA, "/portala" for Berlin
  - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
    index uses different value strings; the document_type subtree
    is dropped from the action.search.json tree)
  - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
    list are passed through unchanged (Berlin embeds full
    starweb/adis/citat/... links)
  - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
    documents per WP; a narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted; this is mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
  1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
     (existing parser, untouched).
  2. Berlin-style: production HTML cards with efxRecordRepeater
     divs, h3 titles, h6 metadata lines containing the document
     type, Drucksachen-ID and date, plus a direct <a href="…pdf">
     to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  regexes in _normalize_fraktion.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated Fraktion lists like
  "CDU, SPD" failed to match CDU. It also picks up BSW for the
  Brombeer/SPD-BSW Landtage and maps "Senat von Berlin" to
  Landesregierung (expected outputs are sketched below).
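  A minimal sketch of the intended behaviour (illustrative inputs,
  not actual PARDOK strings):

      PortalaAdapter._normalize_fraktion("Antrag CDU, SPD")
      # -> ["CDU", "SPD"]
      PortalaAdapter._normalize_fraktion("Vorlage - zur Beschlussfassung - Senat von Berlin")
      # -> ["Landesregierung"]
      PortalaAdapter._normalize_fraktion("Antrag Fraktion BÜNDNIS 90/DIE GRÜNEN")
      # -> ["GRÜNE"]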
bundeslaender.py
- BE flipped to aktiv=True. The anmerkung field documents the
  Wahlprogramm gap and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with a 180-day date window returns 2962 hits; page 1 contains
  5 records, all with title, Drucksache, date and PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge from March 2026; no
  regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme for the LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  the Wayback Machine has no snapshots. The analyzer therefore falls
  back to the parties' cross-state Grundsatzprogramme for BE Anträge
  until the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
"""Parliament search adapters for different German states."""
|
|
|
|
import httpx
|
|
import re
|
|
from abc import ABC, abstractmethod
|
|
from dataclasses import dataclass
|
|
from typing import Optional
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
@dataclass
|
|
class Drucksache:
|
|
"""A parliamentary document."""
|
|
drucksache: str # e.g. "18/8125"
|
|
title: str
|
|
fraktionen: list[str]
|
|
datum: str # ISO date
|
|
link: str # PDF URL
|
|
bundesland: str
|
|
typ: str = "Antrag" # Antrag, Anfrage, Beschlussempfehlung, etc.
|
|
|
|
|
|
class ParlamentAdapter(ABC):
|
|
"""Base adapter for searching parliament documents."""
|
|
|
|
bundesland: str
|
|
name: str
|
|
|
|
@abstractmethod
|
|
async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
|
|
"""Search for documents matching query."""
|
|
pass
|
|
|
|
@abstractmethod
|
|
async def get_document(self, drucksache: str) -> Optional[Drucksache]:
|
|
"""Get a specific document by ID."""
|
|
pass
|
|
|
|
@abstractmethod
|
|
async def download_text(self, drucksache: str) -> Optional[str]:
|
|
"""Download and extract text from a document."""
|
|
pass
|
|
|
|
|
|
class NRWAdapter(ParlamentAdapter):
|
|
"""Adapter for NRW Landtag (opal.landtag.nrw.de)."""
|
|
|
|
bundesland = "NRW"
|
|
name = "Landtag Nordrhein-Westfalen"
|
|
base_url = "https://opal.landtag.nrw.de"
|
|
search_url = "https://opal.landtag.nrw.de/home/dokumente/dokumentensuche/parlamentsdokumente/aktuelle-dokumente.html"
|
|
|
|
def _parse_query(self, query: str) -> tuple[str, list[str], bool]:
|
|
"""
|
|
Parse search query for AND logic and exact phrases.
|
|
Returns: (search_term_for_api, filter_terms, is_exact)
|
|
|
|
Examples:
|
|
- 'Klimaschutz Energie' -> ('Klimaschutz', ['klimaschutz', 'energie'], False)
|
|
- '"Grüner Stahl"' -> ('Grüner Stahl', ['grüner stahl'], True)
|
|
- 'Klimaschutz "erneuerbare Energie"' -> ('Klimaschutz', ['klimaschutz', 'erneuerbare energie'], False)
|
|
"""
|
|
query = query.strip()
|
|
|
|
# Check for exact phrase (entire query in quotes)
|
|
if query.startswith('"') and query.endswith('"') and query.count('"') == 2:
|
|
exact = query[1:-1].strip()
|
|
return (exact, [exact.lower()], True)
|
|
|
|
# Extract quoted phrases and regular terms
|
|
import shlex
|
|
try:
|
|
parts = shlex.split(query)
|
|
except ValueError:
|
|
# Fallback for unbalanced quotes
|
|
parts = query.split()
|
|
|
|
if not parts:
|
|
return (query, [query.lower()], False)
|
|
|
|
# Use first term for API search, all terms for filtering
|
|
filter_terms = [p.lower() for p in parts]
|
|
return (parts[0], filter_terms, False)
|
|
|
|
def _matches_all_terms(self, doc: 'Drucksache', terms: list[str], is_exact: bool) -> bool:
|
|
"""Check if document matches all search terms (AND logic)."""
|
|
searchable = f"{doc.title} {doc.drucksache} {' '.join(doc.fraktionen)} {doc.typ}".lower()
|
|
|
|
if is_exact:
|
|
# Exact phrase must appear
|
|
return terms[0] in searchable
|
|
else:
|
|
# All terms must appear (AND)
|
|
return all(term in searchable for term in terms)
|
|
|
|
async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
|
|
"""Search NRW Landtag documents via OPAL portal."""
|
|
results = []
|
|
|
|
# Parse query for AND logic
|
|
api_query, filter_terms, is_exact = self._parse_query(query)
|
|
|
|
async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
|
|
try:
|
|
# First, get the page to establish session
|
|
initial = await client.get(self.search_url)
|
|
if initial.status_code != 200:
|
|
print(f"NRW search initial request failed: {initial.status_code}")
|
|
return []
|
|
|
|
# Parse for webflow token from pagination links
|
|
soup = BeautifulSoup(initial.text, 'html.parser')
|
|
|
|
# Find a pagination link to extract the webflow token
|
|
pagination_link = soup.select_one('a[href*="webflowexecution"]')
|
|
webflow_token = ""
|
|
webflow_execution = ""
|
|
|
|
if pagination_link:
|
|
href = pagination_link.get('href', '')
|
|
# Extract webflowToken and webflowexecution from URL
|
|
token_match = re.search(r'webflowToken=([^&]*)', href)
|
|
exec_match = re.search(r'(webflowexecution[^=]+)=([^&]+)', href)
|
|
if token_match:
|
|
webflow_token = token_match.group(1)
|
|
if exec_match:
|
|
webflow_execution = f"{exec_match.group(1)}={exec_match.group(2)}"
|
|
|
|
# Now perform the search with POST
|
|
# Find the form action URL with webflow token
|
|
form = soup.select_one('form#docSearchByItem')
|
|
form_action = self.search_url
|
|
if form and form.get('action'):
|
|
action = form.get('action')
|
|
if action.startswith('/'):
|
|
form_action = f"{self.base_url}{action}"
|
|
elif action.startswith('http'):
|
|
form_action = action
|
|
else:
|
|
form_action = f"{self.search_url}?{action}"
|
|
|
|
# Build form data for "Einfache Suche" (searchByItem form)
|
|
form_data = {
|
|
'_eventId_sendform': '1',
|
|
'dokNum': api_query, # This is the text search field
|
|
'formId': 'searchByItem',
|
|
'dokTyp': '', # All types
|
|
'wp': '18', # Wahlperiode 18
|
|
}
|
|
|
|
# POST request with form data to the form action URL
|
|
search_resp = await client.post(
|
|
form_action,
|
|
data=form_data,
|
|
cookies=initial.cookies,
|
|
headers={'Content-Type': 'application/x-www-form-urlencoded'}
|
|
)
|
|
|
|
if search_resp.status_code != 200:
|
|
print(f"NRW search request failed: {search_resp.status_code}")
|
|
return []
|
|
|
|
# Parse results
|
|
soup = BeautifulSoup(search_resp.text, 'html.parser')
|
|
|
|
# Find all document result items (li elements containing articles)
|
|
items = soup.select('li:has(article)')
|
|
|
|
for item in items[:limit]:
|
|
try:
|
|
# Extract drucksache number from first link
|
|
num_link = item.select_one('a[href*="MMD"]')
|
|
if not num_link:
|
|
continue
|
|
|
|
href = num_link.get('href', '')
|
|
# Extract number: MMD18-12345.pdf -> 18/12345
|
|
match = re.search(r'MMD(\d+)-(\d+)\.pdf', href)
|
|
if not match:
|
|
continue
|
|
|
|
legislatur, nummer = match.groups()
|
|
drucksache = f"{legislatur}/{nummer}"
|
|
pdf_url = f"https://www.landtag.nrw.de{href}" if href.startswith('/') else href
|
|
|
|
# Extract title from the title link (class e-document-result-item__title)
|
|
title_elem = item.select_one('a.e-document-result-item__title')
|
|
if title_elem:
|
|
# Get text content, clean it up
|
|
title = title_elem.get_text(strip=True)
|
|
# Remove SVG icon text and clean
|
|
title = re.sub(r'\s*<svg.*', '', title)
|
|
title = re.sub(r'\s+', ' ', title).strip()
|
|
else:
|
|
# Fallback: try to find any longer text
|
|
title = f"Drucksache {drucksache}"
|
|
|
|
# Clean up common artifacts
|
|
title = re.sub(r'\s*\(\s*externer Link.*?\)', '', title).strip()
|
|
|
|
# Extract type (Antrag, Kleine Anfrage, etc.)
|
|
typ_elem = item.select_one('.e-document-result-item__category')
|
|
typ = typ_elem.get_text(strip=True) if typ_elem else "Drucksache"
|
|
|
|
# Extract date
|
|
time_elem = item.select_one('time')
|
|
datum = ""
|
|
if time_elem:
|
|
datum_text = time_elem.get_text(strip=True)
|
|
# Convert DD.MM.YYYY to YYYY-MM-DD
|
|
date_match = re.match(r'(\d{2})\.(\d{2})\.(\d{4})', datum_text)
|
|
if date_match:
|
|
d, m, y = date_match.groups()
|
|
datum = f"{y}-{m}-{d}"
|
|
|
|
# Extract Urheber (fraktionen) - look for paragraph containing "Urheber:"
|
|
urheber_text = ""
|
|
for p in item.select('p'):
|
|
if 'Urheber:' in p.get_text():
|
|
urheber_text = p.get_text()
|
|
break
|
|
|
|
fraktionen = []
|
|
if urheber_text:
|
|
# Extract party names (SPD, CDU, GRÜNE, FDP, AfD)
|
|
for party in ['SPD', 'CDU', 'GRÜNE', 'Grüne', 'FDP', 'AfD']:
|
|
if party in urheber_text:
|
|
fraktionen.append(party.upper() if party.lower() != 'grüne' else 'GRÜNE')
|
|
|
|
doc = Drucksache(
|
|
drucksache=drucksache,
|
|
title=title,
|
|
fraktionen=fraktionen,
|
|
datum=datum,
|
|
link=pdf_url,
|
|
bundesland="NRW",
|
|
typ=typ,
|
|
)
|
|
|
|
# Apply AND filter (all terms must match)
|
|
if self._matches_all_terms(doc, filter_terms, is_exact):
|
|
results.append(doc)
|
|
|
|
except Exception as e:
|
|
print(f"Error parsing item: {e}")
|
|
continue
|
|
|
|
except Exception as e:
|
|
print(f"NRW search error: {e}")
|
|
|
|
return results
|
|
|
|
async def get_document(self, drucksache: str) -> Optional[Drucksache]:
|
|
"""Get document metadata by drucksache ID (e.g. '18/8125')."""
|
|
# Parse legislatur and number
|
|
match = re.match(r"(\d+)/(\d+)", drucksache)
|
|
if not match:
|
|
return None
|
|
|
|
legislatur, nummer = match.groups()
|
|
pdf_url = f"https://www.landtag.nrw.de/portal/WWW/dokumentenarchiv/Dokument/MMD{legislatur}-{nummer}.pdf"
|
|
|
|
# Try to fetch and extract basic info
|
|
async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
|
|
try:
|
|
resp = await client.head(pdf_url)
|
|
if resp.status_code == 200:
|
|
return Drucksache(
|
|
drucksache=drucksache,
|
|
title=f"Drucksache {drucksache}",
|
|
fraktionen=[],
|
|
datum="",
|
|
link=pdf_url,
|
|
bundesland="NRW",
|
|
)
|
|
except:
|
|
pass
|
|
|
|
return None
|
|
|
|
async def download_text(self, drucksache: str) -> Optional[str]:
|
|
"""Download PDF and extract text."""
|
|
import fitz # PyMuPDF
|
|
|
|
doc = await self.get_document(drucksache)
|
|
if not doc:
|
|
return None
|
|
|
|
async with httpx.AsyncClient(timeout=60, follow_redirects=True) as client:
|
|
try:
|
|
resp = await client.get(doc.link)
|
|
if resp.status_code != 200:
|
|
return None
|
|
|
|
# Extract text with PyMuPDF
|
|
pdf = fitz.open(stream=resp.content, filetype="pdf")
|
|
text = ""
|
|
for page in pdf:
|
|
text += page.get_text()
|
|
pdf.close()
|
|
|
|
return text
|
|
except Exception as e:
|
|
print(f"Error downloading {drucksache}: {e}")
|
|
return None
|
|
|
|
|
|
class PortalaAdapter(ParlamentAdapter):
    """Adapter for portala/eUI-based parliament documentation systems.

    Used by parliaments running the proprietary "esearch" / portala framework
    (originally developed for STAR/StarFinder backends, now wrapped in a
    Single-Page App with Template Toolkit on the server side):

    - **LSA** (Sachsen-Anhalt) — PADOKA at ``padoka.landtag.sachsen-anhalt.de``
      under ``/portal/`` (singular)
    - **BE** (Berlin) — PARDOK at ``pardok.parlament-berlin.de`` under
      ``/portala/`` (with the trailing 'a')

    Both instances share the same JSON action schema; only the base URL,
    the data source ID, the application path prefix and a few minor
    quirks differ — those are constructor parameters so that the same
    class can serve both states (and any future portala-based parliament).

    The search workflow is two-stage:

    1. ``POST {base}{path}/browse.tt.json`` with a complex JSON ``action``
       body that contains an Elasticsearch-style query tree under
       ``search.json``. The server returns a ``report_id`` plus hit count.
    2. ``POST {base}{path}/report.tt.html`` with ``{report_id, start,
       chunksize}`` to fetch the HTML hit list. Hits arrive either as Perl
       Data::Dumper blocks in ``<pre>`` tags (LSA) or as production HTML
       cards (Berlin); see ``_parse_hit_list_html``.

    The query body schema was reverse-engineered from
    https://github.com/okfde/dokukratie/blob/main/dokukratie/scrapers/portala.query.json
    (GPL-3.0 — only structure/selectors are reused, not Python code).

    Full-text search is **not** implemented in the MVP: the adapter
    returns documents of the current Wahlperiode in the given date
    window, and the search query is applied as a client-side
    title/Urheber filter. The server-side full-text path requires
    state-specific ``sf`` index names that are not yet known.
    """

    def __init__(
        self,
        *,
        bundesland: str,
        name: str,
        base_url: str,
        db_id: str,
        wahlperiode: int,
        portala_path: str = "/portal",
        document_type: Optional[str] = "Antrag",
        pdf_url_prefix: str = "/files/",
        date_window_days: int = 730,
    ) -> None:
        """Configure a portala/eUI adapter for one specific parliament.

        Args:
            bundesland: state code (e.g. ``"LSA"``, ``"BE"``).
            name: human-readable adapter label (used in logs/UI).
            base_url: ``https://...`` of the portal host without trailing slash.
            db_id: data source identifier the eUI server expects in
                ``action.sources``, e.g. ``"lsa.lissh"`` or ``"lah.lissh"``.
            wahlperiode: current legislative period — fed into the WP
                term of the search tree.
            portala_path: path prefix where the portala app lives. ``/portal``
                for LSA, ``/portala`` for Berlin.
            document_type: optional filter applied via ETYPF/DTYPF/DART
                terms. ``"Antrag"`` works for LSA; for instances where
                the index uses different document_type values (e.g. Berlin),
                pass ``None`` to drop the document_type subtree entirely
                — the user can still filter client-side by title.
            pdf_url_prefix: URL fragment between ``base_url`` and the
                relative PDF path returned by the server.
            date_window_days: how many days back ``search()`` looks by
                default.
        """
        self.bundesland = bundesland
        self.name = name
        self.base_url = base_url.rstrip("/")
        self.db_id = db_id
        self.wahlperiode = wahlperiode
        self.portala_path = "/" + portala_path.strip("/")
        self.document_type = document_type
        self.pdf_url_prefix = "/" + pdf_url_prefix.strip("/") + "/"
        self.date_window_days = date_window_days

    # ── LSA-style hit list (Perl Data::Dumper inside <pre> blocks) ──
    # Reverse-engineered "WEV*" record fields:
    #   WEV06.main = title
    #   WEV32.5 = relative PDF path
    #   WEV32.main = "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b> ..."
    _RE_TITLE = re.compile(r"'WEV06'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
    _RE_PDF = re.compile(r"'5'\s*=>\s*'([^']*\.pdf)'")
    _RE_DRUCKSACHE = re.compile(r"Drucksache\s*<b>(\d+/\d+)</b>")
    _RE_URHEBER_DATUM = re.compile(
        r"'WEV32'\s*=>\s*\[\s*\{[^}]*'main'\s*=>\s*[\"']Antrag\s+(.+?)\s+(\d{1,2}\.\d{1,2}\.\d{4})\s+Drucksache",
    )
    _RE_PRE_BLOCK = re.compile(r'<pre>\$VAR1 = (.*?)</pre>', re.DOTALL)

    # ── Berlin-style hit list (production HTML cards, no Perl dump) ──
    # The whole div for one record:
    _RE_BE_RECORD = re.compile(
        r'<div[^>]*class="[^"]*efxRecordRepeater[^"]*"[^>]*data-efx-rec="[^"]*"[^>]*>(.*?)(?=<div[^>]*efxRecordRepeater|<div[^>]*id="efxResultsEnd"|</main>|$)',
        re.DOTALL,
    )
    _RE_BE_TITLE = re.compile(r'<h3[^>]*class="h5[^"]*"[^>]*>\s*<span>([^<]+)</span>')
    _RE_BE_LINK = re.compile(r'<a[^>]*href="([^"]+\.pdf)"[^>]*>')
    # The metadata h6 looks like:
    # <span class="h6">Antrag (Eilantrag) <a ...>Drucksache 19/3104</a> S. 1 bis 24 vom 31.03.2026</span>
    _RE_BE_DRUCKSACHE = re.compile(r'Drucksache\s+(\d+/\d+)')
    _RE_BE_DATUM = re.compile(r'vom\s+(\d{1,2}\.\d{1,2}\.\d{4})')
    _RE_BE_DOCTYPE = re.compile(r'<span class="h6">\s*([^<&]+?)(?: |<)')

    @staticmethod
    def _decode_perl_hex(s: str) -> str:
        """Decode \\x{abcd} escape sequences from Perl Data::Dumper output."""
        return re.sub(r'\\x\{([0-9a-f]+)\}', lambda m: chr(int(m.group(1), 16)), s)

    @staticmethod
    def _normalize_fraktion(urheber: str) -> list[str]:
        """Map an Urheber string to canonical Fraktion codes.

        Uses regex word boundaries instead of plain substring matching so
        that comma-separated lists ("CDU, SPD") and the embedded "DIE
        LINKE" are matched reliably.
        """
        u = urheber.upper()
        out: list[str] = []

        def has(pattern: str) -> bool:
            return re.search(pattern, u) is not None

        if has(r"\bBÜNDNIS\s*90\b") or has(r"\bGR(?:Ü|UE)NE\b"):
            out.append("GRÜNE")
        if has(r"\bCDU\b"):
            out.append("CDU")
        if has(r"\bSPD\b"):
            out.append("SPD")
        if has(r"\bFDP\b"):
            out.append("FDP")
        if has(r"\bAFD\b"):
            out.append("AfD")
        if has(r"\bLINKE\b"):
            out.append("LINKE")
        if has(r"\bBSW\b"):
            out.append("BSW")
        if has(r"LANDESREGIERUNG|SENAT VON BERLIN|REGIERENDE[RN]?\s+BÜRGERMEISTER|MINISTER\b|STAATSKANZLEI"):
            out.append("Landesregierung")
        return out

    def _build_search_body(
        self,
        wahlperiode: int,
        start_date: str,
        end_date: str,
    ) -> dict:
        """Build the action JSON body for browse.tt.json.

        The schema is taken from dokukratie's portala.query.json template
        and only differs in the data source and the variable substitutions.
        When ``self.document_type`` is None, the ETYPF/DTYPF/DART subtree
        is dropped — useful for parliaments whose ETYPF index uses
        different value strings than ``"Antrag"``.
        """
        document_type = self.document_type
        date_range_text = f"{start_date} THRU {end_date}"
        date_term = lambda sf, num: { # noqa: E731 — local helper
            "tn": "trange", "sf": sf, "op": "eq", "num": num,
            "idx": 119, "l": 3,
            "p1": start_date, "t1": start_date,
            "p2": end_date, "t2": end_date,
            "t": date_range_text,
        }

        # Build the search.lines (form-state mirror) and the json tree
        lines: dict = {
            "2": str(wahlperiode),
            "10": start_date,
            "11": end_date,
            "20.1": "alWEBBI",
            "20.2": "alWEBBI",
            "20.3": "alWEBBI",
            "90.1": "AND",
            "90.2": "AND",
            "90.3": "AND",
        }
        if document_type is not None:
            lines["3"] = document_type
            lines["4"] = "D"

        # Top-level AND tree
        top_terms: list = [
            {"tn": "term", "t": str(wahlperiode), "idx": 6, "l": 3,
             "sf": "WP", "op": "eq", "num": 5},
        ]

        if document_type is not None:
            top_terms.append({"tn": "or", "num": 3, "terms": [
                {"tn": "or", "num": 4, "terms": [
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "ETYPF", "op": "eq", "num": 10},
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "ETYP2F", "op": "eq", "num": 11},
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "DTYPF", "op": "eq", "num": 12},
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "DTYP2F", "op": "eq", "num": 13},
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "1VTYPF", "op": "eq", "num": 14},
                ]},
                {"tn": "or", "num": 15, "terms": [
                    {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
                     "sf": "DART", "op": "eq", "num": 16},
                    {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
                     "sf": "DARTS", "op": "eq", "num": 17},
                ]},
            ]})

        top_terms.append({"tn": "or", "num": 18, "terms": [
            {"tn": "or", "num": 19, "terms": [
                date_term("DAT", 20),
                date_term("DDAT", 21),
            ]},
            date_term("SDAT", 22),
        ]})
        top_terms.append({"tn": "term", "t": "DOKDBE", "idx": 156, "l": 1,
                          "sf": "TYP", "op": "eq", "num": 23})

        # Mirror the same shape into the parsed/sref display strings
        if document_type is not None:
            parsed = (
                f"((/WP {wahlperiode}) AND "
                f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) "
                f"AND (/DART,DARTS (\"D\")) AND "
                f"(DAT,DDAT,SDAT= {date_range_text})) AND TYP=DOKDBE"
            )
        else:
            parsed = (
                f"((/WP {wahlperiode}) AND "
                f"(DAT,DDAT,SDAT= {date_range_text})) AND TYP=DOKDBE"
            )

        return {
            "action": "SearchAndDisplay",
            "sources": [self.db_id],
            "report": {
                "rhl": "main",
                "rhlmode": "add",
                "format": "generic1-full",
                "mime": "html",
                "sort": "WEVSO1/D WEVSO2 WEVSO3",
            },
            "search": {
                "lines": lines,
                "serverrecordname": "sr_generic1",
                "parsed": parsed,
                "sref": parsed,
                "json": [{
                    "tn": "and",
                    "num": 1,
                    "terms": top_terms,
                }],
            },
            "dataSet": "1",
        }

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        """Convert DD.MM.YYYY → YYYY-MM-DD; return '' for empty input."""
        if not datum_de:
            return ""
        d, m, y = datum_de.split(".")
        return f"{y}-{m.zfill(2)}-{d.zfill(2)}"

    def _parse_hit_list_html(self, html: str, query_filter: str = "") -> list[Drucksache]:
        """Extract Drucksachen from a report.tt.html response.

        Two formats are supported and auto-detected:

        - **LSA-style:** the records are embedded as Perl Data::Dumper
          dumps inside ``<pre>$VAR1 = …</pre>`` blocks. WEV06 → title,
          WEV32 → metadata + PDF path. Used by Sachsen-Anhalt's PADOKA
          template.
        - **Berlin-style:** standard production HTML cards with
          ``efxRecordRepeater`` divs. Title in an ``<h3 class="h5">``,
          metadata + PDF link in a ``<span class="h6">``. Used by
          Berlin's PARDOK template.
        """
        if self._RE_PRE_BLOCK.search(html):
            return self._parse_hit_list_dump(html, query_filter)
        return self._parse_hit_list_cards(html, query_filter)

    def _parse_hit_list_dump(self, html: str, query_filter: str) -> list[Drucksache]:
        """Parse LSA-style ``<pre>$VAR1 = …</pre>`` Perl-dump records."""
        results: list[Drucksache] = []
        for pre in self._RE_PRE_BLOCK.findall(html):
            m_ds = self._RE_DRUCKSACHE.search(pre)
            if not m_ds:
                continue
            drucksache = m_ds.group(1)

            m_t = self._RE_TITLE.search(pre)
            title = self._decode_perl_hex(m_t.group(1)) if m_t else f"Drucksache {drucksache}"

            m_pdf = self._RE_PDF.search(pre)
            pdf_rel = m_pdf.group(1) if m_pdf else ""
            pdf_url = f"{self.base_url}{self.pdf_url_prefix}{pdf_rel}" if pdf_rel else ""

            m_w32 = self._RE_URHEBER_DATUM.search(pre)
            urheber = self._decode_perl_hex(m_w32.group(1).strip()) if m_w32 else ""
            datum_iso = self._datum_de_to_iso(m_w32.group(2) if m_w32 else "")
            fraktionen = self._normalize_fraktion(urheber) if urheber else []

            doc = Drucksache(
                drucksache=drucksache,
                title=title,
                fraktionen=fraktionen,
                datum=datum_iso,
                link=pdf_url,
                bundesland=self.bundesland,
                typ="Antrag",
            )

            if query_filter:
                hay = f"{title} {urheber}".lower()
                if not all(t in hay for t in query_filter.lower().split()):
                    continue

            results.append(doc)

        return results

    def _parse_hit_list_cards(self, html: str, query_filter: str) -> list[Drucksache]:
        """Parse Berlin-style ``efxRecordRepeater`` HTML-card records.

        Each card contains an ``<h3>`` title, a metadata ``<span class="h6">``
        with the document type, the Drucksachen-Nummer, and the date,
        plus a direct ``<a href="…pdf">`` link to the PDF on the same host.
        """
        results: list[Drucksache] = []

        # Split the HTML on every record-div opener — easier than balancing
        # divs with regex.
        chunks = html.split('class="record')
        # First chunk is the prelude, skip it
        for chunk in chunks[1:]:
            # Each chunk now starts at the record class attribute
            m_t = self._RE_BE_TITLE.search(chunk)
            title = m_t.group(1).strip() if m_t else "Ohne Titel"

            m_ds = self._RE_BE_DRUCKSACHE.search(chunk)
            if not m_ds:
                continue
            drucksache = m_ds.group(1)

            m_pdf = self._RE_BE_LINK.search(chunk)
            pdf_url = ""
            if m_pdf:
                href = m_pdf.group(1)
                if href.startswith("http://") or href.startswith("https://"):
                    pdf_url = href
                elif href.startswith("/"):
                    pdf_url = f"{self.base_url}{href}"
                else:
                    pdf_url = f"{self.base_url}{self.pdf_url_prefix}{href}"

            m_dat = self._RE_BE_DATUM.search(chunk)
            datum_iso = self._datum_de_to_iso(m_dat.group(1) if m_dat else "")

            m_doc = self._RE_BE_DOCTYPE.search(chunk)
            doctype_full = m_doc.group(1).strip() if m_doc else "Drucksache"

            # Berlin often packs the originator(s) into the same h6 line:
            # "Antrag CDU, SPD" → fraktionen = ["CDU","SPD"], typ = "Antrag"
            # Senat-Vorlagen carry no Fraktion, only "Vorlage zur …".
            fraktionen = self._normalize_fraktion(doctype_full)
            # Strip the Fraktion names back out of the typ string so the UI
            # shows a clean "Antrag" / "Vorlage …" label.
            typ = doctype_full
            if fraktionen:
                # Cut at the first occurrence of any party name
                cuts = [typ.upper().find(f.upper()) for f in fraktionen]
                cuts = [c for c in cuts if c >= 0]
                if cuts:
                    typ = typ[: min(cuts)].rstrip(" ,")

            doc = Drucksache(
                drucksache=drucksache,
                title=title,
                fraktionen=fraktionen,
                datum=datum_iso,
                link=pdf_url,
                bundesland=self.bundesland,
                typ=typ,
            )

            if query_filter:
                hay = f"{title} {doctype_full}".lower()
                if not all(t in hay for t in query_filter.lower().split()):
                    continue

            results.append(doc)

        return results

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search recent documents of the current Wahlperiode.

        ``query`` is applied as a client-side title/Urheber filter; the
        server-side query covers the configured ``date_window_days``
        (default 24 months).
        """
        from datetime import date, timedelta

        end = date.today()
        start = end - timedelta(days=self.date_window_days)
        body = self._build_search_body(
            wahlperiode=self.wahlperiode,
            start_date=start.isoformat(),
            end_date=end.isoformat(),
        )

        browse_html = f"{self.base_url}{self.portala_path}/browse.tt.html"
        browse_json = f"{self.base_url}{self.portala_path}/browse.tt.json"
        report_html = f"{self.base_url}{self.portala_path}/report.tt.html"

        async with httpx.AsyncClient(
            timeout=30,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                # Step 1: warm up cookies via the browse page
                await client.get(browse_html)

                # Step 2: submit the search action
                resp = await client.post(
                    browse_json,
                    json=body,
                    headers={"Referer": browse_html},
                )
                if resp.status_code != 200:
                    print(f"{self.bundesland} search HTTP {resp.status_code}")
                    return []

                data = resp.json()
                report_id = data.get("report_id")
                if not report_id:
                    print(f"{self.bundesland}: no report_id in response: {data}")
                    return []

                # Step 3: fetch the HTML hit list
                # Take a generous chunk so client-side filter still has enough
                chunksize = 100 if query else limit
                report_resp = await client.post(
                    report_html,
                    json={"report_id": report_id, "start": 0, "chunksize": chunksize},
                    headers={"Referer": browse_html},
                )
                if report_resp.status_code != 200:
                    print(f"{self.bundesland} report HTTP {report_resp.status_code}")
                    return []

                results = self._parse_hit_list_html(report_resp.text, query_filter=query)
                return results[:limit]

            except Exception as e:
                print(f"{self.bundesland} search error: {e}")
                return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up a single document by ID via a broad search plus
        client-side filtering."""
        # Pragmatic MVP: do a broad search and filter for the requested ID.
        # A targeted single-document fetch would require a different
        # action.search.json structure that we have not reverse-engineered yet.
        results = await self.search(query="", limit=200)
        for doc in results:
            if doc.drucksache == drucksache:
                return doc
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its text."""
        import fitz # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None

        async with httpx.AsyncClient(
            timeout=60,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    return None
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                text = ""
                for page in pdf:
                    text += page.get_text()
                pdf.close()
                return text
            except Exception as e:
                print(f"{self.bundesland} download error for {drucksache}: {e}")
                return None


class BayernAdapter(ParlamentAdapter):
    """Adapter for Bayerischer Landtag."""

    bundesland = "BY"
    name = "Bayerischer Landtag"
    base_url = "https://www.bayern.landtag.de"

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        # TODO: Implement Bayern search
        return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        # TODO: Implement
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        return None


class BWAdapter(ParlamentAdapter):
    """Adapter for Baden-Württemberg Landtag."""

    bundesland = "BW"
    name = "Landtag Baden-Württemberg"
    base_url = "https://www.landtag-bw.de"

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        # TODO: Implement BW search
        return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        return None


# Registry of adapters
ADAPTERS = {
    "NRW": NRWAdapter(),
    "LSA": PortalaAdapter(
        bundesland="LSA",
        name="Landtag von Sachsen-Anhalt (PADOKA)",
        base_url="https://padoka.landtag.sachsen-anhalt.de",
        db_id="lsa.lissh",
        wahlperiode=8,
        portala_path="/portal",
        document_type="Antrag",
        pdf_url_prefix="/files/",
    ),
    "BE": PortalaAdapter(
        bundesland="BE",
        name="Abgeordnetenhaus von Berlin (PARDOK)",
        base_url="https://pardok.parlament-berlin.de",
        db_id="lah.lissh",
        wahlperiode=19,
        portala_path="/portala",
        # Berlin's ETYPF index uses different value strings — drop the
        # document_type subtree, fall back to client-side title filter.
        document_type=None,
        # Tighter date window: BE has ~10x more documents than LSA, so a
        # narrower window keeps the per-request payload bounded.
        date_window_days=180,
        pdf_url_prefix="/files/",
    ),
    "BY": BayernAdapter(),
    "BW": BWAdapter(),
}


def get_adapter(bundesland: str) -> Optional[ParlamentAdapter]:
    """Get adapter for a bundesland."""
    return ADAPTERS.get(bundesland)


async def search_all(query: str, bundesland: str = "NRW", limit: int = 20) -> list[Drucksache]:
    """Search parliament documents in a specific state."""
    adapter = get_adapter(bundesland)
    if not adapter:
        return []
    return await adapter.search(query, limit)
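
# Illustrative smoke test, not part of the module's public API: a minimal
# sketch that fetches a handful of recent Berlin documents via the registry.
# It assumes network access to pardok.parlament-berlin.de and that the BE
# adapter is configured as above; the query string is an arbitrary example.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        docs = await search_all("Kleingewässer", bundesland="BE", limit=5)
        for d in docs:
            # Fields follow the Drucksache dataclass defined at the top.
            print(d.drucksache, d.typ, d.fraktionen, d.datum, d.title)

    asyncio.run(_demo())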