"""Parliament search adapters for different German states."""

import re
import shlex
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import date, timedelta
from typing import Optional

import httpx
from bs4 import BeautifulSoup

@dataclass
class Drucksache:
    """A parliamentary document."""

    drucksache: str  # e.g. "18/8125"
    title: str
    fraktionen: list[str]
    datum: str  # ISO date
    link: str  # PDF URL
    bundesland: str
    typ: str = "Antrag"  # Antrag, Anfrage, Beschlussempfehlung, etc.

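# A minimal illustrative instance (values invented; the Drucksache number
# mirrors the field comment above, the link follows the NRW archive pattern
# used in NRWAdapter.get_document below):
#
#   Drucksache(
#       drucksache="18/8125",
#       title="Beispiel-Antrag",
#       fraktionen=["SPD", "GRÜNE"],
#       datum="2026-03-12",
#       link="https://www.landtag.nrw.de/portal/WWW/dokumentenarchiv/Dokument/MMD18-8125.pdf",
#       bundesland="NRW",
#   )
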
class ParlamentAdapter(ABC):
    """Base adapter for searching parliament documents."""

    bundesland: str
    name: str

    @abstractmethod
    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search for documents matching the query."""

    @abstractmethod
    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Get a specific document by ID."""

    @abstractmethod
    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download and extract text from a document."""

class NRWAdapter(ParlamentAdapter):
    """Adapter for the NRW Landtag (opal.landtag.nrw.de)."""

    bundesland = "NRW"
    name = "Landtag Nordrhein-Westfalen"
    base_url = "https://opal.landtag.nrw.de"
    search_url = "https://opal.landtag.nrw.de/home/dokumente/dokumentensuche/parlamentsdokumente/aktuelle-dokumente.html"

    def _parse_query(self, query: str) -> tuple[str, list[str], bool]:
        """Parse a search query for AND logic and exact phrases.

        Returns: (search_term_for_api, filter_terms, is_exact)

        Examples:
        - 'Klimaschutz Energie' -> ('Klimaschutz', ['klimaschutz', 'energie'], False)
        - '"Grüner Stahl"' -> ('Grüner Stahl', ['grüner stahl'], True)
        - 'Klimaschutz "erneuerbare Energie"' -> ('Klimaschutz', ['klimaschutz', 'erneuerbare energie'], False)
        """
        query = query.strip()

        # Check for an exact phrase (entire query in quotes)
        if query.startswith('"') and query.endswith('"') and query.count('"') == 2:
            exact = query[1:-1].strip()
            return (exact, [exact.lower()], True)

        # Extract quoted phrases and regular terms
        try:
            parts = shlex.split(query)
        except ValueError:
            # Fallback for unbalanced quotes
            parts = query.split()

        if not parts:
            return (query, [query.lower()], False)

        # Use the first term for the API search, all terms for filtering
        filter_terms = [p.lower() for p in parts]
        return (parts[0], filter_terms, False)

    def _matches_all_terms(self, doc: "Drucksache", terms: list[str], is_exact: bool) -> bool:
        """Check whether a document matches all search terms (AND logic)."""
        searchable = f"{doc.title} {doc.drucksache} {' '.join(doc.fraktionen)} {doc.typ}".lower()

        if is_exact:
            # The exact phrase must appear
            return terms[0] in searchable
        # All terms must appear (AND)
        return all(term in searchable for term in terms)

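    # Illustrative sketch of how the two helpers above compose (document
    # values invented):
    #
    #   adapter = NRWAdapter()
    #   api_q, terms, exact = adapter._parse_query('Klimaschutz "erneuerbare Energie"')
    #   # api_q == 'Klimaschutz'; terms == ['klimaschutz', 'erneuerbare energie']
    #   doc = Drucksache("18/1", "Klimaschutz durch erneuerbare Energie",
    #                    [], "", "", "NRW")
    #   adapter._matches_all_terms(doc, terms, exact)  # True
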
    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search NRW Landtag documents via the OPAL portal."""
        results = []

        # Parse the query for AND logic
        api_query, filter_terms, is_exact = self._parse_query(query)

        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            try:
                # First, fetch the search page to establish a session
                initial = await client.get(self.search_url)
                if initial.status_code != 200:
                    print(f"NRW search initial request failed: {initial.status_code}")
                    return []

                soup = BeautifulSoup(initial.text, 'html.parser')

                # Find a pagination link to extract the webflow token.
                # (Extracted for completeness; the POST below currently
                # succeeds without sending these values.)
                pagination_link = soup.select_one('a[href*="webflowexecution"]')
                webflow_token = ""
                webflow_execution = ""

                if pagination_link:
                    href = pagination_link.get('href', '')
                    # Extract webflowToken and webflowexecution from the URL
                    token_match = re.search(r'webflowToken=([^&]*)', href)
                    exec_match = re.search(r'(webflowexecution[^=]+)=([^&]+)', href)
                    if token_match:
                        webflow_token = token_match.group(1)
                    if exec_match:
                        webflow_execution = f"{exec_match.group(1)}={exec_match.group(2)}"

                # Now perform the search with a POST to the form action URL
                form = soup.select_one('form#docSearchByItem')
                form_action = self.search_url
                if form and form.get('action'):
                    action = form.get('action')
                    if action.startswith('/'):
                        form_action = f"{self.base_url}{action}"
                    elif action.startswith('http'):
                        form_action = action
                    else:
                        form_action = f"{self.search_url}?{action}"

                # Build form data for the "Einfache Suche" (searchByItem form)
                form_data = {
                    '_eventId_sendform': '1',
                    'dokNum': api_query,  # the text search field
                    'formId': 'searchByItem',
                    'dokTyp': '',  # all document types
                    'wp': '18',  # Wahlperiode 18
                }

                search_resp = await client.post(
                    form_action,
                    data=form_data,
                    cookies=initial.cookies,
                    headers={'Content-Type': 'application/x-www-form-urlencoded'},
                )

                if search_resp.status_code != 200:
                    print(f"NRW search request failed: {search_resp.status_code}")
                    return []

                # Parse the results
                soup = BeautifulSoup(search_resp.text, 'html.parser')

                # Find all document result items (li elements containing articles)
                items = soup.select('li:has(article)')

                for item in items[:limit]:
                    try:
                        # Extract the Drucksache number from the first link
                        num_link = item.select_one('a[href*="MMD"]')
                        if not num_link:
                            continue

                        href = num_link.get('href', '')
                        # Extract the number: MMD18-12345.pdf -> 18/12345
                        match = re.search(r'MMD(\d+)-(\d+)\.pdf', href)
                        if not match:
                            continue

                        legislatur, nummer = match.groups()
                        drucksache = f"{legislatur}/{nummer}"
                        pdf_url = f"https://www.landtag.nrw.de{href}" if href.startswith('/') else href

                        # Extract the title from the title link
                        title_elem = item.select_one('a.e-document-result-item__title')
                        if title_elem:
                            # get_text() already drops the SVG icon markup;
                            # just collapse whitespace
                            title = title_elem.get_text(strip=True)
                            title = re.sub(r'\s+', ' ', title).strip()
                        else:
                            title = f"Drucksache {drucksache}"

                        # Clean up common artifacts
                        title = re.sub(r'\s*\(\s*externer Link.*?\)', '', title).strip()

                        # Extract the type (Antrag, Kleine Anfrage, etc.)
                        typ_elem = item.select_one('.e-document-result-item__category')
                        typ = typ_elem.get_text(strip=True) if typ_elem else "Drucksache"

                        # Extract the date
                        time_elem = item.select_one('time')
                        datum = ""
                        if time_elem:
                            datum_text = time_elem.get_text(strip=True)
                            # Convert DD.MM.YYYY to YYYY-MM-DD
                            date_match = re.match(r'(\d{2})\.(\d{2})\.(\d{4})', datum_text)
                            if date_match:
                                d, m, y = date_match.groups()
                                datum = f"{y}-{m}-{d}"

                        # Extract the Urheber (Fraktionen): look for the
                        # paragraph containing "Urheber:"
                        urheber_text = ""
                        for p in item.select('p'):
                            if 'Urheber:' in p.get_text():
                                urheber_text = p.get_text()
                                break

                        fraktionen = []
                        if urheber_text:
                            # Extract party names, normalising "Grüne" to the
                            # canonical code and avoiding duplicates
                            for party in ['SPD', 'CDU', 'GRÜNE', 'Grüne', 'FDP', 'AfD']:
                                if party in urheber_text:
                                    norm = 'GRÜNE' if party.lower() == 'grüne' else party
                                    if norm not in fraktionen:
                                        fraktionen.append(norm)

                        doc = Drucksache(
                            drucksache=drucksache,
                            title=title,
                            fraktionen=fraktionen,
                            datum=datum,
                            link=pdf_url,
                            bundesland="NRW",
                            typ=typ,
                        )

                        # Apply the AND filter (all terms must match)
                        if self._matches_all_terms(doc, filter_terms, is_exact):
                            results.append(doc)

                    except Exception as e:
                        print(f"Error parsing item: {e}")
                        continue

            except Exception as e:
                print(f"NRW search error: {e}")

        return results

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Get document metadata by Drucksache ID (e.g. '18/8125')."""
        # Parse the Legislaturperiode and number
        match = re.match(r"(\d+)/(\d+)", drucksache)
        if not match:
            return None

        legislatur, nummer = match.groups()
        pdf_url = f"https://www.landtag.nrw.de/portal/WWW/dokumentenarchiv/Dokument/MMD{legislatur}-{nummer}.pdf"

        # Check that the PDF exists before returning metadata
        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            try:
                resp = await client.head(pdf_url)
                if resp.status_code == 200:
                    return Drucksache(
                        drucksache=drucksache,
                        title=f"Drucksache {drucksache}",
                        fraktionen=[],
                        datum="",
                        link=pdf_url,
                        bundesland="NRW",
                    )
            except httpx.HTTPError:
                pass

        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF and extract its text."""
        import fitz  # PyMuPDF; imported lazily to keep it an optional dependency

        doc = await self.get_document(drucksache)
        if not doc:
            return None

        async with httpx.AsyncClient(timeout=60, follow_redirects=True) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    return None

                # Extract text with PyMuPDF
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                text = ""
                for page in pdf:
                    text += page.get_text()
                pdf.close()

                return text
            except Exception as e:
                print(f"Error downloading {drucksache}: {e}")
                return None


class PortalaAdapter(ParlamentAdapter):
    """Adapter for portala/eUI-based parliament documentation systems.

    Used by parliaments running the proprietary "esearch" / portala framework
    (originally developed for STAR/StarFinder backends, now wrapped in a
    single-page app with Template Toolkit on the server side):

    - **LSA** (Sachsen-Anhalt) — PADOKA at ``padoka.landtag.sachsen-anhalt.de``
    - **BE** (Berlin) — PARDOK at ``pardok.parlament-berlin.de`` (future)

    The search workflow is two-stage:

    1. ``POST /portal/browse.tt.json`` with a complex JSON ``action`` body
       that contains an Elasticsearch-style query tree under
       ``search.json``. The server returns a ``report_id`` plus hit count.
    2. ``POST /portal/report.tt.html`` with ``{report_id, start, chunksize}``
       to fetch the HTML hit list. Each hit carries a Perl Data::Dumper
       block in a ``<pre>`` tag with the canonical metadata.

    The query body schema was reverse-engineered from
    https://github.com/okfde/dokukratie/blob/main/dokukratie/scrapers/portala.query.json
    (GPL-3.0 — only the structure/selectors are reused, not Python code).

    Full-text search is **not** implemented in the MVP: the adapter returns
    the most recent ``Anträge`` of the current Wahlperiode in the given date
    window, and the search query is applied as a client-side title/Urheber
    filter. The portala server-side full-text path requires LSA-specific
    ``sf`` index names that are not yet known.
    """

    bundesland = "LSA"
    name = "Landtag von Sachsen-Anhalt (PADOKA)"
    base_url = "https://padoka.landtag.sachsen-anhalt.de"
    db_id = "lsa.lissh"
    wahlperiode = 8

    # Reverse-engineered "WEV*" Perl record fields used in the hit-list dumps:
    #   WEV06.main = title
    #   WEV32.5    = relative PDF path
    #   WEV32.main = "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b> ..."
    _RE_TITLE = re.compile(r"'WEV06'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
    _RE_PDF = re.compile(r"'5'\s*=>\s*'([^']*\.pdf)'")
    _RE_DRUCKSACHE = re.compile(r"Drucksache\s*<b>(\d+/\d+)</b>")
    _RE_URHEBER_DATUM = re.compile(
        r"'WEV32'\s*=>\s*\[\s*\{[^}]*'main'\s*=>\s*[\"']Antrag\s+(.+?)\s+(\d{1,2}\.\d{1,2}\.\d{4})\s+Drucksache",
    )
    _RE_PRE_BLOCK = re.compile(r'<pre>\$VAR1 = (.*?)</pre>', re.DOTALL)

    @staticmethod
    def _decode_perl_hex(s: str) -> str:
        """Decode \\x{abcd} escape sequences from Perl Data::Dumper output."""
        return re.sub(r'\\x\{([0-9a-f]+)\}', lambda m: chr(int(m.group(1), 16)), s)

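    # Illustrative (input invented): Data::Dumper writes non-ASCII characters
    # as hex escapes, so _decode_perl_hex(r"Gr\x{fc}ner Stahl") == "Grüner Stahl".
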
    @staticmethod
    def _normalize_fraktion(urheber: str) -> list[str]:
        """Map an Urheber string to canonical fraction codes."""
        u = urheber.upper()
        out = []
        if "BÜNDNIS 90" in u or "GRÜNE" in u or "GRUENE" in u:
            out.append("GRÜNE")
        if u.startswith("CDU") or " CDU " in u or u.endswith(" CDU"):
            out.append("CDU")
        if "SPD" in u:
            out.append("SPD")
        if "FDP" in u:
            out.append("FDP")
        if "AFD" in u:
            out.append("AfD")
        if "LINKE" in u:  # also covers "DIE LINKE"
            out.append("LINKE")
        if "LANDESREGIERUNG" in u or "MINISTER" in u or "STAATSKANZLEI" in u:
            out.append("Landesregierung")
        return out

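    # Illustrative (inputs invented):
    #   _normalize_fraktion("Fraktion BÜNDNIS 90/DIE GRÜNEN")  -> ["GRÜNE"]
    #   _normalize_fraktion("Die Landesregierung")             -> ["Landesregierung"]
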
    def _build_search_body(
        self,
        wahlperiode: int,
        start_date: str,
        end_date: str,
        document_type: str = "Antrag",
    ) -> dict:
        """Build the action JSON body for browse.tt.json.

        The schema is taken 1:1 from dokukratie's portala.query.json template
        and only differs in the data source (lsa.lissh) and the variable
        substitutions.
        """
        return {
            "action": "SearchAndDisplay",
            "sources": [self.db_id],
            "report": {
                "rhl": "main",
                "rhlmode": "add",
                "format": "generic1-full",
                "mime": "html",
                "sort": "WEVSO1/D WEVSO2 WEVSO3",
            },
            "search": {
                "lines": {
                    "2": str(wahlperiode),
                    "3": document_type,
                    "4": "D",
                    "10": start_date,
                    "11": end_date,
                    "20.1": "alWEBBI",
                    "20.2": "alWEBBI",
                    "20.3": "alWEBBI",
                    "90.1": "AND",
                    "90.2": "AND",
                    "90.3": "AND",
                },
                "serverrecordname": "sr_generic1",
                "parsed": (
                    f"((/WP {wahlperiode}) AND "
                    f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) "
                    f"AND (/DART,DARTS (\"D\")) AND "
                    f"(DAT,DDAT,SDAT= {start_date} THRU {end_date})) AND TYP=DOKDBE"
                ),
                "sref": (
                    f"((/WP {wahlperiode}) AND "
                    f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) "
                    f"AND (/DART,DARTS (\"D\")) AND "
                    f"(DAT,DDAT,SDAT= {start_date} THRU {end_date})) AND TYP=DOKDBE"
                ),
                "json": [{
                    "tn": "and",
                    "num": 1,
                    "terms": [
                        {"tn": "term", "t": str(wahlperiode), "idx": 6, "l": 3,
                         "sf": "WP", "op": "eq", "num": 5},
                        {"tn": "or", "num": 3, "terms": [
                            {"tn": "or", "num": 4, "terms": [
                                {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                                 "l": 4, "sf": "ETYPF", "op": "eq", "num": 10},
                                {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                                 "l": 4, "sf": "ETYP2F", "op": "eq", "num": 11},
                                {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                                 "l": 4, "sf": "DTYPF", "op": "eq", "num": 12},
                                {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                                 "l": 4, "sf": "DTYP2F", "op": "eq", "num": 13},
                                {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                                 "l": 4, "sf": "1VTYPF", "op": "eq", "num": 14},
                            ]},
                            {"tn": "or", "num": 15, "terms": [
                                {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
                                 "sf": "DART", "op": "eq", "num": 16},
                                {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
                                 "sf": "DARTS", "op": "eq", "num": 17},
                            ]},
                        ]},
                        {"tn": "or", "num": 18, "terms": [
                            {"tn": "or", "num": 19, "terms": [
                                {"tn": "trange", "sf": "DAT", "op": "eq", "num": 20,
                                 "idx": 119, "l": 3, "p1": start_date, "t1": start_date,
                                 "p2": end_date, "t2": end_date,
                                 "t": f"{start_date} THRU {end_date}"},
                                {"tn": "trange", "sf": "DDAT", "op": "eq", "num": 21,
                                 "idx": 119, "l": 3, "p1": start_date, "t1": start_date,
                                 "p2": end_date, "t2": end_date,
                                 "t": f"{start_date} THRU {end_date}"},
                            ]},
                            {"tn": "trange", "sf": "SDAT", "op": "eq", "num": 22,
                             "idx": 119, "l": 3, "p1": start_date, "t1": start_date,
                             "p2": end_date, "t2": end_date,
                             "t": f"{start_date} THRU {end_date}"},
                        ]},
                        {"tn": "term", "t": "DOKDBE", "idx": 156, "l": 1,
                         "sf": "TYP", "op": "eq", "num": 23},
                    ],
                }],
            },
            "dataSet": "1",
        }

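    # For wahlperiode=8, document_type="Antrag" and an illustrative window
    # 2024-04-07..2026-04-07, the "parsed"/"sref" expression built above reads:
    #
    #   ((/WP 8) AND (/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF ("Antrag")) AND
    #    (/DART,DARTS ("D")) AND
    #    (DAT,DDAT,SDAT= 2024-04-07 THRU 2026-04-07)) AND TYP=DOKDBE
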
    def _parse_hit_list_html(self, html: str, query_filter: str = "") -> list[Drucksache]:
        """Extract Drucksachen from a report.tt.html response."""
        results: list[Drucksache] = []
        for pre in self._RE_PRE_BLOCK.findall(html):
            m_ds = self._RE_DRUCKSACHE.search(pre)
            if not m_ds:
                continue
            drucksache = m_ds.group(1)

            m_t = self._RE_TITLE.search(pre)
            title = self._decode_perl_hex(m_t.group(1)) if m_t else f"Drucksache {drucksache}"

            m_pdf = self._RE_PDF.search(pre)
            pdf_rel = m_pdf.group(1) if m_pdf else ""
            pdf_url = f"{self.base_url}/files/{pdf_rel}" if pdf_rel else ""

            m_w32 = self._RE_URHEBER_DATUM.search(pre)
            urheber = self._decode_perl_hex(m_w32.group(1).strip()) if m_w32 else ""
            datum_de = m_w32.group(2) if m_w32 else ""
            # DD.MM.YYYY -> ISO YYYY-MM-DD
            datum_iso = ""
            if datum_de:
                d, m, y = datum_de.split(".")
                datum_iso = f"{y}-{m.zfill(2)}-{d.zfill(2)}"

            fraktionen = self._normalize_fraktion(urheber) if urheber else []

            # Client-side title/Urheber filter (no server-side full-text search)
            if query_filter:
                hay = f"{title} {urheber}".lower()
                if not all(t in hay for t in query_filter.lower().split()):
                    continue

            results.append(Drucksache(
                drucksache=drucksache,
                title=title,
                fraktionen=fraktionen,
                datum=datum_iso,
                link=pdf_url,
                bundesland=self.bundesland,
                typ="Antrag",
            ))

        return results

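    # Abridged, partly invented sketch of one hit record this parser consumes
    # (real records are Perl Data::Dumper dumps inside <pre> tags; field roles
    # per the WEV* notes above; "xxx" stands in for the unknown PDF suffix):
    #
    #   <pre>$VAR1 = {
    #     'WEV06' => [ { 'main' => 'ICE-Halt f\x{fc}r Salzwedel dauerhaft erhalten' } ],
    #     'WEV32' => [ { 'main' => 'Antrag Fraktion DIE LINKE 12.03.2026
    #                    Drucksache <b>8/6790</b>',
    #                    '5' => 'drs/wp8/drs/d6790xxx.pdf' } ],
    #   };</pre>
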
    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search recent Anträge of the current Wahlperiode.

        ``query`` is applied as a client-side title/Urheber filter; the
        server-side query covers the last ~24 months by default.
        """
        end = date.today()
        start = end - timedelta(days=730)
        body = self._build_search_body(
            wahlperiode=self.wahlperiode,
            start_date=start.isoformat(),
            end_date=end.isoformat(),
            document_type="Antrag",
        )

        async with httpx.AsyncClient(
            timeout=30,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                # Step 1: warm up cookies via the browse page
                await client.get(f"{self.base_url}/portal/browse.tt.html")

                # Step 2: submit the search action
                resp = await client.post(
                    f"{self.base_url}/portal/browse.tt.json",
                    json=body,
                    headers={"Referer": f"{self.base_url}/portal/browse.tt.html"},
                )
                if resp.status_code != 200:
                    print(f"PADOKA search HTTP {resp.status_code}")
                    return []

                data = resp.json()
                report_id = data.get("report_id")
                if not report_id:
                    print(f"PADOKA: no report_id in response: {data}")
                    return []

                # Step 3: fetch the HTML hit list. Request a generous chunk
                # so the client-side filter still has enough candidates.
                chunksize = 100 if query else limit
                report_resp = await client.post(
                    f"{self.base_url}/portal/report.tt.html",
                    json={"report_id": report_id, "start": 0, "chunksize": chunksize},
                    headers={"Referer": f"{self.base_url}/portal/browse.tt.html"},
                )
                if report_resp.status_code != 200:
                    print(f"PADOKA report HTTP {report_resp.status_code}")
                    return []

                results = self._parse_hit_list_html(report_resp.text, query_filter=query)
                return results[:limit]

            except Exception as e:
                print(f"PADOKA search error: {e}")
                return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up a single document by re-running a broad search and
        filtering for the requested ID.

        Pragmatic MVP: a targeted single-document fetch would require a
        different action.search.json structure that has not been
        reverse-engineered yet.
        """
        results = await self.search(query="", limit=200)
        for doc in results:
            if doc.drucksache == drucksache:
                return doc
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its text."""
        import fitz  # PyMuPDF; imported lazily to keep it an optional dependency

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None

        async with httpx.AsyncClient(
            timeout=60,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    return None
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                text = ""
                for page in pdf:
                    text += page.get_text()
                pdf.close()
                return text
            except Exception as e:
                print(f"PADOKA download error for {drucksache}: {e}")
                return None


class BayernAdapter(ParlamentAdapter):
    """Adapter for the Bayerischer Landtag."""

    bundesland = "BY"
    name = "Bayerischer Landtag"
    base_url = "https://www.bayern.landtag.de"

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        # TODO: Implement Bayern search
        return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        # TODO: Implement
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        # TODO: Implement
        return None


class BWAdapter(ParlamentAdapter):
    """Adapter for the Landtag Baden-Württemberg."""

    bundesland = "BW"
    name = "Landtag Baden-Württemberg"
    base_url = "https://www.landtag-bw.de"

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        # TODO: Implement BW search
        return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        # TODO: Implement
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        # TODO: Implement
        return None


# Registry of adapters
ADAPTERS = {
    "NRW": NRWAdapter(),
    "LSA": PortalaAdapter(),
    "BY": BayernAdapter(),
    "BW": BWAdapter(),
}

def get_adapter(bundesland: str) -> Optional[ParlamentAdapter]:
    """Get the adapter for a Bundesland."""
    return ADAPTERS.get(bundesland)


async def search_all(query: str, bundesland: str = "NRW", limit: int = 20) -> list[Drucksache]:
    """Search parliament documents in a specific state."""
    adapter = get_adapter(bundesland)
    if not adapter:
        return []
    return await adapter.search(query, limit)

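# Minimal smoke-test sketch (illustrative, not shipped behaviour): run a live
# search from the command line. Assumes network access to the portals above.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        docs = await search_all("Klimaschutz", bundesland="NRW", limit=5)
        for d in docs:
            print(f"{d.drucksache}  {d.datum}  {d.typ}: {d.title}")

    asyncio.run(_demo())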