2026-03-28 22:30:24 +01:00
|
|
|
|
"""Parliament search adapters for different German states."""
|
|
|
|
|
|
|
2026-04-08 08:19:48 +02:00
|
|
|
|
import json
|
|
|
|
|
|
import logging
|
2026-03-28 22:30:24 +01:00
|
|
|
|
import httpx
|
|
|
|
|
|
import re
|
|
|
|
|
|
from abc import ABC, abstractmethod
|
|
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
|
from typing import Optional
|
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
2026-04-08 08:19:48 +02:00
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
2026-03-28 22:30:24 +01:00
|
|
|
|
|
|
|
|
|
|
@dataclass
|
|
|
|
|
|
class Drucksache:
|
|
|
|
|
|
"""A parliamentary document."""
|
|
|
|
|
|
drucksache: str # e.g. "18/8125"
|
|
|
|
|
|
title: str
|
|
|
|
|
|
fraktionen: list[str]
|
|
|
|
|
|
datum: str # ISO date
|
|
|
|
|
|
link: str # PDF URL
|
|
|
|
|
|
bundesland: str
|
|
|
|
|
|
typ: str = "Antrag" # Antrag, Anfrage, Beschlussempfehlung, etc.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ParlamentAdapter(ABC):
|
|
|
|
|
|
"""Base adapter for searching parliament documents."""
|
|
|
|
|
|
|
|
|
|
|
|
bundesland: str
|
|
|
|
|
|
name: str
|
|
|
|
|
|
|
|
|
|
|
|
@abstractmethod
|
|
|
|
|
|
async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
|
|
|
|
|
|
"""Search for documents matching query."""
|
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
@abstractmethod
|
|
|
|
|
|
async def get_document(self, drucksache: str) -> Optional[Drucksache]:
|
|
|
|
|
|
"""Get a specific document by ID."""
|
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
@abstractmethod
|
|
|
|
|
|
async def download_text(self, drucksache: str) -> Optional[str]:
|
|
|
|
|
|
"""Download and extract text from a document."""
|
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class NRWAdapter(ParlamentAdapter):
|
|
|
|
|
|
"""Adapter for NRW Landtag (opal.landtag.nrw.de)."""
|
|
|
|
|
|
|
|
|
|
|
|
bundesland = "NRW"
|
|
|
|
|
|
name = "Landtag Nordrhein-Westfalen"
|
|
|
|
|
|
base_url = "https://opal.landtag.nrw.de"
|
|
|
|
|
|
search_url = "https://opal.landtag.nrw.de/home/dokumente/dokumentensuche/parlamentsdokumente/aktuelle-dokumente.html"
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_query(self, query: str) -> tuple[str, list[str], bool]:
|
|
|
|
|
|
"""
|
|
|
|
|
|
Parse search query for AND logic and exact phrases.
|
|
|
|
|
|
Returns: (search_term_for_api, filter_terms, is_exact)
|
|
|
|
|
|
|
|
|
|
|
|
Examples:
|
|
|
|
|
|
- 'Klimaschutz Energie' -> ('Klimaschutz', ['klimaschutz', 'energie'], False)
|
|
|
|
|
|
- '"Grüner Stahl"' -> ('Grüner Stahl', ['grüner stahl'], True)
|
|
|
|
|
|
- 'Klimaschutz "erneuerbare Energie"' -> ('Klimaschutz', ['klimaschutz', 'erneuerbare energie'], False)
|
|
|
|
|
|
"""
|
|
|
|
|
|
query = query.strip()
|
|
|
|
|
|
|
|
|
|
|
|
# Check for exact phrase (entire query in quotes)
|
|
|
|
|
|
if query.startswith('"') and query.endswith('"') and query.count('"') == 2:
|
|
|
|
|
|
exact = query[1:-1].strip()
|
|
|
|
|
|
return (exact, [exact.lower()], True)
|
|
|
|
|
|
|
|
|
|
|
|
# Extract quoted phrases and regular terms
|
|
|
|
|
|
import shlex
|
|
|
|
|
|
try:
|
|
|
|
|
|
parts = shlex.split(query)
|
|
|
|
|
|
except ValueError:
|
|
|
|
|
|
# Fallback for unbalanced quotes
|
|
|
|
|
|
parts = query.split()
|
|
|
|
|
|
|
|
|
|
|
|
if not parts:
|
|
|
|
|
|
return (query, [query.lower()], False)
|
|
|
|
|
|
|
|
|
|
|
|
# Use first term for API search, all terms for filtering
|
|
|
|
|
|
filter_terms = [p.lower() for p in parts]
|
|
|
|
|
|
return (parts[0], filter_terms, False)
|
|
|
|
|
|
|
|
|
|
|
|
def _matches_all_terms(self, doc: 'Drucksache', terms: list[str], is_exact: bool) -> bool:
|
|
|
|
|
|
"""Check if document matches all search terms (AND logic)."""
|
|
|
|
|
|
searchable = f"{doc.title} {doc.drucksache} {' '.join(doc.fraktionen)} {doc.typ}".lower()
|
|
|
|
|
|
|
|
|
|
|
|
if is_exact:
|
|
|
|
|
|
# Exact phrase must appear
|
|
|
|
|
|
return terms[0] in searchable
|
|
|
|
|
|
else:
|
|
|
|
|
|
# All terms must appear (AND)
|
|
|
|
|
|
return all(term in searchable for term in terms)
|
|
|
|
|
|
|
|
|
|
|
|
async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
|
|
|
|
|
|
"""Search NRW Landtag documents via OPAL portal."""
|
|
|
|
|
|
results = []
|
|
|
|
|
|
|
|
|
|
|
|
# Parse query for AND logic
|
|
|
|
|
|
api_query, filter_terms, is_exact = self._parse_query(query)
|
|
|
|
|
|
|
|
|
|
|
|
async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
|
|
|
|
|
|
try:
|
|
|
|
|
|
# First, get the page to establish session
|
|
|
|
|
|
initial = await client.get(self.search_url)
|
|
|
|
|
|
if initial.status_code != 200:
|
|
|
|
|
|
print(f"NRW search initial request failed: {initial.status_code}")
|
|
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
|
|
# Parse for webflow token from pagination links
|
|
|
|
|
|
soup = BeautifulSoup(initial.text, 'html.parser')
|
|
|
|
|
|
|
|
|
|
|
|
# Find a pagination link to extract the webflow token
|
|
|
|
|
|
pagination_link = soup.select_one('a[href*="webflowexecution"]')
|
|
|
|
|
|
webflow_token = ""
|
|
|
|
|
|
webflow_execution = ""
|
|
|
|
|
|
|
|
|
|
|
|
if pagination_link:
|
|
|
|
|
|
href = pagination_link.get('href', '')
|
|
|
|
|
|
# Extract webflowToken and webflowexecution from URL
|
|
|
|
|
|
token_match = re.search(r'webflowToken=([^&]*)', href)
|
|
|
|
|
|
exec_match = re.search(r'(webflowexecution[^=]+)=([^&]+)', href)
|
|
|
|
|
|
if token_match:
|
|
|
|
|
|
webflow_token = token_match.group(1)
|
|
|
|
|
|
if exec_match:
|
|
|
|
|
|
webflow_execution = f"{exec_match.group(1)}={exec_match.group(2)}"
|
|
|
|
|
|
|
|
|
|
|
|
# Now perform the search with POST
|
|
|
|
|
|
# Find the form action URL with webflow token
|
|
|
|
|
|
form = soup.select_one('form#docSearchByItem')
|
|
|
|
|
|
form_action = self.search_url
|
|
|
|
|
|
if form and form.get('action'):
|
|
|
|
|
|
action = form.get('action')
|
|
|
|
|
|
if action.startswith('/'):
|
|
|
|
|
|
form_action = f"{self.base_url}{action}"
|
|
|
|
|
|
elif action.startswith('http'):
|
|
|
|
|
|
form_action = action
|
|
|
|
|
|
else:
|
|
|
|
|
|
form_action = f"{self.search_url}?{action}"
|
|
|
|
|
|
|
|
|
|
|
|
# Build form data for "Einfache Suche" (searchByItem form)
|
|
|
|
|
|
form_data = {
|
|
|
|
|
|
'_eventId_sendform': '1',
|
|
|
|
|
|
'dokNum': api_query, # This is the text search field
|
|
|
|
|
|
'formId': 'searchByItem',
|
|
|
|
|
|
'dokTyp': '', # All types
|
|
|
|
|
|
'wp': '18', # Wahlperiode 18
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
# POST request with form data to the form action URL
|
|
|
|
|
|
search_resp = await client.post(
|
|
|
|
|
|
form_action,
|
|
|
|
|
|
data=form_data,
|
|
|
|
|
|
cookies=initial.cookies,
|
|
|
|
|
|
headers={'Content-Type': 'application/x-www-form-urlencoded'}
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
if search_resp.status_code != 200:
|
|
|
|
|
|
print(f"NRW search request failed: {search_resp.status_code}")
|
|
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
|
|
# Parse results
|
|
|
|
|
|
soup = BeautifulSoup(search_resp.text, 'html.parser')
|
|
|
|
|
|
|
|
|
|
|
|
# Find all document result items (li elements containing articles)
|
|
|
|
|
|
items = soup.select('li:has(article)')
|
|
|
|
|
|
|
|
|
|
|
|
for item in items[:limit]:
|
|
|
|
|
|
try:
|
|
|
|
|
|
# Extract drucksache number from first link
|
|
|
|
|
|
num_link = item.select_one('a[href*="MMD"]')
|
|
|
|
|
|
if not num_link:
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
href = num_link.get('href', '')
|
|
|
|
|
|
# Extract number: MMD18-12345.pdf -> 18/12345
|
|
|
|
|
|
match = re.search(r'MMD(\d+)-(\d+)\.pdf', href)
|
|
|
|
|
|
if not match:
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
legislatur, nummer = match.groups()
|
|
|
|
|
|
drucksache = f"{legislatur}/{nummer}"
|
|
|
|
|
|
pdf_url = f"https://www.landtag.nrw.de{href}" if href.startswith('/') else href
|
|
|
|
|
|
|
|
|
|
|
|
# Extract title from the title link (class e-document-result-item__title)
|
|
|
|
|
|
title_elem = item.select_one('a.e-document-result-item__title')
|
|
|
|
|
|
if title_elem:
|
|
|
|
|
|
# Get text content, clean it up
|
|
|
|
|
|
title = title_elem.get_text(strip=True)
|
|
|
|
|
|
# Remove SVG icon text and clean
|
|
|
|
|
|
title = re.sub(r'\s*<svg.*', '', title)
|
|
|
|
|
|
title = re.sub(r'\s+', ' ', title).strip()
|
|
|
|
|
|
else:
|
|
|
|
|
|
# Fallback: try to find any longer text
|
|
|
|
|
|
title = f"Drucksache {drucksache}"
|
|
|
|
|
|
|
|
|
|
|
|
# Clean up common artifacts
|
|
|
|
|
|
title = re.sub(r'\s*\(\s*externer Link.*?\)', '', title).strip()
|
|
|
|
|
|
|
|
|
|
|
|
# Extract type (Antrag, Kleine Anfrage, etc.)
|
|
|
|
|
|
typ_elem = item.select_one('.e-document-result-item__category')
|
|
|
|
|
|
typ = typ_elem.get_text(strip=True) if typ_elem else "Drucksache"
|
|
|
|
|
|
|
|
|
|
|
|
# Extract date
|
|
|
|
|
|
time_elem = item.select_one('time')
|
|
|
|
|
|
datum = ""
|
|
|
|
|
|
if time_elem:
|
|
|
|
|
|
datum_text = time_elem.get_text(strip=True)
|
|
|
|
|
|
# Convert DD.MM.YYYY to YYYY-MM-DD
|
|
|
|
|
|
date_match = re.match(r'(\d{2})\.(\d{2})\.(\d{4})', datum_text)
|
|
|
|
|
|
if date_match:
|
|
|
|
|
|
d, m, y = date_match.groups()
|
|
|
|
|
|
datum = f"{y}-{m}-{d}"
|
|
|
|
|
|
|
|
|
|
|
|
# Extract Urheber (fraktionen) - look for paragraph containing "Urheber:"
|
|
|
|
|
|
urheber_text = ""
|
|
|
|
|
|
for p in item.select('p'):
|
|
|
|
|
|
if 'Urheber:' in p.get_text():
|
|
|
|
|
|
urheber_text = p.get_text()
|
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
|
|
fraktionen = []
|
|
|
|
|
|
if urheber_text:
|
|
|
|
|
|
# Extract party names (SPD, CDU, GRÜNE, FDP, AfD)
|
|
|
|
|
|
for party in ['SPD', 'CDU', 'GRÜNE', 'Grüne', 'FDP', 'AfD']:
|
|
|
|
|
|
if party in urheber_text:
|
|
|
|
|
|
fraktionen.append(party.upper() if party.lower() != 'grüne' else 'GRÜNE')
|
|
|
|
|
|
|
|
|
|
|
|
doc = Drucksache(
|
|
|
|
|
|
drucksache=drucksache,
|
|
|
|
|
|
title=title,
|
|
|
|
|
|
fraktionen=fraktionen,
|
|
|
|
|
|
datum=datum,
|
|
|
|
|
|
link=pdf_url,
|
|
|
|
|
|
bundesland="NRW",
|
|
|
|
|
|
typ=typ,
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
# Apply AND filter (all terms must match)
|
|
|
|
|
|
if self._matches_all_terms(doc, filter_terms, is_exact):
|
|
|
|
|
|
results.append(doc)
|
|
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
print(f"Error parsing item: {e}")
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
print(f"NRW search error: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
return results
|
|
|
|
|
|
|
|
|
|
|
|
async def get_document(self, drucksache: str) -> Optional[Drucksache]:
|
|
|
|
|
|
"""Get document metadata by drucksache ID (e.g. '18/8125')."""
|
|
|
|
|
|
# Parse legislatur and number
|
|
|
|
|
|
match = re.match(r"(\d+)/(\d+)", drucksache)
|
|
|
|
|
|
if not match:
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
legislatur, nummer = match.groups()
|
|
|
|
|
|
pdf_url = f"https://www.landtag.nrw.de/portal/WWW/dokumentenarchiv/Dokument/MMD{legislatur}-{nummer}.pdf"
|
|
|
|
|
|
|
|
|
|
|
|
# Try to fetch and extract basic info
|
|
|
|
|
|
async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
|
|
|
|
|
|
try:
|
|
|
|
|
|
resp = await client.head(pdf_url)
|
|
|
|
|
|
if resp.status_code == 200:
|
|
|
|
|
|
return Drucksache(
|
|
|
|
|
|
drucksache=drucksache,
|
|
|
|
|
|
title=f"Drucksache {drucksache}",
|
|
|
|
|
|
fraktionen=[],
|
|
|
|
|
|
datum="",
|
|
|
|
|
|
link=pdf_url,
|
|
|
|
|
|
bundesland="NRW",
|
|
|
|
|
|
)
|
|
|
|
|
|
except:
|
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
async def download_text(self, drucksache: str) -> Optional[str]:
|
|
|
|
|
|
"""Download PDF and extract text."""
|
|
|
|
|
|
import fitz # PyMuPDF
|
|
|
|
|
|
|
|
|
|
|
|
doc = await self.get_document(drucksache)
|
|
|
|
|
|
if not doc:
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
async with httpx.AsyncClient(timeout=60, follow_redirects=True) as client:
|
|
|
|
|
|
try:
|
|
|
|
|
|
resp = await client.get(doc.link)
|
|
|
|
|
|
if resp.status_code != 200:
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
# Extract text with PyMuPDF
|
|
|
|
|
|
pdf = fitz.open(stream=resp.content, filetype="pdf")
|
|
|
|
|
|
text = ""
|
|
|
|
|
|
for page in pdf:
|
|
|
|
|
|
text += page.get_text()
|
|
|
|
|
|
pdf.close()
|
|
|
|
|
|
|
|
|
|
|
|
return text
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
print(f"Error downloading {drucksache}: {e}")
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
class PortalaAdapter(ParlamentAdapter):
|
|
|
|
|
|
"""Adapter for portala/eUI-based parliament documentation systems.
|
|
|
|
|
|
|
|
|
|
|
|
Used by parliaments running the proprietary "esearch" / portala framework
|
|
|
|
|
|
(originally developed for STAR/StarFinder backends, now wrapped in a
|
|
|
|
|
|
Single-Page App with Template Toolkit on the server side):
|
|
|
|
|
|
|
|
|
|
|
|
- **LSA** (Sachsen-Anhalt) — PADOKA at ``padoka.landtag.sachsen-anhalt.de``
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
under ``/portal/`` (singular)
|
|
|
|
|
|
- **BE** (Berlin) — PARDOK at ``pardok.parlament-berlin.de`` under
|
|
|
|
|
|
``/portala/`` (with the trailing 'a')
|
|
|
|
|
|
|
|
|
|
|
|
Both instances share the same JSON action schema, only the base URL,
|
|
|
|
|
|
the data source ID, the application path prefix and a few minor
|
|
|
|
|
|
quirks differ — those are constructor parameters so that the same
|
|
|
|
|
|
class can serve both states (and any future portala-based parliament).
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
|
|
|
|
|
|
The search workflow is two-stage:
|
|
|
|
|
|
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
1. ``POST {base}{path}/browse.tt.json`` with a complex JSON ``action``
|
|
|
|
|
|
body that contains an Elasticsearch-style query tree under
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
``search.json``. The server returns a ``report_id`` plus hit count.
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
2. ``POST {base}{path}/report.tt.html`` with ``{report_id, start,
|
|
|
|
|
|
chunksize}`` to fetch the HTML hit list. Each hit carries a Perl
|
|
|
|
|
|
Data::Dumper block in a ``<pre>`` tag with the canonical metadata.
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
|
|
|
|
|
|
The query body schema was reverse-engineered from
|
|
|
|
|
|
https://github.com/okfde/dokukratie/blob/main/dokukratie/scrapers/portala.query.json
|
|
|
|
|
|
(GPL-3.0 — only structure/selectors are reused, not Python code).
|
|
|
|
|
|
|
|
|
|
|
|
Full-text search is **not** implemented in the MVP: the adapter
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
returns documents of the current Wahlperiode in the given date
|
|
|
|
|
|
window, and the search query is applied as a client-side
|
|
|
|
|
|
title/Urheber filter. The server-side full-text path requires
|
|
|
|
|
|
state-specific ``sf`` index names that are not yet known.
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
"""
|
|
|
|
|
|
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
def __init__(
|
|
|
|
|
|
self,
|
|
|
|
|
|
*,
|
|
|
|
|
|
bundesland: str,
|
|
|
|
|
|
name: str,
|
|
|
|
|
|
base_url: str,
|
|
|
|
|
|
db_id: str,
|
|
|
|
|
|
wahlperiode: int,
|
|
|
|
|
|
portala_path: str = "/portal",
|
|
|
|
|
|
document_type: Optional[str] = "Antrag",
|
|
|
|
|
|
pdf_url_prefix: str = "/files/",
|
|
|
|
|
|
date_window_days: int = 730,
|
|
|
|
|
|
) -> None:
|
|
|
|
|
|
"""Configure a portala/eUI adapter for one specific parliament.
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
bundesland: state code (e.g. ``"LSA"``, ``"BE"``).
|
|
|
|
|
|
name: human-readable adapter label (used in logs/UI).
|
|
|
|
|
|
base_url: ``https://...`` of the portal host without trailing slash.
|
|
|
|
|
|
db_id: data source identifier the eUI server expects in
|
|
|
|
|
|
``action.sources``, e.g. ``"lsa.lissh"`` or ``"lah.lissh"``.
|
|
|
|
|
|
wahlperiode: current legislative period — fed into the WP
|
|
|
|
|
|
term of the search tree.
|
|
|
|
|
|
portala_path: path prefix where the portala app lives. ``/portal``
|
|
|
|
|
|
for LSA, ``/portala`` for Berlin.
|
|
|
|
|
|
document_type: optional filter applied via ETYPF/DTYPF/DART
|
|
|
|
|
|
terms. ``"Antrag"`` works for LSA; for instances where
|
|
|
|
|
|
the index uses different document_type values (e.g. Berlin),
|
|
|
|
|
|
pass ``None`` to drop the document_type subtree entirely
|
|
|
|
|
|
— the user can still filter client-side by title.
|
|
|
|
|
|
pdf_url_prefix: URL fragment between ``base_url`` and the
|
|
|
|
|
|
relative PDF path returned by the server.
|
|
|
|
|
|
date_window_days: how many days back ``search()`` looks by
|
|
|
|
|
|
default.
|
|
|
|
|
|
"""
|
|
|
|
|
|
self.bundesland = bundesland
|
|
|
|
|
|
self.name = name
|
|
|
|
|
|
self.base_url = base_url.rstrip("/")
|
|
|
|
|
|
self.db_id = db_id
|
|
|
|
|
|
self.wahlperiode = wahlperiode
|
|
|
|
|
|
self.portala_path = "/" + portala_path.strip("/")
|
|
|
|
|
|
self.document_type = document_type
|
|
|
|
|
|
self.pdf_url_prefix = "/" + pdf_url_prefix.strip("/") + "/"
|
|
|
|
|
|
self.date_window_days = date_window_days
|
|
|
|
|
|
|
|
|
|
|
|
# ── LSA-style hit list (Perl Data::Dumper inside <pre> blocks) ──
|
|
|
|
|
|
# Reverse-engineered "WEV*" record fields:
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
# WEV06.main = title
|
|
|
|
|
|
# WEV32.5 = relative PDF path
|
|
|
|
|
|
# WEV32.main = "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b> ..."
|
|
|
|
|
|
_RE_TITLE = re.compile(r"'WEV06'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
|
|
|
|
|
|
_RE_PDF = re.compile(r"'5'\s*=>\s*'([^']*\.pdf)'")
|
|
|
|
|
|
_RE_DRUCKSACHE = re.compile(r"Drucksache\s*<b>(\d+/\d+)</b>")
|
|
|
|
|
|
_RE_URHEBER_DATUM = re.compile(
|
|
|
|
|
|
r"'WEV32'\s*=>\s*\[\s*\{[^}]*'main'\s*=>\s*[\"']Antrag\s+(.+?)\s+(\d{1,2}\.\d{1,2}\.\d{4})\s+Drucksache",
|
|
|
|
|
|
)
|
|
|
|
|
|
_RE_PRE_BLOCK = re.compile(r'<pre>\$VAR1 = (.*?)</pre>', re.DOTALL)
|
|
|
|
|
|
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
# ── Berlin-style hit list (production HTML cards, no Perl dump) ──
|
|
|
|
|
|
# The whole div for one record:
|
|
|
|
|
|
_RE_BE_RECORD = re.compile(
|
|
|
|
|
|
r'<div[^>]*class="[^"]*efxRecordRepeater[^"]*"[^>]*data-efx-rec="[^"]*"[^>]*>(.*?)(?=<div[^>]*efxRecordRepeater|<div[^>]*id="efxResultsEnd"|</main>|$)',
|
|
|
|
|
|
re.DOTALL,
|
|
|
|
|
|
)
|
|
|
|
|
|
_RE_BE_TITLE = re.compile(r'<h3[^>]*class="h5[^"]*"[^>]*>\s*<span>([^<]+)</span>')
|
|
|
|
|
|
_RE_BE_LINK = re.compile(r'<a[^>]*href="([^"]+\.pdf)"[^>]*>')
|
|
|
|
|
|
# The metadata h6 looks like:
|
|
|
|
|
|
# <span class="h6">Antrag (Eilantrag) <a ...>Drucksache 19/3104</a> S. 1 bis 24 vom 31.03.2026</span>
|
|
|
|
|
|
_RE_BE_DRUCKSACHE = re.compile(r'Drucksache\s+(\d+/\d+)')
|
|
|
|
|
|
_RE_BE_DATUM = re.compile(r'vom\s+(\d{1,2}\.\d{1,2}\.\d{4})')
|
|
|
|
|
|
_RE_BE_DOCTYPE = re.compile(r'<span class="h6">\s*([^<&]+?)(?: |<)')
|
|
|
|
|
|
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _decode_perl_hex(s: str) -> str:
|
|
|
|
|
|
"""Decode \\x{abcd} escape sequences from Perl Data::Dumper output."""
|
|
|
|
|
|
return re.sub(r'\\x\{([0-9a-f]+)\}', lambda m: chr(int(m.group(1), 16)), s)
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _normalize_fraktion(urheber: str) -> list[str]:
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
"""Map Urheber-String to canonical fraction codes.
|
|
|
|
|
|
|
|
|
|
|
|
Uses regex word boundaries instead of plain substring matching so
|
|
|
|
|
|
that comma-separated lists ("CDU, SPD") and the embedded "DIE
|
|
|
|
|
|
LINKE" are matched reliably.
|
|
|
|
|
|
"""
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
u = urheber.upper()
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
out: list[str] = []
|
|
|
|
|
|
|
|
|
|
|
|
def has(pattern: str) -> bool:
|
|
|
|
|
|
return re.search(pattern, u) is not None
|
|
|
|
|
|
|
|
|
|
|
|
if has(r"\bBÜNDNIS\s*90\b") or has(r"\bGR(?:Ü|UE)NE\b"):
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
out.append("GRÜNE")
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
if has(r"\bCDU\b"):
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
out.append("CDU")
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
if has(r"\bSPD\b"):
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
out.append("SPD")
|
Add pytest suite + fix two regex bugs uncovered by it (#46)
Erste Tests für die Codebase. 77 Tests, 0.08s Laufzeit, decken die
drei Bug-Klassen aus der April-2026-Adapter-Session ab plus haben
schon zwei weitere Bugs in Production-Code aufgedeckt.
## Setup
- requirements-dev.txt mit pytest + pytest-asyncio
- pytest.ini mit asyncio_mode=auto
- tests/conftest.py stubbt fitz/bs4/openai/pydantic_settings, damit
die Suite ohne den vollen prod-requirements-Satz läuft (pure unit
tests, kein PDF-Parsing, kein HTTP)
## Tests
- tests/test_parlamente.py (33 Tests)
* PortalaAdapter._parse_hit_list_cards: doctype/doctype_full
NameError-Regression aus 1cb030a, plus Title/Drucksache/Fraktion-
/Datum/PDF-Extraktion gegen ein BE-Card-Fixture
* PortalaAdapter._parse_hit_list_dump: gegen ein LSA-Perl-Dump-
Fixture inkl. Hex-Escape-Decoding (\x{fc} → ü)
* PortalaAdapter._parse_hit_list_html: Auto-Detection zwischen
Card- und Dump-Format
* PortalaAdapter._normalize_fraktion: kanonische Fraktion-Codes
inkl. F.D.P.-mit-Punkten, BÜNDNIS 90, DIE LINKE, BSW
* ParLDokAdapter._hit_to_drucksache: JSON-Hit → Drucksache
Mapping inkl. /navpanes-Stripping, MdL-mit-Partei-in-Klammern,
Landesregierung-Detection
* ParLDokAdapter._fulltext_id: bundle.js-mirroring (deferred,
aber dokumentiert)
* ADAPTERS-Registry-Sanity
- tests/test_embeddings.py (11 Tests)
* _chunk_source_label: Programm-Name + Seite (Halluzinations-
Bug-Regression aus 1b5fd96)
* format_quotes_for_prompt: jeder Chunk muss Programm-Name
enthalten, strict-citation-Hinweis muss im Output sein,
keine NRW-Halluzinationen für MV/BE-Chunk-Sets
- tests/test_wahlprogramme.py (14 Tests)
* Registry-Struktur (jahr int, seiten int, .pdf-Endung)
* File-Existenz: jede registrierte PDF muss in
static/referenzen/ liegen — würde Tippfehler in den 22
indexierten Programmen sofort fangen
* embeddings.PROGRAMME-Konsistenz-Cross-Check
- tests/test_bundeslaender.py (15 Tests)
* Sanity über 16-State-Registry
* #48-Klassifikations-Regression: TH=ParlDok, HB=StarWeb,
SN=Eigensystem
* Wahltermine plausibel (zwischen 2026 und 2035)
- tests/test_analyzer.py (4 Tests)
* Markdown-Codeblock-Stripping aus dem JSON-Retry-Loop
## Bug-Funde während der Test-Schreibphase
Zwei Production-Bugs in den _normalize_fraktion-Helfern wurden
durch die neuen Tests sofort aufgedeckt und im selben Commit gefixt:
1. PortalaAdapter._normalize_fraktion matched "F.D.P." (mit Punkten,
wie historische SH/HB-Drucksachen) nicht — Regex \bFDP\b ist zu
strikt. Fix: \bF\.?\s*D\.?\s*P\.?\b analog zu ParLDokAdapter.
2. ParLDokAdapter._normalize_fraktion (auch PortalaAdapter) matched
"Ministerium der Finanzen" nicht als Landesregierung, weil
\bMINISTER\b die Wortgrenze auch nach MINISTER verlangt — bei
MINISTERIUM steht aber IUM danach, keine Wortgrenze. Fix:
\bMINISTER ohne abschließendes \b.
Beide Bugs hätten Fraktion-Felder bei Drucksachen der Bremischen
Bürgerschaft (FDP-Listen) und bei Landesregierungs-Drucksachen
in MV/LSA fälschlich leer gelassen — exakt der "fraktionen=[]"-
Befund aus dem MV-Smoke-Test in #4.
Phase 0 aus Roadmap-Issue #49.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 23:26:06 +02:00
|
|
|
|
# F.D.P. (with dots, historical SH/HB-style) and FDP (modern) — same
|
|
|
|
|
|
# flexible pattern as ParLDokAdapter so the test suite stays consistent.
|
|
|
|
|
|
if has(r"\bF\.?\s*D\.?\s*P\.?\b"):
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
out.append("FDP")
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
if has(r"\bAFD\b"):
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
out.append("AfD")
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
if has(r"\bLINKE\b"):
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
out.append("LINKE")
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
if has(r"\bBSW\b"):
|
|
|
|
|
|
out.append("BSW")
|
Add pytest suite + fix two regex bugs uncovered by it (#46)
Erste Tests für die Codebase. 77 Tests, 0.08s Laufzeit, decken die
drei Bug-Klassen aus der April-2026-Adapter-Session ab plus haben
schon zwei weitere Bugs in Production-Code aufgedeckt.
## Setup
- requirements-dev.txt mit pytest + pytest-asyncio
- pytest.ini mit asyncio_mode=auto
- tests/conftest.py stubbt fitz/bs4/openai/pydantic_settings, damit
die Suite ohne den vollen prod-requirements-Satz läuft (pure unit
tests, kein PDF-Parsing, kein HTTP)
## Tests
- tests/test_parlamente.py (33 Tests)
* PortalaAdapter._parse_hit_list_cards: doctype/doctype_full
NameError-Regression aus 1cb030a, plus Title/Drucksache/Fraktion-
/Datum/PDF-Extraktion gegen ein BE-Card-Fixture
* PortalaAdapter._parse_hit_list_dump: gegen ein LSA-Perl-Dump-
Fixture inkl. Hex-Escape-Decoding (\x{fc} → ü)
* PortalaAdapter._parse_hit_list_html: Auto-Detection zwischen
Card- und Dump-Format
* PortalaAdapter._normalize_fraktion: kanonische Fraktion-Codes
inkl. F.D.P.-mit-Punkten, BÜNDNIS 90, DIE LINKE, BSW
* ParLDokAdapter._hit_to_drucksache: JSON-Hit → Drucksache
Mapping inkl. /navpanes-Stripping, MdL-mit-Partei-in-Klammern,
Landesregierung-Detection
* ParLDokAdapter._fulltext_id: bundle.js-mirroring (deferred,
aber dokumentiert)
* ADAPTERS-Registry-Sanity
- tests/test_embeddings.py (11 Tests)
* _chunk_source_label: Programm-Name + Seite (Halluzinations-
Bug-Regression aus 1b5fd96)
* format_quotes_for_prompt: jeder Chunk muss Programm-Name
enthalten, strict-citation-Hinweis muss im Output sein,
keine NRW-Halluzinationen für MV/BE-Chunk-Sets
- tests/test_wahlprogramme.py (14 Tests)
* Registry-Struktur (jahr int, seiten int, .pdf-Endung)
* File-Existenz: jede registrierte PDF muss in
static/referenzen/ liegen — würde Tippfehler in den 22
indexierten Programmen sofort fangen
* embeddings.PROGRAMME-Konsistenz-Cross-Check
- tests/test_bundeslaender.py (15 Tests)
* Sanity über 16-State-Registry
* #48-Klassifikations-Regression: TH=ParlDok, HB=StarWeb,
SN=Eigensystem
* Wahltermine plausibel (zwischen 2026 und 2035)
- tests/test_analyzer.py (4 Tests)
* Markdown-Codeblock-Stripping aus dem JSON-Retry-Loop
## Bug-Funde während der Test-Schreibphase
Zwei Production-Bugs in den _normalize_fraktion-Helfern wurden
durch die neuen Tests sofort aufgedeckt und im selben Commit gefixt:
1. PortalaAdapter._normalize_fraktion matched "F.D.P." (mit Punkten,
wie historische SH/HB-Drucksachen) nicht — Regex \bFDP\b ist zu
strikt. Fix: \bF\.?\s*D\.?\s*P\.?\b analog zu ParLDokAdapter.
2. ParLDokAdapter._normalize_fraktion (auch PortalaAdapter) matched
"Ministerium der Finanzen" nicht als Landesregierung, weil
\bMINISTER\b die Wortgrenze auch nach MINISTER verlangt — bei
MINISTERIUM steht aber IUM danach, keine Wortgrenze. Fix:
\bMINISTER ohne abschließendes \b.
Beide Bugs hätten Fraktion-Felder bei Drucksachen der Bremischen
Bürgerschaft (FDP-Listen) und bei Landesregierungs-Drucksachen
in MV/LSA fälschlich leer gelassen — exakt der "fraktionen=[]"-
Befund aus dem MV-Smoke-Test in #4.
Phase 0 aus Roadmap-Issue #49.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 23:26:06 +02:00
|
|
|
|
# MINISTERIUM/MINISTER beide treffen — \bMINISTER ohne abschließende
|
|
|
|
|
|
# Wortgrenze, damit "Ministerium der Finanzen" mit erfasst wird.
|
|
|
|
|
|
if has(r"LANDESREGIERUNG|SENAT VON BERLIN|REGIERENDE[RN]?\s+BÜRGERMEISTER|\bMINISTER|STAATSKANZLEI|MINISTERPRÄSIDENT"):
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
out.append("Landesregierung")
|
|
|
|
|
|
return out
|
|
|
|
|
|
|
|
|
|
|
|
def _build_search_body(
|
|
|
|
|
|
self,
|
|
|
|
|
|
wahlperiode: int,
|
|
|
|
|
|
start_date: str,
|
|
|
|
|
|
end_date: str,
|
|
|
|
|
|
) -> dict:
|
|
|
|
|
|
"""Build the action JSON body for browse.tt.json.
|
|
|
|
|
|
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
The schema is taken from dokukratie's portala.query.json template
|
|
|
|
|
|
and only differs in the data source and the variable substitutions.
|
|
|
|
|
|
When ``self.document_type`` is None, the ETYPF/DTYPF/DART subtree
|
|
|
|
|
|
is dropped — useful for parliaments whose ETYPF index uses
|
|
|
|
|
|
different value strings than ``"Antrag"``.
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
"""
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
document_type = self.document_type
|
|
|
|
|
|
date_range_text = f"{start_date} THRU {end_date}"
|
|
|
|
|
|
date_term = lambda sf, num: { # noqa: E731 — local helper
|
|
|
|
|
|
"tn": "trange", "sf": sf, "op": "eq", "num": num,
|
|
|
|
|
|
"idx": 119, "l": 3,
|
|
|
|
|
|
"p1": start_date, "t1": start_date,
|
|
|
|
|
|
"p2": end_date, "t2": end_date,
|
|
|
|
|
|
"t": date_range_text,
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
# Build the search.lines (form-state mirror) and the json tree
|
|
|
|
|
|
lines: dict = {
|
|
|
|
|
|
"2": str(wahlperiode),
|
|
|
|
|
|
"10": start_date,
|
|
|
|
|
|
"11": end_date,
|
|
|
|
|
|
"20.1": "alWEBBI",
|
|
|
|
|
|
"20.2": "alWEBBI",
|
|
|
|
|
|
"20.3": "alWEBBI",
|
|
|
|
|
|
"90.1": "AND",
|
|
|
|
|
|
"90.2": "AND",
|
|
|
|
|
|
"90.3": "AND",
|
|
|
|
|
|
}
|
|
|
|
|
|
if document_type is not None:
|
|
|
|
|
|
lines["3"] = document_type
|
|
|
|
|
|
lines["4"] = "D"
|
|
|
|
|
|
|
|
|
|
|
|
# Top-level AND tree
|
|
|
|
|
|
top_terms: list = [
|
|
|
|
|
|
{"tn": "term", "t": str(wahlperiode), "idx": 6, "l": 3,
|
|
|
|
|
|
"sf": "WP", "op": "eq", "num": 5},
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
if document_type is not None:
|
|
|
|
|
|
top_terms.append({"tn": "or", "num": 3, "terms": [
|
|
|
|
|
|
{"tn": "or", "num": 4, "terms": [
|
|
|
|
|
|
{"tn": "term", "t": f'"{document_type}"', "idx": 50,
|
|
|
|
|
|
"l": 4, "sf": "ETYPF", "op": "eq", "num": 10},
|
|
|
|
|
|
{"tn": "term", "t": f'"{document_type}"', "idx": 50,
|
|
|
|
|
|
"l": 4, "sf": "ETYP2F", "op": "eq", "num": 11},
|
|
|
|
|
|
{"tn": "term", "t": f'"{document_type}"', "idx": 50,
|
|
|
|
|
|
"l": 4, "sf": "DTYPF", "op": "eq", "num": 12},
|
|
|
|
|
|
{"tn": "term", "t": f'"{document_type}"', "idx": 50,
|
|
|
|
|
|
"l": 4, "sf": "DTYP2F", "op": "eq", "num": 13},
|
|
|
|
|
|
{"tn": "term", "t": f'"{document_type}"', "idx": 50,
|
|
|
|
|
|
"l": 4, "sf": "1VTYPF", "op": "eq", "num": 14},
|
|
|
|
|
|
]},
|
|
|
|
|
|
{"tn": "or", "num": 15, "terms": [
|
|
|
|
|
|
{"tn": "term", "t": '"D"', "idx": 93, "l": 4,
|
|
|
|
|
|
"sf": "DART", "op": "eq", "num": 16},
|
|
|
|
|
|
{"tn": "term", "t": '"D"', "idx": 93, "l": 4,
|
|
|
|
|
|
"sf": "DARTS", "op": "eq", "num": 17},
|
|
|
|
|
|
]},
|
|
|
|
|
|
]})
|
|
|
|
|
|
|
|
|
|
|
|
top_terms.append({"tn": "or", "num": 18, "terms": [
|
|
|
|
|
|
{"tn": "or", "num": 19, "terms": [
|
|
|
|
|
|
date_term("DAT", 20),
|
|
|
|
|
|
date_term("DDAT", 21),
|
|
|
|
|
|
]},
|
|
|
|
|
|
date_term("SDAT", 22),
|
|
|
|
|
|
]})
|
|
|
|
|
|
top_terms.append({"tn": "term", "t": "DOKDBE", "idx": 156, "l": 1,
|
|
|
|
|
|
"sf": "TYP", "op": "eq", "num": 23})
|
|
|
|
|
|
|
|
|
|
|
|
# Mirror the same shape into the parsed/sref display strings
|
|
|
|
|
|
if document_type is not None:
|
|
|
|
|
|
parsed = (
|
|
|
|
|
|
f"((/WP {wahlperiode}) AND "
|
|
|
|
|
|
f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) "
|
|
|
|
|
|
f"AND (/DART,DARTS (\"D\")) AND "
|
|
|
|
|
|
f"(DAT,DDAT,SDAT= {date_range_text})) AND TYP=DOKDBE"
|
|
|
|
|
|
)
|
|
|
|
|
|
else:
|
|
|
|
|
|
parsed = (
|
|
|
|
|
|
f"((/WP {wahlperiode}) AND "
|
|
|
|
|
|
f"(DAT,DDAT,SDAT= {date_range_text})) AND TYP=DOKDBE"
|
|
|
|
|
|
)
|
|
|
|
|
|
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
return {
|
|
|
|
|
|
"action": "SearchAndDisplay",
|
|
|
|
|
|
"sources": [self.db_id],
|
|
|
|
|
|
"report": {
|
|
|
|
|
|
"rhl": "main",
|
|
|
|
|
|
"rhlmode": "add",
|
|
|
|
|
|
"format": "generic1-full",
|
|
|
|
|
|
"mime": "html",
|
|
|
|
|
|
"sort": "WEVSO1/D WEVSO2 WEVSO3",
|
|
|
|
|
|
},
|
|
|
|
|
|
"search": {
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
"lines": lines,
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
"serverrecordname": "sr_generic1",
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
"parsed": parsed,
|
|
|
|
|
|
"sref": parsed,
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
"json": [{
|
|
|
|
|
|
"tn": "and",
|
|
|
|
|
|
"num": 1,
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
"terms": top_terms,
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
}],
|
|
|
|
|
|
},
|
|
|
|
|
|
"dataSet": "1",
|
|
|
|
|
|
}
|
|
|
|
|
|
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _datum_de_to_iso(datum_de: str) -> str:
|
|
|
|
|
|
"""Convert DD.MM.YYYY → YYYY-MM-DD; return '' for empty input."""
|
|
|
|
|
|
if not datum_de:
|
|
|
|
|
|
return ""
|
|
|
|
|
|
d, m, y = datum_de.split(".")
|
|
|
|
|
|
return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
|
|
|
|
|
|
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
def _parse_hit_list_html(self, html: str, query_filter: str = "") -> list[Drucksache]:
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
"""Extract Drucksachen from a report.tt.html response.
|
|
|
|
|
|
|
|
|
|
|
|
Two formats are supported and auto-detected:
|
|
|
|
|
|
|
|
|
|
|
|
- **LSA-style:** the records are embedded as Perl Data::Dumper
|
|
|
|
|
|
dumps inside ``<pre>$VAR1 = …</pre>`` blocks. WEV06 → title,
|
|
|
|
|
|
WEV32 → metadata + PDF path. Used by Sachsen-Anhalt's PADOKA
|
|
|
|
|
|
template.
|
|
|
|
|
|
- **Berlin-style:** standard production HTML cards with
|
|
|
|
|
|
``efxRecordRepeater`` divs. Title in an ``<h3 class="h5">``,
|
|
|
|
|
|
metadata + PDF link in an ``<span class="h6">``. Used by
|
|
|
|
|
|
Berlin's PARDOK template.
|
|
|
|
|
|
"""
|
|
|
|
|
|
if self._RE_PRE_BLOCK.search(html):
|
|
|
|
|
|
return self._parse_hit_list_dump(html, query_filter)
|
|
|
|
|
|
return self._parse_hit_list_cards(html, query_filter)
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_hit_list_dump(self, html: str, query_filter: str) -> list[Drucksache]:
|
|
|
|
|
|
"""Parse LSA-style ``<pre>$VAR1 = …</pre>`` Perl-dump records."""
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
results: list[Drucksache] = []
|
|
|
|
|
|
for pre in self._RE_PRE_BLOCK.findall(html):
|
|
|
|
|
|
m_ds = self._RE_DRUCKSACHE.search(pre)
|
|
|
|
|
|
if not m_ds:
|
|
|
|
|
|
continue
|
|
|
|
|
|
drucksache = m_ds.group(1)
|
|
|
|
|
|
|
|
|
|
|
|
m_t = self._RE_TITLE.search(pre)
|
|
|
|
|
|
title = self._decode_perl_hex(m_t.group(1)) if m_t else f"Drucksache {drucksache}"
|
|
|
|
|
|
|
|
|
|
|
|
m_pdf = self._RE_PDF.search(pre)
|
|
|
|
|
|
pdf_rel = m_pdf.group(1) if m_pdf else ""
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
pdf_url = f"{self.base_url}{self.pdf_url_prefix}{pdf_rel}" if pdf_rel else ""
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
|
|
|
|
|
|
m_w32 = self._RE_URHEBER_DATUM.search(pre)
|
|
|
|
|
|
urheber = self._decode_perl_hex(m_w32.group(1).strip()) if m_w32 else ""
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
datum_iso = self._datum_de_to_iso(m_w32.group(2) if m_w32 else "")
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
fraktionen = self._normalize_fraktion(urheber) if urheber else []
|
|
|
|
|
|
|
|
|
|
|
|
doc = Drucksache(
|
|
|
|
|
|
drucksache=drucksache,
|
|
|
|
|
|
title=title,
|
|
|
|
|
|
fraktionen=fraktionen,
|
|
|
|
|
|
datum=datum_iso,
|
|
|
|
|
|
link=pdf_url,
|
|
|
|
|
|
bundesland=self.bundesland,
|
|
|
|
|
|
typ="Antrag",
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
if query_filter:
|
|
|
|
|
|
hay = f"{title} {urheber}".lower()
|
|
|
|
|
|
if not all(t in hay for t in query_filter.lower().split()):
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
results.append(doc)
|
|
|
|
|
|
|
|
|
|
|
|
return results
|
|
|
|
|
|
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
def _parse_hit_list_cards(self, html: str, query_filter: str) -> list[Drucksache]:
|
|
|
|
|
|
"""Parse Berlin-style ``efxRecordRepeater`` HTML-card records.
|
|
|
|
|
|
|
|
|
|
|
|
Each card contains an ``<h3>`` title, a metadata ``<span class="h6">``
|
|
|
|
|
|
with the document type, the Drucksachen-Nummer, and the date,
|
|
|
|
|
|
plus a direct ``<a href="…pdf">`` link to the PDF on the same host.
|
|
|
|
|
|
"""
|
|
|
|
|
|
results: list[Drucksache] = []
|
|
|
|
|
|
|
|
|
|
|
|
# Split the HTML on every record-div opener — easier than balancing
|
|
|
|
|
|
# divs with regex.
|
|
|
|
|
|
chunks = html.split('class="record')
|
|
|
|
|
|
# First chunk is the prelude, skip it
|
|
|
|
|
|
for chunk in chunks[1:]:
|
|
|
|
|
|
# Each chunk now starts at the record class attribute
|
|
|
|
|
|
m_t = self._RE_BE_TITLE.search(chunk)
|
|
|
|
|
|
title = m_t.group(1).strip() if m_t else "Ohne Titel"
|
|
|
|
|
|
|
|
|
|
|
|
m_ds = self._RE_BE_DRUCKSACHE.search(chunk)
|
|
|
|
|
|
if not m_ds:
|
|
|
|
|
|
continue
|
|
|
|
|
|
drucksache = m_ds.group(1)
|
|
|
|
|
|
|
|
|
|
|
|
m_pdf = self._RE_BE_LINK.search(chunk)
|
|
|
|
|
|
pdf_url = ""
|
|
|
|
|
|
if m_pdf:
|
|
|
|
|
|
href = m_pdf.group(1)
|
|
|
|
|
|
if href.startswith("http://") or href.startswith("https://"):
|
|
|
|
|
|
pdf_url = href
|
|
|
|
|
|
elif href.startswith("/"):
|
|
|
|
|
|
pdf_url = f"{self.base_url}{href}"
|
|
|
|
|
|
else:
|
|
|
|
|
|
pdf_url = f"{self.base_url}{self.pdf_url_prefix}{href}"
|
|
|
|
|
|
|
|
|
|
|
|
m_dat = self._RE_BE_DATUM.search(chunk)
|
|
|
|
|
|
datum_iso = self._datum_de_to_iso(m_dat.group(1) if m_dat else "")
|
|
|
|
|
|
|
|
|
|
|
|
m_doc = self._RE_BE_DOCTYPE.search(chunk)
|
|
|
|
|
|
doctype_full = m_doc.group(1).strip() if m_doc else "Drucksache"
|
|
|
|
|
|
|
|
|
|
|
|
# Berlin often packs the originator(s) into the same h6 line:
|
|
|
|
|
|
# "Antrag CDU, SPD" → fraktionen = ["CDU","SPD"], typ = "Antrag"
|
|
|
|
|
|
# Senat-Vorlagen carry no fraction, only "Vorlage zur …".
|
|
|
|
|
|
fraktionen = self._normalize_fraktion(doctype_full)
|
|
|
|
|
|
# Strip the fraction names back out of the typ string so the UI
|
|
|
|
|
|
# shows a clean "Antrag" / "Vorlage …" label.
|
|
|
|
|
|
typ = doctype_full
|
|
|
|
|
|
if fraktionen:
|
|
|
|
|
|
# Cut at the first occurrence of any party name
|
|
|
|
|
|
cuts = [typ.upper().find(f.upper()) for f in fraktionen]
|
|
|
|
|
|
cuts = [c for c in cuts if c >= 0]
|
|
|
|
|
|
if cuts:
|
|
|
|
|
|
typ = typ[: min(cuts)].rstrip(" ,")
|
|
|
|
|
|
|
|
|
|
|
|
doc = Drucksache(
|
|
|
|
|
|
drucksache=drucksache,
|
|
|
|
|
|
title=title,
|
|
|
|
|
|
fraktionen=fraktionen,
|
|
|
|
|
|
datum=datum_iso,
|
|
|
|
|
|
link=pdf_url,
|
|
|
|
|
|
bundesland=self.bundesland,
|
|
|
|
|
|
typ=typ,
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
if query_filter:
|
2026-04-08 07:50:44 +02:00
|
|
|
|
hay = f"{title} {doctype_full}".lower()
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
if not all(t in hay for t in query_filter.lower().split()):
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
results.append(doc)
|
|
|
|
|
|
|
|
|
|
|
|
return results
|
|
|
|
|
|
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
"""Search recent documents of the current Wahlperiode.
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
|
|
|
|
|
|
``query`` is applied as a client-side title/Urheber filter; the
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
server-side query covers the configured ``date_window_days``
|
|
|
|
|
|
(default 24 months).
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
"""
|
|
|
|
|
|
from datetime import date, timedelta
|
|
|
|
|
|
|
|
|
|
|
|
end = date.today()
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
start = end - timedelta(days=self.date_window_days)
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
body = self._build_search_body(
|
|
|
|
|
|
wahlperiode=self.wahlperiode,
|
|
|
|
|
|
start_date=start.isoformat(),
|
|
|
|
|
|
end_date=end.isoformat(),
|
|
|
|
|
|
)
|
|
|
|
|
|
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
browse_html = f"{self.base_url}{self.portala_path}/browse.tt.html"
|
|
|
|
|
|
browse_json = f"{self.base_url}{self.portala_path}/browse.tt.json"
|
|
|
|
|
|
report_html = f"{self.base_url}{self.portala_path}/report.tt.html"
|
|
|
|
|
|
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
async with httpx.AsyncClient(
|
2026-04-08 13:58:34 +02:00
|
|
|
|
# Bumped from 30s for #13 quick-win: chunksize=500 against the
|
|
|
|
|
|
# LSA report.tt.html endpoint occasionally takes 30+ seconds.
|
|
|
|
|
|
timeout=60,
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
follow_redirects=True,
|
|
|
|
|
|
headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
|
|
|
|
|
|
) as client:
|
|
|
|
|
|
try:
|
|
|
|
|
|
# Step 1: warm up cookies via the browse page
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
await client.get(browse_html)
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
|
|
|
|
|
|
# Step 2: submit the search action
|
|
|
|
|
|
resp = await client.post(
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
browse_json,
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
json=body,
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
headers={"Referer": browse_html},
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
)
|
|
|
|
|
|
if resp.status_code != 200:
|
2026-04-08 08:19:48 +02:00
|
|
|
|
logger.error("%s search HTTP %s", self.bundesland, resp.status_code)
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
|
|
data = resp.json()
|
|
|
|
|
|
report_id = data.get("report_id")
|
|
|
|
|
|
if not report_id:
|
2026-04-08 08:19:48 +02:00
|
|
|
|
logger.error("%s: no report_id in response: %s", self.bundesland, data)
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
|
|
# Step 3: fetch the HTML hit list
|
2026-04-08 13:58:34 +02:00
|
|
|
|
# Take a generous chunk so the client-side title filter
|
|
|
|
|
|
# still has enough material to work with. Quick-win for #13
|
|
|
|
|
|
# until the eUI sf-Index for real server-side fulltext is
|
|
|
|
|
|
# reverse-engineered: bump the unfiltered chunk floor and
|
|
|
|
|
|
# the query-filtered chunk ceiling.
|
|
|
|
|
|
chunksize = max(limit * 10, 500) if query else max(limit * 2, 100)
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
report_resp = await client.post(
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
report_html,
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
json={"report_id": report_id, "start": 0, "chunksize": chunksize},
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
headers={"Referer": browse_html},
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
)
|
|
|
|
|
|
if report_resp.status_code != 200:
|
2026-04-08 08:19:48 +02:00
|
|
|
|
logger.error("%s report HTTP %s", self.bundesland, report_resp.status_code)
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
|
|
results = self._parse_hit_list_html(report_resp.text, query_filter=query)
|
|
|
|
|
|
return results[:limit]
|
|
|
|
|
|
|
2026-04-08 08:19:48 +02:00
|
|
|
|
except Exception:
|
|
|
|
|
|
logger.exception("%s search error", self.bundesland)
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
|
|
async def get_document(self, drucksache: str) -> Optional[Drucksache]:
|
|
|
|
|
|
"""Look up a single document by ID via the search endpoint with a
|
|
|
|
|
|
document_number filter."""
|
|
|
|
|
|
# Pragmatic MVP: do a broad search and filter for the requested ID.
|
|
|
|
|
|
# A targeted single-document fetch would require a different
|
|
|
|
|
|
# action.search.json structure that we have not reverse-engineered yet.
|
|
|
|
|
|
results = await self.search(query="", limit=200)
|
|
|
|
|
|
for doc in results:
|
|
|
|
|
|
if doc.drucksache == drucksache:
|
|
|
|
|
|
return doc
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
async def download_text(self, drucksache: str) -> Optional[str]:
|
|
|
|
|
|
"""Download the PDF for a Drucksache and extract its text."""
|
|
|
|
|
|
import fitz # PyMuPDF
|
|
|
|
|
|
|
|
|
|
|
|
doc = await self.get_document(drucksache)
|
|
|
|
|
|
if not doc or not doc.link:
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
async with httpx.AsyncClient(
|
|
|
|
|
|
timeout=60,
|
|
|
|
|
|
follow_redirects=True,
|
|
|
|
|
|
headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
|
|
|
|
|
|
) as client:
|
|
|
|
|
|
try:
|
|
|
|
|
|
resp = await client.get(doc.link)
|
|
|
|
|
|
if resp.status_code != 200:
|
|
|
|
|
|
return None
|
|
|
|
|
|
pdf = fitz.open(stream=resp.content, filetype="pdf")
|
|
|
|
|
|
text = ""
|
|
|
|
|
|
for page in pdf:
|
|
|
|
|
|
text += page.get_text()
|
|
|
|
|
|
pdf.close()
|
|
|
|
|
|
return text
|
2026-04-08 08:19:48 +02:00
|
|
|
|
except Exception:
|
|
|
|
|
|
logger.exception("%s download error for %s", self.bundesland, drucksache)
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ParLDokAdapter(ParlamentAdapter):
|
|
|
|
|
|
"""Adapter for ParlDok 8.x parliament documentation systems (J3S GmbH).
|
|
|
|
|
|
|
|
|
|
|
|
ParlDok is a proprietary parliament documentation product by J3S GmbH
|
|
|
|
|
|
(https://www.j3s.de). Different from the portala/eUI framework used by
|
|
|
|
|
|
LSA/BE: ParlDok 8.x is a single-page app whose backend is a JSON API
|
|
|
|
|
|
rooted at ``{base_url}{prefix}/Fulltext/...``. The legacy ParLDok 5.x
|
|
|
|
|
|
HTML POST form (``parldok/formalkriterien``) used by dokukratie's MV
|
|
|
|
|
|
YAML scraper has been deprecated by the LandtagMV upgrade to 8.3.5.
|
|
|
|
|
|
|
|
|
|
|
|
Confirmed instances using this engine (April 2026):
|
|
|
|
|
|
|
|
|
|
|
|
- **MV** (Mecklenburg-Vorpommern) — ``dokumentation.landtag-mv.de/parldok``
|
|
|
|
|
|
- HH, SN, TH all advertise ParlDok in dokukratie but their actual
|
|
|
|
|
|
versions/themes have not been verified yet.
|
|
|
|
|
|
|
|
|
|
|
|
Search workflow:
|
|
|
|
|
|
|
|
|
|
|
|
1. ``GET {base_url}{prefix}/`` to obtain the session cookie. The
|
|
|
|
|
|
backend rejects POSTs without it.
|
|
|
|
|
|
2. ``POST {base_url}{prefix}/Fulltext/Search`` with form-encoded
|
|
|
|
|
|
``data=<json>`` payload. The JSON carries a ``tags`` array of
|
|
|
|
|
|
facet selections; each tag is ``{"type": <facet_type_int>,
|
|
|
|
|
|
"id": <facet_value>}``. Reverse-engineered facet type constants
|
|
|
|
|
|
from the bundle.js (``pd.facet_*``):
|
|
|
|
|
|
|
|
|
|
|
|
- ``facet_fraction = 2``
|
|
|
|
|
|
- ``facet_kind = 7`` (Drucksache, Plenarprotokoll, …)
|
|
|
|
|
|
- ``facet_type = 8`` (Antrag, Gesetzentwurf, Kleine Anfrage, …)
|
|
|
|
|
|
- ``facet_lp = 10`` (Wahlperiode)
|
|
|
|
|
|
|
|
|
|
|
|
Response is JSON ``{success, data: <stringified JSON>}`` where the
|
|
|
|
|
|
inner ``data`` carries ``{count, docs: [{id, title, date,
|
|
|
|
|
|
authorhtml, kind, type, lp, number, link, ...}], ...}``.
|
|
|
|
|
|
|
|
|
|
|
|
3. PDF download: ``GET {base_url}{prefix}/dokument/{numeric_id}``.
|
|
|
|
|
|
Returns ``application/pdf`` directly. The ``link`` field returned
|
|
|
|
|
|
by the search API already contains the path fragment
|
|
|
|
|
|
``/dokument/<id>#navpanes=0`` — strip the fragment and prepend
|
|
|
|
|
|
the configured ``prefix``.
|
|
|
|
|
|
|
|
|
|
|
|
Drucksachen-Nummer is reconstructed as ``f"{lp}/{number}"`` from the
|
|
|
|
|
|
search hit. Full-text search is *not* implemented in this MVP — the
|
|
|
|
|
|
backend supports it via ``facet_fulltext = 0`` tags but the public
|
|
|
|
|
|
LP-only filter already returns the relevant Antrag pool. ``query``
|
|
|
|
|
|
is applied as a client-side title/Urheber filter.
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
# Reverse-engineered facet type constants from bundle.js (pd.facet_*).
|
2026-04-08 12:57:34 +02:00
|
|
|
|
FACET_FULLTEXT = 0
|
2026-04-08 08:19:48 +02:00
|
|
|
|
FACET_FRACTION = 2
|
|
|
|
|
|
FACET_KIND = 7
|
|
|
|
|
|
FACET_TYPE = 8
|
|
|
|
|
|
FACET_LP = 10
|
|
|
|
|
|
|
|
|
|
|
|
def __init__(
|
|
|
|
|
|
self,
|
|
|
|
|
|
*,
|
|
|
|
|
|
bundesland: str,
|
|
|
|
|
|
name: str,
|
|
|
|
|
|
base_url: str,
|
|
|
|
|
|
wahlperiode: int,
|
|
|
|
|
|
prefix: str = "/parldok",
|
|
|
|
|
|
document_typ: str = "Antrag",
|
|
|
|
|
|
) -> None:
|
|
|
|
|
|
"""Configure a ParlDok 8.x adapter for one specific parliament.
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
bundesland: state code, e.g. ``"MV"``.
|
|
|
|
|
|
name: human-readable label.
|
|
|
|
|
|
base_url: ``https://...`` host root, no trailing slash.
|
|
|
|
|
|
wahlperiode: current legislative period — fed into the
|
|
|
|
|
|
``facet_lp`` tag of the search payload.
|
|
|
|
|
|
prefix: app prefix where ParlDok lives. ``/parldok`` for MV.
|
|
|
|
|
|
document_typ: client-side filter on the ``type`` field of
|
|
|
|
|
|
each hit ("Antrag", "Gesetzentwurf", …). Set to empty
|
|
|
|
|
|
string to disable type filtering.
|
|
|
|
|
|
"""
|
|
|
|
|
|
self.bundesland = bundesland
|
|
|
|
|
|
self.name = name
|
|
|
|
|
|
self.base_url = base_url.rstrip("/")
|
|
|
|
|
|
self.prefix = "/" + prefix.strip("/")
|
|
|
|
|
|
self.wahlperiode = wahlperiode
|
|
|
|
|
|
self.document_typ = document_typ
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _datum_de_to_iso(datum_de: str) -> str:
|
|
|
|
|
|
"""DD.MM.YYYY → YYYY-MM-DD; '' for empty input."""
|
|
|
|
|
|
if not datum_de:
|
|
|
|
|
|
return ""
|
|
|
|
|
|
try:
|
|
|
|
|
|
d, m, y = datum_de.split(".")
|
|
|
|
|
|
return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
|
|
|
|
|
|
except ValueError:
|
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _normalize_fraktion(authorhtml: str) -> list[str]:
|
|
|
|
|
|
"""Map ParlDok ``authorhtml`` to canonical fraction codes.
|
|
|
|
|
|
|
|
|
|
|
|
``authorhtml`` may be a comma-separated list of fractions
|
|
|
|
|
|
("CDU, SPD, F.D.P."), a single MdL with party in parens
|
|
|
|
|
|
("Thomas de Jesus Fernandes (AfD)") or empty (Landesregierung).
|
|
|
|
|
|
"""
|
|
|
|
|
|
if not authorhtml:
|
|
|
|
|
|
return []
|
|
|
|
|
|
u = authorhtml.upper()
|
|
|
|
|
|
out: list[str] = []
|
|
|
|
|
|
if re.search(r"\bBÜNDNIS\s*90\b", u) or re.search(r"\bGR(?:Ü|UE)NE\b", u):
|
|
|
|
|
|
out.append("GRÜNE")
|
|
|
|
|
|
if re.search(r"\bCDU\b", u):
|
|
|
|
|
|
out.append("CDU")
|
|
|
|
|
|
if re.search(r"\bSPD\b", u):
|
|
|
|
|
|
out.append("SPD")
|
|
|
|
|
|
# F.D.P. (with dots, historical) and FDP both occur in MV
|
|
|
|
|
|
if re.search(r"\bF\.?\s*D\.?\s*P\.?\b", u):
|
|
|
|
|
|
out.append("FDP")
|
|
|
|
|
|
if re.search(r"\bAFD\b", u):
|
|
|
|
|
|
out.append("AfD")
|
|
|
|
|
|
if re.search(r"\bLINKE\b", u) or re.search(r"\bLL/PDS\b", u):
|
|
|
|
|
|
out.append("LINKE")
|
|
|
|
|
|
if re.search(r"\bBSW\b", u):
|
|
|
|
|
|
out.append("BSW")
|
Add pytest suite + fix two regex bugs uncovered by it (#46)
Erste Tests für die Codebase. 77 Tests, 0.08s Laufzeit, decken die
drei Bug-Klassen aus der April-2026-Adapter-Session ab plus haben
schon zwei weitere Bugs in Production-Code aufgedeckt.
## Setup
- requirements-dev.txt mit pytest + pytest-asyncio
- pytest.ini mit asyncio_mode=auto
- tests/conftest.py stubbt fitz/bs4/openai/pydantic_settings, damit
die Suite ohne den vollen prod-requirements-Satz läuft (pure unit
tests, kein PDF-Parsing, kein HTTP)
## Tests
- tests/test_parlamente.py (33 Tests)
* PortalaAdapter._parse_hit_list_cards: doctype/doctype_full
NameError-Regression aus 1cb030a, plus Title/Drucksache/Fraktion-
/Datum/PDF-Extraktion gegen ein BE-Card-Fixture
* PortalaAdapter._parse_hit_list_dump: gegen ein LSA-Perl-Dump-
Fixture inkl. Hex-Escape-Decoding (\x{fc} → ü)
* PortalaAdapter._parse_hit_list_html: Auto-Detection zwischen
Card- und Dump-Format
* PortalaAdapter._normalize_fraktion: kanonische Fraktion-Codes
inkl. F.D.P.-mit-Punkten, BÜNDNIS 90, DIE LINKE, BSW
* ParLDokAdapter._hit_to_drucksache: JSON-Hit → Drucksache
Mapping inkl. /navpanes-Stripping, MdL-mit-Partei-in-Klammern,
Landesregierung-Detection
* ParLDokAdapter._fulltext_id: bundle.js-mirroring (deferred,
aber dokumentiert)
* ADAPTERS-Registry-Sanity
- tests/test_embeddings.py (11 Tests)
* _chunk_source_label: Programm-Name + Seite (Halluzinations-
Bug-Regression aus 1b5fd96)
* format_quotes_for_prompt: jeder Chunk muss Programm-Name
enthalten, strict-citation-Hinweis muss im Output sein,
keine NRW-Halluzinationen für MV/BE-Chunk-Sets
- tests/test_wahlprogramme.py (14 Tests)
* Registry-Struktur (jahr int, seiten int, .pdf-Endung)
* File-Existenz: jede registrierte PDF muss in
static/referenzen/ liegen — würde Tippfehler in den 22
indexierten Programmen sofort fangen
* embeddings.PROGRAMME-Konsistenz-Cross-Check
- tests/test_bundeslaender.py (15 Tests)
* Sanity über 16-State-Registry
* #48-Klassifikations-Regression: TH=ParlDok, HB=StarWeb,
SN=Eigensystem
* Wahltermine plausibel (zwischen 2026 und 2035)
- tests/test_analyzer.py (4 Tests)
* Markdown-Codeblock-Stripping aus dem JSON-Retry-Loop
## Bug-Funde während der Test-Schreibphase
Zwei Production-Bugs in den _normalize_fraktion-Helfern wurden
durch die neuen Tests sofort aufgedeckt und im selben Commit gefixt:
1. PortalaAdapter._normalize_fraktion matched "F.D.P." (mit Punkten,
wie historische SH/HB-Drucksachen) nicht — Regex \bFDP\b ist zu
strikt. Fix: \bF\.?\s*D\.?\s*P\.?\b analog zu ParLDokAdapter.
2. ParLDokAdapter._normalize_fraktion (auch PortalaAdapter) matched
"Ministerium der Finanzen" nicht als Landesregierung, weil
\bMINISTER\b die Wortgrenze auch nach MINISTER verlangt — bei
MINISTERIUM steht aber IUM danach, keine Wortgrenze. Fix:
\bMINISTER ohne abschließendes \b.
Beide Bugs hätten Fraktion-Felder bei Drucksachen der Bremischen
Bürgerschaft (FDP-Listen) und bei Landesregierungs-Drucksachen
in MV/LSA fälschlich leer gelassen — exakt der "fraktionen=[]"-
Befund aus dem MV-Smoke-Test in #4.
Phase 0 aus Roadmap-Issue #49.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 23:26:06 +02:00
|
|
|
|
# \bMINISTER ohne abschließende Wortgrenze, damit MINISTERIUM
|
|
|
|
|
|
# auch trifft (z.B. "Ministerium der Finanzen" als Urheber von
|
|
|
|
|
|
# Landesregierungs-Drucksachen).
|
|
|
|
|
|
if re.search(r"LANDESREGIERUNG|\bMINISTER|STAATSKANZLEI|MINISTERPRÄSIDENT", u):
|
2026-04-08 08:19:48 +02:00
|
|
|
|
out.append("Landesregierung")
|
|
|
|
|
|
return out
|
|
|
|
|
|
|
2026-04-08 12:57:34 +02:00
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _fulltext_id(term: str) -> str:
|
|
|
|
|
|
"""Sanitize a search term to ParlDok's facet ID format.
|
|
|
|
|
|
|
|
|
|
|
|
Mirrors ``pd.getFulltextId`` from ``bundle.js``: replace every
|
|
|
|
|
|
non-alphanumeric character with ``-``. The server uses this to
|
|
|
|
|
|
deduplicate identical search facets.
|
|
|
|
|
|
"""
|
|
|
|
|
|
return re.sub(r"[^a-zA-Z0-9]", "-", term)
|
|
|
|
|
|
|
|
|
|
|
|
def _build_search_body(self, *, length: int = 100, query: str = "") -> dict:
|
2026-04-08 08:19:48 +02:00
|
|
|
|
"""Build the JSON payload for the initial ``Fulltext/Search`` call.
|
|
|
|
|
|
|
2026-04-08 19:01:00 +02:00
|
|
|
|
Filters by Wahlperiode only — type/kind/fulltext filtering all
|
|
|
|
|
|
happen client-side after the hit list is paginated. The
|
|
|
|
|
|
``query`` parameter is accepted for API compatibility but is
|
|
|
|
|
|
currently NOT forwarded to the server (#18: einheitliche
|
|
|
|
|
|
client-side Title-Suche, kein Server-Volltext, weil das
|
|
|
|
|
|
Verhalten zwischen Adaptern sonst asymmetrisch wird). The
|
|
|
|
|
|
``FACET_FULLTEXT`` constant and :meth:`_fulltext_id` helper
|
|
|
|
|
|
are kept around as documentation for the previous #12
|
|
|
|
|
|
server-side variant — when fulltext gets uniformly
|
|
|
|
|
|
re-introduced later, the dormant tag is just::
|
|
|
|
|
|
|
|
|
|
|
|
{"type": self.FACET_FULLTEXT,
|
|
|
|
|
|
"id": self._fulltext_id(query),
|
|
|
|
|
|
"fulltext": query, "label": query, "field": "Alle"}
|
2026-04-08 12:57:34 +02:00
|
|
|
|
|
|
|
|
|
|
Pagination beyond the first page goes through
|
|
|
|
|
|
``Fulltext/Resultpage`` — the ``Search`` endpoint itself
|
|
|
|
|
|
ignores any non-zero ``Start``.
|
2026-04-08 08:19:48 +02:00
|
|
|
|
"""
|
2026-04-08 19:01:00 +02:00
|
|
|
|
del query # explicitly unused — see docstring
|
2026-04-08 12:57:34 +02:00
|
|
|
|
tags: list[dict] = [{"type": self.FACET_LP, "id": self.wahlperiode}]
|
2026-04-08 08:19:48 +02:00
|
|
|
|
return {
|
|
|
|
|
|
"devicekey": "",
|
|
|
|
|
|
"max": length,
|
|
|
|
|
|
"withfilter": False,
|
|
|
|
|
|
# sort=2 → newest first (date desc); sort=1 is relevance.
|
|
|
|
|
|
"sort": 2,
|
|
|
|
|
|
"topk": length,
|
|
|
|
|
|
"llm": 0,
|
|
|
|
|
|
"newdocsearch": False,
|
|
|
|
|
|
"limit": {"Start": 0, "Length": length},
|
2026-04-08 12:57:34 +02:00
|
|
|
|
"tags": tags,
|
2026-04-08 08:19:48 +02:00
|
|
|
|
"updateFilters": [],
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
def _hit_to_drucksache(self, hit: dict) -> Optional[Drucksache]:
|
|
|
|
|
|
"""Convert one ParlDok JSON hit to a Drucksache. None if unusable."""
|
|
|
|
|
|
lp = hit.get("lp")
|
|
|
|
|
|
number = hit.get("number")
|
|
|
|
|
|
if not lp or not number:
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
link_field = hit.get("link") or hit.get("prelink") or ""
|
|
|
|
|
|
# Strip "#navpanes=0" fragment and prepend the prefix.
|
|
|
|
|
|
path = link_field.split("#", 1)[0]
|
|
|
|
|
|
pdf_url = f"{self.base_url}{self.prefix}{path}" if path else ""
|
|
|
|
|
|
|
|
|
|
|
|
return Drucksache(
|
|
|
|
|
|
drucksache=f"{lp}/{number}",
|
|
|
|
|
|
title=hit.get("title", ""),
|
|
|
|
|
|
fraktionen=self._normalize_fraktion(hit.get("authorhtml", "")),
|
|
|
|
|
|
datum=self._datum_de_to_iso(hit.get("date", "")),
|
|
|
|
|
|
link=pdf_url,
|
|
|
|
|
|
bundesland=self.bundesland,
|
|
|
|
|
|
typ=hit.get("type", "") or hit.get("kind", ""),
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
async def _post_json(
|
|
|
|
|
|
self, client: httpx.AsyncClient, endpoint: str, payload: dict,
|
|
|
|
|
|
) -> Optional[dict]:
|
|
|
|
|
|
"""POST a JSON-stringified payload to a ParlDok endpoint.
|
|
|
|
|
|
|
|
|
|
|
|
``endpoint`` is the path tail (e.g. ``"Fulltext/Search"`` or
|
|
|
|
|
|
``"Fulltext/Resultpage"``). Returns the inner JSON object
|
|
|
|
|
|
(already parsed from the stringified ``data`` field), or None
|
|
|
|
|
|
on error.
|
|
|
|
|
|
"""
|
|
|
|
|
|
homepage = f"{self.base_url}{self.prefix}/"
|
|
|
|
|
|
url = f"{self.base_url}{self.prefix}/{endpoint}"
|
|
|
|
|
|
try:
|
|
|
|
|
|
resp = await client.post(
|
|
|
|
|
|
url,
|
|
|
|
|
|
data={"data": json.dumps(payload, ensure_ascii=False)},
|
|
|
|
|
|
headers={
|
|
|
|
|
|
"X-Requested-With": "XMLHttpRequest",
|
|
|
|
|
|
"Referer": homepage,
|
|
|
|
|
|
},
|
|
|
|
|
|
)
|
|
|
|
|
|
if resp.status_code != 200:
|
|
|
|
|
|
logger.error(
|
|
|
|
|
|
"%s %s HTTP %s",
|
|
|
|
|
|
self.bundesland, endpoint, resp.status_code,
|
|
|
|
|
|
)
|
|
|
|
|
|
return None
|
|
|
|
|
|
outer = resp.json()
|
|
|
|
|
|
if not outer.get("success"):
|
|
|
|
|
|
logger.error(
|
|
|
|
|
|
"%s %s not successful: %s",
|
|
|
|
|
|
self.bundesland, endpoint, outer.get("message"),
|
|
|
|
|
|
)
|
|
|
|
|
|
return None
|
|
|
|
|
|
return json.loads(outer["data"])
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
logger.exception("%s ParlDok %s error", self.bundesland, endpoint)
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
async def _initial_search(
|
2026-04-08 19:01:00 +02:00
|
|
|
|
self, client: httpx.AsyncClient, *, length: int,
|
2026-04-08 08:19:48 +02:00
|
|
|
|
) -> tuple[Optional[int], list[dict]]:
|
|
|
|
|
|
"""Run the initial ``Fulltext/Search`` and return ``(queryid, docs)``.
|
|
|
|
|
|
|
|
|
|
|
|
The ``queryid`` is needed for subsequent ``Fulltext/Resultpage``
|
|
|
|
|
|
calls. ParlDok ignores any non-zero ``Start`` on this endpoint —
|
|
|
|
|
|
the first 100 hits are the only ones reachable via ``Search``.
|
|
|
|
|
|
"""
|
2026-04-08 19:01:00 +02:00
|
|
|
|
body = self._build_search_body(length=length)
|
2026-04-08 08:19:48 +02:00
|
|
|
|
inner = await self._post_json(client, "Fulltext/Search", body)
|
|
|
|
|
|
if not inner:
|
|
|
|
|
|
return None, []
|
|
|
|
|
|
return inner.get("queryid"), (inner.get("docs") or [])
|
|
|
|
|
|
|
|
|
|
|
|
async def _result_page(
|
|
|
|
|
|
self, client: httpx.AsyncClient, *, queryid: int, start: int, length: int,
|
|
|
|
|
|
) -> list[dict]:
|
|
|
|
|
|
"""Fetch a further result page via ``Fulltext/Resultpage``."""
|
|
|
|
|
|
payload = {
|
|
|
|
|
|
"devicekey": "",
|
|
|
|
|
|
"queryid": queryid,
|
|
|
|
|
|
"limit": {"Start": start, "Length": length},
|
|
|
|
|
|
}
|
|
|
|
|
|
inner = await self._post_json(client, "Fulltext/Resultpage", payload)
|
|
|
|
|
|
if not inner:
|
|
|
|
|
|
return []
|
|
|
|
|
|
return inner.get("docs") or []
|
|
|
|
|
|
|
|
|
|
|
|
def _make_client(self) -> httpx.AsyncClient:
|
|
|
|
|
|
return httpx.AsyncClient(
|
|
|
|
|
|
timeout=30,
|
|
|
|
|
|
follow_redirects=True,
|
|
|
|
|
|
headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
|
|
|
|
|
|
)
|
|
|
|
|
|
|
2026-04-08 19:01:00 +02:00
|
|
|
|
async def _paginated_hits(self, client: httpx.AsyncClient):
|
|
|
|
|
|
"""Async iterator over Drucksachen-style hits across pages.
|
2026-04-08 08:19:48 +02:00
|
|
|
|
|
|
|
|
|
|
Yields raw hit dicts in newest-first order. The first batch comes
|
|
|
|
|
|
from ``Fulltext/Search``, subsequent batches from
|
|
|
|
|
|
``Fulltext/Resultpage`` using the queryid the server returned for
|
|
|
|
|
|
the initial call. Stops when a page comes back empty, undersized,
|
2026-04-08 19:01:00 +02:00
|
|
|
|
or after :attr:`MAX_PAGES` iterations.
|
2026-04-08 08:19:48 +02:00
|
|
|
|
"""
|
2026-04-08 19:01:00 +02:00
|
|
|
|
queryid, hits = await self._initial_search(client, length=self.PAGE_SIZE)
|
2026-04-08 08:19:48 +02:00
|
|
|
|
for hit in hits:
|
|
|
|
|
|
yield hit
|
|
|
|
|
|
if not queryid or len(hits) < self.PAGE_SIZE:
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
for page in range(1, self.MAX_PAGES):
|
|
|
|
|
|
page_hits = await self._result_page(
|
|
|
|
|
|
client,
|
|
|
|
|
|
queryid=queryid,
|
|
|
|
|
|
start=page * self.PAGE_SIZE,
|
|
|
|
|
|
length=self.PAGE_SIZE,
|
|
|
|
|
|
)
|
|
|
|
|
|
if not page_hits:
|
|
|
|
|
|
return
|
|
|
|
|
|
for hit in page_hits:
|
|
|
|
|
|
yield hit
|
|
|
|
|
|
if len(page_hits) < self.PAGE_SIZE:
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
# ParlDok 8.x caps Length per request at 100 — paginate if needed.
|
|
|
|
|
|
PAGE_SIZE = 100
|
|
|
|
|
|
# Safety bound: scan at most 10 pages × 100 = 1000 most recent docs.
|
|
|
|
|
|
# Anträge are ~3% of all hits in MV, so 1000 raw → ~30 Anträge, more
|
|
|
|
|
|
# than enough for the typical UI request (limit 5..20). Filtered
|
|
|
|
|
|
# queries that find nothing in the last 1000 docs return empty
|
2026-04-08 19:01:00 +02:00
|
|
|
|
# rather than scan the entire WP — same trade-off as the BE/LSA
|
|
|
|
|
|
# PortalaAdapter quick-win window.
|
2026-04-08 08:19:48 +02:00
|
|
|
|
MAX_PAGES = 10
|
|
|
|
|
|
|
|
|
|
|
|
async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
|
2026-04-08 19:01:00 +02:00
|
|
|
|
"""Search the configured Wahlperiode, sorted newest-first.
|
|
|
|
|
|
|
|
|
|
|
|
#18: einheitliches Verhalten — Server filtert nur nach WP, der
|
|
|
|
|
|
Client paginiert über die ganze WP und filtert lokal nach
|
|
|
|
|
|
Treffern in Titel oder Urheber. Volltext-Filter (#12) ist
|
|
|
|
|
|
zurückgebaut, weil das Verhalten zwischen Adaptern sonst
|
|
|
|
|
|
asymmetrisch wird. Sortierung kommt vom Server (newest-first
|
|
|
|
|
|
durch ``sort=2`` in :meth:`_build_search_body`).
|
|
|
|
|
|
|
|
|
|
|
|
Dedupe per ``lp/number`` weil ParlDok dieselbe Drucksache
|
|
|
|
|
|
mehrfach in verschiedenen Vorgängen/Beratungen liefert.
|
2026-04-08 08:19:48 +02:00
|
|
|
|
"""
|
|
|
|
|
|
results: list[Drucksache] = []
|
|
|
|
|
|
seen: set[str] = set()
|
2026-04-08 19:01:00 +02:00
|
|
|
|
query_terms = [t.lower() for t in query.split() if t] if query else []
|
2026-04-08 08:19:48 +02:00
|
|
|
|
|
|
|
|
|
|
async with self._make_client() as client:
|
|
|
|
|
|
await client.get(f"{self.base_url}{self.prefix}/")
|
2026-04-08 19:01:00 +02:00
|
|
|
|
async for hit in self._paginated_hits(client):
|
2026-04-08 08:19:48 +02:00
|
|
|
|
if hit.get("kind") != "Drucksache":
|
|
|
|
|
|
continue
|
|
|
|
|
|
if self.document_typ and hit.get("type") != self.document_typ:
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
doc = self._hit_to_drucksache(hit)
|
|
|
|
|
|
if not doc:
|
|
|
|
|
|
continue
|
|
|
|
|
|
if doc.drucksache in seen:
|
|
|
|
|
|
continue
|
|
|
|
|
|
seen.add(doc.drucksache)
|
|
|
|
|
|
|
2026-04-08 19:01:00 +02:00
|
|
|
|
if query_terms:
|
|
|
|
|
|
hay = f"{doc.title} {hit.get('authorhtml', '')}".lower()
|
|
|
|
|
|
if not all(t in hay for t in query_terms):
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
2026-04-08 08:19:48 +02:00
|
|
|
|
results.append(doc)
|
|
|
|
|
|
if len(results) >= limit:
|
|
|
|
|
|
return results
|
|
|
|
|
|
|
|
|
|
|
|
return results
|
|
|
|
|
|
|
|
|
|
|
|
async def get_document(self, drucksache: str) -> Optional[Drucksache]:
|
|
|
|
|
|
"""Look up a single Antrag by ``lp/number`` ID.
|
|
|
|
|
|
|
|
|
|
|
|
Pragmatic MVP: page through the WP unfiltered until we find a
|
|
|
|
|
|
match. ParlDok offers a ``facet_number`` (14) facet that would
|
|
|
|
|
|
let us target the lookup directly, but the facet ID values are
|
|
|
|
|
|
instance-specific (would require a ``Fulltext/Filter`` discovery
|
|
|
|
|
|
call) and the WP-wide pagination is fast enough for the typical
|
|
|
|
|
|
2k–10k Drucksachen per period.
|
|
|
|
|
|
"""
|
|
|
|
|
|
wanted_lp, wanted_num = (drucksache.split("/", 1) + [""])[:2]
|
|
|
|
|
|
if not wanted_num:
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
async with self._make_client() as client:
|
|
|
|
|
|
await client.get(f"{self.base_url}{self.prefix}/")
|
|
|
|
|
|
async for hit in self._paginated_hits(client):
|
|
|
|
|
|
if hit.get("kind") != "Drucksache":
|
|
|
|
|
|
continue
|
|
|
|
|
|
if str(hit.get("lp")) == wanted_lp and str(hit.get("number")) == wanted_num:
|
|
|
|
|
|
return self._hit_to_drucksache(hit)
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
async def download_text(self, drucksache: str) -> Optional[str]:
|
|
|
|
|
|
"""Download the PDF for a Drucksache and extract its text."""
|
|
|
|
|
|
import fitz # PyMuPDF
|
|
|
|
|
|
|
|
|
|
|
|
doc = await self.get_document(drucksache)
|
|
|
|
|
|
if not doc or not doc.link:
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
async with httpx.AsyncClient(
|
|
|
|
|
|
timeout=60,
|
|
|
|
|
|
follow_redirects=True,
|
|
|
|
|
|
headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
|
|
|
|
|
|
) as client:
|
|
|
|
|
|
try:
|
|
|
|
|
|
resp = await client.get(doc.link)
|
|
|
|
|
|
if resp.status_code != 200:
|
|
|
|
|
|
logger.error(
|
|
|
|
|
|
"%s PDF HTTP %s for %s (%s)",
|
|
|
|
|
|
self.bundesland, resp.status_code, drucksache, doc.link,
|
|
|
|
|
|
)
|
|
|
|
|
|
return None
|
|
|
|
|
|
pdf = fitz.open(stream=resp.content, filetype="pdf")
|
|
|
|
|
|
text = ""
|
|
|
|
|
|
for page in pdf:
|
|
|
|
|
|
text += page.get_text()
|
|
|
|
|
|
pdf.close()
|
|
|
|
|
|
return text
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
logger.exception("%s ParlDok download error for %s", self.bundesland, drucksache)
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-28 22:30:24 +01:00
|
|
|
|
class BayernAdapter(ParlamentAdapter):
|
|
|
|
|
|
"""Adapter for Bayerischer Landtag."""
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
|
2026-03-28 22:30:24 +01:00
|
|
|
|
bundesland = "BY"
|
|
|
|
|
|
name = "Bayerischer Landtag"
|
|
|
|
|
|
base_url = "https://www.bayern.landtag.de"
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
|
2026-03-28 22:30:24 +01:00
|
|
|
|
async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
|
|
|
|
|
|
# TODO: Implement Bayern search
|
|
|
|
|
|
return []
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
|
2026-03-28 22:30:24 +01:00
|
|
|
|
async def get_document(self, drucksache: str) -> Optional[Drucksache]:
|
|
|
|
|
|
# TODO: Implement
|
|
|
|
|
|
return None
|
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.
Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
is outdated. The Sachsen-Anhalt portal was migrated to the same
eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
1. POST /portal/browse.tt.json with a JSON action body containing an
Elasticsearch-style query tree under search.json. Returns a
report_id plus hit count.
2. POST /portal/report.tt.html with {report_id, start, chunksize}
returns the HTML hit list. Each record carries a Perl Data::Dumper
block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
— only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
directly without any session cookie.
What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
document type and returns the most recent hits. The user's free-text
query is applied as a client-side title/Urheber filter (no fulltext
search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
- WEV06.main → title (Perl \x{xx} hex escapes decoded)
- WEV32.5 → relative PDF path
- WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
"PARDOK", anmerkung updated with the migration story.
Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
names for LSA full-text content are not yet known; tree mutations
with sf=alAB return 0 hits. Client-side filter is "good enough" for
the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
in production until issue #2's wahlprogramm ingest and frontend
activation land.
Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
(LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
("ICE-Halt für Salzwedel dauerhaft erhalten").
Refs #2.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
|
|
|
|
|
2026-03-28 22:30:24 +01:00
|
|
|
|
async def download_text(self, drucksache: str) -> Optional[str]:
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
Activate Baden-Württemberg via PARLISAdapter (#29, Phase 1)
PARLIS auf parlis.landtag-bw.de läuft technisch auf demselben
eUI-Backend wie LSA-PADOKA und BE-PARDOK, hat aber drei wichtige
Unterschiede, die eine eigene Klasse statt einer PortalaAdapter-
Subklasse rechtfertigen:
1. Body-Schema: minimales lines mit l1/l2/l3/l4 (statt LSA/BE
2/3/4/10/11/20.x/90.x), serverrecordname=vorgang,
format=suchergebnis-vorgang-full, sort=SORT01/D SORT02/D SORT03,
keine parsed/json-Felder. Quelle: dokukratie/scrapers/portala.query.bw.json
plus HAR-Verifikation gegen die Live-Instanz.
2. Async polling: die initiale SearchAndDisplay-Antwort liefert nur
search_id mit status=running, KEINE report_id. Erst eine zweite
SearchAndDisplay-Anfrage mit id=<search_id> (ohne search-Component)
bekommt nach 1-3 Sekunden die report_id zurück. Reverse-engineered
aus esearch-ui.main.js requestReportOK() Z. ~1268.
3. Hit-Format: report.tt.html liefert Records als JSON-in-HTML-Comments
<!--{"WMV33":[...],"EWBV22":[...],...}-->. Komplett anderes Format
als LSA Perl-Dump oder BE HTML-Cards. Felder:
- EWBV22: "Drucksache 17/10323"
- EWBD05: direkter PDF-URL
- WMV33: Schlagworte (joined by ;)
- WMV30: Urheber-Kurzform
- EWBV23: "Antrag <Urheber> <DD.MM.YYYY>"
Smoke-Test (lokal):
BW q='': 8 hits in 17s, jüngste WP17-Anträge mit Datum + Fraktion
BW q='Schule': 8 hits, alle wirklich Schul-bezogen (Hochschule, Grundschule,
Schwimmunterricht, Lehrerbedarf etc.)
BW q='Klima': 8 hits, Klimaschutz/CO2/Energieberatung
get_document(17/10323): roundtrip funktioniert
bundeslaender.py: aktiv=True für BW; Anmerkung erweitert mit
PARLISAdapter-Verweis und drei-Unterschiede-Hinweis für künftige
Wartung. Test test_four_active_bundeslaender umbenannt zu
test_active_bundeslaender_include_phase_1_set, prüft jetzt nur
Subset-Bedingung statt exakter Count, damit Phase-1/2-Erweiterungen
keine Test-Updates brauchen.
Phase 1 (1/3) aus Roadmap-Issue #49.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 23:38:04 +02:00
|
|
|
|
class PARLISAdapter(ParlamentAdapter):
|
|
|
|
|
|
"""Adapter for Baden-Württemberg's PARLIS — eUI/portala-Variante mit
|
|
|
|
|
|
polling und JSON-in-HTML-Comment-Records.
|
|
|
|
|
|
|
|
|
|
|
|
PARLIS auf ``parlis.landtag-bw.de`` läuft technisch auf demselben
|
|
|
|
|
|
eUI-Backend wie LSA-PADOKA und BE-PARDOK, aber mit drei wichtigen
|
|
|
|
|
|
Unterschieden, die eine eigene Klasse statt einer PortalaAdapter-
|
|
|
|
|
|
Subklasse rechtfertigen:
|
|
|
|
|
|
|
|
|
|
|
|
1. **Body-Schema:** Statt der portala/LSA-typischen ``search.lines``
|
|
|
|
|
|
mit ``2/3/4/10/11/20.x/90.x``-Slots nutzt PARLIS ein viel kürzeres
|
|
|
|
|
|
``l1/l2/l3/l4`` Schema (siehe ``dokukratie/scrapers/portala.query.bw.json``).
|
|
|
|
|
|
``serverrecordname`` ist ``"vorgang"`` statt ``"sr_generic1"``,
|
|
|
|
|
|
``format`` ist ``"suchergebnis-vorgang-full"``, ``sort`` ist
|
|
|
|
|
|
``"SORT01/D SORT02/D SORT03"``. Es gibt kein ``parsed`` und kein
|
|
|
|
|
|
``json``-Tree — der Server akzeptiert das minimale Schema direkt.
|
|
|
|
|
|
|
|
|
|
|
|
2. **Async polling:** Im Gegensatz zu LSA/BE liefert die initiale
|
|
|
|
|
|
``Fulltext/Search``-Antwort nur eine ``search_id`` mit
|
|
|
|
|
|
``status: "running"``, KEINE ``report_id``. Erst eine zweite
|
|
|
|
|
|
``SearchAndDisplay``-Anfrage mit ``id: <search_id>`` (und ohne
|
|
|
|
|
|
``search``-Component) bekommt die fertige ``report_id`` zurück.
|
|
|
|
|
|
In meinen Live-Tests reichte ein einziger 2-Sekunden-Sleep
|
|
|
|
|
|
zwischen den Calls.
|
|
|
|
|
|
|
|
|
|
|
|
3. **Hit-Format:** Die ``report.tt.html``-Antwort liefert keine
|
|
|
|
|
|
Perl-Dump-Blöcke (LSA) und keine Bootstrap-Card-Divs (BE),
|
|
|
|
|
|
sondern **JSON-Records in HTML-Kommentaren**::
|
|
|
|
|
|
|
|
|
|
|
|
<!--{"WMV33":[{"main":"Schlagworte"}],
|
|
|
|
|
|
"EWBV22":[{"main":"Drucksache 17/10323"}],
|
|
|
|
|
|
"EWBD05":[{"main":"https://.../17_10323.pdf"}],
|
|
|
|
|
|
"EWBV23":[{"main":"Antrag Felix Herkens (GRÜNE) u. a. 16.03.2026"}],
|
|
|
|
|
|
...}-->
|
|
|
|
|
|
|
|
|
|
|
|
Der Parser zieht die Comments raw raus und mappt die WMV/EWBV-
|
|
|
|
|
|
Felder auf das ``Drucksache``-Dataclass.
|
|
|
|
|
|
|
|
|
|
|
|
Reverse-Engineering-Quelle: ``dokukratie/scrapers/portala.query.bw.json``
|
|
|
|
|
|
+ Live-HAR gegen ``parlis.landtag-bw.de`` (Issue #29).
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
# Reverse-engineered field map for the JSON records that come embedded
|
|
|
|
|
|
# in HTML comments inside report.tt.html responses.
|
|
|
|
|
|
#
|
|
|
|
|
|
# Records look like ``<!--{"WMV33":[...],...}-->`` and may contain
|
|
|
|
|
|
# nested ``<i>...</i>`` highlight tags inside the JSON values.
|
|
|
|
|
|
# Non-greedy match against the literal closing ``}-->`` because that
|
|
|
|
|
|
# delimiter does not appear inside the JSON payload itself.
|
|
|
|
|
|
_RE_RECORD = re.compile(r"<!--(\{.*?\})-->", re.DOTALL)
|
|
|
|
|
|
_RE_DRUCKSACHE = re.compile(r"Drucksache\s+(\d+/\d+)")
|
|
|
|
|
|
_RE_DATUM = re.compile(r"(\d{1,2}\.\d{1,2}\.\d{4})")
|
|
|
|
|
|
|
|
|
|
|
|
def __init__(
|
|
|
|
|
|
self,
|
|
|
|
|
|
*,
|
|
|
|
|
|
bundesland: str,
|
|
|
|
|
|
name: str,
|
|
|
|
|
|
base_url: str,
|
|
|
|
|
|
wahlperiode: int,
|
|
|
|
|
|
prefix: str = "/parlis",
|
|
|
|
|
|
document_typ: str = "Antrag",
|
|
|
|
|
|
date_window_days: int = 730,
|
|
|
|
|
|
poll_attempts: int = 15,
|
|
|
|
|
|
poll_interval_seconds: float = 2.0,
|
|
|
|
|
|
) -> None:
|
|
|
|
|
|
"""Configure a PARLIS adapter for one specific parliament instance.
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
bundesland: state code, e.g. ``"BW"``.
|
|
|
|
|
|
name: human-readable label.
|
|
|
|
|
|
base_url: ``https://parlis.landtag-bw.de`` (no trailing slash).
|
|
|
|
|
|
wahlperiode: legislative period — feeds into ``lines.l1``.
|
|
|
|
|
|
prefix: app prefix where PARLIS lives. ``/parlis`` for BW.
|
|
|
|
|
|
document_typ: feeds into ``lines.l4``. The server interprets
|
|
|
|
|
|
this as a German document type label like ``"Antrag"``.
|
|
|
|
|
|
date_window_days: look-back window for the search range,
|
|
|
|
|
|
quick-win against title-only filtering — same approach
|
|
|
|
|
|
as the PortalaAdapter for LSA/BE.
|
|
|
|
|
|
poll_attempts: how many times to poll for ``report_id`` before
|
|
|
|
|
|
giving up. ~15 × 2s = 30s upper bound.
|
|
|
|
|
|
poll_interval_seconds: sleep between poll attempts.
|
|
|
|
|
|
"""
|
|
|
|
|
|
self.bundesland = bundesland
|
|
|
|
|
|
self.name = name
|
|
|
|
|
|
self.base_url = base_url.rstrip("/")
|
|
|
|
|
|
self.prefix = "/" + prefix.strip("/")
|
|
|
|
|
|
self.wahlperiode = wahlperiode
|
|
|
|
|
|
self.document_typ = document_typ
|
|
|
|
|
|
self.date_window_days = date_window_days
|
|
|
|
|
|
self.poll_attempts = poll_attempts
|
|
|
|
|
|
self.poll_interval_seconds = poll_interval_seconds
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _datum_de_to_iso(datum_de: str) -> str:
|
|
|
|
|
|
"""DD.MM.YYYY → YYYY-MM-DD; '' for empty input."""
|
|
|
|
|
|
if not datum_de:
|
|
|
|
|
|
return ""
|
|
|
|
|
|
try:
|
|
|
|
|
|
d, m, y = datum_de.split(".")
|
|
|
|
|
|
return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
|
|
|
|
|
|
except ValueError:
|
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _normalize_fraktion(text: str) -> list[str]:
|
|
|
|
|
|
"""Map a free-text Urheber line to canonical fraction codes.
|
|
|
|
|
|
|
|
|
|
|
|
PARLIS packs the originator into ``EWBV23`` like
|
|
|
|
|
|
``"Antrag Felix Herkens (GRÜNE), Saskia Frank (GRÜNE), ... 16.03.2026"``
|
|
|
|
|
|
— multiple MdLs with their party in parentheses, comma-separated.
|
|
|
|
|
|
Same logic as ``ParLDokAdapter._normalize_fraktion`` (#46 fixed
|
|
|
|
|
|
the MINISTER/MINISTERIUM regex there too).
|
|
|
|
|
|
"""
|
|
|
|
|
|
if not text:
|
|
|
|
|
|
return []
|
|
|
|
|
|
u = text.upper()
|
|
|
|
|
|
out: list[str] = []
|
|
|
|
|
|
if re.search(r"\bBÜNDNIS\s*90\b", u) or re.search(r"\bGR(?:Ü|UE)NE\b", u):
|
|
|
|
|
|
out.append("GRÜNE")
|
|
|
|
|
|
if re.search(r"\bCDU\b", u):
|
|
|
|
|
|
out.append("CDU")
|
|
|
|
|
|
if re.search(r"\bSPD\b", u):
|
|
|
|
|
|
out.append("SPD")
|
|
|
|
|
|
if re.search(r"\bF\.?\s*D\.?\s*P\.?\b", u):
|
|
|
|
|
|
out.append("FDP")
|
|
|
|
|
|
if re.search(r"\bAFD\b", u):
|
|
|
|
|
|
out.append("AfD")
|
|
|
|
|
|
if re.search(r"\bLINKE\b", u):
|
|
|
|
|
|
out.append("LINKE")
|
|
|
|
|
|
if re.search(r"\bBSW\b", u):
|
|
|
|
|
|
out.append("BSW")
|
|
|
|
|
|
if re.search(r"LANDESREGIERUNG|\bMINISTER|STAATSKANZLEI|MINISTERPRÄSIDENT", u):
|
|
|
|
|
|
out.append("Landesregierung")
|
|
|
|
|
|
return out
|
|
|
|
|
|
|
|
|
|
|
|
def _build_initial_body(self, start_date: str, end_date: str) -> dict:
|
|
|
|
|
|
"""Build the first ``SearchAndDisplay`` body with the search component.
|
|
|
|
|
|
|
|
|
|
|
|
The schema follows ``dokukratie/scrapers/portala.query.bw.json``
|
|
|
|
|
|
verbatim — only the placeholder values are substituted.
|
|
|
|
|
|
"""
|
|
|
|
|
|
return {
|
|
|
|
|
|
"action": "SearchAndDisplay",
|
|
|
|
|
|
"report": {
|
|
|
|
|
|
"rhl": "main",
|
|
|
|
|
|
"rhlmode": "add",
|
|
|
|
|
|
"format": "suchergebnis-vorgang-full",
|
|
|
|
|
|
"mime": "html",
|
|
|
|
|
|
"sort": "SORT01/D SORT02/D SORT03",
|
|
|
|
|
|
},
|
|
|
|
|
|
"search": {
|
|
|
|
|
|
"lines": {
|
|
|
|
|
|
"l1": str(self.wahlperiode),
|
|
|
|
|
|
"l2": start_date,
|
|
|
|
|
|
"l3": end_date,
|
|
|
|
|
|
"l4": self.document_typ,
|
|
|
|
|
|
},
|
|
|
|
|
|
"serverrecordname": "vorgang",
|
|
|
|
|
|
},
|
|
|
|
|
|
"sources": ["Star"],
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
def _build_poll_body(self, search_id: str) -> dict:
|
|
|
|
|
|
"""Build the polling body — same action, but with the search_id
|
|
|
|
|
|
instead of a fresh search component."""
|
|
|
|
|
|
return {
|
|
|
|
|
|
"action": "SearchAndDisplay",
|
|
|
|
|
|
"report": {
|
|
|
|
|
|
"rhl": "main",
|
|
|
|
|
|
"rhlmode": "add",
|
|
|
|
|
|
"format": "suchergebnis-vorgang-full",
|
|
|
|
|
|
"mime": "html",
|
|
|
|
|
|
"sort": "SORT01/D SORT02/D SORT03",
|
|
|
|
|
|
},
|
|
|
|
|
|
"id": search_id,
|
|
|
|
|
|
"sources": ["Star"],
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
def _hit_record_to_drucksache(self, record: dict) -> Optional[Drucksache]:
|
|
|
|
|
|
"""Map a single JSON-in-comment record to a ``Drucksache``.
|
|
|
|
|
|
|
|
|
|
|
|
PARLIS-record schema (reverse-engineered, all values are arrays
|
|
|
|
|
|
of ``{"main": ...}`` dicts):
|
|
|
|
|
|
|
|
|
|
|
|
- ``EWBV22``: "Drucksache 17/10323"
|
|
|
|
|
|
- ``EWBD05``: direct PDF URL
|
|
|
|
|
|
- ``EWBV23``: "Antrag <Urheber> <DD.MM.YYYY>" — single combined line
|
|
|
|
|
|
- ``WMV30``: short Urheber summary ("Felix Herkens (GRÜNE) u. a.")
|
|
|
|
|
|
- ``WMV33``: subject keywords (Schlagworte)
|
|
|
|
|
|
- ``EWBD01``: "Drucksache <X/Y> <DD.MM.YYYY>"
|
|
|
|
|
|
"""
|
|
|
|
|
|
def first(field: str) -> str:
|
|
|
|
|
|
block = record.get(field)
|
|
|
|
|
|
if isinstance(block, list) and block:
|
|
|
|
|
|
return (block[0].get("main") or "").strip()
|
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
|
|
ds_text = first("EWBV22") or first("EWBD01")
|
|
|
|
|
|
m_ds = self._RE_DRUCKSACHE.search(ds_text)
|
|
|
|
|
|
if not m_ds:
|
|
|
|
|
|
return None
|
|
|
|
|
|
drucksache = m_ds.group(1)
|
|
|
|
|
|
|
|
|
|
|
|
# The "title" we want is the Schlagworte/topic, not the
|
|
|
|
|
|
# Drucksachen-Header. PARLIS keeps the human-readable subject
|
|
|
|
|
|
# in WMV33 (Schlagworte joined by semicolons) — that's the
|
|
|
|
|
|
# closest equivalent to "title" the LSA/BE adapters expose.
|
|
|
|
|
|
# Fallback to the EWBV23 line if WMV33 is empty.
|
|
|
|
|
|
schlagworte = first("WMV33")
|
|
|
|
|
|
# Strip embedded <i>...</i> highlight tags
|
|
|
|
|
|
schlagworte_clean = re.sub(r"</?i>", "", schlagworte).strip()
|
|
|
|
|
|
title = schlagworte_clean or first("EWBV23") or f"Drucksache {drucksache}"
|
|
|
|
|
|
|
|
|
|
|
|
# Date + Urheber out of EWBV23 ("Antrag <Urheber> <DD.MM.YYYY>")
|
|
|
|
|
|
ewbv23 = first("EWBV23")
|
|
|
|
|
|
m_dat = self._RE_DATUM.search(ewbv23)
|
|
|
|
|
|
datum_iso = self._datum_de_to_iso(m_dat.group(1) if m_dat else "")
|
|
|
|
|
|
urheber_short = first("WMV30")
|
|
|
|
|
|
fraktionen = self._normalize_fraktion(urheber_short or ewbv23)
|
|
|
|
|
|
|
|
|
|
|
|
pdf_url = first("EWBD05")
|
|
|
|
|
|
|
|
|
|
|
|
return Drucksache(
|
|
|
|
|
|
drucksache=drucksache,
|
|
|
|
|
|
title=title,
|
|
|
|
|
|
fraktionen=fraktionen,
|
|
|
|
|
|
datum=datum_iso,
|
|
|
|
|
|
link=pdf_url,
|
|
|
|
|
|
bundesland=self.bundesland,
|
|
|
|
|
|
typ=self.document_typ,
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
async def _initial_search_and_poll(
|
|
|
|
|
|
self, client: httpx.AsyncClient, start_date: str, end_date: str,
|
|
|
|
|
|
) -> Optional[str]:
|
|
|
|
|
|
"""Run the initial search + poll until ``report_id`` arrives."""
|
|
|
|
|
|
import asyncio
|
|
|
|
|
|
|
|
|
|
|
|
browse_html = f"{self.base_url}{self.prefix}/browse.tt.html"
|
|
|
|
|
|
browse_json = f"{self.base_url}{self.prefix}/browse.tt.json"
|
|
|
|
|
|
|
|
|
|
|
|
# Step 1: warm cookies
|
|
|
|
|
|
await client.get(browse_html)
|
|
|
|
|
|
|
|
|
|
|
|
# Step 2: initial search
|
|
|
|
|
|
try:
|
|
|
|
|
|
resp = await client.post(
|
|
|
|
|
|
browse_json,
|
|
|
|
|
|
json=self._build_initial_body(start_date, end_date),
|
|
|
|
|
|
headers={"Referer": browse_html},
|
|
|
|
|
|
)
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
logger.exception("%s initial search request error", self.bundesland)
|
|
|
|
|
|
return None
|
|
|
|
|
|
if resp.status_code != 200:
|
|
|
|
|
|
logger.error("%s initial search HTTP %s", self.bundesland, resp.status_code)
|
|
|
|
|
|
return None
|
|
|
|
|
|
data = resp.json()
|
|
|
|
|
|
if data.get("report_id"):
|
|
|
|
|
|
return data["report_id"]
|
|
|
|
|
|
search_id = data.get("search_id")
|
|
|
|
|
|
if not search_id:
|
|
|
|
|
|
logger.error("%s no search_id in initial response: %s", self.bundesland, data)
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
# Step 3: poll until report_id appears or we run out of attempts
|
|
|
|
|
|
for _ in range(self.poll_attempts):
|
|
|
|
|
|
await asyncio.sleep(self.poll_interval_seconds)
|
|
|
|
|
|
try:
|
|
|
|
|
|
resp = await client.post(
|
|
|
|
|
|
browse_json,
|
|
|
|
|
|
json=self._build_poll_body(search_id),
|
|
|
|
|
|
headers={"Referer": browse_html},
|
|
|
|
|
|
)
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
logger.exception("%s poll request error", self.bundesland)
|
|
|
|
|
|
return None
|
|
|
|
|
|
if resp.status_code != 200:
|
|
|
|
|
|
logger.error("%s poll HTTP %s", self.bundesland, resp.status_code)
|
|
|
|
|
|
return None
|
|
|
|
|
|
data = resp.json()
|
|
|
|
|
|
if data.get("report_id"):
|
|
|
|
|
|
return data["report_id"]
|
|
|
|
|
|
star = data.get("sources", {}).get("Star", {})
|
|
|
|
|
|
if star.get("status") == "stopped" and not data.get("report_id"):
|
|
|
|
|
|
# Search finished but no report — empty result
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
logger.warning("%s gave up polling after %d attempts", self.bundesland, self.poll_attempts)
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_report_html(self, html: str) -> list[Drucksache]:
|
|
|
|
|
|
"""Extract Drucksachen from a report.tt.html response.
|
|
|
|
|
|
|
|
|
|
|
|
Records are JSON objects embedded in HTML comments. We pull each
|
|
|
|
|
|
comment block via regex, parse it as JSON, and map the WMV/EWBV
|
|
|
|
|
|
fields to a Drucksache.
|
|
|
|
|
|
"""
|
|
|
|
|
|
results: list[Drucksache] = []
|
|
|
|
|
|
for m in self._RE_RECORD.finditer(html):
|
|
|
|
|
|
json_text = m.group(1)
|
|
|
|
|
|
try:
|
|
|
|
|
|
record = json.loads(json_text)
|
|
|
|
|
|
except json.JSONDecodeError:
|
|
|
|
|
|
continue
|
|
|
|
|
|
doc = self._hit_record_to_drucksache(record)
|
|
|
|
|
|
if doc:
|
|
|
|
|
|
results.append(doc)
|
|
|
|
|
|
return results
|
|
|
|
|
|
|
2026-03-28 22:30:24 +01:00
|
|
|
|
async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
|
Activate Baden-Württemberg via PARLISAdapter (#29, Phase 1)
PARLIS auf parlis.landtag-bw.de läuft technisch auf demselben
eUI-Backend wie LSA-PADOKA und BE-PARDOK, hat aber drei wichtige
Unterschiede, die eine eigene Klasse statt einer PortalaAdapter-
Subklasse rechtfertigen:
1. Body-Schema: minimales lines mit l1/l2/l3/l4 (statt LSA/BE
2/3/4/10/11/20.x/90.x), serverrecordname=vorgang,
format=suchergebnis-vorgang-full, sort=SORT01/D SORT02/D SORT03,
keine parsed/json-Felder. Quelle: dokukratie/scrapers/portala.query.bw.json
plus HAR-Verifikation gegen die Live-Instanz.
2. Async polling: die initiale SearchAndDisplay-Antwort liefert nur
search_id mit status=running, KEINE report_id. Erst eine zweite
SearchAndDisplay-Anfrage mit id=<search_id> (ohne search-Component)
bekommt nach 1-3 Sekunden die report_id zurück. Reverse-engineered
aus esearch-ui.main.js requestReportOK() Z. ~1268.
3. Hit-Format: report.tt.html liefert Records als JSON-in-HTML-Comments
<!--{"WMV33":[...],"EWBV22":[...],...}-->. Komplett anderes Format
als LSA Perl-Dump oder BE HTML-Cards. Felder:
- EWBV22: "Drucksache 17/10323"
- EWBD05: direkter PDF-URL
- WMV33: Schlagworte (joined by ;)
- WMV30: Urheber-Kurzform
- EWBV23: "Antrag <Urheber> <DD.MM.YYYY>"
Smoke-Test (lokal):
BW q='': 8 hits in 17s, jüngste WP17-Anträge mit Datum + Fraktion
BW q='Schule': 8 hits, alle wirklich Schul-bezogen (Hochschule, Grundschule,
Schwimmunterricht, Lehrerbedarf etc.)
BW q='Klima': 8 hits, Klimaschutz/CO2/Energieberatung
get_document(17/10323): roundtrip funktioniert
bundeslaender.py: aktiv=True für BW; Anmerkung erweitert mit
PARLISAdapter-Verweis und drei-Unterschiede-Hinweis für künftige
Wartung. Test test_four_active_bundeslaender umbenannt zu
test_active_bundeslaender_include_phase_1_set, prüft jetzt nur
Subset-Bedingung statt exakter Count, damit Phase-1/2-Erweiterungen
keine Test-Updates brauchen.
Phase 1 (1/3) aus Roadmap-Issue #49.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 23:38:04 +02:00
|
|
|
|
"""Search recent BW Anträge with optional client-side title filter.
|
|
|
|
|
|
|
|
|
|
|
|
Server-side full-text is not used (#18 — einheitliches
|
|
|
|
|
|
Verhalten ohne Volltext bis alle Adapter es können). The
|
|
|
|
|
|
client filter looks at title (Schlagworte) + Urheber.
|
|
|
|
|
|
"""
|
|
|
|
|
|
from datetime import date, timedelta
|
|
|
|
|
|
|
|
|
|
|
|
end = date.today()
|
|
|
|
|
|
start = end - timedelta(days=self.date_window_days)
|
|
|
|
|
|
|
|
|
|
|
|
async with httpx.AsyncClient(
|
|
|
|
|
|
timeout=60,
|
|
|
|
|
|
follow_redirects=True,
|
|
|
|
|
|
headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
|
|
|
|
|
|
) as client:
|
|
|
|
|
|
try:
|
|
|
|
|
|
report_id = await self._initial_search_and_poll(
|
|
|
|
|
|
client, start.isoformat(), end.isoformat(),
|
|
|
|
|
|
)
|
|
|
|
|
|
if not report_id:
|
|
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
|
|
# Pull a generous chunk so the client-side filter has
|
|
|
|
|
|
# enough material to work with.
|
|
|
|
|
|
chunksize = max(limit * 10, 200) if query else max(limit * 2, 50)
|
|
|
|
|
|
report_url = (
|
|
|
|
|
|
f"{self.base_url}{self.prefix}/report.tt.html"
|
|
|
|
|
|
f"?report_id={report_id}&start=0&chunksize={chunksize}"
|
|
|
|
|
|
)
|
|
|
|
|
|
resp = await client.get(
|
|
|
|
|
|
report_url,
|
|
|
|
|
|
headers={"Referer": f"{self.base_url}{self.prefix}/browse.tt.html"},
|
|
|
|
|
|
)
|
|
|
|
|
|
if resp.status_code != 200:
|
|
|
|
|
|
logger.error("%s report HTTP %s", self.bundesland, resp.status_code)
|
|
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
|
|
results = self._parse_report_html(resp.text)
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
logger.exception("%s search error", self.bundesland)
|
|
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
|
|
# Client-side filter
|
|
|
|
|
|
if query:
|
|
|
|
|
|
terms = [t.lower() for t in query.split() if t]
|
|
|
|
|
|
results = [
|
|
|
|
|
|
d for d in results
|
|
|
|
|
|
if all(t in f"{d.title} {' '.join(d.fraktionen)}".lower() for t in terms)
|
|
|
|
|
|
]
|
|
|
|
|
|
return results[:limit]
|
|
|
|
|
|
|
2026-03-28 22:30:24 +01:00
|
|
|
|
async def get_document(self, drucksache: str) -> Optional[Drucksache]:
|
Activate Baden-Württemberg via PARLISAdapter (#29, Phase 1)
PARLIS auf parlis.landtag-bw.de läuft technisch auf demselben
eUI-Backend wie LSA-PADOKA und BE-PARDOK, hat aber drei wichtige
Unterschiede, die eine eigene Klasse statt einer PortalaAdapter-
Subklasse rechtfertigen:
1. Body-Schema: minimales lines mit l1/l2/l3/l4 (statt LSA/BE
2/3/4/10/11/20.x/90.x), serverrecordname=vorgang,
format=suchergebnis-vorgang-full, sort=SORT01/D SORT02/D SORT03,
keine parsed/json-Felder. Quelle: dokukratie/scrapers/portala.query.bw.json
plus HAR-Verifikation gegen die Live-Instanz.
2. Async polling: die initiale SearchAndDisplay-Antwort liefert nur
search_id mit status=running, KEINE report_id. Erst eine zweite
SearchAndDisplay-Anfrage mit id=<search_id> (ohne search-Component)
bekommt nach 1-3 Sekunden die report_id zurück. Reverse-engineered
aus esearch-ui.main.js requestReportOK() Z. ~1268.
3. Hit-Format: report.tt.html liefert Records als JSON-in-HTML-Comments
<!--{"WMV33":[...],"EWBV22":[...],...}-->. Komplett anderes Format
als LSA Perl-Dump oder BE HTML-Cards. Felder:
- EWBV22: "Drucksache 17/10323"
- EWBD05: direkter PDF-URL
- WMV33: Schlagworte (joined by ;)
- WMV30: Urheber-Kurzform
- EWBV23: "Antrag <Urheber> <DD.MM.YYYY>"
Smoke-Test (lokal):
BW q='': 8 hits in 17s, jüngste WP17-Anträge mit Datum + Fraktion
BW q='Schule': 8 hits, alle wirklich Schul-bezogen (Hochschule, Grundschule,
Schwimmunterricht, Lehrerbedarf etc.)
BW q='Klima': 8 hits, Klimaschutz/CO2/Energieberatung
get_document(17/10323): roundtrip funktioniert
bundeslaender.py: aktiv=True für BW; Anmerkung erweitert mit
PARLISAdapter-Verweis und drei-Unterschiede-Hinweis für künftige
Wartung. Test test_four_active_bundeslaender umbenannt zu
test_active_bundeslaender_include_phase_1_set, prüft jetzt nur
Subset-Bedingung statt exakter Count, damit Phase-1/2-Erweiterungen
keine Test-Updates brauchen.
Phase 1 (1/3) aus Roadmap-Issue #49.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 23:38:04 +02:00
|
|
|
|
"""Look up a single Drucksache by ID via a broad browse."""
|
|
|
|
|
|
results = await self.search(query="", limit=200)
|
|
|
|
|
|
for doc in results:
|
|
|
|
|
|
if doc.drucksache == drucksache:
|
|
|
|
|
|
return doc
|
2026-03-28 22:30:24 +01:00
|
|
|
|
return None
|
Activate Baden-Württemberg via PARLISAdapter (#29, Phase 1)
PARLIS auf parlis.landtag-bw.de läuft technisch auf demselben
eUI-Backend wie LSA-PADOKA und BE-PARDOK, hat aber drei wichtige
Unterschiede, die eine eigene Klasse statt einer PortalaAdapter-
Subklasse rechtfertigen:
1. Body-Schema: minimales lines mit l1/l2/l3/l4 (statt LSA/BE
2/3/4/10/11/20.x/90.x), serverrecordname=vorgang,
format=suchergebnis-vorgang-full, sort=SORT01/D SORT02/D SORT03,
keine parsed/json-Felder. Quelle: dokukratie/scrapers/portala.query.bw.json
plus HAR-Verifikation gegen die Live-Instanz.
2. Async polling: die initiale SearchAndDisplay-Antwort liefert nur
search_id mit status=running, KEINE report_id. Erst eine zweite
SearchAndDisplay-Anfrage mit id=<search_id> (ohne search-Component)
bekommt nach 1-3 Sekunden die report_id zurück. Reverse-engineered
aus esearch-ui.main.js requestReportOK() Z. ~1268.
3. Hit-Format: report.tt.html liefert Records als JSON-in-HTML-Comments
<!--{"WMV33":[...],"EWBV22":[...],...}-->. Komplett anderes Format
als LSA Perl-Dump oder BE HTML-Cards. Felder:
- EWBV22: "Drucksache 17/10323"
- EWBD05: direkter PDF-URL
- WMV33: Schlagworte (joined by ;)
- WMV30: Urheber-Kurzform
- EWBV23: "Antrag <Urheber> <DD.MM.YYYY>"
Smoke-Test (lokal):
BW q='': 8 hits in 17s, jüngste WP17-Anträge mit Datum + Fraktion
BW q='Schule': 8 hits, alle wirklich Schul-bezogen (Hochschule, Grundschule,
Schwimmunterricht, Lehrerbedarf etc.)
BW q='Klima': 8 hits, Klimaschutz/CO2/Energieberatung
get_document(17/10323): roundtrip funktioniert
bundeslaender.py: aktiv=True für BW; Anmerkung erweitert mit
PARLISAdapter-Verweis und drei-Unterschiede-Hinweis für künftige
Wartung. Test test_four_active_bundeslaender umbenannt zu
test_active_bundeslaender_include_phase_1_set, prüft jetzt nur
Subset-Bedingung statt exakter Count, damit Phase-1/2-Erweiterungen
keine Test-Updates brauchen.
Phase 1 (1/3) aus Roadmap-Issue #49.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 23:38:04 +02:00
|
|
|
|
|
2026-03-28 22:30:24 +01:00
|
|
|
|
async def download_text(self, drucksache: str) -> Optional[str]:
|
Activate Baden-Württemberg via PARLISAdapter (#29, Phase 1)
PARLIS auf parlis.landtag-bw.de läuft technisch auf demselben
eUI-Backend wie LSA-PADOKA und BE-PARDOK, hat aber drei wichtige
Unterschiede, die eine eigene Klasse statt einer PortalaAdapter-
Subklasse rechtfertigen:
1. Body-Schema: minimales lines mit l1/l2/l3/l4 (statt LSA/BE
2/3/4/10/11/20.x/90.x), serverrecordname=vorgang,
format=suchergebnis-vorgang-full, sort=SORT01/D SORT02/D SORT03,
keine parsed/json-Felder. Quelle: dokukratie/scrapers/portala.query.bw.json
plus HAR-Verifikation gegen die Live-Instanz.
2. Async polling: die initiale SearchAndDisplay-Antwort liefert nur
search_id mit status=running, KEINE report_id. Erst eine zweite
SearchAndDisplay-Anfrage mit id=<search_id> (ohne search-Component)
bekommt nach 1-3 Sekunden die report_id zurück. Reverse-engineered
aus esearch-ui.main.js requestReportOK() Z. ~1268.
3. Hit-Format: report.tt.html liefert Records als JSON-in-HTML-Comments
<!--{"WMV33":[...],"EWBV22":[...],...}-->. Komplett anderes Format
als LSA Perl-Dump oder BE HTML-Cards. Felder:
- EWBV22: "Drucksache 17/10323"
- EWBD05: direkter PDF-URL
- WMV33: Schlagworte (joined by ;)
- WMV30: Urheber-Kurzform
- EWBV23: "Antrag <Urheber> <DD.MM.YYYY>"
Smoke-Test (lokal):
BW q='': 8 hits in 17s, jüngste WP17-Anträge mit Datum + Fraktion
BW q='Schule': 8 hits, alle wirklich Schul-bezogen (Hochschule, Grundschule,
Schwimmunterricht, Lehrerbedarf etc.)
BW q='Klima': 8 hits, Klimaschutz/CO2/Energieberatung
get_document(17/10323): roundtrip funktioniert
bundeslaender.py: aktiv=True für BW; Anmerkung erweitert mit
PARLISAdapter-Verweis und drei-Unterschiede-Hinweis für künftige
Wartung. Test test_four_active_bundeslaender umbenannt zu
test_active_bundeslaender_include_phase_1_set, prüft jetzt nur
Subset-Bedingung statt exakter Count, damit Phase-1/2-Erweiterungen
keine Test-Updates brauchen.
Phase 1 (1/3) aus Roadmap-Issue #49.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 23:38:04 +02:00
|
|
|
|
"""Download the PDF for a Drucksache and extract its text."""
|
|
|
|
|
|
import fitz # PyMuPDF
|
|
|
|
|
|
|
|
|
|
|
|
doc = await self.get_document(drucksache)
|
|
|
|
|
|
if not doc or not doc.link:
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
async with httpx.AsyncClient(
|
|
|
|
|
|
timeout=60,
|
|
|
|
|
|
follow_redirects=True,
|
|
|
|
|
|
headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
|
|
|
|
|
|
) as client:
|
|
|
|
|
|
try:
|
|
|
|
|
|
resp = await client.get(doc.link)
|
|
|
|
|
|
if resp.status_code != 200:
|
|
|
|
|
|
logger.error(
|
|
|
|
|
|
"%s PDF HTTP %s for %s (%s)",
|
|
|
|
|
|
self.bundesland, resp.status_code, drucksache, doc.link,
|
|
|
|
|
|
)
|
|
|
|
|
|
return None
|
|
|
|
|
|
pdf = fitz.open(stream=resp.content, filetype="pdf")
|
|
|
|
|
|
text = ""
|
|
|
|
|
|
for page in pdf:
|
|
|
|
|
|
text += page.get_text()
|
|
|
|
|
|
pdf.close()
|
|
|
|
|
|
return text
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
logger.exception("%s PDF download error for %s", self.bundesland, drucksache)
|
|
|
|
|
|
return None
|
2026-03-28 22:30:24 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Registry of adapters
|
|
|
|
|
|
ADAPTERS = {
|
|
|
|
|
|
"NRW": NRWAdapter(),
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
"LSA": PortalaAdapter(
|
|
|
|
|
|
bundesland="LSA",
|
|
|
|
|
|
name="Landtag von Sachsen-Anhalt (PADOKA)",
|
|
|
|
|
|
base_url="https://padoka.landtag.sachsen-anhalt.de",
|
|
|
|
|
|
db_id="lsa.lissh",
|
|
|
|
|
|
wahlperiode=8,
|
|
|
|
|
|
portala_path="/portal",
|
|
|
|
|
|
document_type="Antrag",
|
|
|
|
|
|
pdf_url_prefix="/files/",
|
|
|
|
|
|
),
|
|
|
|
|
|
"BE": PortalaAdapter(
|
|
|
|
|
|
bundesland="BE",
|
|
|
|
|
|
name="Abgeordnetenhaus von Berlin (PARDOK)",
|
|
|
|
|
|
base_url="https://pardok.parlament-berlin.de",
|
|
|
|
|
|
db_id="lah.lissh",
|
|
|
|
|
|
wahlperiode=19,
|
|
|
|
|
|
portala_path="/portala",
|
|
|
|
|
|
# Berlin's ETYPF index uses different value strings — drop the
|
|
|
|
|
|
# document_type subtree, fall back to client-side title filter.
|
|
|
|
|
|
document_type=None,
|
2026-04-08 13:58:34 +02:00
|
|
|
|
# Quick-win for #13: pulled the date window from the original
|
|
|
|
|
|
# 180-day MVP up to 730 days so client-side title-filter searches
|
|
|
|
|
|
# ("Schule" etc.) reach back across more of the WP19 corpus until
|
|
|
|
|
|
# the eUI fulltext-sf is reverse-engineered. The chunksize bump
|
|
|
|
|
|
# in PortalaAdapter.search() means the per-request payload stays
|
|
|
|
|
|
# bounded.
|
|
|
|
|
|
date_window_days=730,
|
Activate Berlin (PARDOK) — search-only MVP (#3)
PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.
PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
moved into the constructor. New optional parameters:
- portala_path: "/portal" for LSA, "/portala" for Berlin
- document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
index uses different value strings; the document_type subtree
is dropped from the action.search.json tree)
- pdf_url_prefix: "/files/" by default; absolute URLs in the hit
list are passed through unchanged (Berlin embeds full
starweb/adis/citat/... links)
- date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
document_type is None, the entire ETYPF/DTYPF/DART subtree is
omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
(existing parser, untouched).
2. Berlin-style: production HTML cards with efxRecordRepeater
divs, h3 titles, h6 metadata lines containing the document
type, drucksachen-id and date, plus a direct <a href="…pdf">
to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
_normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
long-standing bug where comma-separated fraction lists like
"CDU, SPD" failed to match CDU. Also picks up BSW for the
Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.
bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
Lücke and the auto-detected hit-list format.
Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
regression from the refactor.
Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
embeddings DB. The 2023 PDFs are no longer linked from the live
party websites (which currently feature 2026 draft programmes), and
Wayback has no snapshots. The analyzer therefore falls back to
bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
the 2023 PDFs are sourced manually.
Refs #3.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
|
|
|
|
pdf_url_prefix="/files/",
|
|
|
|
|
|
),
|
2026-04-08 08:19:48 +02:00
|
|
|
|
"MV": ParLDokAdapter(
|
|
|
|
|
|
bundesland="MV",
|
|
|
|
|
|
name="Landtag Mecklenburg-Vorpommern (ParlDok)",
|
|
|
|
|
|
base_url="https://www.dokumentation.landtag-mv.de",
|
|
|
|
|
|
wahlperiode=8,
|
|
|
|
|
|
prefix="/parldok",
|
|
|
|
|
|
document_typ="Antrag",
|
|
|
|
|
|
),
|
Activate Hamburg via ParLDokAdapter reuse (#28, Phase 1)
Hamburg's parldok runs ParlDok 8.3.1 (J3S GmbH) — kompatibel mit
der MV-Variante (8.3.5). Selber /parldok/Fulltext/Search-Endpoint,
selbe Body-Schema, selbes Hit-Format. Dadurch ist der existierende
ParLDokAdapter aus #4 ohne Code-Änderungen wiederverwendbar.
Eingetragen wurde nur:
- ADAPTERS["HH"] = ParLDokAdapter(base_url=buergerschaft-hh.de,
wahlperiode=23, prefix=/parldok, document_typ="Antrag")
- bundeslaender.py::HH.aktiv = True
Smoke-Test (lokal):
HH q="": 8 hits in 1.5s, jüngste WP23-Anträge sortiert newest-first
HH q="Schule": 1 hit in 13.2s (HH ist klein, WP23 erst seit März 2025,
HH nutzt eher "Kita"/"Bildung"/"Lehrkräfte" im Titel)
HH q="Klima": 2 hits
Verifikation HH ist 8.x:
curl https://www.buergerschaft-hh.de/parldok/ | grep generator
→ "ParlDok 8.3.1, entwickelt von der J3S GmbH"
Dies ist der zweite Phase-1-Win — ein nahezu kostenloser Adapter-
Reuse weil das Backend identisch ist. Anders als BW (#29), das eine
eigene PARLISAdapter-Klasse brauchte, braucht HH gar keinen neuen
Code.
Phase 1 (2/3) aus Roadmap-Issue #49.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 23:41:23 +02:00
|
|
|
|
"HH": ParLDokAdapter(
|
|
|
|
|
|
bundesland="HH",
|
|
|
|
|
|
name="Hamburgische Bürgerschaft (ParlDok)",
|
|
|
|
|
|
base_url="https://www.buergerschaft-hh.de",
|
|
|
|
|
|
wahlperiode=23,
|
|
|
|
|
|
prefix="/parldok",
|
|
|
|
|
|
document_typ="Antrag",
|
|
|
|
|
|
),
|
2026-03-28 22:30:24 +01:00
|
|
|
|
"BY": BayernAdapter(),
|
Activate Baden-Württemberg via PARLISAdapter (#29, Phase 1)
PARLIS auf parlis.landtag-bw.de läuft technisch auf demselben
eUI-Backend wie LSA-PADOKA und BE-PARDOK, hat aber drei wichtige
Unterschiede, die eine eigene Klasse statt einer PortalaAdapter-
Subklasse rechtfertigen:
1. Body-Schema: minimales lines mit l1/l2/l3/l4 (statt LSA/BE
2/3/4/10/11/20.x/90.x), serverrecordname=vorgang,
format=suchergebnis-vorgang-full, sort=SORT01/D SORT02/D SORT03,
keine parsed/json-Felder. Quelle: dokukratie/scrapers/portala.query.bw.json
plus HAR-Verifikation gegen die Live-Instanz.
2. Async polling: die initiale SearchAndDisplay-Antwort liefert nur
search_id mit status=running, KEINE report_id. Erst eine zweite
SearchAndDisplay-Anfrage mit id=<search_id> (ohne search-Component)
bekommt nach 1-3 Sekunden die report_id zurück. Reverse-engineered
aus esearch-ui.main.js requestReportOK() Z. ~1268.
3. Hit-Format: report.tt.html liefert Records als JSON-in-HTML-Comments
<!--{"WMV33":[...],"EWBV22":[...],...}-->. Komplett anderes Format
als LSA Perl-Dump oder BE HTML-Cards. Felder:
- EWBV22: "Drucksache 17/10323"
- EWBD05: direkter PDF-URL
- WMV33: Schlagworte (joined by ;)
- WMV30: Urheber-Kurzform
- EWBV23: "Antrag <Urheber> <DD.MM.YYYY>"
Smoke-Test (lokal):
BW q='': 8 hits in 17s, jüngste WP17-Anträge mit Datum + Fraktion
BW q='Schule': 8 hits, alle wirklich Schul-bezogen (Hochschule, Grundschule,
Schwimmunterricht, Lehrerbedarf etc.)
BW q='Klima': 8 hits, Klimaschutz/CO2/Energieberatung
get_document(17/10323): roundtrip funktioniert
bundeslaender.py: aktiv=True für BW; Anmerkung erweitert mit
PARLISAdapter-Verweis und drei-Unterschiede-Hinweis für künftige
Wartung. Test test_four_active_bundeslaender umbenannt zu
test_active_bundeslaender_include_phase_1_set, prüft jetzt nur
Subset-Bedingung statt exakter Count, damit Phase-1/2-Erweiterungen
keine Test-Updates brauchen.
Phase 1 (1/3) aus Roadmap-Issue #49.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 23:38:04 +02:00
|
|
|
|
"BW": PARLISAdapter(
|
|
|
|
|
|
bundesland="BW",
|
|
|
|
|
|
name="Landtag von Baden-Württemberg (PARLIS)",
|
|
|
|
|
|
base_url="https://parlis.landtag-bw.de",
|
|
|
|
|
|
wahlperiode=17,
|
|
|
|
|
|
prefix="/parlis",
|
|
|
|
|
|
document_typ="Antrag",
|
|
|
|
|
|
),
|
2026-03-28 22:30:24 +01:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_adapter(bundesland: str) -> Optional[ParlamentAdapter]:
|
|
|
|
|
|
"""Get adapter for a bundesland."""
|
|
|
|
|
|
return ADAPTERS.get(bundesland)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def search_all(query: str, bundesland: str = "NRW", limit: int = 20) -> list[Drucksache]:
|
|
|
|
|
|
"""Search parliament documents in a specific state."""
|
|
|
|
|
|
adapter = get_adapter(bundesland)
|
|
|
|
|
|
if not adapter:
|
|
|
|
|
|
return []
|
|
|
|
|
|
return await adapter.search(query, limit)
|