gwoe-antragspruefer/app/parlamente.py

1340 lines
54 KiB
Python
Raw Normal View History

"""Parliament search adapters for different German states."""
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
import json
import logging
import httpx
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional
from bs4 import BeautifulSoup
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
logger = logging.getLogger(__name__)
@dataclass
class Drucksache:
"""A parliamentary document."""
drucksache: str # e.g. "18/8125"
title: str
fraktionen: list[str]
datum: str # ISO date
link: str # PDF URL
bundesland: str
typ: str = "Antrag" # Antrag, Anfrage, Beschlussempfehlung, etc.
class ParlamentAdapter(ABC):
"""Base adapter for searching parliament documents."""
bundesland: str
name: str
@abstractmethod
async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
"""Search for documents matching query."""
pass
@abstractmethod
async def get_document(self, drucksache: str) -> Optional[Drucksache]:
"""Get a specific document by ID."""
pass
@abstractmethod
async def download_text(self, drucksache: str) -> Optional[str]:
"""Download and extract text from a document."""
pass
class NRWAdapter(ParlamentAdapter):
"""Adapter for NRW Landtag (opal.landtag.nrw.de)."""
bundesland = "NRW"
name = "Landtag Nordrhein-Westfalen"
base_url = "https://opal.landtag.nrw.de"
search_url = "https://opal.landtag.nrw.de/home/dokumente/dokumentensuche/parlamentsdokumente/aktuelle-dokumente.html"
def _parse_query(self, query: str) -> tuple[str, list[str], bool]:
"""
Parse search query for AND logic and exact phrases.
Returns: (search_term_for_api, filter_terms, is_exact)
Examples:
- 'Klimaschutz Energie' -> ('Klimaschutz', ['klimaschutz', 'energie'], False)
- '"Grüner Stahl"' -> ('Grüner Stahl', ['grüner stahl'], True)
- 'Klimaschutz "erneuerbare Energie"' -> ('Klimaschutz', ['klimaschutz', 'erneuerbare energie'], False)
"""
query = query.strip()
# Check for exact phrase (entire query in quotes)
if query.startswith('"') and query.endswith('"') and query.count('"') == 2:
exact = query[1:-1].strip()
return (exact, [exact.lower()], True)
# Extract quoted phrases and regular terms
import shlex
try:
parts = shlex.split(query)
except ValueError:
# Fallback for unbalanced quotes
parts = query.split()
if not parts:
return (query, [query.lower()], False)
# Use first term for API search, all terms for filtering
filter_terms = [p.lower() for p in parts]
return (parts[0], filter_terms, False)
def _matches_all_terms(self, doc: 'Drucksache', terms: list[str], is_exact: bool) -> bool:
"""Check if document matches all search terms (AND logic)."""
searchable = f"{doc.title} {doc.drucksache} {' '.join(doc.fraktionen)} {doc.typ}".lower()
if is_exact:
# Exact phrase must appear
return terms[0] in searchable
else:
# All terms must appear (AND)
return all(term in searchable for term in terms)
async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
"""Search NRW Landtag documents via OPAL portal."""
results = []
# Parse query for AND logic
api_query, filter_terms, is_exact = self._parse_query(query)
async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
try:
# First, get the page to establish session
initial = await client.get(self.search_url)
if initial.status_code != 200:
print(f"NRW search initial request failed: {initial.status_code}")
return []
# Parse for webflow token from pagination links
soup = BeautifulSoup(initial.text, 'html.parser')
# Find a pagination link to extract the webflow token
pagination_link = soup.select_one('a[href*="webflowexecution"]')
webflow_token = ""
webflow_execution = ""
if pagination_link:
href = pagination_link.get('href', '')
# Extract webflowToken and webflowexecution from URL
token_match = re.search(r'webflowToken=([^&]*)', href)
exec_match = re.search(r'(webflowexecution[^=]+)=([^&]+)', href)
if token_match:
webflow_token = token_match.group(1)
if exec_match:
webflow_execution = f"{exec_match.group(1)}={exec_match.group(2)}"
# Now perform the search with POST
# Find the form action URL with webflow token
form = soup.select_one('form#docSearchByItem')
form_action = self.search_url
if form and form.get('action'):
action = form.get('action')
if action.startswith('/'):
form_action = f"{self.base_url}{action}"
elif action.startswith('http'):
form_action = action
else:
form_action = f"{self.search_url}?{action}"
# Build form data for "Einfache Suche" (searchByItem form)
form_data = {
'_eventId_sendform': '1',
'dokNum': api_query, # This is the text search field
'formId': 'searchByItem',
'dokTyp': '', # All types
'wp': '18', # Wahlperiode 18
}
# POST request with form data to the form action URL
search_resp = await client.post(
form_action,
data=form_data,
cookies=initial.cookies,
headers={'Content-Type': 'application/x-www-form-urlencoded'}
)
if search_resp.status_code != 200:
print(f"NRW search request failed: {search_resp.status_code}")
return []
# Parse results
soup = BeautifulSoup(search_resp.text, 'html.parser')
# Find all document result items (li elements containing articles)
items = soup.select('li:has(article)')
for item in items[:limit]:
try:
# Extract drucksache number from first link
num_link = item.select_one('a[href*="MMD"]')
if not num_link:
continue
href = num_link.get('href', '')
# Extract number: MMD18-12345.pdf -> 18/12345
match = re.search(r'MMD(\d+)-(\d+)\.pdf', href)
if not match:
continue
legislatur, nummer = match.groups()
drucksache = f"{legislatur}/{nummer}"
pdf_url = f"https://www.landtag.nrw.de{href}" if href.startswith('/') else href
# Extract title from the title link (class e-document-result-item__title)
title_elem = item.select_one('a.e-document-result-item__title')
if title_elem:
# Get text content, clean it up
title = title_elem.get_text(strip=True)
# Remove SVG icon text and clean
title = re.sub(r'\s*<svg.*', '', title)
title = re.sub(r'\s+', ' ', title).strip()
else:
# Fallback: try to find any longer text
title = f"Drucksache {drucksache}"
# Clean up common artifacts
title = re.sub(r'\s*\(\s*externer Link.*?\)', '', title).strip()
# Extract type (Antrag, Kleine Anfrage, etc.)
typ_elem = item.select_one('.e-document-result-item__category')
typ = typ_elem.get_text(strip=True) if typ_elem else "Drucksache"
# Extract date
time_elem = item.select_one('time')
datum = ""
if time_elem:
datum_text = time_elem.get_text(strip=True)
# Convert DD.MM.YYYY to YYYY-MM-DD
date_match = re.match(r'(\d{2})\.(\d{2})\.(\d{4})', datum_text)
if date_match:
d, m, y = date_match.groups()
datum = f"{y}-{m}-{d}"
# Extract Urheber (fraktionen) - look for paragraph containing "Urheber:"
urheber_text = ""
for p in item.select('p'):
if 'Urheber:' in p.get_text():
urheber_text = p.get_text()
break
fraktionen = []
if urheber_text:
# Extract party names (SPD, CDU, GRÜNE, FDP, AfD)
for party in ['SPD', 'CDU', 'GRÜNE', 'Grüne', 'FDP', 'AfD']:
if party in urheber_text:
fraktionen.append(party.upper() if party.lower() != 'grüne' else 'GRÜNE')
doc = Drucksache(
drucksache=drucksache,
title=title,
fraktionen=fraktionen,
datum=datum,
link=pdf_url,
bundesland="NRW",
typ=typ,
)
# Apply AND filter (all terms must match)
if self._matches_all_terms(doc, filter_terms, is_exact):
results.append(doc)
except Exception as e:
print(f"Error parsing item: {e}")
continue
except Exception as e:
print(f"NRW search error: {e}")
return results
async def get_document(self, drucksache: str) -> Optional[Drucksache]:
"""Get document metadata by drucksache ID (e.g. '18/8125')."""
# Parse legislatur and number
match = re.match(r"(\d+)/(\d+)", drucksache)
if not match:
return None
legislatur, nummer = match.groups()
pdf_url = f"https://www.landtag.nrw.de/portal/WWW/dokumentenarchiv/Dokument/MMD{legislatur}-{nummer}.pdf"
# Try to fetch and extract basic info
async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
try:
resp = await client.head(pdf_url)
if resp.status_code == 200:
return Drucksache(
drucksache=drucksache,
title=f"Drucksache {drucksache}",
fraktionen=[],
datum="",
link=pdf_url,
bundesland="NRW",
)
except:
pass
return None
async def download_text(self, drucksache: str) -> Optional[str]:
"""Download PDF and extract text."""
import fitz # PyMuPDF
doc = await self.get_document(drucksache)
if not doc:
return None
async with httpx.AsyncClient(timeout=60, follow_redirects=True) as client:
try:
resp = await client.get(doc.link)
if resp.status_code != 200:
return None
# Extract text with PyMuPDF
pdf = fitz.open(stream=resp.content, filetype="pdf")
text = ""
for page in pdf:
text += page.get_text()
pdf.close()
return text
except Exception as e:
print(f"Error downloading {drucksache}: {e}")
return None
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
class PortalaAdapter(ParlamentAdapter):
"""Adapter for portala/eUI-based parliament documentation systems.
Used by parliaments running the proprietary "esearch" / portala framework
(originally developed for STAR/StarFinder backends, now wrapped in a
Single-Page App with Template Toolkit on the server side):
- **LSA** (Sachsen-Anhalt) PADOKA at ``padoka.landtag.sachsen-anhalt.de``
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
under ``/portal/`` (singular)
- **BE** (Berlin) PARDOK at ``pardok.parlament-berlin.de`` under
``/portala/`` (with the trailing 'a')
Both instances share the same JSON action schema, only the base URL,
the data source ID, the application path prefix and a few minor
quirks differ those are constructor parameters so that the same
class can serve both states (and any future portala-based parliament).
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
The search workflow is two-stage:
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
1. ``POST {base}{path}/browse.tt.json`` with a complex JSON ``action``
body that contains an Elasticsearch-style query tree under
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
``search.json``. The server returns a ``report_id`` plus hit count.
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
2. ``POST {base}{path}/report.tt.html`` with ``{report_id, start,
chunksize}`` to fetch the HTML hit list. Each hit carries a Perl
Data::Dumper block in a ``<pre>`` tag with the canonical metadata.
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
The query body schema was reverse-engineered from
https://github.com/okfde/dokukratie/blob/main/dokukratie/scrapers/portala.query.json
(GPL-3.0 only structure/selectors are reused, not Python code).
Full-text search is **not** implemented in the MVP: the adapter
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
returns documents of the current Wahlperiode in the given date
window, and the search query is applied as a client-side
title/Urheber filter. The server-side full-text path requires
state-specific ``sf`` index names that are not yet known.
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
"""
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
def __init__(
self,
*,
bundesland: str,
name: str,
base_url: str,
db_id: str,
wahlperiode: int,
portala_path: str = "/portal",
document_type: Optional[str] = "Antrag",
pdf_url_prefix: str = "/files/",
date_window_days: int = 730,
) -> None:
"""Configure a portala/eUI adapter for one specific parliament.
Args:
bundesland: state code (e.g. ``"LSA"``, ``"BE"``).
name: human-readable adapter label (used in logs/UI).
base_url: ``https://...`` of the portal host without trailing slash.
db_id: data source identifier the eUI server expects in
``action.sources``, e.g. ``"lsa.lissh"`` or ``"lah.lissh"``.
wahlperiode: current legislative period fed into the WP
term of the search tree.
portala_path: path prefix where the portala app lives. ``/portal``
for LSA, ``/portala`` for Berlin.
document_type: optional filter applied via ETYPF/DTYPF/DART
terms. ``"Antrag"`` works for LSA; for instances where
the index uses different document_type values (e.g. Berlin),
pass ``None`` to drop the document_type subtree entirely
the user can still filter client-side by title.
pdf_url_prefix: URL fragment between ``base_url`` and the
relative PDF path returned by the server.
date_window_days: how many days back ``search()`` looks by
default.
"""
self.bundesland = bundesland
self.name = name
self.base_url = base_url.rstrip("/")
self.db_id = db_id
self.wahlperiode = wahlperiode
self.portala_path = "/" + portala_path.strip("/")
self.document_type = document_type
self.pdf_url_prefix = "/" + pdf_url_prefix.strip("/") + "/"
self.date_window_days = date_window_days
# ── LSA-style hit list (Perl Data::Dumper inside <pre> blocks) ──
# Reverse-engineered "WEV*" record fields:
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
# WEV06.main = title
# WEV32.5 = relative PDF path
# WEV32.main = "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b> ..."
_RE_TITLE = re.compile(r"'WEV06'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
_RE_PDF = re.compile(r"'5'\s*=>\s*'([^']*\.pdf)'")
_RE_DRUCKSACHE = re.compile(r"Drucksache\s*<b>(\d+/\d+)</b>")
_RE_URHEBER_DATUM = re.compile(
r"'WEV32'\s*=>\s*\[\s*\{[^}]*'main'\s*=>\s*[\"']Antrag\s+(.+?)\s+(\d{1,2}\.\d{1,2}\.\d{4})\s+Drucksache",
)
_RE_PRE_BLOCK = re.compile(r'<pre>\$VAR1 = (.*?)</pre>', re.DOTALL)
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
# ── Berlin-style hit list (production HTML cards, no Perl dump) ──
# The whole div for one record:
_RE_BE_RECORD = re.compile(
r'<div[^>]*class="[^"]*efxRecordRepeater[^"]*"[^>]*data-efx-rec="[^"]*"[^>]*>(.*?)(?=<div[^>]*efxRecordRepeater|<div[^>]*id="efxResultsEnd"|</main>|$)',
re.DOTALL,
)
_RE_BE_TITLE = re.compile(r'<h3[^>]*class="h5[^"]*"[^>]*>\s*<span>([^<]+)</span>')
_RE_BE_LINK = re.compile(r'<a[^>]*href="([^"]+\.pdf)"[^>]*>')
# The metadata h6 looks like:
# <span class="h6">Antrag (Eilantrag) &nbsp;<a ...>Drucksache 19/3104</a> S. 1 bis 24 vom 31.03.2026</span>
_RE_BE_DRUCKSACHE = re.compile(r'Drucksache\s+(\d+/\d+)')
_RE_BE_DATUM = re.compile(r'vom\s+(\d{1,2}\.\d{1,2}\.\d{4})')
_RE_BE_DOCTYPE = re.compile(r'<span class="h6">\s*([^<&]+?)(?:&nbsp;|<)')
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
@staticmethod
def _decode_perl_hex(s: str) -> str:
"""Decode \\x{abcd} escape sequences from Perl Data::Dumper output."""
return re.sub(r'\\x\{([0-9a-f]+)\}', lambda m: chr(int(m.group(1), 16)), s)
@staticmethod
def _normalize_fraktion(urheber: str) -> list[str]:
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
"""Map Urheber-String to canonical fraction codes.
Uses regex word boundaries instead of plain substring matching so
that comma-separated lists ("CDU, SPD") and the embedded "DIE
LINKE" are matched reliably.
"""
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
u = urheber.upper()
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
out: list[str] = []
def has(pattern: str) -> bool:
return re.search(pattern, u) is not None
if has(r"\bBÜNDNIS\s*90\b") or has(r"\bGR(?:Ü|UE)NE\b"):
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
out.append("GRÜNE")
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
if has(r"\bCDU\b"):
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
out.append("CDU")
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
if has(r"\bSPD\b"):
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
out.append("SPD")
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
if has(r"\bFDP\b"):
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
out.append("FDP")
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
if has(r"\bAFD\b"):
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
out.append("AfD")
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
if has(r"\bLINKE\b"):
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
out.append("LINKE")
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
if has(r"\bBSW\b"):
out.append("BSW")
if has(r"LANDESREGIERUNG|SENAT VON BERLIN|REGIERENDE[RN]?\s+BÜRGERMEISTER|MINISTER\b|STAATSKANZLEI"):
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
out.append("Landesregierung")
return out
def _build_search_body(
self,
wahlperiode: int,
start_date: str,
end_date: str,
) -> dict:
"""Build the action JSON body for browse.tt.json.
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
The schema is taken from dokukratie's portala.query.json template
and only differs in the data source and the variable substitutions.
When ``self.document_type`` is None, the ETYPF/DTYPF/DART subtree
is dropped useful for parliaments whose ETYPF index uses
different value strings than ``"Antrag"``.
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
"""
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
document_type = self.document_type
date_range_text = f"{start_date} THRU {end_date}"
date_term = lambda sf, num: { # noqa: E731 — local helper
"tn": "trange", "sf": sf, "op": "eq", "num": num,
"idx": 119, "l": 3,
"p1": start_date, "t1": start_date,
"p2": end_date, "t2": end_date,
"t": date_range_text,
}
# Build the search.lines (form-state mirror) and the json tree
lines: dict = {
"2": str(wahlperiode),
"10": start_date,
"11": end_date,
"20.1": "alWEBBI",
"20.2": "alWEBBI",
"20.3": "alWEBBI",
"90.1": "AND",
"90.2": "AND",
"90.3": "AND",
}
if document_type is not None:
lines["3"] = document_type
lines["4"] = "D"
# Top-level AND tree
top_terms: list = [
{"tn": "term", "t": str(wahlperiode), "idx": 6, "l": 3,
"sf": "WP", "op": "eq", "num": 5},
]
if document_type is not None:
top_terms.append({"tn": "or", "num": 3, "terms": [
{"tn": "or", "num": 4, "terms": [
{"tn": "term", "t": f'"{document_type}"', "idx": 50,
"l": 4, "sf": "ETYPF", "op": "eq", "num": 10},
{"tn": "term", "t": f'"{document_type}"', "idx": 50,
"l": 4, "sf": "ETYP2F", "op": "eq", "num": 11},
{"tn": "term", "t": f'"{document_type}"', "idx": 50,
"l": 4, "sf": "DTYPF", "op": "eq", "num": 12},
{"tn": "term", "t": f'"{document_type}"', "idx": 50,
"l": 4, "sf": "DTYP2F", "op": "eq", "num": 13},
{"tn": "term", "t": f'"{document_type}"', "idx": 50,
"l": 4, "sf": "1VTYPF", "op": "eq", "num": 14},
]},
{"tn": "or", "num": 15, "terms": [
{"tn": "term", "t": '"D"', "idx": 93, "l": 4,
"sf": "DART", "op": "eq", "num": 16},
{"tn": "term", "t": '"D"', "idx": 93, "l": 4,
"sf": "DARTS", "op": "eq", "num": 17},
]},
]})
top_terms.append({"tn": "or", "num": 18, "terms": [
{"tn": "or", "num": 19, "terms": [
date_term("DAT", 20),
date_term("DDAT", 21),
]},
date_term("SDAT", 22),
]})
top_terms.append({"tn": "term", "t": "DOKDBE", "idx": 156, "l": 1,
"sf": "TYP", "op": "eq", "num": 23})
# Mirror the same shape into the parsed/sref display strings
if document_type is not None:
parsed = (
f"((/WP {wahlperiode}) AND "
f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) "
f"AND (/DART,DARTS (\"D\")) AND "
f"(DAT,DDAT,SDAT= {date_range_text})) AND TYP=DOKDBE"
)
else:
parsed = (
f"((/WP {wahlperiode}) AND "
f"(DAT,DDAT,SDAT= {date_range_text})) AND TYP=DOKDBE"
)
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
return {
"action": "SearchAndDisplay",
"sources": [self.db_id],
"report": {
"rhl": "main",
"rhlmode": "add",
"format": "generic1-full",
"mime": "html",
"sort": "WEVSO1/D WEVSO2 WEVSO3",
},
"search": {
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
"lines": lines,
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
"serverrecordname": "sr_generic1",
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
"parsed": parsed,
"sref": parsed,
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
"json": [{
"tn": "and",
"num": 1,
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
"terms": top_terms,
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
}],
},
"dataSet": "1",
}
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
@staticmethod
def _datum_de_to_iso(datum_de: str) -> str:
"""Convert DD.MM.YYYY → YYYY-MM-DD; return '' for empty input."""
if not datum_de:
return ""
d, m, y = datum_de.split(".")
return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
def _parse_hit_list_html(self, html: str, query_filter: str = "") -> list[Drucksache]:
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
"""Extract Drucksachen from a report.tt.html response.
Two formats are supported and auto-detected:
- **LSA-style:** the records are embedded as Perl Data::Dumper
dumps inside ``<pre>$VAR1 = </pre>`` blocks. WEV06 title,
WEV32 metadata + PDF path. Used by Sachsen-Anhalt's PADOKA
template.
- **Berlin-style:** standard production HTML cards with
``efxRecordRepeater`` divs. Title in an ``<h3 class="h5">``,
metadata + PDF link in an ``<span class="h6">``. Used by
Berlin's PARDOK template.
"""
if self._RE_PRE_BLOCK.search(html):
return self._parse_hit_list_dump(html, query_filter)
return self._parse_hit_list_cards(html, query_filter)
def _parse_hit_list_dump(self, html: str, query_filter: str) -> list[Drucksache]:
"""Parse LSA-style ``<pre>$VAR1 = …</pre>`` Perl-dump records."""
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
results: list[Drucksache] = []
for pre in self._RE_PRE_BLOCK.findall(html):
m_ds = self._RE_DRUCKSACHE.search(pre)
if not m_ds:
continue
drucksache = m_ds.group(1)
m_t = self._RE_TITLE.search(pre)
title = self._decode_perl_hex(m_t.group(1)) if m_t else f"Drucksache {drucksache}"
m_pdf = self._RE_PDF.search(pre)
pdf_rel = m_pdf.group(1) if m_pdf else ""
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
pdf_url = f"{self.base_url}{self.pdf_url_prefix}{pdf_rel}" if pdf_rel else ""
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
m_w32 = self._RE_URHEBER_DATUM.search(pre)
urheber = self._decode_perl_hex(m_w32.group(1).strip()) if m_w32 else ""
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
datum_iso = self._datum_de_to_iso(m_w32.group(2) if m_w32 else "")
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
fraktionen = self._normalize_fraktion(urheber) if urheber else []
doc = Drucksache(
drucksache=drucksache,
title=title,
fraktionen=fraktionen,
datum=datum_iso,
link=pdf_url,
bundesland=self.bundesland,
typ="Antrag",
)
if query_filter:
hay = f"{title} {urheber}".lower()
if not all(t in hay for t in query_filter.lower().split()):
continue
results.append(doc)
return results
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
def _parse_hit_list_cards(self, html: str, query_filter: str) -> list[Drucksache]:
"""Parse Berlin-style ``efxRecordRepeater`` HTML-card records.
Each card contains an ``<h3>`` title, a metadata ``<span class="h6">``
with the document type, the Drucksachen-Nummer, and the date,
plus a direct ``<a href="…pdf">`` link to the PDF on the same host.
"""
results: list[Drucksache] = []
# Split the HTML on every record-div opener — easier than balancing
# divs with regex.
chunks = html.split('class="record')
# First chunk is the prelude, skip it
for chunk in chunks[1:]:
# Each chunk now starts at the record class attribute
m_t = self._RE_BE_TITLE.search(chunk)
title = m_t.group(1).strip() if m_t else "Ohne Titel"
m_ds = self._RE_BE_DRUCKSACHE.search(chunk)
if not m_ds:
continue
drucksache = m_ds.group(1)
m_pdf = self._RE_BE_LINK.search(chunk)
pdf_url = ""
if m_pdf:
href = m_pdf.group(1)
if href.startswith("http://") or href.startswith("https://"):
pdf_url = href
elif href.startswith("/"):
pdf_url = f"{self.base_url}{href}"
else:
pdf_url = f"{self.base_url}{self.pdf_url_prefix}{href}"
m_dat = self._RE_BE_DATUM.search(chunk)
datum_iso = self._datum_de_to_iso(m_dat.group(1) if m_dat else "")
m_doc = self._RE_BE_DOCTYPE.search(chunk)
doctype_full = m_doc.group(1).strip() if m_doc else "Drucksache"
# Berlin often packs the originator(s) into the same h6 line:
# "Antrag CDU, SPD" → fraktionen = ["CDU","SPD"], typ = "Antrag"
# Senat-Vorlagen carry no fraction, only "Vorlage zur …".
fraktionen = self._normalize_fraktion(doctype_full)
# Strip the fraction names back out of the typ string so the UI
# shows a clean "Antrag" / "Vorlage …" label.
typ = doctype_full
if fraktionen:
# Cut at the first occurrence of any party name
cuts = [typ.upper().find(f.upper()) for f in fraktionen]
cuts = [c for c in cuts if c >= 0]
if cuts:
typ = typ[: min(cuts)].rstrip(" ,")
doc = Drucksache(
drucksache=drucksache,
title=title,
fraktionen=fraktionen,
datum=datum_iso,
link=pdf_url,
bundesland=self.bundesland,
typ=typ,
)
if query_filter:
hay = f"{title} {doctype_full}".lower()
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
if not all(t in hay for t in query_filter.lower().split()):
continue
results.append(doc)
return results
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
"""Search recent documents of the current Wahlperiode.
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
``query`` is applied as a client-side title/Urheber filter; the
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
server-side query covers the configured ``date_window_days``
(default 24 months).
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
"""
from datetime import date, timedelta
end = date.today()
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
start = end - timedelta(days=self.date_window_days)
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
body = self._build_search_body(
wahlperiode=self.wahlperiode,
start_date=start.isoformat(),
end_date=end.isoformat(),
)
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
browse_html = f"{self.base_url}{self.portala_path}/browse.tt.html"
browse_json = f"{self.base_url}{self.portala_path}/browse.tt.json"
report_html = f"{self.base_url}{self.portala_path}/report.tt.html"
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
async with httpx.AsyncClient(
# Bumped from 30s for #13 quick-win: chunksize=500 against the
# LSA report.tt.html endpoint occasionally takes 30+ seconds.
timeout=60,
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
follow_redirects=True,
headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
) as client:
try:
# Step 1: warm up cookies via the browse page
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
await client.get(browse_html)
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
# Step 2: submit the search action
resp = await client.post(
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
browse_json,
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
json=body,
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
headers={"Referer": browse_html},
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
)
if resp.status_code != 200:
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
logger.error("%s search HTTP %s", self.bundesland, resp.status_code)
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
return []
data = resp.json()
report_id = data.get("report_id")
if not report_id:
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
logger.error("%s: no report_id in response: %s", self.bundesland, data)
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
return []
# Step 3: fetch the HTML hit list
# Take a generous chunk so the client-side title filter
# still has enough material to work with. Quick-win for #13
# until the eUI sf-Index for real server-side fulltext is
# reverse-engineered: bump the unfiltered chunk floor and
# the query-filtered chunk ceiling.
chunksize = max(limit * 10, 500) if query else max(limit * 2, 100)
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
report_resp = await client.post(
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
report_html,
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
json={"report_id": report_id, "start": 0, "chunksize": chunksize},
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
headers={"Referer": browse_html},
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
)
if report_resp.status_code != 200:
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
logger.error("%s report HTTP %s", self.bundesland, report_resp.status_code)
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
return []
results = self._parse_hit_list_html(report_resp.text, query_filter=query)
return results[:limit]
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
except Exception:
logger.exception("%s search error", self.bundesland)
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
return []
async def get_document(self, drucksache: str) -> Optional[Drucksache]:
"""Look up a single document by ID via the search endpoint with a
document_number filter."""
# Pragmatic MVP: do a broad search and filter for the requested ID.
# A targeted single-document fetch would require a different
# action.search.json structure that we have not reverse-engineered yet.
results = await self.search(query="", limit=200)
for doc in results:
if doc.drucksache == drucksache:
return doc
return None
async def download_text(self, drucksache: str) -> Optional[str]:
"""Download the PDF for a Drucksache and extract its text."""
import fitz # PyMuPDF
doc = await self.get_document(drucksache)
if not doc or not doc.link:
return None
async with httpx.AsyncClient(
timeout=60,
follow_redirects=True,
headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
) as client:
try:
resp = await client.get(doc.link)
if resp.status_code != 200:
return None
pdf = fitz.open(stream=resp.content, filetype="pdf")
text = ""
for page in pdf:
text += page.get_text()
pdf.close()
return text
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
except Exception:
logger.exception("%s download error for %s", self.bundesland, drucksache)
return None
class ParLDokAdapter(ParlamentAdapter):
"""Adapter for ParlDok 8.x parliament documentation systems (J3S GmbH).
ParlDok is a proprietary parliament documentation product by J3S GmbH
(https://www.j3s.de). Different from the portala/eUI framework used by
LSA/BE: ParlDok 8.x is a single-page app whose backend is a JSON API
rooted at ``{base_url}{prefix}/Fulltext/...``. The legacy ParLDok 5.x
HTML POST form (``parldok/formalkriterien``) used by dokukratie's MV
YAML scraper has been deprecated by the LandtagMV upgrade to 8.3.5.
Confirmed instances using this engine (April 2026):
- **MV** (Mecklenburg-Vorpommern) ``dokumentation.landtag-mv.de/parldok``
- HH, SN, TH all advertise ParlDok in dokukratie but their actual
versions/themes have not been verified yet.
Search workflow:
1. ``GET {base_url}{prefix}/`` to obtain the session cookie. The
backend rejects POSTs without it.
2. ``POST {base_url}{prefix}/Fulltext/Search`` with form-encoded
``data=<json>`` payload. The JSON carries a ``tags`` array of
facet selections; each tag is ``{"type": <facet_type_int>,
"id": <facet_value>}``. Reverse-engineered facet type constants
from the bundle.js (``pd.facet_*``):
- ``facet_fraction = 2``
- ``facet_kind = 7`` (Drucksache, Plenarprotokoll, )
- ``facet_type = 8`` (Antrag, Gesetzentwurf, Kleine Anfrage, )
- ``facet_lp = 10`` (Wahlperiode)
Response is JSON ``{success, data: <stringified JSON>}`` where the
inner ``data`` carries ``{count, docs: [{id, title, date,
authorhtml, kind, type, lp, number, link, ...}], ...}``.
3. PDF download: ``GET {base_url}{prefix}/dokument/{numeric_id}``.
Returns ``application/pdf`` directly. The ``link`` field returned
by the search API already contains the path fragment
``/dokument/<id>#navpanes=0`` — strip the fragment and prepend
the configured ``prefix``.
Drucksachen-Nummer is reconstructed as ``f"{lp}/{number}"`` from the
search hit. Full-text search is *not* implemented in this MVP the
backend supports it via ``facet_fulltext = 0`` tags but the public
LP-only filter already returns the relevant Antrag pool. ``query``
is applied as a client-side title/Urheber filter.
"""
# Reverse-engineered facet type constants from bundle.js (pd.facet_*).
FACET_FULLTEXT = 0
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
FACET_FRACTION = 2
FACET_KIND = 7
FACET_TYPE = 8
FACET_LP = 10
def __init__(
self,
*,
bundesland: str,
name: str,
base_url: str,
wahlperiode: int,
prefix: str = "/parldok",
document_typ: str = "Antrag",
) -> None:
"""Configure a ParlDok 8.x adapter for one specific parliament.
Args:
bundesland: state code, e.g. ``"MV"``.
name: human-readable label.
base_url: ``https://...`` host root, no trailing slash.
wahlperiode: current legislative period fed into the
``facet_lp`` tag of the search payload.
prefix: app prefix where ParlDok lives. ``/parldok`` for MV.
document_typ: client-side filter on the ``type`` field of
each hit ("Antrag", "Gesetzentwurf", ). Set to empty
string to disable type filtering.
"""
self.bundesland = bundesland
self.name = name
self.base_url = base_url.rstrip("/")
self.prefix = "/" + prefix.strip("/")
self.wahlperiode = wahlperiode
self.document_typ = document_typ
@staticmethod
def _datum_de_to_iso(datum_de: str) -> str:
"""DD.MM.YYYY → YYYY-MM-DD; '' for empty input."""
if not datum_de:
return ""
try:
d, m, y = datum_de.split(".")
return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
except ValueError:
return ""
@staticmethod
def _normalize_fraktion(authorhtml: str) -> list[str]:
"""Map ParlDok ``authorhtml`` to canonical fraction codes.
``authorhtml`` may be a comma-separated list of fractions
("CDU, SPD, F.D.P."), a single MdL with party in parens
("Thomas de Jesus Fernandes (AfD)") or empty (Landesregierung).
"""
if not authorhtml:
return []
u = authorhtml.upper()
out: list[str] = []
if re.search(r"\bBÜNDNIS\s*90\b", u) or re.search(r"\bGR(?:Ü|UE)NE\b", u):
out.append("GRÜNE")
if re.search(r"\bCDU\b", u):
out.append("CDU")
if re.search(r"\bSPD\b", u):
out.append("SPD")
# F.D.P. (with dots, historical) and FDP both occur in MV
if re.search(r"\bF\.?\s*D\.?\s*P\.?\b", u):
out.append("FDP")
if re.search(r"\bAFD\b", u):
out.append("AfD")
if re.search(r"\bLINKE\b", u) or re.search(r"\bLL/PDS\b", u):
out.append("LINKE")
if re.search(r"\bBSW\b", u):
out.append("BSW")
if re.search(r"LANDESREGIERUNG|MINISTER\b|STAATSKANZLEI|MINISTERPRÄSIDENT", u):
out.append("Landesregierung")
return out
@staticmethod
def _fulltext_id(term: str) -> str:
"""Sanitize a search term to ParlDok's facet ID format.
Mirrors ``pd.getFulltextId`` from ``bundle.js``: replace every
non-alphanumeric character with ``-``. The server uses this to
deduplicate identical search facets.
"""
return re.sub(r"[^a-zA-Z0-9]", "-", term)
def _build_search_body(self, *, length: int = 100, query: str = "") -> dict:
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
"""Build the JSON payload for the initial ``Fulltext/Search`` call.
Filters by Wahlperiode + optional server-side full-text search.
Type/kind filtering still happens client-side because the
facet_type/facet_kind value IDs are instance-specific and would
require an extra ``Fulltext/Filter`` round trip to discover.
Pagination beyond the first page goes through
``Fulltext/Resultpage`` the ``Search`` endpoint itself
ignores any non-zero ``Start``.
The full-text tag schema is reverse-engineered from
``pd.addInput`` in ``bundle.js`` and matches the SPA payload
verbatim::
{"type": 0, "id": "<sanitized>", "fulltext": "<raw>",
"label": "<raw>", "field": "Alle"}
``field="Alle"`` means "search all indexed fields"
(``pd.currentFTSearchMode`` default). The server tokenizes
the term and applies AND-semantics across whitespace.
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
"""
tags: list[dict] = [{"type": self.FACET_LP, "id": self.wahlperiode}]
if query:
tags.append({
"type": self.FACET_FULLTEXT,
"id": self._fulltext_id(query),
"fulltext": query,
"label": query,
"field": "Alle",
})
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
return {
"devicekey": "",
"max": length,
"withfilter": False,
# sort=2 → newest first (date desc); sort=1 is relevance.
"sort": 2,
"topk": length,
"llm": 0,
"newdocsearch": False,
"limit": {"Start": 0, "Length": length},
"tags": tags,
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
"updateFilters": [],
}
def _hit_to_drucksache(self, hit: dict) -> Optional[Drucksache]:
"""Convert one ParlDok JSON hit to a Drucksache. None if unusable."""
lp = hit.get("lp")
number = hit.get("number")
if not lp or not number:
return None
link_field = hit.get("link") or hit.get("prelink") or ""
# Strip "#navpanes=0" fragment and prepend the prefix.
path = link_field.split("#", 1)[0]
pdf_url = f"{self.base_url}{self.prefix}{path}" if path else ""
return Drucksache(
drucksache=f"{lp}/{number}",
title=hit.get("title", ""),
fraktionen=self._normalize_fraktion(hit.get("authorhtml", "")),
datum=self._datum_de_to_iso(hit.get("date", "")),
link=pdf_url,
bundesland=self.bundesland,
typ=hit.get("type", "") or hit.get("kind", ""),
)
async def _post_json(
self, client: httpx.AsyncClient, endpoint: str, payload: dict,
) -> Optional[dict]:
"""POST a JSON-stringified payload to a ParlDok endpoint.
``endpoint`` is the path tail (e.g. ``"Fulltext/Search"`` or
``"Fulltext/Resultpage"``). Returns the inner JSON object
(already parsed from the stringified ``data`` field), or None
on error.
"""
homepage = f"{self.base_url}{self.prefix}/"
url = f"{self.base_url}{self.prefix}/{endpoint}"
try:
resp = await client.post(
url,
data={"data": json.dumps(payload, ensure_ascii=False)},
headers={
"X-Requested-With": "XMLHttpRequest",
"Referer": homepage,
},
)
if resp.status_code != 200:
logger.error(
"%s %s HTTP %s",
self.bundesland, endpoint, resp.status_code,
)
return None
outer = resp.json()
if not outer.get("success"):
logger.error(
"%s %s not successful: %s",
self.bundesland, endpoint, outer.get("message"),
)
return None
return json.loads(outer["data"])
except Exception:
logger.exception("%s ParlDok %s error", self.bundesland, endpoint)
return None
async def _initial_search(
self, client: httpx.AsyncClient, *, length: int, query: str = "",
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
) -> tuple[Optional[int], list[dict]]:
"""Run the initial ``Fulltext/Search`` and return ``(queryid, docs)``.
The ``queryid`` is needed for subsequent ``Fulltext/Resultpage``
calls. ParlDok ignores any non-zero ``Start`` on this endpoint
the first 100 hits are the only ones reachable via ``Search``.
``query`` is sent server-side as a ``facet_fulltext`` tag see
``_build_search_body``.
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
"""
body = self._build_search_body(length=length, query=query)
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
inner = await self._post_json(client, "Fulltext/Search", body)
if not inner:
return None, []
return inner.get("queryid"), (inner.get("docs") or [])
async def _result_page(
self, client: httpx.AsyncClient, *, queryid: int, start: int, length: int,
) -> list[dict]:
"""Fetch a further result page via ``Fulltext/Resultpage``."""
payload = {
"devicekey": "",
"queryid": queryid,
"limit": {"Start": start, "Length": length},
}
inner = await self._post_json(client, "Fulltext/Resultpage", payload)
if not inner:
return []
return inner.get("docs") or []
def _make_client(self) -> httpx.AsyncClient:
return httpx.AsyncClient(
timeout=30,
follow_redirects=True,
headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
)
async def _paginated_hits(
self, client: httpx.AsyncClient, *, query: str = "",
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
):
"""Async iterator over Drucksachen-style hits across all pages.
Yields raw hit dicts in newest-first order. The first batch comes
from ``Fulltext/Search``, subsequent batches from
``Fulltext/Resultpage`` using the queryid the server returned for
the initial call. Stops when a page comes back empty, undersized,
or after ``MAX_PAGES`` iterations.
``query`` is forwarded as a server-side full-text filter to
``_initial_search``; the resulting ``queryid`` is bound to that
filter, so subsequent ``Resultpage`` calls automatically inherit
it without needing to repeat the tag.
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
"""
queryid, hits = await self._initial_search(
client, length=self.PAGE_SIZE, query=query,
)
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
for hit in hits:
yield hit
if not queryid or len(hits) < self.PAGE_SIZE:
return
for page in range(1, self.MAX_PAGES):
page_hits = await self._result_page(
client,
queryid=queryid,
start=page * self.PAGE_SIZE,
length=self.PAGE_SIZE,
)
if not page_hits:
return
for hit in page_hits:
yield hit
if len(page_hits) < self.PAGE_SIZE:
return
# ParlDok 8.x caps Length per request at 100 — paginate if needed.
PAGE_SIZE = 100
# Safety bound: scan at most 10 pages × 100 = 1000 most recent docs.
# Anträge are ~3% of all hits in MV, so 1000 raw → ~30 Anträge, more
# than enough for the typical UI request (limit 5..20). Filtered
# queries that find nothing in the last 1000 docs return empty
# rather than scan the entire WP.
MAX_PAGES = 10
async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
"""Search recent documents of the configured Wahlperiode.
Server-side full-text search via the ``facet_fulltext`` tag (#12)
when ``query`` is non-empty; otherwise pure browse mode. The
server returns the WP sorted newest-first across all document
kinds, the client keeps only ``Antrag``-typed Drucksachen and
dedupes by lp/number (ParlDok returns the same Drucksache
multiple times when it appears in several Vorgänge/Beratungen).
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
Pagination: ParlDok caps each response at 100 rows; further
pages come from ``Fulltext/Resultpage`` bound to the
server-assigned ``queryid``.
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
"""
results: list[Drucksache] = []
seen: set[str] = set()
async with self._make_client() as client:
await client.get(f"{self.base_url}{self.prefix}/")
async for hit in self._paginated_hits(client, query=query):
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
if hit.get("kind") != "Drucksache":
continue
if self.document_typ and hit.get("type") != self.document_typ:
continue
doc = self._hit_to_drucksache(hit)
if not doc:
continue
if doc.drucksache in seen:
continue
seen.add(doc.drucksache)
results.append(doc)
if len(results) >= limit:
return results
return results
async def get_document(self, drucksache: str) -> Optional[Drucksache]:
"""Look up a single Antrag by ``lp/number`` ID.
Pragmatic MVP: page through the WP unfiltered until we find a
match. ParlDok offers a ``facet_number`` (14) facet that would
let us target the lookup directly, but the facet ID values are
instance-specific (would require a ``Fulltext/Filter`` discovery
call) and the WP-wide pagination is fast enough for the typical
2k10k Drucksachen per period.
"""
wanted_lp, wanted_num = (drucksache.split("/", 1) + [""])[:2]
if not wanted_num:
return None
async with self._make_client() as client:
await client.get(f"{self.base_url}{self.prefix}/")
async for hit in self._paginated_hits(client):
if hit.get("kind") != "Drucksache":
continue
if str(hit.get("lp")) == wanted_lp and str(hit.get("number")) == wanted_num:
return self._hit_to_drucksache(hit)
return None
async def download_text(self, drucksache: str) -> Optional[str]:
"""Download the PDF for a Drucksache and extract its text."""
import fitz # PyMuPDF
doc = await self.get_document(drucksache)
if not doc or not doc.link:
return None
async with httpx.AsyncClient(
timeout=60,
follow_redirects=True,
headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
) as client:
try:
resp = await client.get(doc.link)
if resp.status_code != 200:
logger.error(
"%s PDF HTTP %s for %s (%s)",
self.bundesland, resp.status_code, drucksache, doc.link,
)
return None
pdf = fitz.open(stream=resp.content, filetype="pdf")
text = ""
for page in pdf:
text += page.get_text()
pdf.close()
return text
except Exception:
logger.exception("%s ParlDok download error for %s", self.bundesland, drucksache)
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
return None
class BayernAdapter(ParlamentAdapter):
"""Adapter for Bayerischer Landtag."""
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
bundesland = "BY"
name = "Bayerischer Landtag"
base_url = "https://www.bayern.landtag.de"
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
# TODO: Implement Bayern search
return []
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
async def get_document(self, drucksache: str) -> Optional[Drucksache]:
# TODO: Implement
return None
Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2) Adds a clean-room PortalaAdapter that talks to the eUI/portala framework behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's PARDOK; the same adapter will serve issue #3 once activated for BE. Reverse-engineering notes - The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml is outdated. The Sachsen-Anhalt portal was migrated to the same eUI/portala SPA framework Berlin uses. The legacy starweb URL returns 503; the new entry point is /portal/browse.tt.html. - Search workflow is two-stage: 1. POST /portal/browse.tt.json with a JSON action body containing an Elasticsearch-style query tree under search.json. Returns a report_id plus hit count. 2. POST /portal/report.tt.html with {report_id, start, chunksize} returns the HTML hit list. Each record carries a Perl Data::Dumper block in a <pre> tag with the canonical metadata. - The query schema (sources, search.lines, search.json tree, report block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0) — only structure/selectors are reused, no Python code is ported. - DB id is "lsa.lissh"; the server validates this and rejects unknown interfaces with an explicit errormsg. - PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served directly without any session cookie. What the adapter does - search() builds a date-window query (last ~24 months) for "Antrag" document type and returns the most recent hits. The user's free-text query is applied as a client-side title/Urheber filter (no fulltext search server-side yet — see "Limitations" below). - Hits are parsed from the Perl record dumps in the report HTML: - WEV06.main → title (Perl \x{xx} hex escapes decoded) - WEV32.5 → relative PDF path - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>" - Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE, FDP, AfD, LINKE, Landesregierung). - get_document() looks up a single Drucksache by re-running the search. - download_text() fetches the PDF and extracts text via PyMuPDF. - bundeslaender.py: LSA's doku_system corrected from "StarWeb" to "PARDOK", anmerkung updated with the migration story. Limitations (deliberate, MVP) - No server-side full-text search. The portala framework's sf index names for LSA full-text content are not yet known; tree mutations with sf=alAB return 0 hits. Client-side filter is "good enough" for the next ~24 months of Anträge (≈few hundred per WP). - LSA is still aktiv=False in bundeslaender.py — the adapter is dormant in production until issue #2's wahlprogramm ingest and frontend activation land. Verified live against padoka.landtag.sachsen-anhalt.de: - search(query="", limit=5) returned 5 current Anträge from März 2026 (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs. - download_text("8/6790") returned 5051 chars of real Antragstext ("ICE-Halt für Salzwedel dauerhaft erhalten"). Refs #2. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 21:50:23 +02:00
async def download_text(self, drucksache: str) -> Optional[str]:
return None
class BWAdapter(ParlamentAdapter):
"""Adapter for Baden-Württemberg Landtag."""
bundesland = "BW"
name = "Landtag Baden-Württemberg"
base_url = "https://www.landtag-bw.de"
async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
# TODO: Implement BW search
return []
async def get_document(self, drucksache: str) -> Optional[Drucksache]:
return None
async def download_text(self, drucksache: str) -> Optional[str]:
return None
# Registry of adapters
ADAPTERS = {
"NRW": NRWAdapter(),
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
"LSA": PortalaAdapter(
bundesland="LSA",
name="Landtag von Sachsen-Anhalt (PADOKA)",
base_url="https://padoka.landtag.sachsen-anhalt.de",
db_id="lsa.lissh",
wahlperiode=8,
portala_path="/portal",
document_type="Antrag",
pdf_url_prefix="/files/",
),
"BE": PortalaAdapter(
bundesland="BE",
name="Abgeordnetenhaus von Berlin (PARDOK)",
base_url="https://pardok.parlament-berlin.de",
db_id="lah.lissh",
wahlperiode=19,
portala_path="/portala",
# Berlin's ETYPF index uses different value strings — drop the
# document_type subtree, fall back to client-side title filter.
document_type=None,
# Quick-win for #13: pulled the date window from the original
# 180-day MVP up to 730 days so client-side title-filter searches
# ("Schule" etc.) reach back across more of the WP19 corpus until
# the eUI fulltext-sf is reverse-engineered. The chunksize bump
# in PortalaAdapter.search() means the per-request payload stays
# bounded.
date_window_days=730,
Activate Berlin (PARDOK) — search-only MVP (#3) PortalaAdapter is now parameterizable and serves both LSA and Berlin from a single class. Berlin is activated as the third live bundesland (after NRW + LSA), with the deliberate caveat that the LTW 2023 Wahlprogramme are not yet indexed. PortalaAdapter refactor - Class attributes (bundesland, name, base_url, db_id, wahlperiode) moved into the constructor. New optional parameters: - portala_path: "/portal" for LSA, "/portala" for Berlin - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF index uses different value strings; the document_type subtree is dropped from the action.search.json tree) - pdf_url_prefix: "/files/" by default; absolute URLs in the hit list are passed through unchanged (Berlin embeds full starweb/adis/citat/... links) - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more documents per WP, narrower window keeps payloads bounded) - _build_search_body builds the JSON tree dynamically: when document_type is None, the entire ETYPF/DTYPF/DART subtree is omitted, mirrored in the parsed/sref display strings as well. - _parse_hit_list_html now auto-detects between two formats: 1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records (existing parser, untouched). 2. Berlin-style: production HTML cards with efxRecordRepeater divs, h3 titles, h6 metadata lines containing the document type, drucksachen-id and date, plus a direct <a href="…pdf"> to the PDF on the same host. - Berlin extracts originator parties from the h6 line ("Antrag CDU, SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary _normalize_fraktion regex. - _normalize_fraktion rewritten with regex word boundaries, fixing a long-standing bug where comma-separated fraction lists like "CDU, SPD" failed to match CDU. Also picks up BSW for the Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung. bundeslaender.py - BE flipped to aktiv=True. anmerkung documents the Wahlprogramm- Lücke and the auto-detected hit-list format. Live verified against pardok.parlament-berlin.de: - WP 19 with 180-day date window returns 2962 hits, page 1 contains 5 records all with title, drucksache, date, PDF URL. - 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as Vorlage zur Kenntnisnahme. - LSA still returns the same 5 current Anträge of März 2026 — no regression from the refactor. Known limitation (will be tracked as a follow-up issue) - Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the embeddings DB. The 2023 PDFs are no longer linked from the live party websites (which currently feature 2026 draft programmes), and Wayback has no snapshots. The analyzer therefore falls back to bundesländer-übergreifende Grundsatzprogramme for BE Anträge until the 2023 PDFs are sourced manually. Refs #3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:33:16 +02:00
pdf_url_prefix="/files/",
),
Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4) Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation systems by J3S GmbH. MV becomes the fourth supported state alongside NRW, LSA and BE. Notable details: - ParlDok 8.x is a single-page app whose backend is a JSON API rooted at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok 5.x HTML POST form (parldok/formalkriterien) used by dokukratie's mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and is no longer reachable via the old form fields — hence a new adapter rather than reusing the dokukratie scraper. - Two-stage pagination: Fulltext/Search returns the first 100 hits + a queryid; further pages come from Fulltext/Resultpage with {queryid, limit:{Start,Length}}. The Search endpoint silently ignores any non-zero Start, so single-stage offset pagination is not an option. - Server-side filter via facet_lp (type=10) on the configured WP; type=Antrag is filtered client-side because the facet_type value IDs are instance-specific and would require an extra Fulltext/Filter discovery call. ParlDok also returns the same Drucksache multiple times when it appears in several Vorgänge/Beratungen, so search() dedupes by lp/number. - Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up in #4) — analyses run with the federal Grundsatzprogramm fallback, same as Berlin until #10 lands. Drive-by cleanup of PortalaAdapter print() statements: switched to the module-level logger so adapter parser bugs no longer disappear into stdout. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 08:19:48 +02:00
"MV": ParLDokAdapter(
bundesland="MV",
name="Landtag Mecklenburg-Vorpommern (ParlDok)",
base_url="https://www.dokumentation.landtag-mv.de",
wahlperiode=8,
prefix="/parldok",
document_typ="Antrag",
),
"BY": BayernAdapter(),
"BW": BWAdapter(),
}
def get_adapter(bundesland: str) -> Optional[ParlamentAdapter]:
"""Get adapter for a bundesland."""
return ADAPTERS.get(bundesland)
async def search_all(query: str, bundesland: str = "NRW", limit: int = 20) -> list[Drucksache]:
"""Search parliament documents in a specific state."""
adapter = get_adapter(bundesland)
if not adapter:
return []
return await adapter.search(query, limit)