diff --git a/app/bundeslaender.py b/app/bundeslaender.py index fab9df9..cd3c83e 100644 --- a/app/bundeslaender.py +++ b/app/bundeslaender.py @@ -298,14 +298,16 @@ BUNDESLAENDER: dict[str, Bundesland] = { naechste_wahl="2026-09-06", regierungsfraktionen=["CDU", "SPD", "FDP"], landtagsfraktionen=["CDU", "AfD", "LINKE", "SPD", "GRÜNE", "FDP"], - doku_system="StarWeb", + doku_system="PARDOK", doku_base_url="https://padoka.landtag.sachsen-anhalt.de", drucksache_format="8/1234", dokukratie_scraper="st", anmerkung=( "ISO-Code wäre ST; LSA ist im politischen Sprachgebrauch dominant. " "Sven Schulze (CDU) seit 28.01.2026 MP nach Rücktritt Haseloff. " - "PADOKA = Parlamentsdokumentationssystem auf StarWeb-6.0.01-Basis." + "PADOKA wurde von StarWeb auf das portala/eUI-Framework migriert " + "(gleiche Engine wie Berlin/PARDOK). dokukratie's st.yml ist veraltet. " + "Suche läuft via POST /portal/browse.tt.json + report.tt.html." ), ), "SH": Bundesland( diff --git a/app/parlamente.py b/app/parlamente.py index a701db4..99f0c69 100644 --- a/app/parlamente.py +++ b/app/parlamente.py @@ -305,21 +305,343 @@ class NRWAdapter(ParlamentAdapter): return None +class PortalaAdapter(ParlamentAdapter): + """Adapter for portala/eUI-based parliament documentation systems. + + Used by parliaments running the proprietary "esearch" / portala framework + (originally developed for STAR/StarFinder backends, now wrapped in a + Single-Page App with Template Toolkit on the server side): + + - **LSA** (Sachsen-Anhalt) — PADOKA at ``padoka.landtag.sachsen-anhalt.de`` + - **BE** (Berlin) — PARDOK at ``pardok.parlament-berlin.de`` (future) + + The search workflow is two-stage: + + 1. ``POST /portal/browse.tt.json`` with a complex JSON ``action`` body + that contains an Elasticsearch-style query tree under + ``search.json``. The server returns a ``report_id`` plus hit count. + 2. ``POST /portal/report.tt.html`` with ``{report_id, start, chunksize}`` + to fetch the HTML hit list. 
Each hit carries a Perl Data::Dumper + block in a ``<pre>
`` tag with the canonical metadata. + + The query body schema was reverse-engineered from + https://github.com/okfde/dokukratie/blob/main/dokukratie/scrapers/portala.query.json + (GPL-3.0 — only structure/selectors are reused, not Python code). + + Full-text search is **not** implemented in the MVP: the adapter + returns the most recent ``Anträge`` of the current Wahlperiode in the + given date window, and the search query is applied as a client-side + title/Urheber filter. The portala server-side full-text path requires + LSA-specific ``sf`` index names that are not yet known. + """ + + bundesland = "LSA" + name = "Landtag von Sachsen-Anhalt (PADOKA)" + base_url = "https://padoka.landtag.sachsen-anhalt.de" + db_id = "lsa.lissh" + wahlperiode = 8 + + # Reverse-engineered "WEV*" Perl record fields used in the hit-list dumps: + # WEV06.main = title + # WEV32.5 = relative PDF path + # WEV32.main = "AntragDrucksache X/YYYY ..." + _RE_TITLE = re.compile(r"'WEV06'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']") + _RE_PDF = re.compile(r"'5'\s*=>\s*'([^']*\.pdf)'") + _RE_DRUCKSACHE = re.compile(r"Drucksache\s*(\d+/\d+)") + _RE_URHEBER_DATUM = re.compile( + r"'WEV32'\s*=>\s*\[\s*\{[^}]*'main'\s*=>\s*[\"']Antrag\s+(.+?)\s+(\d{1,2}\.\d{1,2}\.\d{4})\s+Drucksache", + ) + _RE_PRE_BLOCK = re.compile(r' \$VAR1 = (.*?)', re.DOTALL) + + @staticmethod + def _decode_perl_hex(s: str) -> str: + """Decode \\x{abcd} escape sequences from Perl Data::Dumper output.""" + return re.sub(r'\\x\{([0-9a-f]+)\}', lambda m: chr(int(m.group(1), 16)), s) + + @staticmethod + def _normalize_fraktion(urheber: str) -> list[str]: + """Map Urheber-String to canonical fraction codes.""" + u = urheber.upper() + out = [] + if "BÜNDNIS 90" in u or "GRÜNE" in u or "GRUENE" in u: + out.append("GRÜNE") + if u.startswith("CDU") or " CDU " in u or u.endswith(" CDU"): + out.append("CDU") + if "SPD" in u: + out.append("SPD") + if "FDP" in u: + out.append("FDP") + if "AFD" in u: + out.append("AfD") + if "LINKE" 
in u or "DIE LINKE" in u: + out.append("LINKE") + if "LANDESREGIERUNG" in u or "MINISTER" in u or "STAATSKANZLEI" in u: + out.append("Landesregierung") + return out + + def _build_search_body( + self, + wahlperiode: int, + start_date: str, + end_date: str, + document_type: str = "Antrag", + ) -> dict: + """Build the action JSON body for browse.tt.json. + + The schema is taken 1:1 from dokukratie's portala.query.json template + and only differs in the data source (lsa.lissh) and the variable + substitutions. + """ + return { + "action": "SearchAndDisplay", + "sources": [self.db_id], + "report": { + "rhl": "main", + "rhlmode": "add", + "format": "generic1-full", + "mime": "html", + "sort": "WEVSO1/D WEVSO2 WEVSO3", + }, + "search": { + "lines": { + "2": str(wahlperiode), + "3": document_type, + "4": "D", + "10": start_date, + "11": end_date, + "20.1": "alWEBBI", + "20.2": "alWEBBI", + "20.3": "alWEBBI", + "90.1": "AND", + "90.2": "AND", + "90.3": "AND", + }, + "serverrecordname": "sr_generic1", + "parsed": ( + f"((/WP {wahlperiode}) AND " + f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) " + f"AND (/DART,DARTS (\"D\")) AND " + f"(DAT,DDAT,SDAT= {start_date} THRU {end_date})) AND TYP=DOKDBE" + ), + "sref": ( + f"((/WP {wahlperiode}) AND " + f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) " + f"AND (/DART,DARTS (\"D\")) AND " + f"(DAT,DDAT,SDAT= {start_date} THRU {end_date})) AND TYP=DOKDBE" + ), + "json": [{ + "tn": "and", + "num": 1, + "terms": [ + {"tn": "term", "t": str(wahlperiode), "idx": 6, "l": 3, + "sf": "WP", "op": "eq", "num": 5}, + {"tn": "or", "num": 3, "terms": [ + {"tn": "or", "num": 4, "terms": [ + {"tn": "term", "t": f'"{document_type}"', "idx": 50, + "l": 4, "sf": "ETYPF", "op": "eq", "num": 10}, + {"tn": "term", "t": f'"{document_type}"', "idx": 50, + "l": 4, "sf": "ETYP2F", "op": "eq", "num": 11}, + {"tn": "term", "t": f'"{document_type}"', "idx": 50, + "l": 4, "sf": "DTYPF", "op": "eq", "num": 12}, + {"tn": "term", "t": 
f'"{document_type}"', "idx": 50, + "l": 4, "sf": "DTYP2F", "op": "eq", "num": 13}, + {"tn": "term", "t": f'"{document_type}"', "idx": 50, + "l": 4, "sf": "1VTYPF", "op": "eq", "num": 14}, + ]}, + {"tn": "or", "num": 15, "terms": [ + {"tn": "term", "t": '"D"', "idx": 93, "l": 4, + "sf": "DART", "op": "eq", "num": 16}, + {"tn": "term", "t": '"D"', "idx": 93, "l": 4, + "sf": "DARTS", "op": "eq", "num": 17}, + ]}, + ]}, + {"tn": "or", "num": 18, "terms": [ + {"tn": "or", "num": 19, "terms": [ + {"tn": "trange", "sf": "DAT", "op": "eq", "num": 20, + "idx": 119, "l": 3, "p1": start_date, "t1": start_date, + "p2": end_date, "t2": end_date, + "t": f"{start_date} THRU {end_date}"}, + {"tn": "trange", "sf": "DDAT", "op": "eq", "num": 21, + "idx": 119, "l": 3, "p1": start_date, "t1": start_date, + "p2": end_date, "t2": end_date, + "t": f"{start_date} THRU {end_date}"}, + ]}, + {"tn": "trange", "sf": "SDAT", "op": "eq", "num": 22, + "idx": 119, "l": 3, "p1": start_date, "t1": start_date, + "p2": end_date, "t2": end_date, + "t": f"{start_date} THRU {end_date}"}, + ]}, + {"tn": "term", "t": "DOKDBE", "idx": 156, "l": 1, + "sf": "TYP", "op": "eq", "num": 23}, + ], + }], + }, + "dataSet": "1", + } + + def _parse_hit_list_html(self, html: str, query_filter: str = "") -> list[Drucksache]: + """Extract Drucksachen from a report.tt.html response.""" + results: list[Drucksache] = [] + for pre in self._RE_PRE_BLOCK.findall(html): + m_ds = self._RE_DRUCKSACHE.search(pre) + if not m_ds: + continue + drucksache = m_ds.group(1) + + m_t = self._RE_TITLE.search(pre) + title = self._decode_perl_hex(m_t.group(1)) if m_t else f"Drucksache {drucksache}" + + m_pdf = self._RE_PDF.search(pre) + pdf_rel = m_pdf.group(1) if m_pdf else "" + pdf_url = f"{self.base_url}/files/{pdf_rel}" if pdf_rel else "" + + m_w32 = self._RE_URHEBER_DATUM.search(pre) + urheber = self._decode_perl_hex(m_w32.group(1).strip()) if m_w32 else "" + datum_de = m_w32.group(2) if m_w32 else "" + # DD.MM.YYYY -> ISO YYYY-MM-DD + 
datum_iso = "" + if datum_de: + d, m, y = datum_de.split(".") + datum_iso = f"{y}-{m.zfill(2)}-{d.zfill(2)}" + + fraktionen = self._normalize_fraktion(urheber) if urheber else [] + + doc = Drucksache( + drucksache=drucksache, + title=title, + fraktionen=fraktionen, + datum=datum_iso, + link=pdf_url, + bundesland=self.bundesland, + typ="Antrag", + ) + + # Client-side title filter (no fulltext search server-side) + if query_filter: + hay = f"{title} {urheber}".lower() + if not all(t in hay for t in query_filter.lower().split()): + continue + + results.append(doc) + + return results + + async def search(self, query: str, limit: int = 20) -> list[Drucksache]: + """Search recent Anträge of the current Wahlperiode. + + ``query`` is applied as a client-side title/Urheber filter; the + server-side query covers the last ~24 months by default. + """ + from datetime import date, timedelta + + end = date.today() + start = end - timedelta(days=730) + body = self._build_search_body( + wahlperiode=self.wahlperiode, + start_date=start.isoformat(), + end_date=end.isoformat(), + document_type="Antrag", + ) + + async with httpx.AsyncClient( + timeout=30, + follow_redirects=True, + headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"}, + ) as client: + try: + # Step 1: warm up cookies via the browse page + await client.get(f"{self.base_url}/portal/browse.tt.html") + + # Step 2: submit the search action + resp = await client.post( + f"{self.base_url}/portal/browse.tt.json", + json=body, + headers={"Referer": f"{self.base_url}/portal/browse.tt.html"}, + ) + if resp.status_code != 200: + print(f"PADOKA search HTTP {resp.status_code}") + return [] + + data = resp.json() + report_id = data.get("report_id") + if not report_id: + print(f"PADOKA: no report_id in response: {data}") + return [] + + # Step 3: fetch the HTML hit list + # Take a generous chunk so client-side filter still has enough + chunksize = 100 if query else limit + report_resp = await client.post( + 
f"{self.base_url}/portal/report.tt.html", + json={"report_id": report_id, "start": 0, "chunksize": chunksize}, + headers={"Referer": f"{self.base_url}/portal/browse.tt.html"}, + ) + if report_resp.status_code != 200: + print(f"PADOKA report HTTP {report_resp.status_code}") + return [] + + results = self._parse_hit_list_html(report_resp.text, query_filter=query) + return results[:limit] + + except Exception as e: + print(f"PADOKA search error: {e}") + return [] + + async def get_document(self, drucksache: str) -> Optional[Drucksache]: + """Look up a single document by ID via the search endpoint with a + document_number filter.""" + # Pragmatic MVP: do a broad search and filter for the requested ID. + # A targeted single-document fetch would require a different + # action.search.json structure that we have not reverse-engineered yet. + results = await self.search(query="", limit=200) + for doc in results: + if doc.drucksache == drucksache: + return doc + return None + + async def download_text(self, drucksache: str) -> Optional[str]: + """Download the PDF for a Drucksache and extract its text.""" + import fitz # PyMuPDF + + doc = await self.get_document(drucksache) + if not doc or not doc.link: + return None + + async with httpx.AsyncClient( + timeout=60, + follow_redirects=True, + headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"}, + ) as client: + try: + resp = await client.get(doc.link) + if resp.status_code != 200: + return None + pdf = fitz.open(stream=resp.content, filetype="pdf") + text = "" + for page in pdf: + text += page.get_text() + pdf.close() + return text + except Exception as e: + print(f"PADOKA download error for {drucksache}: {e}") + return None + + class BayernAdapter(ParlamentAdapter): """Adapter for Bayerischer Landtag.""" - + bundesland = "BY" name = "Bayerischer Landtag" base_url = "https://www.bayern.landtag.de" - + async def search(self, query: str, limit: int = 20) -> list[Drucksache]: # TODO: Implement Bayern search return [] - + 
async def get_document(self, drucksache: str) -> Optional[Drucksache]: # TODO: Implement return None - + async def download_text(self, drucksache: str) -> Optional[str]: return None @@ -345,6 +667,7 @@ class BWAdapter(ParlamentAdapter): # Registry of adapters ADAPTERS = { "NRW": NRWAdapter(), + "LSA": PortalaAdapter(), "BY": BayernAdapter(), "BW": BWAdapter(), }