From c7242f8413cd909e210dfebd1845950486b9a4a8 Mon Sep 17 00:00:00 2001
From: Dotty Dotter
Date: Tue, 7 Apr 2026 21:50:23 +0200
Subject: [PATCH] Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). The same engine powers Berlin's
PARDOK, so the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- The search workflow is two-stage (a sketch follows these notes):
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.
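
For illustration, the two-stage flow condenses to the following sketch
(a stripped-down version of the adapter's search(); cookie warm-up,
Referer headers and error handling omitted, the chunksize value is
arbitrary):

    import httpx

    BASE = "https://padoka.landtag.sachsen-anhalt.de"

    async def fetch_hit_list_html(action_body: dict) -> str:
        async with httpx.AsyncClient(follow_redirects=True, timeout=30) as client:
            # Stage 1: submit the query tree; the server answers with a report_id.
            resp = await client.post(f"{BASE}/portal/browse.tt.json", json=action_body)
            report_id = resp.json()["report_id"]
            # Stage 2: fetch the HTML hit list for that report in chunks.
            report = await client.post(
                f"{BASE}/portal/report.tt.html",
                json={"report_id": report_id, "start": 0, "chunksize": 20},
            )
            return report.text  # one <pre> Data::Dumper block per hit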

What the adapter does
- search() builds a date-window query (last ~24 months) for the "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no server-side
  full-text search yet; see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML (a
  worked example follows this list):
  - WEV06.main → title (Perl \x{..} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <Datum> Drucksache X/YYYY"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. The client-side filter is "good enough"
  for the next ~24 months of Anträge (≈ a few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 
---
 app/bundeslaender.py |   6 +-
 app/parlamente.py    | 331 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 331 insertions(+), 6 deletions(-)

diff --git a/app/bundeslaender.py b/app/bundeslaender.py
index fab9df9..cd3c83e 100644
--- a/app/bundeslaender.py
+++ b/app/bundeslaender.py
@@ -298,14 +298,16 @@ BUNDESLAENDER: dict[str, Bundesland] = {
         naechste_wahl="2026-09-06",
         regierungsfraktionen=["CDU", "SPD", "FDP"],
         landtagsfraktionen=["CDU", "AfD", "LINKE", "SPD", "GRÜNE", "FDP"],
-        doku_system="StarWeb",
+        doku_system="PARDOK",
         doku_base_url="https://padoka.landtag.sachsen-anhalt.de",
         drucksache_format="8/1234",
         dokukratie_scraper="st",
         anmerkung=(
             "ISO-Code wäre ST; LSA ist im politischen Sprachgebrauch dominant. "
             "Sven Schulze (CDU) seit 28.01.2026 MP nach Rücktritt Haseloff. "
-            "PADOKA = Parlamentsdokumentationssystem auf StarWeb-6.0.01-Basis."
+            "PADOKA wurde von StarWeb auf das portala/eUI-Framework migriert "
+            "(gleiche Engine wie Berlin/PARDOK). dokukratie's st.yml ist veraltet. "
+            "Suche läuft via POST /portal/browse.tt.json + report.tt.html."
         ),
     ),
     "SH": Bundesland(
diff --git a/app/parlamente.py b/app/parlamente.py
index a701db4..99f0c69 100644
--- a/app/parlamente.py
+++ b/app/parlamente.py
@@ -305,21 +305,343 @@ class NRWAdapter(ParlamentAdapter):
                 return None
 
 
+class PortalaAdapter(ParlamentAdapter):
+    """Adapter for portala/eUI-based parliament documentation systems.
+
+    Used by parliaments running the proprietary "esearch" / portala framework
+    (originally developed for STAR/StarFinder backends, now wrapped in a
+    Single-Page App with Template Toolkit on the server side):
+
+    - **LSA** (Sachsen-Anhalt) — PADOKA at ``padoka.landtag.sachsen-anhalt.de``
+    - **BE** (Berlin) — PARDOK at ``pardok.parlament-berlin.de`` (future)
+
+    The search workflow is two-stage:
+
+    1. ``POST /portal/browse.tt.json`` with a complex JSON ``action`` body
+       that contains an Elasticsearch-style query tree under
+       ``search.json``. The server returns a ``report_id`` plus hit count.
+    2. ``POST /portal/report.tt.html`` with ``{report_id, start, chunksize}``
+       to fetch the HTML hit list. Each hit carries a Perl Data::Dumper
+       block in a ``<pre>`` tag with the canonical metadata.
+
+    The query body schema was reverse-engineered from
+    https://github.com/okfde/dokukratie/blob/main/dokukratie/scrapers/portala.query.json
+    (GPL-3.0 — only structure/selectors are reused, not Python code).
+
+    Full-text search is **not** implemented in the MVP: the adapter
+    returns the most recent ``Anträge`` of the current Wahlperiode in the
+    given date window, and the search query is applied as a client-side
+    title/Urheber filter. The portala server-side full-text path requires
+    LSA-specific ``sf`` index names that are not yet known.
+    """
+
+    bundesland = "LSA"
+    name = "Landtag von Sachsen-Anhalt (PADOKA)"
+    base_url = "https://padoka.landtag.sachsen-anhalt.de"
+    db_id = "lsa.lissh"
+    wahlperiode = 8
+
+    # Reverse-engineered "WEV*" Perl record fields used in the hit-list dumps:
+    # WEV06.main = title
+    # WEV32.5    = relative PDF path
+    # WEV32.main = "Antrag   Drucksache X/YYYY ..."
+    _RE_TITLE = re.compile(r"'WEV06'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
+    _RE_PDF = re.compile(r"'5'\s*=>\s*'([^']*\.pdf)'")
+    _RE_DRUCKSACHE = re.compile(r"Drucksache\s*(\d+/\d+)")
+    _RE_URHEBER_DATUM = re.compile(
+        r"'WEV32'\s*=>\s*\[\s*\{[^}]*'main'\s*=>\s*[\"']Antrag\s+(.+?)\s+(\d{1,2}\.\d{1,2}\.\d{4})\s+Drucksache",
+    )
+    _RE_PRE_BLOCK = re.compile(r'<pre>\$VAR1 = (.*?)</pre>', re.DOTALL)
+
+    @staticmethod
+    def _decode_perl_hex(s: str) -> str:
+        """Decode \\x{abcd} escape sequences from Perl Data::Dumper output."""
+        return re.sub(r'\\x\{([0-9a-f]+)\}', lambda m: chr(int(m.group(1), 16)), s)
+
+    @staticmethod
+    def _normalize_fraktion(urheber: str) -> list[str]:
+        """Map Urheber-String to canonical fraction codes."""
+        u = urheber.upper()
+        out = []
+        if "BÜNDNIS 90" in u or "GRÜNE" in u or "GRUENE" in u:
+            out.append("GRÜNE")
+        if u.startswith("CDU") or " CDU " in u or u.endswith(" CDU"):
+            out.append("CDU")
+        if "SPD" in u:
+            out.append("SPD")
+        if "FDP" in u:
+            out.append("FDP")
+        if "AFD" in u:
+            out.append("AfD")
+        if "LINKE" in u or "DIE LINKE" in u:
+            out.append("LINKE")
+        if "LANDESREGIERUNG" in u or "MINISTER" in u or "STAATSKANZLEI" in u:
+            out.append("Landesregierung")
+        return out
+
+    def _build_search_body(
+        self,
+        wahlperiode: int,
+        start_date: str,
+        end_date: str,
+        document_type: str = "Antrag",
+    ) -> dict:
+        """Build the action JSON body for browse.tt.json.
+
+        The schema is taken 1:1 from dokukratie's portala.query.json template
+        and only differs in the data source (lsa.lissh) and the variable
+        substitutions.
+        """
+        return {
+            "action": "SearchAndDisplay",
+            "sources": [self.db_id],
+            "report": {
+                "rhl": "main",
+                "rhlmode": "add",
+                "format": "generic1-full",
+                "mime": "html",
+                "sort": "WEVSO1/D WEVSO2 WEVSO3",
+            },
+            "search": {
+                "lines": {
+                    "2": str(wahlperiode),
+                    "3": document_type,
+                    "4": "D",
+                    "10": start_date,
+                    "11": end_date,
+                    "20.1": "alWEBBI",
+                    "20.2": "alWEBBI",
+                    "20.3": "alWEBBI",
+                    "90.1": "AND",
+                    "90.2": "AND",
+                    "90.3": "AND",
+                },
+                "serverrecordname": "sr_generic1",
+                "parsed": (
+                    f"((/WP {wahlperiode}) AND "
+                    f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) "
+                    f"AND (/DART,DARTS (\"D\")) AND "
+                    f"(DAT,DDAT,SDAT= {start_date} THRU {end_date})) AND TYP=DOKDBE"
+                ),
+                "sref": (
+                    f"((/WP {wahlperiode}) AND "
+                    f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) "
+                    f"AND (/DART,DARTS (\"D\")) AND "
+                    f"(DAT,DDAT,SDAT= {start_date} THRU {end_date})) AND TYP=DOKDBE"
+                ),
+                "json": [{
+                    "tn": "and",
+                    "num": 1,
+                    "terms": [
+                        {"tn": "term", "t": str(wahlperiode), "idx": 6, "l": 3,
+                         "sf": "WP", "op": "eq", "num": 5},
+                        {"tn": "or", "num": 3, "terms": [
+                            {"tn": "or", "num": 4, "terms": [
+                                {"tn": "term", "t": f'"{document_type}"', "idx": 50,
+                                 "l": 4, "sf": "ETYPF", "op": "eq", "num": 10},
+                                {"tn": "term", "t": f'"{document_type}"', "idx": 50,
+                                 "l": 4, "sf": "ETYP2F", "op": "eq", "num": 11},
+                                {"tn": "term", "t": f'"{document_type}"', "idx": 50,
+                                 "l": 4, "sf": "DTYPF", "op": "eq", "num": 12},
+                                {"tn": "term", "t": f'"{document_type}"', "idx": 50,
+                                 "l": 4, "sf": "DTYP2F", "op": "eq", "num": 13},
+                                {"tn": "term", "t": f'"{document_type}"', "idx": 50,
+                                 "l": 4, "sf": "1VTYPF", "op": "eq", "num": 14},
+                            ]},
+                            {"tn": "or", "num": 15, "terms": [
+                                {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
+                                 "sf": "DART", "op": "eq", "num": 16},
+                                {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
+                                 "sf": "DARTS", "op": "eq", "num": 17},
+                            ]},
+                        ]},
+                        {"tn": "or", "num": 18, "terms": [
+                            {"tn": "or", "num": 19, "terms": [
+                                {"tn": "trange", "sf": "DAT", "op": "eq", "num": 20,
+                                 "idx": 119, "l": 3, "p1": start_date, "t1": start_date,
+                                 "p2": end_date, "t2": end_date,
+                                 "t": f"{start_date} THRU {end_date}"},
+                                {"tn": "trange", "sf": "DDAT", "op": "eq", "num": 21,
+                                 "idx": 119, "l": 3, "p1": start_date, "t1": start_date,
+                                 "p2": end_date, "t2": end_date,
+                                 "t": f"{start_date} THRU {end_date}"},
+                            ]},
+                            {"tn": "trange", "sf": "SDAT", "op": "eq", "num": 22,
+                             "idx": 119, "l": 3, "p1": start_date, "t1": start_date,
+                             "p2": end_date, "t2": end_date,
+                             "t": f"{start_date} THRU {end_date}"},
+                        ]},
+                        {"tn": "term", "t": "DOKDBE", "idx": 156, "l": 1,
+                         "sf": "TYP", "op": "eq", "num": 23},
+                    ],
+                }],
+            },
+            "dataSet": "1",
+        }
+
+    def _parse_hit_list_html(self, html: str, query_filter: str = "") -> list[Drucksache]:
+        """Extract Drucksachen from a report.tt.html response."""
+        results: list[Drucksache] = []
+        for pre in self._RE_PRE_BLOCK.findall(html):
+            m_ds = self._RE_DRUCKSACHE.search(pre)
+            if not m_ds:
+                continue
+            drucksache = m_ds.group(1)
+
+            m_t = self._RE_TITLE.search(pre)
+            title = self._decode_perl_hex(m_t.group(1)) if m_t else f"Drucksache {drucksache}"
+
+            m_pdf = self._RE_PDF.search(pre)
+            pdf_rel = m_pdf.group(1) if m_pdf else ""
+            pdf_url = f"{self.base_url}/files/{pdf_rel}" if pdf_rel else ""
+
+            m_w32 = self._RE_URHEBER_DATUM.search(pre)
+            urheber = self._decode_perl_hex(m_w32.group(1).strip()) if m_w32 else ""
+            datum_de = m_w32.group(2) if m_w32 else ""
+            # DD.MM.YYYY -> ISO YYYY-MM-DD
+            datum_iso = ""
+            if datum_de:
+                d, m, y = datum_de.split(".")
+                datum_iso = f"{y}-{m.zfill(2)}-{d.zfill(2)}"
+
+            fraktionen = self._normalize_fraktion(urheber) if urheber else []
+
+            doc = Drucksache(
+                drucksache=drucksache,
+                title=title,
+                fraktionen=fraktionen,
+                datum=datum_iso,
+                link=pdf_url,
+                bundesland=self.bundesland,
+                typ="Antrag",
+            )
+
+            # Client-side title/Urheber filter (no server-side full-text search)
+            if query_filter:
+                hay = f"{title} {urheber}".lower()
+                if not all(t in hay for t in query_filter.lower().split()):
+                    continue
+
+            results.append(doc)
+
+        return results
+
+    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
+        """Search recent Anträge of the current Wahlperiode.
+
+        ``query`` is applied as a client-side title/Urheber filter; the
+        server-side query covers the last ~24 months by default.
+        """
+        from datetime import date, timedelta
+
+        end = date.today()
+        start = end - timedelta(days=730)
+        body = self._build_search_body(
+            wahlperiode=self.wahlperiode,
+            start_date=start.isoformat(),
+            end_date=end.isoformat(),
+            document_type="Antrag",
+        )
+
+        async with httpx.AsyncClient(
+            timeout=30,
+            follow_redirects=True,
+            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
+        ) as client:
+            try:
+                # Step 1: warm up cookies via the browse page
+                await client.get(f"{self.base_url}/portal/browse.tt.html")
+
+                # Step 2: submit the search action
+                resp = await client.post(
+                    f"{self.base_url}/portal/browse.tt.json",
+                    json=body,
+                    headers={"Referer": f"{self.base_url}/portal/browse.tt.html"},
+                )
+                if resp.status_code != 200:
+                    print(f"PADOKA search HTTP {resp.status_code}")
+                    return []
+
+                data = resp.json()
+                report_id = data.get("report_id")
+                if not report_id:
+                    print(f"PADOKA: no report_id in response: {data}")
+                    return []
+
+                # Step 3: fetch the HTML hit list
+                # Take a generous chunk so the client-side filter has enough hits
+                chunksize = 100 if query else limit
+                report_resp = await client.post(
+                    f"{self.base_url}/portal/report.tt.html",
+                    json={"report_id": report_id, "start": 0, "chunksize": chunksize},
+                    headers={"Referer": f"{self.base_url}/portal/browse.tt.html"},
+                )
+                if report_resp.status_code != 200:
+                    print(f"PADOKA report HTTP {report_resp.status_code}")
+                    return []
+
+                results = self._parse_hit_list_html(report_resp.text, query_filter=query)
+                return results[:limit]
+
+            except Exception as e:
+                print(f"PADOKA search error: {e}")
+                return []
+
+    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
+        """Look up a single Drucksache by re-running the recent search."""
+        # Pragmatic MVP: do a broad search and filter for the requested ID.
+        # A targeted single-document fetch would require a different
+        # action.search.json structure that we have not reverse-engineered yet.
+        results = await self.search(query="", limit=200)
+        for doc in results:
+            if doc.drucksache == drucksache:
+                return doc
+        return None
+
+    async def download_text(self, drucksache: str) -> Optional[str]:
+        """Download the PDF for a Drucksache and extract its text."""
+        import fitz  # PyMuPDF
+
+        doc = await self.get_document(drucksache)
+        if not doc or not doc.link:
+            return None
+
+        async with httpx.AsyncClient(
+            timeout=60,
+            follow_redirects=True,
+            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
+        ) as client:
+            try:
+                resp = await client.get(doc.link)
+                if resp.status_code != 200:
+                    return None
+                pdf = fitz.open(stream=resp.content, filetype="pdf")
+                text = ""
+                for page in pdf:
+                    text += page.get_text()
+                pdf.close()
+                return text
+            except Exception as e:
+                print(f"PADOKA download error for {drucksache}: {e}")
+                return None
+
+
 class BayernAdapter(ParlamentAdapter):
     """Adapter for Bayerischer Landtag."""
-    
+
     bundesland = "BY"
     name = "Bayerischer Landtag"
     base_url = "https://www.bayern.landtag.de"
-    
+
     async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
         # TODO: Implement Bayern search
         return []
-    
+
     async def get_document(self, drucksache: str) -> Optional[Drucksache]:
         # TODO: Implement
         return None
-    
+
     async def download_text(self, drucksache: str) -> Optional[str]:
         return None
@@ -345,6 +667,7 @@ class BWAdapter(ParlamentAdapter):
 # Registry of adapters
 ADAPTERS = {
     "NRW": NRWAdapter(),
+    "LSA": PortalaAdapter(),
     "BY": BayernAdapter(),
     "BW": BWAdapter(),
 }