diff --git a/app/bundeslaender.py b/app/bundeslaender.py index 235ad20..374a7cc 100644 --- a/app/bundeslaender.py +++ b/app/bundeslaender.py @@ -114,9 +114,18 @@ BUNDESLAENDER: dict[str, Bundesland] = { doku_base_url="https://pardok.parlament-berlin.de", drucksache_format="19/1234", dokukratie_scraper="be", + aktiv=True, anmerkung=( - "PARDOK basiert auf StarWeb-Software (portala-Frontend). Berlin bietet " - "zusätzlich Open-Data-XML unter parlament-berlin.de/dokumente/open-data." + "PARDOK = portala/eUI-Framework (gleiche Engine wie LSA-PADOKA, " + "unter /portala/ statt /portal/). Hit list arrives as production " + "HTML cards instead of LSA-style Perl Data::Dumper blocks — " + "PortalaAdapter auto-detects both formats. document_type=None " + "for BE because Berlin's ETYPF index uses different value strings " + "than LSA. Wahlprogramme zur LTW 2023 sind noch nicht indexiert " + "(Folge-Issue) — Analyse läuft daher mit Grundsatzprogramm-" + "Zitaten als Fallback. Open-Data-XML unter " + "parlament-berlin.de/dokumente/open-data ist eine alternative " + "Datenquelle, derzeit nicht verwendet." ), ), "BB": Bundesland( diff --git a/app/parlamente.py b/app/parlamente.py index 99f0c69..9cccda0 100644 --- a/app/parlamente.py +++ b/app/parlamente.py @@ -313,35 +313,82 @@ class PortalaAdapter(ParlamentAdapter): Single-Page App with Template Toolkit on the server side): - **LSA** (Sachsen-Anhalt) — PADOKA at ``padoka.landtag.sachsen-anhalt.de`` - - **BE** (Berlin) — PARDOK at ``pardok.parlament-berlin.de`` (future) + under ``/portal/`` (singular) + - **BE** (Berlin) — PARDOK at ``pardok.parlament-berlin.de`` under + ``/portala/`` (with the trailing 'a') + + Both instances share the same JSON action schema, only the base URL, + the data source ID, the application path prefix and a few minor + quirks differ — those are constructor parameters so that the same + class can serve both states (and any future portala-based parliament). The search workflow is two-stage: - 1. 
``POST /portal/browse.tt.json`` with a complex JSON ``action`` body
-       that contains an Elasticsearch-style query tree under
+    1. ``POST {base}{path}/browse.tt.json`` with a complex JSON ``action``
+       body that contains an Elasticsearch-style query tree under
        ``search.json``. The server returns a ``report_id`` plus hit count.
-    2. ``POST /portal/report.tt.html`` with ``{report_id, start, chunksize}``
-       to fetch the HTML hit list. Each hit carries a Perl Data::Dumper
-       block in a ``<pre>`` tag with the canonical metadata.
+    2. ``POST {base}{path}/report.tt.html`` with ``{report_id, start,
+       chunksize}`` to fetch the HTML hit list. Each hit carries a Perl
+       Data::Dumper block in a ``<pre>`` tag with the canonical metadata.
 
     The query body schema was reverse-engineered from
     https://github.com/okfde/dokukratie/blob/main/dokukratie/scrapers/portala.query.json
     (GPL-3.0 — only structure/selectors are reused, not Python code).
 
     Full-text search is **not** implemented in the MVP: the adapter
-    returns the most recent ``Anträge`` of the current Wahlperiode in the
-    given date window, and the search query is applied as a client-side
-    title/Urheber filter. The portala server-side full-text path requires
-    LSA-specific ``sf`` index names that are not yet known.
+    returns documents of the current Wahlperiode in the given date
+    window, and the search query is applied as a client-side
+    title/Urheber filter. The server-side full-text path requires
+    state-specific ``sf`` index names that are not yet known.
     """
 
-    bundesland = "LSA"
-    name = "Landtag von Sachsen-Anhalt (PADOKA)"
-    base_url = "https://padoka.landtag.sachsen-anhalt.de"
-    db_id = "lsa.lissh"
-    wahlperiode = 8
+    def __init__(
+        self,
+        *,
+        bundesland: str,
+        name: str,
+        base_url: str,
+        db_id: str,
+        wahlperiode: int,
+        portala_path: str = "/portal",
+        document_type: Optional[str] = "Antrag",
+        pdf_url_prefix: str = "/files/",
+        date_window_days: int = 730,
+    ) -> None:
+        """Configure a portala/eUI adapter for one specific parliament.
 
-    # Reverse-engineered "WEV*" Perl record fields used in the hit-list dumps:
+        Args:
+            bundesland: state code (e.g. ``"LSA"``, ``"BE"``).
+            name: human-readable adapter label (used in logs/UI).
+            base_url: ``https://...`` of the portal host without trailing slash.
+            db_id: data source identifier the eUI server expects in
+                ``action.sources``, e.g. ``"lsa.lissh"`` or ``"lah.lissh"``.
+            wahlperiode: current legislative period — fed into the WP
+                term of the search tree.
+            portala_path: path prefix where the portala app lives. ``/portal``
+                for LSA, ``/portala`` for Berlin.
+            document_type: optional filter applied via ETYPF/DTYPF/DART
+                terms. ``"Antrag"`` works for LSA; for instances where
+                the index uses different document_type values (e.g. Berlin),
+                pass ``None`` to drop the document_type subtree entirely
+                — the user can still filter client-side by title.
+            pdf_url_prefix: URL fragment between ``base_url`` and the
+                relative PDF path returned by the server.
+            date_window_days: how many days back ``search()`` looks by
+                default.
+        """
+        self.bundesland = bundesland
+        self.name = name
+        self.base_url = base_url.rstrip("/")
+        self.db_id = db_id
+        self.wahlperiode = wahlperiode
+        self.portala_path = "/" + portala_path.strip("/")
+        self.document_type = document_type
+        self.pdf_url_prefix = "/" + pdf_url_prefix.strip("/") + "/"
+        self.date_window_days = date_window_days
+
+    # ── LSA-style hit list (Perl Data::Dumper inside <pre> blocks) ──
+    # Reverse-engineered "WEV*" record fields:
     # WEV06.main = title
     # WEV32.5    = relative PDF path
     # WEV32.main = "Antrag   Drucksache X/YYYY ..."
@@ -353,6 +400,20 @@ class PortalaAdapter(ParlamentAdapter):
     )
     _RE_PRE_BLOCK = re.compile(r'
\$VAR1 = (.*?)
', re.DOTALL) + # ── Berlin-style hit list (production HTML cards, no Perl dump) ── + # The whole div for one record: + _RE_BE_RECORD = re.compile( + r']*class="[^"]*efxRecordRepeater[^"]*"[^>]*data-efx-rec="[^"]*"[^>]*>(.*?)(?=]*efxRecordRepeater|]*id="efxResultsEnd"||$)', + re.DOTALL, + ) + _RE_BE_TITLE = re.compile(r']*class="h5[^"]*"[^>]*>\s*([^<]+)') + _RE_BE_LINK = re.compile(r']*href="([^"]+\.pdf)"[^>]*>') + # The metadata h6 looks like: + # Antrag (Eilantrag)  Drucksache 19/3104 S. 1 bis 24 vom 31.03.2026 + _RE_BE_DRUCKSACHE = re.compile(r'Drucksache\s+(\d+/\d+)') + _RE_BE_DATUM = re.compile(r'vom\s+(\d{1,2}\.\d{1,2}\.\d{4})') + _RE_BE_DOCTYPE = re.compile(r'\s*([^<&]+?)(?: |<)') + @staticmethod def _decode_perl_hex(s: str) -> str: """Decode \\x{abcd} escape sequences from Perl Data::Dumper output.""" @@ -360,22 +421,33 @@ class PortalaAdapter(ParlamentAdapter): @staticmethod def _normalize_fraktion(urheber: str) -> list[str]: - """Map Urheber-String to canonical fraction codes.""" + """Map Urheber-String to canonical fraction codes. + + Uses regex word boundaries instead of plain substring matching so + that comma-separated lists ("CDU, SPD") and the embedded "DIE + LINKE" are matched reliably. 
+ """ u = urheber.upper() - out = [] - if "BÜNDNIS 90" in u or "GRÜNE" in u or "GRUENE" in u: + out: list[str] = [] + + def has(pattern: str) -> bool: + return re.search(pattern, u) is not None + + if has(r"\bBÜNDNIS\s*90\b") or has(r"\bGR(?:Ü|UE)NE\b"): out.append("GRÜNE") - if u.startswith("CDU") or " CDU " in u or u.endswith(" CDU"): + if has(r"\bCDU\b"): out.append("CDU") - if "SPD" in u: + if has(r"\bSPD\b"): out.append("SPD") - if "FDP" in u: + if has(r"\bFDP\b"): out.append("FDP") - if "AFD" in u: + if has(r"\bAFD\b"): out.append("AfD") - if "LINKE" in u or "DIE LINKE" in u: + if has(r"\bLINKE\b"): out.append("LINKE") - if "LANDESREGIERUNG" in u or "MINISTER" in u or "STAATSKANZLEI" in u: + if has(r"\bBSW\b"): + out.append("BSW") + if has(r"LANDESREGIERUNG|SENAT VON BERLIN|REGIERENDE[RN]?\s+BÜRGERMEISTER|MINISTER\b|STAATSKANZLEI"): out.append("Landesregierung") return out @@ -384,14 +456,93 @@ class PortalaAdapter(ParlamentAdapter): wahlperiode: int, start_date: str, end_date: str, - document_type: str = "Antrag", ) -> dict: """Build the action JSON body for browse.tt.json. - The schema is taken 1:1 from dokukratie's portala.query.json template - and only differs in the data source (lsa.lissh) and the variable - substitutions. + The schema is taken from dokukratie's portala.query.json template + and only differs in the data source and the variable substitutions. + When ``self.document_type`` is None, the ETYPF/DTYPF/DART subtree + is dropped — useful for parliaments whose ETYPF index uses + different value strings than ``"Antrag"``. 
""" + document_type = self.document_type + date_range_text = f"{start_date} THRU {end_date}" + date_term = lambda sf, num: { # noqa: E731 — local helper + "tn": "trange", "sf": sf, "op": "eq", "num": num, + "idx": 119, "l": 3, + "p1": start_date, "t1": start_date, + "p2": end_date, "t2": end_date, + "t": date_range_text, + } + + # Build the search.lines (form-state mirror) and the json tree + lines: dict = { + "2": str(wahlperiode), + "10": start_date, + "11": end_date, + "20.1": "alWEBBI", + "20.2": "alWEBBI", + "20.3": "alWEBBI", + "90.1": "AND", + "90.2": "AND", + "90.3": "AND", + } + if document_type is not None: + lines["3"] = document_type + lines["4"] = "D" + + # Top-level AND tree + top_terms: list = [ + {"tn": "term", "t": str(wahlperiode), "idx": 6, "l": 3, + "sf": "WP", "op": "eq", "num": 5}, + ] + + if document_type is not None: + top_terms.append({"tn": "or", "num": 3, "terms": [ + {"tn": "or", "num": 4, "terms": [ + {"tn": "term", "t": f'"{document_type}"', "idx": 50, + "l": 4, "sf": "ETYPF", "op": "eq", "num": 10}, + {"tn": "term", "t": f'"{document_type}"', "idx": 50, + "l": 4, "sf": "ETYP2F", "op": "eq", "num": 11}, + {"tn": "term", "t": f'"{document_type}"', "idx": 50, + "l": 4, "sf": "DTYPF", "op": "eq", "num": 12}, + {"tn": "term", "t": f'"{document_type}"', "idx": 50, + "l": 4, "sf": "DTYP2F", "op": "eq", "num": 13}, + {"tn": "term", "t": f'"{document_type}"', "idx": 50, + "l": 4, "sf": "1VTYPF", "op": "eq", "num": 14}, + ]}, + {"tn": "or", "num": 15, "terms": [ + {"tn": "term", "t": '"D"', "idx": 93, "l": 4, + "sf": "DART", "op": "eq", "num": 16}, + {"tn": "term", "t": '"D"', "idx": 93, "l": 4, + "sf": "DARTS", "op": "eq", "num": 17}, + ]}, + ]}) + + top_terms.append({"tn": "or", "num": 18, "terms": [ + {"tn": "or", "num": 19, "terms": [ + date_term("DAT", 20), + date_term("DDAT", 21), + ]}, + date_term("SDAT", 22), + ]}) + top_terms.append({"tn": "term", "t": "DOKDBE", "idx": 156, "l": 1, + "sf": "TYP", "op": "eq", "num": 23}) + + # Mirror 
the same shape into the parsed/sref display strings + if document_type is not None: + parsed = ( + f"((/WP {wahlperiode}) AND " + f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) " + f"AND (/DART,DARTS (\"D\")) AND " + f"(DAT,DDAT,SDAT= {date_range_text})) AND TYP=DOKDBE" + ) + else: + parsed = ( + f"((/WP {wahlperiode}) AND " + f"(DAT,DDAT,SDAT= {date_range_text})) AND TYP=DOKDBE" + ) + return { "action": "SearchAndDisplay", "sources": [self.db_id], @@ -403,84 +554,47 @@ class PortalaAdapter(ParlamentAdapter): "sort": "WEVSO1/D WEVSO2 WEVSO3", }, "search": { - "lines": { - "2": str(wahlperiode), - "3": document_type, - "4": "D", - "10": start_date, - "11": end_date, - "20.1": "alWEBBI", - "20.2": "alWEBBI", - "20.3": "alWEBBI", - "90.1": "AND", - "90.2": "AND", - "90.3": "AND", - }, + "lines": lines, "serverrecordname": "sr_generic1", - "parsed": ( - f"((/WP {wahlperiode}) AND " - f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) " - f"AND (/DART,DARTS (\"D\")) AND " - f"(DAT,DDAT,SDAT= {start_date} THRU {end_date})) AND TYP=DOKDBE" - ), - "sref": ( - f"((/WP {wahlperiode}) AND " - f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) " - f"AND (/DART,DARTS (\"D\")) AND " - f"(DAT,DDAT,SDAT= {start_date} THRU {end_date})) AND TYP=DOKDBE" - ), + "parsed": parsed, + "sref": parsed, "json": [{ "tn": "and", "num": 1, - "terms": [ - {"tn": "term", "t": str(wahlperiode), "idx": 6, "l": 3, - "sf": "WP", "op": "eq", "num": 5}, - {"tn": "or", "num": 3, "terms": [ - {"tn": "or", "num": 4, "terms": [ - {"tn": "term", "t": f'"{document_type}"', "idx": 50, - "l": 4, "sf": "ETYPF", "op": "eq", "num": 10}, - {"tn": "term", "t": f'"{document_type}"', "idx": 50, - "l": 4, "sf": "ETYP2F", "op": "eq", "num": 11}, - {"tn": "term", "t": f'"{document_type}"', "idx": 50, - "l": 4, "sf": "DTYPF", "op": "eq", "num": 12}, - {"tn": "term", "t": f'"{document_type}"', "idx": 50, - "l": 4, "sf": "DTYP2F", "op": "eq", "num": 13}, - {"tn": "term", "t": 
f'"{document_type}"', "idx": 50, - "l": 4, "sf": "1VTYPF", "op": "eq", "num": 14}, - ]}, - {"tn": "or", "num": 15, "terms": [ - {"tn": "term", "t": '"D"', "idx": 93, "l": 4, - "sf": "DART", "op": "eq", "num": 16}, - {"tn": "term", "t": '"D"', "idx": 93, "l": 4, - "sf": "DARTS", "op": "eq", "num": 17}, - ]}, - ]}, - {"tn": "or", "num": 18, "terms": [ - {"tn": "or", "num": 19, "terms": [ - {"tn": "trange", "sf": "DAT", "op": "eq", "num": 20, - "idx": 119, "l": 3, "p1": start_date, "t1": start_date, - "p2": end_date, "t2": end_date, - "t": f"{start_date} THRU {end_date}"}, - {"tn": "trange", "sf": "DDAT", "op": "eq", "num": 21, - "idx": 119, "l": 3, "p1": start_date, "t1": start_date, - "p2": end_date, "t2": end_date, - "t": f"{start_date} THRU {end_date}"}, - ]}, - {"tn": "trange", "sf": "SDAT", "op": "eq", "num": 22, - "idx": 119, "l": 3, "p1": start_date, "t1": start_date, - "p2": end_date, "t2": end_date, - "t": f"{start_date} THRU {end_date}"}, - ]}, - {"tn": "term", "t": "DOKDBE", "idx": 156, "l": 1, - "sf": "TYP", "op": "eq", "num": 23}, - ], + "terms": top_terms, }], }, "dataSet": "1", } + @staticmethod + def _datum_de_to_iso(datum_de: str) -> str: + """Convert DD.MM.YYYY → YYYY-MM-DD; return '' for empty input.""" + if not datum_de: + return "" + d, m, y = datum_de.split(".") + return f"{y}-{m.zfill(2)}-{d.zfill(2)}" + def _parse_hit_list_html(self, html: str, query_filter: str = "") -> list[Drucksache]: - """Extract Drucksachen from a report.tt.html response.""" + """Extract Drucksachen from a report.tt.html response. + + Two formats are supported and auto-detected: + + - **LSA-style:** the records are embedded as Perl Data::Dumper + dumps inside ``
<pre>$VAR1 = …</pre>`` blocks. WEV06 → title,
+          WEV32 → metadata + PDF path. Used by Sachsen-Anhalt's PADOKA
+          template.
+        - **Berlin-style:** standard production HTML cards with
+          ``efxRecordRepeater`` divs. Title in an ``<h5>``,
+          metadata + PDF link in an ``<h6>``. Used by
+          Berlin's PARDOK template.
+        """
+        if self._RE_PRE_BLOCK.search(html):
+            return self._parse_hit_list_dump(html, query_filter)
+        return self._parse_hit_list_cards(html, query_filter)
+
+    def _parse_hit_list_dump(self, html: str, query_filter: str) -> list[Drucksache]:
+        """Parse LSA-style ``<pre>$VAR1 = …</pre>
`` Perl-dump records.""" results: list[Drucksache] = [] for pre in self._RE_PRE_BLOCK.findall(html): m_ds = self._RE_DRUCKSACHE.search(pre) @@ -493,17 +607,11 @@ class PortalaAdapter(ParlamentAdapter): m_pdf = self._RE_PDF.search(pre) pdf_rel = m_pdf.group(1) if m_pdf else "" - pdf_url = f"{self.base_url}/files/{pdf_rel}" if pdf_rel else "" + pdf_url = f"{self.base_url}{self.pdf_url_prefix}{pdf_rel}" if pdf_rel else "" m_w32 = self._RE_URHEBER_DATUM.search(pre) urheber = self._decode_perl_hex(m_w32.group(1).strip()) if m_w32 else "" - datum_de = m_w32.group(2) if m_w32 else "" - # DD.MM.YYYY -> ISO YYYY-MM-DD - datum_iso = "" - if datum_de: - d, m, y = datum_de.split(".") - datum_iso = f"{y}-{m.zfill(2)}-{d.zfill(2)}" - + datum_iso = self._datum_de_to_iso(m_w32.group(2) if m_w32 else "") fraktionen = self._normalize_fraktion(urheber) if urheber else [] doc = Drucksache( @@ -516,7 +624,6 @@ class PortalaAdapter(ParlamentAdapter): typ="Antrag", ) - # Client-side title filter (no fulltext search server-side) if query_filter: hay = f"{title} {urheber}".lower() if not all(t in hay for t in query_filter.lower().split()): @@ -526,23 +633,100 @@ class PortalaAdapter(ParlamentAdapter): return results + def _parse_hit_list_cards(self, html: str, query_filter: str) -> list[Drucksache]: + """Parse Berlin-style ``efxRecordRepeater`` HTML-card records. + + Each card contains an ``
<h5>`` title, a metadata ``<h6>``
+        with the document type, the Drucksachen-Nummer, and the date,
+        plus a direct ``<a>`` link to the PDF on the same host.
+        """
+        results: list[Drucksache] = []
+
+        # Split the HTML on every record-div opener — easier than balancing
+        # divs with regex.
+        chunks = html.split('class="record')
+        # First chunk is the prelude, skip it
+        for chunk in chunks[1:]:
+            # Each chunk now starts at the record class attribute
+            m_t = self._RE_BE_TITLE.search(chunk)
+            title = m_t.group(1).strip() if m_t else "Ohne Titel"
+
+            m_ds = self._RE_BE_DRUCKSACHE.search(chunk)
+            if not m_ds:
+                continue
+            drucksache = m_ds.group(1)
+
+            m_pdf = self._RE_BE_LINK.search(chunk)
+            pdf_url = ""
+            if m_pdf:
+                href = m_pdf.group(1)
+                if href.startswith("http://") or href.startswith("https://"):
+                    pdf_url = href
+                elif href.startswith("/"):
+                    pdf_url = f"{self.base_url}{href}"
+                else:
+                    pdf_url = f"{self.base_url}{self.pdf_url_prefix}{href}"
+
+            m_dat = self._RE_BE_DATUM.search(chunk)
+            datum_iso = self._datum_de_to_iso(m_dat.group(1) if m_dat else "")
+
+            m_doc = self._RE_BE_DOCTYPE.search(chunk)
+            doctype_full = m_doc.group(1).strip() if m_doc else "Drucksache"
+
+            # Berlin often packs the originator(s) into the same h6 line:
+            #   "Antrag CDU, SPD" → fraktionen = ["CDU","SPD"], typ = "Antrag"
+            # Senat-Vorlagen carry no fraction, only "Vorlage zur …".
+            fraktionen = self._normalize_fraktion(doctype_full)
+            # Strip the fraction names back out of the typ string so the UI
+            # shows a clean "Antrag" / "Vorlage …" label.
+            typ = doctype_full
+            if fraktionen:
+                # Cut at the first occurrence of any party name
+                cuts = [typ.upper().find(f.upper()) for f in fraktionen]
+                cuts = [c for c in cuts if c >= 0]
+                if cuts:
+                    typ = typ[: min(cuts)].rstrip(" ,")
+
+            doc = Drucksache(
+                drucksache=drucksache,
+                title=title,
+                fraktionen=fraktionen,
+                datum=datum_iso,
+                link=pdf_url,
+                bundesland=self.bundesland,
+                typ=typ,
+            )
+
+            if query_filter:
+                # BUGFIX: was `doctype` (undefined name → NameError whenever a
+                # query filter is applied); the local variable is doctype_full.
+                hay = f"{title} {doctype_full}".lower()
+                if not all(t in hay for t in query_filter.lower().split()):
+                    continue
+
+            results.append(doc)
+
+        return results
+
     async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
-        """Search recent Anträge of the current Wahlperiode.
+        """Search recent documents of the current Wahlperiode.
 
         ``query`` is applied as a client-side title/Urheber filter; the
-        server-side query covers the last ~24 months by default.
+        server-side query covers the configured ``date_window_days``
+        (default 24 months).
         """
         from datetime import date, timedelta
 
         end = date.today()
-        start = end - timedelta(days=730)
+        start = end - timedelta(days=self.date_window_days)
 
         body = self._build_search_body(
             wahlperiode=self.wahlperiode,
             start_date=start.isoformat(),
             end_date=end.isoformat(),
-            document_type="Antrag",
         )
 
+        browse_html = f"{self.base_url}{self.portala_path}/browse.tt.html"
+        browse_json = f"{self.base_url}{self.portala_path}/browse.tt.json"
+        report_html = f"{self.base_url}{self.portala_path}/report.tt.html"
+
         async with httpx.AsyncClient(
             timeout=30,
             follow_redirects=True,
@@ -550,41 +734,41 @@ class PortalaAdapter(ParlamentAdapter):
         ) as client:
             try:
                 # Step 1: warm up cookies via the browse page
-                await client.get(f"{self.base_url}/portal/browse.tt.html")
+                await client.get(browse_html)
 
                 # Step 2: submit the search action
                 resp = await client.post(
-                    f"{self.base_url}/portal/browse.tt.json",
+                    browse_json,
                     json=body,
-                    headers={"Referer": f"{self.base_url}/portal/browse.tt.html"},
+                    headers={"Referer": browse_html},
                 )
                 if 
resp.status_code != 200: - print(f"PADOKA search HTTP {resp.status_code}") + print(f"{self.bundesland} search HTTP {resp.status_code}") return [] data = resp.json() report_id = data.get("report_id") if not report_id: - print(f"PADOKA: no report_id in response: {data}") + print(f"{self.bundesland}: no report_id in response: {data}") return [] # Step 3: fetch the HTML hit list # Take a generous chunk so client-side filter still has enough chunksize = 100 if query else limit report_resp = await client.post( - f"{self.base_url}/portal/report.tt.html", + report_html, json={"report_id": report_id, "start": 0, "chunksize": chunksize}, - headers={"Referer": f"{self.base_url}/portal/browse.tt.html"}, + headers={"Referer": browse_html}, ) if report_resp.status_code != 200: - print(f"PADOKA report HTTP {report_resp.status_code}") + print(f"{self.bundesland} report HTTP {report_resp.status_code}") return [] results = self._parse_hit_list_html(report_resp.text, query_filter=query) return results[:limit] except Exception as e: - print(f"PADOKA search error: {e}") + print(f"{self.bundesland} search error: {e}") return [] async def get_document(self, drucksache: str) -> Optional[Drucksache]: @@ -623,7 +807,7 @@ class PortalaAdapter(ParlamentAdapter): pdf.close() return text except Exception as e: - print(f"PADOKA download error for {drucksache}: {e}") + print(f"{self.bundesland} download error for {drucksache}: {e}") return None @@ -667,7 +851,31 @@ class BWAdapter(ParlamentAdapter): # Registry of adapters ADAPTERS = { "NRW": NRWAdapter(), - "LSA": PortalaAdapter(), + "LSA": PortalaAdapter( + bundesland="LSA", + name="Landtag von Sachsen-Anhalt (PADOKA)", + base_url="https://padoka.landtag.sachsen-anhalt.de", + db_id="lsa.lissh", + wahlperiode=8, + portala_path="/portal", + document_type="Antrag", + pdf_url_prefix="/files/", + ), + "BE": PortalaAdapter( + bundesland="BE", + name="Abgeordnetenhaus von Berlin (PARDOK)", + base_url="https://pardok.parlament-berlin.de", + 
db_id="lah.lissh", + wahlperiode=19, + portala_path="/portala", + # Berlin's ETYPF index uses different value strings — drop the + # document_type subtree, fall back to client-side title filter. + document_type=None, + # Tighter date window: BE has ~10x more documents than LSA, so a + # narrower window keeps the per-request payload bounded. + date_window_days=180, + pdf_url_prefix="/files/", + ), "BY": BayernAdapter(), "BW": BWAdapter(), }