diff --git a/app/bundeslaender.py b/app/bundeslaender.py index 235ad20..374a7cc 100644 --- a/app/bundeslaender.py +++ b/app/bundeslaender.py @@ -114,9 +114,18 @@ BUNDESLAENDER: dict[str, Bundesland] = { doku_base_url="https://pardok.parlament-berlin.de", drucksache_format="19/1234", dokukratie_scraper="be", + aktiv=True, anmerkung=( - "PARDOK basiert auf StarWeb-Software (portala-Frontend). Berlin bietet " - "zusätzlich Open-Data-XML unter parlament-berlin.de/dokumente/open-data." + "PARDOK = portala/eUI-Framework (gleiche Engine wie LSA-PADOKA, " + "unter /portala/ statt /portal/). Hit list arrives as production " + "HTML cards instead of LSA-style Perl Data::Dumper blocks — " + "PortalaAdapter auto-detects both formats. document_type=None " + "for BE because Berlin's ETYPF index uses different value strings " + "than LSA. Wahlprogramme zur LTW 2023 sind noch nicht indexiert " + "(Folge-Issue) — Analyse läuft daher mit Grundsatzprogramm-" + "Zitaten als Fallback. Open-Data-XML unter " + "parlament-berlin.de/dokumente/open-data ist eine alternative " + "Datenquelle, derzeit nicht verwendet." ), ), "BB": Bundesland( diff --git a/app/parlamente.py b/app/parlamente.py index 99f0c69..9cccda0 100644 --- a/app/parlamente.py +++ b/app/parlamente.py @@ -313,35 +313,82 @@ class PortalaAdapter(ParlamentAdapter): Single-Page App with Template Toolkit on the server side): - **LSA** (Sachsen-Anhalt) — PADOKA at ``padoka.landtag.sachsen-anhalt.de`` - - **BE** (Berlin) — PARDOK at ``pardok.parlament-berlin.de`` (future) + under ``/portal/`` (singular) + - **BE** (Berlin) — PARDOK at ``pardok.parlament-berlin.de`` under + ``/portala/`` (with the trailing 'a') + + Both instances share the same JSON action schema, only the base URL, + the data source ID, the application path prefix and a few minor + quirks differ — those are constructor parameters so that the same + class can serve both states (and any future portala-based parliament). The search workflow is two-stage: - 1. 
``POST /portal/browse.tt.json`` with a complex JSON ``action`` body - that contains an Elasticsearch-style query tree under + 1. ``POST {base}{path}/browse.tt.json`` with a complex JSON ``action`` + body that contains an Elasticsearch-style query tree under ``search.json``. The server returns a ``report_id`` plus hit count. - 2. ``POST /portal/report.tt.html`` with ``{report_id, start, chunksize}`` - to fetch the HTML hit list. Each hit carries a Perl Data::Dumper - block in a ``
`` tag with the canonical metadata.
+ 2. ``POST {base}{path}/report.tt.html`` with ``{report_id, start,
+ chunksize}`` to fetch the HTML hit list. Each hit carries a Perl
+ Data::Dumper block in a ```` tag with the canonical metadata.
The query body schema was reverse-engineered from
https://github.com/okfde/dokukratie/blob/main/dokukratie/scrapers/portala.query.json
(GPL-3.0 — only structure/selectors are reused, not Python code).
Full-text search is **not** implemented in the MVP: the adapter
- returns the most recent ``Anträge`` of the current Wahlperiode in the
- given date window, and the search query is applied as a client-side
- title/Urheber filter. The portala server-side full-text path requires
- LSA-specific ``sf`` index names that are not yet known.
+ returns documents of the current Wahlperiode in the given date
+ window, and the search query is applied as a client-side
+ title/Urheber filter. The server-side full-text path requires
+ state-specific ``sf`` index names that are not yet known.
"""
- bundesland = "LSA"
- name = "Landtag von Sachsen-Anhalt (PADOKA)"
- base_url = "https://padoka.landtag.sachsen-anhalt.de"
- db_id = "lsa.lissh"
- wahlperiode = 8
+ def __init__(
+ self,
+ *,
+ bundesland: str,
+ name: str,
+ base_url: str,
+ db_id: str,
+ wahlperiode: int,
+ portala_path: str = "/portal",
+ document_type: Optional[str] = "Antrag",
+ pdf_url_prefix: str = "/files/",
+ date_window_days: int = 730,
+ ) -> None:
+ """Configure a portala/eUI adapter for one specific parliament.
- # Reverse-engineered "WEV*" Perl record fields used in the hit-list dumps:
+ Args:
+ bundesland: state code (e.g. ``"LSA"``, ``"BE"``).
+ name: human-readable adapter label (used in logs/UI).
+ base_url: ``https://...`` of the portal host without trailing slash.
+ db_id: data source identifier the eUI server expects in
+ ``action.sources``, e.g. ``"lsa.lissh"`` or ``"lah.lissh"``.
+ wahlperiode: current legislative period — fed into the WP
+ term of the search tree.
+ portala_path: path prefix where the portala app lives. ``/portal``
+ for LSA, ``/portala`` for Berlin.
+ document_type: optional filter applied via ETYPF/DTYPF/DART
+ terms. ``"Antrag"`` works for LSA; for instances where
+ the index uses different document_type values (e.g. Berlin),
+ pass ``None`` to drop the document_type subtree entirely
+ — the user can still filter client-side by title.
+ pdf_url_prefix: URL fragment between ``base_url`` and the
+ relative PDF path returned by the server.
+ date_window_days: how many days back ``search()`` looks by
+ default.
+ """
+ self.bundesland = bundesland
+ self.name = name
+ self.base_url = base_url.rstrip("/")
+ self.db_id = db_id
+ self.wahlperiode = wahlperiode
+ self.portala_path = "/" + portala_path.strip("/")
+ self.document_type = document_type
+ self.pdf_url_prefix = "/" + pdf_url_prefix.strip("/") + "/"
+ self.date_window_days = date_window_days
+
+ # ── LSA-style hit list (Perl Data::Dumper inside blocks) ──
+ # Reverse-engineered "WEV*" record fields:
# WEV06.main = title
# WEV32.5 = relative PDF path
# WEV32.main = "Antrag Drucksache X/YYYY ..."
@@ -353,6 +400,20 @@ class PortalaAdapter(ParlamentAdapter):
)
_RE_PRE_BLOCK = re.compile(r'\$VAR1 = (.*?)
', re.DOTALL)
+ # ── Berlin-style hit list (production HTML cards, no Perl dump) ──
+ # The whole div for one record:
+ _RE_BE_RECORD = re.compile(
+ r']*class="[^"]*efxRecordRepeater[^"]*"[^>]*data-efx-rec="[^"]*"[^>]*>(.*?)(?=]*efxRecordRepeater|]*id="efxResultsEnd"||$)',
+ re.DOTALL,
+ )
+ _RE_BE_TITLE = re.compile(r']*class="h5[^"]*"[^>]*>\s*([^<]+)')
+ _RE_BE_LINK = re.compile(r']*href="([^"]+\.pdf)"[^>]*>')
+ # The metadata h6 looks like:
+ # Antrag (Eilantrag) Drucksache 19/3104 S. 1 bis 24 vom 31.03.2026
+ _RE_BE_DRUCKSACHE = re.compile(r'Drucksache\s+(\d+/\d+)')
+ _RE_BE_DATUM = re.compile(r'vom\s+(\d{1,2}\.\d{1,2}\.\d{4})')
+ _RE_BE_DOCTYPE = re.compile(r'\s*([^<&]+?)(?: |<)')
+
@staticmethod
def _decode_perl_hex(s: str) -> str:
"""Decode \\x{abcd} escape sequences from Perl Data::Dumper output."""
@@ -360,22 +421,33 @@ class PortalaAdapter(ParlamentAdapter):
@staticmethod
def _normalize_fraktion(urheber: str) -> list[str]:
- """Map Urheber-String to canonical fraction codes."""
+ """Map Urheber-String to canonical fraction codes.
+
+ Uses regex word boundaries instead of plain substring matching so
+ that comma-separated lists ("CDU, SPD") and the embedded "DIE
+ LINKE" are matched reliably.
+ """
u = urheber.upper()
- out = []
- if "BÜNDNIS 90" in u or "GRÜNE" in u or "GRUENE" in u:
+ out: list[str] = []
+
+ def has(pattern: str) -> bool:
+ return re.search(pattern, u) is not None
+
+        if has(r"\bBÜNDNIS\s*90\b") or has(r"\bGR(?:Ü|UE)NEN?\b"):
out.append("GRÜNE")
- if u.startswith("CDU") or " CDU " in u or u.endswith(" CDU"):
+ if has(r"\bCDU\b"):
out.append("CDU")
- if "SPD" in u:
+ if has(r"\bSPD\b"):
out.append("SPD")
- if "FDP" in u:
+ if has(r"\bFDP\b"):
out.append("FDP")
- if "AFD" in u:
+ if has(r"\bAFD\b"):
out.append("AfD")
- if "LINKE" in u or "DIE LINKE" in u:
+        if has(r"\bLINKEN?\b"):
out.append("LINKE")
- if "LANDESREGIERUNG" in u or "MINISTER" in u or "STAATSKANZLEI" in u:
+ if has(r"\bBSW\b"):
+ out.append("BSW")
+        if has(r"LANDESREGIERUNG|SENAT VON BERLIN|REGIERENDE[RN]?\s+BÜRGERMEISTER|MINISTER|STAATSKANZLEI"):
out.append("Landesregierung")
return out
@@ -384,14 +456,93 @@ class PortalaAdapter(ParlamentAdapter):
wahlperiode: int,
start_date: str,
end_date: str,
- document_type: str = "Antrag",
) -> dict:
"""Build the action JSON body for browse.tt.json.
- The schema is taken 1:1 from dokukratie's portala.query.json template
- and only differs in the data source (lsa.lissh) and the variable
- substitutions.
+ The schema is taken from dokukratie's portala.query.json template
+ and only differs in the data source and the variable substitutions.
+ When ``self.document_type`` is None, the ETYPF/DTYPF/DART subtree
+ is dropped — useful for parliaments whose ETYPF index uses
+ different value strings than ``"Antrag"``.
"""
+ document_type = self.document_type
+ date_range_text = f"{start_date} THRU {end_date}"
+ date_term = lambda sf, num: { # noqa: E731 — local helper
+ "tn": "trange", "sf": sf, "op": "eq", "num": num,
+ "idx": 119, "l": 3,
+ "p1": start_date, "t1": start_date,
+ "p2": end_date, "t2": end_date,
+ "t": date_range_text,
+ }
+
+ # Build the search.lines (form-state mirror) and the json tree
+ lines: dict = {
+ "2": str(wahlperiode),
+ "10": start_date,
+ "11": end_date,
+ "20.1": "alWEBBI",
+ "20.2": "alWEBBI",
+ "20.3": "alWEBBI",
+ "90.1": "AND",
+ "90.2": "AND",
+ "90.3": "AND",
+ }
+ if document_type is not None:
+ lines["3"] = document_type
+ lines["4"] = "D"
+
+ # Top-level AND tree
+ top_terms: list = [
+ {"tn": "term", "t": str(wahlperiode), "idx": 6, "l": 3,
+ "sf": "WP", "op": "eq", "num": 5},
+ ]
+
+ if document_type is not None:
+ top_terms.append({"tn": "or", "num": 3, "terms": [
+ {"tn": "or", "num": 4, "terms": [
+ {"tn": "term", "t": f'"{document_type}"', "idx": 50,
+ "l": 4, "sf": "ETYPF", "op": "eq", "num": 10},
+ {"tn": "term", "t": f'"{document_type}"', "idx": 50,
+ "l": 4, "sf": "ETYP2F", "op": "eq", "num": 11},
+ {"tn": "term", "t": f'"{document_type}"', "idx": 50,
+ "l": 4, "sf": "DTYPF", "op": "eq", "num": 12},
+ {"tn": "term", "t": f'"{document_type}"', "idx": 50,
+ "l": 4, "sf": "DTYP2F", "op": "eq", "num": 13},
+ {"tn": "term", "t": f'"{document_type}"', "idx": 50,
+ "l": 4, "sf": "1VTYPF", "op": "eq", "num": 14},
+ ]},
+ {"tn": "or", "num": 15, "terms": [
+ {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
+ "sf": "DART", "op": "eq", "num": 16},
+ {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
+ "sf": "DARTS", "op": "eq", "num": 17},
+ ]},
+ ]})
+
+ top_terms.append({"tn": "or", "num": 18, "terms": [
+ {"tn": "or", "num": 19, "terms": [
+ date_term("DAT", 20),
+ date_term("DDAT", 21),
+ ]},
+ date_term("SDAT", 22),
+ ]})
+ top_terms.append({"tn": "term", "t": "DOKDBE", "idx": 156, "l": 1,
+ "sf": "TYP", "op": "eq", "num": 23})
+
+ # Mirror the same shape into the parsed/sref display strings
+ if document_type is not None:
+ parsed = (
+ f"((/WP {wahlperiode}) AND "
+ f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) "
+ f"AND (/DART,DARTS (\"D\")) AND "
+ f"(DAT,DDAT,SDAT= {date_range_text})) AND TYP=DOKDBE"
+ )
+ else:
+ parsed = (
+ f"((/WP {wahlperiode}) AND "
+ f"(DAT,DDAT,SDAT= {date_range_text})) AND TYP=DOKDBE"
+ )
+
return {
"action": "SearchAndDisplay",
"sources": [self.db_id],
@@ -403,84 +554,47 @@ class PortalaAdapter(ParlamentAdapter):
"sort": "WEVSO1/D WEVSO2 WEVSO3",
},
"search": {
- "lines": {
- "2": str(wahlperiode),
- "3": document_type,
- "4": "D",
- "10": start_date,
- "11": end_date,
- "20.1": "alWEBBI",
- "20.2": "alWEBBI",
- "20.3": "alWEBBI",
- "90.1": "AND",
- "90.2": "AND",
- "90.3": "AND",
- },
+ "lines": lines,
"serverrecordname": "sr_generic1",
- "parsed": (
- f"((/WP {wahlperiode}) AND "
- f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) "
- f"AND (/DART,DARTS (\"D\")) AND "
- f"(DAT,DDAT,SDAT= {start_date} THRU {end_date})) AND TYP=DOKDBE"
- ),
- "sref": (
- f"((/WP {wahlperiode}) AND "
- f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) "
- f"AND (/DART,DARTS (\"D\")) AND "
- f"(DAT,DDAT,SDAT= {start_date} THRU {end_date})) AND TYP=DOKDBE"
- ),
+ "parsed": parsed,
+ "sref": parsed,
"json": [{
"tn": "and",
"num": 1,
- "terms": [
- {"tn": "term", "t": str(wahlperiode), "idx": 6, "l": 3,
- "sf": "WP", "op": "eq", "num": 5},
- {"tn": "or", "num": 3, "terms": [
- {"tn": "or", "num": 4, "terms": [
- {"tn": "term", "t": f'"{document_type}"', "idx": 50,
- "l": 4, "sf": "ETYPF", "op": "eq", "num": 10},
- {"tn": "term", "t": f'"{document_type}"', "idx": 50,
- "l": 4, "sf": "ETYP2F", "op": "eq", "num": 11},
- {"tn": "term", "t": f'"{document_type}"', "idx": 50,
- "l": 4, "sf": "DTYPF", "op": "eq", "num": 12},
- {"tn": "term", "t": f'"{document_type}"', "idx": 50,
- "l": 4, "sf": "DTYP2F", "op": "eq", "num": 13},
- {"tn": "term", "t": f'"{document_type}"', "idx": 50,
- "l": 4, "sf": "1VTYPF", "op": "eq", "num": 14},
- ]},
- {"tn": "or", "num": 15, "terms": [
- {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
- "sf": "DART", "op": "eq", "num": 16},
- {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
- "sf": "DARTS", "op": "eq", "num": 17},
- ]},
- ]},
- {"tn": "or", "num": 18, "terms": [
- {"tn": "or", "num": 19, "terms": [
- {"tn": "trange", "sf": "DAT", "op": "eq", "num": 20,
- "idx": 119, "l": 3, "p1": start_date, "t1": start_date,
- "p2": end_date, "t2": end_date,
- "t": f"{start_date} THRU {end_date}"},
- {"tn": "trange", "sf": "DDAT", "op": "eq", "num": 21,
- "idx": 119, "l": 3, "p1": start_date, "t1": start_date,
- "p2": end_date, "t2": end_date,
- "t": f"{start_date} THRU {end_date}"},
- ]},
- {"tn": "trange", "sf": "SDAT", "op": "eq", "num": 22,
- "idx": 119, "l": 3, "p1": start_date, "t1": start_date,
- "p2": end_date, "t2": end_date,
- "t": f"{start_date} THRU {end_date}"},
- ]},
- {"tn": "term", "t": "DOKDBE", "idx": 156, "l": 1,
- "sf": "TYP", "op": "eq", "num": 23},
- ],
+ "terms": top_terms,
}],
},
"dataSet": "1",
}
+ @staticmethod
+ def _datum_de_to_iso(datum_de: str) -> str:
+ """Convert DD.MM.YYYY → YYYY-MM-DD; return '' for empty input."""
+ if not datum_de:
+ return ""
+ d, m, y = datum_de.split(".")
+ return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
+
def _parse_hit_list_html(self, html: str, query_filter: str = "") -> list[Drucksache]:
- """Extract Drucksachen from a report.tt.html response."""
+ """Extract Drucksachen from a report.tt.html response.
+
+ Two formats are supported and auto-detected:
+
+ - **LSA-style:** the records are embedded as Perl Data::Dumper
+ dumps inside ``$VAR1 = …
`` blocks. WEV06 → title,
+ WEV32 → metadata + PDF path. Used by Sachsen-Anhalt's PADOKA
+ template.
+ - **Berlin-style:** standard production HTML cards with
+ ``efxRecordRepeater`` divs. Title in an ````,
+ metadata + PDF link in an ````. Used by
+ Berlin's PARDOK template.
+ """
+ if self._RE_PRE_BLOCK.search(html):
+ return self._parse_hit_list_dump(html, query_filter)
+ return self._parse_hit_list_cards(html, query_filter)
+
+ def _parse_hit_list_dump(self, html: str, query_filter: str) -> list[Drucksache]:
+ """Parse LSA-style ``$VAR1 = …
`` Perl-dump records."""
results: list[Drucksache] = []
for pre in self._RE_PRE_BLOCK.findall(html):
m_ds = self._RE_DRUCKSACHE.search(pre)
@@ -493,17 +607,11 @@ class PortalaAdapter(ParlamentAdapter):
m_pdf = self._RE_PDF.search(pre)
pdf_rel = m_pdf.group(1) if m_pdf else ""
- pdf_url = f"{self.base_url}/files/{pdf_rel}" if pdf_rel else ""
+ pdf_url = f"{self.base_url}{self.pdf_url_prefix}{pdf_rel}" if pdf_rel else ""
m_w32 = self._RE_URHEBER_DATUM.search(pre)
urheber = self._decode_perl_hex(m_w32.group(1).strip()) if m_w32 else ""
- datum_de = m_w32.group(2) if m_w32 else ""
- # DD.MM.YYYY -> ISO YYYY-MM-DD
- datum_iso = ""
- if datum_de:
- d, m, y = datum_de.split(".")
- datum_iso = f"{y}-{m.zfill(2)}-{d.zfill(2)}"
-
+ datum_iso = self._datum_de_to_iso(m_w32.group(2) if m_w32 else "")
fraktionen = self._normalize_fraktion(urheber) if urheber else []
doc = Drucksache(
@@ -516,7 +624,6 @@ class PortalaAdapter(ParlamentAdapter):
typ="Antrag",
)
- # Client-side title filter (no fulltext search server-side)
if query_filter:
hay = f"{title} {urheber}".lower()
if not all(t in hay for t in query_filter.lower().split()):
@@ -526,23 +633,100 @@ class PortalaAdapter(ParlamentAdapter):
return results
+ def _parse_hit_list_cards(self, html: str, query_filter: str) -> list[Drucksache]:
+ """Parse Berlin-style ``efxRecordRepeater`` HTML-card records.
+
+ Each card contains an ```` title, a metadata ````
+ with the document type, the Drucksachen-Nummer, and the date,
+ plus a direct ```` link to the PDF on the same host.
+ """
+ results: list[Drucksache] = []
+
+ # Split the HTML on every record-div opener — easier than balancing
+ # divs with regex.
+ chunks = html.split('class="record')
+ # First chunk is the prelude, skip it
+ for chunk in chunks[1:]:
+ # Each chunk now starts at the record class attribute
+ m_t = self._RE_BE_TITLE.search(chunk)
+ title = m_t.group(1).strip() if m_t else "Ohne Titel"
+
+ m_ds = self._RE_BE_DRUCKSACHE.search(chunk)
+ if not m_ds:
+ continue
+ drucksache = m_ds.group(1)
+
+ m_pdf = self._RE_BE_LINK.search(chunk)
+ pdf_url = ""
+ if m_pdf:
+ href = m_pdf.group(1)
+ if href.startswith("http://") or href.startswith("https://"):
+ pdf_url = href
+ elif href.startswith("/"):
+ pdf_url = f"{self.base_url}{href}"
+ else:
+ pdf_url = f"{self.base_url}{self.pdf_url_prefix}{href}"
+
+ m_dat = self._RE_BE_DATUM.search(chunk)
+ datum_iso = self._datum_de_to_iso(m_dat.group(1) if m_dat else "")
+
+ m_doc = self._RE_BE_DOCTYPE.search(chunk)
+ doctype_full = m_doc.group(1).strip() if m_doc else "Drucksache"
+
+ # Berlin often packs the originator(s) into the same h6 line:
+ # "Antrag CDU, SPD" → fraktionen = ["CDU","SPD"], typ = "Antrag"
+ # Senat-Vorlagen carry no fraction, only "Vorlage zur …".
+ fraktionen = self._normalize_fraktion(doctype_full)
+ # Strip the fraction names back out of the typ string so the UI
+ # shows a clean "Antrag" / "Vorlage …" label.
+ typ = doctype_full
+ if fraktionen:
+ # Cut at the first occurrence of any party name
+ cuts = [typ.upper().find(f.upper()) for f in fraktionen]
+ cuts = [c for c in cuts if c >= 0]
+ if cuts:
+ typ = typ[: min(cuts)].rstrip(" ,")
+
+ doc = Drucksache(
+ drucksache=drucksache,
+ title=title,
+ fraktionen=fraktionen,
+ datum=datum_iso,
+ link=pdf_url,
+ bundesland=self.bundesland,
+ typ=typ,
+ )
+
+ if query_filter:
+                hay = f"{title} {doctype_full}".lower()
+ if not all(t in hay for t in query_filter.lower().split()):
+ continue
+
+ results.append(doc)
+
+ return results
+
async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
- """Search recent Anträge of the current Wahlperiode.
+ """Search recent documents of the current Wahlperiode.
``query`` is applied as a client-side title/Urheber filter; the
- server-side query covers the last ~24 months by default.
+ server-side query covers the configured ``date_window_days``
+ (default 24 months).
"""
from datetime import date, timedelta
end = date.today()
- start = end - timedelta(days=730)
+ start = end - timedelta(days=self.date_window_days)
body = self._build_search_body(
wahlperiode=self.wahlperiode,
start_date=start.isoformat(),
end_date=end.isoformat(),
- document_type="Antrag",
)
+ browse_html = f"{self.base_url}{self.portala_path}/browse.tt.html"
+ browse_json = f"{self.base_url}{self.portala_path}/browse.tt.json"
+ report_html = f"{self.base_url}{self.portala_path}/report.tt.html"
+
async with httpx.AsyncClient(
timeout=30,
follow_redirects=True,
@@ -550,41 +734,41 @@ class PortalaAdapter(ParlamentAdapter):
) as client:
try:
# Step 1: warm up cookies via the browse page
- await client.get(f"{self.base_url}/portal/browse.tt.html")
+ await client.get(browse_html)
# Step 2: submit the search action
resp = await client.post(
- f"{self.base_url}/portal/browse.tt.json",
+ browse_json,
json=body,
- headers={"Referer": f"{self.base_url}/portal/browse.tt.html"},
+ headers={"Referer": browse_html},
)
if resp.status_code != 200:
- print(f"PADOKA search HTTP {resp.status_code}")
+ print(f"{self.bundesland} search HTTP {resp.status_code}")
return []
data = resp.json()
report_id = data.get("report_id")
if not report_id:
- print(f"PADOKA: no report_id in response: {data}")
+ print(f"{self.bundesland}: no report_id in response: {data}")
return []
# Step 3: fetch the HTML hit list
# Take a generous chunk so client-side filter still has enough
chunksize = 100 if query else limit
report_resp = await client.post(
- f"{self.base_url}/portal/report.tt.html",
+ report_html,
json={"report_id": report_id, "start": 0, "chunksize": chunksize},
- headers={"Referer": f"{self.base_url}/portal/browse.tt.html"},
+ headers={"Referer": browse_html},
)
if report_resp.status_code != 200:
- print(f"PADOKA report HTTP {report_resp.status_code}")
+ print(f"{self.bundesland} report HTTP {report_resp.status_code}")
return []
results = self._parse_hit_list_html(report_resp.text, query_filter=query)
return results[:limit]
except Exception as e:
- print(f"PADOKA search error: {e}")
+ print(f"{self.bundesland} search error: {e}")
return []
async def get_document(self, drucksache: str) -> Optional[Drucksache]:
@@ -623,7 +807,7 @@ class PortalaAdapter(ParlamentAdapter):
pdf.close()
return text
except Exception as e:
- print(f"PADOKA download error for {drucksache}: {e}")
+ print(f"{self.bundesland} download error for {drucksache}: {e}")
return None
@@ -667,7 +851,31 @@ class BWAdapter(ParlamentAdapter):
# Registry of adapters
ADAPTERS = {
"NRW": NRWAdapter(),
- "LSA": PortalaAdapter(),
+ "LSA": PortalaAdapter(
+ bundesland="LSA",
+ name="Landtag von Sachsen-Anhalt (PADOKA)",
+ base_url="https://padoka.landtag.sachsen-anhalt.de",
+ db_id="lsa.lissh",
+ wahlperiode=8,
+ portala_path="/portal",
+ document_type="Antrag",
+ pdf_url_prefix="/files/",
+ ),
+ "BE": PortalaAdapter(
+ bundesland="BE",
+ name="Abgeordnetenhaus von Berlin (PARDOK)",
+ base_url="https://pardok.parlament-berlin.de",
+ db_id="lah.lissh",
+ wahlperiode=19,
+ portala_path="/portala",
+ # Berlin's ETYPF index uses different value strings — drop the
+ # document_type subtree, fall back to client-side title filter.
+ document_type=None,
+ # Tighter date window: BE has ~10x more documents than LSA, so a
+ # narrower window keeps the per-request payload bounded.
+ date_window_days=180,
+ pdf_url_prefix="/files/",
+ ),
"BY": BayernAdapter(),
"BW": BWAdapter(),
}