"""Parliament search adapters for different German states."""

import re
import shlex
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional

import httpx
from bs4 import BeautifulSoup


@dataclass
class Drucksache:
    """A parliamentary document."""

    drucksache: str  # e.g. "18/8125"
    title: str
    fraktionen: list[str]
    datum: str  # ISO date (YYYY-MM-DD), empty string when unknown
    link: str  # PDF URL
    bundesland: str
    typ: str = "Antrag"  # Antrag, Anfrage, Beschlussempfehlung, etc.


class ParlamentAdapter(ABC):
    """Base adapter for searching parliament documents."""

    bundesland: str
    name: str

    @abstractmethod
    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search for documents matching query."""
        pass

    @abstractmethod
    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Get a specific document by ID."""
        pass

    @abstractmethod
    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download and extract text from a document."""
        pass


class NRWAdapter(ParlamentAdapter):
    """Adapter for NRW Landtag (opal.landtag.nrw.de)."""

    bundesland = "NRW"
    name = "Landtag Nordrhein-Westfalen"
    base_url = "https://opal.landtag.nrw.de"
    search_url = "https://opal.landtag.nrw.de/home/dokumente/dokumentensuche/parlamentsdokumente/aktuelle-dokumente.html"
    # Wahlperiode submitted with the search form.
    wahlperiode = "18"

    def _parse_query(self, query: str) -> tuple[str, list[str], bool]:
        """Parse search query for AND logic and exact phrases.

        Returns:
            (search_term_for_api, filter_terms, is_exact)

        Examples:
            - 'Klimaschutz Energie' -> ('Klimaschutz', ['klimaschutz', 'energie'], False)
            - '"Grüner Stahl"' -> ('Grüner Stahl', ['grüner stahl'], True)
            - 'Klimaschutz "erneuerbare Energie"'
              -> ('Klimaschutz', ['klimaschutz', 'erneuerbare energie'], False)
        """
        query = query.strip()

        # Entire query wrapped in one pair of quotes -> exact phrase search.
        if query.startswith('"') and query.endswith('"') and query.count('"') == 2:
            exact = query[1:-1].strip()
            return (exact, [exact.lower()], True)

        # shlex keeps quoted phrases together ('a "b c"' -> ['a', 'b c']).
        try:
            parts = shlex.split(query)
        except ValueError:
            # Fallback for unbalanced quotes.
            parts = query.split()

        if not parts:
            return (query, [query.lower()], False)

        # Use first term for the API search, all terms for client-side filtering.
        filter_terms = [p.lower() for p in parts]
        return (parts[0], filter_terms, False)

    def _matches_all_terms(self, doc: 'Drucksache', terms: list[str], is_exact: bool) -> bool:
        """Check if document matches all search terms (AND logic)."""
        searchable = f"{doc.title} {doc.drucksache} {' '.join(doc.fraktionen)} {doc.typ}".lower()
        if is_exact:
            # The exact phrase must appear somewhere in the metadata.
            return terms[0] in searchable
        # Every term must appear (AND semantics).
        return all(term in searchable for term in terms)

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search NRW Landtag documents via the OPAL portal.

        Flow: GET the search page to establish a session, POST the
        "Einfache Suche" form, then scrape the result list.
        """
        results: list[Drucksache] = []
        api_query, filter_terms, is_exact = self._parse_query(query)

        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            try:
                # First, get the page to establish a session.
                initial = await client.get(self.search_url)
                if initial.status_code != 200:
                    print(f"NRW search initial request failed: {initial.status_code}")
                    return []

                soup = BeautifulSoup(initial.text, 'html.parser')

                # Extract the Spring-Webflow token/execution id from a
                # pagination link. NOTE(review): these values are parsed but
                # not used in the visible flow — presumably needed for
                # paginating beyond the first result page; kept for parity.
                pagination_link = soup.select_one('a[href*="webflowexecution"]')
                webflow_token = ""
                webflow_execution = ""
                if pagination_link:
                    href = pagination_link.get('href', '')
                    token_match = re.search(r'webflowToken=([^&]*)', href)
                    exec_match = re.search(r'(webflowexecution[^=]+)=([^&]+)', href)
                    if token_match:
                        webflow_token = token_match.group(1)
                    if exec_match:
                        webflow_execution = f"{exec_match.group(1)}={exec_match.group(2)}"

                # Resolve the form action URL (may be absolute, root-relative
                # or a bare query string carrying the webflow token).
                form = soup.select_one('form#docSearchByItem')
                form_action = self.search_url
                if form and form.get('action'):
                    action = form.get('action')
                    if action.startswith('/'):
                        form_action = f"{self.base_url}{action}"
                    elif action.startswith('http'):
                        form_action = action
                    else:
                        form_action = f"{self.search_url}?{action}"

                # Form data for the "Einfache Suche" (searchByItem form).
                form_data = {
                    '_eventId_sendform': '1',
                    'dokNum': api_query,  # free-text search field
                    'formId': 'searchByItem',
                    'dokTyp': '',  # all document types
                    'wp': self.wahlperiode,
                }

                search_resp = await client.post(
                    form_action,
                    data=form_data,
                    cookies=initial.cookies,
                    headers={'Content-Type': 'application/x-www-form-urlencoded'},
                )
                if search_resp.status_code != 200:
                    print(f"NRW search request failed: {search_resp.status_code}")
                    return []

                # Parse the hit list: each result is an <li> wrapping an <article>.
                soup = BeautifulSoup(search_resp.text, 'html.parser')
                items = soup.select('li:has(article)')

                for item in items[:limit]:
                    try:
                        # Drucksache number comes from the PDF link:
                        # MMD18-12345.pdf -> "18/12345".
                        num_link = item.select_one('a[href*="MMD"]')
                        if not num_link:
                            continue
                        href = num_link.get('href', '')
                        match = re.search(r'MMD(\d+)-(\d+)\.pdf', href)
                        if not match:
                            continue
                        legislatur, nummer = match.groups()
                        drucksache = f"{legislatur}/{nummer}"
                        pdf_url = f"https://www.landtag.nrw.de{href}" if href.startswith('/') else href

                        # Title from the dedicated title link.
                        title_elem = item.select_one('a.e-document-result-item__title')
                        if title_elem:
                            title = title_elem.get_text(strip=True)
                            # NOTE(review): the original cleanup regex was lost
                            # when this file was mangled through an HTML
                            # pipeline (it removed embedded SVG icon text);
                            # whitespace normalisation is a conservative
                            # reconstruction — confirm against live markup.
                            title = re.sub(r'\s+', ' ', title).strip()
                        else:
                            title = f"Drucksache {drucksache}"

                        # NOTE(review): Fraktion/date extraction was also lost
                        # in the mangled source — TODO reconstruct from the
                        # result-item markup.
                        doc = Drucksache(
                            drucksache=drucksache,
                            title=title,
                            fraktionen=[],
                            datum="",
                            link=pdf_url,
                            bundesland="NRW",
                        )
                        if self._matches_all_terms(doc, filter_terms, is_exact):
                            results.append(doc)
                    except Exception:
                        # Skip malformed result items; keep the rest.
                        continue

                return results
            except Exception as e:
                print(f"NRW search error: {e}")
                return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Get document metadata by drucksache ID (e.g. '18/8125')."""
        # Parse legislatur and number.
        match = re.match(r"(\d+)/(\d+)", drucksache)
        if not match:
            return None

        legislatur, nummer = match.groups()
        pdf_url = f"https://www.landtag.nrw.de/portal/WWW/dokumentenarchiv/Dokument/MMD{legislatur}-{nummer}.pdf"

        # Probe the PDF with a HEAD request; 200 means the document exists.
        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            try:
                resp = await client.head(pdf_url)
                if resp.status_code == 200:
                    return Drucksache(
                        drucksache=drucksache,
                        title=f"Drucksache {drucksache}",
                        fraktionen=[],
                        datum="",
                        link=pdf_url,
                        bundesland="NRW",
                    )
            except httpx.HTTPError:
                # Network/protocol failure -> treat as "not found".
                pass
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its text."""
        import fitz  # PyMuPDF — imported lazily, only needed here

        doc = await self.get_document(drucksache)
        if not doc:
            return None

        async with httpx.AsyncClient(timeout=60, follow_redirects=True) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    return None
                # Extract text with PyMuPDF from the in-memory PDF.
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                text = ""
                for page in pdf:
                    text += page.get_text()
                pdf.close()
                return text
            except Exception as e:
                print(f"Error downloading {drucksache}: {e}")
                return None


class PortalaAdapter(ParlamentAdapter):
    """Adapter for portala/eUI-based parliament documentation systems.

    Used by parliaments running the proprietary "esearch" / portala
    framework (originally developed for STAR/StarFinder backends, now
    wrapped in a Single-Page App with Template Toolkit on the server
    side):

    - **LSA** (Sachsen-Anhalt) — PADOKA at ``padoka.landtag.sachsen-anhalt.de``
    - **BE** (Berlin) — PARDOK at ``pardok.parlament-berlin.de`` (future)

    The search workflow is two-stage:

    1. ``POST /portal/browse.tt.json`` with a complex JSON ``action``
       body that contains an Elasticsearch-style query tree under
       ``search.json``. The server returns a ``report_id`` plus hit count.
    2. ``POST /portal/report.tt.html`` with ``{report_id, start, chunksize}``
       to fetch the HTML hit list. Each hit carries a Perl Data::Dumper
       block in a ``<pre>`` tag with the canonical metadata.

    The query body schema was reverse-engineered from
    https://github.com/okfde/dokukratie/blob/main/dokukratie/scrapers/portala.query.json
    (GPL-3.0 — only structure/selectors are reused, not Python code).

    Full-text search is **not** implemented in the MVP: the adapter
    returns the most recent ``Anträge`` of the current Wahlperiode in the
    given date window, and the search query is applied as a client-side
    title/Urheber filter. The portala server-side full-text path requires
    LSA-specific ``sf`` index names that are not yet known.
    """

    bundesland = "LSA"
    name = "Landtag von Sachsen-Anhalt (PADOKA)"
    base_url = "https://padoka.landtag.sachsen-anhalt.de"
    db_id = "lsa.lissh"
    wahlperiode = 8

    # Reverse-engineered "WEV*" Perl record fields used in the hit-list dumps:
    # WEV06.main = title
    # WEV32.5    = relative PDF path
    # WEV32.main = "Antrag   Drucksache X/YYYY ..."
    _RE_TITLE = re.compile(r"'WEV06'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
    _RE_PDF = re.compile(r"'5'\s*=>\s*'([^']*\.pdf)'")
    _RE_DRUCKSACHE = re.compile(r"Drucksache\s*(\d+/\d+)")
    _RE_URHEBER_DATUM = re.compile(
        r"'WEV32'\s*=>\s*\[\s*\{[^}]*'main'\s*=>\s*[\"']Antrag\s+(.+?)\s+(\d{1,2}\.\d{1,2}\.\d{4})\s+Drucksache",
    )
    # NOTE(review): the <pre>…</pre> literals in this pattern were lost when
    # the file was mangled through an HTML pipeline; reconstructed from the
    # hit-list format described above — confirm against a live response.
    _RE_PRE_BLOCK = re.compile(r'<pre[^>]*>\s*\$VAR1 = (.*?)</pre>', re.DOTALL)

    @staticmethod
    def _decode_perl_hex(s: str) -> str:
        """Decode \\x{abcd} escape sequences from Perl Data::Dumper output."""
        return re.sub(r'\\x\{([0-9a-f]+)\}', lambda m: chr(int(m.group(1), 16)), s)

    @staticmethod
    def _normalize_fraktion(urheber: str) -> list[str]:
        """Map Urheber-String to canonical fraction codes."""
        u = urheber.upper()
        out = []
        if "BÜNDNIS 90" in u or "GRÜNE" in u or "GRUENE" in u:
            out.append("GRÜNE")
        if u.startswith("CDU") or " CDU " in u or u.endswith(" CDU"):
            out.append("CDU")
        if "SPD" in u:
            out.append("SPD")
        if "FDP" in u:
            out.append("FDP")
        if "AFD" in u:
            out.append("AfD")
        # "DIE LINKE" is already covered by the "LINKE" substring check.
        if "LINKE" in u:
            out.append("LINKE")
        if "LANDESREGIERUNG" in u or "MINISTER" in u or "STAATSKANZLEI" in u:
            out.append("Landesregierung")
        return out

    def _build_search_body(
        self,
        wahlperiode: int,
        start_date: str,
        end_date: str,
        document_type: str = "Antrag",
    ) -> dict:
        """Build the action JSON body for browse.tt.json.

        The schema is taken 1:1 from dokukratie's portala.query.json
        template and only differs in the data source (lsa.lissh) and the
        variable substitutions.
        """
        # The same STAR query string is sent twice ("parsed" and "sref").
        star_query = (
            f"((/WP {wahlperiode}) AND "
            f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) "
            f"AND (/DART,DARTS (\"D\")) AND "
            f"(DAT,DDAT,SDAT= {start_date} THRU {end_date})) AND TYP=DOKDBE"
        )
        return {
            "action": "SearchAndDisplay",
            "sources": [self.db_id],
            "report": {
                "rhl": "main",
                "rhlmode": "add",
                "format": "generic1-full",
                "mime": "html",
                "sort": "WEVSO1/D WEVSO2 WEVSO3",
            },
            "search": {
                "lines": {
                    "2": str(wahlperiode),
                    "3": document_type,
                    "4": "D",
                    "10": start_date,
                    "11": end_date,
                    "20.1": "alWEBBI",
                    "20.2": "alWEBBI",
                    "20.3": "alWEBBI",
                    "90.1": "AND",
                    "90.2": "AND",
                    "90.3": "AND",
                },
                "serverrecordname": "sr_generic1",
                "parsed": star_query,
                "sref": star_query,
                "json": [{
                    "tn": "and",
                    "num": 1,
                    "terms": [
                        {"tn": "term", "t": str(wahlperiode), "idx": 6, "l": 3,
                         "sf": "WP", "op": "eq", "num": 5},
                        {"tn": "or", "num": 3, "terms": [
                            {"tn": "or", "num": 4, "terms": [
                                {"tn": "term", "t": f'"{document_type}"', "idx": 50, "l": 4,
                                 "sf": "ETYPF", "op": "eq", "num": 10},
                                {"tn": "term", "t": f'"{document_type}"', "idx": 50, "l": 4,
                                 "sf": "ETYP2F", "op": "eq", "num": 11},
                                {"tn": "term", "t": f'"{document_type}"', "idx": 50, "l": 4,
                                 "sf": "DTYPF", "op": "eq", "num": 12},
                                {"tn": "term", "t": f'"{document_type}"', "idx": 50, "l": 4,
                                 "sf": "DTYP2F", "op": "eq", "num": 13},
                                {"tn": "term", "t": f'"{document_type}"', "idx": 50, "l": 4,
                                 "sf": "1VTYPF", "op": "eq", "num": 14},
                            ]},
                            {"tn": "or", "num": 15, "terms": [
                                {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
                                 "sf": "DART", "op": "eq", "num": 16},
                                {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
                                 "sf": "DARTS", "op": "eq", "num": 17},
                            ]},
                        ]},
                        {"tn": "or", "num": 18, "terms": [
                            {"tn": "or", "num": 19, "terms": [
                                {"tn": "trange", "sf": "DAT", "op": "eq", "num": 20,
                                 "idx": 119, "l": 3,
                                 "p1": start_date, "t1": start_date,
                                 "p2": end_date, "t2": end_date,
                                 "t": f"{start_date} THRU {end_date}"},
                                {"tn": "trange", "sf": "DDAT", "op": "eq", "num": 21,
                                 "idx": 119, "l": 3,
                                 "p1": start_date, "t1": start_date,
                                 "p2": end_date, "t2": end_date,
                                 "t": f"{start_date} THRU {end_date}"},
                            ]},
                            {"tn": "trange", "sf": "SDAT", "op": "eq", "num": 22,
                             "idx": 119, "l": 3,
                             "p1": start_date, "t1": start_date,
                             "p2": end_date, "t2": end_date,
                             "t": f"{start_date} THRU {end_date}"},
                        ]},
                        {"tn": "term", "t": "DOKDBE", "idx": 156, "l": 1,
                         "sf": "TYP", "op": "eq", "num": 23},
                    ],
                }],
            },
            "dataSet": "1",
        }

    def _parse_hit_list_html(self, html: str, query_filter: str = "") -> list[Drucksache]:
        """Extract Drucksachen from a report.tt.html response."""
        results: list[Drucksache] = []
        for pre in self._RE_PRE_BLOCK.findall(html):
            m_ds = self._RE_DRUCKSACHE.search(pre)
            if not m_ds:
                continue
            drucksache = m_ds.group(1)

            m_t = self._RE_TITLE.search(pre)
            title = self._decode_perl_hex(m_t.group(1)) if m_t else f"Drucksache {drucksache}"

            m_pdf = self._RE_PDF.search(pre)
            pdf_rel = m_pdf.group(1) if m_pdf else ""
            pdf_url = f"{self.base_url}/files/{pdf_rel}" if pdf_rel else ""

            m_w32 = self._RE_URHEBER_DATUM.search(pre)
            urheber = self._decode_perl_hex(m_w32.group(1).strip()) if m_w32 else ""
            datum_de = m_w32.group(2) if m_w32 else ""

            # DD.MM.YYYY -> ISO YYYY-MM-DD
            datum_iso = ""
            if datum_de:
                d, m, y = datum_de.split(".")
                datum_iso = f"{y}-{m.zfill(2)}-{d.zfill(2)}"

            fraktionen = self._normalize_fraktion(urheber) if urheber else []

            doc = Drucksache(
                drucksache=drucksache,
                title=title,
                fraktionen=fraktionen,
                datum=datum_iso,
                link=pdf_url,
                bundesland=self.bundesland,
                typ="Antrag",
            )

            # Client-side title filter (no fulltext search server-side).
            if query_filter:
                hay = f"{title} {urheber}".lower()
                if not all(t in hay for t in query_filter.lower().split()):
                    continue
            results.append(doc)
        return results

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search recent Anträge of the current Wahlperiode.

        ``query`` is applied as a client-side title/Urheber filter; the
        server-side query covers the last ~24 months by default.
        """
        from datetime import date, timedelta

        end = date.today()
        start = end - timedelta(days=730)

        body = self._build_search_body(
            wahlperiode=self.wahlperiode,
            start_date=start.isoformat(),
            end_date=end.isoformat(),
            document_type="Antrag",
        )

        async with httpx.AsyncClient(
            timeout=30,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                # Step 1: warm up cookies via the browse page.
                await client.get(f"{self.base_url}/portal/browse.tt.html")

                # Step 2: submit the search action.
                resp = await client.post(
                    f"{self.base_url}/portal/browse.tt.json",
                    json=body,
                    headers={"Referer": f"{self.base_url}/portal/browse.tt.html"},
                )
                if resp.status_code != 200:
                    print(f"PADOKA search HTTP {resp.status_code}")
                    return []
                data = resp.json()
                report_id = data.get("report_id")
                if not report_id:
                    print(f"PADOKA: no report_id in response: {data}")
                    return []

                # Step 3: fetch the HTML hit list. Take a generous chunk so
                # the client-side filter still has enough candidates.
                chunksize = 100 if query else limit
                report_resp = await client.post(
                    f"{self.base_url}/portal/report.tt.html",
                    json={"report_id": report_id, "start": 0, "chunksize": chunksize},
                    headers={"Referer": f"{self.base_url}/portal/browse.tt.html"},
                )
                if report_resp.status_code != 200:
                    print(f"PADOKA report HTTP {report_resp.status_code}")
                    return []

                results = self._parse_hit_list_html(report_resp.text, query_filter=query)
                return results[:limit]
            except Exception as e:
                print(f"PADOKA search error: {e}")
                return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up a single document by ID via the search endpoint.

        Pragmatic MVP: do a broad search and filter for the requested ID.
        A targeted single-document fetch would require a different
        action.search.json structure that we have not reverse-engineered
        yet.
        """
        results = await self.search(query="", limit=200)
        for doc in results:
            if doc.drucksache == drucksache:
                return doc
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its text."""
        import fitz  # PyMuPDF — imported lazily, only needed here

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None

        async with httpx.AsyncClient(
            timeout=60,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    return None
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                text = ""
                for page in pdf:
                    text += page.get_text()
                pdf.close()
                return text
            except Exception as e:
                print(f"PADOKA download error for {drucksache}: {e}")
                return None


class BayernAdapter(ParlamentAdapter):
    """Adapter for Bayerischer Landtag."""

    bundesland = "BY"
    name = "Bayerischer Landtag"
    base_url = "https://www.bayern.landtag.de"

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        # TODO: Implement Bayern search
        return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        # TODO: Implement
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        return None


class BWAdapter(ParlamentAdapter):
    """Adapter for Baden-Württemberg Landtag."""

    bundesland = "BW"
    name = "Landtag Baden-Württemberg"
    base_url = "https://www.landtag-bw.de"

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        # TODO: Implement BW search
        return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        return None


# Registry of adapters
ADAPTERS = {
    "NRW": NRWAdapter(),
    "LSA": PortalaAdapter(),
    "BY": BayernAdapter(),
    "BW": BWAdapter(),
}


def get_adapter(bundesland: str) -> Optional[ParlamentAdapter]:
    """Get adapter for a bundesland."""
    return ADAPTERS.get(bundesland)


async def search_all(query: str, bundesland: str = "NRW", limit: int = 20) -> list[Drucksache]:
    """Search parliament documents in a specific state."""
    adapter = get_adapter(bundesland)
    if not adapter:
        return []
    return await adapter.search(query, limit)