gwoe-antragspruefer/app/parlamente.py

"""Parliament search adapters for different German states."""

import httpx
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional
from bs4 import BeautifulSoup


@dataclass
class Drucksache:
    """A parliamentary document."""
    drucksache: str  # e.g. "18/8125"
    title: str
    fraktionen: list[str]
    datum: str  # ISO date
    link: str  # PDF URL
    bundesland: str
    typ: str = "Antrag"  # Antrag, Anfrage, Beschlussempfehlung, etc.


class ParlamentAdapter(ABC):
    """Base adapter for searching parliament documents."""
    
    bundesland: str
    name: str
    
    @abstractmethod
    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search for documents matching query."""
        pass
    
    @abstractmethod
    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Get a specific document by ID."""
        pass
    
    @abstractmethod
    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download and extract text from a document."""
        pass


class NRWAdapter(ParlamentAdapter):
    """Adapter for NRW Landtag (opal.landtag.nrw.de)."""
    
    bundesland = "NRW"
    name = "Landtag Nordrhein-Westfalen"
    base_url = "https://opal.landtag.nrw.de"
    search_url = "https://opal.landtag.nrw.de/home/dokumente/dokumentensuche/parlamentsdokumente/aktuelle-dokumente.html"
    
    def _parse_query(self, query: str) -> tuple[str, list[str], bool]:
        """
        Parse search query for AND logic and exact phrases.
        Returns: (search_term_for_api, filter_terms, is_exact)
        
        Examples:
        - 'Klimaschutz Energie' -> ('Klimaschutz', ['klimaschutz', 'energie'], False)
        - '"Grüner Stahl"' -> ('Grüner Stahl', ['grüner stahl'], True)
        - 'Klimaschutz "erneuerbare Energie"' -> ('Klimaschutz', ['klimaschutz', 'erneuerbare energie'], False)
        """
        query = query.strip()
        
        # Check for exact phrase (entire query in quotes)
        if query.startswith('"') and query.endswith('"') and query.count('"') == 2:
            exact = query[1:-1].strip()
            return (exact, [exact.lower()], True)
        
        # Extract quoted phrases and regular terms
        import shlex
        try:
            parts = shlex.split(query)
        except ValueError:
            # Fallback for unbalanced quotes
            parts = query.split()
        
        if not parts:
            return (query, [query.lower()], False)
        
        # Use first term for API search, all terms for filtering
        filter_terms = [p.lower() for p in parts]
        return (parts[0], filter_terms, False)
    
    def _matches_all_terms(self, doc: 'Drucksache', terms: list[str], is_exact: bool) -> bool:
        """Check if document matches all search terms (AND logic)."""
        searchable = f"{doc.title} {doc.drucksache} {' '.join(doc.fraktionen)} {doc.typ}".lower()
        
        if is_exact:
            # Exact phrase must appear
            return terms[0] in searchable
        else:
            # All terms must appear (AND)
            return all(term in searchable for term in terms)
    
    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search NRW Landtag documents via OPAL portal."""
        results = []
        
        # Parse query for AND logic
        api_query, filter_terms, is_exact = self._parse_query(query)
        
        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            try:
                # First, get the page to establish session
                initial = await client.get(self.search_url)
                if initial.status_code != 200:
                    print(f"NRW search initial request failed: {initial.status_code}")
                    return []
                
                # Parse for webflow token from pagination links
                soup = BeautifulSoup(initial.text, 'html.parser')
                
                # Find a pagination link to extract the webflow token
                pagination_link = soup.select_one('a[href*="webflowexecution"]')
                webflow_token = ""
                webflow_execution = ""
                
                if pagination_link:
                    href = pagination_link.get('href', '')
                    # Extract webflowToken and webflowexecution from URL
                    token_match = re.search(r'webflowToken=([^&]*)', href)
                    exec_match = re.search(r'(webflowexecution[^=]+)=([^&]+)', href)
                    if token_match:
                        webflow_token = token_match.group(1)
                    if exec_match:
                        webflow_execution = f"{exec_match.group(1)}={exec_match.group(2)}"
                
                # Now perform the search with POST
                # Find the form action URL with webflow token
                form = soup.select_one('form#docSearchByItem')
                form_action = self.search_url
                if form and form.get('action'):
                    action = form.get('action')
                    if action.startswith('/'):
                        form_action = f"{self.base_url}{action}"
                    elif action.startswith('http'):
                        form_action = action
                    else:
                        form_action = f"{self.search_url}?{action}"
                
                # Build form data for "Einfache Suche" (searchByItem form)
                form_data = {
                    '_eventId_sendform': '1',
                    'dokNum': api_query,  # This is the text search field
                    'formId': 'searchByItem',
                    'dokTyp': '',  # All types
                    'wp': '18',  # Wahlperiode 18
                }
                
                # POST request with form data to the form action URL
                search_resp = await client.post(
                    form_action,
                    data=form_data,
                    cookies=initial.cookies,
                    headers={'Content-Type': 'application/x-www-form-urlencoded'}
                )
                
                if search_resp.status_code != 200:
                    print(f"NRW search request failed: {search_resp.status_code}")
                    return []
                
                # Parse results
                soup = BeautifulSoup(search_resp.text, 'html.parser')
                
                # Find all document result items (li elements containing articles)
                items = soup.select('li:has(article)')
                
                for item in items[:limit]:
                    try:
                        # Extract drucksache number from first link
                        num_link = item.select_one('a[href*="MMD"]')
                        if not num_link:
                            continue
                        
                        href = num_link.get('href', '')
                        # Extract number: MMD18-12345.pdf -> 18/12345
                        match = re.search(r'MMD(\d+)-(\d+)\.pdf', href)
                        if not match:
                            continue
                        
                        legislatur, nummer = match.groups()
                        drucksache = f"{legislatur}/{nummer}"
                        pdf_url = f"https://www.landtag.nrw.de{href}" if href.startswith('/') else href
                        
                        # Extract title from the title link (class e-document-result-item__title)
                        title_elem = item.select_one('a.e-document-result-item__title')
                        if title_elem:
                            # Get text content, clean it up
                            title = title_elem.get_text(strip=True)
                            # Remove SVG icon text and clean
                            title = re.sub(r'\s*<svg.*', '', title)
                            title = re.sub(r'\s+', ' ', title).strip()
                        else:
                            # Fallback: try to find any longer text
                            title = f"Drucksache {drucksache}"
                        
                        # Clean up common artifacts
                        title = re.sub(r'\s*\(\s*externer Link.*?\)', '', title).strip()
                        
                        # Extract type (Antrag, Kleine Anfrage, etc.)
                        typ_elem = item.select_one('.e-document-result-item__category')
                        typ = typ_elem.get_text(strip=True) if typ_elem else "Drucksache"
                        
                        # Extract date
                        time_elem = item.select_one('time')
                        datum = ""
                        if time_elem:
                            datum_text = time_elem.get_text(strip=True)
                            # Convert DD.MM.YYYY to YYYY-MM-DD
                            date_match = re.match(r'(\d{2})\.(\d{2})\.(\d{4})', datum_text)
                            if date_match:
                                d, m, y = date_match.groups()
                                datum = f"{y}-{m}-{d}"
                        
                        # Extract Urheber (fraktionen) - look for paragraph containing "Urheber:"
                        urheber_text = ""
                        for p in item.select('p'):
                            if 'Urheber:' in p.get_text():
                                urheber_text = p.get_text()
                                break
                        
                        fraktionen = []
                        if urheber_text:
                            # Extract party names (SPD, CDU, GRÜNE, FDP, AfD)
                            for party in ['SPD', 'CDU', 'GRÜNE', 'Grüne', 'FDP', 'AfD']:
                                if party in urheber_text:
                                    fraktionen.append(party.upper() if party.lower() != 'grüne' else 'GRÜNE')
                        
                        doc = Drucksache(
                            drucksache=drucksache,
                            title=title,
                            fraktionen=fraktionen,
                            datum=datum,
                            link=pdf_url,
                            bundesland="NRW",
                            typ=typ,
                        )
                        
                        # Apply AND filter (all terms must match)
                        if self._matches_all_terms(doc, filter_terms, is_exact):
                            results.append(doc)
                            
                    except Exception as e:
                        print(f"Error parsing item: {e}")
                        continue
                
            except Exception as e:
                print(f"NRW search error: {e}")
        
        return results
    
    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Get document metadata by drucksache ID (e.g. '18/8125')."""
        # Parse legislatur and number
        match = re.match(r"(\d+)/(\d+)", drucksache)
        if not match:
            return None
        
        legislatur, nummer = match.groups()
        pdf_url = f"https://www.landtag.nrw.de/portal/WWW/dokumentenarchiv/Dokument/MMD{legislatur}-{nummer}.pdf"
        
        # Try to fetch and extract basic info
        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            try:
                resp = await client.head(pdf_url)
                if resp.status_code == 200:
                    return Drucksache(
                        drucksache=drucksache,
                        title=f"Drucksache {drucksache}",
                        fraktionen=[],
                        datum="",
                        link=pdf_url,
                        bundesland="NRW",
                    )
            except:
                pass
        
        return None
    
    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download PDF and extract text."""
        import fitz  # PyMuPDF
        
        doc = await self.get_document(drucksache)
        if not doc:
            return None
        
        async with httpx.AsyncClient(timeout=60, follow_redirects=True) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    return None
                
                # Extract text with PyMuPDF
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                text = ""
                for page in pdf:
                    text += page.get_text()
                pdf.close()
                
                return text
            except Exception as e:
                print(f"Error downloading {drucksache}: {e}")
                return None


class PortalaAdapter(ParlamentAdapter):
    """Adapter for portala/eUI-based parliament documentation systems.

    Used by parliaments running the proprietary "esearch" / portala framework
    (originally developed for STAR/StarFinder backends, now wrapped in a
    Single-Page App with Template Toolkit on the server side):

    - **LSA** (Sachsen-Anhalt) — PADOKA at ``padoka.landtag.sachsen-anhalt.de``
      under ``/portal/`` (singular)
    - **BE** (Berlin) — PARDOK at ``pardok.parlament-berlin.de`` under
      ``/portala/`` (with the trailing 'a')

    Both instances share the same JSON action schema, only the base URL,
    the data source ID, the application path prefix and a few minor
    quirks differ — those are constructor parameters so that the same
    class can serve both states (and any future portala-based parliament).

    The search workflow is two-stage:

    1. ``POST {base}{path}/browse.tt.json`` with a complex JSON ``action``
       body that contains an Elasticsearch-style query tree under
       ``search.json``. The server returns a ``report_id`` plus hit count.
    2. ``POST {base}{path}/report.tt.html`` with ``{report_id, start,
       chunksize}`` to fetch the HTML hit list. Each hit carries a Perl
       Data::Dumper block in a ``<pre>`` tag with the canonical metadata.

    The query body schema was reverse-engineered from
    https://github.com/okfde/dokukratie/blob/main/dokukratie/scrapers/portala.query.json
    (GPL-3.0 — only structure/selectors are reused, not Python code).

    Full-text search is **not** implemented in the MVP: the adapter
    returns documents of the current Wahlperiode in the given date
    window, and the search query is applied as a client-side
    title/Urheber filter. The server-side full-text path requires
    state-specific ``sf`` index names that are not yet known.
    """

    def __init__(
        self,
        *,
        bundesland: str,
        name: str,
        base_url: str,
        db_id: str,
        wahlperiode: int,
        portala_path: str = "/portal",
        document_type: Optional[str] = "Antrag",
        pdf_url_prefix: str = "/files/",
        date_window_days: int = 730,
    ) -> None:
        """Configure a portala/eUI adapter for one specific parliament.

        Args:
            bundesland: state code (e.g. ``"LSA"``, ``"BE"``).
            name: human-readable adapter label (used in logs/UI).
            base_url: ``https://...`` of the portal host without trailing slash.
            db_id: data source identifier the eUI server expects in
                ``action.sources``, e.g. ``"lsa.lissh"`` or ``"lah.lissh"``.
            wahlperiode: current legislative period — fed into the WP
                term of the search tree.
            portala_path: path prefix where the portala app lives. ``/portal``
                for LSA, ``/portala`` for Berlin.
            document_type: optional filter applied via ETYPF/DTYPF/DART
                terms. ``"Antrag"`` works for LSA; for instances where
                the index uses different document_type values (e.g. Berlin),
                pass ``None`` to drop the document_type subtree entirely
                — the user can still filter client-side by title.
            pdf_url_prefix: URL fragment between ``base_url`` and the
                relative PDF path returned by the server.
            date_window_days: how many days back ``search()`` looks by
                default.
        """
        self.bundesland = bundesland
        self.name = name
        self.base_url = base_url.rstrip("/")
        self.db_id = db_id
        self.wahlperiode = wahlperiode
        self.portala_path = "/" + portala_path.strip("/")
        self.document_type = document_type
        self.pdf_url_prefix = "/" + pdf_url_prefix.strip("/") + "/"
        self.date_window_days = date_window_days

    # ── LSA-style hit list (Perl Data::Dumper inside <pre> blocks) ──
    # Reverse-engineered "WEV*" record fields:
    # WEV06.main = title
    # WEV32.5    = relative PDF path
    # WEV32.main = "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b> ..."
    _RE_TITLE = re.compile(r"'WEV06'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
    _RE_PDF = re.compile(r"'5'\s*=>\s*'([^']*\.pdf)'")
    _RE_DRUCKSACHE = re.compile(r"Drucksache\s*<b>(\d+/\d+)</b>")
    _RE_URHEBER_DATUM = re.compile(
        r"'WEV32'\s*=>\s*\[\s*\{[^}]*'main'\s*=>\s*[\"']Antrag\s+(.+?)\s+(\d{1,2}\.\d{1,2}\.\d{4})\s+Drucksache",
    )
    _RE_PRE_BLOCK = re.compile(r'<pre>\$VAR1 = (.*?)</pre>', re.DOTALL)

    # ── Berlin-style hit list (production HTML cards, no Perl dump) ──
    # The whole div for one record:
    _RE_BE_RECORD = re.compile(
        r'<div[^>]*class="[^"]*efxRecordRepeater[^"]*"[^>]*data-efx-rec="[^"]*"[^>]*>(.*?)(?=<div[^>]*efxRecordRepeater|<div[^>]*id="efxResultsEnd"|</main>|$)',
        re.DOTALL,
    )
    _RE_BE_TITLE = re.compile(r'<h3[^>]*class="h5[^"]*"[^>]*>\s*<span>([^<]+)</span>')
    _RE_BE_LINK = re.compile(r'<a[^>]*href="([^"]+\.pdf)"[^>]*>')
    # The metadata h6 looks like:
    #   <span class="h6">Antrag (Eilantrag)  &nbsp;<a ...>Drucksache 19/3104</a>  S. 1 bis 24 vom 31.03.2026</span>
    _RE_BE_DRUCKSACHE = re.compile(r'Drucksache\s+(\d+/\d+)')
    _RE_BE_DATUM = re.compile(r'vom\s+(\d{1,2}\.\d{1,2}\.\d{4})')
    _RE_BE_DOCTYPE = re.compile(r'<span class="h6">\s*([^<&]+?)(?:&nbsp;|<)')

    @staticmethod
    def _decode_perl_hex(s: str) -> str:
        """Decode \\x{abcd} escape sequences from Perl Data::Dumper output."""
        return re.sub(r'\\x\{([0-9a-f]+)\}', lambda m: chr(int(m.group(1), 16)), s)

    @staticmethod
    def _normalize_fraktion(urheber: str) -> list[str]:
        """Map Urheber-String to canonical fraction codes.

        Uses regex word boundaries instead of plain substring matching so
        that comma-separated lists ("CDU, SPD") and the embedded "DIE
        LINKE" are matched reliably.
        """
        u = urheber.upper()
        out: list[str] = []

        def has(pattern: str) -> bool:
            return re.search(pattern, u) is not None

        if has(r"\bBÜNDNIS\s*90\b") or has(r"\bGR(?:Ü|UE)NE\b"):
            out.append("GRÜNE")
        if has(r"\bCDU\b"):
            out.append("CDU")
        if has(r"\bSPD\b"):
            out.append("SPD")
        if has(r"\bFDP\b"):
            out.append("FDP")
        if has(r"\bAFD\b"):
            out.append("AfD")
        if has(r"\bLINKE\b"):
            out.append("LINKE")
        if has(r"\bBSW\b"):
            out.append("BSW")
        if has(r"LANDESREGIERUNG|SENAT VON BERLIN|REGIERENDE[RN]?\s+BÜRGERMEISTER|MINISTER\b|STAATSKANZLEI"):
            out.append("Landesregierung")
        return out

    def _build_search_body(
        self,
        wahlperiode: int,
        start_date: str,
        end_date: str,
    ) -> dict:
        """Build the action JSON body for browse.tt.json.

        The schema is taken from dokukratie's portala.query.json template
        and only differs in the data source and the variable substitutions.
        When ``self.document_type`` is None, the ETYPF/DTYPF/DART subtree
        is dropped — useful for parliaments whose ETYPF index uses
        different value strings than ``"Antrag"``.
        """
        document_type = self.document_type
        date_range_text = f"{start_date} THRU {end_date}"
        date_term = lambda sf, num: {  # noqa: E731 — local helper
            "tn": "trange", "sf": sf, "op": "eq", "num": num,
            "idx": 119, "l": 3,
            "p1": start_date, "t1": start_date,
            "p2": end_date, "t2": end_date,
            "t": date_range_text,
        }

        # Build the search.lines (form-state mirror) and the json tree
        lines: dict = {
            "2": str(wahlperiode),
            "10": start_date,
            "11": end_date,
            "20.1": "alWEBBI",
            "20.2": "alWEBBI",
            "20.3": "alWEBBI",
            "90.1": "AND",
            "90.2": "AND",
            "90.3": "AND",
        }
        if document_type is not None:
            lines["3"] = document_type
            lines["4"] = "D"

        # Top-level AND tree
        top_terms: list = [
            {"tn": "term", "t": str(wahlperiode), "idx": 6, "l": 3,
             "sf": "WP", "op": "eq", "num": 5},
        ]

        if document_type is not None:
            top_terms.append({"tn": "or", "num": 3, "terms": [
                {"tn": "or", "num": 4, "terms": [
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "ETYPF", "op": "eq", "num": 10},
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "ETYP2F", "op": "eq", "num": 11},
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "DTYPF", "op": "eq", "num": 12},
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "DTYP2F", "op": "eq", "num": 13},
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "1VTYPF", "op": "eq", "num": 14},
                ]},
                {"tn": "or", "num": 15, "terms": [
                    {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
                     "sf": "DART", "op": "eq", "num": 16},
                    {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
                     "sf": "DARTS", "op": "eq", "num": 17},
                ]},
            ]})

        top_terms.append({"tn": "or", "num": 18, "terms": [
            {"tn": "or", "num": 19, "terms": [
                date_term("DAT", 20),
                date_term("DDAT", 21),
            ]},
            date_term("SDAT", 22),
        ]})
        top_terms.append({"tn": "term", "t": "DOKDBE", "idx": 156, "l": 1,
                          "sf": "TYP", "op": "eq", "num": 23})

        # Mirror the same shape into the parsed/sref display strings
        if document_type is not None:
            parsed = (
                f"((/WP {wahlperiode}) AND "
                f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) "
                f"AND (/DART,DARTS (\"D\")) AND "
                f"(DAT,DDAT,SDAT= {date_range_text})) AND TYP=DOKDBE"
            )
        else:
            parsed = (
                f"((/WP {wahlperiode}) AND "
                f"(DAT,DDAT,SDAT= {date_range_text})) AND TYP=DOKDBE"
            )

        return {
            "action": "SearchAndDisplay",
            "sources": [self.db_id],
            "report": {
                "rhl": "main",
                "rhlmode": "add",
                "format": "generic1-full",
                "mime": "html",
                "sort": "WEVSO1/D WEVSO2 WEVSO3",
            },
            "search": {
                "lines": lines,
                "serverrecordname": "sr_generic1",
                "parsed": parsed,
                "sref": parsed,
                "json": [{
                    "tn": "and",
                    "num": 1,
                    "terms": top_terms,
                }],
            },
            "dataSet": "1",
        }

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        """Convert DD.MM.YYYY → YYYY-MM-DD; return '' for empty input."""
        if not datum_de:
            return ""
        d, m, y = datum_de.split(".")
        return f"{y}-{m.zfill(2)}-{d.zfill(2)}"

    def _parse_hit_list_html(self, html: str, query_filter: str = "") -> list[Drucksache]:
        """Extract Drucksachen from a report.tt.html response.

        Two formats are supported and auto-detected:

        - **LSA-style:** the records are embedded as Perl Data::Dumper
          dumps inside ``<pre>$VAR1 = …</pre>`` blocks. WEV06 → title,
          WEV32 → metadata + PDF path. Used by Sachsen-Anhalt's PADOKA
          template.
        - **Berlin-style:** standard production HTML cards with
          ``efxRecordRepeater`` divs. Title in an ``<h3 class="h5">``,
          metadata + PDF link in an ``<span class="h6">``. Used by
          Berlin's PARDOK template.
        """
        if self._RE_PRE_BLOCK.search(html):
            return self._parse_hit_list_dump(html, query_filter)
        return self._parse_hit_list_cards(html, query_filter)

    def _parse_hit_list_dump(self, html: str, query_filter: str) -> list[Drucksache]:
        """Parse LSA-style ``<pre>$VAR1 = …</pre>`` Perl-dump records."""
        results: list[Drucksache] = []
        for pre in self._RE_PRE_BLOCK.findall(html):
            m_ds = self._RE_DRUCKSACHE.search(pre)
            if not m_ds:
                continue
            drucksache = m_ds.group(1)

            m_t = self._RE_TITLE.search(pre)
            title = self._decode_perl_hex(m_t.group(1)) if m_t else f"Drucksache {drucksache}"

            m_pdf = self._RE_PDF.search(pre)
            pdf_rel = m_pdf.group(1) if m_pdf else ""
            pdf_url = f"{self.base_url}{self.pdf_url_prefix}{pdf_rel}" if pdf_rel else ""

            m_w32 = self._RE_URHEBER_DATUM.search(pre)
            urheber = self._decode_perl_hex(m_w32.group(1).strip()) if m_w32 else ""
            datum_iso = self._datum_de_to_iso(m_w32.group(2) if m_w32 else "")
            fraktionen = self._normalize_fraktion(urheber) if urheber else []

            doc = Drucksache(
                drucksache=drucksache,
                title=title,
                fraktionen=fraktionen,
                datum=datum_iso,
                link=pdf_url,
                bundesland=self.bundesland,
                typ="Antrag",
            )

            if query_filter:
                hay = f"{title} {urheber}".lower()
                if not all(t in hay for t in query_filter.lower().split()):
                    continue

            results.append(doc)

        return results

    def _parse_hit_list_cards(self, html: str, query_filter: str) -> list[Drucksache]:
        """Parse Berlin-style ``efxRecordRepeater`` HTML-card records.

        Each card contains an ``<h3>`` title, a metadata ``<span class="h6">``
        with the document type, the Drucksachen-Nummer, and the date,
        plus a direct ``<a href="…pdf">`` link to the PDF on the same host.
        """
        results: list[Drucksache] = []

        # Split the HTML on every record-div opener — easier than balancing
        # divs with regex.
        chunks = html.split('class="record')
        # First chunk is the prelude, skip it
        for chunk in chunks[1:]:
            # Each chunk now starts at the record class attribute
            m_t = self._RE_BE_TITLE.search(chunk)
            title = m_t.group(1).strip() if m_t else "Ohne Titel"

            m_ds = self._RE_BE_DRUCKSACHE.search(chunk)
            if not m_ds:
                continue
            drucksache = m_ds.group(1)

            m_pdf = self._RE_BE_LINK.search(chunk)
            pdf_url = ""
            if m_pdf:
                href = m_pdf.group(1)
                if href.startswith("http://") or href.startswith("https://"):
                    pdf_url = href
                elif href.startswith("/"):
                    pdf_url = f"{self.base_url}{href}"
                else:
                    pdf_url = f"{self.base_url}{self.pdf_url_prefix}{href}"

            m_dat = self._RE_BE_DATUM.search(chunk)
            datum_iso = self._datum_de_to_iso(m_dat.group(1) if m_dat else "")

            m_doc = self._RE_BE_DOCTYPE.search(chunk)
            doctype_full = m_doc.group(1).strip() if m_doc else "Drucksache"

            # Berlin often packs the originator(s) into the same h6 line:
            #   "Antrag  CDU, SPD" → fraktionen = ["CDU","SPD"], typ = "Antrag"
            # Senat-Vorlagen carry no fraction, only "Vorlage zur …".
            fraktionen = self._normalize_fraktion(doctype_full)
            # Strip the fraction names back out of the typ string so the UI
            # shows a clean "Antrag" / "Vorlage …" label.
            typ = doctype_full
            if fraktionen:
                # Cut at the first occurrence of any party name
                cuts = [typ.upper().find(f.upper()) for f in fraktionen]
                cuts = [c for c in cuts if c >= 0]
                if cuts:
                    typ = typ[: min(cuts)].rstrip(" ,")

            doc = Drucksache(
                drucksache=drucksache,
                title=title,
                fraktionen=fraktionen,
                datum=datum_iso,
                link=pdf_url,
                bundesland=self.bundesland,
                typ=typ,
            )

            if query_filter:
                hay = f"{title} {doctype_full}".lower()
                if not all(t in hay for t in query_filter.lower().split()):
                    continue

            results.append(doc)

        return results

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search recent documents of the current Wahlperiode.

        ``query`` is applied as a client-side title/Urheber filter; the
        server-side query covers the configured ``date_window_days``
        (default 24 months).
        """
        from datetime import date, timedelta

        end = date.today()
        start = end - timedelta(days=self.date_window_days)
        body = self._build_search_body(
            wahlperiode=self.wahlperiode,
            start_date=start.isoformat(),
            end_date=end.isoformat(),
        )

        browse_html = f"{self.base_url}{self.portala_path}/browse.tt.html"
        browse_json = f"{self.base_url}{self.portala_path}/browse.tt.json"
        report_html = f"{self.base_url}{self.portala_path}/report.tt.html"

        async with httpx.AsyncClient(
            timeout=30,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                # Step 1: warm up cookies via the browse page
                await client.get(browse_html)

                # Step 2: submit the search action
                resp = await client.post(
                    browse_json,
                    json=body,
                    headers={"Referer": browse_html},
                )
                if resp.status_code != 200:
                    print(f"{self.bundesland} search HTTP {resp.status_code}")
                    return []

                data = resp.json()
                report_id = data.get("report_id")
                if not report_id:
                    print(f"{self.bundesland}: no report_id in response: {data}")
                    return []

                # Step 3: fetch the HTML hit list
                # Take a generous chunk so client-side filter still has enough
                chunksize = 100 if query else limit
                report_resp = await client.post(
                    report_html,
                    json={"report_id": report_id, "start": 0, "chunksize": chunksize},
                    headers={"Referer": browse_html},
                )
                if report_resp.status_code != 200:
                    print(f"{self.bundesland} report HTTP {report_resp.status_code}")
                    return []

                results = self._parse_hit_list_html(report_resp.text, query_filter=query)
                return results[:limit]

            except Exception as e:
                print(f"{self.bundesland} search error: {e}")
                return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up a single document by ID via the search endpoint with a
        document_number filter."""
        # Pragmatic MVP: do a broad search and filter for the requested ID.
        # A targeted single-document fetch would require a different
        # action.search.json structure that we have not reverse-engineered yet.
        results = await self.search(query="", limit=200)
        for doc in results:
            if doc.drucksache == drucksache:
                return doc
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its text."""
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None

        async with httpx.AsyncClient(
            timeout=60,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    return None
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                text = ""
                for page in pdf:
                    text += page.get_text()
                pdf.close()
                return text
            except Exception as e:
                print(f"{self.bundesland} download error for {drucksache}: {e}")
                return None


class BayernAdapter(ParlamentAdapter):
    """Adapter for Bayerischer Landtag."""

    bundesland = "BY"
    name = "Bayerischer Landtag"
    base_url = "https://www.bayern.landtag.de"

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        # TODO: Implement Bayern search
        return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        # TODO: Implement
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        return None


class BWAdapter(ParlamentAdapter):
    """Adapter for Baden-Württemberg Landtag."""
    
    bundesland = "BW"
    name = "Landtag Baden-Württemberg"
    base_url = "https://www.landtag-bw.de"
    
    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        # TODO: Implement BW search
        return []
    
    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        return None
    
    async def download_text(self, drucksache: str) -> Optional[str]:
        return None


# Registry of adapters
ADAPTERS = {
    "NRW": NRWAdapter(),
    "LSA": PortalaAdapter(
        bundesland="LSA",
        name="Landtag von Sachsen-Anhalt (PADOKA)",
        base_url="https://padoka.landtag.sachsen-anhalt.de",
        db_id="lsa.lissh",
        wahlperiode=8,
        portala_path="/portal",
        document_type="Antrag",
        pdf_url_prefix="/files/",
    ),
    "BE": PortalaAdapter(
        bundesland="BE",
        name="Abgeordnetenhaus von Berlin (PARDOK)",
        base_url="https://pardok.parlament-berlin.de",
        db_id="lah.lissh",
        wahlperiode=19,
        portala_path="/portala",
        # Berlin's ETYPF index uses different value strings — drop the
        # document_type subtree, fall back to client-side title filter.
        document_type=None,
        # Tighter date window: BE has ~10x more documents than LSA, so a
        # narrower window keeps the per-request payload bounded.
        date_window_days=180,
        pdf_url_prefix="/files/",
    ),
    "BY": BayernAdapter(),
    "BW": BWAdapter(),
}


def get_adapter(bundesland: str) -> Optional[ParlamentAdapter]:
    """Get adapter for a bundesland."""
    return ADAPTERS.get(bundesland)


async def search_all(query: str, bundesland: str = "NRW", limit: int = 20) -> list[Drucksache]:
    """Search parliament documents in a specific state."""
    adapter = get_adapter(bundesland)
    if not adapter:
        return []
    return await adapter.search(query, limit)