# gwoe-antragspruefer/app/parlamente.py

"""Parliament search adapters for different German states."""

import httpx
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional
from bs4 import BeautifulSoup


@dataclass
class Drucksache:
    """Metadata record for one parliamentary document.

    Attributes:
        drucksache: Document number, e.g. ``"18/8125"``.
        title: Title of the document.
        fraktionen: Parliamentary groups listed as originators.
        datum: Date of the document as an ISO date string.
        link: URL of the PDF.
        bundesland: Code of the state whose parliament issued the document.
        typ: Kind of document (Antrag, Anfrage, Beschlussempfehlung, ...);
            defaults to ``"Antrag"``.
    """

    drucksache: str
    title: str
    fraktionen: list[str]
    datum: str
    link: str
    bundesland: str
    typ: str = "Antrag"


class ParlamentAdapter(ABC):
    """Abstract interface shared by all state-specific parliament adapters.

    Concrete adapters declare the two class attributes below and implement
    the three coroutine methods.
    """

    # Code of the state the adapter serves (the adapters in this module use
    # values such as "NRW" or "BY").
    bundesland: str
    # Human-readable name of the parliament.
    name: str

    @abstractmethod
    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Return documents matching ``query``; ``limit`` caps the result size."""

    @abstractmethod
    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Return the document with the given ID, or ``None`` if unavailable."""

    @abstractmethod
    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the document with the given ID and return its extracted text."""


class NRWAdapter(ParlamentAdapter):
    """Adapter for NRW Landtag (opal.landtag.nrw.de).

    ``search`` scrapes the OPAL "Einfache Suche" form. Only the first query
    term is sent to the server; all terms are then applied client-side as an
    AND filter over title, number, originators and type of each hit.
    """

    bundesland = "NRW"
    name = "Landtag Nordrhein-Westfalen"
    base_url = "https://opal.landtag.nrw.de"
    search_url = "https://opal.landtag.nrw.de/home/dokumente/dokumentensuche/parlamentsdokumente/aktuelle-dokumente.html"
    # Wahlperiode submitted with the search form.
    wahlperiode = 18

    # (needle looked up in the "Urheber:" paragraph, canonical code), listed
    # in output order. Two spellings of GRÜNE map onto the same code.
    _PARTY_CODES = (
        ("SPD", "SPD"),
        ("CDU", "CDU"),
        ("GRÜNE", "GRÜNE"),
        ("Grüne", "GRÜNE"),
        ("FDP", "FDP"),
        ("AfD", "AfD"),
    )

    def _parse_query(self, query: str) -> tuple[str, list[str], bool]:
        """
        Parse search query for AND logic and exact phrases.
        Returns: (search_term_for_api, filter_terms, is_exact)

        Only double quotes delimit phrases. Apostrophes and backslashes are
        ordinary characters, so a term like ``O'Brien`` is kept intact.

        Examples:
        - 'Klimaschutz Energie' -> ('Klimaschutz', ['klimaschutz', 'energie'], False)
        - '"Grüner Stahl"' -> ('Grüner Stahl', ['grüner stahl'], True)
        - 'Klimaschutz "erneuerbare Energie"' -> ('Klimaschutz', ['klimaschutz', 'erneuerbare energie'], False)
        """
        import shlex

        query = query.strip()

        # Check for exact phrase (entire query in quotes)
        if query.startswith('"') and query.endswith('"') and query.count('"') == 2:
            exact = query[1:-1].strip()
            return (exact, [exact.lower()], True)

        # Same tokenizer setup as shlex.split(), but restricted to double
        # quotes and with escape processing switched off: user input is plain
        # text, not shell syntax.
        lexer = shlex.shlex(query, posix=True)
        lexer.whitespace_split = True
        lexer.commenters = ''
        lexer.quotes = '"'
        lexer.escape = ''
        try:
            parts = list(lexer)
        except ValueError:
            # Unbalanced quote: drop the quote characters so that no term
            # carries a stray '"' that could never match a document.
            parts = query.replace('"', ' ').split()

        # Empty phrases ("") carry no information.
        parts = [p.strip() for p in parts if p.strip()]
        if not parts:
            return ("", [""], False)

        # Use first term for API search, all terms for filtering
        filter_terms = [p.lower() for p in parts]
        return (parts[0], filter_terms, False)

    def _matches_all_terms(self, doc: 'Drucksache', terms: list[str], is_exact: bool) -> bool:
        """Check if document matches all search terms (AND logic).

        The comparison is a case-insensitive substring test against title,
        document number, originators and type. An empty ``terms`` list
        matches every document.
        """
        if not terms:
            return True

        searchable = f"{doc.title} {doc.drucksache} {' '.join(doc.fraktionen)} {doc.typ}".lower()

        if is_exact:
            # Exact phrase must appear
            return terms[0] in searchable
        # All terms must appear (AND)
        return all(term in searchable for term in terms)

    def _resolve_form_action(self, soup) -> str:
        """Return the absolute URL the search form on the start page posts to.

        Falls back to ``search_url`` when the form or its ``action`` attribute
        is missing.
        """
        form = soup.select_one('form#docSearchByItem')
        action = form.get('action') if form is not None else None
        if not action:
            return self.search_url
        if action.startswith('/'):
            return f"{self.base_url}{action}"
        if action.startswith('http'):
            return action
        # Anything else is treated as a query string for the search page.
        return f"{self.search_url}?{action}"

    def _parse_result_item(self, item) -> Optional[Drucksache]:
        """Build a Drucksache from one result ``<li>``.

        Returns ``None`` when the item has no link to an ``MMD<wp>-<nr>.pdf``
        file.
        """
        # Extract drucksache number from first link
        num_link = item.select_one('a[href*="MMD"]')
        if num_link is None:
            return None

        href = num_link.get('href', '')
        # Extract number: MMD18-12345.pdf -> 18/12345
        match = re.search(r'MMD(\d+)-(\d+)\.pdf', href)
        if not match:
            return None

        legislatur, nummer = match.groups()
        drucksache = f"{legislatur}/{nummer}"
        pdf_url = f"https://www.landtag.nrw.de{href}" if href.startswith('/') else href

        # Extract title from the title link (class e-document-result-item__title)
        title_elem = item.select_one('a.e-document-result-item__title')
        if title_elem is not None:
            title = title_elem.get_text(strip=True)
            # Remove SVG icon text and collapse whitespace
            title = re.sub(r'\s*<svg.*', '', title)
            title = re.sub(r'\s+', ' ', title).strip()
        else:
            title = f"Drucksache {drucksache}"

        # Clean up common artifacts
        title = re.sub(r'\s*\(\s*externer Link.*?\)', '', title).strip()

        # Extract type (Antrag, Kleine Anfrage, etc.)
        typ_elem = item.select_one('.e-document-result-item__category')
        typ = typ_elem.get_text(strip=True) if typ_elem is not None else "Drucksache"

        # Extract date and convert DD.MM.YYYY to YYYY-MM-DD
        datum = ""
        time_elem = item.select_one('time')
        if time_elem is not None:
            date_match = re.match(r'(\d{2})\.(\d{2})\.(\d{4})', time_elem.get_text(strip=True))
            if date_match:
                d, m, y = date_match.groups()
                datum = f"{y}-{m}-{d}"

        # Extract Urheber (fraktionen) from the first paragraph containing "Urheber:"
        urheber_text = next(
            (p.get_text() for p in item.select('p') if 'Urheber:' in p.get_text()),
            "",
        )

        # Each party is reported once, under its canonical code ('AfD' keeps
        # its mixed case, matching the other adapters in this module).
        fraktionen: list[str] = []
        for needle, code in self._PARTY_CODES:
            if needle in urheber_text and code not in fraktionen:
                fraktionen.append(code)

        return Drucksache(
            drucksache=drucksache,
            title=title,
            fraktionen=fraktionen,
            datum=datum,
            link=pdf_url,
            bundesland="NRW",
            typ=typ,
        )

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search NRW Landtag documents via OPAL portal.

        Returns at most ``limit`` documents that pass the client-side AND
        filter. HTTP and parsing errors are printed; a failed request yields
        an empty list, an error in the middle of parsing yields the hits
        collected so far.
        """
        results: list[Drucksache] = []

        # Parse query for AND logic
        api_query, filter_terms, is_exact = self._parse_query(query)

        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            try:
                # First, get the page to establish the session. The client
                # keeps the session cookies and sends them with the POST below.
                initial = await client.get(self.search_url)
                if initial.status_code != 200:
                    print(f"NRW search initial request failed: {initial.status_code}")
                    return []

                form_action = self._resolve_form_action(
                    BeautifulSoup(initial.text, 'html.parser')
                )

                # Build form data for "Einfache Suche" (searchByItem form)
                form_data = {
                    '_eventId_sendform': '1',
                    'dokNum': api_query,  # This is the text search field
                    'formId': 'searchByItem',
                    'dokTyp': '',  # All types
                    'wp': str(self.wahlperiode),
                }

                search_resp = await client.post(
                    form_action,
                    data=form_data,
                    headers={'Content-Type': 'application/x-www-form-urlencoded'},
                )

                if search_resp.status_code != 200:
                    print(f"NRW search request failed: {search_resp.status_code}")
                    return []

                soup = BeautifulSoup(search_resp.text, 'html.parser')

                # Result items are <li> elements containing an <article>.
                # The limit counts hits that pass the filter, so every item
                # on the page is considered until enough have been found.
                for item in soup.select('li:has(article)'):
                    if len(results) >= limit:
                        break
                    try:
                        doc = self._parse_result_item(item)
                    except Exception as e:
                        print(f"Error parsing item: {e}")
                        continue

                    # Apply AND filter (all terms must match)
                    if doc is not None and self._matches_all_terms(doc, filter_terms, is_exact):
                        results.append(doc)

            except Exception as e:
                print(f"NRW search error: {e}")

        return results

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Get document metadata by drucksache ID (e.g. '18/8125').

        Only the existence of the PDF is checked (HEAD request); title,
        originators and date are not available on this path. Returns ``None``
        for a malformed ID, a non-200 answer or a failed request.
        """
        # Parse legislatur and number
        match = re.match(r"(\d+)/(\d+)", drucksache)
        if not match:
            return None

        legislatur, nummer = match.groups()
        pdf_url = f"https://www.landtag.nrw.de/portal/WWW/dokumentenarchiv/Dokument/MMD{legislatur}-{nummer}.pdf"

        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            try:
                resp = await client.head(pdf_url)
            except Exception as e:
                # Best effort: report and give up. `Exception` (rather than a
                # bare except) lets task cancellation and KeyboardInterrupt
                # propagate.
                print(f"NRW document lookup error for {drucksache}: {e}")
                return None

        if resp.status_code != 200:
            return None

        return Drucksache(
            drucksache=drucksache,
            title=f"Drucksache {drucksache}",
            fraktionen=[],
            datum="",
            link=pdf_url,
            bundesland="NRW",
        )

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download PDF and extract text.

        Returns the concatenated text of all pages, or ``None`` when the
        document is unknown, the download fails or the PDF cannot be read.
        """
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc:
            return None

        async with httpx.AsyncClient(timeout=60, follow_redirects=True) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    return None

                # The context manager closes the PDF even if extraction fails.
                with fitz.open(stream=resp.content, filetype="pdf") as pdf:
                    return "".join(page.get_text() for page in pdf)
            except Exception as e:
                print(f"Error downloading {drucksache}: {e}")
                return None


class PortalaAdapter(ParlamentAdapter):
    """Adapter for portala/eUI-based parliament documentation systems.

    Used by parliaments running the proprietary "esearch" / portala framework
    (originally developed for STAR/StarFinder backends, now wrapped in a
    Single-Page App with Template Toolkit on the server side):

    - **LSA** (Sachsen-Anhalt) — PADOKA at ``padoka.landtag.sachsen-anhalt.de``
    - **BE** (Berlin) — PARDOK at ``pardok.parlament-berlin.de`` (future)

    The search workflow is two-stage:

    1. ``POST /portal/browse.tt.json`` with a complex JSON ``action`` body
       that contains an Elasticsearch-style query tree under
       ``search.json``. The server returns a ``report_id`` plus hit count.
    2. ``POST /portal/report.tt.html`` with ``{report_id, start, chunksize}``
       to fetch the HTML hit list. Each hit carries a Perl Data::Dumper
       block in a ``<pre>`` tag with the canonical metadata.

    The query body schema was reverse-engineered from
    https://github.com/okfde/dokukratie/blob/main/dokukratie/scrapers/portala.query.json
    (GPL-3.0 — only structure/selectors are reused, not Python code).

    Full-text search is **not** implemented in the MVP: the adapter
    returns the most recent ``Anträge`` of the current Wahlperiode in the
    given date window, and the search query is applied as a client-side
    title/Urheber filter. The portala server-side full-text path requires
    LSA-specific ``sf`` index names that are not yet known.
    """

    bundesland = "LSA"
    name = "Landtag von Sachsen-Anhalt (PADOKA)"
    base_url = "https://padoka.landtag.sachsen-anhalt.de"
    db_id = "lsa.lissh"
    wahlperiode = 8

    # Reverse-engineered "WEV*" Perl record fields used in the hit-list dumps:
    # WEV06.main = title
    # WEV32.5    = relative PDF path
    # WEV32.main = "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b> ..."
    # NOTE(review): _RE_TITLE stops at the first quote character of either
    # kind, so a title containing an apostrophe or a quote would be cut
    # short — confirm against real dumps before tightening the pattern.
    _RE_TITLE = re.compile(r"'WEV06'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
    _RE_PDF = re.compile(r"'5'\s*=>\s*'([^']*\.pdf)'")
    _RE_DRUCKSACHE = re.compile(r"Drucksache\s*<b>(\d+/\d+)</b>")
    _RE_URHEBER_DATUM = re.compile(
        r"'WEV32'\s*=>\s*\[\s*\{[^}]*'main'\s*=>\s*[\"']Antrag\s+(.+?)\s+(\d{1,2}\.\d{1,2}\.\d{4})\s+Drucksache",
    )
    _RE_PRE_BLOCK = re.compile(r'<pre>\$VAR1 = (.*?)</pre>', re.DOTALL)

    @staticmethod
    def _decode_perl_hex(s: str) -> str:
        """Decode \\x{abcd} escape sequences from Perl Data::Dumper output.

        Hex digits are accepted in either case. An escape whose value is not
        a valid code point is left in the text unchanged.
        """
        def _to_char(m: "re.Match[str]") -> str:
            try:
                return chr(int(m.group(1), 16))
            except (ValueError, OverflowError):
                return m.group(0)

        return re.sub(r'\\x\{([0-9a-fA-F]+)\}', _to_char, s)

    @staticmethod
    def _normalize_fraktion(urheber: str) -> list[str]:
        """Map Urheber-String to canonical fraction codes.

        Matching is case-insensitive. Several codes are returned when the
        string names several originators.
        """
        u = urheber.upper()
        out = []
        if "BÜNDNIS 90" in u or "GRÜNE" in u or "GRUENE" in u:
            out.append("GRÜNE")
        # Whole-word match, so "CDU" is also found when followed by
        # punctuation ("Fraktion CDU, Fraktion SPD").
        if re.search(r"\bCDU\b", u):
            out.append("CDU")
        if "SPD" in u:
            out.append("SPD")
        if "FDP" in u:
            out.append("FDP")
        if "AFD" in u:
            out.append("AfD")
        if "LINKE" in u:
            out.append("LINKE")
        if "LANDESREGIERUNG" in u or "MINISTER" in u or "STAATSKANZLEI" in u:
            out.append("Landesregierung")
        return out

    @staticmethod
    def _split_terms(query_filter: str) -> list[str]:
        """Split a filter query into lower-cased terms.

        Whitespace separates terms; text in double quotes stays together as
        one phrase. With an unbalanced quote the quote characters are ignored.
        """
        import shlex

        lowered = query_filter.lower()
        # shlex.split() configuration, limited to double quotes and without
        # escape handling so apostrophes and backslashes stay literal.
        lexer = shlex.shlex(lowered, posix=True)
        lexer.whitespace_split = True
        lexer.commenters = ''
        lexer.quotes = '"'
        lexer.escape = ''
        try:
            terms = list(lexer)
        except ValueError:
            terms = lowered.replace('"', ' ').split()
        return [t.strip() for t in terms if t.strip()]

    def _build_search_body(
        self,
        wahlperiode: int,
        start_date: str,
        end_date: str,
        document_type: str = "Antrag",
    ) -> dict:
        """Build the action JSON body for browse.tt.json.

        The schema is taken 1:1 from dokukratie's portala.query.json template
        and only differs in the data source (lsa.lissh) and the variable
        substitutions.
        """
        # The same textual query is sent under both "parsed" and "sref".
        expression = (
            f"((/WP {wahlperiode}) AND "
            f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) "
            f"AND (/DART,DARTS (\"D\")) AND "
            f"(DAT,DDAT,SDAT= {start_date} THRU {end_date})) AND TYP=DOKDBE"
        )
        return {
            "action": "SearchAndDisplay",
            "sources": [self.db_id],
            "report": {
                "rhl": "main",
                "rhlmode": "add",
                "format": "generic1-full",
                "mime": "html",
                "sort": "WEVSO1/D WEVSO2 WEVSO3",
            },
            "search": {
                "lines": {
                    "2": str(wahlperiode),
                    "3": document_type,
                    "4": "D",
                    "10": start_date,
                    "11": end_date,
                    "20.1": "alWEBBI",
                    "20.2": "alWEBBI",
                    "20.3": "alWEBBI",
                    "90.1": "AND",
                    "90.2": "AND",
                    "90.3": "AND",
                },
                "serverrecordname": "sr_generic1",
                "parsed": expression,
                "sref": expression,
                "json": [{
                    "tn": "and",
                    "num": 1,
                    "terms": [
                        {"tn": "term", "t": str(wahlperiode), "idx": 6, "l": 3,
                         "sf": "WP", "op": "eq", "num": 5},
                        {"tn": "or", "num": 3, "terms": [
                            {"tn": "or", "num": 4, "terms": [
                                {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                                 "l": 4, "sf": "ETYPF", "op": "eq", "num": 10},
                                {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                                 "l": 4, "sf": "ETYP2F", "op": "eq", "num": 11},
                                {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                                 "l": 4, "sf": "DTYPF", "op": "eq", "num": 12},
                                {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                                 "l": 4, "sf": "DTYP2F", "op": "eq", "num": 13},
                                {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                                 "l": 4, "sf": "1VTYPF", "op": "eq", "num": 14},
                            ]},
                            {"tn": "or", "num": 15, "terms": [
                                {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
                                 "sf": "DART", "op": "eq", "num": 16},
                                {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
                                 "sf": "DARTS", "op": "eq", "num": 17},
                            ]},
                        ]},
                        {"tn": "or", "num": 18, "terms": [
                            {"tn": "or", "num": 19, "terms": [
                                {"tn": "trange", "sf": "DAT", "op": "eq", "num": 20,
                                 "idx": 119, "l": 3, "p1": start_date, "t1": start_date,
                                 "p2": end_date, "t2": end_date,
                                 "t": f"{start_date} THRU {end_date}"},
                                {"tn": "trange", "sf": "DDAT", "op": "eq", "num": 21,
                                 "idx": 119, "l": 3, "p1": start_date, "t1": start_date,
                                 "p2": end_date, "t2": end_date,
                                 "t": f"{start_date} THRU {end_date}"},
                            ]},
                            {"tn": "trange", "sf": "SDAT", "op": "eq", "num": 22,
                             "idx": 119, "l": 3, "p1": start_date, "t1": start_date,
                             "p2": end_date, "t2": end_date,
                             "t": f"{start_date} THRU {end_date}"},
                        ]},
                        {"tn": "term", "t": "DOKDBE", "idx": 156, "l": 1,
                         "sf": "TYP", "op": "eq", "num": 23},
                    ],
                }],
            },
            "dataSet": "1",
        }

    def _parse_hit_list_html(self, html: str, query_filter: str = "") -> list[Drucksache]:
        """Extract Drucksachen from a report.tt.html response.

        Every ``<pre>$VAR1 = ...</pre>`` dump that contains a Drucksache
        number becomes one result. When ``query_filter`` is given, only hits
        whose title or Urheber contain all of its terms are kept
        (case-insensitive; double-quoted phrases are matched as a whole).
        """
        # Parsed once; an empty list disables filtering.
        terms = self._split_terms(query_filter) if query_filter else []

        results: list[Drucksache] = []
        for pre in self._RE_PRE_BLOCK.findall(html):
            m_ds = self._RE_DRUCKSACHE.search(pre)
            if not m_ds:
                continue
            drucksache = m_ds.group(1)

            m_t = self._RE_TITLE.search(pre)
            title = self._decode_perl_hex(m_t.group(1)) if m_t else f"Drucksache {drucksache}"

            m_w32 = self._RE_URHEBER_DATUM.search(pre)
            urheber = self._decode_perl_hex(m_w32.group(1).strip()) if m_w32 else ""
            datum_de = m_w32.group(2) if m_w32 else ""

            # Client-side title filter (no fulltext search server-side)
            if terms:
                hay = f"{title} {urheber}".lower()
                if not all(t in hay for t in terms):
                    continue

            m_pdf = self._RE_PDF.search(pre)
            pdf_rel = m_pdf.group(1) if m_pdf else ""
            pdf_url = f"{self.base_url}/files/{pdf_rel}" if pdf_rel else ""

            # DD.MM.YYYY -> ISO YYYY-MM-DD (day and month may be single-digit)
            datum_iso = ""
            if datum_de:
                d, m, y = datum_de.split(".")
                datum_iso = f"{y}-{m.zfill(2)}-{d.zfill(2)}"

            results.append(Drucksache(
                drucksache=drucksache,
                title=title,
                fraktionen=self._normalize_fraktion(urheber) if urheber else [],
                datum=datum_iso,
                link=pdf_url,
                bundesland=self.bundesland,
                typ="Antrag",
            ))

        return results

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search recent Anträge of the current Wahlperiode.

        ``query`` is applied as a client-side title/Urheber filter; the
        server-side query covers the last ~24 months by default. Returns at
        most ``limit`` documents; HTTP or parsing failures are printed and
        yield an empty list.
        """
        from datetime import date, timedelta

        end = date.today()
        start = end - timedelta(days=730)
        body = self._build_search_body(
            wahlperiode=self.wahlperiode,
            start_date=start.isoformat(),
            end_date=end.isoformat(),
            document_type="Antrag",
        )

        async with httpx.AsyncClient(
            timeout=30,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                # Step 1: warm up cookies via the browse page
                await client.get(f"{self.base_url}/portal/browse.tt.html")

                # Step 2: submit the search action
                resp = await client.post(
                    f"{self.base_url}/portal/browse.tt.json",
                    json=body,
                    headers={"Referer": f"{self.base_url}/portal/browse.tt.html"},
                )
                if resp.status_code != 200:
                    print(f"PADOKA search HTTP {resp.status_code}")
                    return []

                data = resp.json()
                report_id = data.get("report_id")
                if not report_id:
                    print(f"PADOKA: no report_id in response: {data}")
                    return []

                # Step 3: fetch the HTML hit list
                # With a filter, take a generous chunk so enough hits survive
                # it — but never fewer than the caller asked for.
                chunksize = max(limit, 100) if query else limit
                report_resp = await client.post(
                    f"{self.base_url}/portal/report.tt.html",
                    json={"report_id": report_id, "start": 0, "chunksize": chunksize},
                    headers={"Referer": f"{self.base_url}/portal/browse.tt.html"},
                )
                if report_resp.status_code != 200:
                    print(f"PADOKA report HTTP {report_resp.status_code}")
                    return []

                results = self._parse_hit_list_html(report_resp.text, query_filter=query)
                return results[:limit]

            except Exception as e:
                print(f"PADOKA search error: {e}")
                return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up a single document by ID among the recent search results.

        Runs an unfiltered ``search`` for up to 200 hits and returns the one
        whose number equals ``drucksache``. Documents outside that result set
        are reported as ``None``.
        """
        # Pragmatic MVP: do a broad search and filter for the requested ID.
        # A targeted single-document fetch would require a different
        # action.search.json structure that we have not reverse-engineered yet.
        results = await self.search(query="", limit=200)
        for doc in results:
            if doc.drucksache == drucksache:
                return doc
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its text.

        Returns the concatenated text of all pages, or ``None`` when the
        document is not found, has no PDF link, or the download/extraction
        fails.
        """
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None

        async with httpx.AsyncClient(
            timeout=60,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    return None
                # The context manager closes the PDF even if extraction fails.
                with fitz.open(stream=resp.content, filetype="pdf") as pdf:
                    return "".join(page.get_text() for page in pdf)
            except Exception as e:
                print(f"PADOKA download error for {drucksache}: {e}")
                return None


class BayernAdapter(ParlamentAdapter):
    """Adapter for Bayerischer Landtag.

    Placeholder: no lookup is implemented yet, every method reports
    "nothing found".
    """

    bundesland = "BY"
    name = "Bayerischer Landtag"
    base_url = "https://www.bayern.landtag.de"

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Return an empty result list (search not implemented)."""
        # TODO: Implement Bayern search
        return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Return ``None`` (document lookup not implemented)."""
        # TODO: Implement
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Return ``None`` (text download not implemented)."""
        return None


class BWAdapter(ParlamentAdapter):
    """Adapter for Baden-Württemberg Landtag.

    Placeholder: no lookup is implemented yet, every method reports
    "nothing found".
    """

    bundesland = "BW"
    name = "Landtag Baden-Württemberg"
    base_url = "https://www.landtag-bw.de"

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Return an empty result list (search not implemented)."""
        # TODO: Implement BW search
        return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Return ``None`` (document lookup not implemented)."""
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Return ``None`` (text download not implemented)."""
        return None


# Registry mapping each state code to one shared adapter instance,
# created when the module is imported.
ADAPTERS: dict[str, ParlamentAdapter] = dict(
    NRW=NRWAdapter(),
    LSA=PortalaAdapter(),
    BY=BayernAdapter(),
    BW=BWAdapter(),
)


def get_adapter(bundesland: str) -> Optional[ParlamentAdapter]:
    """Get adapter for a bundesland.

    The code is first looked up exactly as given. If that fails, it is
    retried with surrounding whitespace removed and upper-cased, so
    ``"nrw"`` or ``" NRW "`` resolve to the same adapter as ``"NRW"``.
    Returns ``None`` when no adapter is registered for the code.
    """
    adapter = ADAPTERS.get(bundesland)
    if adapter is None and isinstance(bundesland, str):
        # Registry keys are upper-case codes; tolerate sloppy caller input.
        adapter = ADAPTERS.get(bundesland.strip().upper())
    return adapter


async def search_all(query: str, bundesland: str = "NRW", limit: int = 20) -> list[Drucksache]:
    """Run ``query`` against the parliament of a single state.

    Delegates to the adapter registered for ``bundesland``; an empty list
    is returned when there is no such adapter.
    """
    adapter = get_adapter(bundesland)
    if adapter:
        return await adapter.search(query, limit)
    return []