# gwoe-antragspruefer/app/parlamente.py

"""Parliament search adapters for different German states."""

import json
import logging
import httpx
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


@dataclass
class Drucksache:
    """Metadata record describing one parliamentary document.

    Attributes:
        drucksache: document number, e.g. ``"18/8125"``.
        title: title of the document.
        fraktionen: names of the originating parliamentary groups.
        datum: date as an ISO string.
        link: URL of the PDF.
        bundesland: state code of the parliament the document belongs to.
        typ: document type (Antrag, Anfrage, Beschlussempfehlung, etc.);
            defaults to ``"Antrag"``.
    """

    drucksache: str
    title: str
    fraktionen: list[str]
    datum: str
    link: str
    bundesland: str
    typ: str = "Antrag"


class ParlamentAdapter(ABC):
    """Abstract interface shared by all parliament search adapters.

    Concrete adapters provide ``bundesland`` and ``name`` and implement the
    three coroutines declared below.
    """

    bundesland: str
    name: str

    @abstractmethod
    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Return the documents that match ``query``."""

    @abstractmethod
    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Return the document with the given ID, if the adapter finds it."""

    @abstractmethod
    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the document with the given ID and return its text."""


class NRWAdapter(ParlamentAdapter):
    """Adapter for NRW Landtag (opal.landtag.nrw.de)."""
    
    bundesland = "NRW"
    name = "Landtag Nordrhein-Westfalen"
    base_url = "https://opal.landtag.nrw.de"
    search_url = "https://opal.landtag.nrw.de/home/dokumente/dokumentensuche/parlamentsdokumente/aktuelle-dokumente.html"
    
    def _parse_query(self, query: str) -> tuple[str, list[str], bool]:
        """
        Parse search query for AND logic and exact phrases.
        Returns: (search_term_for_api, filter_terms, is_exact)
        
        Examples:
        - 'Klimaschutz Energie' -> ('Klimaschutz', ['klimaschutz', 'energie'], False)
        - '"Grüner Stahl"' -> ('Grüner Stahl', ['grüner stahl'], True)
        - 'Klimaschutz "erneuerbare Energie"' -> ('Klimaschutz', ['klimaschutz', 'erneuerbare energie'], False)
        """
        query = query.strip()
        
        # Check for exact phrase (entire query in quotes)
        if query.startswith('"') and query.endswith('"') and query.count('"') == 2:
            exact = query[1:-1].strip()
            return (exact, [exact.lower()], True)
        
        # Extract quoted phrases and regular terms
        import shlex
        try:
            parts = shlex.split(query)
        except ValueError:
            # Fallback for unbalanced quotes
            parts = query.split()
        
        if not parts:
            return (query, [query.lower()], False)
        
        # Use first term for API search, all terms for filtering
        filter_terms = [p.lower() for p in parts]
        return (parts[0], filter_terms, False)
    
    def _matches_all_terms(self, doc: 'Drucksache', terms: list[str], is_exact: bool) -> bool:
        """Check if document matches all search terms (AND logic)."""
        searchable = f"{doc.title} {doc.drucksache} {' '.join(doc.fraktionen)} {doc.typ}".lower()
        
        if is_exact:
            # Exact phrase must appear
            return terms[0] in searchable
        else:
            # All terms must appear (AND)
            return all(term in searchable for term in terms)
    
    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search NRW Landtag documents via the OPAL portal.

        Workflow: GET the search page (establishes the session and yields
        the form action URL), POST the "Einfache Suche" form with the first
        query term, then parse the result list and apply the remaining
        terms as a client-side AND filter.

        Args:
            query: user query, see ``_parse_query`` for the syntax.
            limit: maximum number of *matching* documents to return.

        Returns:
            Matching documents in result-page order. Any failure is logged
            and yields the results collected so far (possibly empty).
        """
        results: list[Drucksache] = []
        api_query, filter_terms, is_exact = self._parse_query(query)

        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            try:
                # First, get the page to establish the session.
                initial = await client.get(self.search_url)
                if initial.status_code != 200:
                    logger.error("NRW search initial request failed: %s", initial.status_code)
                    return []

                form_action = self._resolve_form_action(
                    BeautifulSoup(initial.text, 'html.parser')
                )

                # Form data for "Einfache Suche" (searchByItem form).
                form_data = {
                    '_eventId_sendform': '1',
                    'dokNum': api_query,  # This is the text search field
                    'formId': 'searchByItem',
                    'dokTyp': '',  # All types
                    'wp': '18',  # Wahlperiode 18
                }

                # The client keeps the cookies from the initial GET in its
                # own jar, so they are sent automatically with this POST.
                search_resp = await client.post(
                    form_action,
                    data=form_data,
                    headers={'Content-Type': 'application/x-www-form-urlencoded'},
                )
                if search_resp.status_code != 200:
                    logger.error("NRW search request failed: %s", search_resp.status_code)
                    return []

                soup = BeautifulSoup(search_resp.text, 'html.parser')

                # Result items are <li> elements containing an <article>.
                for item in soup.select('li:has(article)'):
                    # Apply the limit to matching documents, not to raw
                    # hits: truncating before the AND filter would starve
                    # multi-term queries.
                    if len(results) >= limit:
                        break
                    try:
                        doc = self._parse_result_item(item)
                        if doc is not None and self._matches_all_terms(doc, filter_terms, is_exact):
                            results.append(doc)
                    except Exception as exc:
                        # One malformed item must not abort the whole search.
                        logger.warning("Error parsing item: %s", exc)
                        continue

            except Exception:
                logger.exception("NRW search error")

        return results

    def _resolve_form_action(self, soup) -> str:
        """Return the absolute POST URL of the ``docSearchByItem`` form.

        Falls back to ``search_url`` when the form or its ``action``
        attribute is missing.
        """
        form = soup.select_one('form#docSearchByItem')
        action = form.get('action') if form is not None else None
        if not action:
            return self.search_url
        if action.startswith('/'):
            return f"{self.base_url}{action}"
        if action.startswith('http'):
            return action
        # Anything else is treated as a query string for the search page.
        return f"{self.search_url}?{action}"

    def _parse_result_item(self, item) -> Optional[Drucksache]:
        """Build a ``Drucksache`` from one result ``<li>``; None if it has no MMD PDF link."""
        num_link = item.select_one('a[href*="MMD"]')
        if not num_link:
            return None

        href = num_link.get('href', '')
        # Extract number: MMD18-12345.pdf -> 18/12345
        match = re.search(r'MMD(\d+)-(\d+)\.pdf', href)
        if not match:
            return None

        legislatur, nummer = match.groups()
        drucksache = f"{legislatur}/{nummer}"
        pdf_url = f"https://www.landtag.nrw.de{href}" if href.startswith('/') else href

        # Title from the title link (class e-document-result-item__title).
        title_elem = item.select_one('a.e-document-result-item__title')
        if title_elem:
            title = title_elem.get_text(strip=True)
            # Remove SVG icon text and collapse whitespace.
            title = re.sub(r'\s*<svg.*', '', title)
            title = re.sub(r'\s+', ' ', title).strip()
        else:
            title = f"Drucksache {drucksache}"

        # Clean up common artifacts.
        title = re.sub(r'\s*\(\s*externer Link.*?\)', '', title).strip()

        # Document type (Antrag, Kleine Anfrage, etc.).
        typ_elem = item.select_one('.e-document-result-item__category')
        typ = typ_elem.get_text(strip=True) if typ_elem else "Drucksache"

        # Date: convert DD.MM.YYYY to YYYY-MM-DD.
        datum = ""
        time_elem = item.select_one('time')
        if time_elem:
            date_match = re.match(r'(\d{2})\.(\d{2})\.(\d{4})', time_elem.get_text(strip=True))
            if date_match:
                d, m, y = date_match.groups()
                datum = f"{y}-{m}-{d}"

        # Originators: first paragraph containing "Urheber:".
        urheber_text = ""
        for p in item.select('p'):
            text = p.get_text()
            if 'Urheber:' in text:
                urheber_text = text
                break

        fraktionen: list[str] = []
        if urheber_text:
            for party in ['SPD', 'CDU', 'GRÜNE', 'Grüne', 'FDP', 'AfD']:
                if party in urheber_text:
                    # NOTE: labels are upper-cased, so "AfD" is reported as
                    # "AFD" — kept as-is for backward compatibility.
                    label = 'GRÜNE' if party.lower() == 'grüne' else party.upper()
                    # Both spellings of GRÜNE may occur; report it once.
                    if label not in fraktionen:
                        fraktionen.append(label)

        return Drucksache(
            drucksache=drucksache,
            title=title,
            fraktionen=fraktionen,
            datum=datum,
            link=pdf_url,
            bundesland="NRW",
            typ=typ,
        )
    
    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Get document metadata by drucksache ID (e.g. '18/8125')."""
        # Parse legislatur and number
        match = re.match(r"(\d+)/(\d+)", drucksache)
        if not match:
            return None
        
        legislatur, nummer = match.groups()
        pdf_url = f"https://www.landtag.nrw.de/portal/WWW/dokumentenarchiv/Dokument/MMD{legislatur}-{nummer}.pdf"
        
        # Try to fetch and extract basic info
        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            try:
                resp = await client.head(pdf_url)
                if resp.status_code == 200:
                    return Drucksache(
                        drucksache=drucksache,
                        title=f"Drucksache {drucksache}",
                        fraktionen=[],
                        datum="",
                        link=pdf_url,
                        bundesland="NRW",
                    )
            except:
                pass
        
        return None
    
    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download PDF and extract text."""
        import fitz  # PyMuPDF
        
        doc = await self.get_document(drucksache)
        if not doc:
            return None
        
        async with httpx.AsyncClient(timeout=60, follow_redirects=True) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    return None
                
                # Extract text with PyMuPDF
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                text = ""
                for page in pdf:
                    text += page.get_text()
                pdf.close()
                
                return text
            except Exception as e:
                print(f"Error downloading {drucksache}: {e}")
                return None


class PortalaAdapter(ParlamentAdapter):
    """Adapter for portala/eUI-based parliament documentation systems.

    Used by parliaments running the proprietary "esearch" / portala framework
    (originally developed for STAR/StarFinder backends, now wrapped in a
    Single-Page App with Template Toolkit on the server side):

    - **LSA** (Sachsen-Anhalt) — PADOKA at ``padoka.landtag.sachsen-anhalt.de``
      under ``/portal/`` (singular)
    - **BE** (Berlin) — PARDOK at ``pardok.parlament-berlin.de`` under
      ``/portala/`` (with the trailing 'a')

    Both instances share the same JSON action schema, only the base URL,
    the data source ID, the application path prefix and a few minor
    quirks differ — those are constructor parameters so that the same
    class can serve both states (and any future portala-based parliament).

    The search workflow is two-stage:

    1. ``POST {base}{path}/browse.tt.json`` with a complex JSON ``action``
       body that contains an Elasticsearch-style query tree under
       ``search.json``. The server returns a ``report_id`` plus hit count.
    2. ``POST {base}{path}/report.tt.html`` with ``{report_id, start,
       chunksize}`` to fetch the HTML hit list. Each hit carries a Perl
       Data::Dumper block in a ``<pre>`` tag with the canonical metadata.

    The query body schema was reverse-engineered from
    https://github.com/okfde/dokukratie/blob/main/dokukratie/scrapers/portala.query.json
    (GPL-3.0 — only structure/selectors are reused, not Python code).

    Full-text search is **not** implemented in the MVP: the adapter
    returns documents of the current Wahlperiode in the given date
    window, and the search query is applied as a client-side
    title/Urheber filter. The server-side full-text path requires
    state-specific ``sf`` index names that are not yet known.
    """

    def __init__(
        self,
        *,
        bundesland: str,
        name: str,
        base_url: str,
        db_id: str,
        wahlperiode: int,
        portala_path: str = "/portal",
        document_type: Optional[str] = "Antrag",
        pdf_url_prefix: str = "/files/",
        date_window_days: int = 730,
        typ_filter: Optional[str] = "DOKDBE",
        omit_date_filter: bool = False,
    ) -> None:
        """Configure a portala/eUI adapter for one specific parliament.

        Args:
            bundesland: state code (e.g. ``"LSA"``, ``"BE"``).
            name: human-readable adapter label (used in logs/UI).
            base_url: ``https://...`` of the portal host without trailing slash.
            db_id: data source identifier the eUI server expects in
                ``action.sources``, e.g. ``"lsa.lissh"`` or ``"lah.lissh"``.
            wahlperiode: current legislative period — fed into the WP
                term of the search tree.
            portala_path: path prefix where the portala app lives. ``/portal``
                for LSA, ``/portala`` for Berlin.
            document_type: optional filter applied via ETYPF/DTYPF/DART
                terms. ``"Antrag"`` works for LSA; for instances where
                the index uses different document_type values (e.g. Berlin),
                pass ``None`` to drop the document_type subtree entirely
                — the user can still filter client-side by title.
            pdf_url_prefix: URL fragment between ``base_url`` and the
                relative PDF path returned by the server.
            date_window_days: how many days back ``search()`` looks by
                default.
            typ_filter: ``TYP=<value>`` term in the parsed string and
                JSON tree. ``DOKDBE`` works for LSA/BE/BB/BW (the
                lissh-style instances). For Hessen (``hlt.lis``) and
                similar instances the value is different or absent —
                pass ``None`` to drop the term entirely.
        """
        self.bundesland = bundesland
        self.name = name
        self.base_url = base_url.rstrip("/")
        self.db_id = db_id
        self.wahlperiode = wahlperiode
        self.portala_path = "/" + portala_path.strip("/")
        self.document_type = document_type
        self.pdf_url_prefix = "/" + pdf_url_prefix.strip("/") + "/"
        self.date_window_days = date_window_days
        self.typ_filter = typ_filter
        self.omit_date_filter = omit_date_filter

    # ── LSA-style hit list (Perl Data::Dumper inside <pre> blocks) ──
    # Reverse-engineered "WEV*" record fields:
    # WEV06.main = title
    # WEV32.5    = relative PDF path
    # WEV32.main = "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b> ..."
    _RE_TITLE = re.compile(r"'WEV06'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
    _RE_PDF = re.compile(r"'5'\s*=>\s*'([^']*\.pdf)'")
    _RE_DRUCKSACHE = re.compile(r"Drucksache\s*<b>(\d+/\d+)</b>")
    _RE_URHEBER_DATUM = re.compile(
        r"'WEV32'\s*=>\s*\[\s*\{[^}]*'main'\s*=>\s*[\"']Antrag\s+(.+?)\s+(\d{1,2}\.\d{1,2}\.\d{4})\s+Drucksache",
    )
    _RE_PRE_BLOCK = re.compile(r'<pre>\$VAR1 = (.*?)</pre>', re.DOTALL)

    # ── Berlin-style hit list (production HTML cards, no Perl dump) ──
    # The whole div for one record:
    _RE_BE_RECORD = re.compile(
        r'<div[^>]*class="[^"]*efxRecordRepeater[^"]*"[^>]*data-efx-rec="[^"]*"[^>]*>(.*?)(?=<div[^>]*efxRecordRepeater|<div[^>]*id="efxResultsEnd"|</main>|$)',
        re.DOTALL,
    )
    _RE_BE_TITLE = re.compile(r'<h3[^>]*class="h5[^"]*"[^>]*>\s*<span>([^<]+)</span>')
    _RE_BE_LINK = re.compile(r'<a[^>]*href="([^"]+\.pdf)"[^>]*>')
    # The metadata h6 looks like:
    #   <span class="h6">Antrag (Eilantrag)  &nbsp;<a ...>Drucksache 19/3104</a>  S. 1 bis 24 vom 31.03.2026</span>
    _RE_BE_DRUCKSACHE = re.compile(r'Drucksache\s+(\d+/\d+)')
    # BE has "Drucksache 19/3104 S. 1 bis 24 vom 31.03.2026" — date is
    # marked by ``vom``. BB has the BE card format too but writes the
    # date BEFORE the Drucksachen-Nummer with no marker:
    # "Antrag Reinhard Simon (BSW) 17.10.2024 Drucksache 8/2 (1 S.)".
    # Try ``vom``-prefix first; fall back to the first plain date.
    _RE_BE_DATUM_VOM = re.compile(r'vom\s+(\d{1,2}\.\d{1,2}\.\d{4})')
    _RE_BE_DATUM_PLAIN = re.compile(r'(\d{1,2}\.\d{1,2}\.\d{4})')
    _RE_BE_DOCTYPE = re.compile(r'<span class="h6">\s*([^<&]+?)(?:&nbsp;|<)')

    @staticmethod
    def _decode_perl_hex(s: str) -> str:
        """Decode \\x{abcd} escape sequences from Perl Data::Dumper output."""
        return re.sub(r'\\x\{([0-9a-f]+)\}', lambda m: chr(int(m.group(1), 16)), s)

    def _normalize_fraktion(self, urheber: str) -> list[str]:
        """Delegate originator parsing to ``app.parteien.extract_fraktionen``.

        Thin shim: all of the regex logic lives centrally in that function
        (see #55). ``self.bundesland`` is passed along so that FW-family
        aliases are disambiguated correctly.
        """
        from .parteien import extract_fraktionen as resolve_fraktionen

        state = self.bundesland
        return resolve_fraktionen(urheber, bundesland=state)

    def _build_search_body(
        self,
        wahlperiode: int,
        start_date: str,
        end_date: str,
    ) -> dict:
        """Build the action JSON body for browse.tt.json.

        The schema is taken from dokukratie's portala.query.json template
        and only differs in the data source and the variable substitutions.
        When ``self.document_type`` is None, the ETYPF/DTYPF/DART subtree
        is dropped — useful for parliaments whose ETYPF index uses
        different value strings than ``"Antrag"``.
        """
        document_type = self.document_type
        date_range_text = f"{start_date} THRU {end_date}"
        date_term = lambda sf, num: {  # noqa: E731 — local helper
            "tn": "trange", "sf": sf, "op": "eq", "num": num,
            "idx": 119, "l": 3,
            "p1": start_date, "t1": start_date,
            "p2": end_date, "t2": end_date,
            "t": date_range_text,
        }

        # Build the search.lines (form-state mirror) and the json tree
        lines: dict = {
            "2": str(wahlperiode),
            "10": start_date,
            "11": end_date,
            "20.1": "alWEBBI",
            "20.2": "alWEBBI",
            "20.3": "alWEBBI",
            "90.1": "AND",
            "90.2": "AND",
            "90.3": "AND",
        }
        if document_type is not None:
            lines["3"] = document_type
            lines["4"] = "D"

        # Top-level AND tree
        top_terms: list = [
            {"tn": "term", "t": str(wahlperiode), "idx": 6, "l": 3,
             "sf": "WP", "op": "eq", "num": 5},
        ]

        if document_type is not None:
            top_terms.append({"tn": "or", "num": 3, "terms": [
                {"tn": "or", "num": 4, "terms": [
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "ETYPF", "op": "eq", "num": 10},
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "ETYP2F", "op": "eq", "num": 11},
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "DTYPF", "op": "eq", "num": 12},
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "DTYP2F", "op": "eq", "num": 13},
                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
                     "l": 4, "sf": "1VTYPF", "op": "eq", "num": 14},
                ]},
                {"tn": "or", "num": 15, "terms": [
                    {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
                     "sf": "DART", "op": "eq", "num": 16},
                    {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
                     "sf": "DARTS", "op": "eq", "num": 17},
                ]},
            ]})

        if not self.omit_date_filter:
            top_terms.append({"tn": "or", "num": 18, "terms": [
                {"tn": "or", "num": 19, "terms": [
                    date_term("DAT", 20),
                    date_term("DDAT", 21),
                ]},
                date_term("SDAT", 22),
            ]})
        if self.typ_filter is not None:
            top_terms.append({"tn": "term", "t": self.typ_filter, "idx": 156, "l": 1,
                              "sf": "TYP", "op": "eq", "num": 23})

        # Mirror the same shape into the parsed/sref display strings
        typ_clause = f" AND TYP={self.typ_filter}" if self.typ_filter is not None else ""
        date_clause = (
            f" AND (DAT,DDAT,SDAT= {date_range_text})"
            if not self.omit_date_filter else ""
        )
        if document_type is not None:
            parsed = (
                f"((/WP {wahlperiode}) AND "
                f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) "
                f"AND (/DART,DARTS (\"D\")){date_clause}){typ_clause}"
            )
        else:
            parsed = f"((/WP {wahlperiode}){date_clause}){typ_clause}"

        return {
            "action": "SearchAndDisplay",
            "sources": [self.db_id],
            "report": {
                "rhl": "main",
                "rhlmode": "add",
                "format": "generic1-full",
                "mime": "html",
                "sort": "WEVSO1/D WEVSO2 WEVSO3",
            },
            "search": {
                "lines": lines,
                "serverrecordname": "sr_generic1",
                "parsed": parsed,
                "sref": parsed,
                "json": [{
                    "tn": "and",
                    "num": 1,
                    "terms": top_terms,
                }],
            },
            "dataSet": "1",
        }

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        """Convert DD.MM.YYYY → YYYY-MM-DD; return '' for empty input."""
        if not datum_de:
            return ""
        d, m, y = datum_de.split(".")
        return f"{y}-{m.zfill(2)}-{d.zfill(2)}"

    def _parse_hit_list_html(self, html: str, query_filter: str = "") -> list[Drucksache]:
        """Extract Drucksachen from a report.tt.html response.

        Two formats are supported and auto-detected:

        - **LSA-style:** the records are embedded as Perl Data::Dumper
          dumps inside ``<pre>$VAR1 = …</pre>`` blocks. WEV06 → title,
          WEV32 → metadata + PDF path. Used by Sachsen-Anhalt's PADOKA
          template.
        - **Berlin-style:** standard production HTML cards with
          ``efxRecordRepeater`` divs. Title in an ``<h3 class="h5">``,
          metadata + PDF link in an ``<span class="h6">``. Used by
          Berlin's PARDOK template.
        """
        if self._RE_PRE_BLOCK.search(html):
            return self._parse_hit_list_dump(html, query_filter)
        return self._parse_hit_list_cards(html, query_filter)

    def _parse_hit_list_dump(self, html: str, query_filter: str) -> list[Drucksache]:
        """Parse LSA-style ``<pre>$VAR1 = …</pre>`` Perl-dump records."""
        results: list[Drucksache] = []
        for pre in self._RE_PRE_BLOCK.findall(html):
            m_ds = self._RE_DRUCKSACHE.search(pre)
            if not m_ds:
                continue
            drucksache = m_ds.group(1)

            m_t = self._RE_TITLE.search(pre)
            title = self._decode_perl_hex(m_t.group(1)) if m_t else f"Drucksache {drucksache}"

            m_pdf = self._RE_PDF.search(pre)
            pdf_rel = m_pdf.group(1) if m_pdf else ""
            pdf_url = f"{self.base_url}{self.pdf_url_prefix}{pdf_rel}" if pdf_rel else ""

            m_w32 = self._RE_URHEBER_DATUM.search(pre)
            urheber = self._decode_perl_hex(m_w32.group(1).strip()) if m_w32 else ""
            datum_iso = self._datum_de_to_iso(m_w32.group(2) if m_w32 else "")
            fraktionen = self._normalize_fraktion(urheber) if urheber else []

            doc = Drucksache(
                drucksache=drucksache,
                title=title,
                fraktionen=fraktionen,
                datum=datum_iso,
                link=pdf_url,
                bundesland=self.bundesland,
                typ="Antrag",
            )

            if query_filter:
                hay = f"{title} {urheber}".lower()
                if not all(t in hay for t in query_filter.lower().split()):
                    continue

            results.append(doc)

        return results

    def _parse_hit_list_cards(self, html: str, query_filter: str) -> list[Drucksache]:
        """Parse Berlin-style ``efxRecordRepeater`` HTML-card records.

        Each card contains an ``<h3>`` title, a metadata ``<span class="h6">``
        with the document type, the Drucksachen-Nummer, and the date,
        plus a direct ``<a href="…pdf">`` link to the PDF on the same host.
        """
        results: list[Drucksache] = []

        # Split the HTML on every record-div opener — easier than balancing
        # divs with regex.
        chunks = html.split('class="record')
        # First chunk is the prelude, skip it
        for chunk in chunks[1:]:
            # Each chunk now starts at the record class attribute
            m_t = self._RE_BE_TITLE.search(chunk)
            title = m_t.group(1).strip() if m_t else "Ohne Titel"

            m_ds = self._RE_BE_DRUCKSACHE.search(chunk)
            if not m_ds:
                continue
            drucksache = m_ds.group(1)

            m_pdf = self._RE_BE_LINK.search(chunk)
            pdf_url = ""
            if m_pdf:
                href = m_pdf.group(1)
                if href.startswith("http://") or href.startswith("https://"):
                    pdf_url = href
                elif href.startswith("/"):
                    pdf_url = f"{self.base_url}{href}"
                else:
                    pdf_url = f"{self.base_url}{self.pdf_url_prefix}{href}"

            m_dat = self._RE_BE_DATUM_VOM.search(chunk) or self._RE_BE_DATUM_PLAIN.search(chunk)
            datum_iso = self._datum_de_to_iso(m_dat.group(1) if m_dat else "")

            m_doc = self._RE_BE_DOCTYPE.search(chunk)
            doctype_full = m_doc.group(1).strip() if m_doc else "Drucksache"

            # Berlin often packs the originator(s) into the same h6 line:
            #   "Antrag  CDU, SPD" → fraktionen = ["CDU","SPD"], typ = "Antrag"
            # Senat-Vorlagen carry no fraction, only "Vorlage zur …".
            fraktionen = self._normalize_fraktion(doctype_full)
            # Strip the fraction names back out of the typ string so the UI
            # shows a clean "Antrag" / "Vorlage …" label.
            typ = doctype_full
            if fraktionen:
                # Cut at the first occurrence of any party name
                cuts = [typ.upper().find(f.upper()) for f in fraktionen]
                cuts = [c for c in cuts if c >= 0]
                if cuts:
                    typ = typ[: min(cuts)].rstrip(" ,")

            doc = Drucksache(
                drucksache=drucksache,
                title=title,
                fraktionen=fraktionen,
                datum=datum_iso,
                link=pdf_url,
                bundesland=self.bundesland,
                typ=typ,
            )

            if query_filter:
                hay = f"{title} {doctype_full}".lower()
                if not all(t in hay for t in query_filter.lower().split()):
                    continue

            results.append(doc)

        return results

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search recent documents of the current Wahlperiode.

        The server-side query covers the last ``date_window_days`` days
        (default 24 months); ``query`` is only applied afterwards as a
        client-side title/Urheber filter. Results are further restricted
        to documents whose type contains "antrag" and cut to ``limit``.
        HTTP errors and exceptions are logged and yield an empty list.
        """
        from datetime import date, timedelta

        today = date.today()
        window_start = today - timedelta(days=self.date_window_days)
        payload = self._build_search_body(
            wahlperiode=self.wahlperiode,
            start_date=window_start.isoformat(),
            end_date=today.isoformat(),
        )

        app_root = f"{self.base_url}{self.portala_path}"
        browse_html = f"{app_root}/browse.tt.html"
        browse_json = f"{app_root}/browse.tt.json"
        report_html = f"{app_root}/report.tt.html"

        # Timeout bumped from 30s for the #13 quick-win: chunksize=500
        # against the LSA report.tt.html endpoint occasionally takes 30+
        # seconds.
        client_options = {
            "timeout": 60,
            "follow_redirects": True,
            "headers": {"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        }

        async with httpx.AsyncClient(**client_options) as client:
            try:
                # Step 1: warm up cookies via the browse page.
                await client.get(browse_html)

                # Step 2: submit the search action.
                resp = await client.post(
                    browse_json,
                    json=payload,
                    headers={"Referer": browse_html},
                )
                if resp.status_code != 200:
                    logger.error("%s search HTTP %s", self.bundesland, resp.status_code)
                    return []

                data = resp.json()
                report_id = data.get("report_id")
                if not report_id:
                    logger.error("%s: no report_id in response: %s", self.bundesland, data)
                    return []

                # Step 3: fetch the HTML hit list.
                # Request a generous chunk so that the client-side type
                # filter has enough material to work with. Berlin's PARDOK
                # is dominated by "Schriftliche Anfrage" hits, and without
                # a server-side ETYPF filter (BE: document_type=None) 100
                # raw hits often contain only 1-2 Anträge. The floor is
                # deliberately high. Quick-win for #13 + #61 bug 5.
                chunksize = max(limit * 10, 1500)
                report_resp = await client.post(
                    report_html,
                    json={"report_id": report_id, "start": 0, "chunksize": chunksize},
                    headers={"Referer": browse_html},
                )
                if report_resp.status_code != 200:
                    logger.error("%s report HTTP %s", self.bundesland, report_resp.status_code)
                    return []

                hits = self._parse_hit_list_html(report_resp.text, query_filter=query)

                # The server-side ETYPF/DTYPF filter is best-effort across
                # portala instances: BB/RP let "Kleine Anfrage" and
                # "Beschlussempfehlung" through, and BE even runs with
                # `document_type=None` (its own ETYPF values), so
                # "Schriftliche Anfrage" hits can crowd Anträge such as
                # 19/2650 out of the result window. Therefore ALWAYS filter
                # client-side on the "antrag" substring in typ, whether or
                # not the server filter was set (see #61 bugs 2, 3, 5).
                antraege = []
                for hit in hits:
                    if "antrag" in (hit.typ or "").lower():
                        antraege.append(hit)
                return antraege[:limit]

            except Exception:
                logger.exception("%s search error", self.bundesland)
                return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up a single document by ID.

        Pragmatic MVP: runs a broad ``search()`` (empty query, up to 200
        results) and returns the first result whose number equals
        ``drucksache``, or None. A targeted single-document fetch would
        require a different ``action.search.json`` structure that has not
        been reverse-engineered yet.
        """
        candidates = await self.search(query="", limit=200)
        return next(
            (candidate for candidate in candidates if candidate.drucksache == drucksache),
            None,
        )

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its text.

        Returns:
            The concatenated text of all pages, or None when the document
            is not found, has no PDF link, the download does not return
            HTTP 200, or download/extraction raises (the error is logged).
        """
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None

        async with httpx.AsyncClient(
            timeout=60,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    return None
                # The context manager closes the PDF even if text
                # extraction fails half-way through.
                with fitz.open(stream=resp.content, filetype="pdf") as pdf:
                    return "".join(page.get_text() for page in pdf)
            except Exception:
                logger.exception("%s download error for %s", self.bundesland, drucksache)
                return None


class ParLDokAdapter(ParlamentAdapter):
    """Adapter for ParlDok 8.x parliament documentation systems (J3S GmbH).

    ParlDok is a proprietary parliament documentation product by J3S GmbH
    (https://www.j3s.de). Different from the portala/eUI framework used by
    LSA/BE: ParlDok 8.x is a single-page app whose backend is a JSON API
    rooted at ``{base_url}{prefix}/Fulltext/...``. The legacy ParLDok 5.x
    HTML POST form (``parldok/formalkriterien``) used by dokukratie's MV
    YAML scraper has been deprecated by the LandtagMV upgrade to 8.3.5.

    Confirmed instances using this engine (April 2026):

    - **MV** (Mecklenburg-Vorpommern) — ``dokumentation.landtag-mv.de/parldok``
    - HH, SN, TH all advertise ParlDok in dokukratie but their actual
      versions/themes have not been verified yet.

    Search workflow:

    1. ``GET {base_url}{prefix}/`` to obtain the session cookie. The
       backend rejects POSTs without it.
    2. ``POST {base_url}{prefix}/Fulltext/Search`` with form-encoded
       ``data=<json>`` payload. The JSON carries a ``tags`` array of
       facet selections; each tag is ``{"type": <facet_type_int>,
       "id": <facet_value>}``. Reverse-engineered facet type constants
       from the bundle.js (``pd.facet_*``):

       - ``facet_fraction = 2``
       - ``facet_kind = 7`` (Drucksache, Plenarprotokoll, …)
       - ``facet_type = 8`` (Antrag, Gesetzentwurf, Kleine Anfrage, …)
       - ``facet_lp = 10`` (Wahlperiode)

       Response is JSON ``{success, data: <stringified JSON>}`` where the
       inner ``data`` carries ``{count, docs: [{id, title, date,
       authorhtml, kind, type, lp, number, link, ...}], ...}``.

    3. PDF download: ``GET {base_url}{prefix}/dokument/{numeric_id}``.
       Returns ``application/pdf`` directly. The ``link`` field returned
       by the search API already contains the path fragment
       ``/dokument/<id>#navpanes=0`` — strip the fragment and prepend
       the configured ``prefix``.

    Drucksachen-Nummer is reconstructed as ``f"{lp}/{number}"`` from the
    search hit. Full-text search is *not* implemented in this MVP — the
    backend supports it via ``facet_fulltext = 0`` tags but the public
    LP-only filter already returns the relevant Antrag pool. ``query``
    is applied as a client-side title/Urheber filter.
    """

    # Reverse-engineered facet type constants from bundle.js (pd.facet_*).
    FACET_FULLTEXT = 0
    FACET_FRACTION = 2
    FACET_KIND = 7
    FACET_TYPE = 8
    FACET_LP = 10

    def __init__(
        self,
        *,
        bundesland: str,
        name: str,
        base_url: str,
        wahlperiode: int,
        prefix: str = "/parldok",
        document_typ: str = "Antrag",
        document_typ_substring: bool = False,
        kinds: Optional[list[str]] = None,
    ) -> None:
        """Configure a ParlDok 8.x adapter for one specific parliament.

        Args:
            bundesland: state code, e.g. ``"MV"``.
            name: human-readable label.
            base_url: ``https://...`` host root, no trailing slash.
            wahlperiode: current legislative period — fed into the
                ``facet_lp`` tag of the search payload.
            prefix: app prefix where ParlDok lives. ``/parldok`` for MV.
            document_typ: client-side filter on the ``type`` field of
                each hit ("Antrag", "Gesetzentwurf", …). Set to empty
                string to disable type filtering.
            document_typ_substring: if True, ``document_typ`` is matched
                as a substring against the hit's ``type`` field instead
                of an exact match. Needed for instances where the
                Drucksachen-Anträge live under composite type strings
                like ``"Antrag gemäß § 79 GO"`` (Thüringen) — strict
                ``"Antrag"`` would never match.
            kinds: optional list of acceptable ``kind`` values. Defaults
                to ``["Drucksache"]`` if None — but TH packs its Anträge
                under ``kind="Vorlage"`` so the parameter has to be
                widened there.
        """
        self.bundesland = bundesland
        self.name = name
        self.base_url = base_url.rstrip("/")
        self.prefix = "/" + prefix.strip("/")
        self.wahlperiode = wahlperiode
        self.document_typ = document_typ
        self.document_typ_substring = document_typ_substring
        self.kinds = kinds if kinds is not None else ["Drucksache"]

    def _hit_matches_filters(self, hit: dict) -> bool:
        """Apply the kind/typ filters to a raw hit dict.

        Centralised so the search loop can short-circuit cleanly. ``hit``
        comes from ``Fulltext/Search`` or ``Fulltext/Resultpage`` JSON
        responses; both share the same record schema.
        """
        if self.kinds and hit.get("kind") not in self.kinds:
            return False
        hit_type = (hit.get("type") or "").strip()
        if self.document_typ:
            if self.document_typ_substring:
                if self.document_typ not in hit_type:
                    return False
            else:
                if hit_type != self.document_typ:
                    return False
        return True

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        """DD.MM.YYYY → YYYY-MM-DD; '' for empty input."""
        if not datum_de:
            return ""
        try:
            d, m, y = datum_de.split(".")
            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
        except ValueError:
            return ""

    def _normalize_fraktion(self, authorhtml: str) -> list[str]:
        """Thin shim around ``app.parteien.extract_fraktionen`` (#55).

        Passes the hit's ``authorhtml`` value on together with this
        adapter's ``bundesland`` and returns the helper's result.
        """
        # Imported lazily inside the method, as in the original shim.
        from .parteien import extract_fraktionen as extract

        return extract(authorhtml, bundesland=self.bundesland)

    @staticmethod
    def _fulltext_id(term: str) -> str:
        """Sanitize a search term to ParlDok's facet ID format.

        Mirrors ``pd.getFulltextId`` from ``bundle.js``: replace every
        non-alphanumeric character with ``-``. The server uses this to
        deduplicate identical search facets.
        """
        return re.sub(r"[^a-zA-Z0-9]", "-", term)

    def _build_search_body(self, *, length: int = 100, query: str = "") -> dict:
        """Build the JSON payload for the initial ``Fulltext/Search`` call.

        Filters by Wahlperiode only — type/kind/fulltext filtering all
        happen client-side after the hit list is paginated. The
        ``query`` parameter is accepted for API compatibility but is
        currently NOT forwarded to the server (#18: einheitliche
        client-side Title-Suche, kein Server-Volltext, weil das
        Verhalten zwischen Adaptern sonst asymmetrisch wird). The
        ``FACET_FULLTEXT`` constant and :meth:`_fulltext_id` helper
        are kept around as documentation for the previous #12
        server-side variant — when fulltext gets uniformly
        re-introduced later, the dormant tag is just::

            {"type": self.FACET_FULLTEXT,
             "id": self._fulltext_id(query),
             "fulltext": query, "label": query, "field": "Alle"}

        Pagination beyond the first page goes through
        ``Fulltext/Resultpage`` — the ``Search`` endpoint itself
        ignores any non-zero ``Start``.
        """
        del query  # explicitly unused — see docstring
        tags: list[dict] = [{"type": self.FACET_LP, "id": self.wahlperiode}]
        return {
            "devicekey": "",
            "max": length,
            "withfilter": False,
            # sort=2 → newest first (date desc); sort=1 is relevance.
            "sort": 2,
            "topk": length,
            "llm": 0,
            "newdocsearch": False,
            "limit": {"Start": 0, "Length": length},
            "tags": tags,
            "updateFilters": [],
        }

    def _hit_to_drucksache(self, hit: dict) -> Optional[Drucksache]:
        """Convert one ParlDok JSON hit into a ``Drucksache``.

        Returns None for hits that cannot be used:

        * ``lp`` or ``number`` is missing/empty, so no ``lp/number`` ID
          can be built.
        * ``link`` and ``prelink`` are both empty, or consist only of a
          ``#fragment``. ParlDok marks fresh Vorlagen this way while the
          PDF has not been released yet (e.g. TH 8/1594, dated
          2026-03-31, ``allowed: false``). Such hits are worthless for
          the pipeline: ``download_text`` would bail out on
          ``not doc.link`` and the frontend would show an entry that
          cannot be clicked. Issue #61, bug 1.

        JSON ``null`` values for ``title``, ``authorhtml``, ``date``,
        ``type`` and ``kind`` are treated like missing keys, so the
        string-typed fields and helpers never receive None.
        """
        lp = hit.get("lp")
        number = hit.get("number")
        if not lp or not number:
            return None

        link_field = hit.get("link") or hit.get("prelink") or ""
        # Drop the viewer fragment ("#navpanes=0"); the rest is the path.
        path = link_field.split("#", 1)[0]
        if not path:
            return None

        if path.startswith(("http://", "https://")):
            # Already absolute: do not glue base URL and prefix in front.
            pdf_url = path
        else:
            # "/dokument/<id>" -> "{base_url}{prefix}/dokument/<id>";
            # also tolerates a path without the leading slash.
            pdf_url = f"{self.base_url}{self.prefix}/{path.lstrip('/')}"

        return Drucksache(
            drucksache=f"{lp}/{number}",
            title=hit.get("title") or "",
            fraktionen=self._normalize_fraktion(hit.get("authorhtml") or ""),
            datum=self._datum_de_to_iso(hit.get("date") or ""),
            link=pdf_url,
            bundesland=self.bundesland,
            typ=hit.get("type") or hit.get("kind") or "",
        )

    async def _post_json(
        self, client: httpx.AsyncClient, endpoint: str, payload: dict,
    ) -> Optional[dict]:
        """POST ``payload`` as a JSON string to a ParlDok endpoint.

        Args:
            client: the HTTP client to send the request with.
            endpoint: path tail such as ``"Fulltext/Search"`` or
                ``"Fulltext/Resultpage"``.
            payload: object that is JSON-serialized into the ``data``
                form field.

        Returns:
            The inner JSON object, i.e. the parsed content of the
            response's stringified ``data`` field. None when the status
            is not 200, the response does not report ``success``, or
            anything raises; each case is logged.
        """
        app_root = f"{self.base_url}{self.prefix}"
        request_headers = {
            "X-Requested-With": "XMLHttpRequest",
            "Referer": f"{app_root}/",
        }
        try:
            resp = await client.post(
                f"{app_root}/{endpoint}",
                data={"data": json.dumps(payload, ensure_ascii=False)},
                headers=request_headers,
            )
            status = resp.status_code
            if status != 200:
                logger.error("%s %s HTTP %s", self.bundesland, endpoint, status)
                return None

            envelope = resp.json()
            if envelope.get("success"):
                # The actual result is a JSON document stored as a string.
                return json.loads(envelope["data"])

            logger.error(
                "%s %s not successful: %s",
                self.bundesland, endpoint, envelope.get("message"),
            )
            return None
        except Exception:
            logger.exception("%s ParlDok %s error", self.bundesland, endpoint)
            return None

    async def _initial_search(
        self, client: httpx.AsyncClient, *, length: int,
    ) -> tuple[Optional[int], list[dict]]:
        """Issue the first ``Fulltext/Search`` request.

        Returns:
            ``(queryid, docs)``. The ``queryid`` is what later
            ``Fulltext/Resultpage`` calls refer to. ParlDok ignores a
            non-zero ``Start`` on this endpoint, so only the first 100
            hits can be reached through ``Search`` itself. On failure
            the result is ``(None, [])``.
        """
        payload = self._build_search_body(length=length)
        inner = await self._post_json(client, "Fulltext/Search", payload)
        if inner:
            return inner.get("queryid"), inner.get("docs") or []
        return None, []

    async def _result_page(
        self, client: httpx.AsyncClient, *, queryid: int, start: int, length: int,
    ) -> list[dict]:
        """Load one follow-up page of hits through ``Fulltext/Resultpage``.

        Args:
            client: the HTTP client to send the request with.
            queryid: ID returned by the initial ``Fulltext/Search`` call.
            start: offset of the first hit to return.
            length: number of hits to request.

        Returns:
            The page's ``docs`` list; empty when the request failed or
            the response carries no docs.
        """
        inner = await self._post_json(
            client,
            "Fulltext/Resultpage",
            {
                "devicekey": "",
                "queryid": queryid,
                "limit": {"Start": start, "Length": length},
            },
        )
        return (inner.get("docs") or []) if inner else []

    def _make_client(self) -> httpx.AsyncClient:
        """Create the HTTP client for the search and lookup requests.

        30 s timeout, redirects are followed, and the project's
        User-Agent header is sent with every request.
        """
        options = {
            "timeout": 30,
            "follow_redirects": True,
            "headers": {"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        }
        return httpx.AsyncClient(**options)

    async def _paginated_hits(self, client: httpx.AsyncClient):
        """Asynchronously iterate over raw hits, page by page.

        Yields the hit dicts in the order the server delivers them. The
        first batch comes from ``Fulltext/Search``; every further batch
        is fetched from ``Fulltext/Resultpage`` with the queryid returned
        by that first call. Iteration ends when no queryid was returned,
        when a page is empty or shorter than :attr:`PAGE_SIZE`, or once
        :attr:`MAX_PAGES` pages (including the first) have been consumed.
        """
        page_size = self.PAGE_SIZE
        queryid, batch = await self._initial_search(client, length=page_size)

        pages_done = 0
        while True:
            for hit in batch:
                yield hit
            pages_done += 1

            # A short page is the last one; without a queryid there is
            # nothing to page on.
            if not queryid or len(batch) < page_size:
                return
            if pages_done >= self.MAX_PAGES:
                return

            batch = await self._result_page(
                client,
                queryid=queryid,
                start=pages_done * page_size,
                length=page_size,
            )
            if not batch:
                return

    # ParlDok 8.x caps Length per request at 100 — paginate if needed.
    PAGE_SIZE = 100
    # Safety bound: scan at most 10 pages × 100 = 1000 most recent docs.
    # Anträge are ~3% of all hits in MV, so 1000 raw → ~30 Anträge, more
    # than enough for the typical UI request (limit 5..20). Filtered
    # queries that find nothing in the last 1000 docs return empty
    # rather than scan the entire WP — same trade-off as the BE/LSA
    # PortalaAdapter quick-win window.
    MAX_PAGES = 10

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Search the configured Wahlperiode, sorted newest-first.

        #18: einheitliches Verhalten — Server filtert nur nach WP, der
        Client paginiert über die ganze WP und filtert lokal nach
        Treffern in Titel oder Urheber. Volltext-Filter (#12) ist
        zurückgebaut, weil das Verhalten zwischen Adaptern sonst
        asymmetrisch wird. Sortierung kommt vom Server (newest-first
        durch ``sort=2`` in :meth:`_build_search_body`).

        Dedupe per ``lp/number`` weil ParlDok dieselbe Drucksache
        mehrfach in verschiedenen Vorgängen/Beratungen liefert.
        """
        results: list[Drucksache] = []
        seen: set[str] = set()
        query_terms = [t.lower() for t in query.split() if t] if query else []

        async with self._make_client() as client:
            await client.get(f"{self.base_url}{self.prefix}/")
            async for hit in self._paginated_hits(client):
                if not self._hit_matches_filters(hit):
                    continue

                doc = self._hit_to_drucksache(hit)
                if not doc:
                    continue
                if doc.drucksache in seen:
                    continue
                seen.add(doc.drucksache)

                if query_terms:
                    hay = f"{doc.title} {hit.get('authorhtml', '')}".lower()
                    if not all(t in hay for t in query_terms):
                        continue

                results.append(doc)
                if len(results) >= limit:
                    return results

        return results

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up a single Antrag by ``lp/number`` ID.

        Pragmatic MVP: page through the WP unfiltered until we find a
        match. ParlDok offers a ``facet_number`` (14) facet that would
        let us target the lookup directly, but the facet ID values are
        instance-specific (would require a ``Fulltext/Filter`` discovery
        call) and the WP-wide pagination is fast enough for the typical
        2k–10k Drucksachen per period.
        """
        wanted_lp, wanted_num = (drucksache.split("/", 1) + [""])[:2]
        if not wanted_num:
            return None

        async with self._make_client() as client:
            await client.get(f"{self.base_url}{self.prefix}/")
            async for hit in self._paginated_hits(client):
                # Don't apply doc-type filters here — get_document is
                # used to look up arbitrary Drucksachen, including ones
                # whose kind/typ doesn't match the search-time filter.
                if str(hit.get("lp")) == wanted_lp and str(hit.get("number")) == wanted_num:
                    return self._hit_to_drucksache(hit)
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its plain text.

        Resolves the document through :meth:`get_document`, fetches its
        ``link`` and joins the text of all PDF pages.

        Args:
            drucksache: ID in ``lp/number`` form.

        Returns:
            The extracted text, or None when the document is unknown, has
            no link, the server does not answer with HTTP 200, or the
            download/PDF parsing raises (errors are logged).
        """
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None

        async with httpx.AsyncClient(
            timeout=60,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    logger.error(
                        "%s PDF HTTP %s for %s (%s)",
                        self.bundesland, resp.status_code, drucksache, doc.link,
                    )
                    return None
                # The context manager closes the PDF even when a page
                # fails to extract; the manual close() was skipped then.
                with fitz.open(stream=resp.content, filetype="pdf") as pdf:
                    return "".join(page.get_text() for page in pdf)
            except Exception:
                logger.exception("%s ParlDok download error for %s", self.bundesland, drucksache)
                return None


class StarFinderCGIAdapter(ParlamentAdapter):
    """Adapter for old-school CGI Starfinder instances.

    Currently used by Schleswig-Holstein on
    ``lissh.lvn.parlanet.de/cgi-bin/starfinder/0`` — the **oldest** of the
    parliament backends we touch. Predates StarWeb's HTML form-submit
    machinery: instead of submitting a stateful AdvancedSearch form
    (which BB/HE/NI/RP/HB do), Starfinder accepts the entire query as
    URL parameters and returns plain HTML with a flat ``<tr>`` table of
    records.

    Reverse-engineering source: ``dokukratie/sh.yml`` plus a probe
    against the live endpoint. Format details:

    - URL template: ``{base}/cgi-bin/starfinder/0?path={db_path}&id=FASTLINK
      &pass=&search={starfinder_query}&format=WEBKURZFL``
    - Query syntax: ``WP=20+AND+dtyp=antrag`` (URL-encoded). The
      ``dtyp`` codes are lowercase short labels (``antrag``, ``kleine``).
    - Encoding: ``iso-8859-1`` (Latin-1) — NOT UTF-8. The HTTP response
      doesn't always declare it via Content-Type, so we explicitly
      decode with ``latin1`` to avoid mojibake on the German umlauts.
    - Hit-format: each record is one ``<tr class="tabcol|tabcol2|tabcol3">``
      with the title in ``<b>``, then ``Antrag <Urheber> <DD.MM.YYYY>
      Drucksache <a href="...pdf">XX/YYYY</a>``.
    """

    _RE_RECORD = re.compile(
        r'<tr class="tabcol[23]?">.*?</tr>',
        re.DOTALL,
    )
    _RE_TITLE = re.compile(r"<b>(.*?)</b>", re.DOTALL)
    _RE_DRUCKSACHE_LINK = re.compile(
        r'<a href="([^"]+\.pdf)"[^>]*>(\d+/\d+)</a>'
    )
    # The line between <b>title</b> and the <a>-link looks like:
    #   "Antrag Christian Dirschauer (SSW) 07.04.2026 Drucksache "
    # We pull the originator(s) and the date out of it.
    _RE_URHEBER_DATUM = re.compile(
        r"</b>\s*<br>\s*[A-Za-zÄÖÜäöüß]+\s+(.+?)\s+(\d{1,2}\.\d{1,2}\.\d{4})\s+Drucksache",
        re.DOTALL,
    )

    def __init__(
        self,
        *,
        bundesland: str,
        name: str,
        base_url: str,
        wahlperiode: int,
        db_path: str = "lisshfl.txt",
        document_typ_code: str = "antrag",
    ) -> None:
        self.bundesland = bundesland
        self.name = name
        self.base_url = base_url.rstrip("/")
        self.wahlperiode = wahlperiode
        self.db_path = db_path
        self.document_typ_code = document_typ_code

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        if not datum_de:
            return ""
        try:
            d, m, y = datum_de.split(".")
            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
        except ValueError:
            return ""

    def _normalize_fraktion(self, text: str) -> list[str]:
        """Thin shim around ``app.parteien.extract_fraktionen`` (#55).

        SH-specific note from the original author: SSW belongs to the SH
        table and is picked up because this adapter's ``bundesland`` is
        passed along.
        """
        # Imported lazily inside the method, as in the original shim.
        from .parteien import extract_fraktionen as extract

        return extract(text, bundesland=self.bundesland)

    def _build_url(self) -> str:
        """Build the Starfinder URL for the structural WP+dtyp browse.

        Free-text filtering is done client-side on the parsed records
        (consistent with #18 — alle Adapter machen einheitlich Title-
        Filter ohne Server-Volltext, weil das Verhalten zwischen
        Adaptern sonst asymmetrisch wird).
        """
        search_param = f"WP={self.wahlperiode}+AND+dtyp={self.document_typ_code}"
        return (
            f"{self.base_url}/cgi-bin/starfinder/0"
            f"?path={self.db_path}&id=FASTLINK&pass=&search={search_param}"
            f"&format=WEBKURZFL"
        )

    def _parse_records(self, html: str) -> list[Drucksache]:
        """Parse every result row of a Starfinder page into a ``Drucksache``.

        A row is skipped when it contains no PDF link with a ``NN/NNNN``
        number. A missing ``<b>`` title falls back to
        ``"Drucksache <number>"``. Originator and date stay empty when
        the metadata line does not match. ``typ`` is always ``"Antrag"``.
        """
        parsed: list[Drucksache] = []
        for row in self._RE_RECORD.finditer(html):
            row_html = row.group(0)

            link_match = self._RE_DRUCKSACHE_LINK.search(row_html)
            if link_match is None:
                continue
            href, number = link_match.groups()

            title_match = self._RE_TITLE.search(row_html)
            if title_match:
                # Collapse line breaks and runs of blanks in the title.
                title = re.sub(r"\s+", " ", title_match.group(1)).strip()
            else:
                title = f"Drucksache {number}"

            meta_match = self._RE_URHEBER_DATUM.search(row_html)
            if meta_match:
                originator = meta_match.group(1).strip()
                iso_date = self._datum_de_to_iso(meta_match.group(2))
            else:
                originator, iso_date = "", ""

            parsed.append(
                Drucksache(
                    drucksache=number,
                    title=title,
                    fraktionen=self._normalize_fraktion(originator),
                    datum=iso_date,
                    link=href,
                    bundesland=self.bundesland,
                    typ="Antrag",
                )
            )
        return parsed

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Fetch the WP/dtyp listing and filter it on the client.

        Args:
            query: whitespace-separated terms; each one must occur,
                case-insensitively, in the title or the extracted
                Fraktionen of a record. Empty means no text filter.
            limit: maximum number of documents to return.

        Returns:
            Up to ``limit`` documents in page order; an empty list when
            the response status is not 200 or the request/parsing raises.
        """
        url = self._build_url()
        client_options = {
            "timeout": 60,
            "follow_redirects": True,
            "headers": {"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        }
        async with httpx.AsyncClient(**client_options) as client:
            try:
                resp = await client.get(url)
                if resp.status_code != 200:
                    logger.error("%s search HTTP %s", self.bundesland, resp.status_code)
                    return []
                # Decode as latin-1 explicitly: the Starfinder server does
                # not always advertise the encoding correctly.
                page = resp.content.decode("latin-1", errors="replace")
                results = self._parse_records(page)
            except Exception:
                logger.exception("%s search error", self.bundesland)
                return []

        if not query:
            return results[:limit]

        # Client-side title + originator filter (see #18).
        needles = [word.lower() for word in query.split()]
        matching: list[Drucksache] = []
        for doc in results:
            haystack = f"{doc.title} {' '.join(doc.fraktionen)}".lower()
            if all(needle in haystack for needle in needles):
                matching.append(doc)
        return matching[:limit]

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up a single Drucksache by ID.

        SH responses are pre-sorted newest-first; we re-fetch up to 200
        records and scan for the exact match. The Starfinder server
        doesn't expose a number-only filter that we know of.
        """
        results = await self.search(query="", limit=200)
        for doc in results:
            if doc.drucksache == drucksache:
                return doc
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its plain text.

        Resolves the document through :meth:`get_document`, fetches its
        ``link`` and joins the text of all PDF pages.

        Args:
            drucksache: document number as used by :meth:`get_document`.

        Returns:
            The extracted text, or None when the document is unknown, has
            no link, the server does not answer with HTTP 200, or the
            download/PDF parsing raises (errors are logged).
        """
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None
        async with httpx.AsyncClient(
            timeout=60,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    logger.error(
                        "%s PDF HTTP %s for %s",
                        self.bundesland, resp.status_code, drucksache,
                    )
                    return None
                # The context manager closes the PDF even when a page
                # fails to extract; the manual close() was skipped then.
                with fitz.open(stream=resp.content, filetype="pdf") as pdf:
                    return "".join(page.get_text() for page in pdf)
            except Exception:
                logger.exception("%s PDF download error for %s", self.bundesland, drucksache)
                return None


class BayernAdapter(ParlamentAdapter):
    """Adapter for Bayerischer Landtag (#23) — TYPO3-Solr HTML scraping.

    Backend ist eine TYPO3-Site mit ext-solr-Suche unter
    ``/parlament/dokumente/drucksachen``. Server-side rendering, keine
    SPA, keine API. Reverse-engineering ist trivial — die Drucksachen-
    Liste hat ein stabiles HTML-Pattern und der Server akzeptiert die
    Filter direkt als URL-Query-Parameter.

    Search-URL:

        GET /parlament/dokumente/drucksachen?dokumentenart=Drucksache
            &wahlperiodeid[]=19
            &q=<volltext>
            &sort=date
            &anzahl_treffer=100
            &page=<n>

    Response-Pattern (HTML):

        <div class="row result">
            <div class="col-12">
                <h4>
                    <a href="https://www.bayern.landtag.de/www/ElanTextAblage_WP19/Drucksachen/Basisdrucksachen/0000009000/0000009107.pdf">
                        Drucksache Nr. 19/11407 vom 08.04.2026
                    </a>
                </h4>
                <p> Antrag AfD </p>
                <h5><strong>Kostenloses Parken für E-Fahrzeuge…</strong></h5>
            </div>
        </div>

    Felder pro Eintrag:
      * ``Drucksache Nr. 19/<NUM> vom DD.MM.YYYY`` → drucksache + datum
      * ``<a href="…Basisdrucksachen/…NUM.pdf">`` → PDF-Link (Anträge)
        oder ``…Schriftliche Anfragen/…pdf`` für Anfragen — Anträge
        werden client-seitig über ``<p>Antrag …`` gefiltert
      * ``<p>Antrag <FRAKTION>[, <FRAKTION2>]</p>`` → typ + Fraktionen
      * ``<h5><strong>TITLE</strong></h5>`` → title

    Drucksachen-Lookup nutzt denselben Endpoint mit ``q=<drucksache>``;
    die Solr-Suche matcht die Nummer im Volltext und liefert sie als
    einzigen oder ersten Treffer.

    Pagination: 100 hits per page (the maximum); WP19 held 17,598
    Drucksachen as of 2026-04-10. ``search`` requests at most three pages
    (300 raw hits) and stops once ``limit`` Anträge have been collected.
    """

    bundesland = "BY"
    name = "Bayerischer Landtag"
    base_url = "https://www.bayern.landtag.de"

    _RE_RESULT_BLOCK = re.compile(
        r'<div class="row result">(.*?)</div>\s*</div>', re.DOTALL,
    )
    _RE_DRUCKSACHE_HEADER = re.compile(
        r'Drucksache\s+Nr\.\s*(\d+/\d+)\s*vom\s*(\d{2}\.\d{2}\.\d{4})',
        re.IGNORECASE,
    )
    _RE_PDF_HREF = re.compile(r'href="([^"]+\.pdf)"')
    _RE_TYP_FRAKTION = re.compile(r'<p>\s*([^<]+?)\s*</p>')
    _RE_TITLE = re.compile(r'<h5>\s*<strong>([^<]+)</strong>\s*</h5>')

    def __init__(self, *, wahlperiode: int = 19) -> None:
        """Create the adapter for one legislative period.

        Args:
            wahlperiode: number of the Wahlperiode to search; it is sent
                as the ``wahlperiodeid[]`` request parameter.
        """
        self.wahlperiode = wahlperiode

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        if not datum_de:
            return ""
        try:
            d, m, y = datum_de.split(".")
            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
        except ValueError:
            return ""

    def _parse_results(self, html: str) -> list[Drucksache]:
        """Extract the Antrag entries from one result page.

        The page mixes Anträge with written and oral Anfragen, reports
        and Gesetzentwürfe. Only blocks whose ``<p>`` line starts with
        the token ``Antrag`` (case-insensitive) are kept. Blocks without
        a ``Drucksache Nr. … vom …`` header are skipped.
        """
        from .parteien import extract_fraktionen

        antraege: list[Drucksache] = []
        for found in self._RE_RESULT_BLOCK.finditer(html):
            block = found.group(1)

            header = self._RE_DRUCKSACHE_HEADER.search(block)
            if header is None:
                continue
            number, datum_de = header.groups()

            # The <p> line reads "<TYP> <FRAKTIONEN>": the first token is
            # the type, the rest the comma-separated Fraktion(en).
            typ_match = self._RE_TYP_FRAKTION.search(block)
            typ_line = typ_match.group(1).strip() if typ_match else ""
            typ, fraktionen_text = (typ_line.split(None, 1) + ["", ""])[:2]

            # Bavaria lists written Anfragen, reports etc. in the same
            # list; only Anträge are wanted.
            if typ.lower() != "antrag":
                continue

            title_match = self._RE_TITLE.search(block)
            if title_match:
                raw_title = title_match.group(1).strip()
            else:
                raw_title = f"Drucksache {number}"

            pdf_match = self._RE_PDF_HREF.search(block)

            antraege.append(
                Drucksache(
                    drucksache=number,
                    # Collapse runs of whitespace inside the title.
                    title=re.sub(r"\s+", " ", raw_title),
                    fraktionen=extract_fraktionen(fraktionen_text, bundesland="BY"),
                    datum=self._datum_de_to_iso(datum_de),
                    link=pdf_match.group(1) if pdf_match else "",
                    bundesland="BY",
                    typ=typ,
                )
            )
        return antraege

    def _build_search_params(self, query: str, page: int = 1) -> dict:
        # Bayern nutzt PHP-Style-Array-Suffix ``wahlperiodeid[]`` —
        # httpx codiert Listen als wiederholte Keys, wir bauen den
        # Param-Namen mit ``[]`` direkt in den dict-Key ein.
        return {
            "dokumentenart": "Drucksache",
            "wahlperiodeid[]": str(self.wahlperiode),
            "q": query or "",
            "sort": "date",
            "anzahl_treffer": "100",
            "page": str(page),
        }

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Volltext-Suche über die aktuelle Wahlperiode, gefiltert auf Anträge.

        Sortiert newest-first (``sort=date``). Holt 1-3 Pages, je 100
        Hits (Antrags-Anteil ist ~10-15% des Drucksachen-Mix), client-
        seitig nach ``Antrag``-Typ gefiltert.
        """
        url = f"{self.base_url}/parlament/dokumente/drucksachen"
        results: list[Drucksache] = []
        seen: set[str] = set()

        async with httpx.AsyncClient(
            timeout=30,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            for page in range(1, 4):  # max 300 raw hits → ~30-50 Anträge
                try:
                    resp = await client.get(url, params=self._build_search_params(query, page=page))
                except Exception:
                    logger.exception("BY search request error page=%d", page)
                    break
                if resp.status_code != 200:
                    logger.error("BY search HTTP %s page=%d", resp.status_code, page)
                    break

                page_results = self._parse_results(resp.text)
                if not page_results:
                    break

                for d in page_results:
                    if d.drucksache in seen:
                        continue
                    seen.add(d.drucksache)
                    results.append(d)
                    if len(results) >= limit:
                        return results

        return results

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look a Drucksache up directly via ``q=<drucksache>``.

        The number is sent as the full-text term and the first result
        page is parsed with :meth:`_parse_results`, which keeps only
        Anträge. The entry whose number equals ``drucksache`` is
        returned.

        Returns:
            The matching document, or None on a request error, a non-200
            status, or when the first page holds no matching Antrag.
        """
        endpoint = f"{self.base_url}/parlament/dokumente/drucksachen"
        lookup_params = self._build_search_params(drucksache, page=1)
        client_options = {
            "timeout": 30,
            "follow_redirects": True,
            "headers": {"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        }

        async with httpx.AsyncClient(**client_options) as client:
            try:
                resp = await client.get(endpoint, params=lookup_params)
            except Exception:
                logger.exception("BY get_document request error for %s", drucksache)
                return None

        if resp.status_code != 200:
            return None

        return next(
            (hit for hit in self._parse_results(resp.text) if hit.drucksache == drucksache),
            None,
        )

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the Antrag PDF and extract its full text.

        Resolves the document through :meth:`get_document`, fetches its
        ``link`` and joins the text of all PDF pages.

        Args:
            drucksache: document number, e.g. ``"19/11407"``.

        Returns:
            The extracted text, or None when the document is unknown, has
            no link, the server does not answer with HTTP 200, or the
            download/PDF parsing raises (errors are logged).
        """
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if doc is None or not doc.link:
            return None

        async with httpx.AsyncClient(
            timeout=60, follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    logger.error("BY PDF HTTP %s for %s", resp.status_code, drucksache)
                    return None
                # The context manager closes the PDF even when a page
                # fails to extract; the manual close() was skipped then.
                with fitz.open(stream=resp.content, filetype="pdf") as pdf:
                    return "".join(page.get_text() for page in pdf)
            except Exception:
                logger.exception("BY download error for %s", drucksache)
                return None


class PARLISAdapter(ParlamentAdapter):
    """Adapter for Baden-Württemberg's PARLIS — eUI/portala-Variante mit
    polling und JSON-in-HTML-Comment-Records.

    PARLIS auf ``parlis.landtag-bw.de`` läuft technisch auf demselben
    eUI-Backend wie LSA-PADOKA und BE-PARDOK, aber mit drei wichtigen
    Unterschieden, die eine eigene Klasse statt einer PortalaAdapter-
    Subklasse rechtfertigen:

    1. **Body-Schema:** Statt der portala/LSA-typischen ``search.lines``
       mit ``2/3/4/10/11/20.x/90.x``-Slots nutzt PARLIS ein viel kürzeres
       ``l1/l2/l3/l4`` Schema (siehe ``dokukratie/scrapers/portala.query.bw.json``).
       ``serverrecordname`` ist ``"vorgang"`` statt ``"sr_generic1"``,
       ``format`` ist ``"suchergebnis-vorgang-full"``, ``sort`` ist
       ``"SORT01/D SORT02/D SORT03"``. Es gibt kein ``parsed`` und kein
       ``json``-Tree — der Server akzeptiert das minimale Schema direkt.

    2. **Async polling:** Im Gegensatz zu LSA/BE liefert die initiale
       ``Fulltext/Search``-Antwort nur eine ``search_id`` mit
       ``status: "running"``, KEINE ``report_id``. Erst eine zweite
       ``SearchAndDisplay``-Anfrage mit ``id: <search_id>`` (und ohne
       ``search``-Component) bekommt die fertige ``report_id`` zurück.
       In meinen Live-Tests reichte ein einziger 2-Sekunden-Sleep
       zwischen den Calls.

    3. **Hit-Format:** Die ``report.tt.html``-Antwort liefert keine
       Perl-Dump-Blöcke (LSA) und keine Bootstrap-Card-Divs (BE),
       sondern **JSON-Records in HTML-Kommentaren**::

           <!--{"WMV33":[{"main":"Schlagworte"}],
                "EWBV22":[{"main":"Drucksache 17/10323"}],
                "EWBD05":[{"main":"https://.../17_10323.pdf"}],
                "EWBV23":[{"main":"Antrag Felix Herkens (GRÜNE) u. a. 16.03.2026"}],
                ...}-->

       Der Parser zieht die Comments raw raus und mappt die WMV/EWBV-
       Felder auf das ``Drucksache``-Dataclass.

    Reverse-Engineering-Quelle: ``dokukratie/scrapers/portala.query.bw.json``
    + Live-HAR gegen ``parlis.landtag-bw.de`` (Issue #29).
    """

    # Reverse-engineered field map for the JSON records that come embedded
    # in HTML comments inside report.tt.html responses.
    #
    # Records look like ``<!--{"WMV33":[...],...}-->`` and may contain
    # nested ``<i>...</i>`` highlight tags inside the JSON values.
    # Non-greedy match against the literal closing ``}-->`` because that
    # delimiter does not appear inside the JSON payload itself.
    _RE_RECORD = re.compile(r"<!--(\{.*?\})-->", re.DOTALL)
    _RE_DRUCKSACHE = re.compile(r"Drucksache\s+(\d+/\d+)")
    _RE_DATUM = re.compile(r"(\d{1,2}\.\d{1,2}\.\d{4})")

    def __init__(
        self,
        *,
        bundesland: str,
        name: str,
        base_url: str,
        wahlperiode: int,
        prefix: str = "/parlis",
        document_typ: str = "Antrag",
        date_window_days: int = 730,
        poll_attempts: int = 15,
        poll_interval_seconds: float = 2.0,
    ) -> None:
        """Configure a PARLIS adapter for one specific parliament instance.

        Args:
            bundesland: state code, e.g. ``"BW"``.
            name: human-readable label.
            base_url: ``https://parlis.landtag-bw.de`` (no trailing slash).
            wahlperiode: legislative period — feeds into ``lines.l1``.
            prefix: app prefix where PARLIS lives. ``/parlis`` for BW.
            document_typ: feeds into ``lines.l4``. The server interprets
                this as a German document type label like ``"Antrag"``.
            date_window_days: look-back window for the search range,
                quick-win against title-only filtering — same approach
                as the PortalaAdapter for LSA/BE.
            poll_attempts: how many times to poll for ``report_id`` before
                giving up. ~15 × 2s = 30s upper bound.
            poll_interval_seconds: sleep between poll attempts.
        """
        self.bundesland = bundesland
        self.name = name
        self.base_url = base_url.rstrip("/")
        self.prefix = "/" + prefix.strip("/")
        self.wahlperiode = wahlperiode
        self.document_typ = document_typ
        self.date_window_days = date_window_days
        self.poll_attempts = poll_attempts
        self.poll_interval_seconds = poll_interval_seconds

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        """DD.MM.YYYY → YYYY-MM-DD; '' for empty input."""
        if not datum_de:
            return ""
        try:
            d, m, y = datum_de.split(".")
            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
        except ValueError:
            return ""

    def _normalize_fraktion(self, text: str) -> list[str]:
        """Delegate faction extraction to ``app.parteien.extract_fraktionen`` (#55).

        PARLIS puts the originator into ``EWBV23`` in the form
        ``"Antrag Felix Herkens (GRÜNE), Saskia Frank (GRÜNE)..."``; that
        text is passed through together with this adapter's ``bundesland``.
        """
        from .parteien import extract_fraktionen as _extract

        return _extract(text, bundesland=self.bundesland)

    def _build_initial_body(self, start_date: str, end_date: str) -> dict:
        """Build the first ``SearchAndDisplay`` body with the search component.

        The schema follows ``dokukratie/scrapers/portala.query.bw.json``
        verbatim — only the placeholder values are substituted.
        """
        return {
            "action": "SearchAndDisplay",
            "report": {
                "rhl": "main",
                "rhlmode": "add",
                "format": "suchergebnis-vorgang-full",
                "mime": "html",
                "sort": "SORT01/D SORT02/D SORT03",
            },
            "search": {
                "lines": {
                    "l1": str(self.wahlperiode),
                    "l2": start_date,
                    "l3": end_date,
                    "l4": self.document_typ,
                },
                "serverrecordname": "vorgang",
            },
            "sources": ["Star"],
        }

    def _build_poll_body(self, search_id: str) -> dict:
        """Build the polling body — same action, but with the search_id
        instead of a fresh search component."""
        return {
            "action": "SearchAndDisplay",
            "report": {
                "rhl": "main",
                "rhlmode": "add",
                "format": "suchergebnis-vorgang-full",
                "mime": "html",
                "sort": "SORT01/D SORT02/D SORT03",
            },
            "id": search_id,
            "sources": ["Star"],
        }

    def _hit_record_to_drucksache(self, record: dict) -> Optional[Drucksache]:
        """Map a single JSON-in-comment record to a ``Drucksache``.

        PARLIS-record schema (reverse-engineered, all values are arrays
        of ``{"main": ...}`` dicts):

        - ``EWBV22``: "Drucksache 17/10323"
        - ``EWBD05``: direct PDF URL
        - ``EWBV23``: "Antrag <Urheber> <DD.MM.YYYY>" — single combined line
        - ``WMV30``: short Urheber summary ("Felix Herkens (GRÜNE) u. a.")
        - ``WMV33``: subject keywords (Schlagworte)
        - ``EWBD01``: "Drucksache <X/Y> <DD.MM.YYYY>"
        """
        def first(field: str) -> str:
            block = record.get(field)
            if isinstance(block, list) and block:
                return (block[0].get("main") or "").strip()
            return ""

        ds_text = first("EWBV22") or first("EWBD01")
        m_ds = self._RE_DRUCKSACHE.search(ds_text)
        if not m_ds:
            return None
        drucksache = m_ds.group(1)

        # The "title" we want is the Schlagworte/topic, not the
        # Drucksachen-Header. PARLIS keeps the human-readable subject
        # in WMV33 (Schlagworte joined by semicolons) — that's the
        # closest equivalent to "title" the LSA/BE adapters expose.
        # Fallback to the EWBV23 line if WMV33 is empty.
        schlagworte = first("WMV33")
        # Strip embedded <i>...</i> highlight tags
        schlagworte_clean = re.sub(r"</?i>", "", schlagworte).strip()
        title = schlagworte_clean or first("EWBV23") or f"Drucksache {drucksache}"

        # Date + Urheber out of EWBV23 ("Antrag <Urheber> <DD.MM.YYYY>")
        ewbv23 = first("EWBV23")
        m_dat = self._RE_DATUM.search(ewbv23)
        datum_iso = self._datum_de_to_iso(m_dat.group(1) if m_dat else "")
        urheber_short = first("WMV30")
        fraktionen = self._normalize_fraktion(urheber_short or ewbv23)

        pdf_url = first("EWBD05")

        return Drucksache(
            drucksache=drucksache,
            title=title,
            fraktionen=fraktionen,
            datum=datum_iso,
            link=pdf_url,
            bundesland=self.bundesland,
            typ=self.document_typ,
        )

    async def _initial_search_and_poll(
        self, client: httpx.AsyncClient, start_date: str, end_date: str,
    ) -> Optional[str]:
        """Run the initial search and poll until a ``report_id`` arrives.

        Steps: (1) GET ``browse.tt.html`` to pick up cookies, (2) POST the
        search body to ``browse.tt.json``, (3) if only a ``search_id``
        came back, re-POST the poll body up to ``poll_attempts`` times,
        sleeping ``poll_interval_seconds`` before each attempt.

        Every failure (request error, non-200 status, body that is not a
        JSON object, missing ``search_id``, finished search without
        report, polling exhausted) is reported by returning ``None``.

        Args:
            client: open HTTP client; reused for all requests so cookies
                carry over.
            start_date: lower bound of the date range (``lines.l2``).
            end_date: upper bound of the date range (``lines.l3``).

        Returns:
            The ``report_id``, or ``None`` on any failure or empty result.
        """
        import asyncio

        browse_html = f"{self.base_url}{self.prefix}/browse.tt.html"
        browse_json = f"{self.base_url}{self.prefix}/browse.tt.json"

        def json_object(resp) -> Optional[dict]:
            """Decode the response body as a JSON object, else ``None``."""
            try:
                payload = resp.json()
            except ValueError:
                # json.JSONDecodeError and UnicodeDecodeError both derive
                # from ValueError.
                return None
            return payload if isinstance(payload, dict) else None

        # Step 1: warm cookies
        try:
            await client.get(browse_html)
        except Exception:
            logger.exception("%s cookie warm-up request error", self.bundesland)
            return None

        # Step 2: initial search
        try:
            resp = await client.post(
                browse_json,
                json=self._build_initial_body(start_date, end_date),
                headers={"Referer": browse_html},
            )
        except Exception:
            logger.exception("%s initial search request error", self.bundesland)
            return None
        if resp.status_code != 200:
            logger.error("%s initial search HTTP %s", self.bundesland, resp.status_code)
            return None
        data = json_object(resp)
        if data is None:
            logger.error("%s initial search returned no JSON object", self.bundesland)
            return None
        if data.get("report_id"):
            return data["report_id"]
        search_id = data.get("search_id")
        if not search_id:
            logger.error("%s no search_id in initial response: %s", self.bundesland, data)
            return None

        # Step 3: poll until report_id appears or we run out of attempts
        for _ in range(self.poll_attempts):
            await asyncio.sleep(self.poll_interval_seconds)
            try:
                resp = await client.post(
                    browse_json,
                    json=self._build_poll_body(search_id),
                    headers={"Referer": browse_html},
                )
            except Exception:
                logger.exception("%s poll request error", self.bundesland)
                return None
            if resp.status_code != 200:
                logger.error("%s poll HTTP %s", self.bundesland, resp.status_code)
                return None
            data = json_object(resp)
            if data is None:
                logger.error("%s poll returned no JSON object", self.bundesland)
                return None
            if data.get("report_id"):
                return data["report_id"]
            sources = data.get("sources")
            star = sources.get("Star") if isinstance(sources, dict) else None
            if isinstance(star, dict) and star.get("status") == "stopped":
                # Search finished but no report — empty result
                return None

        logger.warning("%s gave up polling after %d attempts", self.bundesland, self.poll_attempts)
        return None

    def _parse_report_html(self, html: str) -> list[Drucksache]:
        """Extract Drucksachen from a report.tt.html response.

        Records are JSON objects embedded in HTML comments. We pull each
        comment block via regex, parse it as JSON, and map the WMV/EWBV
        fields to a Drucksache.
        """
        results: list[Drucksache] = []
        for m in self._RE_RECORD.finditer(html):
            json_text = m.group(1)
            try:
                record = json.loads(json_text)
            except json.JSONDecodeError:
                continue
            doc = self._hit_record_to_drucksache(record)
            if doc:
                results.append(doc)
        return results

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Fetch recent documents and optionally filter them client-side.

        Server-side full text is not used (#18: uniform behaviour without
        full text until every adapter supports it). The date window ends
        today and reaches back ``date_window_days``. When ``query`` is
        given, every whitespace-separated term must occur
        (case-insensitively) in the title plus the joined factions.

        Returns:
            At most ``limit`` matching documents; ``[]`` when no report
            is produced, the report request is not answered with 200, or
            any step raises.
        """
        from datetime import date, timedelta

        window_end = date.today()
        window_start = window_end - timedelta(days=self.date_window_days)

        client_options = dict(
            timeout=60,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        )
        async with httpx.AsyncClient(**client_options) as client:
            try:
                report_id = await self._initial_search_and_poll(
                    client, window_start.isoformat(), window_end.isoformat(),
                )
                if not report_id:
                    return []

                # Over-fetch so the client-side filter has enough
                # candidates to choose from.
                if query:
                    chunksize = max(limit * 10, 200)
                else:
                    chunksize = max(limit * 2, 50)

                portal = f"{self.base_url}{self.prefix}"
                resp = await client.get(
                    f"{portal}/report.tt.html"
                    f"?report_id={report_id}&start=0&chunksize={chunksize}",
                    headers={"Referer": f"{portal}/browse.tt.html"},
                )
                if resp.status_code != 200:
                    logger.error("%s report HTTP %s", self.bundesland, resp.status_code)
                    return []

                results = self._parse_report_html(resp.text)
            except Exception:
                logger.exception("%s search error", self.bundesland)
                return []

        if not query:
            return results[:limit]

        terms = [term.lower() for term in query.split() if term]

        def matches(doc: Drucksache) -> bool:
            # A whitespace-only query yields no terms and matches all.
            if not terms:
                return True
            haystack = f"{doc.title} {' '.join(doc.fraktionen)}".lower()
            return all(term in haystack for term in terms)

        return [doc for doc in results if matches(doc)][:limit]

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Find a Drucksache by ID within a broad, unfiltered browse.

        Runs ``search`` with an empty query and ``limit=200`` and returns
        the first hit whose number equals ``drucksache``; ``None`` when
        it is not among those hits.
        """
        candidates = await self.search(query="", limit=200)
        return next(
            (hit for hit in candidates if hit.drucksache == drucksache),
            None,
        )

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its text.

        The document is resolved through ``get_document``; its ``link``
        is fetched and every page is run through PyMuPDF's ``get_text``.

        Returns:
            The concatenated page text, or ``None`` when the document is
            unknown, has no link, the HTTP status is not 200, or the
            download/extraction raises.
        """
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None

        async with httpx.AsyncClient(
            timeout=60,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    logger.error(
                        "%s PDF HTTP %s for %s (%s)",
                        self.bundesland, resp.status_code, drucksache, doc.link,
                    )
                    return None
                # The context manager closes the document even if a page
                # fails to extract; a plain close() call was skipped then.
                with fitz.open(stream=resp.content, filetype="pdf") as pdf:
                    return "".join(page.get_text() for page in pdf)
            except Exception:
                logger.exception("%s PDF download error for %s", self.bundesland, drucksache)
                return None


class SNEdasXmlAdapter(ParlamentAdapter):
    """Sachsen-Adapter via XML-Export aus EDAS (#26/#38).

    EDAS (edas.landtag.sachsen.de) blockiert sowohl per ``robots.txt:
    Disallow: /`` als auch über ASP.NET-Webforms-Postbacks autonomes
    Crawling. Der Sächsische Landtag bietet aber einen offiziellen
    XML-Export-Knopf in der Suchmaske, der bis zu 2500 Treffer als
    strukturiertes XML herunterlädt — das umgeht beide Probleme:

    - **Manueller Export-Workflow**: Der User exportiert wöchentlich die
      Dokumentenliste mit Filter "Dokumententyp = Antr" und legt die
      Datei unter ``data/sn-edas-export.xml`` ab. Die Pipeline liest sie
      lokal und ist damit komplett unabhängig vom EDAS-Server.
    - **PDF-URL-Extraktion**: Das XML liefert ID, Wahlperiode,
      Dokumentennummer, Fundstelle (mit Fraktion + Datum) und Titel —
      aber keine PDF-URL. Wir holen die PDF-URL **erst beim
      ``download_text()``** aus dem ``viewer_navigation.aspx``-Frame
      des Landtags (ein einzelner GET, kein Postback). Dadurch
      generieren wir nur dann Server-Last, wenn ein Antrag tatsächlich
      analysiert wird.

    XML-Schema:

    ```
    <treffer>
      <ID><![CDATA[297875]]></ID>
      <Wahlperiode><![CDATA[8]]></Wahlperiode>
      <Dokumentenart><![CDATA[Drs]]></Dokumentenart>
      <Dokumentennummer><![CDATA[2]]></Dokumentennummer>
      <Fundstelle><![CDATA[Antr CDU, BSW, SPD 01.10.2024 Drs 8/2]]></Fundstelle>
      <Titel><![CDATA[Geschäftsordnung des Sächsischen Landtags]]></Titel>
    </treffer>
    ```

    Encoding ist ISO-8859-1 (Sachsen ist alt-school).
    """

    bundesland = "SN"
    name = "Sächsischer Landtag (EDAS-XML-Export)"
    base_url = "https://edas.landtag.sachsen.de"
    viewer_path = "/viewer/viewer_navigation.aspx"

    # Default-Pfad zum Export-File. Wird im Container vom mounted data/-
    # Volume bedient — der User legt die XML-Datei dort ab.
    DEFAULT_EXPORT_PATH = "data/sn-edas-export.xml"

    _RE_TREFFER = re.compile(r"<treffer>([\s\S]*?)</treffer>")
    _RE_FIELD = re.compile(r"<(\w+)><!\[CDATA\[(.*?)\]\]></\1>", re.DOTALL)
    _RE_FUNDSTELLE = re.compile(
        r"^(?P<typ>\S+)\s+(?P<urheber>.+?)\s+(?P<datum>\d{1,2}\.\d{1,2}\.\d{4})\s+Drs\s+\d+/\d+$"
    )
    _RE_VIEWER_PDF = re.compile(
        r"https://ws\.landtag\.sachsen\.de/images/[\w_]+\.pdf"
    )

    def __init__(self, *, export_path: Optional[str] = None):
        from pathlib import Path as _P
        # Pfad relativ zum webapp-Root, falls nicht absolut
        if export_path is None:
            self.export_path = _P(__file__).resolve().parent.parent / self.DEFAULT_EXPORT_PATH
        else:
            self.export_path = _P(export_path)

    def _normalize_fraktion(self, text: str) -> list[str]:
        """Delegate faction extraction to ``app.parteien.extract_fraktionen``.

        ``text`` is passed through together with this adapter's
        ``bundesland``.
        """
        from .parteien import extract_fraktionen as _extract

        return _extract(text, bundesland=self.bundesland)

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        try:
            d, m, y = datum_de.split(".")
            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
        except ValueError:
            return ""

    def _read_export(self) -> str:
        """Lade die XML-Datei. Returns leeren String wenn nicht vorhanden
        — der Adapter degradiert dann gracefully zu 0 Hits."""
        if not self.export_path.exists():
            logger.warning("SN: export file not found at %s", self.export_path)
            return ""
        return self.export_path.read_text(encoding="iso-8859-1")

    def _parse_treffer(self, xml: str) -> list[Drucksache]:
        results: list[Drucksache] = []
        for chunk in self._RE_TREFFER.findall(xml):
            fields = dict(self._RE_FIELD.findall(chunk))
            wp = fields.get("Wahlperiode", "").strip()
            nr = fields.get("Dokumentennummer", "").strip()
            if not (wp and nr):
                continue

            drucksache = f"{wp}/{nr}"
            titel = fields.get("Titel", "").strip()
            fundstelle = fields.get("Fundstelle", "").strip()

            # Aus Fundstelle "Antr CDU, BSW, SPD 01.10.2024 Drs 8/2" die
            # Felder extrahieren
            datum_iso = ""
            urheber = ""
            typ = "Antrag"
            m = self._RE_FUNDSTELLE.match(fundstelle)
            if m:
                urheber = m.group("urheber")
                datum_iso = self._datum_de_to_iso(m.group("datum"))
            fraktionen = self._normalize_fraktion(urheber)

            # Stub-Link: viewer.aspx mit den drei Parametern. Die echte
            # PDF-URL wird beim download_text() per zweitem Call aufgelöst.
            link = (
                f"{self.base_url}/parlamentsdokumentation/parlamentsarchiv/"
                f"viewer.aspx?dok_nr={nr}&dok_art=Drs&leg_per={wp}"
            )

            results.append(Drucksache(
                drucksache=drucksache,
                title=titel,
                fraktionen=fraktionen,
                datum=datum_iso,
                link=link,
                bundesland=self.bundesland,
                typ=typ,
            ))
        return results

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Liefert Anträge aus dem statischen XML-Export, optional
        client-side title-filtered nach Query. Das XML ist bereits
        newest-first sortiert (verifiziert: erste Treffer 8/2 vom
        01.10.2024, letzte 5/9268 vom 04.06.2012)."""
        xml = self._read_export()
        if not xml:
            return []
        results = self._parse_treffer(xml)
        if query:
            qterms = [t.lower() for t in query.split()]
            results = [
                d for d in results
                if all(t in d.title.lower() or t in " ".join(d.fraktionen).lower()
                       for t in qterms)
            ]
        return results[:limit]

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Lookup im statischen Export, kein Server-Call."""
        xml = self._read_export()
        if not xml:
            return None
        for doc in self._parse_treffer(xml):
            if doc.drucksache == drucksache:
                return doc
        return None

    async def _resolve_pdf_url(
        self, client: httpx.AsyncClient, drucksache: str,
    ) -> Optional[str]:
        """Resolve die echte PDF-URL über das viewer_navigation.aspx-
        Frame. Single GET-Call, kein Postback."""
        wp, _, nr = drucksache.partition("/")
        if not (wp and nr):
            return None
        url = (
            f"{self.base_url}/viewer/viewer_navigation.aspx"
            f"?dok_nr={nr}&dok_art=Drs&leg_per={wp}"
        )
        try:
            resp = await client.get(url)
            if resp.status_code != 200:
                return None
            m = self._RE_VIEWER_PDF.search(resp.text)
            return m.group(0) if m else None
        except Exception:
            logger.exception("SN viewer probe error for %s", drucksache)
            return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Resolve the PDF URL for a Drucksache, download it, extract text.

        The PDF URL is not part of the export, so it is looked up through
        ``_resolve_pdf_url`` first; the PDF is then fetched with the same
        client and every page is run through PyMuPDF's ``get_text``.

        Returns:
            The concatenated page text, or ``None`` when no PDF URL is
            found, the HTTP status is not 200, or the
            download/extraction raises.
        """
        import fitz  # PyMuPDF

        async with httpx.AsyncClient(
            timeout=60, follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            pdf_url = await self._resolve_pdf_url(client, drucksache)
            if not pdf_url:
                logger.error("SN: no PDF URL found for %s", drucksache)
                return None
            try:
                resp = await client.get(pdf_url)
                if resp.status_code != 200:
                    logger.error("SN PDF HTTP %s for %s", resp.status_code, drucksache)
                    return None
                # The context manager closes the document even if a page
                # fails to extract; a plain close() call was skipped then.
                with fitz.open(stream=resp.content, filetype="pdf") as pdf:
                    return "".join(page.get_text() for page in pdf)
            except Exception:
                logger.exception("SN PDF download error for %s", drucksache)
                return None


class PARiSHBAdapter(ParlamentAdapter):
    """Bremen-Adapter für PARiS (paris.bremische-buergerschaft.de).

    PARiS ist die alte Java-Servlet-Variante von StarWeb (anders als
    HE/starweb.hessen.de, das auf dem moderneren eUI läuft). Die Suche
    geht über genau einen POST-Call gegen ``/starweb/paris/servlet.starweb``
    mit form-urlencoded Body. Response ist ein vollständiges HTML-
    Ergebnis-Page mit ``<tbody name="RecordRepeater">``-Hits.

    Hit-Format pro ``<tr name="Repeat_TYP">``:

    - ``<abbr title="Bremische Stadtbürgerschaft">S</abbr>`` oder
      ``<abbr title="Bremischer Landtag">L</abbr>`` als Indikator
    - ``<h2><a>TITEL</a></h2>``
    - Stichworte (Thesaurus-Links, ignoriert)
    - ``Drs <b>21/730 S</b>`` (Drucksachen-Nr mit S/L-Suffix)
    - ``Änderungsantrag vom 23.02.2026`` (Typ + Datum)
    - ``SPD, BÜNDNIS 90/DIE GRÜNEN, Die Linke`` (Fraktionen)
    - ``<a href="https://www.bremische-buergerschaft.de/dokumente/...pdf">``

    Bremen hat zwei parallele Parlamente: Bürgerschaft (Landtag) für
    landespolitische Anträge und Stadtbürgerschaft für Bremens
    kommunale Sachen. Wir lassen beide durch (``PARL=S OR L``) — der
    Stadtbürgerschafts-Anteil ist für die GWÖ-Bilanzierung sogar
    interessanter, weil viele Entscheidungen auf kommunaler Ebene
    laufen.
    """

    bundesland = "HB"
    name = "Bremische Bürgerschaft (PARiS)"
    base_url = "https://paris.bremische-buergerschaft.de"
    servlet_path = "/starweb/paris/servlet.starweb"
    wahlperiode = 21

    # Pro-Hit-Regex über das `<tr name="Repeat_TYP">`-Pattern
    _RE_TR = re.compile(
        r'<tr\s+name="Repeat_TYP"[^>]*>([\s\S]*?)</tr\s*>',
        re.IGNORECASE,
    )
    _RE_TITLE = re.compile(r'<h2[^>]*>\s*<a[^>]*>(.*?)</a>', re.DOTALL)
    _RE_DRUCKSACHE = re.compile(r'Drs\s*<b>\s*(\d+/\d+)\s*([SL]?)\s*</b>')
    _RE_TYP_DATUM = re.compile(r'</b>\s*,\s*([^,<\n]+?)\s+vom\s+(\d{1,2}\.\d{1,2}\.\d{4})')
    _RE_FRAKTIONEN_AFTER_DATUM = re.compile(r'vom\s+\d{1,2}\.\d{1,2}\.\d{4}\s*<br\s*/?\s*>\s*([^<]+)')
    _RE_PDF_LINK = re.compile(
        r'<a\s+href="(https?://[^"]*\.pdf[^"]*)"[^>]*target="new"',
        re.IGNORECASE,
    )

    def _normalize_fraktion(self, text: str) -> list[str]:
        """Delegate faction extraction to ``app.parteien.extract_fraktionen``.

        ``text`` is passed through together with this adapter's
        ``bundesland``.
        """
        from .parteien import extract_fraktionen as _extract

        return _extract(text, bundesland=self.bundesland)

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        try:
            d, m, y = datum_de.split(".")
            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
        except ValueError:
            return ""

    @staticmethod
    def _strip_html(s: str) -> str:
        """Entferne HTML-Tags und entities aus einem Snippet."""
        s = re.sub(r"<[^>]+>", "", s)
        s = s.replace("&ndash;", "–").replace("&nbsp;", " ")
        s = re.sub(r"&[a-zA-Z]+;", " ", s)
        return re.sub(r"\s+", " ", s).strip()

    def _parse_record_html(self, chunk: str) -> Optional[Drucksache]:
        m_ds = self._RE_DRUCKSACHE.search(chunk)
        if not m_ds:
            return None
        nr_only = m_ds.group(1)         # "21/730"
        suffix = m_ds.group(2) or ""    # "S" oder "L"
        # Drucksachen-ID: ohne Whitespace, mit Suffix dahinter wenn vorhanden
        drucksache = f"{nr_only}{suffix}" if suffix else nr_only

        m_t = self._RE_TITLE.search(chunk)
        title = self._strip_html(m_t.group(1)) if m_t else f"Drucksache {drucksache}"

        m_pdf = self._RE_PDF_LINK.search(chunk)
        pdf_url = m_pdf.group(1) if m_pdf else ""

        m_td = self._RE_TYP_DATUM.search(chunk)
        if m_td:
            typ = self._strip_html(m_td.group(1))
            datum = self._datum_de_to_iso(m_td.group(2))
        else:
            typ = "Drucksache"
            datum = ""

        m_fr = self._RE_FRAKTIONEN_AFTER_DATUM.search(chunk)
        urheber = self._strip_html(m_fr.group(1)) if m_fr else ""
        fraktionen = self._normalize_fraktion(urheber)

        return Drucksache(
            drucksache=drucksache,
            title=title,
            fraktionen=fraktionen,
            datum=datum,
            link=pdf_url,
            bundesland=self.bundesland,
            typ=typ,
        )

    def _build_form_body(self, query: str) -> dict:
        """Form-Body für PARiS Suche.

        - ``path=paris/LISSHFL.web``: die LISSH-Vorgangsdatenbank
        - ``format=LISSH_BrowseVorgang_Report``: Browse-Format mit
          allen Hits in einer Page (kein Pagination)
        - ``01_LISSHFL_Themen``: Thesaurus-Volltext-Suche. Der Server
          akzeptiert kein ``*``-Wildcard und timeout-t bei leerem Wert,
          deshalb verwenden wir bei leerer Query ein hochfrequentes
          Stoppwort als Catch-all.
        - ``02_LISSHFL_PARL=S OR L``: Stadtbürgerschaft + Landtag
        - ``03_LISSHFL_WP``: aktuelle Wahlperiode (kein Range — ein
          Multi-WP-Range hat im Test 60s+ gebraucht)
        """
        return {
            "path": "paris/LISSHFL.web",
            "format": "LISSH_BrowseVorgang_Report",
            "01_LISSHFL_Themen": query or "der",  # häufiges Stoppwort
            "02_LISSHFL_PARL": "S OR L",
            "03_LISSHFL_WP": str(self.wahlperiode),
        }

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Run one POST search against the PARiS servlet.

        Result rows are parsed in page order. Only documents whose type
        contains "antrag" (case-insensitive) are kept, and collection
        stops once ``limit`` hits are gathered.

        Returns:
            The collected documents; ``[]`` on a non-200 status or when
            the request/parsing raises.
        """
        form = self._build_form_body(query)
        client_options = {
            "timeout": 60,
            "follow_redirects": True,
            "headers": {"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        }
        async with httpx.AsyncClient(**client_options) as client:
            try:
                resp = await client.post(
                    f"{self.base_url}{self.servlet_path}",
                    data=form,
                    headers={"Content-Type": "application/x-www-form-urlencoded"},
                )
                if resp.status_code != 200:
                    logger.error("HB PARiS HTTP %s", resp.status_code)
                    return []

                hits: list[Drucksache] = []
                for row in self._RE_TR.finditer(resp.text):
                    doc = self._parse_record_html(row.group(1))
                    if doc and "antrag" in (doc.typ or "").lower():
                        hits.append(doc)
                        if len(hits) >= limit:
                            break
                return hits
            except Exception:
                logger.exception("HB PARiS search error")
                return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Linearer Lookup über die search()-Resultate."""
        # Bei Drucksachen-IDs mit Suffix (21/730S) zerlegen wir die,
        # damit die Volltext-Suche den nackten Drucksachen-Anteil findet
        m = re.match(r"(\d+/\d+)([SL]?)$", drucksache)
        if not m:
            return None
        results = await self.search("*", limit=200)
        for d in results:
            if d.drucksache == drucksache:
                return d
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its text.

        The document is resolved through ``get_document``; its ``link``
        is fetched and every page is run through PyMuPDF's ``get_text``.

        Returns:
            The concatenated page text, or ``None`` when the document is
            unknown, has no link, the HTTP status is not 200, or the
            download/extraction raises.
        """
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None
        async with httpx.AsyncClient(
            timeout=60, follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    logger.error("HB PARiS PDF HTTP %s for %s", resp.status_code, drucksache)
                    return None
                # The context manager closes the document even if a page
                # fails to extract; a plain close() call was skipped then.
                with fitz.open(stream=resp.content, filetype="pdf") as pdf:
                    return "".join(page.get_text() for page in pdf)
            except Exception:
                logger.exception("HB PARiS PDF download error for %s", drucksache)
                return None


class StarWebHEAdapter(ParlamentAdapter):
    """Hesse-specific eUI adapter (#24/#30).

    starweb.hessen.de runs on an eUI backend with a synchronous two-step
    flow (unlike BW PARLIS, which polls asynchronously):

    1. POST ``/portal/browse.tt.json`` with ``action=SearchAndDisplay`` →
       the response contains ``report_id`` directly
    2. GET ``/portal/report.tt.html?report_id=...`` → HTML with the hits

    Hit format: cards with ``efxRecordRepeater`` divs, data in Perl dumps
    wrapped in HTML comments (``<!--<pre class="dump">$VAR1 = ...</pre>-->``).
    Field mapping:

    - ``WEV01`` → title
    - ``WEV02`` → date
    - ``WEV03`` → document type
    - ``WEV07`` → PDF URL
    - ``WEV08`` → Drucksachen number
    - ``WEV12`` → originator / parliamentary group

    Source: ``hlt.lis`` (Hessischer Landtag), electoral term 21.
    """

    # One Perl dump per hit, embedded in an HTML comment.
    _RE_HE_COMMENT_DUMP = re.compile(
        r'<!--\s*<pre[^>]*class="dump"[^>]*>\s*\$VAR1 = (.*?)</pre>\s*-->',
        re.DOTALL,
    )
    # Each pattern captures the ``main`` value of the first entry of a field.
    _RE_HE_WEV01 = re.compile(r"'WEV01'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
    _RE_HE_WEV02 = re.compile(r"'WEV02'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"'](\d{1,2}\.\d{1,2}\.\d{4})[\"']")
    _RE_HE_WEV03 = re.compile(r"'WEV03'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
    _RE_HE_WEV07 = re.compile(r"'WEV07'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
    _RE_HE_WEV08 = re.compile(r"'WEV08'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"'](\d+/\d+)[\"']")
    _RE_HE_WEV12 = re.compile(r"'WEV12'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")

    bundesland = "HE"
    name = "Hessischer Landtag (StarWeb)"
    base_url = "https://starweb.hessen.de"
    portal_path = "/portal"
    wahlperiode = 21

    def _normalize_fraktion(self, text: str) -> list[str]:
        """Pass ``text`` to ``parteien.extract_fraktionen`` for this state."""
        from .parteien import extract_fraktionen
        return extract_fraktionen(text, bundesland=self.bundesland)

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        """Convert ``D.M.YYYY`` to ``YYYY-MM-DD``.

        Returns an empty string for empty input or when the value does not
        consist of exactly three dot-separated parts.
        """
        if not datum_de:
            return ""
        try:
            d, m, y = datum_de.split(".")
            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
        except ValueError:
            return ""

    @staticmethod
    def _decode_perl_hex(text: str) -> str:
        """Replace Perl hex escapes such as ``\\x{e9}`` with the character
        they denote (``é``)."""
        return re.sub(
            r"\\x\{([0-9a-fA-F]+)\}",
            lambda m: chr(int(m.group(1), 16)),
            text,
        )

    def _build_initial_body(self, query: str = "") -> dict:
        """Build the request body for the HE server.

        Restricts the search to the current electoral term and, when
        ``query`` is non-empty, adds a full-text filter.

        The server REQUIRES a ``search.json`` term tree with a
        ``not(query, NOWEB=X)`` root. ``parsed``/``sref`` alone are not
        enough — the server ignores them and only returns ``facets``.
        """
        wp_str = str(self.wahlperiode)
        wp_term = {
            "tn": "term", "t": wp_str, "sf": "WP",
            "op": "eq", "idx": 45, "l": 3, "num": 1,
        }
        # Build the top-level NOT tree: NOT(query_subtree, NOWEB=X)
        if query:
            # NOTE(review): ``query`` is interpolated verbatim into the
            # search expression; a query containing quote characters or
            # backslashes presumably yields a malformed expression —
            # confirm the server's escaping rules before sanitising here.
            vtdrs_term = {
                "tn": "term",
                "t": f"\"(/VT ('\\\"{query}\\\"'))\"",
                "sf": "VTDRS", "op": "eq", "idx": 9, "l": 3, "num": 3,
            }
            inner = {"tn": "and", "terms": [vtdrs_term, wp_term], "num": 4}
            parsed = (
                f"((/VTDRS \"(/VT ('\\\"{query}\\\"'))\") "
                f"AND (/WP {wp_str})) AND NOT NOWEB=X"
            )
        else:
            inner = wp_term
            parsed = f"(/WP {wp_str}) AND NOT NOWEB=X"

        json_tree = [{
            "tn": "not",
            "terms": [
                inner,
                {"tn": "term", "t": "X", "sf": "NOWEB",
                 "op": "eq", "idx": 100, "l": 3, "num": 2},
            ],
        }]

        return {
            "action": "SearchAndDisplay",
            "sources": ["hlt.lis"],
            "report": {
                "rhl": "main",
                "rhlmode": "add",
                "format": "generic2-short",
                "mime": "html",
                "sort": "WPSORT/D DRSORT/D",
            },
            "search": {
                "lines": {"1": query, "2": wp_str},
                "serverrecordname": "generic2Search",
                "parsed": parsed,
                "sref": parsed,
                "json": json_tree,
            },
        }

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Run the synchronous two-step search against starweb.hessen.de.

        Step 1 posts the search body and reads ``report_id``; step 2 fetches
        the HTML report. Hits are then filtered client-side to document
        types containing "antrag" and, if ``query`` is given, to hits whose
        title or parliamentary groups contain every whitespace-separated
        query term (case-insensitive).

        Returns at most ``limit`` documents; an empty list on any HTTP or
        parsing error (errors are logged, never raised).
        """
        body = self._build_initial_body(query)
        browse_url = f"{self.base_url}{self.portal_path}/browse.tt.json"
        report_url = f"{self.base_url}{self.portal_path}/report.tt.html"

        async with httpx.AsyncClient(
            timeout=60, follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.post(browse_url, json=body)
                if resp.status_code != 200:
                    logger.error("HE browse HTTP %s", resp.status_code)
                    return []
                data = resp.json()
                report_id = data.get("report_id")
                if not report_id:
                    logger.error("HE: no report_id in browse response keys=%s", sorted(data.keys()))
                    return []

                # Step 2: report.tt.html with chunksize — without that
                # parameter the server only returns the very first hit
                # (8 KB of HTML). 1500 is used as a floor, analogous to
                # #61 PortalaAdapter, because the hit density after the
                # client-side Antrag filter is low (HE has roughly 1:30
                # Antrag/Anfrage).
                chunksize = max(limit * 30, 1500)
                rep = await client.get(
                    report_url,
                    params={
                        "report_id": report_id,
                        "start": 0,
                        "chunksize": chunksize,
                    },
                )
                if rep.status_code != 200:
                    logger.error("HE report HTTP %s", rep.status_code)
                    return []
                results = self._parse_report_html(rep.text)
                # Client-side Antrag filter (analogous to #61 bug 2/3 for portala)
                results = [d for d in results if "antrag" in (d.typ or "").lower()]
                # Optional client-side query filter
                if query:
                    qterms = query.lower().split()
                    matching: list[Drucksache] = []
                    for d in results:
                        # Build the haystack once per document rather than
                        # once per query term.
                        hay = d.title.lower() + " " + " ".join(d.fraktionen).lower()
                        if all(t in hay for t in qterms):
                            matching.append(d)
                    results = matching
                # Clamp so that a negative limit yields no hits instead of
                # silently dropping hits from the end of the list.
                return results[:max(limit, 0)]
            except Exception:
                logger.exception("HE search error")
                return []

    def _parse_report_html(self, html: str) -> list[Drucksache]:
        """Extract documents from the ``<!--<pre class="dump">$VAR1 = ...-->``
        comments of a report page.

        Maps WEV01–WEV12 onto the ``Drucksache`` fields. Dumps without a
        ``WEV08`` Drucksachen number are skipped; other missing fields fall
        back to defaults (generated title, empty date/link, type
        ``"Drucksache"``).
        """
        results: list[Drucksache] = []
        for dump in self._RE_HE_COMMENT_DUMP.findall(html):
            m_ds = self._RE_HE_WEV08.search(dump)
            if not m_ds:
                continue
            drucksache = m_ds.group(1)

            m_t = self._RE_HE_WEV01.search(dump)
            title = self._decode_perl_hex(m_t.group(1)) if m_t else f"Drucksache {drucksache}"

            m_pdf = self._RE_HE_WEV07.search(dump)
            pdf_url = m_pdf.group(1) if m_pdf else ""
            # Upgrade plain-HTTP links to HTTPS.
            if pdf_url.startswith("http://"):
                pdf_url = "https://" + pdf_url[len("http://"):]

            m_dat = self._RE_HE_WEV02.search(dump)
            datum_iso = self._datum_de_to_iso(m_dat.group(1)) if m_dat else ""

            m_typ = self._RE_HE_WEV03.search(dump)
            typ = self._decode_perl_hex(m_typ.group(1)) if m_typ else "Drucksache"

            m_urheber = self._RE_HE_WEV12.search(dump)
            urheber = self._decode_perl_hex(m_urheber.group(1)) if m_urheber else ""
            fraktionen = self._normalize_fraktion(urheber)

            results.append(Drucksache(
                drucksache=drucksache, title=title, fraktionen=fraktionen,
                datum=datum_iso, link=pdf_url, bundesland=self.bundesland,
                typ=typ,
            ))

        return results

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Linear lookup over ``search()`` results; there is no direct ID
        filter.

        Only the hits of an unfiltered search with ``limit=200`` are
        scanned, so a document outside that window is reported as ``None``.
        """
        results = await self.search("", limit=200)
        for d in results:
            if d.drucksache == drucksache:
                return d
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the document's PDF and return its extracted plain text.

        Returns ``None`` when the document cannot be resolved, has no link,
        the server does not answer with HTTP 200, or the download/parsing
        raises (the exception is logged).
        """
        import fitz

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None
        async with httpx.AsyncClient(
            timeout=60, follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    return None
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                try:
                    # join() avoids quadratic string concatenation; the
                    # finally clause releases the PDF handle even when a
                    # page fails to parse.
                    return "".join(page.get_text() for page in pdf)
                finally:
                    pdf.close()
            except Exception:
                logger.exception("HE PDF download error for %s", drucksache)
                return None


class BundestagAdapter(ParlamentAdapter):
    """Adapter for the German Bundestag via the DIP API.

    Source: ``search.dip.bundestag.de/api/v1`` — the official REST API of
    the Dokumentations- und Informationssystem (DIP). The schema is
    documented at https://dip.bundestag.de/über-dip/hilfe/api (SPA, content
    lives in the ``main.*.chunk.js`` bundle). Auth is done via the URL
    parameter ``apikey=...`` PLUS an ``Origin: https://dip.bundestag.de``
    header — the server applies origin locking to its own single-page app.

    The API key is publicly visible in ``dip-config.js`` and is sent by the
    DIP frontend as a URL parameter with every request. As long as the
    Origin header is set, the API accepts it from server-to-server calls
    as well.

    Document mapping (``/api/v1/drucksache``):

    - ``dokumentnummer`` → ``drucksache`` (e.g. ``"21/5136"``)
    - ``titel`` → ``title``
    - ``urheber[*].bezeichnung``/``titel`` → ``fraktionen`` (normalized by
      ``parteien.extract_fraktionen``, which covers
      ``"Fraktion der AfD"`` → ``"AfD"``)
    - ``datum`` → ``datum`` (already ISO YYYY-MM-DD)
    - ``fundstelle.pdf_url`` → ``link``
    - ``drucksachetyp`` → ``typ`` (filtered to ``"Antrag"``)

    Pagination uses the ``cursor`` parameter — the server returns a new
    cursor with every result, which is sent along with the next request.
    100 hits per page, roughly 600 Anträge per electoral term.
    """

    bundesland = "BUND"
    name = "Deutscher Bundestag (DIP)"
    base_url = "https://search.dip.bundestag.de/api/v1"

    # Scraped from dip-config.js (public, plain text, sent by the DIP SPA
    # with every request). Origin locking makes the key non-trivial to pass
    # on, but it is fully functional for server-to-server calls that set
    # the Origin header.
    DEFAULT_APIKEY = "SbGXhWA.3cpnNdb8rkht7iWpvSgTP8XIG88LoCrGd4"
    ORIGIN = "https://dip.bundestag.de"

    def __init__(
        self,
        *,
        apikey: Optional[str] = None,
        wahlperiode: int = 21,
        document_typ: str = "Antrag",
    ):
        """Configure the adapter.

        Args:
            apikey: DIP API key; falls back to ``DEFAULT_APIKEY`` when
                omitted or empty.
            wahlperiode: Electoral term sent as ``f.wahlperiode``.
            document_typ: Value sent as ``f.drucksachetyp`` when listing.
        """
        self.apikey = apikey or self.DEFAULT_APIKEY
        self.wahlperiode = wahlperiode
        self.document_typ = document_typ

    def _make_client(self) -> httpx.AsyncClient:
        """Create an HTTP client carrying the Origin/Referer headers the
        DIP API expects."""
        return httpx.AsyncClient(
            timeout=30,
            follow_redirects=True,
            headers={
                "Origin": self.ORIGIN,
                "Referer": f"{self.ORIGIN}/",
                "User-Agent": "Mozilla/5.0 GWOE-Antragspruefer",
                "Accept": "application/json",
            },
        )

    def _doc_to_drucksache(self, doc: dict) -> Optional[Drucksache]:
        """Map one DIP ``/drucksache`` JSON object onto ``Drucksache``.

        Returns ``None`` when essential fields (``dokumentnummer`` or
        ``fundstelle.pdf_url``) are missing.
        """
        from .parteien import extract_fraktionen

        nummer = doc.get("dokumentnummer")
        if not nummer:
            return None

        # Take the PDF URL from ``fundstelle`` — that is the reliable address.
        fundstelle = doc.get("fundstelle") or {}
        pdf_url = fundstelle.get("pdf_url") or ""
        if not pdf_url:
            return None

        # Extract the parliamentary groups from the ``urheber`` list. DIP
        # lists them as "Fraktion der AfD" or similar — extract_fraktionen
        # already knows that pattern from the Landtag adapters.
        urheber_strs = [
            u.get("titel") or u.get("bezeichnung") or ""
            for u in (doc.get("urheber") or [])
            if isinstance(u, dict)
        ]
        urheber_combined = ", ".join(filter(None, urheber_strs))
        fraktionen = extract_fraktionen(urheber_combined, bundesland=self.bundesland)

        # ``or`` instead of a ``get`` default: an explicit JSON null must
        # not end up as ``None`` in the string fields (``search()`` calls
        # ``title.lower()``).
        return Drucksache(
            drucksache=nummer,
            title=doc.get("titel") or "",
            fraktionen=fraktionen,
            datum=doc.get("datum") or "",
            link=pdf_url,
            bundesland=self.bundesland,
            typ=doc.get("drucksachetyp") or "Antrag",
        )

    async def _fetch_page(
        self, client: httpx.AsyncClient, *, cursor: Optional[str] = None,
    ) -> tuple[list[dict], Optional[str]]:
        """Load one page from the ``/drucksache`` endpoint.

        Returns ``(docs, next_cursor)``; ``([], None)`` on a non-200
        response or on any request/parsing error (both are logged).
        """
        params = {
            "apikey": self.apikey,
            "f.drucksachetyp": self.document_typ,
            "f.wahlperiode": str(self.wahlperiode),
        }
        if cursor:
            params["cursor"] = cursor
        try:
            resp = await client.get(f"{self.base_url}/drucksache", params=params)
            if resp.status_code != 200:
                logger.error("BUND DIP HTTP %s: %s", resp.status_code, resp.text[:200])
                return [], None
            data = resp.json()
            # ``or []`` keeps the declared return type when the server sends
            # an explicit null for ``documents``.
            return data.get("documents") or [], data.get("cursor")
        except Exception:
            logger.exception("BUND DIP request error")
            return [], None

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """List the newest Anträge of the configured electoral term.

        The server delivers results filtered to the document type and
        sorted by recency; this method paginates via cursor until ``limit``
        (or the end of the term) is reached. ``query`` is applied
        client-side as a title substring filter (every whitespace-separated
        term must occur, case-insensitive) — the DIP API has an ``f.titel``
        filter, but for consistency with the Landtag adapters (which all
        filter client-side because of schema drift) it is done the same way
        here.

        Returns an empty list for ``limit <= 0`` without issuing a request.
        """
        if limit <= 0:
            return []

        results: list[Drucksache] = []
        seen: set[str] = set()
        query_terms = [t.lower() for t in query.split() if t] if query else []

        async with self._make_client() as client:
            cursor: Optional[str] = None
            for _ in range(20):  # max 20 pages = 2000 docs as a hard cap
                docs, next_cursor = await self._fetch_page(client, cursor=cursor)
                if not docs:
                    break
                for raw in docs:
                    doc = self._doc_to_drucksache(raw)
                    if not doc:
                        continue
                    if doc.drucksache in seen:
                        continue
                    seen.add(doc.drucksache)
                    if query_terms:
                        hay = doc.title.lower()
                        if not all(t in hay for t in query_terms):
                            continue
                    results.append(doc)
                    if len(results) >= limit:
                        return results
                # Cursor unchanged → last page reached
                if not next_cursor or next_cursor == cursor:
                    break
                cursor = next_cursor

        return results

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up a single Drucksache by ID.

        Uses the ``f.dokumentnummer`` filter — a direct hit without
        pagination. The request is also restricted to the configured
        electoral term. Returns ``None`` when nothing matches or the
        request fails (failures are logged).
        """
        async with self._make_client() as client:
            try:
                resp = await client.get(
                    f"{self.base_url}/drucksache",
                    params={
                        "apikey": self.apikey,
                        "f.dokumentnummer": drucksache,
                        "f.wahlperiode": str(self.wahlperiode),
                    },
                )
                if resp.status_code != 200:
                    logger.error("BUND DIP HTTP %s: %s", resp.status_code, resp.text[:200])
                    return None
                docs = resp.json().get("documents") or []
                for raw in docs:
                    if raw.get("dokumentnummer") == drucksache:
                        return self._doc_to_drucksache(raw)
            except Exception:
                logger.exception("BUND get_document error for %s", drucksache)
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the Drucksache PDF and extract its full text.

        Returns ``None`` when the document cannot be resolved, has no link,
        the server does not answer with HTTP 200, or the download/parsing
        raises (the exception is logged).
        """
        import fitz

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None
        async with httpx.AsyncClient(
            timeout=60, follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    return None
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                try:
                    # join() avoids quadratic string concatenation; the
                    # finally clause releases the PDF handle even when a
                    # page fails to parse.
                    return "".join(page.get_text() for page in pdf)
                finally:
                    pdf.close()
            except Exception:
                logger.exception("BUND download error for %s", drucksache)
                return None


class SaarlandAdapter(ParlamentAdapter):
    """Adapter for the Landtag of Saarland via its Umbraco JSON API (#19).

    The backend is an Umbraco/.NET SurfaceController layer behind
    ``www.landtag-saar.de``. The search page ``/suche?searchValue=…`` loads
    its results via an XHR POST against
    ``/umbraco/aawSearchSurfaceController/SearchSurface/GetSearchResults/``.

    The schema was reverse-engineered from a HAR capture (user browser,
    searching for ``Schule``). Important:

    - The Content-Type is ``application/x-www-form-urlencoded; charset=UTF-8``,
      but the body is nevertheless **raw JSON** (Kendo convention of
      ``$.ajax`` without an explicit ``contentType``). An
      ``application/json`` header works too, but only with the minimal body
      form below — as soon as ``Sections.{Print,Operations,…}`` are set,
      the server answers with HTTP 500. With ``Sections:{}`` everything is
      fine and the server returns hits across all sections.
    - Body schema:

      ```json
      {
        "Filter": {"Periods": [17]},
        "Pageination": {"Skip": 0, "Take": 10},
        "Sections": {},
        "Sort": {},
        "OnlyTitle": false,
        "Value": "Schule",
        "CurrentSearchTab": 0
      }
      ```

    - Response: ``FilteredResult[]`` where each item has ``DocumentNumber``
      (``"17/11"``), ``Legislative`` (electoral term, int), ``DocumentType``
      (``"Antrag"``/``"Anfrage"``/``"Gesetzentwurf"``/…), ``Title``,
      ``PublicDate``, ``DocumentAuthor`` (list of ``Name (Party);…``),
      ``Publisher`` (parliamentary group for collective motions),
      ``FilePath`` (relative, ``/file.ashx?FileId=…&FileName=…``).

    The filter on ``DocumentType=="Antrag"`` runs client-side because the
    server's Sections structure lacks that granularity (Print contains
    Anfragen, Anträge and Gesetzentwürfe mixed together).

    Drucksachen lookup: ``Value="17/11"`` matches the Drucksachen number
    directly in first position — a dedicated ``GetById`` endpoint does not
    exist.
    """

    bundesland = "SL"
    name = "Landtag des Saarlandes"
    base_url = "https://www.landtag-saar.de"

    def __init__(self, *, wahlperiode: int = 17):
        """Store the electoral term used as the ``Periods`` filter."""
        self.wahlperiode = wahlperiode

    def _make_client(self) -> httpx.AsyncClient:
        """Create an HTTP client with the XHR-style headers the search
        page sends."""
        return httpx.AsyncClient(
            timeout=30,
            follow_redirects=True,
            headers={
                "User-Agent": "Mozilla/5.0 GWOE-Antragspruefer",
                "Accept": "application/json, text/javascript, */*; q=0.01",
                "X-Requested-With": "XMLHttpRequest",
                "Origin": self.base_url,
                "Referer": f"{self.base_url}/suche?searchValue=&ActiveTab=0",
            },
        )

    def _build_body(self, query: str, *, skip: int = 0, take: int = 50) -> str:
        """Build the minimal body that the server accepts, as a JSON string.

        Note: ``Sections={}`` and ``Sort={}`` are MANDATORY as empty objects
        (do not omit them, do not fill them — populated Sections trigger
        HTTP 500).
        """
        return json.dumps({
            "Filter": {"Periods": [self.wahlperiode]},
            "Pageination": {"Skip": skip, "Take": take},
            "Sections": {},
            "Sort": {},
            "OnlyTitle": False,
            "Value": query or "",
            "CurrentSearchTab": 0,
        })

    @staticmethod
    def _doc_to_drucksache(item: dict) -> Optional[Drucksache]:
        """Map one ``FilteredResult`` item onto ``Drucksache``.

        Returns ``None`` when the item has no ``DocumentNumber``.
        """
        from .parteien import extract_fraktionen

        nummer = item.get("DocumentNumber")
        if not nummer:
            return None

        # Parliamentary groups come from Publisher (collective motions:
        # "CDU", "SPD") or DocumentAuthor (individual members:
        # "Schmitt-Lang, Jutta (CDU)"). Both are normalized via
        # extract_fraktionen.
        publisher = item.get("Publisher") or ""
        author = item.get("DocumentAuthor") or ""
        fraktionen = extract_fraktionen(
            f"{publisher} {author}".strip(), bundesland=SaarlandAdapter.bundesland,
        )

        # PublicDate has the format ``2022-05-12T00:00:00`` — keep only the
        # ISO date part.
        public_date = (item.get("PublicDate") or "")[:10]

        # ``FilePath`` is ``/file.ashx?FileId=…&FileName=…`` — but that
        # returns HTML with an iframe wrapper, not the PDF itself. The real
        # binary endpoint is ``/Downloadfile.ashx`` (capital letter!) with
        # the same query parameters. The server answers there with
        # ``Content-Type: application/pdf``.
        file_path = item.get("FilePath") or ""
        if file_path.startswith("/file.ashx"):
            file_path = file_path.replace("/file.ashx", "/Downloadfile.ashx", 1)
        # Relative paths are resolved against the adapter's base URL;
        # anything else is passed through unchanged.
        link = (
            f"{SaarlandAdapter.base_url}{file_path}"
            if file_path.startswith("/") else file_path
        )

        # ``or ""`` instead of a ``get`` default: an explicit JSON null must
        # not end up as ``None`` in the string fields.
        return Drucksache(
            drucksache=nummer,
            title=item.get("Title") or "",
            fraktionen=fraktionen,
            datum=public_date,
            link=link,
            bundesland=SaarlandAdapter.bundesland,
            typ=item.get("DocumentType") or "",
        )

    async def _post_search(
        self, client: httpx.AsyncClient, query: str, *, skip: int = 0, take: int = 50,
    ) -> list[dict]:
        """POST one search request and return the raw ``FilteredResult``
        items.

        Returns an empty list on a non-200 response or on any
        request/parsing error (both are logged).
        """
        url = (
            f"{self.base_url}/umbraco/aawSearchSurfaceController/"
            "SearchSurface/GetSearchResults/"
        )
        body = self._build_body(query, skip=skip, take=take)
        try:
            resp = await client.post(
                url,
                content=body,
                headers={
                    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                },
            )
            if resp.status_code != 200:
                logger.error("SL HTTP %s: %s", resp.status_code, resp.text[:200])
                return []
            data = resp.json()
            return data.get("FilteredResult", []) or []
        except Exception:
            logger.exception("SL search request error")
            return []

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Full-text search over the current electoral term, filtered to
        Anträge.

        Fetches ``max(5*limit, 30)`` hits in a single page, filters
        client-side on ``DocumentType=="Antrag"`` (the Print section also
        contains Anfragen and Gesetzentwürfe), de-duplicates by Drucksachen
        number and truncates to ``limit``. Sorting is relevance-based as
        delivered by the server — for the UI, relevance to a query is
        usually more valuable than date-descending order.

        Returns an empty list for ``limit <= 0`` without issuing a request.
        """
        if limit <= 0:
            return []

        async with self._make_client() as client:
            # Generous ``take`` because the Antrag filter removes roughly
            # 30-50% of the hits.
            take = max(limit * 5, 30)
            items = await self._post_search(client, query, skip=0, take=take)

        results: list[Drucksache] = []
        seen: set[str] = set()
        for item in items:
            if (item.get("DocumentType") or "").lower() != "antrag":
                continue
            doc = self._doc_to_drucksache(item)
            if doc is None or doc.drucksache in seen:
                continue
            seen.add(doc.drucksache)
            results.append(doc)
            if len(results) >= limit:
                break
        return results

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Direct lookup via ``Value=<drucksache>``.

        The server search matches the Drucksachen number in the document
        itself; the first of up to 20 returned items whose
        ``DocumentNumber`` equals ``drucksache`` is returned, otherwise
        ``None``.
        """
        async with self._make_client() as client:
            items = await self._post_search(client, drucksache, take=20)

        for item in items:
            if item.get("DocumentNumber") == drucksache:
                return self._doc_to_drucksache(item)
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Fetch the PDF behind the document's link and extract its text.

        The link is the one produced by ``_doc_to_drucksache`` (for
        relative ``/file.ashx`` paths that is the ``/Downloadfile.ashx``
        endpoint). Returns ``None`` when the document cannot be resolved,
        has no link, the server does not answer with HTTP 200, or the
        download/parsing raises (failures are logged).
        """
        import fitz

        doc = await self.get_document(drucksache)
        if doc is None or not doc.link:
            return None

        async with httpx.AsyncClient(
            timeout=60, follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    logger.error("SL PDF HTTP %s for %s", resp.status_code, drucksache)
                    return None
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                try:
                    # join() avoids quadratic string concatenation; the
                    # finally clause releases the PDF handle even when a
                    # page fails to parse.
                    return "".join(page.get_text() for page in pdf)
                finally:
                    pdf.close()
            except Exception:
                logger.exception("SL download error for %s", drucksache)
                return None


# Registry of adapters, keyed by the short code that callers pass as
# ``bundesland`` (e.g. "NRW", "BUND"). Every adapter is instantiated once,
# when this module is imported, and the same instance is handed out on
# each lookup.
ADAPTERS = {
    "BUND": BundestagAdapter(),
    "HB": PARiSHBAdapter(),
    "HE": StarWebHEAdapter(),
    "NRW": NRWAdapter(),
    "SN": SNEdasXmlAdapter(),
    "LSA": PortalaAdapter(
        bundesland="LSA",
        name="Landtag von Sachsen-Anhalt (PADOKA)",
        base_url="https://padoka.landtag.sachsen-anhalt.de",
        db_id="lsa.lissh",
        wahlperiode=8,
        portala_path="/portal",
        document_type="Antrag",
        pdf_url_prefix="/files/",
    ),
    "BE": PortalaAdapter(
        bundesland="BE",
        name="Abgeordnetenhaus von Berlin (PARDOK)",
        base_url="https://pardok.parlament-berlin.de",
        db_id="lah.lissh",
        wahlperiode=19,
        portala_path="/portala",
        # Berlin's ETYPF index uses different value strings — drop the
        # document_type subtree, fall back to client-side title filter.
        document_type=None,
        # Quick-win for #13: pulled the date window from the original
        # 180-day MVP up to 730 days so client-side title-filter searches
        # ("Schule" etc.) reach back across more of the WP19 corpus until
        # the eUI fulltext-sf is reverse-engineered. The chunksize bump
        # in PortalaAdapter.search() means the per-request payload stays
        # bounded.
        date_window_days=730,
        pdf_url_prefix="/files/",
    ),
    "MV": ParLDokAdapter(
        bundesland="MV",
        name="Landtag Mecklenburg-Vorpommern (ParlDok)",
        base_url="https://www.dokumentation.landtag-mv.de",
        wahlperiode=8,
        prefix="/parldok",
        document_typ="Antrag",
    ),
    "HH": ParLDokAdapter(
        bundesland="HH",
        name="Hamburgische Bürgerschaft (ParlDok)",
        base_url="https://www.buergerschaft-hh.de",
        wahlperiode=23,
        prefix="/parldok",
        document_typ="Antrag",
    ),
    "TH": ParLDokAdapter(
        bundesland="TH",
        name="Thüringer Landtag (ParlDok)",
        base_url="https://parldok.thueringer-landtag.de",
        wahlperiode=8,
        prefix="/parldok",
        # TH packs Anträge under composite type strings like
        # "Antrag gemäß § 79 GO" with kind="Vorlage", not the
        # MV-style kind="Drucksache"/type="Antrag". Substring-match
        # on "Antrag" plus widened kind list catches them all.
        document_typ="Antrag",
        document_typ_substring=True,
        kinds=["Drucksache", "Vorlage"],
    ),
    "SH": StarFinderCGIAdapter(
        bundesland="SH",
        name="Schleswig-Holsteinischer Landtag (LIS-SH)",
        # NOTE(review): this is the only plain-HTTP base URL in the
        # registry — confirm whether the host is reachable via HTTPS.
        base_url="http://lissh.lvn.parlanet.de",
        wahlperiode=20,
        db_path="lisshfl.txt",
        document_typ_code="antrag",
    ),
    "BB": PortalaAdapter(
        bundesland="BB",
        name="Landtag Brandenburg (parladoku)",
        base_url="https://www.parlamentsdokumentation.brandenburg.de",
        db_id="lbb.lissh",
        wahlperiode=8,
        portala_path="/portal",
        document_type="Antrag",
        # BB packs the date BEFORE the Drucksachen-Nummer in the h6
        # line and uses the BE-style efxRecordRepeater HTML cards;
        # the auto-detect picks the card path automatically.
    ),
    "RP": PortalaAdapter(
        bundesland="RP",
        name="Landtag Rheinland-Pfalz (OPAL)",
        base_url="https://opal.rlp.de",
        db_id="rlp.lissh",
        wahlperiode=18,
        portala_path="/portal",
        document_type="Antrag",
    ),
    "BY": BayernAdapter(),
    "SL": SaarlandAdapter(),
    "BW": PARLISAdapter(
        bundesland="BW",
        name="Landtag von Baden-Württemberg (PARLIS)",
        base_url="https://parlis.landtag-bw.de",
        wahlperiode=17,
        prefix="/parlis",
        document_typ="Antrag",
    ),
}


def get_adapter(bundesland: str) -> Optional[ParlamentAdapter]:
    """Return the adapter registered in ``ADAPTERS`` under ``bundesland``.

    The key is looked up exactly as given; ``None`` is returned when no
    adapter is registered under it.
    """
    try:
        return ADAPTERS[bundesland]
    except KeyError:
        return None


async def search_all(query: str, bundesland: str = "NRW", limit: int = 20) -> list[Drucksache]:
    """Search parliament documents in a single state.

    Despite the name, this queries only the adapter registered for
    ``bundesland``; it does not fan out over every state.

    Args:
        query: Search string, passed through to the adapter unchanged.
        bundesland: Key of the adapter to use (see ``ADAPTERS``).
        limit: Maximum number of hits, passed through to the adapter.

    Returns:
        Whatever the adapter's ``search`` returns, or an empty list when
        no adapter is registered for ``bundesland``.
    """
    adapter = get_adapter(bundesland)
    if adapter is None:
        # Keep the "empty result" contract for callers, but leave a trace:
        # an unregistered state code is otherwise indistinguishable from a
        # search that legitimately found nothing.
        logger.warning("search_all: no adapter registered for bundesland %r", bundesland)
        return []
    return await adapter.search(query, limit)
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								"""Parliament search adapters for different German states."""
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								import json
 								import logging
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								import httpx
 								import re
 								from abc import ABC, abstractmethod
 								from dataclasses import dataclass
 								from typing import Optional
 								from bs4 import BeautifulSoup
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								logger = logging.getLogger(__name__)
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
 								@dataclass
 								class Drucksache:
 								    """A parliamentary document."""
 								    drucksache: str  # e.g. "18/8125"
 								    title: str
 								    fraktionen: list[str]
 								    datum: str  # ISO date
 								    link: str  # PDF URL
 								    bundesland: str
 								    typ: str = "Antrag"  # Antrag, Anfrage, Beschlussempfehlung, etc.
 								class ParlamentAdapter(ABC):
 								    """Base adapter for searching parliament documents."""
 								    bundesland: str
 								    name: str
 								    @abstractmethod
 								    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
 								        """Search for documents matching query."""
 								        pass
 								    @abstractmethod
 								    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
 								        """Get a specific document by ID."""
 								        pass
 								    @abstractmethod
 								    async def download_text(self, drucksache: str) -> Optional[str]:
 								        """Download and extract text from a document."""
 								        pass
 								class NRWAdapter(ParlamentAdapter):
 								    """Adapter for NRW Landtag (opal.landtag.nrw.de)."""
 								    bundesland = "NRW"
 								    name = "Landtag Nordrhein-Westfalen"
 								    base_url = "https://opal.landtag.nrw.de"
 								    search_url = "https://opal.landtag.nrw.de/home/dokumente/dokumentensuche/parlamentsdokumente/aktuelle-dokumente.html"
 								    def _parse_query(self, query: str) -> tuple[str, list[str], bool]:
 								        """
 								        Parse search query for AND logic and exact phrases.
 								        Returns: (search_term_for_api, filter_terms, is_exact)
 								        Examples:
 								        - 'Klimaschutz Energie' -> ('Klimaschutz', ['klimaschutz', 'energie'], False)
 								        - '"Grüner Stahl"' -> ('Grüner Stahl', ['grüner stahl'], True)
 								        - 'Klimaschutz "erneuerbare Energie"' -> ('Klimaschutz', ['klimaschutz', 'erneuerbare energie'], False)
 								        """
 								        query = query.strip()
 								        # Check for exact phrase (entire query in quotes)
 								        if query.startswith('"') and query.endswith('"') and query.count('"') == 2:
 								            exact = query[1:-1].strip()
 								            return (exact, [exact.lower()], True)
 								        # Extract quoted phrases and regular terms
 								        import shlex
 								        try:
 								            parts = shlex.split(query)
 								        except ValueError:
 								            # Fallback for unbalanced quotes
 								            parts = query.split()
 								        if not parts:
 								            return (query, [query.lower()], False)
 								        # Use first term for API search, all terms for filtering
 								        filter_terms = [p.lower() for p in parts]
 								        return (parts[0], filter_terms, False)
 								    def _matches_all_terms(self, doc: 'Drucksache', terms: list[str], is_exact: bool) -> bool:
 								        """Check if document matches all search terms (AND logic)."""
 								        searchable = f"{doc.title} {doc.drucksache} {' '.join(doc.fraktionen)} {doc.typ}".lower()
 								        if is_exact:
 								            # Exact phrase must appear
 								            return terms[0] in searchable
 								        else:
 								            # All terms must appear (AND)
 								            return all(term in searchable for term in terms)
 								    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
 								        """Search NRW Landtag documents via OPAL portal."""
 								        results = []
 								        # Parse query for AND logic
 								        api_query, filter_terms, is_exact = self._parse_query(query)
 								        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
 								            try:
 								                # First, get the page to establish session
 								                initial = await client.get(self.search_url)
 								                if initial.status_code != 200:
 								                    print(f"NRW search initial request failed: {initial.status_code}")
 								                    return []
 								                # Parse for webflow token from pagination links
 								                soup = BeautifulSoup(initial.text, 'html.parser')
 								                # Find a pagination link to extract the webflow token
 								                pagination_link = soup.select_one('a[href*="webflowexecution"]')
 								                webflow_token = ""
 								                webflow_execution = ""
 								                if pagination_link:
 								                    href = pagination_link.get('href', '')
 								                    # Extract webflowToken and webflowexecution from URL
 								                    token_match = re.search(r'webflowToken=([^&]*)', href)
 								                    exec_match = re.search(r'(webflowexecution[^=]+)=([^&]+)', href)
 								                    if token_match:
 								                        webflow_token = token_match.group(1)
 								                    if exec_match:
 								                        webflow_execution = f"{exec_match.group(1)}={exec_match.group(2)}"
 								                # Now perform the search with POST
 								                # Find the form action URL with webflow token
 								                form = soup.select_one('form#docSearchByItem')
 								                form_action = self.search_url
 								                if form and form.get('action'):
 								                    action = form.get('action')
 								                    if action.startswith('/'):
 								                        form_action = f"{self.base_url}{action}"
 								                    elif action.startswith('http'):
 								                        form_action = action
 								                    else:
 								                        form_action = f"{self.search_url}?{action}"
 								                # Build form data for "Einfache Suche" (searchByItem form)
 								                form_data = {
 								                    '_eventId_sendform': '1',
 								                    'dokNum': api_query,  # This is the text search field
 								                    'formId': 'searchByItem',
 								                    'dokTyp': '',  # All types
 								                    'wp': '18',  # Wahlperiode 18
 								                }
 								                # POST request with form data to the form action URL
 								                search_resp = await client.post(
 								                    form_action,
 								                    data=form_data,
 								                    cookies=initial.cookies,
 								                    headers={'Content-Type': 'application/x-www-form-urlencoded'}
 								                )
 								                if search_resp.status_code != 200:
 								                    print(f"NRW search request failed: {search_resp.status_code}")
 								                    return []
 								                # Parse results
 								                soup = BeautifulSoup(search_resp.text, 'html.parser')
 								                # Find all document result items (li elements containing articles)
 								                items = soup.select('li:has(article)')
 								                for item in items[:limit]:
 								                    try:
 								                        # Extract drucksache number from first link
 								                        num_link = item.select_one('a[href*="MMD"]')
 								                        if not num_link:
 								                            continue
 								                        href = num_link.get('href', '')
 								                        # Extract number: MMD18-12345.pdf -> 18/12345
 								                        match = re.search(r'MMD(\d+)-(\d+)\.pdf', href)
 								                        if not match:
 								                            continue
 								                        legislatur, nummer = match.groups()
 								                        drucksache = f"{legislatur}/{nummer}"
 								                        pdf_url = f"https://www.landtag.nrw.de{href}" if href.startswith('/') else href
 								                        # Extract title from the title link (class e-document-result-item__title)
 								                        title_elem = item.select_one('a.e-document-result-item__title')
 								                        if title_elem:
 								                            # Get text content, clean it up
 								                            title = title_elem.get_text(strip=True)
 								                            # Remove SVG icon text and clean
 								                            title = re.sub(r'\s*<svg.*', '', title)
 								                            title = re.sub(r'\s+', ' ', title).strip()
 								                        else:
 								                            # Fallback: try to find any longer text
 								                            title = f"Drucksache {drucksache}"
 								                        # Clean up common artifacts
 								                        title = re.sub(r'\s*\(\s*externer Link.*?\)', '', title).strip()
 								                        # Extract type (Antrag, Kleine Anfrage, etc.)
 								                        typ_elem = item.select_one('.e-document-result-item__category')
 								                        typ = typ_elem.get_text(strip=True) if typ_elem else "Drucksache"
 								                        # Extract date
 								                        time_elem = item.select_one('time')
 								                        datum = ""
 								                        if time_elem:
 								                            datum_text = time_elem.get_text(strip=True)
 								                            # Convert DD.MM.YYYY to YYYY-MM-DD
 								                            date_match = re.match(r'(\d{2})\.(\d{2})\.(\d{4})', datum_text)
 								                            if date_match:
 								                                d, m, y = date_match.groups()
 								                                datum = f"{y}-{m}-{d}"
 								                        # Extract Urheber (fraktionen) - look for paragraph containing "Urheber:"
 								                        urheber_text = ""
 								                        for p in item.select('p'):
 								                            if 'Urheber:' in p.get_text():
 								                                urheber_text = p.get_text()
 								                                break
 								                        fraktionen = []
 								                        if urheber_text:
 								                            # Extract party names (SPD, CDU, GRÜNE, FDP, AfD)
 								                            for party in ['SPD', 'CDU', 'GRÜNE', 'Grüne', 'FDP', 'AfD']:
 								                                if party in urheber_text:
 								                                    fraktionen.append(party.upper() if party.lower() != 'grüne' else 'GRÜNE')
 								                        doc = Drucksache(
 								                            drucksache=drucksache,
 								                            title=title,
 								                            fraktionen=fraktionen,
 								                            datum=datum,
 								                            link=pdf_url,
 								                            bundesland="NRW",
 								                            typ=typ,
 								                        )
 								                        # Apply AND filter (all terms must match)
 								                        if self._matches_all_terms(doc, filter_terms, is_exact):
 								                            results.append(doc)
 								                    except Exception as e:
 								                        print(f"Error parsing item: {e}")
 								                        continue
 								            except Exception as e:
 								                print(f"NRW search error: {e}")
 								        return results
 								    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
 								        """Get document metadata by drucksache ID (e.g. '18/8125')."""
 								        # Parse legislatur and number
 								        match = re.match(r"(\d+)/(\d+)", drucksache)
 								        if not match:
 								            return None
 								        legislatur, nummer = match.groups()
 								        pdf_url = f"https://www.landtag.nrw.de/portal/WWW/dokumentenarchiv/Dokument/MMD{legislatur}-{nummer}.pdf"
 								        # Try to fetch and extract basic info
 								        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
 								            try:
 								                resp = await client.head(pdf_url)
 								                if resp.status_code == 200:
 								                    return Drucksache(
 								                        drucksache=drucksache,
 								                        title=f"Drucksache {drucksache}",
 								                        fraktionen=[],
 								                        datum="",
 								                        link=pdf_url,
 								                        bundesland="NRW",
 								                    )
 								            except:
 								                pass
 								        return None
 								    async def download_text(self, drucksache: str) -> Optional[str]:
 								        """Download PDF and extract text."""
 								        import fitz  # PyMuPDF
 								        doc = await self.get_document(drucksache)
 								        if not doc:
 								            return None
 								        async with httpx.AsyncClient(timeout=60, follow_redirects=True) as client:
 								            try:
 								                resp = await client.get(doc.link)
 								                if resp.status_code != 200:
 								                    return None
 								                # Extract text with PyMuPDF
 								                pdf = fitz.open(stream=resp.content, filetype="pdf")
 								                text = ""
 								                for page in pdf:
 								                    text += page.get_text()
 								                pdf.close()
 								                return text
 								            except Exception as e:
 								                print(f"Error downloading {drucksache}: {e}")
 								                return None
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								class PortalaAdapter(ParlamentAdapter):
 								    """Adapter for portala/eUI-based parliament documentation systems.
 								    Used by parliaments running the proprietary "esearch" / portala framework
 								    (originally developed for STAR/StarFinder backends, now wrapped in a
 								    Single-Page App with Template Toolkit on the server side):
 								    - **LSA** (Sachsen-Anhalt) — PADOKA at ``padoka.landtag.sachsen-anhalt.de``
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								      under ``/portal/`` (singular)
 								    - **BE** (Berlin) — PARDOK at ``pardok.parlament-berlin.de`` under
 								      ``/portala/`` (with the trailing 'a')
 								    Both instances share the same JSON action schema, only the base URL,
 								    the data source ID, the application path prefix and a few minor
 								    quirks differ — those are constructor parameters so that the same
 								    class can serve both states (and any future portala-based parliament).
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
 								    The search workflow is two-stage:
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated Fraktion lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								    1. ``POST {base}{path}/browse.tt.json`` with a complex JSON ``action``
 								       body that contains an Elasticsearch-style query tree under
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, Fraktionen, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								       ``search.json``. The server returns a ``report_id`` plus hit count.
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated Fraktion lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								    2. ``POST {base}{path}/report.tt.html`` with ``{report_id, start,
 								       chunksize}`` to fetch the HTML hit list. Each hit carries a Perl
 								       Data::Dumper block in a ``<pre>`` tag with the canonical metadata.
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, Fraktionen, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
 								    The query body schema was reverse-engineered from
 								    https://github.com/okfde/dokukratie/blob/main/dokukratie/scrapers/portala.query.json
 								    (GPL-3.0 — only structure/selectors are reused, not Python code).
 								    Full-text search is **not** implemented in the MVP: the adapter
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated Fraktion lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								    returns documents of the current Wahlperiode in the given date
 								    window, and the search query is applied as a client-side
 								    title/Urheber filter. The server-side full-text path requires
 								    state-specific ``sf`` index names that are not yet known.
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, Fraktionen, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								    """
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated Fraktion lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								    def __init__(
 								        self,
 								        *,
 								        bundesland: str,
 								        name: str,
 								        base_url: str,
 								        db_id: str,
 								        wahlperiode: int,
 								        portala_path: str = "/portal",
 								        document_type: Optional[str] = "Antrag",
 								        pdf_url_prefix: str = "/files/",
 								        date_window_days: int = 730,
-												Activate Brandenburg + Rheinland-Pfalz via PortalaAdapter reuse (#27, #30, Phase 2)

Riesige Überraschung aus dem BB-HAR-Trace: Brandenburg ist NICHT
StarWeb wie in dokukratie und bundeslaender.py klassifiziert,
sondern läuft auch auf dem portala/eUI-Backend. Endpoint
/portal/browse.tt.json mit db_id=lbb.lissh. Das alte
/starweb/LBB/ELVIS/-Frontend ist nur Legacy.

Folgeprobing offenbarte: RP/opal.rlp.de läuft ebenfalls portala
(db_id=rlp.lissh, 46759 hits in WP18), ebenso NI/HE/BB. Damit ist
Phase 2 großteils KEIN StarWeb-Adapter-Bau, sondern PortalaAdapter-
Wiederverwendung mit konfigurierbaren Parametern.

Activated via Registry-Einträge:

- "BB" → PortalaAdapter(base_url=parlamentsdokumentation.brandenburg.de,
  db_id=lbb.lissh, wahlperiode=8). Nutzt die BE-Card-Variante des
  Hit-Parsers (efxRecordRepeater).
- "RP" → PortalaAdapter(base_url=opal.rlp.de, db_id=rlp.lissh,
  wahlperiode=18). NICHT mit dem NRW OPAL verwechseln — anderer
  Markenname, andere Engine.

PortalaAdapter erweitert um zwei neue Konstruktor-Parameter mit
backward-kompatiblen Defaults:

- typ_filter: Optional[str] = "DOKDBE"
  Wenn None, wird die TYP=<value>-Klausel weggelassen. Manche
  Instanzen (HE/hlt.lis) lehnen DOKDBE ab.

- omit_date_filter: bool = False
  Wenn True, wird der DAT/DDAT/SDAT-Term weggelassen. HE
  und ähnliche Instanzen haben andere Date-Field-Namen.

Plus _parse_hit_list_cards Date-Regex erweitert: zusätzlich zum
"vom DD.MM.YYYY"-Pattern (BE) jetzt auch "DD.MM.YYYY"-plain
(BB schreibt Datum vor Drucksachen-Nummer ohne "vom"-Marker).

Smoke-Test (lokal):
  BB q="":       5 hits in 5.9s
  BB q="Schule": 5 hits (Pflegeschulen, Genderverbot, Hochschulen)
  RP q="":       5 hits in 4.1s (Entlastung, Bildungschancen)
  RP q="Schule": 5 hits (Hochschulbau, G9-Gymnasien, Leistungsgerechtigkeit)

bundeslaender.py: BB.doku_system "StarWeb"→"portala", RP analog,
beide aktiv=True. Anmerkungen mit dem portala-Verweis und der
Klarstellung "OPAL/RLP ≠ NRW OPAL" erweitert.

NICHT in diesem Commit:
- HE: portala-Backend (hlt.lis) ist erreichbar, aber das HE-Card-
  Layout ist anders (Title direkt im <h3> statt <h3><span>, kein
  <span class="h6"> für Meta) — eigener Parser-Pfad nötig, deferred.
- NI: nilas.niedersachsen.de/portal/ ist eine Login-Page, das
  öffentliche Backend ist nicht zugänglich — deferred.
- HB: kein /portal/-Endpoint, bleibt das alte StarWeb-Servlet —
  braucht eigenen HAR-Trace, deferred.
- BB als StarWeb-Template (#27) ist hinfällig, weil BB portala ist.

Phase 2 (3/6) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 00:59:28 +02:00
+								        typ_filter: Optional[str] = "DOKDBE",
 								        omit_date_filter: bool = False,
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated Fraktion lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								    ) -> None:
 								        """Configure a portala/eUI adapter for one specific parliament.
 								        Args:
 								            bundesland: state code (e.g. ``"LSA"``, ``"BE"``).
 								            name: human-readable adapter label (used in logs/UI).
 								            base_url: ``https://...`` of the portal host without trailing slash.
 								            db_id: data source identifier the eUI server expects in
 								                ``action.sources``, e.g. ``"lsa.lissh"`` or ``"lah.lissh"``.
 								            wahlperiode: current legislative period — fed into the WP
 								                term of the search tree.
 								            portala_path: path prefix where the portala app lives. ``/portal``
 								                for LSA, ``/portala`` for Berlin.
 								            document_type: optional filter applied via ETYPF/DTYPF/DART
 								                terms. ``"Antrag"`` works for LSA; for instances where
 								                the index uses different document_type values (e.g. Berlin),
 								                pass ``None`` to drop the document_type subtree entirely
 								                — the user can still filter client-side by title.
 								            pdf_url_prefix: URL fragment between ``base_url`` and the
 								                relative PDF path returned by the server.
 								            date_window_days: how many days back ``search()`` looks by
 								                default.
-												Activate Brandenburg + Rheinland-Pfalz via PortalaAdapter reuse (#27, #30, Phase 2)

Riesige Überraschung aus dem BB-HAR-Trace: Brandenburg ist NICHT
StarWeb wie in dokukratie und bundeslaender.py klassifiziert,
sondern läuft auch auf dem portala/eUI-Backend. Endpoint
/portal/browse.tt.json mit db_id=lbb.lissh. Das alte
/starweb/LBB/ELVIS/-Frontend ist nur Legacy.

Folgeprobing offenbarte: RP/opal.rlp.de läuft ebenfalls portala
(db_id=rlp.lissh, 46759 hits in WP18), ebenso NI/HE/BB. Damit ist
Phase 2 großteils KEIN StarWeb-Adapter-Bau, sondern PortalaAdapter-
Wiederverwendung mit konfigurierbaren Parametern.

Activated via Registry-Einträge:

- "BB" → PortalaAdapter(base_url=parlamentsdokumentation.brandenburg.de,
  db_id=lbb.lissh, wahlperiode=8). Nutzt die BE-Card-Variante des
  Hit-Parsers (efxRecordRepeater).
- "RP" → PortalaAdapter(base_url=opal.rlp.de, db_id=rlp.lissh,
  wahlperiode=18). NICHT mit dem NRW OPAL verwechseln — anderer
  Markenname, andere Engine.

PortalaAdapter erweitert um zwei neue Konstruktor-Parameter mit
backward-kompatiblen Defaults:

- typ_filter: Optional[str] = "DOKDBE"
  Wenn None, wird die TYP=<value>-Klausel weggelassen. Manche
  Instanzen (HE/hlt.lis) lehnen DOKDBE ab.

- omit_date_filter: bool = False
  Wenn True, wird der DAT/DDAT/SDAT-Term weggelassen. HE
  und ähnliche Instanzen haben andere Date-Field-Namen.

Plus _parse_hit_list_cards Date-Regex erweitert: zusätzlich zum
"vom DD.MM.YYYY"-Pattern (BE) jetzt auch "DD.MM.YYYY"-plain
(BB schreibt Datum vor Drucksachen-Nummer ohne "vom"-Marker).

Smoke-Test (lokal):
  BB q="":       5 hits in 5.9s
  BB q="Schule": 5 hits (Pflegeschulen, Genderverbot, Hochschulen)
  RP q="":       5 hits in 4.1s (Entlastung, Bildungschancen)
  RP q="Schule": 5 hits (Hochschulbau, G9-Gymnasien, Leistungsgerechtigkeit)

bundeslaender.py: BB.doku_system "StarWeb"→"portala", RP analog,
beide aktiv=True. Anmerkungen mit dem portala-Verweis und der
Klarstellung "OPAL/RLP ≠ NRW OPAL" erweitert.

NICHT in diesem Commit:
- HE: portala-Backend (hlt.lis) ist erreichbar, aber das HE-Card-
  Layout ist anders (Title direkt im <h3> statt <h3><span>, kein
  <span class="h6"> für Meta) — eigener Parser-Pfad nötig, deferred.
- NI: nilas.niedersachsen.de/portal/ ist eine Login-Page, das
  öffentliche Backend ist nicht zugänglich — deferred.
- HB: kein /portal/-Endpoint, bleibt das alte StarWeb-Servlet —
  braucht eigenen HAR-Trace, deferred.
- BB als StarWeb-Template (#27) ist hinfällig, weil BB portala ist.

Phase 2 (3/6) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 00:59:28 +02:00
+								            typ_filter: ``TYP=<value>`` term in the parsed string and
 								                JSON tree. ``DOKDBE`` works for LSA/BE/BB/BW (the
 								                lissh-style instances). For Hessen (``hlt.lis``) and
 								                similar instances the value is different or absent —
 								                pass ``None`` to drop the term entirely.
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								        """
 								        self.bundesland = bundesland
 								        self.name = name
 								        self.base_url = base_url.rstrip("/")
 								        self.db_id = db_id
 								        self.wahlperiode = wahlperiode
 								        self.portala_path = "/" + portala_path.strip("/")
 								        self.document_type = document_type
 								        self.pdf_url_prefix = "/" + pdf_url_prefix.strip("/") + "/"
 								        self.date_window_days = date_window_days
-												Activate Brandenburg + Rheinland-Pfalz via PortalaAdapter reuse (#27, #30, Phase 2)

Riesige Überraschung aus dem BB-HAR-Trace: Brandenburg ist NICHT
StarWeb wie in dokukratie und bundeslaender.py klassifiziert,
sondern läuft auch auf dem portala/eUI-Backend. Endpoint
/portal/browse.tt.json mit db_id=lbb.lissh. Das alte
/starweb/LBB/ELVIS/-Frontend ist nur Legacy.

Folgeprobing offenbarte: RP/opal.rlp.de läuft ebenfalls portala
(db_id=rlp.lissh, 46759 hits in WP18), ebenso NI/HE/BB. Damit ist
Phase 2 großteils KEIN StarWeb-Adapter-Bau, sondern PortalaAdapter-
Wiederverwendung mit konfigurierbaren Parametern.

Activated via Registry-Einträge:

- "BB" → PortalaAdapter(base_url=parlamentsdokumentation.brandenburg.de,
  db_id=lbb.lissh, wahlperiode=8). Nutzt die BE-Card-Variante des
  Hit-Parsers (efxRecordRepeater).
- "RP" → PortalaAdapter(base_url=opal.rlp.de, db_id=rlp.lissh,
  wahlperiode=18). NICHT mit dem NRW OPAL verwechseln — anderer
  Markenname, andere Engine.

PortalaAdapter erweitert um zwei neue Konstruktor-Parameter mit
backward-kompatiblen Defaults:

- typ_filter: Optional[str] = "DOKDBE"
  Wenn None, wird die TYP=<value>-Klausel weggelassen. Manche
  Instanzen (HE/hlt.lis) lehnen DOKDBE ab.

- omit_date_filter: bool = False
  Wenn True, wird der DAT/DDAT/SDAT-Term weggelassen. HE
  und ähnliche Instanzen haben andere Date-Field-Namen.

Plus _parse_hit_list_cards Date-Regex erweitert: zusätzlich zum
"vom DD.MM.YYYY"-Pattern (BE) jetzt auch "DD.MM.YYYY"-plain
(BB schreibt Datum vor Drucksachen-Nummer ohne "vom"-Marker).

Smoke-Test (lokal):
  BB q="":       5 hits in 5.9s
  BB q="Schule": 5 hits (Pflegeschulen, Genderverbot, Hochschulen)
  RP q="":       5 hits in 4.1s (Entlastung, Bildungschancen)
  RP q="Schule": 5 hits (Hochschulbau, G9-Gymnasien, Leistungsgerechtigkeit)

bundeslaender.py: BB.doku_system "StarWeb"→"portala", RP analog,
beide aktiv=True. Anmerkungen mit dem portala-Verweis und der
Klarstellung "OPAL/RLP ≠ NRW OPAL" erweitert.

NICHT in diesem Commit:
- HE: portala-Backend (hlt.lis) ist erreichbar, aber das HE-Card-
  Layout ist anders (Title direkt im <h3> statt <h3><span>, kein
  <span class="h6"> für Meta) — eigener Parser-Pfad nötig, deferred.
- NI: nilas.niedersachsen.de/portal/ ist eine Login-Page, das
  öffentliche Backend ist nicht zugänglich — deferred.
- HB: kein /portal/-Endpoint, bleibt das alte StarWeb-Servlet —
  braucht eigenen HAR-Trace, deferred.
- BB als StarWeb-Template (#27) ist hinfällig, weil BB portala ist.

Phase 2 (3/6) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 00:59:28 +02:00
+								        self.typ_filter = typ_filter
 								        self.omit_date_filter = omit_date_filter
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
 								    # ── LSA-style hit list (Perl Data::Dumper inside <pre> blocks) ──
 								    # Reverse-engineered "WEV*" record fields:
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								    # WEV06.main = title
 								    # WEV32.5    = relative PDF path
 								    # WEV32.main = "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b> ..."
 								    #
 								    # Group 1 = the quoted 'main' value at the start of the WEV06 list.
 								    # The capture stops at the first " or ' character, so a title that
 								    # itself contains a quote is only captured up to that quote.
 								    _RE_TITLE = re.compile(r"'WEV06'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
 								    # Group 1 = single-quoted value of key '5' that ends in ".pdf".
 								    _RE_PDF = re.compile(r"'5'\s*=>\s*'([^']*\.pdf)'")
 								    # Group 1 = "<digits>/<digits>" inside the <b> tag after "Drucksache".
 								    _RE_DRUCKSACHE = re.compile(r"Drucksache\s*<b>(\d+/\d+)</b>")
 								    # Group 1 = text between the leading "Antrag" and the date (the
 								    # Urheber), group 2 = the D.M.YYYY date. Only WEV32 'main' values
 								    # that start with "Antrag" match.
 								    _RE_URHEBER_DATUM = re.compile(
 								        r"'WEV32'\s*=>\s*\[\s*\{[^}]*'main'\s*=>\s*[\"']Antrag\s+(.+?)\s+(\d{1,2}\.\d{1,2}\.\d{4})\s+Drucksache",
 								    )
 								    # Group 1 = body of one "<pre>$VAR1 = ...</pre>" block; DOTALL lets
 								    # the lazy match span multiple lines.
 								    _RE_PRE_BLOCK = re.compile(r'<pre>\$VAR1 = (.*?)</pre>', re.DOTALL)
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								    # ── Berlin-style hit list (production HTML cards, no Perl dump) ──
 								    # The whole div for one record:
 								    # Group 1 = everything after the opening record <div> up to (not
 								    # including) the next efxRecordRepeater div, the efxResultsEnd div,
 								    # </main> or end of input. The class attribute must come before
 								    # data-efx-rec inside the opening tag for the pattern to match.
 								    _RE_BE_RECORD = re.compile(
 								        r'<div[^>]*class="[^"]*efxRecordRepeater[^"]*"[^>]*data-efx-rec="[^"]*"[^>]*>(.*?)(?=<div[^>]*efxRecordRepeater|<div[^>]*id="efxResultsEnd"|</main>|$)',
 								        re.DOTALL,
 								    )
 								    # Group 1 = text of the <span> directly inside an <h3> whose class
 								    # attribute starts with "h5".
 								    _RE_BE_TITLE = re.compile(r'<h3[^>]*class="h5[^"]*"[^>]*>\s*<span>([^<]+)</span>')
 								    # Group 1 = href of an <a> tag whose URL ends in ".pdf" (case-sensitive).
 								    _RE_BE_LINK = re.compile(r'<a[^>]*href="([^"]+\.pdf)"[^>]*>')
 								    # The metadata h6 looks like:
 								    #   <span class="h6">Antrag (Eilantrag)  &nbsp;<a ...>Drucksache 19/3104</a>  S. 1 bis 24 vom 31.03.2026</span>
 								    # Group 1 = the Drucksachen number, e.g. "19/3104".
 								    _RE_BE_DRUCKSACHE = re.compile(r'Drucksache\s+(\d+/\d+)')
-												Activate Brandenburg + Rheinland-Pfalz via PortalaAdapter reuse (#27, #30, Phase 2)

Riesige Überraschung aus dem BB-HAR-Trace: Brandenburg ist NICHT
StarWeb wie in dokukratie und bundeslaender.py klassifiziert,
sondern läuft auch auf dem portala/eUI-Backend. Endpoint
/portal/browse.tt.json mit db_id=lbb.lissh. Das alte
/starweb/LBB/ELVIS/-Frontend ist nur Legacy.

Folgeprobing offenbarte: RP/opal.rlp.de läuft ebenfalls portala
(db_id=rlp.lissh, 46759 hits in WP18), ebenso NI/HE/BB. Damit ist
Phase 2 großteils KEIN StarWeb-Adapter-Bau, sondern PortalaAdapter-
Wiederverwendung mit konfigurierbaren Parametern.

Activated via Registry-Einträge:

- "BB" → PortalaAdapter(base_url=parlamentsdokumentation.brandenburg.de,
  db_id=lbb.lissh, wahlperiode=8). Nutzt die BE-Card-Variante des
  Hit-Parsers (efxRecordRepeater).
- "RP" → PortalaAdapter(base_url=opal.rlp.de, db_id=rlp.lissh,
  wahlperiode=18). NICHT mit dem NRW OPAL verwechseln — anderer
  Markenname, andere Engine.

PortalaAdapter erweitert um zwei neue Konstruktor-Parameter mit
backward-kompatiblen Defaults:

- typ_filter: Optional[str] = "DOKDBE"
  Wenn None, wird die TYP=<value>-Klausel weggelassen. Manche
  Instanzen (HE/hlt.lis) lehnen DOKDBE ab.

- omit_date_filter: bool = False
  Wenn True, wird der DAT/DDAT/SDAT-Term weggelassen. HE
  und ähnliche Instanzen haben andere Date-Field-Namen.

Plus _parse_hit_list_cards Date-Regex erweitert: zusätzlich zum
"vom DD.MM.YYYY"-Pattern (BE) jetzt auch "DD.MM.YYYY"-plain
(BB schreibt Datum vor Drucksachen-Nummer ohne "vom"-Marker).

Smoke-Test (lokal):
  BB q="":       5 hits in 5.9s
  BB q="Schule": 5 hits (Pflegeschulen, Genderverbot, Hochschulen)
  RP q="":       5 hits in 4.1s (Entlastung, Bildungschancen)
  RP q="Schule": 5 hits (Hochschulbau, G9-Gymnasien, Leistungsgerechtigkeit)

bundeslaender.py: BB.doku_system "StarWeb"→"portala", RP analog,
beide aktiv=True. Anmerkungen mit dem portala-Verweis und der
Klarstellung "OPAL/RLP ≠ NRW OPAL" erweitert.

NICHT in diesem Commit:
- HE: portala-Backend (hlt.lis) ist erreichbar, aber das HE-Card-
  Layout ist anders (Title direkt im <h3> statt <h3><span>, kein
  <span class="h6"> für Meta) — eigener Parser-Pfad nötig, deferred.
- NI: nilas.niedersachsen.de/portal/ ist eine Login-Page, das
  öffentliche Backend ist nicht zugänglich — deferred.
- HB: kein /portal/-Endpoint, bleibt das alte StarWeb-Servlet —
  braucht eigenen HAR-Trace, deferred.
- BB als StarWeb-Template (#27) ist hinfällig, weil BB portala ist.

Phase 2 (3/6) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 00:59:28 +02:00
+								    # BE has "Drucksache 19/3104 S. 1 bis 24 vom 31.03.2026" — date is
 								    # marked by ``vom``. BB has the BE card format too but writes the
 								    # date BEFORE the Drucksachen-Nummer with no marker:
 								    # "Antrag Reinhard Simon (BSW) 17.10.2024 Drucksache 8/2 (1 S.)".
 								    # Try ``vom``-prefix first; fall back to the first plain date.
 								    # Both patterns capture the D.M.YYYY date string as group 1; the
 								    # plain variant matches any such date, with or without ``vom``.
 								    _RE_BE_DATUM_VOM = re.compile(r'vom\s+(\d{1,2}\.\d{1,2}\.\d{4})')
 								    _RE_BE_DATUM_PLAIN = re.compile(r'(\d{1,2}\.\d{1,2}\.\d{4})')
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								    _RE_BE_DOCTYPE = re.compile(r'<span class="h6">\s*([^<&]+?)(?:&nbsp;|<)')  # group 1: text right after <span class="h6"> up to the first "&nbsp;" or "<"; no match if another "&" entity comes first
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								    @staticmethod
 								    def _decode_perl_hex(s: str) -> str:
 								        """Decode \\x{abcd} escape sequences from Perl Data::Dumper output."""
 								        return re.sub(r'\\x\{([0-9a-f]+)\}', lambda m: chr(int(m.group(1), 16)), s)
-												Phase B: Parteinamen-Mapper #55 (Roadmap #59)

Zentrale `app/parteien.py` als Single Source of Truth für die Partei-
Auflösung:

- `PARTEIEN`-Tabelle mit kanonischem Key, langem Display-Namen, allen
  bekannten Aliasen, optionalem `bundesland_scope` und Government-
  Marker. 14 Einträge (CDU, CSU, SPD, GRÜNE, FDP, LINKE, AfD, BSW, SSW,
  BiW + die Freie-Wähler-Familie BVB-FW, FW-BAYERN, FW-SL und der
  generische FREIE WÄHLER-Eintrag).
- `normalize_partei(raw, *, bundesland=None)` für Single-String-Lookups
  mit Government-Vorrang und FW-Familien-Disambiguierung
- `extract_fraktionen(text, *, bundesland=None)` als Funnel für die
  vier alten Adapter-Helper. Kommagetrennte Listen, MdL-mit-Klammer-
  partei, HTML-Reste — alles fließt durch eine Stelle, mit BL-Scope-
  Filter (SSW nur in SH, BVB-FW nur in BB, etc.).
- `display_name(canonical, *, long=False)` für UI/PDF — kurze Form
  bleibt der kanonische Key, lange Form ist "BÜNDNIS 90/DIE GRÜNEN"
  statt "GRÜNE" etc.

Adapter-Migration in `app/parlamente.py`:

- Vier nahezu identische `_normalize_fraktion()`-Methoden in
  PortalaAdapter, ParLDokAdapter, StarFinderCGIAdapter, PARLISAdapter
  durch einen einzeiligen Shim ersetzt, der `extract_fraktionen` mit
  `self.bundesland` aufruft. ~120 Zeilen Duplikation entfernt.
- `@staticmethod` aufgehoben, weil wir jetzt `self.bundesland` brauchen
  für die FW-Disambiguierung — alle Aufrufer waren bereits `self._...`,
  also keine Call-Site-Änderung nötig.

`app/embeddings.py:496` Workaround-Hack entfernt:

- `partei.upper() if partei != "GRÜNE" else "GRÜNE"` durch zentralen
  `normalize_partei()`-Aufruf ersetzt — der Hack war ein Anzeichen
  dafür, dass die Partei-Schreibweise irgendwo zwischen Adapter und
  Embedding-Lookup driften konnte. Mit dem Mapper ist die Schreibweise
  überall garantiert kanonisch.

Tests:

- Neue `tests/test_parteien.py` mit 52 Cases — Single-Lookup, FW-
  Disambiguierung (BVB/Bayern/Saarland/RP), Volltext-Extraktion,
  Government-Marker, Tabellen-Konsistenz
- `tests/test_parlamente.py` Test-Klasse umgeschrieben: statt der 6
  statischen `PortalaAdapter._normalize_fraktion(...)`-Tests jetzt 4
  Roundtrip-Tests über echte Adapter-Instanzen, inkl. expliziter
  BB→BVB-FW vs. RP→FREIE WÄHLER-Verifikation

157 Unit-Tests grün (105 alt + 52 neu). Backwards-kompatibel — die
kanonischen Keys sind exakt die in der DB stehenden Strings, kein
Migrations-Schritt nötig.

Refs: #55, #59 (Phase B)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 11:22:13 +02:00
+								    def _normalize_fraktion(self, urheber: str) -> list[str]:
 								        """Thin shim — die ganze Regex-Logik lebt jetzt zentral in
 								        ``app.parteien.extract_fraktionen`` (siehe #55). ``self.bundesland``
 								        wird mitgegeben, damit FW-Familien-Aliase korrekt disambiguiert
 								        werden.
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								        """
-												Phase B: Parteinamen-Mapper #55 (Roadmap #59)

Zentrale `app/parteien.py` als Single Source of Truth für die Partei-
Auflösung:

- `PARTEIEN`-Tabelle mit kanonischem Key, langem Display-Namen, allen
  bekannten Aliasen, optionalem `bundesland_scope` und Government-
  Marker. 14 Einträge (CDU, CSU, SPD, GRÜNE, FDP, LINKE, AfD, BSW, SSW,
  BiW + die Freie-Wähler-Familie BVB-FW, FW-BAYERN, FW-SL und der
  generische FREIE WÄHLER-Eintrag).
- `normalize_partei(raw, *, bundesland=None)` für Single-String-Lookups
  mit Government-Vorrang und FW-Familien-Disambiguierung
- `extract_fraktionen(text, *, bundesland=None)` als Funnel für die
  vier alten Adapter-Helper. Kommagetrennte Listen, MdL-mit-Klammer-
  partei, HTML-Reste — alles fließt durch eine Stelle, mit BL-Scope-
  Filter (SSW nur in SH, BVB-FW nur in BB, etc.).
- `display_name(canonical, *, long=False)` für UI/PDF — kurze Form
  bleibt der kanonische Key, lange Form ist "BÜNDNIS 90/DIE GRÜNEN"
  statt "GRÜNE" etc.

Adapter-Migration in `app/parlamente.py`:

- Vier nahezu identische `_normalize_fraktion()`-Methoden in
  PortalaAdapter, ParLDokAdapter, StarFinderCGIAdapter, PARLISAdapter
  durch einen einzeiligen Shim ersetzt, der `extract_fraktionen` mit
  `self.bundesland` aufruft. ~120 Zeilen Duplikation entfernt.
- `@staticmethod` aufgehoben, weil wir jetzt `self.bundesland` brauchen
  für die FW-Disambiguierung — alle Aufrufer waren bereits `self._...`,
  also keine Call-Site-Änderung nötig.

`app/embeddings.py:496` Workaround-Hack entfernt:

- `partei.upper() if partei != "GRÜNE" else "GRÜNE"` durch zentralen
  `normalize_partei()`-Aufruf ersetzt — der Hack war ein Symptom
  dafür, dass die Partei-Schreibweise irgendwo zwischen Adapter und
  Embedding-Lookup driften konnte. Mit dem Mapper ist die Schreibweise
  überall garantiert kanonisch.

Tests:

- Neue `tests/test_parteien.py` mit 52 Cases — Single-Lookup, FW-
  Disambiguierung (BVB/Bayern/Saarland/RP), Volltext-Extraktion,
  Government-Marker, Tabellen-Konsistenz
- `tests/test_parlamente.py` Test-Klasse umgeschrieben: statt der 6
  statischen `PortalaAdapter._normalize_fraktion(...)`-Tests jetzt 4
  Roundtrip-Tests über echte Adapter-Instanzen, inkl. expliziter
  BB→BVB-FW vs. RP→FREIE WÄHLER-Verifikation

157 Unit-Tests grün (105 alt + 52 neu). Backwards-kompatibel — die
kanonischen Keys sind exakt die in der DB stehenden Strings, kein
Migrations-Schritt nötig.

Refs: #55, #59 (Phase B)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 11:22:13 +02:00
+								        from .parteien import extract_fraktionen
 								        return extract_fraktionen(urheber, bundesland=self.bundesland)
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
 								    def _build_search_body(
 								        self,
 								        wahlperiode: int,
 								        start_date: str,
 								        end_date: str,
 								    ) -> dict:
 								        """Build the action JSON body for browse.tt.json.
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								        The schema is taken from dokukratie's portala.query.json template
 								        and only differs in the data source and the variable substitutions.
 								        When ``self.document_type`` is None, the ETYPF/DTYPF/DART subtree
 								        is dropped — useful for parliaments whose ETYPF index uses
 								        different value strings than ``"Antrag"``.
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								        """
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								        document_type = self.document_type
 								        date_range_text = f"{start_date} THRU {end_date}"
 								        date_term = lambda sf, num: {  # noqa: E731 — local helper
 								            "tn": "trange", "sf": sf, "op": "eq", "num": num,
 								            "idx": 119, "l": 3,
 								            "p1": start_date, "t1": start_date,
 								            "p2": end_date, "t2": end_date,
 								            "t": date_range_text,
 								        }
 								        # Build the search.lines (form-state mirror) and the json tree
 								        lines: dict = {
 								            "2": str(wahlperiode),
 								            "10": start_date,
 								            "11": end_date,
 								            "20.1": "alWEBBI",
 								            "20.2": "alWEBBI",
 								            "20.3": "alWEBBI",
 								            "90.1": "AND",
 								            "90.2": "AND",
 								            "90.3": "AND",
 								        }
 								        if document_type is not None:
 								            lines["3"] = document_type
 								            lines["4"] = "D"
 								        # Top-level AND tree
 								        top_terms: list = [
 								            {"tn": "term", "t": str(wahlperiode), "idx": 6, "l": 3,
 								             "sf": "WP", "op": "eq", "num": 5},
 								        ]
 								        if document_type is not None:
 								            top_terms.append({"tn": "or", "num": 3, "terms": [
 								                {"tn": "or", "num": 4, "terms": [
 								                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
 								                     "l": 4, "sf": "ETYPF", "op": "eq", "num": 10},
 								                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
 								                     "l": 4, "sf": "ETYP2F", "op": "eq", "num": 11},
 								                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
 								                     "l": 4, "sf": "DTYPF", "op": "eq", "num": 12},
 								                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
 								                     "l": 4, "sf": "DTYP2F", "op": "eq", "num": 13},
 								                    {"tn": "term", "t": f'"{document_type}"', "idx": 50,
 								                     "l": 4, "sf": "1VTYPF", "op": "eq", "num": 14},
 								                ]},
 								                {"tn": "or", "num": 15, "terms": [
 								                    {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
 								                     "sf": "DART", "op": "eq", "num": 16},
 								                    {"tn": "term", "t": '"D"', "idx": 93, "l": 4,
 								                     "sf": "DARTS", "op": "eq", "num": 17},
 								                ]},
 								            ]})
-												Activate Brandenburg + Rheinland-Pfalz via PortalaAdapter reuse (#27, #30, Phase 2)

Riesige Überraschung aus dem BB-HAR-Trace: Brandenburg ist NICHT
StarWeb wie in dokukratie und bundeslaender.py klassifiziert,
sondern läuft auch auf dem portala/eUI-Backend. Endpoint
/portal/browse.tt.json mit db_id=lbb.lissh. Das alte
/starweb/LBB/ELVIS/-Frontend ist nur Legacy.

Folgeprobing offenbarte: RP/opal.rlp.de läuft ebenfalls auf portala
(db_id=rlp.lissh, 46759 hits in WP18), ebenso NI/HE/BB. Damit ist
Phase 2 großteils KEIN StarWeb-Adapter-Bau, sondern PortalaAdapter-
Wiederverwendung mit konfigurierbaren Parametern.

Activated via Registry-Einträge:

- "BB" → PortalaAdapter(base_url=parlamentsdokumentation.brandenburg.de,
  db_id=lbb.lissh, wahlperiode=8). Nutzt die BE-Card-Variante des
  Hit-Parsers (efxRecordRepeater).
- "RP" → PortalaAdapter(base_url=opal.rlp.de, db_id=rlp.lissh,
  wahlperiode=18). NICHT mit dem NRW OPAL verwechseln — anderer
  Markenname, andere Engine.

PortalaAdapter erweitert um zwei neue Konstruktor-Parameter mit
backward-kompatiblen Defaults:

- typ_filter: Optional[str] = "DOKDBE"
  Wenn None, wird die TYP=<value>-Klausel weggelassen. Manche
  Instanzen (HE/hlt.lis) lehnen DOKDBE ab.

- omit_date_filter: bool = False
  Wenn True, wird der DAT/DDAT/SDAT-Term weggelassen. HE
  und ähnliche Instanzen haben andere Date-Field-Namen.

Plus _parse_hit_list_cards Date-Regex erweitert: zusätzlich zum
"vom DD.MM.YYYY"-Pattern (BE) jetzt auch "DD.MM.YYYY"-plain
(BB schreibt Datum vor Drucksachen-Nummer ohne "vom"-Marker).

Smoke-Test (lokal):
  BB q="":       5 hits in 5.9s
  BB q="Schule": 5 hits (Pflegeschulen, Genderverbot, Hochschulen)
  RP q="":       5 hits in 4.1s (Entlastung, Bildungschancen)
  RP q="Schule": 5 hits (Hochschulbau, G9-Gymnasien, Leistungsgerechtigkeit)

bundeslaender.py: BB.doku_system "StarWeb"→"portala", RP analog,
beide aktiv=True. Anmerkungen mit dem portala-Verweis und der
Klarstellung "OPAL/RLP ≠ NRW OPAL" erweitert.

NICHT in diesem Commit:
- HE: portala-Backend (hlt.lis) ist erreichbar, aber das HE-Card-
  Layout ist anders (Title direkt im <h3> statt <h3><span>, kein
  <span class="h6"> für Meta) — eigener Parser-Pfad nötig, deferred.
- NI: nilas.niedersachsen.de/portal/ ist eine Login-Page, das
  öffentliche Backend ist nicht zugänglich — deferred.
- HB: kein /portal/-Endpoint, bleibt das alte StarWeb-Servlet —
  braucht eigenen HAR-Trace, deferred.
- BB als StarWeb-Template (#27) ist hinfällig, weil BB portala ist.

Phase 2 (3/6) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 00:59:28 +02:00
+								        if not self.omit_date_filter:
 								            top_terms.append({"tn": "or", "num": 18, "terms": [
 								                {"tn": "or", "num": 19, "terms": [
 								                    date_term("DAT", 20),
 								                    date_term("DDAT", 21),
 								                ]},
 								                date_term("SDAT", 22),
 								            ]})
 								        if self.typ_filter is not None:
 								            top_terms.append({"tn": "term", "t": self.typ_filter, "idx": 156, "l": 1,
 								                              "sf": "TYP", "op": "eq", "num": 23})
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
 								        # Mirror the same shape into the parsed/sref display strings
-												Activate Brandenburg + Rheinland-Pfalz via PortalaAdapter reuse (#27, #30, Phase 2)

Riesige Überraschung aus dem BB-HAR-Trace: Brandenburg ist NICHT
StarWeb wie in dokukratie und bundeslaender.py klassifiziert,
sondern läuft auch auf dem portala/eUI-Backend. Endpoint
/portal/browse.tt.json mit db_id=lbb.lissh. Das alte
/starweb/LBB/ELVIS/-Frontend ist nur Legacy.

Folgeprobing offenbarte: RP/opal.rlp.de läuft ebenfalls auf portala
(db_id=rlp.lissh, 46759 hits in WP18), ebenso NI/HE/BB. Damit ist
Phase 2 großteils KEIN StarWeb-Adapter-Bau, sondern PortalaAdapter-
Wiederverwendung mit konfigurierbaren Parametern.

Activated via Registry-Einträge:

- "BB" → PortalaAdapter(base_url=parlamentsdokumentation.brandenburg.de,
  db_id=lbb.lissh, wahlperiode=8). Nutzt die BE-Card-Variante des
  Hit-Parsers (efxRecordRepeater).
- "RP" → PortalaAdapter(base_url=opal.rlp.de, db_id=rlp.lissh,
  wahlperiode=18). NICHT mit dem NRW OPAL verwechseln — anderer
  Markenname, andere Engine.

PortalaAdapter erweitert um zwei neue Konstruktor-Parameter mit
backward-kompatiblen Defaults:

- typ_filter: Optional[str] = "DOKDBE"
  Wenn None, wird die TYP=<value>-Klausel weggelassen. Manche
  Instanzen (HE/hlt.lis) lehnen DOKDBE ab.

- omit_date_filter: bool = False
  Wenn True, wird der DAT/DDAT/SDAT-Term weggelassen. HE
  und ähnliche Instanzen haben andere Date-Field-Namen.

Plus _parse_hit_list_cards Date-Regex erweitert: zusätzlich zum
"vom DD.MM.YYYY"-Pattern (BE) jetzt auch "DD.MM.YYYY"-plain
(BB schreibt Datum vor Drucksachen-Nummer ohne "vom"-Marker).

Smoke-Test (lokal):
  BB q="":       5 hits in 5.9s
  BB q="Schule": 5 hits (Pflegeschulen, Genderverbot, Hochschulen)
  RP q="":       5 hits in 4.1s (Entlastung, Bildungschancen)
  RP q="Schule": 5 hits (Hochschulbau, G9-Gymnasien, Leistungsgerechtigkeit)

bundeslaender.py: BB.doku_system "StarWeb"→"portala", RP analog,
beide aktiv=True. Anmerkungen mit dem portala-Verweis und der
Klarstellung "OPAL/RLP ≠ NRW OPAL" erweitert.

NICHT in diesem Commit:
- HE: portala-Backend (hlt.lis) ist erreichbar, aber das HE-Card-
  Layout ist anders (Title direkt im <h3> statt <h3><span>, kein
  <span class="h6"> für Meta) — eigener Parser-Pfad nötig, deferred.
- NI: nilas.niedersachsen.de/portal/ ist eine Login-Page, das
  öffentliche Backend ist nicht zugänglich — deferred.
- HB: kein /portal/-Endpoint, bleibt das alte StarWeb-Servlet —
  braucht eigenen HAR-Trace, deferred.
- BB als StarWeb-Template (#27) ist hinfällig, weil BB portala ist.

Phase 2 (3/6) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 00:59:28 +02:00
+								        typ_clause = f" AND TYP={self.typ_filter}" if self.typ_filter is not None else ""
 								        date_clause = (
 								            f" AND (DAT,DDAT,SDAT= {date_range_text})"
 								            if not self.omit_date_filter else ""
 								        )
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								        if document_type is not None:
 								            parsed = (
 								                f"((/WP {wahlperiode}) AND "
 								                f"(/ETYPF,ETYP2F,DTYPF,DTYP2F,1VTYPF (\"{document_type}\")) "
-												Activate Brandenburg + Rheinland-Pfalz via PortalaAdapter reuse (#27, #30, Phase 2)

Riesige Überraschung aus dem BB-HAR-Trace: Brandenburg ist NICHT
StarWeb wie in dokukratie und bundeslaender.py klassifiziert,
sondern läuft auch auf dem portala/eUI-Backend. Endpoint
/portal/browse.tt.json mit db_id=lbb.lissh. Das alte
/starweb/LBB/ELVIS/-Frontend ist nur Legacy.

Folgeprobing offenbarte: RP/opal.rlp.de läuft ebenfalls auf portala
(db_id=rlp.lissh, 46759 hits in WP18), ebenso NI/HE (neben BB). Damit ist
Phase 2 großteils KEIN StarWeb-Adapter-Bau, sondern PortalaAdapter-
Wiederverwendung mit konfigurierbaren Parametern.

Activated via Registry-Einträge:

- "BB" → PortalaAdapter(base_url=parlamentsdokumentation.brandenburg.de,
  db_id=lbb.lissh, wahlperiode=8). Nutzt die BE-Card-Variante des
  Hit-Parsers (efxRecordRepeater).
- "RP" → PortalaAdapter(base_url=opal.rlp.de, db_id=rlp.lissh,
  wahlperiode=18). NICHT mit dem NRW OPAL verwechseln — anderer
  Markenname, andere Engine.

PortalaAdapter erweitert um zwei neue Konstruktor-Parameter mit
backward-kompatiblen Defaults:

- typ_filter: Optional[str] = "DOKDBE"
  Wenn None, wird die TYP=<value>-Klausel weggelassen. Manche
  Instanzen (HE/hlt.lis) lehnen DOKDBE ab.

- omit_date_filter: bool = False
  Wenn True, wird der DAT/DDAT/SDAT-Term weggelassen. HE
  und ähnliche Instanzen haben andere Date-Field-Namen.

Plus _parse_hit_list_cards Date-Regex erweitert: zusätzlich zum
"vom DD.MM.YYYY"-Pattern (BE) jetzt auch "DD.MM.YYYY"-plain
(BB schreibt Datum vor Drucksachen-Nummer ohne "vom"-Marker).

Smoke-Test (lokal):
  BB q="":       5 hits in 5.9s
  BB q="Schule": 5 hits (Pflegeschulen, Genderverbot, Hochschulen)
  RP q="":       5 hits in 4.1s (Entlastung, Bildungschancen)
  RP q="Schule": 5 hits (Hochschulbau, G9-Gymnasien, Leistungsgerechtigkeit)

bundeslaender.py: BB.doku_system "StarWeb"→"portala", RP analog,
beide aktiv=True. Anmerkungen mit dem portala-Verweis und der
Klarstellung "OPAL/RLP ≠ NRW OPAL" erweitert.

NICHT in diesem Commit:
- HE: portala-Backend (hlt.lis) ist erreichbar, aber das HE-Card-
  Layout ist anders (Title direkt im <h3> statt <h3><span>, kein
  <span class="h6"> für Meta) — eigener Parser-Pfad nötig, deferred.
- NI: nilas.niedersachsen.de/portal/ ist eine Login-Page, das
  öffentliche Backend ist nicht zugänglich — deferred.
- HB: kein /portal/-Endpoint, bleibt das alte StarWeb-Servlet —
  braucht eigenen HAR-Trace, deferred.
- BB als StarWeb-Template (#27) ist hinfällig, weil BB portala ist.

Phase 2 (3/6) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 00:59:28 +02:00
+								                f"AND (/DART,DARTS (\"D\")){date_clause}){typ_clause}"
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								            )
 								        else:
-												Activate Brandenburg + Rheinland-Pfalz via PortalaAdapter reuse (#27, #30, Phase 2)

Riesige Überraschung aus dem BB-HAR-Trace: Brandenburg ist NICHT
StarWeb wie in dokukratie und bundeslaender.py klassifiziert,
sondern läuft auch auf dem portala/eUI-Backend. Endpoint
/portal/browse.tt.json mit db_id=lbb.lissh. Das alte
/starweb/LBB/ELVIS/-Frontend ist nur Legacy.

Folgeprobing offenbarte: RP/opal.rlp.de läuft ebenfalls auf portala
(db_id=rlp.lissh, 46759 hits in WP18), ebenso NI/HE (neben BB). Damit ist
Phase 2 großteils KEIN StarWeb-Adapter-Bau, sondern PortalaAdapter-
Wiederverwendung mit konfigurierbaren Parametern.

Activated via Registry-Einträge:

- "BB" → PortalaAdapter(base_url=parlamentsdokumentation.brandenburg.de,
  db_id=lbb.lissh, wahlperiode=8). Nutzt die BE-Card-Variante des
  Hit-Parsers (efxRecordRepeater).
- "RP" → PortalaAdapter(base_url=opal.rlp.de, db_id=rlp.lissh,
  wahlperiode=18). NICHT mit dem NRW OPAL verwechseln — anderer
  Markenname, andere Engine.

PortalaAdapter erweitert um zwei neue Konstruktor-Parameter mit
backward-kompatiblen Defaults:

- typ_filter: Optional[str] = "DOKDBE"
  Wenn None, wird die TYP=<value>-Klausel weggelassen. Manche
  Instanzen (HE/hlt.lis) lehnen DOKDBE ab.

- omit_date_filter: bool = False
  Wenn True, wird der DAT/DDAT/SDAT-Term weggelassen. HE
  und ähnliche Instanzen haben andere Date-Field-Namen.

Plus _parse_hit_list_cards Date-Regex erweitert: zusätzlich zum
"vom DD.MM.YYYY"-Pattern (BE) jetzt auch "DD.MM.YYYY"-plain
(BB schreibt Datum vor Drucksachen-Nummer ohne "vom"-Marker).

Smoke-Test (lokal):
  BB q="":       5 hits in 5.9s
  BB q="Schule": 5 hits (Pflegeschulen, Genderverbot, Hochschulen)
  RP q="":       5 hits in 4.1s (Entlastung, Bildungschancen)
  RP q="Schule": 5 hits (Hochschulbau, G9-Gymnasien, Leistungsgerechtigkeit)

bundeslaender.py: BB.doku_system "StarWeb"→"portala", RP analog,
beide aktiv=True. Anmerkungen mit dem portala-Verweis und der
Klarstellung "OPAL/RLP ≠ NRW OPAL" erweitert.

NICHT in diesem Commit:
- HE: portala-Backend (hlt.lis) ist erreichbar, aber das HE-Card-
  Layout ist anders (Title direkt im <h3> statt <h3><span>, kein
  <span class="h6"> für Meta) — eigener Parser-Pfad nötig, deferred.
- NI: nilas.niedersachsen.de/portal/ ist eine Login-Page, das
  öffentliche Backend ist nicht zugänglich — deferred.
- HB: kein /portal/-Endpoint, bleibt das alte StarWeb-Servlet —
  braucht eigenen HAR-Trace, deferred.
- BB als StarWeb-Template (#27) ist hinfällig, weil BB portala ist.

Phase 2 (3/6) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 00:59:28 +02:00
+								            parsed = f"((/WP {wahlperiode}){date_clause}){typ_clause}"
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the last ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								        return {
 								            "action": "SearchAndDisplay",
 								            "sources": [self.db_id],
 								            "report": {
 								                "rhl": "main",
 								                "rhlmode": "add",
 								                "format": "generic1-full",
 								                "mime": "html",
 								                "sort": "WEVSO1/D WEVSO2 WEVSO3",
 								            },
 								            "search": {
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								                "lines": lines,
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the last ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								                "serverrecordname": "sr_generic1",
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								                "parsed": parsed,
 								                "sref": parsed,
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the last ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								                "json": [{
 								                    "tn": "and",
 								                    "num": 1,
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated Fraktion lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								                    "terms": top_terms,
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, Fraktionen, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								                }],
 								            },
 								            "dataSet": "1",
 								        }
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated Fraktion lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								    @staticmethod
 								    def _datum_de_to_iso(datum_de: str) -> str:
 								        """Convert DD.MM.YYYY → YYYY-MM-DD; return '' for empty input."""
 								        if not datum_de:
 								            return ""
 								        d, m, y = datum_de.split(".")
 								        return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, Fraktionen, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								    def _parse_hit_list_html(self, html: str, query_filter: str = "") -> list[Drucksache]:
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								        """Extract Drucksachen from a report.tt.html response.
 								        Two formats are supported and auto-detected:
 								        - **LSA-style:** the records are embedded as Perl Data::Dumper
 								          dumps inside ``<pre>$VAR1 = …</pre>`` blocks. WEV06 → title,
 								          WEV32 → metadata + PDF path. Used by Sachsen-Anhalt's PADOKA
 								          template.
 								        - **Berlin-style:** standard production HTML cards with
 								          ``efxRecordRepeater`` divs. Title in an ``<h3 class="h5">``,
 								          metadata + PDF link in an ``<span class="h6">``. Used by
 								          Berlin's PARDOK template.
 								        """
 								        if self._RE_PRE_BLOCK.search(html):
 								            return self._parse_hit_list_dump(html, query_filter)
 								        return self._parse_hit_list_cards(html, query_filter)
 								    def _parse_hit_list_dump(self, html: str, query_filter: str) -> list[Drucksache]:
 								        """Parse LSA-style hit lists built from ``<pre>$VAR1 = …</pre>`` Perl dumps.
 								
 								        Every block matched by ``_RE_PRE_BLOCK`` is treated as one record.
 								        Records without a Drucksache number are skipped. A missing title
 								        falls back to ``"Drucksache <nr>"``; a missing PDF path, Urheber or
 								        date yields an empty link, Fraktionen list or datum respectively.
 								
 								        When ``query_filter`` is non-empty, a record is kept only if every
 								        whitespace-separated term occurs (case-insensitively) in the
 								        record's title or Urheber string.
 								        """
 								        # Lower-cased filter terms; empty list means "no filtering".
 								        wanted = query_filter.lower().split() if query_filter else []
 								        hits: list[Drucksache] = []
 								
 								        for block in self._RE_PRE_BLOCK.findall(html):
 								            ds_match = self._RE_DRUCKSACHE.search(block)
 								            if not ds_match:
 								                continue
 								            drucksache = ds_match.group(1)
 								
 								            title_match = self._RE_TITLE.search(block)
 								            if title_match:
 								                title = self._decode_perl_hex(title_match.group(1))
 								            else:
 								                title = f"Drucksache {drucksache}"
 								
 								            pdf_match = self._RE_PDF.search(block)
 								            pdf_rel = pdf_match.group(1) if pdf_match else ""
 								            pdf_url = f"{self.base_url}{self.pdf_url_prefix}{pdf_rel}" if pdf_rel else ""
 								
 								            # One regex yields both the Urheber text and the German date.
 								            meta = self._RE_URHEBER_DATUM.search(block)
 								            if meta:
 								                urheber = self._decode_perl_hex(meta.group(1).strip())
 								                datum_de = meta.group(2)
 								            else:
 								                urheber = ""
 								                datum_de = ""
 								            datum_iso = self._datum_de_to_iso(datum_de)
 								            fraktionen = self._normalize_fraktion(urheber) if urheber else []
 								
 								            if wanted:
 								                haystack = f"{title} {urheber}".lower()
 								                if any(term not in haystack for term in wanted):
 								                    continue
 								
 								            hits.append(
 								                Drucksache(
 								                    drucksache=drucksache,
 								                    title=title,
 								                    fraktionen=fraktionen,
 								                    datum=datum_iso,
 								                    link=pdf_url,
 								                    bundesland=self.bundesland,
 								                    typ="Antrag",
 								                )
 								            )
 								        return hits
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								    def _parse_hit_list_cards(self, html: str, query_filter: str) -> list[Drucksache]:
 								        """Parse Berlin-style ``efxRecordRepeater`` HTML-card records.
 								        Each card contains an ``<h3>`` title, a metadata ``<span class="h6">``
 								        with the document type, the Drucksachen-Nummer, and the date,
 								        plus a direct ``<a href="…pdf">`` link to the PDF on the same host.
 								        """
 								        results: list[Drucksache] = []
 								        # Split the HTML on every record-div opener — easier than balancing
 								        # divs with regex.
 								        chunks = html.split('class="record')
 								        # First chunk is the prelude, skip it
 								        for chunk in chunks[1:]:
 								            # Each chunk now starts at the record class attribute
 								            m_t = self._RE_BE_TITLE.search(chunk)
 								            title = m_t.group(1).strip() if m_t else "Ohne Titel"
 								            m_ds = self._RE_BE_DRUCKSACHE.search(chunk)
 								            if not m_ds:
 								                continue
 								            drucksache = m_ds.group(1)
 								            m_pdf = self._RE_BE_LINK.search(chunk)
 								            pdf_url = ""
 								            if m_pdf:
 								                href = m_pdf.group(1)
 								                if href.startswith("http://") or href.startswith("https://"):
 								                    pdf_url = href
 								                elif href.startswith("/"):
 								                    pdf_url = f"{self.base_url}{href}"
 								                else:
 								                    pdf_url = f"{self.base_url}{self.pdf_url_prefix}{href}"
-												Activate Brandenburg + Rheinland-Pfalz via PortalaAdapter reuse (#27, #30, Phase 2)

Riesige Überraschung aus dem BB-HAR-Trace: Brandenburg ist NICHT
StarWeb wie in dokukratie und bundeslaender.py klassifiziert,
sondern läuft auch auf dem portala/eUI-Backend. Endpoint
/portal/browse.tt.json mit db_id=lbb.lissh. Das alte
/starweb/LBB/ELVIS/-Frontend ist nur Legacy.

Folgeprobing offenbarte: RP/opal.rlp.de läuft ebenfalls portala
(db_id=rlp.lissh, 46759 hits in WP18), ebenso NI/HE/BB. Damit ist
Phase 2 großteils KEIN StarWeb-Adapter-Bau, sondern PortalaAdapter-
Wiederverwendung mit konfigurierbaren Parametern.

Activated via Registry-Einträge:

- "BB" → PortalaAdapter(base_url=parlamentsdokumentation.brandenburg.de,
  db_id=lbb.lissh, wahlperiode=8). Nutzt die BE-Card-Variante des
  Hit-Parsers (efxRecordRepeater).
- "RP" → PortalaAdapter(base_url=opal.rlp.de, db_id=rlp.lissh,
  wahlperiode=18). NICHT mit dem NRW OPAL verwechseln — anderer
  Markenname, andere Engine.

PortalaAdapter erweitert um zwei neue Konstruktor-Parameter mit
backward-kompatiblen Defaults:

- typ_filter: Optional[str] = "DOKDBE"
  Wenn None, wird die TYP=<value>-Klausel weggelassen. Manche
  Instanzen (HE/hlt.lis) lehnen DOKDBE ab.

- omit_date_filter: bool = False
  Wenn True, wird der DAT/DDAT/SDAT-Term weggelassen. HE
  und ähnliche Instanzen haben andere Date-Field-Namen.

Plus _parse_hit_list_cards Date-Regex erweitert: zusätzlich zum
"vom DD.MM.YYYY"-Pattern (BE) jetzt auch "DD.MM.YYYY"-plain
(BB schreibt Datum vor Drucksachen-Nummer ohne "vom"-Marker).

Smoke-Test (lokal):
  BB q="":       5 hits in 5.9s
  BB q="Schule": 5 hits (Pflegeschulen, Genderverbot, Hochschulen)
  RP q="":       5 hits in 4.1s (Entlastung, Bildungschancen)
  RP q="Schule": 5 hits (Hochschulbau, G9-Gymnasien, Leistungsgerechtigkeit)

bundeslaender.py: BB.doku_system "StarWeb"→"portala", RP analog,
beide aktiv=True. Anmerkungen mit dem portala-Verweis und der
Klarstellung "OPAL/RLP ≠ NRW OPAL" erweitert.

NICHT in diesem Commit:
- HE: portala-Backend (hlt.lis) ist erreichbar, aber das HE-Card-
  Layout ist anders (Title direkt im <h3> statt <h3><span>, kein
  <span class="h6"> für Meta) — eigener Parser-Pfad nötig, deferred.
- NI: nilas.niedersachsen.de/portal/ ist eine Login-Page, das
  öffentliche Backend ist nicht zugänglich — deferred.
- HB: kein /portal/-Endpoint, bleibt das alte StarWeb-Servlet —
  braucht eigenen HAR-Trace, deferred.
- BB als StarWeb-Template (#27) ist hinfällig, weil BB portala ist.

Phase 2 (3/6) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 00:59:28 +02:00
+								            m_dat = self._RE_BE_DATUM_VOM.search(chunk) or self._RE_BE_DATUM_PLAIN.search(chunk)
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								            datum_iso = self._datum_de_to_iso(m_dat.group(1) if m_dat else "")
 								            m_doc = self._RE_BE_DOCTYPE.search(chunk)
 								            doctype_full = m_doc.group(1).strip() if m_doc else "Drucksache"
 								            # Berlin often packs the originator(s) into the same h6 line:
 								            #   "Antrag  CDU, SPD" → fraktionen = ["CDU","SPD"], typ = "Antrag"
 								            # Senat-Vorlagen carry no fraction, only "Vorlage zur …".
 								            fraktionen = self._normalize_fraktion(doctype_full)
 								            # Strip the fraction names back out of the typ string so the UI
 								            # shows a clean "Antrag" / "Vorlage …" label.
 								            typ = doctype_full
 								            if fraktionen:
 								                # Cut at the first occurrence of any party name
 								                cuts = [typ.upper().find(f.upper()) for f in fraktionen]
 								                cuts = [c for c in cuts if c >= 0]
 								                if cuts:
 								                    typ = typ[: min(cuts)].rstrip(" ,")
 								            doc = Drucksache(
 								                drucksache=drucksache,
 								                title=title,
 								                fraktionen=fraktionen,
 								                datum=datum_iso,
 								                link=pdf_url,
 								                bundesland=self.bundesland,
 								                typ=typ,
 								            )
 								            if query_filter:
-												Fix NameError in PortalaAdapter card parser

_parse_hit_list_cards referenced an undefined `doctype` instead of
`doctype_full` on the query-filter path. The surrounding try/except in
search() swallowed the exception, so Berlin queries silently returned
0 hits whenever a search term was given.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 07:50:44 +02:00
+								                hay = f"{title} {doctype_full}".lower()
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								                if not all(t in hay for t in query_filter.lower().split()):
 								                    continue
 								            results.append(doc)
 								        return results
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filtering is "good enough" for
  the ~24-month query window of Anträge (≈ a few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								        """Search recent documents of the current Wahlperiode.
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filtering is "good enough" for
  the ~24-month query window of Anträge (≈ a few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
 								        ``query`` is applied as a client-side title/Urheber filter; the
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								        server-side query covers the configured ``date_window_days``
 								        (default 24 months).
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filtering is "good enough" for
  the ~24-month query window of Anträge (≈ a few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								        """
 								        from datetime import date, timedelta
 								        end = date.today()
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated Fraktion lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								        start = end - timedelta(days=self.date_window_days)
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, Fraktionen, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								        body = self._build_search_body(
 								            wahlperiode=self.wahlperiode,
 								            start_date=start.isoformat(),
 								            end_date=end.isoformat(),
 								        )
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated Fraktion lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								        browse_html = f"{self.base_url}{self.portala_path}/browse.tt.html"
 								        browse_json = f"{self.base_url}{self.portala_path}/browse.tt.json"
 								        report_html = f"{self.base_url}{self.portala_path}/report.tt.html"
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, Fraktionen, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								        async with httpx.AsyncClient(
-												PortalaAdapter: quick-win bigger window + chunksize for BE/LSA (#13)

Real server-side fulltext search through the eUI sf-Index requires
reverse-engineering the LSA/BE-specific search field (the obvious
candidates VOLL, VOLL.main, WEV62 and bare-term-without-sf all
return zero hits when probed). Without browser DevTools to capture
a real fulltext request, that's a multi-hour project — it stays
open in #13 as a follow-up.

This commit ships the pragmatic interim fix from #11:

- BE date_window_days: 180 → 730
  Berlin had a tight default window because PARDOK has ~10x more
  documents than PADOKA. With the bigger window the client-side
  title/Urheber filter reaches back across most of WP19 instead
  of just the last six months.

- chunksize logic in PortalaAdapter.search() inverted from
  "small when query, big when no query" to the opposite. The
  query-filtered path now pulls up to max(limit*10, 500) records
  per page so the title-filter has enough material; the unfiltered
  browse path stays at max(limit*2, 100).

- httpx timeout 30s → 60s. LSA's report.tt.html occasionally
  takes 30+s on cold start; warm requests are <10s.

Smoke test (local):
  BE  Schule: 15 hits (was 0)
  LSA Schule: 14 hits (was N/A; same path)

Live verification follows after deploy.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 13:58:34 +02:00
+								            # Bumped from 30s for #13 quick-win: chunksize=500 against the
 								            # LSA report.tt.html endpoint occasionally takes 30+ seconds.
 								            timeout=60,
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, Fraktionen, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								            follow_redirects=True,
 								            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
 								        ) as client:
 								            try:
 								                # Step 1: warm up cookies via the browse page
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated Fraktion lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								                await client.get(browse_html)
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, Fraktionen, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
 								                # Step 2: submit the search action
 								                resp = await client.post(
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated Fraktion lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								                    browse_json,
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, Fraktionen, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								                    json=body,
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW Landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								                    headers={"Referer": browse_html},
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filtering is "good enough" for
  the ~24-month search window of Anträge (≈ a few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								                )
 								                if resp.status_code != 200:
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParlDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								                    logger.error("%s search HTTP %s", self.bundesland, resp.status_code)
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filtering is "good enough" for
  the ~24-month search window of Anträge (≈ a few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								                    return []
 								                data = resp.json()
 								                report_id = data.get("report_id")
 								                if not report_id:
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParlDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								                    logger.error("%s: no report_id in response: %s", self.bundesland, data)
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filtering is "good enough" for
  the ~24-month search window of Anträge (≈ a few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								                    return []
 								                # Step 3: fetch the HTML hit list
-												PortalaAdapter: chunksize-Floor auf 1500 (#61 Bug 5 follow-up)

Berlin-PARDOK ist von Schriftlichen Anfragen dominiert und liefert ohne
server-side ETYPF-Filter (BE: document_type=None) bei chunksize=100 nur
1-2 Anträge zurück. Damit reicht das Window selbst für limit=20 nicht
aus, um z.B. die A100-Antrag-Drucksache 19/2650 zu finden — und
get_document() liefert None.

Floor bewusst hoch auf 1500 angehoben (vorher 100/500). Bei einem
typischen Verhältnis 1:30 Antrag/Anfrage in BE liefert das ~50 Anträge,
genug für robuste Lookups in den letzten 24 Monaten.

176 Unit-Tests grün.

Refs: #61 Bug 5

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 12:23:35 +02:00
+								                # Take a generous chunk so the client-side type filter
 								                # has enough material to work with. Berlin's PARDOK is
 								                # dominated by "Schriftliche Anfrage" hits, and without a
 								                # server-side ETYPF filter (BE: document_type=None)
 								                # 100 raw hits often contain only 1-2 Anträge. The floor is
 								                # deliberately high. Quick win for #13 + #61 Bug 5.
 								                chunksize = max(limit * 10, 1500)
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filtering is "good enough" for
  the ~24-month search window of Anträge (≈ a few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								                report_resp = await client.post(
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW Landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								                    report_html,
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filtering is "good enough" for
  the ~24-month search window of Anträge (≈ a few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								                    json={"report_id": report_id, "start": 0, "chunksize": chunksize},
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								                    headers={"Referer": browse_html},
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								                )
 								                if report_resp.status_code != 200:
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								                    logger.error("%s report HTTP %s", self.bundesland, report_resp.status_code)
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								                    return []
 								                results = self._parse_hit_list_html(report_resp.text, query_filter=query)
-												Adapter-Bugs aus #61: BB Datum + BB/RP Type-Filter

Drei aus #61 identifizierte Production-Bugs gefixt:

- **Bug 4 (BB Datum)**: BB.wahlperiode_start vom 2024-10-23 (Konstituie-
  rende Sitzung) auf 2024-09-22 (Wahltag) zurückgesetzt. Damit fällt
  die Geschäftsordnungs-Drucksache 8/2 vom 2024-10-17 in den
  Plausibilitäts-Check. Ist auch semantisch sauberer — die WP fängt
  mit der Wahl an, nicht mit der formalen Konstituierung.

- **Bug 2/3 (BB/RP Type-Filter leakt Kleine Anfrage / Beschluss-
  empfehlung)**: Server-side ETYPF/DTYPF-Filter ist best-effort über
  die portala-Instanzen — BB+RP lassen die nicht-Antrag-Typen durch.
  Client-side strict-filter im PortalaAdapter.search() nach Aufruf von
  _parse_hit_list_html: nur Hits, deren typ-String das Substring
  "antrag" enthält, kommen weiter. Substring-Match (nicht exact),
  damit "Antrag gemäß § 79 GO" und ähnliche Subtypen passieren.

176 Unit-Tests grün, Live-Verifikation via Sub-A im Container nach
Deploy.

Refs: #61 (Bug 2, 3, 4)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 11:56:20 +02:00
+								                # Server-side ETYPF/DTYPF filter is best-effort across portala
 								                # instances — BB/RP let "Kleine Anfrage" und "Beschluss-
-												PortalaAdapter: client-side Antrag-Filter immer aktiv (#61 Bug 5)

BE-Adapter hat document_type=None (eigene ETYPF-Werte werden vom
Berliner PARDOK nicht akzeptiert), wodurch der Server alle Doku-Typen
zurückliefert. Das 200-Result-Window wurde damit vollständig von
'Schriftliche Anfrage'-Hits ausgehungert, sodass Anträge wie 19/2650 nie
ans Frontend kamen — und get_document() für genau diese Drucksachen
None lieferte.

Patch: client-side 'antrag'-Substring-Filter läuft jetzt unabhängig
vom Server-Filter (vorher nur wenn document_type gesetzt war). BB/RP
und alle PortalaAdapter-Instanzen profitieren mit.

176 Unit-Tests grün, Live-Verifikation Sub-B im Container nach Deploy.

Refs: #61 Bug 5

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 12:11:20 +02:00
+								                # empfehlung" durch, BE hat sogar `document_type=None`
 								                # (eigene ETYPF-Werte), wodurch "Schriftliche Anfrage" das
 								                # 200-Result-Window aushungern und Anträge wie 19/2650 nie
 								                # zurückkommen. Wir filtern client-side IMMER auf
 								                # "antrag"-Substring im typ — unabhängig davon, ob der
 								                # Server-Filter gesetzt war (siehe #61 Bug 2, 3, 5).
 								                results = [
 								                    d for d in results
 								                    if "antrag" in (d.typ or "").lower()
 								                ]
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								                return results[:limit]
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								            except Exception:
 								                logger.exception("%s search error", self.bundesland)
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								                return []
 								    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
 								        """Look up a single document by its Drucksache number.
 								        Pragmatic MVP: no targeted single-document request is issued.
 								        Instead a broad ``search(query="", limit=200)`` is run and the
 								        hits are matched client-side on ``doc.drucksache``. A targeted
 								        fetch would require a different action.search.json structure
 								        that has not been reverse-engineered yet.
 								        Args:
 								            drucksache: Document number to match exactly against the
 								                ``drucksache`` attribute of each search hit.
 								        Returns:
 								            The first matching hit, or ``None`` when the number is not
 								            among the (at most 200) hits returned by ``search()``.
 								        """
 								        results = await self.search(query="", limit=200)
 								        # First exact match wins; documents outside the search window
 								        # cannot be found this way.
 								        return next(
 								            (doc for doc in results if doc.drucksache == drucksache),
 								            None,
 								        )
 								    async def download_text(self, drucksache: str) -> Optional[str]:
 								        """Download the PDF for a Drucksache and extract its text.
 								        The document is resolved through ``get_document()``; its ``link``
 								        is fetched over HTTP and the text of all pages is concatenated
 								        in page order using PyMuPDF.
 								        Args:
 								            drucksache: Document number passed on to ``get_document()``.
 								        Returns:
 								            The extracted text, or ``None`` when the document is not
 								            found, has no link, the server does not answer with HTTP
 								            200, or any exception occurs (the exception is logged).
 								        """
 								        import fitz  # PyMuPDF
 								        doc = await self.get_document(drucksache)
 								        if not doc or not doc.link:
 								            return None
 								        async with httpx.AsyncClient(
 								            timeout=60,
 								            follow_redirects=True,
 								            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
 								        ) as client:
 								            try:
 								                resp = await client.get(doc.link)
 								                if resp.status_code != 200:
 								                    return None
 								                pdf = fitz.open(stream=resp.content, filetype="pdf")
 								                try:
 								                    # join() avoids repeated string reallocation on
 								                    # long documents.
 								                    return "".join(page.get_text() for page in pdf)
 								                finally:
 								                    # Release the PDF handle even when text extraction
 								                    # fails part-way through.
 								                    pdf.close()
 								            except Exception:
 								                # Best-effort boundary: callers only distinguish
 								                # "text" from "no text", so log and return None.
 								                logger.exception("%s download error for %s", self.bundesland, drucksache)
 								                return None
 								class ParLDokAdapter(ParlamentAdapter):
 								    """Adapter for ParlDok 8.x parliament documentation systems (J3S GmbH).
 								    ParlDok is a proprietary parliament documentation product by J3S GmbH
 								    (https://www.j3s.de). Different from the portala/eUI framework used by
 								    LSA/BE: ParlDok 8.x is a single-page app whose backend is a JSON API
 								    rooted at ``{base_url}{prefix}/Fulltext/...``. The legacy ParLDok 5.x
 								    HTML POST form (``parldok/formalkriterien``) used by dokukratie's MV
 								    YAML scraper has been deprecated by the LandtagMV upgrade to 8.3.5.
 								    Confirmed instances using this engine (April 2026):
 								    - **MV** (Mecklenburg-Vorpommern) — ``dokumentation.landtag-mv.de/parldok``
 								    - HH, SN, TH all advertise ParlDok in dokukratie but their actual
 								      versions/themes have not been verified yet.
 								    Search workflow:
 								    1. ``GET {base_url}{prefix}/`` to obtain the session cookie. The
 								       backend rejects POSTs without it.
 								    2. ``POST {base_url}{prefix}/Fulltext/Search`` with form-encoded
 								       ``data=<json>`` payload. The JSON carries a ``tags`` array of
 								       facet selections; each tag is ``{"type": <facet_type_int>,
 								       "id": <facet_value>}``. Reverse-engineered facet type constants
 								       from the bundle.js (``pd.facet_*``):
 								       - ``facet_fraction = 2``
 								       - ``facet_kind = 7`` (Drucksache, Plenarprotokoll, …)
 								       - ``facet_type = 8`` (Antrag, Gesetzentwurf, Kleine Anfrage, …)
 								       - ``facet_lp = 10`` (Wahlperiode)
 								       Response is JSON ``{success, data: <stringified JSON>}`` where the
 								       inner ``data`` carries ``{count, docs: [{id, title, date,
 								       authorhtml, kind, type, lp, number, link, ...}], ...}``.
 								    3. PDF download: ``GET {base_url}{prefix}/dokument/{numeric_id}``.
 								       Returns ``application/pdf`` directly. The ``link`` field returned
 								       by the search API already contains the path fragment
 								       ``/dokument/<id>#navpanes=0`` — strip the fragment and prepend
 								       the configured ``prefix``.
 								    Drucksachen-Nummer is reconstructed as ``f"{lp}/{number}"`` from the
 								    search hit. Full-text search is *not* implemented in this MVP — the
 								    backend supports it via ``facet_fulltext = 0`` tags but the public
 								    LP-only filter already returns the relevant Antrag pool. ``query``
 								    is applied as a client-side title/Urheber filter.
 								    """
 								    # Reverse-engineered facet type constants from bundle.js (pd.facet_*).
-												ParLDokAdapter: server-side fulltext search via facet_fulltext (#12)

Replaces the client-side title/Urheber substring filter with a
real server-side full-text search through ParlDok's facet_fulltext
tag (type=0). The tag schema is reverse-engineered from
pd.addInput in the live bundle.js:

  {"type": 0,
   "id": <getFulltextId(term)>,    # non-alphanum → "-"
   "fulltext": <raw term>,
   "label": <raw term>,
   "field": "Alle"}                # search all indexed fields

The Resultpage queryid inherits the fulltext filter, so
pagination works without re-sending the tag.

Smoke test (local):
  Schule → 10 hits (was 3)
  Klima  → 10 hits across multiple parties + dates
  Wohnen → 10 hits including older 2025 Anträge

The 10-page (1000-doc) safety bound still applies on top of the
fulltext-filtered result set, but since the server now narrows
to ~2k Schule-related docs WP-wide instead of the 8k+ raw WP
total, the bound is no longer the limiting factor for typical
queries.

Closes #12. BE/LSA equivalent (#13) is independent — eUI
sf-index names still need DevTools tracing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 12:57:34 +02:00
+								    FACET_FULLTEXT = 0
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								    FACET_FRACTION = 2
 								    FACET_KIND = 7
 								    FACET_TYPE = 8
 								    FACET_LP = 10
 								    def __init__(
 								        self,
 								        *,
 								        bundesland: str,
 								        name: str,
 								        base_url: str,
 								        wahlperiode: int,
 								        prefix: str = "/parldok",
 								        document_typ: str = "Antrag",
-												Activate Thüringen via ParLDokAdapter reuse + filter widening (#25, Phase 1)

Thüringen läuft auf parldok.thueringer-landtag.de mit ParlDok 8.3.5
(J3S GmbH) — exakt dieselbe Version wie MV. Aber TH packt seine
Anträge unter zusammengesetzten type-Strings ("Antrag gemäß § 79 GO",
"Antrag gemäß § 74 (2) GO") und kind="Vorlage" statt der MV-Variante
kind="Drucksache"/type="Antrag". Strict-Match auf "Antrag" hat 0
Treffer geliefert.

Lösung: ParLDokAdapter um zwei Konstruktor-Parameter erweitert:
- document_typ_substring=True → Substring-Match auf type-Feld
  ("Antrag" matched "Antrag gemäß § 79 GO", "Alternativantrag" usw.)
- kinds=["Drucksache", "Vorlage"] → erweiterte kind-Liste

Defaults sind backward-kompatibel (Substring-Match aus, kinds nur
Drucksache), sodass MV und HH unverändert weiterlaufen.

_hit_matches_filters() als zentraler Filter-Helper extrahiert.
search() nutzt ihn; get_document() überspringt ihn bewusst, weil
dort beliebige Drucksachen aufrufbar sein müssen, unabhängig vom
search-Time-Filter.

Hostname-Korrektur: parldok.thueringen.de redirected per 303 auf
parldok.thueringer-landtag.de. doku_base_url in bundeslaender.py
auf den neuen Host umgestellt.

Smoke-Test (lokal):
  TH q="":       8 hits in 3.3s
  TH q="Schule": 2 hits in 25.7s (Lernmittelbeschaffung, Modernisierung
                  Bund-Länder-Vereinbarung — beide Schul-bezogen)
  TH q="Klima":  0 hits (keine in den letzten 1000 Drucksachen)

Damit ist Phase 1 (3/3) komplett. Nächstes Phase-2 Issue: #27 BB als
StarWebAdapter-Template.

Phase 1 (3/3) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 23:48:02 +02:00
+								        document_typ_substring: bool = False,
 								        kinds: Optional[list[str]] = None,
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								    ) -> None:
 								        """Configure a ParlDok 8.x adapter for one specific parliament.
 								        Args:
 								            bundesland: state code, e.g. ``"MV"``.
 								            name: human-readable label.
 								            base_url: ``https://...`` host root, no trailing slash.
 								            wahlperiode: current legislative period — fed into the
 								                ``facet_lp`` tag of the search payload.
 								            prefix: app prefix where ParlDok lives. ``/parldok`` for MV.
 								            document_typ: client-side filter on the ``type`` field of
 								                each hit ("Antrag", "Gesetzentwurf", …). Set to empty
 								                string to disable type filtering.
-												Activate Thüringen via ParLDokAdapter reuse + filter widening (#25, Phase 1)

Thüringen läuft auf parldok.thueringer-landtag.de mit ParlDok 8.3.5
(J3S GmbH) — exakt dieselbe Version wie MV. Aber TH packt seine
Anträge unter zusammengesetzten type-Strings ("Antrag gemäß § 79 GO",
"Antrag gemäß § 74 (2) GO") und kind="Vorlage" statt der MV-Variante
kind="Drucksache"/type="Antrag". Strict-Match auf "Antrag" hat 0
Treffer geliefert.

Lösung: ParLDokAdapter um zwei Konstruktor-Parameter erweitert:
- document_typ_substring=True → Substring-Match auf type-Feld
  ("Antrag" matched "Antrag gemäß § 79 GO", "Alternativantrag" usw.)
- kinds=["Drucksache", "Vorlage"] → erweiterte kind-Liste

Defaults sind backward-kompatibel (Substring-Match aus, kinds nur
Drucksache), sodass MV und HH unverändert weiterlaufen.

_hit_matches_filters() als zentraler Filter-Helper extrahiert.
search() nutzt ihn; get_document() überspringt ihn bewusst, weil
dort beliebige Drucksachen aufrufbar sein müssen, unabhängig vom
search-Time-Filter.

Hostname-Korrektur: parldok.thueringen.de redirected per 303 auf
parldok.thueringer-landtag.de. doku_base_url in bundeslaender.py
auf den neuen Host umgestellt.

Smoke-Test (lokal):
  TH q="":       8 hits in 3.3s
  TH q="Schule": 2 hits in 25.7s (Lernmittelbeschaffung, Modernisierung
                  Bund-Länder-Vereinbarung — beide Schul-bezogen)
  TH q="Klima":  0 hits (keine in den letzten 1000 Drucksachen)

Damit ist Phase 1 (3/3) komplett. Nächstes Phase-2 Issue: #27 BB als
StarWebAdapter-Template.

Phase 1 (3/3) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 23:48:02 +02:00
+								            document_typ_substring: if True, ``document_typ`` is matched
 								                as a substring against the hit's ``type`` field instead
 								                of an exact match. Needed for instances where the
 								                Drucksachen-Anträge live under composite type strings
 								                like ``"Antrag gemäß § 79 GO"`` (Thüringen) — strict
 								                ``"Antrag"`` would never match.
 								            kinds: optional list of acceptable ``kind`` values. Defaults
 								                to ``["Drucksache"]`` if None — but TH packs its Anträge
 								                under ``kind="Vorlage"`` so the parameter has to be
 								                widened there.
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								        """
 								        self.bundesland = bundesland
 								        self.name = name
 								        self.base_url = base_url.rstrip("/")
 								        self.prefix = "/" + prefix.strip("/")
 								        self.wahlperiode = wahlperiode
 								        self.document_typ = document_typ
-												Activate Thüringen via ParLDokAdapter reuse + filter widening (#25, Phase 1)

Thüringen läuft auf parldok.thueringer-landtag.de mit ParlDok 8.3.5
(J3S GmbH) — exakt dieselbe Version wie MV. Aber TH packt seine
Anträge unter zusammengesetzten type-Strings ("Antrag gemäß § 79 GO",
"Antrag gemäß § 74 (2) GO") und kind="Vorlage" statt der MV-Variante
kind="Drucksache"/type="Antrag". Strict-Match auf "Antrag" hat 0
Treffer geliefert.

Lösung: ParLDokAdapter um zwei Konstruktor-Parameter erweitert:
- document_typ_substring=True → Substring-Match auf type-Feld
  ("Antrag" matched "Antrag gemäß § 79 GO", "Alternativantrag" usw.)
- kinds=["Drucksache", "Vorlage"] → erweiterte kind-Liste

Defaults sind backward-kompatibel (Substring-Match aus, kinds nur
Drucksache), sodass MV und HH unverändert weiterlaufen.

_hit_matches_filters() als zentraler Filter-Helper extrahiert.
search() nutzt ihn; get_document() überspringt ihn bewusst, weil
dort beliebige Drucksachen aufrufbar sein müssen, unabhängig vom
search-Time-Filter.

Hostname-Korrektur: parldok.thueringen.de redirected per 303 auf
parldok.thueringer-landtag.de. doku_base_url in bundeslaender.py
auf den neuen Host umgestellt.

Smoke-Test (lokal):
  TH q="":       8 hits in 3.3s
  TH q="Schule": 2 hits in 25.7s (Lernmittelbeschaffung, Modernisierung
                  Bund-Länder-Vereinbarung — beide Schul-bezogen)
  TH q="Klima":  0 hits (keine in den letzten 1000 Drucksachen)

Damit ist Phase 1 (3/3) komplett. Nächstes Phase-2 Issue: #27 BB als
StarWebAdapter-Template.

Phase 1 (3/3) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 23:48:02 +02:00
+								        self.document_typ_substring = document_typ_substring
 								        self.kinds = kinds if kinds is not None else ["Drucksache"]
 								    def _hit_matches_filters(self, hit: dict) -> bool:
 								        """Apply the kind/typ filters to a raw hit dict.
 								        Centralised so the search loop can short-circuit cleanly. ``hit``
 								        comes from ``Fulltext/Search`` or ``Fulltext/Resultpage`` JSON
 								        responses; both share the same record schema.
 								        """
 								        if self.kinds and hit.get("kind") not in self.kinds:
 								            return False
 								        hit_type = (hit.get("type") or "").strip()
 								        if self.document_typ:
 								            if self.document_typ_substring:
 								                if self.document_typ not in hit_type:
 								                    return False
 								            else:
 								                if hit_type != self.document_typ:
 								                    return False
 								        return True
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
 								    @staticmethod
 								    def _datum_de_to_iso(datum_de: str) -> str:
 								        """DD.MM.YYYY → YYYY-MM-DD; '' for empty input."""
 								        if not datum_de:
 								            return ""
 								        try:
 								            d, m, y = datum_de.split(".")
 								            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
 								        except ValueError:
 								            return ""
-												Phase B: Parteinamen-Mapper #55 (Roadmap #59)

Zentrale `app/parteien.py` als Single Source of Truth für die Partei-
Auflösung:

- `PARTEIEN`-Tabelle mit kanonischem Key, langem Display-Namen, allen
  bekannten Aliasen, optionalem `bundesland_scope` und Government-
  Marker. 14 Einträge (CDU, CSU, SPD, GRÜNE, FDP, LINKE, AfD, BSW, SSW,
  BiW + die Freie-Wähler-Familie BVB-FW, FW-BAYERN, FW-SL und der
  generische FREIE WÄHLER-Eintrag).
- `normalize_partei(raw, *, bundesland=None)` für Single-String-Lookups
  mit Government-Vorrang und FW-Familien-Disambiguierung
- `extract_fraktionen(text, *, bundesland=None)` als Funnel für die
  vier alten Adapter-Helper. Kommagetrennte Listen, MdL-mit-Klammer-
  partei, HTML-Reste — alles fließt durch eine Stelle, mit BL-Scope-
  Filter (SSW nur in SH, BVB-FW nur in BB, etc.).
- `display_name(canonical, *, long=False)` für UI/PDF — kurze Form
  bleibt der kanonische Key, lange Form ist "BÜNDNIS 90/DIE GRÜNEN"
  statt "GRÜNE" etc.

Adapter-Migration in `app/parlamente.py`:

- Vier nahezu identische `_normalize_fraktion()`-Methoden in
  PortalaAdapter, ParLDokAdapter, StarFinderCGIAdapter, PARLISAdapter
  durch einen einzeiligen Shim ersetzt, der `extract_fraktionen` mit
  `self.bundesland` aufruft. ~120 Zeilen Duplikation entfernt.
- `@staticmethod` aufgehoben, weil wir jetzt `self.bundesland` brauchen
  für die FW-Disambiguierung — alle Aufrufer waren bereits `self._...`,
  also keine Call-Site-Änderung nötig.

`app/embeddings.py:496` Workaround-Hack entfernt:

- `partei.upper() if partei != "GRÜNE" else "GRÜNE"` durch zentralen
  `normalize_partei()`-Aufruf ersetzt — der Hack war ein Kommentarzeichen
  dafür, dass die Partei-Schreibweise irgendwo zwischen Adapter und
  Embedding-Lookup driften konnte. Mit dem Mapper ist die Schreibweise
  überall garantiert kanonisch.

Tests:

- Neue `tests/test_parteien.py` mit 52 Cases — Single-Lookup, FW-
  Disambiguierung (BVB/Bayern/Saarland/RP), Volltext-Extraktion,
  Government-Marker, Tabellen-Konsistenz
- `tests/test_parlamente.py` Test-Klasse umgeschrieben: statt der 6
  statischen `PortalaAdapter._normalize_fraktion(...)`-Tests jetzt 4
  Roundtrip-Tests über echte Adapter-Instanzen, inkl. expliziter
  BB→BVB-FW vs. RP→FREIE WÄHLER-Verifikation

157 Unit-Tests grün (105 alt + 52 neu). Backwards-kompatibel — die
kanonischen Keys sind exakt die in der DB stehenden Strings, kein
Migrations-Schritt nötig.

Refs: #55, #59 (Phase B)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 11:22:13 +02:00
+								    def _normalize_fraktion(self, authorhtml: str) -> list[str]:
 								        """Resolve the raw author text of a hit into a list of party names.
 								        Thin shim (#55): the text is handed unchanged to
 								        ``extract_fraktionen`` from the sibling ``parteien`` module,
 								        together with this adapter's ``bundesland``.
 								        Args:
 								            authorhtml: raw author/originator text of a hit; judging by
 								                the name it may still contain HTML — TODO confirm
 								                against the callers.
 								        Returns:
 								            Whatever ``extract_fraktionen`` returns, annotated here as a
 								            list of strings.
 								        """
 								        # Function-scope import — presumably to avoid an import cycle; confirm.
 								        from .parteien import extract_fraktionen
 								        return extract_fraktionen(authorhtml, bundesland=self.bundesland)
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
-												ParLDokAdapter: server-side fulltext search via facet_fulltext (#12)

Replaces the client-side title/Urheber substring filter with a
real server-side full-text search through ParlDok's facet_fulltext
tag (type=0). The tag schema is reverse-engineered from
pd.addInput in the live bundle.js:

  {"type": 0,
   "id": <getFulltextId(term)>,    # non-alphanum → "-"
   "fulltext": <raw term>,
   "label": <raw term>,
   "field": "Alle"}                # search all indexed fields

The Resultpage queryid inherits the fulltext filter, so
pagination works without re-sending the tag.

Smoke test (local):
  Schule → 10 hits (was 3)
  Klima  → 10 hits across multiple parties + dates
  Wohnen → 10 hits including older 2025 Anträge

The 10-page (1000-doc) safety bound still applies on top of the
fulltext-filtered result set, but since the server now narrows
to ~2k Schule-related docs WP-wide instead of the 8k+ raw WP
total, the bound is no longer the limiting factor for typical
queries.

Closes #12. BE/LSA equivalent (#13) is independent — eUI
sf-index names still need DevTools tracing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 12:57:34 +02:00
+								    @staticmethod
 								    def _fulltext_id(term: str) -> str:
 								        """Sanitize a search term to ParlDok's facet ID format.
 								        Mirrors ``pd.getFulltextId`` from ``bundle.js``: replace every
 								        non-alphanumeric character with ``-``. The server uses this to
 								        deduplicate identical search facets.
 								        """
 								        return re.sub(r"[^a-zA-Z0-9]", "-", term)
 								    def _build_search_body(self, *, length: int = 100, query: str = "") -> dict:
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								        """Build the JSON payload for the initial ``Fulltext/Search`` call.
-												ParLDokAdapter: Volltext (#12) deaktivieren — einheitlich Title-Filter (#18)

Server-side facet_fulltext-Suche aus #12 war asymmetrisch zu BE/LSA
(beide nur Title-Filter über 730d-Window aus #13). User-Entscheidung
2026-04-08: einheitliches Verhalten ist wichtiger als das beste
Verhalten in 2 von 4 Adaptern.

Konkrete Änderungen:

- _build_search_body() schickt query nicht mehr server-side. Der
  query-Parameter bleibt in der Signatur als unused-mit-del, weil
  die Wieder-Aktivierung später ein Drop-in sein soll wenn die
  PortalaAdapter-Variante reverse-engineered wurde.

- _initial_search() und _paginated_hits() ohne query-Parameter.

- search() macht clientseitigen Title+Urheber-Filter wie der
  PortalaAdapter — same Codepfad, einheitliches Verhalten.

- get_document() nutzt die unveränderte Pagination.

- FACET_FULLTEXT-Konstante und _fulltext_id-Helper bleiben im Code
  als Dokumentation für die spätere Re-Aktivierung. Im Docstring
  ist die Tag-Form festgehalten.

Folgen:

- MV "Schule" ist von 20 (mit Volltext) auf 3 zurück (Title-Filter
  über die letzten 1000 Drucksachen). Gleiches Niveau wie BE/LSA
  pre-#13.

- Browse-Mode (no query) ist unverändert: ~10 hits in ~25s, MAX_PAGES=10.

- Wenn das später nicht reicht: #16 (UI-Split DB vs. Landtag) und
  ein optionaler "echter Volltext"-Toggle (#17 closed-as-deferred)
  bleiben als Folge-Optionen.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 19:01:00 +02:00
+								        Filters by Wahlperiode only — type/kind/fulltext filtering all
 								        happen client-side after the hit list is paginated. The
 								        ``query`` parameter is accepted for API compatibility but is
 								        currently NOT forwarded to the server (#18: einheitliche
 								        client-side Title-Suche, kein Server-Volltext, weil das
 								        Verhalten zwischen Adaptern sonst asymmetrisch wird). The
 								        ``FACET_FULLTEXT`` constant and :meth:`_fulltext_id` helper
 								        are kept around as documentation for the previous #12
 								        server-side variant — when fulltext gets uniformly
 								        re-introduced later, the dormant tag is just::
 								            {"type": self.FACET_FULLTEXT,
 								             "id": self._fulltext_id(query),
 								             "fulltext": query, "label": query, "field": "Alle"}
-												ParLDokAdapter: server-side fulltext search via facet_fulltext (#12)

Replaces the client-side title/Urheber substring filter with a
real server-side full-text search through ParlDok's facet_fulltext
tag (type=0). The tag schema is reverse-engineered from
pd.addInput in the live bundle.js:

  {"type": 0,
   "id": <getFulltextId(term)>,    # non-alphanum → "-"
   "fulltext": <raw term>,
   "label": <raw term>,
   "field": "Alle"}                # search all indexed fields

The Resultpage queryid inherits the fulltext filter, so
pagination works without re-sending the tag.

Smoke test (local):
  Schule → 10 hits (was 3)
  Klima  → 10 hits across multiple parties + dates
  Wohnen → 10 hits including older 2025 Anträge

The 10-page (1000-doc) safety bound still applies on top of the
fulltext-filtered result set, but since the server now narrows
to ~2k Schule-related docs WP-wide instead of the 8k+ raw WP
total, the bound is no longer the limiting factor for typical
queries.

Closes #12. BE/LSA equivalent (#13) is independent — eUI
sf-index names still need DevTools tracing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 12:57:34 +02:00
 								        Pagination beyond the first page goes through
 								        ``Fulltext/Resultpage`` — the ``Search`` endpoint itself
 								        ignores any non-zero ``Start``.
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								        """
-												ParLDokAdapter: Volltext (#12) deaktivieren — einheitlich Title-Filter (#18)

Server-side facet_fulltext-Suche aus #12 war asymmetrisch zu BE/LSA
(beide nur Title-Filter über 730d-Window aus #13). User-Entscheidung
2026-04-08: einheitliches Verhalten ist wichtiger als das beste
Verhalten in 2 von 4 Adaptern.

Konkrete Änderungen:

- _build_search_body() schickt query nicht mehr server-side. Der
  query-Parameter bleibt in der Signatur als unused-mit-del, weil
  die Wieder-Aktivierung später ein Drop-in sein soll wenn die
  PortalaAdapter-Variante reverse-engineered wurde.

- _initial_search() und _paginated_hits() ohne query-Parameter.

- search() macht clientseitigen Title+Urheber-Filter wie der
  PortalaAdapter — same Codepfad, einheitliches Verhalten.

- get_document() nutzt die unveränderte Pagination.

- FACET_FULLTEXT-Konstante und _fulltext_id-Helper bleiben im Code
  als Dokumentation für die spätere Re-Aktivierung. Im Docstring
  ist die Tag-Form festgehalten.

Folgen:

- MV "Schule" ist von 20 (mit Volltext) auf 3 zurück (Title-Filter
  über die letzten 1000 Drucksachen). Gleiches Niveau wie BE/LSA
  pre-#13.

- Browse-Mode (no query) ist unverändert: ~10 hits in ~25s, MAX_PAGES=10.

- Wenn das später nicht reicht: #16 (UI-Split DB vs. Landtag) und
  ein optionaler "echter Volltext"-Toggle (#17 closed-as-deferred)
  bleiben als Folge-Optionen.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 19:01:00 +02:00
+								        del query  # explicitly unused — see docstring
-												ParLDokAdapter: server-side fulltext search via facet_fulltext (#12)

Replaces the client-side title/Urheber substring filter with a
real server-side full-text search through ParlDok's facet_fulltext
tag (type=0). The tag schema is reverse-engineered from
pd.addInput in the live bundle.js:

  {"type": 0,
   "id": <getFulltextId(term)>,    # non-alphanum → "-"
   "fulltext": <raw term>,
   "label": <raw term>,
   "field": "Alle"}                # search all indexed fields

The Resultpage queryid inherits the fulltext filter, so
pagination works without re-sending the tag.

Smoke test (local):
  Schule → 10 hits (was 3)
  Klima  → 10 hits across multiple parties + dates
  Wohnen → 10 hits including older 2025 Anträge

The 10-page (1000-doc) safety bound still applies on top of the
fulltext-filtered result set, but since the server now narrows
to ~2k Schule-related docs WP-wide instead of the 8k+ raw WP
total, the bound is no longer the limiting factor for typical
queries.

Closes #12. BE/LSA equivalent (#13) is independent — eUI
sf-index names still need DevTools tracing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 12:57:34 +02:00
+								        tags: list[dict] = [{"type": self.FACET_LP, "id": self.wahlperiode}]
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								        return {
 								            "devicekey": "",
 								            "max": length,
 								            "withfilter": False,
 								            # sort=2 → newest first (date desc); sort=1 is relevance.
 								            "sort": 2,
 								            "topk": length,
 								            "llm": 0,
 								            "newdocsearch": False,
 								            "limit": {"Start": 0, "Length": length},
-												ParLDokAdapter: server-side fulltext search via facet_fulltext (#12)

Replaces the client-side title/Urheber substring filter with a
real server-side full-text search through ParlDok's facet_fulltext
tag (type=0). The tag schema is reverse-engineered from
pd.addInput in the live bundle.js:

  {"type": 0,
   "id": <getFulltextId(term)>,    # non-alphanum → "-"
   "fulltext": <raw term>,
   "label": <raw term>,
   "field": "Alle"}                # search all indexed fields

The Resultpage queryid inherits the fulltext filter, so
pagination works without re-sending the tag.

Smoke test (local):
  Schule → 10 hits (was 3)
  Klima  → 10 hits across multiple parties + dates
  Wohnen → 10 hits including older 2025 Anträge

The 10-page (1000-doc) safety bound still applies on top of the
fulltext-filtered result set, but since the server now narrows
to ~2k Schule-related docs WP-wide instead of the 8k+ raw WP
total, the bound is no longer the limiting factor for typical
queries.

Closes #12. BE/LSA equivalent (#13) is independent — eUI
sf-index names still need DevTools tracing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 12:57:34 +02:00
+								            "tags": tags,
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								            "updateFilters": [],
 								        }
 								    def _hit_to_drucksache(self, hit: dict) -> Optional[Drucksache]:
-												ParLDokAdapter: Skip Hits mit leerem PDF-Link (#61 Bug 1, TH-Pipeline-Blocker)

Live-Verifikation in der Container-DB hat aufgedeckt, dass TH ParlDok
für sehr frische Vorlagen (z.B. 8/1594, datum 2026-03-31, allowed=false)
``link``/``prelink`` als leeren String liefert — das PDF ist noch nicht
zur Veröffentlichung freigegeben.

Bisheriges Verhalten: Adapter konstruierte einen Drucksache-Eintrag mit
``link=''``, der dann durch die Pipeline rutschte und im Frontend als
unklickbarer Eintrag erschien. ``download_text()`` würde später an
``not doc.link`` scheitern, was die Analyse blockt.

Sauberer Skip an der Quelle: ``_hit_to_drucksache`` returnt None, wenn
weder ``link`` noch ``prelink`` einen Pfad liefern. Das ist konsistent
mit den anderen None-Returns für unbrauchbare Hits (kein lp, kein
number).

Lokal verifiziert: 176 Unit-Tests grün. Live-Verifikation gegen
Production folgt nach Deploy via Sub-A-Test im Container.

Refs: #61 (Bug 1: TH leerer Link)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 11:53:16 +02:00
+								        """Convert one ParlDok JSON hit to a Drucksache. None if unusable.
 								        ParlDok markiert frische Vorlagen mit leerem ``link``/``prelink``
 								        wenn das PDF noch nicht freigegeben ist (z.B. TH 8/1594, datum
 								        2026-03-31, ``allowed: false``). Solche Hits sind für unsere
 								        Pipeline wertlos — `download_text` würde an `not doc.link`
 								        scheitern und das Frontend würde einen unklickbaren Eintrag
 								        anzeigen. Sauberer Skip an dieser Stelle. Issue #61, Bug 1.
 								        """
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								        lp = hit.get("lp")
 								        number = hit.get("number")
 								        if not lp or not number:
 								            return None
 								        link_field = hit.get("link") or hit.get("prelink") or ""
-												ParLDokAdapter: Skip Hits mit leerem PDF-Link (#61 Bug 1, TH-Pipeline-Blocker)

Live-Verifikation in der Container-DB hat aufgedeckt, dass TH ParlDok
für sehr frische Vorlagen (z.B. 8/1594, datum 2026-03-31, allowed=false)
``link``/``prelink`` als leeren String liefert — das PDF ist noch nicht
zur Veröffentlichung freigegeben.

Bisheriges Verhalten: Adapter konstruierte einen Drucksache-Eintrag mit
``link=''``, der dann durch die Pipeline rutschte und im Frontend als
unklickbarer Eintrag erschien. ``download_text()`` würde später an
``not doc.link`` scheitern, was die Analyse blockt.

Sauberer Skip an der Quelle: ``_hit_to_drucksache`` returnt None, wenn
weder ``link`` noch ``prelink`` einen Pfad liefern. Das ist konsistent
mit den anderen None-Returns für unbrauchbare Hits (kein lp, kein
number).

Lokal verifiziert: 176 Unit-Tests grün. Live-Verifikation gegen
Production folgt nach Deploy via Sub-A-Test im Container.

Refs: #61 (Bug 1: TH leerer Link)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 11:53:16 +02:00
+								        if not link_field:
 								            return None
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								        # Strip "#navpanes=0" fragment and prepend the prefix.
 								        path = link_field.split("#", 1)[0]
-												ParLDokAdapter: Skip Hits mit leerem PDF-Link (#61 Bug 1, TH-Pipeline-Blocker)

Live-Verifikation in der Container-DB hat aufgedeckt, dass TH ParlDok
für sehr frische Vorlagen (z.B. 8/1594, datum 2026-03-31, allowed=false)
``link``/``prelink`` als leeren String liefert — das PDF ist noch nicht
zur Veröffentlichung freigegeben.

Bisheriges Verhalten: Adapter konstruierte einen Drucksache-Eintrag mit
``link=''``, der dann durch die Pipeline rutschte und im Frontend als
unklickbarer Eintrag erschien. ``download_text()`` würde später an
``not doc.link`` scheitern, was die Analyse blockt.

Sauberer Skip an der Quelle: ``_hit_to_drucksache`` returnt None, wenn
weder ``link`` noch ``prelink`` einen Pfad liefern. Das ist konsistent
mit den anderen None-Returns für unbrauchbare Hits (kein lp, kein
number).

Lokal verifiziert: 176 Unit-Tests grün. Live-Verifikation gegen
Production folgt nach Deploy via Sub-A-Test im Container.

Refs: #61 (Bug 1: TH leerer Link)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 11:53:16 +02:00
+								        pdf_url = f"{self.base_url}{self.prefix}{path}"
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
 								        return Drucksache(
 								            drucksache=f"{lp}/{number}",
 								            title=hit.get("title", ""),
 								            fraktionen=self._normalize_fraktion(hit.get("authorhtml", "")),
 								            datum=self._datum_de_to_iso(hit.get("date", "")),
 								            link=pdf_url,
 								            bundesland=self.bundesland,
 								            typ=hit.get("type", "") or hit.get("kind", ""),
 								        )
 								    async def _post_json(
 								        self, client: httpx.AsyncClient, endpoint: str, payload: dict,
 								    ) -> Optional[dict]:
 								        """POST a JSON-stringified payload to a ParlDok endpoint.

 								        The payload is serialised with ``json.dumps`` and sent as the
 								        form field ``data``. The response is expected to be a JSON
 								        envelope with a truthy ``success`` flag whose ``data`` field
 								        holds a second, stringified JSON document.

 								        Args:
 								            client: HTTP client used to send the request.
 								            endpoint: Path tail below the adapter prefix, e.g.
 								                ``"Fulltext/Search"`` or ``"Fulltext/Resultpage"``.
 								            payload: Request body, serialised with ``ensure_ascii=False``.

 								        Returns:
 								            The decoded inner JSON object, or ``None`` when the server
 								            answers with a non-200 status, the envelope is not marked
 								            successful, the inner document is not a JSON object, or
 								            any exception occurs. Every failure is logged.
 								        """
 								        homepage = f"{self.base_url}{self.prefix}/"
 								        url = f"{self.base_url}{self.prefix}/{endpoint}"
 								        try:
 								            resp = await client.post(
 								                url,
 								                data={"data": json.dumps(payload, ensure_ascii=False)},
 								                headers={
 								                    "X-Requested-With": "XMLHttpRequest",
 								                    "Referer": homepage,
 								                },
 								            )
 								            if resp.status_code != 200:
 								                logger.error(
 								                    "%s %s HTTP %s",
 								                    self.bundesland, endpoint, resp.status_code,
 								                )
 								                return None
 								            outer = resp.json()
 								            if not outer.get("success"):
 								                logger.error(
 								                    "%s %s not successful: %s",
 								                    self.bundesland, endpoint, outer.get("message"),
 								                )
 								                return None
 								            inner = json.loads(outer["data"])
 								            if not isinstance(inner, dict):
 								                # Callers call ``.get`` on the result outside of this
 								                # try block; a list/str/number payload would crash
 								                # them, so treat it as an error like the cases above.
 								                logger.error(
 								                    "%s %s returned unexpected payload type: %s",
 								                    self.bundesland, endpoint, type(inner).__name__,
 								                )
 								                return None
 								            return inner
 								        except Exception:
 								            # Deliberate catch-all at the network boundary: transport
 								            # errors, invalid JSON and a missing ``data`` key all end
 								            # up here and are logged with their traceback.
 								            logger.exception("%s ParlDok %s error", self.bundesland, endpoint)
 								            return None
 								    async def _initial_search(
-												ParLDokAdapter: Volltext (#12) deaktivieren — einheitlich Title-Filter (#18)

Server-side facet_fulltext-Suche aus #12 war asymmetrisch zu BE/LSA
(beide nur Title-Filter über 730d-Window aus #13). User-Entscheidung
2026-04-08: einheitliches Verhalten ist wichtiger als das beste
Verhalten in 2 von 4 Adaptern.

Konkrete Änderungen:

- _build_search_body() schickt query nicht mehr server-side. Der
  query-Parameter bleibt in der Signatur als unused-mit-del, weil
  die Wieder-Aktivierung später ein Drop-in sein soll wenn die
  PortalaAdapter-Variante reverse-engineered wurde.

- _initial_search() und _paginated_hits() ohne query-Parameter.

- search() macht clientseitigen Title+Urheber-Filter wie der
  PortalaAdapter — same Codepfad, einheitliches Verhalten.

- get_document() nutzt die unveränderte Pagination.

- FACET_FULLTEXT-Konstante und _fulltext_id-Helper bleiben im Code
  als Dokumentation für die spätere Re-Aktivierung. Im Docstring
  ist die Tag-Form festgehalten.

Folgen:

- MV "Schule" ist von 20 (mit Volltext) auf 3 zurück (Title-Filter
  über die letzten 1000 Drucksachen). Gleiches Niveau wie BE/LSA
  pre-#13.

- Browse-Mode (no query) ist unverändert: ~10 hits in ~25s, MAX_PAGES=10.

- Wenn das später nicht reicht: #16 (UI-Split DB vs. Landtag) und
  ein optionaler "echter Volltext"-Toggle (#17 closed-as-deferred)
  bleiben als Folge-Optionen.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 19:01:00 +02:00
+								        self, client: httpx.AsyncClient, *, length: int,
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								    ) -> tuple[Optional[int], list[dict]]:
 								        """Run the initial ``Fulltext/Search`` and return ``(queryid, docs)``.
 								        The ``queryid`` is needed for subsequent ``Fulltext/Resultpage``
 								        calls. ParlDok ignores any non-zero ``Start`` on this endpoint —
 								        the first 100 hits are the only ones reachable via ``Search``.
 								        """
-												ParLDokAdapter: Volltext (#12) deaktivieren — einheitlich Title-Filter (#18)

Server-side facet_fulltext-Suche aus #12 war asymmetrisch zu BE/LSA
(beide nur Title-Filter über 730d-Window aus #13). User-Entscheidung
2026-04-08: einheitliches Verhalten ist wichtiger als das beste
Verhalten in 2 von 4 Adaptern.

Konkrete Änderungen:

- _build_search_body() schickt query nicht mehr server-side. Der
  query-Parameter bleibt in der Signatur als unused-mit-del, weil
  die Wieder-Aktivierung später ein Drop-in sein soll wenn die
  PortalaAdapter-Variante reverse-engineered wurde.

- _initial_search() und _paginated_hits() ohne query-Parameter.

- search() macht clientseitigen Title+Urheber-Filter wie der
  PortalaAdapter — same Codepfad, einheitliches Verhalten.

- get_document() nutzt die unveränderte Pagination.

- FACET_FULLTEXT-Konstante und _fulltext_id-Helper bleiben im Code
  als Dokumentation für die spätere Re-Aktivierung. Im Docstring
  ist die Tag-Form festgehalten.

Folgen:

- MV "Schule" ist von 20 (mit Volltext) auf 3 zurück (Title-Filter
  über die letzten 1000 Drucksachen). Gleiches Niveau wie BE/LSA
  pre-#13.

- Browse-Mode (no query) ist unverändert: ~10 hits in ~25s, MAX_PAGES=10.

- Wenn das später nicht reicht: #16 (UI-Split DB vs. Landtag) und
  ein optionaler "echter Volltext"-Toggle (#17 closed-as-deferred)
  bleiben als Folge-Optionen.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 19:01:00 +02:00
+								        body = self._build_search_body(length=length)
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								        inner = await self._post_json(client, "Fulltext/Search", body)
 								        if not inner:
 								            return None, []
 								        return inner.get("queryid"), (inner.get("docs") or [])
 								    async def _result_page(
 								        self, client: httpx.AsyncClient, *, queryid: int, start: int, length: int,
 								    ) -> list[dict]:
 								        """Fetch one follow-up page of hits via ``Fulltext/Resultpage``.

 								        Args:
 								            client: HTTP client handed through to ``_post_json``.
 								            queryid: Query identifier sent as ``queryid``.
 								            start: Offset sent as ``limit.Start``.
 								            length: Page size sent as ``limit.Length``.

 								        Returns:
 								            The ``docs`` list of the response, or an empty list when
 								            the request yielded nothing usable or ``docs`` is
 								            missing/empty.
 								        """
 								        window = {"Start": start, "Length": length}
 								        request_body = {"devicekey": "", "queryid": queryid, "limit": window}
 								        page = await self._post_json(client, "Fulltext/Resultpage", request_body)
 								        # A falsy response and a missing/empty ``docs`` both collapse to [].
 								        docs = page.get("docs") if page else None
 								        return docs if docs else []
 								    def _make_client(self) -> httpx.AsyncClient:
 								        """Create the ``httpx.AsyncClient`` used for ParlDok requests.

 								        The client is configured with ``timeout=30``, follows
 								        redirects and sends a fixed ``User-Agent`` header.
 								        """
 								        default_headers = {"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"}
 								        return httpx.AsyncClient(
 								            headers=default_headers,
 								            follow_redirects=True,
 								            timeout=30,
 								        )
-												ParLDokAdapter: Volltext (#12) deaktivieren — einheitlich Title-Filter (#18)

Server-side facet_fulltext-Suche aus #12 war asymmetrisch zu BE/LSA
(beide nur Title-Filter über 730d-Window aus #13). User-Entscheidung
2026-04-08: einheitliches Verhalten ist wichtiger als das beste
Verhalten in 2 von 4 Adaptern.

Konkrete Änderungen:

- _build_search_body() schickt query nicht mehr server-side. Der
  query-Parameter bleibt in der Signatur als unused-mit-del, weil
  die Wieder-Aktivierung später ein Drop-in sein soll wenn die
  PortalaAdapter-Variante reverse-engineered wurde.

- _initial_search() und _paginated_hits() ohne query-Parameter.

- search() macht clientseitigen Title+Urheber-Filter wie der
  PortalaAdapter — same Codepfad, einheitliches Verhalten.

- get_document() nutzt die unveränderte Pagination.

- FACET_FULLTEXT-Konstante und _fulltext_id-Helper bleiben im Code
  als Dokumentation für die spätere Re-Aktivierung. Im Docstring
  ist die Tag-Form festgehalten.

Folgen:

- MV "Schule" ist von 20 (mit Volltext) auf 3 zurück (Title-Filter
  über die letzten 1000 Drucksachen). Gleiches Niveau wie BE/LSA
  pre-#13.

- Browse-Mode (no query) ist unverändert: ~10 hits in ~25s, MAX_PAGES=10.

- Wenn das später nicht reicht: #16 (UI-Split DB vs. Landtag) und
  ein optionaler "echter Volltext"-Toggle (#17 closed-as-deferred)
  bleiben als Folge-Optionen.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 19:01:00 +02:00
+								    async def _paginated_hits(self, client: httpx.AsyncClient):
 								        """Async iterator over Drucksachen-style hits across pages.
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
 								        Yields raw hit dicts in newest-first order. The first batch comes
 								        from ``Fulltext/Search``, subsequent batches from
 								        ``Fulltext/Resultpage`` using the queryid the server returned for
 								        the initial call. Stops when a page comes back empty, undersized,
-												ParLDokAdapter: Volltext (#12) deaktivieren — einheitlich Title-Filter (#18)

Server-side facet_fulltext-Suche aus #12 war asymmetrisch zu BE/LSA
(beide nur Title-Filter über 730d-Window aus #13). User-Entscheidung
2026-04-08: einheitliches Verhalten ist wichtiger als das beste
Verhalten in 2 von 4 Adaptern.

Konkrete Änderungen:

- _build_search_body() schickt query nicht mehr server-side. Der
  query-Parameter bleibt in der Signatur als unused-mit-del, weil
  die Wieder-Aktivierung später ein Drop-in sein soll wenn die
  PortalaAdapter-Variante reverse-engineered wurde.

- _initial_search() und _paginated_hits() ohne query-Parameter.

- search() macht clientseitigen Title+Urheber-Filter wie der
  PortalaAdapter — same Codepfad, einheitliches Verhalten.

- get_document() nutzt die unveränderte Pagination.

- FACET_FULLTEXT-Konstante und _fulltext_id-Helper bleiben im Code
  als Dokumentation für die spätere Re-Aktivierung. Im Docstring
  ist die Tag-Form festgehalten.

Folgen:

- MV "Schule" ist von 20 (mit Volltext) auf 3 zurück (Title-Filter
  über die letzten 1000 Drucksachen). Gleiches Niveau wie BE/LSA
  pre-#13.

- Browse-Mode (no query) ist unverändert: ~10 hits in ~25s, MAX_PAGES=10.

- Wenn das später nicht reicht: #16 (UI-Split DB vs. Landtag) und
  ein optionaler "echter Volltext"-Toggle (#17 closed-as-deferred)
  bleiben als Folge-Optionen.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 19:01:00 +02:00
+								        or after :attr:`MAX_PAGES` iterations.
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								        """
-												ParLDokAdapter: Volltext (#12) deaktivieren — einheitlich Title-Filter (#18)

Server-side facet_fulltext-Suche aus #12 war asymmetrisch zu BE/LSA
(beide nur Title-Filter über 730d-Window aus #13). User-Entscheidung
2026-04-08: einheitliches Verhalten ist wichtiger als das beste
Verhalten in 2 von 4 Adaptern.

Konkrete Änderungen:

- _build_search_body() schickt query nicht mehr server-side. Der
  query-Parameter bleibt in der Signatur als unused-mit-del, weil
  die Wieder-Aktivierung später ein Drop-in sein soll wenn die
  PortalaAdapter-Variante reverse-engineered wurde.

- _initial_search() und _paginated_hits() ohne query-Parameter.

- search() macht clientseitigen Title+Urheber-Filter wie der
  PortalaAdapter — same Codepfad, einheitliches Verhalten.

- get_document() nutzt die unveränderte Pagination.

- FACET_FULLTEXT-Konstante und _fulltext_id-Helper bleiben im Code
  als Dokumentation für die spätere Re-Aktivierung. Im Docstring
  ist die Tag-Form festgehalten.

Folgen:

- MV "Schule" ist von 20 (mit Volltext) auf 3 zurück (Title-Filter
  über die letzten 1000 Drucksachen). Gleiches Niveau wie BE/LSA
  pre-#13.

- Browse-Mode (no query) ist unverändert: ~10 hits in ~25s, MAX_PAGES=10.

- Wenn das später nicht reicht: #16 (UI-Split DB vs. Landtag) und
  ein optionaler "echter Volltext"-Toggle (#17 closed-as-deferred)
  bleiben als Folge-Optionen.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 19:01:00 +02:00
+								        queryid, hits = await self._initial_search(client, length=self.PAGE_SIZE)
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								        for hit in hits:
 								            yield hit
 								        if not queryid or len(hits) < self.PAGE_SIZE:
 								            return
 								        for page in range(1, self.MAX_PAGES):
 								            page_hits = await self._result_page(
 								                client,
 								                queryid=queryid,
 								                start=page * self.PAGE_SIZE,
 								                length=self.PAGE_SIZE,
 								            )
 								            if not page_hits:
 								                return
 								            for hit in page_hits:
 								                yield hit
 								            if len(page_hits) < self.PAGE_SIZE:
 								                return
 								    # ParlDok 8.x caps Length per request at 100 — paginate if needed.
 								    PAGE_SIZE = 100
 								    # Safety bound: scan at most 10 pages × 100 = 1000 most recent docs.
 								    # Anträge are ~3% of all hits in MV, so 1000 raw → ~30 Anträge, more
 								    # than enough for the typical UI request (limit 5..20). Filtered
 								    # queries that find nothing in the last 1000 docs return empty
-												ParLDokAdapter: Volltext (#12) deaktivieren — einheitlich Title-Filter (#18)

Server-side facet_fulltext-Suche aus #12 war asymmetrisch zu BE/LSA
(beide nur Title-Filter über 730d-Window aus #13). User-Entscheidung
2026-04-08: einheitliches Verhalten ist wichtiger als das beste
Verhalten in 2 von 4 Adaptern.

Konkrete Änderungen:

- _build_search_body() schickt query nicht mehr server-side. Der
  query-Parameter bleibt in der Signatur als unused-mit-del, weil
  die Wieder-Aktivierung später ein Drop-in sein soll wenn die
  PortalaAdapter-Variante reverse-engineered wurde.

- _initial_search() und _paginated_hits() ohne query-Parameter.

- search() macht clientseitigen Title+Urheber-Filter wie der
  PortalaAdapter — same Codepfad, einheitliches Verhalten.

- get_document() nutzt die unveränderte Pagination.

- FACET_FULLTEXT-Konstante und _fulltext_id-Helper bleiben im Code
  als Dokumentation für die spätere Re-Aktivierung. Im Docstring
  ist die Tag-Form festgehalten.

Folgen:

- MV "Schule" ist von 20 (mit Volltext) auf 3 zurück (Title-Filter
  über die letzten 1000 Drucksachen). Gleiches Niveau wie BE/LSA
  pre-#13.

- Browse-Mode (no query) ist unverändert: ~10 hits in ~25s, MAX_PAGES=10.

- Wenn das später nicht reicht: #16 (UI-Split DB vs. Landtag) und
  ein optionaler "echter Volltext"-Toggle (#17 closed-as-deferred)
  bleiben als Folge-Optionen.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 19:01:00 +02:00
+								    # rather than scan the entire WP — same trade-off as the BE/LSA
 								    # PortalaAdapter quick-win window.
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								    MAX_PAGES = 10
 								    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
-												ParLDokAdapter: Volltext (#12) deaktivieren — einheitlich Title-Filter (#18)

Server-side facet_fulltext-Suche aus #12 war asymmetrisch zu BE/LSA
(beide nur Title-Filter über 730d-Window aus #13). User-Entscheidung
2026-04-08: einheitliches Verhalten ist wichtiger als das beste
Verhalten in 2 von 4 Adaptern.

Konkrete Änderungen:

- _build_search_body() schickt query nicht mehr server-side. Der
  query-Parameter bleibt in der Signatur als unused-mit-del, weil
  die Wieder-Aktivierung später ein Drop-in sein soll wenn die
  PortalaAdapter-Variante reverse-engineered wurde.

- _initial_search() und _paginated_hits() ohne query-Parameter.

- search() macht clientseitigen Title+Urheber-Filter wie der
  PortalaAdapter — same Codepfad, einheitliches Verhalten.

- get_document() nutzt die unveränderte Pagination.

- FACET_FULLTEXT-Konstante und _fulltext_id-Helper bleiben im Code
  als Dokumentation für die spätere Re-Aktivierung. Im Docstring
  ist die Tag-Form festgehalten.

Folgen:

- MV "Schule" ist von 20 (mit Volltext) auf 3 zurück (Title-Filter
  über die letzten 1000 Drucksachen). Gleiches Niveau wie BE/LSA
  pre-#13.

- Browse-Mode (no query) ist unverändert: ~10 hits in ~25s, MAX_PAGES=10.

- Wenn das später nicht reicht: #16 (UI-Split DB vs. Landtag) und
  ein optionaler "echter Volltext"-Toggle (#17 closed-as-deferred)
  bleiben als Folge-Optionen.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 19:01:00 +02:00
+								        """Search the configured Wahlperiode, sorted newest-first.
 								        #18: einheitliches Verhalten — Server filtert nur nach WP, der
 								        Client paginiert über die ganze WP und filtert lokal nach
 								        Treffern in Titel oder Urheber. Volltext-Filter (#12) ist
 								        zurückgebaut, weil das Verhalten zwischen Adaptern sonst
 								        asymmetrisch wird. Sortierung kommt vom Server (newest-first
 								        durch ``sort=2`` in :meth:`_build_search_body`).
 								        Dedupe per ``lp/number`` weil ParlDok dieselbe Drucksache
 								        mehrfach in verschiedenen Vorgängen/Beratungen liefert.
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								        """
 								        results: list[Drucksache] = []
 								        seen: set[str] = set()
-												ParLDokAdapter: Volltext (#12) deaktivieren — einheitlich Title-Filter (#18)

Server-side facet_fulltext-Suche aus #12 war asymmetrisch zu BE/LSA
(beide nur Title-Filter über 730d-Window aus #13). User-Entscheidung
2026-04-08: einheitliches Verhalten ist wichtiger als das beste
Verhalten in 2 von 4 Adaptern.

Konkrete Änderungen:

- _build_search_body() schickt query nicht mehr server-side. Der
  query-Parameter bleibt in der Signatur als unused-mit-del, weil
  die Wieder-Aktivierung später ein Drop-in sein soll wenn die
  PortalaAdapter-Variante reverse-engineered wurde.

- _initial_search() und _paginated_hits() ohne query-Parameter.

- search() macht clientseitigen Title+Urheber-Filter wie der
  PortalaAdapter — same Codepfad, einheitliches Verhalten.

- get_document() nutzt die unveränderte Pagination.

- FACET_FULLTEXT-Konstante und _fulltext_id-Helper bleiben im Code
  als Dokumentation für die spätere Re-Aktivierung. Im Docstring
  ist die Tag-Form festgehalten.

Folgen:

- MV "Schule" ist von 20 (mit Volltext) auf 3 zurück (Title-Filter
  über die letzten 1000 Drucksachen). Gleiches Niveau wie BE/LSA
  pre-#13.

- Browse-Mode (no query) ist unverändert: ~10 hits in ~25s, MAX_PAGES=10.

- Wenn das später nicht reicht: #16 (UI-Split DB vs. Landtag) und
  ein optionaler "echter Volltext"-Toggle (#17 closed-as-deferred)
  bleiben als Folge-Optionen.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 19:01:00 +02:00
+								        query_terms = [t.lower() for t in query.split() if t] if query else []
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
 								        async with self._make_client() as client:
 								            await client.get(f"{self.base_url}{self.prefix}/")
-												ParLDokAdapter: Volltext (#12) deaktivieren — einheitlich Title-Filter (#18)

Server-side facet_fulltext-Suche aus #12 war asymmetrisch zu BE/LSA
(beide nur Title-Filter über 730d-Window aus #13). User-Entscheidung
2026-04-08: einheitliches Verhalten ist wichtiger als das beste
Verhalten in 2 von 4 Adaptern.

Konkrete Änderungen:

- _build_search_body() schickt query nicht mehr server-side. Der
  query-Parameter bleibt in der Signatur als unused-mit-del, weil
  die Wieder-Aktivierung später ein Drop-in sein soll wenn die
  PortalaAdapter-Variante reverse-engineered wurde.

- _initial_search() und _paginated_hits() ohne query-Parameter.

- search() macht clientseitigen Title+Urheber-Filter wie der
  PortalaAdapter — same Codepfad, einheitliches Verhalten.

- get_document() nutzt die unveränderte Pagination.

- FACET_FULLTEXT-Konstante und _fulltext_id-Helper bleiben im Code
  als Dokumentation für die spätere Re-Aktivierung. Im Docstring
  ist die Tag-Form festgehalten.

Folgen:

- MV "Schule" ist von 20 (mit Volltext) auf 3 zurück (Title-Filter
  über die letzten 1000 Drucksachen). Gleiches Niveau wie BE/LSA
  pre-#13.

- Browse-Mode (no query) ist unverändert: ~10 hits in ~25s, MAX_PAGES=10.

- Wenn das später nicht reicht: #16 (UI-Split DB vs. Landtag) und
  ein optionaler "echter Volltext"-Toggle (#17 closed-as-deferred)
  bleiben als Folge-Optionen.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 19:01:00 +02:00
+								            async for hit in self._paginated_hits(client):
-												Activate Thüringen via ParLDokAdapter reuse + filter widening (#25, Phase 1)

Thüringen läuft auf parldok.thueringer-landtag.de mit ParlDok 8.3.5
(J3S GmbH) — exakt dieselbe Version wie MV. Aber TH packt seine
Anträge unter zusammengesetzten type-Strings ("Antrag gemäß § 79 GO",
"Antrag gemäß § 74 (2) GO") und kind="Vorlage" statt der MV-Variante
kind="Drucksache"/type="Antrag". Strict-Match auf "Antrag" hat 0
Treffer geliefert.

Lösung: ParLDokAdapter um zwei Konstruktor-Parameter erweitert:
- document_typ_substring=True → Substring-Match auf type-Feld
  ("Antrag" matched "Antrag gemäß § 79 GO", "Alternativantrag" usw.)
- kinds=["Drucksache", "Vorlage"] → erweiterte kind-Liste

Defaults sind backward-kompatibel (Substring-Match aus, kinds nur
Drucksache), sodass MV und HH unverändert weiterlaufen.

_hit_matches_filters() als zentraler Filter-Helper extrahiert,
search() und get_document() nutzen ihn — get_document() überspringt
ihn allerdings, weil dort beliebige Drucksachen aufrufbar sein müssen,
unabhängig vom search-Time-Filter.

Hostname-Korrektur: parldok.thueringen.de redirected per 303 auf
parldok.thueringer-landtag.de. doku_base_url in bundeslaender.py
auf den neuen Host umgestellt.

Smoke-Test (lokal):
  TH q="":       8 hits in 3.3s
  TH q="Schule": 2 hits in 25.7s (Lernmittelbeschaffung, Modernisierung
                  Bund-Länder-Vereinbarung — beide Schul-bezogen)
  TH q="Klima":  0 hits (keine in den letzten 1000 Drucksachen)

Damit ist Phase 1 (3/3) komplett. Nächstes Phase-2 Issue: #27 BB als
StarWebAdapter-Template.

Phase 1 (3/3) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 23:48:02 +02:00
+								                if not self._hit_matches_filters(hit):
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								                    continue
 								                doc = self._hit_to_drucksache(hit)
 								                if not doc:
 								                    continue
 								                if doc.drucksache in seen:
 								                    continue
 								                seen.add(doc.drucksache)
-												ParLDokAdapter: Volltext (#12) deaktivieren — einheitlich Title-Filter (#18)

Server-side facet_fulltext-Suche aus #12 war asymmetrisch zu BE/LSA
(beide nur Title-Filter über 730d-Window aus #13). User-Entscheidung
2026-04-08: einheitliches Verhalten ist wichtiger als das beste
Verhalten in 2 von 4 Adaptern.

Konkrete Änderungen:

- _build_search_body() schickt query nicht mehr server-side. Der
  query-Parameter bleibt in der Signatur als unused-mit-del, weil
  die Wieder-Aktivierung später ein Drop-in sein soll wenn die
  PortalaAdapter-Variante reverse-engineered wurde.

- _initial_search() und _paginated_hits() ohne query-Parameter.

- search() macht clientseitigen Title+Urheber-Filter wie der
  PortalaAdapter — same Codepfad, einheitliches Verhalten.

- get_document() nutzt die unveränderte Pagination.

- FACET_FULLTEXT-Konstante und _fulltext_id-Helper bleiben im Code
  als Dokumentation für die spätere Re-Aktivierung. Im Docstring
  ist die Tag-Form festgehalten.

Folgen:

- MV "Schule" ist von 20 (mit Volltext) auf 3 zurück (Title-Filter
  über die letzten 1000 Drucksachen). Gleiches Niveau wie BE/LSA
  pre-#13.

- Browse-Mode (no query) ist unverändert: ~10 hits in ~25s, MAX_PAGES=10.

- Wenn das später nicht reicht: #16 (UI-Split DB vs. Landtag) und
  ein optionaler "echter Volltext"-Toggle (#17 closed-as-deferred)
  bleiben als Folge-Optionen.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 19:01:00 +02:00
+								                if query_terms:
 								                    hay = f"{doc.title} {hit.get('authorhtml', '')}".lower()
 								                    if not all(t in hay for t in query_terms):
 								                        continue
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								                results.append(doc)
 								                if len(results) >= limit:
 								                    return results
 								        return results
 								    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
 								        """Look up a single Antrag by ``lp/number`` ID.
 								        Pragmatic MVP: page through the WP unfiltered until we find a
 								        match. ParlDok offers a ``facet_number`` (14) facet that would
 								        let us target the lookup directly, but the facet ID values are
 								        instance-specific (would require a ``Fulltext/Filter`` discovery
 								        call) and the WP-wide pagination is fast enough for the typical
 								        1k–10k Drucksachen per period.
 								        """
 								        wanted_lp, wanted_num = (drucksache.split("/", 1) + [""])[:2]
 								        if not wanted_num:
 								            return None
 								        async with self._make_client() as client:
 								            await client.get(f"{self.base_url}{self.prefix}/")
 								            async for hit in self._paginated_hits(client):
-												Activate Thüringen via ParLDokAdapter reuse + filter widening (#25, Phase 1)

Thüringen läuft auf parldok.thueringer-landtag.de mit ParlDok 8.3.5
(J3S GmbH) — exakt dieselbe Version wie MV. Aber TH packt seine
Anträge unter zusammengesetzten type-Strings ("Antrag gemäß § 79 GO",
"Antrag gemäß § 74 (2) GO") und kind="Vorlage" statt der MV-Variante
kind="Drucksache"/type="Antrag". Strict-Match auf "Antrag" hat 0
Treffer geliefert.

Lösung: ParLDokAdapter um zwei Konstruktor-Parameter erweitert:
- document_typ_substring=True → Substring-Match auf type-Feld
  ("Antrag" matched "Antrag gemäß § 79 GO", "Alternativantrag" usw.)
- kinds=["Drucksache", "Vorlage"] → erweiterte kind-Liste

Defaults sind backward-kompatibel (Substring-Match aus, kinds nur
Drucksache), sodass MV und HH unverändert weiterlaufen.

_hit_matches_filters() als zentraler Filter-Helper extrahiert,
search() und get_document() nutzen ihn — get_document() überspringt
ihn allerdings, weil dort beliebige Drucksachen aufrufbar sein müssen,
unabhängig vom search-Time-Filter.

Hostname-Korrektur: parldok.thueringen.de redirected per 303 auf
parldok.thueringer-landtag.de. doku_base_url in bundeslaender.py
auf den neuen Host umgestellt.

Smoke-Test (lokal):
  TH q="":       8 hits in 3.3s
  TH q="Schule": 2 hits in 25.7s (Lernmittelbeschaffung, Modernisierung
                  Bund-Länder-Vereinbarung — beide Schul-bezogen)
  TH q="Klima":  0 hits (keine in den letzten 1000 Drucksachen)

Damit ist Phase 1 (3/3) komplett. Nächstes Phase-2 Issue: #27 BB als
StarWebAdapter-Template.

Phase 1 (3/3) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 23:48:02 +02:00
+								                # Don't apply doc-type filters here — get_document is
 								                # used to look up arbitrary Drucksachen, including ones
 								                # whose kind/typ doesn't match the search-time filter.
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParLDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								                if str(hit.get("lp")) == wanted_lp and str(hit.get("number")) == wanted_num:
 								                    return self._hit_to_drucksache(hit)
 								        return None
 								    async def download_text(self, drucksache: str) -> Optional[str]:
 								        """Download the PDF for a Drucksache and return its extracted text.

 								        Resolves the document via :meth:`get_document`, fetches ``doc.link``
 								        over HTTP and concatenates the text of every PDF page (PyMuPDF).

 								        Args:
 								            drucksache: Document ID, passed through to :meth:`get_document`.

 								        Returns:
 								            The extracted text, or ``None`` when the document cannot be
 								            resolved, has no link, the server answers with a non-200
 								            status, or the download/parsing raises (the error is logged).
 								        """
 								        import fitz  # PyMuPDF
 								        doc = await self.get_document(drucksache)
 								        if not doc or not doc.link:
 								            return None
 								        async with httpx.AsyncClient(
 								            timeout=60,
 								            follow_redirects=True,
 								            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
 								        ) as client:
 								            try:
 								                resp = await client.get(doc.link)
 								                if resp.status_code != 200:
 								                    logger.error(
 								                        "%s PDF HTTP %s for %s (%s)",
 								                        self.bundesland, resp.status_code, drucksache, doc.link,
 								                    )
 								                    return None
 								                # The context manager closes the PDF handle even when a
 								                # page fails to parse; previously it leaked on that path.
 								                with fitz.open(stream=resp.content, filetype="pdf") as pdf:
 								                    return "".join(page.get_text() for page in pdf)
 								            except Exception:
 								                logger.exception("%s ParlDok download error for %s", self.bundesland, drucksache)
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
+								                return None
-												Activate Schleswig-Holstein via StarFinderCGIAdapter (#20, Phase 2)

SH läuft auf der ältesten der vier Backend-Familien: Starfinder-CGI
auf lissh.lvn.parlanet.de. URL-basiert (nicht stateful wie das
moderne StarWeb-Servlet von BB/HE/NI/RP/HB), Latin-1-encoding,
flat HTML-Tabelle als Hit-Format. Eigener Adapter weil das Schema
fundamental anders ist als alles andere.

Endpoint:
  http://lissh.lvn.parlanet.de/cgi-bin/starfinder/0
    ?path=lisshfl.txt&id=FASTLINK&pass=&search=WP=20+AND+dtyp=antrag
    &format=WEBKURZFL

Hit-Format pro <tr class="tabcol*">:
  <b>{TITLE}</b><br>
  Antrag {URHEBER} {DD.MM.YYYY} Drucksache <a href="{PDF}">{N/M}</a>

Quelle: dokukratie/sh.yml + Live-Probing.

Encoding: Server liefert iso-8859-1 ohne korrektes Content-Type-
Header. Adapter dekodiert resp.content explizit als latin-1.

SSW-Detection im _normalize_fraktion: SH ist das einzige BL mit
SSW-Fraktion (von der 5%-Hürde befreit), pattern ist \\bSSW\\b
analog zu \\bAfD\\b.

Free-Text-Suche client-seitig (siehe #18) — server-side query-
syntax mit (term) im starfinder-search-Param wird vom Server nicht
als Volltext interpretiert, einheitlich mit allen anderen aktiven
Adaptern.

Smoke-Test (lokal):
  SH q="":         8 hits in 14.4s
  SH q="Schule":   8 hits in 14.8s (Schulentwicklung Westküste,
                    Hochschulen, queere Vielfalt an Schule etc.)
  SH q="Klima":    8 hits (klimafreundlich, Klimafolgen,
                    Strategischer Aktionsplan)
  SH q="Bildung":  8 hits (berufliche Bildung, Holocaust-Wissen)

bundeslaender.py::SH.aktiv = True. doku_base_url auf
lissh.lvn.parlanet.de korrigiert (ehemaliger landtag.ltsh.de-
Eintrag passte nicht zum echten Endpoint).

Damit ist Phase 2 (1/6) angefangen — als Nebenpfad, weil das
StarWeb-Servlet (#27 BB als Template für 5 weitere) ohne HAR-
Trace nicht sauber reverse-engineerbar war.

Phase 2 (1/6) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 00:34:06 +02:00
+								class StarFinderCGIAdapter(ParlamentAdapter):
 								    """Adapter for old-school CGI Starfinder instances.
 								    Currently used by Schleswig-Holstein on
 								    ``lissh.lvn.parlanet.de/cgi-bin/starfinder/0`` — the **oldest** of the
 								    parliament backends we touch. Predates StarWeb's HTML form-submit
 								    machinery: instead of submitting a stateful AdvancedSearch form
 								    (which BB/HE/NI/RP/HB do), Starfinder accepts the entire query as
 								    URL parameters and returns plain HTML with a flat ``<tr>`` table of
 								    records.
 								    Reverse-engineering quelle: ``dokukratie/sh.yml`` plus a probe
 								    against the live endpoint. Format details:
 								    - URL template: ``{base}/cgi-bin/starfinder/0?path={db_path}&id=FASTLINK
 								      &pass=&search={starfinder_query}&format=WEBKURZFL``
 								    - Query syntax: ``WP=20+AND+dtyp=antrag`` (URL-encoded). The
 								      ``dtyp`` codes are lowercase short labels (``antrag``, ``kleine``).
 								    - Encoding: ``iso-8859-1`` (Latin-1) — NOT UTF-8. The HTTP response
 								      doesn't always declare it via Content-Type, so we explicitly
 								      decode with ``latin1`` to avoid mojibake on the German umlauts.
 								    - Hit-format: each record is one ``<tr class="tabcol|tabcol2|tabcol3">``
 								      with the title in ``<b>``, then ``Antrag <Urheber> <DD.MM.YYYY>
 								      Drucksache <a href="...pdf">XX/YYYY</a>``.
 								    """
 								    # One hit row: a <tr> whose class is "tabcol", "tabcol2" or "tabcol3".
 								    # Non-greedy up to the first closing </tr>; DOTALL lets a row span
 								    # several lines.
 								    _RE_RECORD = re.compile(
 								        r'<tr class="tabcol[23]?">.*?</tr>',
 								        re.DOTALL,
 								    )
 								    # Title: content of the first <b>...</b> pair inside a row (group 1).
 								    _RE_TITLE = re.compile(r"<b>(.*?)</b>", re.DOTALL)
 								    # PDF anchor: group 1 is the href (must end in ".pdf"), group 2 is the
 								    # link text in "<digits>/<digits>" form, i.e. the Drucksache number.
 								    _RE_DRUCKSACHE_LINK = re.compile(
 								        r'<a href="([^"]+\.pdf)"[^>]*>(\d+/\d+)</a>'
 								    )
 								    # The line between <b>title</b> and the <a>-link looks like:
 								    #   "Antrag Christian Dirschauer (SSW) 07.04.2026 Drucksache "
 								    # We pull the originator(s) and the date out of it.
 								    # The leading letters-only word (the document type, "Antrag" in the
 								    # example) is skipped; group 1 is the originator text, group 2 the
 								    # date as D.M.YYYY with a one- or two-digit day and month.
 								    _RE_URHEBER_DATUM = re.compile(
 								        r"</b>\s*<br>\s*[A-Za-zÄÖÜäöüß]+\s+(.+?)\s+(\d{1,2}\.\d{1,2}\.\d{4})\s+Drucksache",
 								        re.DOTALL,
 								    )
 								    def __init__(
 								        self,
 								        *,
 								        bundesland: str,
 								        name: str,
 								        base_url: str,
 								        wahlperiode: int,
 								        db_path: str = "lisshfl.txt",
 								        document_typ_code: str = "antrag",
 								    ) -> None:
 								        self.bundesland = bundesland
 								        self.name = name
 								        self.base_url = base_url.rstrip("/")
 								        self.wahlperiode = wahlperiode
 								        self.db_path = db_path
 								        self.document_typ_code = document_typ_code
 								    @staticmethod
 								    def _datum_de_to_iso(datum_de: str) -> str:
 								        if not datum_de:
 								            return ""
 								        try:
 								            d, m, y = datum_de.split(".")
 								            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
 								        except ValueError:
 								            return ""
-												Phase B: Parteinamen-Mapper #55 (Roadmap #59)

Zentrale `app/parteien.py` als Single Source of Truth für die Partei-
Auflösung:

- `PARTEIEN`-Tabelle mit kanonischem Key, langem Display-Namen, allen
  bekannten Aliasen, optionalem `bundesland_scope` und Government-
  Marker. 14 Einträge (CDU, CSU, SPD, GRÜNE, FDP, LINKE, AfD, BSW, SSW,
  BiW + die Freie-Wähler-Familie BVB-FW, FW-BAYERN, FW-SL und der
  generische FREIE WÄHLER-Eintrag).
- `normalize_partei(raw, *, bundesland=None)` für Single-String-Lookups
  mit Government-Vorrang und FW-Familien-Disambiguierung
- `extract_fraktionen(text, *, bundesland=None)` als Funnel für die
  vier alten Adapter-Helper. Kommagetrennte Listen, MdL-mit-Klammer-
  partei, HTML-Reste — alles fließt durch eine Stelle, mit BL-Scope-
  Filter (SSW nur in SH, BVB-FW nur in BB, etc.).
- `display_name(canonical, *, long=False)` für UI/PDF — kurze Form
  bleibt der kanonische Key, lange Form ist "BÜNDNIS 90/DIE GRÜNEN"
  statt "GRÜNE" etc.

Adapter-Migration in `app/parlamente.py`:

- Vier nahezu identische `_normalize_fraktion()`-Methoden in
  PortalaAdapter, ParLDokAdapter, StarFinderCGIAdapter, PARLISAdapter
  durch einen einzeiligen Shim ersetzt, der `extract_fraktionen` mit
  `self.bundesland` aufruft. ~120 Zeilen Duplikation entfernt.
- `@staticmethod` aufgehoben, weil wir jetzt `self.bundesland` brauchen
  für die FW-Disambiguierung — alle Aufrufer waren bereits `self._...`,
  also keine Call-Site-Änderung nötig.

`app/embeddings.py:496` Workaround-Hack entfernt:

- `partei.upper() if partei != "GRÜNE" else "GRÜNE"` durch zentralen
  `normalize_partei()`-Aufruf ersetzt — der Hack war ein Kommentarzeichen
  dafür, dass die Partei-Schreibweise irgendwo zwischen Adapter und
  Embedding-Lookup driften konnte. Mit dem Mapper ist die Schreibweise
  überall garantiert kanonisch.

Tests:

- Neue `tests/test_parteien.py` mit 52 Cases — Single-Lookup, FW-
  Disambiguierung (BVB/Bayern/Saarland/RP), Volltext-Extraktion,
  Government-Marker, Tabellen-Konsistenz
- `tests/test_parlamente.py` Test-Klasse umgeschrieben: statt der 6
  statischen `PortalaAdapter._normalize_fraktion(...)`-Tests jetzt 4
  Roundtrip-Tests über echte Adapter-Instanzen, inkl. expliziter
  BB→BVB-FW vs. RP→FREIE WÄHLER-Verifikation

157 Unit-Tests grün (105 alt + 52 neu). Backwards-kompatibel — die
kanonischen Keys sind exakt die in der DB stehenden Strings, kein
Migrations-Schritt nötig.

Refs: #55, #59 (Phase B)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 11:22:13 +02:00
+								    def _normalize_fraktion(self, text: str) -> list[str]:
 								        """Thin shim — siehe ``app.parteien.extract_fraktionen``. #55.
-												Activate Schleswig-Holstein via StarFinderCGIAdapter (#20, Phase 2)

SH läuft auf der ältesten der vier Backend-Familien: Starfinder-CGI
auf lissh.lvn.parlanet.de. URL-basiert (nicht stateful wie das
moderne StarWeb-Servlet von BB/HE/NI/RP/HB), Latin-1-encoding,
flat HTML-Tabelle als Hit-Format. Eigener Adapter weil das Schema
fundamental anders ist als alles andere.

Endpoint:
  http://lissh.lvn.parlanet.de/cgi-bin/starfinder/0
    ?path=lisshfl.txt&id=FASTLINK&pass=&search=WP=20+AND+dtyp=antrag
    &format=WEBKURZFL

Hit-Format pro <tr class="tabcol*">:
  <b>{TITLE}</b><br>
  Antrag {URHEBER} {DD.MM.YYYY} Drucksache <a href="{PDF}">{N/M}</a>

Quelle: dokukratie/sh.yml + Live-Probing.

Encoding: Server liefert iso-8859-1 ohne korrektes Content-Type-
Header. Adapter dekodiert resp.content explizit als latin-1.

SSW-Detection im _normalize_fraktion: SH ist das einzige BL mit
SSW-Fraktion (von der 5%-Hürde befreit), pattern ist \\bSSW\\b
analog zu \\bAfD\\b.

Free-Text-Suche client-seitig (siehe #18) — server-side query-
syntax mit (term) im starfinder-search-Param wird vom Server nicht
als Volltext interpretiert, einheitlich mit allen anderen aktiven
Adaptern.

Smoke-Test (lokal):
  SH q="":         8 hits in 14.4s
  SH q="Schule":   8 hits in 14.8s (Schulentwicklung Westküste,
                    Hochschulen, queere Vielfalt an Schule etc.)
  SH q="Klima":    8 hits (klimafreundlich, Klimafolgen,
                    Strategischer Aktionsplan)
  SH q="Bildung":  8 hits (berufliche Bildung, Holocaust-Wissen)

bundeslaender.py::SH.aktiv = True. doku_base_url auf
lissh.lvn.parlanet.de korrigiert (ehemaliger landtag.ltsh.de-
Eintrag passte nicht zum echten Endpoint).

Damit ist Phase 2 (1/6) angefangen — als Nebenpfad, weil das
StarWeb-Servlet (#27 BB als Template für 5 weitere) ohne HAR-
Trace nicht sauber reverse-engineerbar war.

Phase 2 (1/6) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 00:34:06 +02:00
-												Phase B: Parteinamen-Mapper #55 (Roadmap #59)

Zentrale `app/parteien.py` als Single Source of Truth für die Partei-
Auflösung:

- `PARTEIEN`-Tabelle mit kanonischem Key, langem Display-Namen, allen
  bekannten Aliasen, optionalem `bundesland_scope` und Government-
  Marker. 14 Einträge (CDU, CSU, SPD, GRÜNE, FDP, LINKE, AfD, BSW, SSW,
  BiW + die Freie-Wähler-Familie BVB-FW, FW-BAYERN, FW-SL und der
  generische FREIE WÄHLER-Eintrag).
- `normalize_partei(raw, *, bundesland=None)` für Single-String-Lookups
  mit Government-Vorrang und FW-Familien-Disambiguierung
- `extract_fraktionen(text, *, bundesland=None)` als Funnel für die
  vier alten Adapter-Helper. Kommagetrennte Listen, MdL-mit-Klammer-
  partei, HTML-Reste — alles fließt durch eine Stelle, mit BL-Scope-
  Filter (SSW nur in SH, BVB-FW nur in BB, etc.).
- `display_name(canonical, *, long=False)` für UI/PDF — kurze Form
  bleibt der kanonische Key, lange Form ist "BÜNDNIS 90/DIE GRÜNEN"
  statt "GRÜNE" etc.

Adapter-Migration in `app/parlamente.py`:

- Vier nahezu identische `_normalize_fraktion()`-Methoden in
  PortalaAdapter, ParLDokAdapter, StarFinderCGIAdapter, PARLISAdapter
  durch einen einzeiligen Shim ersetzt, der `extract_fraktionen` mit
  `self.bundesland` aufruft. ~120 Zeilen Duplikation entfernt.
- `@staticmethod` aufgehoben, weil wir jetzt `self.bundesland` brauchen
  für die FW-Disambiguierung — alle Aufrufer waren bereits `self._...`,
  also keine Call-Site-Änderung nötig.

`app/embeddings.py:496` Workaround-Hack entfernt:

- `partei.upper() if partei != "GRÜNE" else "GRÜNE"` durch zentralen
  `normalize_partei()`-Aufruf ersetzt — der Hack war ein Kommentarzeichen
  dafür, dass die Partei-Schreibweise irgendwo zwischen Adapter und
  Embedding-Lookup driften konnte. Mit dem Mapper ist die Schreibweise
  überall garantiert kanonisch.

Tests:

- Neue `tests/test_parteien.py` mit 52 Cases — Single-Lookup, FW-
  Disambiguierung (BVB/Bayern/Saarland/RP), Volltext-Extraktion,
  Government-Marker, Tabellen-Konsistenz
- `tests/test_parlamente.py` Test-Klasse umgeschrieben: statt der 6
  statischen `PortalaAdapter._normalize_fraktion(...)`-Tests jetzt 4
  Roundtrip-Tests über echte Adapter-Instanzen, inkl. expliziter
  BB→BVB-FW vs. RP→FREIE WÄHLER-Verifikation

157 Unit-Tests grün (105 alt + 52 neu). Backwards-kompatibel — die
kanonischen Keys sind exakt die in der DB stehenden Strings, kein
Migrations-Schritt nötig.

Refs: #55, #59 (Phase B)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 11:22:13 +02:00
+								        SH-spezifisch: SSW gehört zur SH-Tabelle und wird durch
 								        ``bundesland=SH`` korrekt mit-extrahiert.
-												Activate Schleswig-Holstein via StarFinderCGIAdapter (#20, Phase 2)

SH läuft auf der ältesten der vier Backend-Familien: Starfinder-CGI
auf lissh.lvn.parlanet.de. URL-basiert (nicht stateful wie das
moderne StarWeb-Servlet von BB/HE/NI/RP/HB), Latin-1-encoding,
flat HTML-Tabelle als Hit-Format. Eigener Adapter weil das Schema
fundamental anders ist als alles andere.

Endpoint:
  http://lissh.lvn.parlanet.de/cgi-bin/starfinder/0
    ?path=lisshfl.txt&id=FASTLINK&pass=&search=WP=20+AND+dtyp=antrag
    &format=WEBKURZFL

Hit-Format pro <tr class="tabcol*">:
  <b>{TITLE}</b><br>
  Antrag {URHEBER} {DD.MM.YYYY} Drucksache <a href="{PDF}">{N/M}</a>

Quelle: dokukratie/sh.yml + Live-Probing.

Encoding: Server liefert iso-8859-1 ohne korrektes Content-Type-
Header. Adapter dekodiert resp.content explizit als latin-1.

SSW-Detection im _normalize_fraktion: SH ist das einzige BL mit
SSW-Fraktion (von der 5%-Hürde befreit), pattern ist \\bSSW\\b
analog zu \\bAfD\\b.

Free-Text-Suche client-seitig (siehe #18) — server-side query-
syntax mit (term) im starfinder-search-Param wird vom Server nicht
als Volltext interpretiert, einheitlich mit allen anderen aktiven
Adaptern.

Smoke-Test (lokal):
  SH q="":         8 hits in 14.4s
  SH q="Schule":   8 hits in 14.8s (Schulentwicklung Westküste,
                    Hochschulen, queere Vielfalt an Schule etc.)
  SH q="Klima":    8 hits (klimafreundlich, Klimafolgen,
                    Strategischer Aktionsplan)
  SH q="Bildung":  8 hits (berufliche Bildung, Holocaust-Wissen)

bundeslaender.py::SH.aktiv = True. doku_base_url auf
lissh.lvn.parlanet.de korrigiert (ehemaliger landtag.ltsh.de-
Eintrag passte nicht zum echten Endpoint).

Damit ist Phase 2 (1/6) angefangen — als Nebenpfad, weil das
StarWeb-Servlet (#27 BB als Template für 5 weitere) ohne HAR-
Trace nicht sauber reverse-engineerbar war.

Phase 2 (1/6) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 00:34:06 +02:00
+								        """
-												Phase B: Parteinamen-Mapper #55 (Roadmap #59)

Zentrale `app/parteien.py` als Single Source of Truth für die Partei-
Auflösung:

- `PARTEIEN`-Tabelle mit kanonischem Key, langem Display-Namen, allen
  bekannten Aliasen, optionalem `bundesland_scope` und Government-
  Marker. 14 Einträge (CDU, CSU, SPD, GRÜNE, FDP, LINKE, AfD, BSW, SSW,
  BiW + die Freie-Wähler-Familie BVB-FW, FW-BAYERN, FW-SL und der
  generische FREIE WÄHLER-Eintrag).
- `normalize_partei(raw, *, bundesland=None)` für Single-String-Lookups
  mit Government-Vorrang und FW-Familien-Disambiguierung
- `extract_fraktionen(text, *, bundesland=None)` als Funnel für die
  vier alten Adapter-Helper. Kommagetrennte Listen, MdL-mit-Klammer-
  partei, HTML-Reste — alles fließt durch eine Stelle, mit BL-Scope-
  Filter (SSW nur in SH, BVB-FW nur in BB, etc.).
- `display_name(canonical, *, long=False)` für UI/PDF — kurze Form
  bleibt der kanonische Key, lange Form ist "BÜNDNIS 90/DIE GRÜNEN"
  statt "GRÜNE" etc.

Adapter-Migration in `app/parlamente.py`:

- Vier nahezu identische `_normalize_fraktion()`-Methoden in
  PortalaAdapter, ParLDokAdapter, StarFinderCGIAdapter, PARLISAdapter
  durch einen einzeiligen Shim ersetzt, der `extract_fraktionen` mit
  `self.bundesland` aufruft. ~120 Zeilen Duplikation entfernt.
- `@staticmethod` aufgehoben, weil wir jetzt `self.bundesland` brauchen
  für die FW-Disambiguierung — alle Aufrufer waren bereits `self._...`,
  also keine Call-Site-Änderung nötig.

`app/embeddings.py:496` Workaround-Hack entfernt:

- `partei.upper() if partei != "GRÜNE" else "GRÜNE"` durch zentralen
  `normalize_partei()`-Aufruf ersetzt — der Hack war ein Kommentarzeichen
  dafür, dass die Partei-Schreibweise irgendwo zwischen Adapter und
  Embedding-Lookup driften konnte. Mit dem Mapper ist die Schreibweise
  überall garantiert kanonisch.

Tests:

- Neue `tests/test_parteien.py` mit 52 Cases — Single-Lookup, FW-
  Disambiguierung (BVB/Bayern/Saarland/RP), Volltext-Extraktion,
  Government-Marker, Tabellen-Konsistenz
- `tests/test_parlamente.py` Test-Klasse umgeschrieben: statt der 6
  statischen `PortalaAdapter._normalize_fraktion(...)`-Tests jetzt 4
  Roundtrip-Tests über echte Adapter-Instanzen, inkl. expliziter
  BB→BVB-FW vs. RP→FREIE WÄHLER-Verifikation

157 Unit-Tests grün (105 alt + 52 neu). Backwards-kompatibel — die
kanonischen Keys sind exakt die in der DB stehenden Strings, kein
Migrations-Schritt nötig.

Refs: #55, #59 (Phase B)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 11:22:13 +02:00
+								        from .parteien import extract_fraktionen
 								        return extract_fraktionen(text, bundesland=self.bundesland)
-												Activate Schleswig-Holstein via StarFinderCGIAdapter (#20, Phase 2)

SH läuft auf der ältesten der vier Backend-Familien: Starfinder-CGI
auf lissh.lvn.parlanet.de. URL-basiert (nicht stateful wie das
moderne StarWeb-Servlet von BB/HE/NI/RP/HB), Latin-1-encoding,
flat HTML-Tabelle als Hit-Format. Eigener Adapter weil das Schema
fundamental anders ist als alles andere.

Endpoint:
  http://lissh.lvn.parlanet.de/cgi-bin/starfinder/0
    ?path=lisshfl.txt&id=FASTLINK&pass=&search=WP=20+AND+dtyp=antrag
    &format=WEBKURZFL

Hit-Format pro <tr class="tabcol*">:
  <b>{TITLE}</b><br>
  Antrag {URHEBER} {DD.MM.YYYY} Drucksache <a href="{PDF}">{N/M}</a>

Quelle: dokukratie/sh.yml + Live-Probing.

Encoding: Server liefert iso-8859-1 ohne korrektes Content-Type-
Header. Adapter dekodiert resp.content explizit als latin-1.

SSW-Detection im _normalize_fraktion: SH ist das einzige BL mit
SSW-Fraktion (von der 5%-Hürde befreit), pattern ist \\bSSW\\b
analog zu \\bAfD\\b.

Free-Text-Suche client-seitig (siehe #18) — server-side query-
syntax mit (term) im starfinder-search-Param wird vom Server nicht
als Volltext interpretiert, einheitlich mit allen anderen aktiven
Adaptern.

Smoke-Test (lokal):
  SH q="":         8 hits in 14.4s
  SH q="Schule":   8 hits in 14.8s (Schulentwicklung Westküste,
                    Hochschulen, queere Vielfalt an Schule etc.)
  SH q="Klima":    8 hits (klimafreundlich, Klimafolgen,
                    Strategischer Aktionsplan)
  SH q="Bildung":  8 hits (berufliche Bildung, Holocaust-Wissen)

bundeslaender.py::SH.aktiv = True. doku_base_url auf
lissh.lvn.parlanet.de korrigiert (ehemaliger landtag.ltsh.de-
Eintrag passte nicht zum echten Endpoint).

Damit ist Phase 2 (1/6) angefangen — als Nebenpfad, weil das
StarWeb-Servlet (#27 BB als Template für 5 weitere) ohne HAR-
Trace nicht sauber reverse-engineerbar war.

Phase 2 (1/6) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 00:34:06 +02:00
 								    def _build_url(self) -> str:
 								        """Build the Starfinder URL for the structural WP+dtyp browse.
 								        Free-text filtering is done client-side on the parsed records
 								        (consistent with #18 — alle Adapter machen einheitlich Title-
 								        Filter ohne Server-Volltext, weil das Verhalten zwischen
 								        Adaptern sonst asymmetrisch wird).
 								        """
 								        search_param = f"WP={self.wahlperiode}+AND+dtyp={self.document_typ_code}"
 								        return (
 								            f"{self.base_url}/cgi-bin/starfinder/0"
 								            f"?path={self.db_path}&id=FASTLINK&pass=&search={search_param}"
 								            f"&format=WEBKURZFL"
 								        )
 								    def _parse_records(self, html: str) -> list[Drucksache]:
 								        """Extract every hit row from a Starfinder result page.
 								        Rows without a PDF anchor are skipped. For each remaining row the
 								        title, originator text and date are pulled out with the class-level
 								        regexes; a missing title falls back to ``"Drucksache <nr>"`` and a
 								        missing originator/date line yields an empty Fraktion input and an
 								        empty date.
 								        Args:
 								            html: Decoded HTML of the result page.
 								        Returns:
 								            One ``Drucksache`` per parsed row, in page order, with ``typ``
 								            fixed to ``"Antrag"``.
 								        """
 								        # Local imports: ``html`` is also the parameter name, so only the
 								        # needed function is bound, under a distinct name.
 								        from html import unescape as _unescape_entities
 								        from urllib.parse import urljoin
 								        results: list[Drucksache] = []
 								        for record_html in self._RE_RECORD.findall(html):
 								            m_link = self._RE_DRUCKSACHE_LINK.search(record_html)
 								            if not m_link:
 								                # No PDF anchor means no Drucksache number to report.
 								                continue
 								            href, drucksache = m_link.groups()
 								            # Absolute hrefs pass through unchanged; relative ones are
 								            # resolved against the server so the link is always fetchable.
 								            pdf_url = urljoin(f"{self.base_url}/", href)
 								            m_title = self._RE_TITLE.search(record_html)
 								            if m_title:
 								                # Decode entities (&amp;, &uuml;, ...) before collapsing
 								                # whitespace so titles read and filter as plain text.
 								                title = re.sub(
 								                    r"\s+", " ", _unescape_entities(m_title.group(1))
 								                ).strip()
 								            else:
 								                title = f"Drucksache {drucksache}"
 								            urheber = ""
 								            datum_iso = ""
 								            m_meta = self._RE_URHEBER_DATUM.search(record_html)
 								            if m_meta:
 								                urheber = m_meta.group(1).strip()
 								                datum_iso = self._datum_de_to_iso(m_meta.group(2))
 								            results.append(Drucksache(
 								                drucksache=drucksache,
 								                title=title,
 								                fraktionen=self._normalize_fraktion(urheber),
 								                datum=datum_iso,
 								                link=pdf_url,
 								                bundesland=self.bundesland,
 								                typ="Antrag",
 								            ))
 								        return results
 								    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
 								        """Fetch the browse page and return hits matching ``query``.
 								        The server is only asked for the structural browse URL; the
 								        free-text query is applied afterwards. A hit matches when every
 								        whitespace-separated term occurs, case-insensitively, in its
 								        title or its joined Fraktion list.
 								        Args:
 								            query: Free-text terms; an empty string disables filtering.
 								            limit: Maximum number of hits to return.
 								        Returns:
 								            Up to ``limit`` hits in page order, or ``[]`` on a non-200
 								            response or any error while fetching/parsing.
 								        """
 								        url = self._build_url()
 								        client_options = {
 								            "timeout": 60,
 								            "follow_redirects": True,
 								            "headers": {"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
 								        }
 								        async with httpx.AsyncClient(**client_options) as client:
 								            try:
 								                resp = await client.get(url)
 								                if resp.status_code != 200:
 								                    logger.error("%s search HTTP %s", self.bundesland, resp.status_code)
 								                    return []
 								                # Decode as Latin-1 explicitly rather than relying on the
 								                # encoding the response declares.
 								                page = resp.content.decode("latin-1", errors="replace")
 								                hits = self._parse_records(page)
 								            except Exception:
 								                logger.exception("%s search error", self.bundesland)
 								                return []
 								        if not query:
 								            return hits[:limit]
 								        # Client-side filter over title + Fraktionen (see #18).
 								        needles = [word.lower() for word in query.split() if word]
 								        matching = []
 								        for doc in hits:
 								            haystack = f"{doc.title} {' '.join(doc.fraktionen)}".lower()
 								            if all(needle in haystack for needle in needles):
 								                matching.append(doc)
 								        return matching[:limit]
 								    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
 								        """Find one Drucksache by its exact number.
 								        Runs an unfiltered ``search`` capped at 200 hits and returns the
 								        first hit whose number equals ``drucksache``. No number-only
 								        server filter is known, and the server is understood to list
 								        hits newest-first, so older documents may lie outside the window.
 								        Args:
 								            drucksache: Document number, e.g. ``"20/1234"``.
 								        Returns:
 								            The matching hit, or ``None`` when it is not among the hits.
 								        """
 								        candidates = await self.search(query="", limit=200)
 								        return next(
 								            (hit for hit in candidates if hit.drucksache == drucksache),
 								            None,
 								        )
 								    async def download_text(self, drucksache: str) -> Optional[str]:
 								        """Download a Drucksache PDF and return its extracted text.
 								        The document is looked up via ``get_document`` to obtain the PDF
 								        link, fetched over HTTP and read page by page with PyMuPDF.
 								        Args:
 								            drucksache: Document number, e.g. ``"20/1234"``.
 								        Returns:
 								            The concatenated text of all pages, or ``None`` when the
 								            document is unknown, has no link, the server answers with a
 								            non-200 status, or fetching/parsing raises.
 								        """
 								        import fitz  # PyMuPDF
 								        doc = await self.get_document(drucksache)
 								        if not doc or not doc.link:
 								            return None
 								        async with httpx.AsyncClient(
 								            timeout=60,
 								            follow_redirects=True,
 								            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
 								        ) as client:
 								            try:
 								                resp = await client.get(doc.link)
 								                if resp.status_code != 200:
 								                    # Log the failure instead of returning None silently, so a
 								                    # dead link can be told apart from an unknown document.
 								                    logger.error(
 								                        "%s PDF HTTP %s for %s (%s)",
 								                        self.bundesland, resp.status_code, drucksache, doc.link,
 								                    )
 								                    return None
 								                pdf = fitz.open(stream=resp.content, filetype="pdf")
 								                try:
 								                    return "".join(page.get_text() for page in pdf)
 								                finally:
 								                    # Release the document even when text extraction fails.
 								                    pdf.close()
 								            except Exception:
 								                logger.exception("%s PDF download error for %s", self.bundesland, drucksache)
 								                return None
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								class BayernAdapter(ParlamentAdapter):
-												#23 BayernAdapter — TYPO3-Solr HTML scraping (Anträge in WP19)

Stub durch echten Adapter ersetzt. Recon + Implementierung in einem
Wurf, weil das Backend deutlich freundlicher ist als bei SL/NI:

- Server-side rendered HTML, keine SPA, keine Auth, keine Cookies
- TYPO3 mit ext-solr unter /parlament/dokumente/drucksachen
- Filter direkt als URL-Query-Params (q, dokumentenart, wahlperiodeid[],
  sort, anzahl_treffer, page)
- 17.598 Drucksachen in WP19, davon ~10-15% Anträge — wir holen pro
  Page 100 Hits, paginieren bis 3 Pages und filtern client-seitig auf
  <p>Antrag …</p> (analog zu SL/HE)

Pattern-Extraktion über drei Regexen aus dem stabilen result-block:

  <div class="row result">
    <h4><a href="…pdf">Drucksache Nr. 19/<NR> vom DD.MM.YYYY</a></h4>
    <p>Antrag <FRAKTION>[, <FRAKTION2>]</p>
    <h5><strong>TITLE</strong></h5>
  </div>

Drucksachen-Lookup: q=<drucksache> matched die Nummer im Volltext und
liefert sie als einzigen Hit — wie bei SL und HB, kein dedizierter
GetById-Endpoint nötig.

Smoke-Test im Container:

  search("Schule", 5) → 5 Anträge in WP19 (SPD/FW-BAYERN+CSU/GRÜNE/AfD/AfD)
  get_document(19/11388) → match
  download_text(19/11388) → 4694 chars echter Antrags-Volltext
  search("", 5) → 5 newest Anträge mit korrektem date-DESC sort

Free-Voters-Disambiguation funktioniert über den #55 Parteinamen-Mapper:
"FREIE WÄHLER" auf Bayerns Liste wird zu "FW-BAYERN" canonicalized
(separat von "FREIE WÄHLER" in RP und "BVB-FW" in BB).

Tests: 185/185 grün.

UI-Aktivierung erfolgt separat in #35 (blockiert auf diesem Commit
und auf den BY-WP19-Wahlprogrammen — CSU, GRÜNE, AfD, SPD, FDP, FW).

Refs: #23, #49 (Roadmap Phase 3)

											
										
										
											2026-04-10 01:00:47 +02:00
+								    """Adapter for Bayerischer Landtag (#23) — TYPO3-Solr HTML scraping.
 								    Backend ist eine TYPO3-Site mit ext-solr-Suche unter
 								    ``/parlament/dokumente/drucksachen``. Server-side rendering, keine
 								    SPA, keine API. Reverse-engineering ist trivial — die Drucksachen-
 								    Liste hat ein stabiles HTML-Pattern und der Server akzeptiert die
 								    Filter direkt als URL-Query-Parameter.
 								    Search-URL:
 								        GET /parlament/dokumente/drucksachen?dokumentenart=Drucksache
 								            &wahlperiodeid[]=19
 								            &q=<volltext>
 								            &sort=date
 								            &anzahl_treffer=100
 								            &page=<n>
 								    Response-Pattern (HTML):
 								        <div class="row result">
 								            <div class="col-12">
 								                <h4>
 								                    <a href="https://www.bayern.landtag.de/www/ElanTextAblage_WP19/Drucksachen/Basisdrucksachen/0000009000/0000009107.pdf">
 								                        Drucksache Nr. 19/11407 vom 08.04.2026
 								                    </a>
 								                </h4>
 								                <p> Antrag AfD </p>
 								                <h5><strong>Kostenloses Parken für E-Fahrzeuge…</strong></h5>
 								            </div>
 								        </div>
 								    Felder pro Eintrag:
 								      * ``Drucksache Nr. 19/<NUM> vom DD.MM.YYYY`` → drucksache + datum
 								      * ``<a href="…Basisdrucksachen/…NUM.pdf">`` → PDF-Link (Anträge)
 								        oder ``…Schriftliche Anfragen/…pdf`` für Anfragen — Anträge
 								        werden client-seitig über ``<p>Antrag …`` gefiltert
 								      * ``<p>Antrag <FRAKTION>[, <FRAKTION2>]</p>`` → typ + Fraktionen
 								      * ``<h5><strong>TITLE</strong></h5>`` → title
 								    Drucksachen-Lookup nutzt denselben Endpoint mit ``q=<drucksache>``;
 								    die Solr-Suche matcht die Nummer im Volltext und liefert sie als
 								    einzigen oder ersten Treffer.
 								    Pagination: 100 pro Page (Maximum), max 17.598 Drucksachen in WP19
 								    Stand 2026-04-10. Wir holen client-side max ``limit*5`` Anträge nach
 								    Filterung.
 								    """
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    bundesland = "BY"
 								    name = "Bayerischer Landtag"
 								    base_url = "https://www.bayern.landtag.de"
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
-												#23 BayernAdapter — TYPO3-Solr HTML scraping (Anträge in WP19)

Stub durch echten Adapter ersetzt. Recon + Implementierung in einem
Wurf, weil das Backend deutlich freundlicher ist als bei SL/NI:

- Server-side rendered HTML, keine SPA, keine Auth, keine Cookies
- TYPO3 mit ext-solr unter /parlament/dokumente/drucksachen
- Filter direkt als URL-Query-Params (q, dokumentenart, wahlperiodeid[],
  sort, anzahl_treffer, page)
- 17.598 Drucksachen in WP19, davon ~10-15% Anträge — wir holen pro
  Page 100 Hits, paginieren bis 3 Pages und filtern client-seitig auf
  <p>Antrag …</p> (analog zu SL/HE)

Pattern-Extraktion über drei Regexen aus dem stabilen result-block:

  <div class="row result">
    <h4><a href="…pdf">Drucksache Nr. 19/<NR> vom DD.MM.YYYY</a></h4>
    <p>Antrag <FRAKTION>[, <FRAKTION2>]</p>
    <h5><strong>TITLE</strong></h5>
  </div>

Drucksachen-Lookup: q=<drucksache> matched die Nummer im Volltext und
liefert sie als einzigen Hit — wie bei SL und HB, kein dedizierter
GetById-Endpoint nötig.

Smoke-Test im Container:

  search("Schule", 5) → 5 Anträge in WP19 (SPD/FW-BAYERN+CSU/GRÜNE/AfD/AfD)
  get_document(19/11388) → match
  download_text(19/11388) → 4694 chars echter Antrags-Volltext
  search("", 5) → 5 newest Anträge mit korrektem date-DESC sort

Free-Voters-Disambiguation funktioniert über den #55 Parteinamen-Mapper:
"FREIE WÄHLER" auf Bayerns Liste wird zu "FW-BAYERN" canonicalized
(separat von "FREIE WÄHLER" in RP und "BVB-FW" in BB).

Tests: 185/185 grün.

UI-Aktivierung erfolgt separat in #35 (blockiert auf diesem Commit
und auf den BY-WP19-Wahlprogrammen — CSU, GRÜNE, AfD, SPD, FDP, FW).

Refs: #23, #49 (Roadmap Phase 3)

											
										
										
											2026-04-10 01:00:47 +02:00
+								    _RE_RESULT_BLOCK = re.compile(
 								        # One search hit: captures everything inside a
 								        # <div class="row result"> up to its two closing </div> tags.
 								        r'<div class="row result">(.*?)</div>\s*</div>', re.DOTALL,
 								    )
 								    # Header text of a hit: group 1 is the Drucksache number
 								    # (e.g. "19/11407"), group 2 the date as DD.MM.YYYY.
 								    _RE_DRUCKSACHE_HEADER = re.compile(
 								        r'Drucksache\s+Nr\.\s*(\d+/\d+)\s*vom\s*(\d{2}\.\d{2}\.\d{4})',
 								        re.IGNORECASE,
 								    )
 								    # Value of an href attribute that ends in ".pdf".
 								    _RE_PDF_HREF = re.compile(r'href="([^"]+\.pdf)"')
 								    # Text content of a <p> without nested tags; per the result markup
 								    # this is "<type> <Fraktion>[, <Fraktion2>]".
 								    _RE_TYP_FRAKTION = re.compile(r'<p>\s*([^<]+?)\s*</p>')
 								    # Title text inside <h5><strong>…</strong></h5>.
 								    _RE_TITLE = re.compile(r'<h5>\s*<strong>([^<]+)</strong>\s*</h5>')
 								    def __init__(self, *, wahlperiode: int = 19):
 								        """Create an adapter bound to one legislative period.
 								        Args:
 								            wahlperiode: Number of the Wahlperiode whose documents are
 								                searched (keyword-only, defaults to 19). It is sent as
 								                the ``wahlperiodeid[]`` filter by ``_build_search_params``.
 								        """
 								        self.wahlperiode = wahlperiode
 								    @staticmethod
 								    def _datum_de_to_iso(datum_de: str) -> str:
 								        if not datum_de:
 								            return ""
 								        try:
 								            d, m, y = datum_de.split(".")
 								            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
 								        except ValueError:
 								            return ""
 								    def _parse_results(self, html: str) -> list[Drucksache]:
 								        """Extract every Antrag entry from one result page.
 								        The page mixes Anträge with written and oral questions, reports
 								        and bills; only blocks whose first ``<p>`` starts with ``Antrag``
 								        are kept. Text captured from the markup (type/Fraktion line,
 								        title, PDF href) is HTML-unescaped so that entities such as
 								        ``&amp;`` do not leak into the returned records.
 								        Args:
 								            html: Raw HTML of a search result page.
 								        Returns:
 								            One ``Drucksache`` per Antrag block, in page order. Blocks
 								            without a "Drucksache Nr. … vom …" header are skipped.
 								        """
 								        # The ``html`` parameter shadows the stdlib module of the same
 								        # name, so only the function itself is imported.
 								        from html import unescape
 								        from .parteien import extract_fraktionen
 								        results: list[Drucksache] = []
 								        for block in self._RE_RESULT_BLOCK.findall(html):
 								            m_header = self._RE_DRUCKSACHE_HEADER.search(block)
 								            if not m_header:
 								                continue
 								            drucksache = m_header.group(1)
 								            datum_iso = self._datum_de_to_iso(m_header.group(2))
 								            m_typ = self._RE_TYP_FRAKTION.search(block)
 								            typ_frak = unescape(m_typ.group(1)).strip() if m_typ else ""
 								            # Format is "<TYP> <FRAKTIONEN>": the first token is the
 								            # type, the rest the comma-separated Fraktion(en).
 								            parts = typ_frak.split(None, 1)
 								            typ = parts[0] if parts else ""
 								            fraktionen_text = parts[1] if len(parts) > 1 else ""
 								            # The same list also carries written questions, reports
 								            # etc.; only Anträge are wanted.
 								            if typ.lower() != "antrag":
 								                continue
 								            fraktionen = extract_fraktionen(
 								                fraktionen_text, bundesland="BY",
 								            )
 								            m_title = self._RE_TITLE.search(block)
 								            if m_title:
 								                title = unescape(m_title.group(1)).strip()
 								            else:
 								                # No <h5> title in the block: fall back to the number.
 								                title = f"Drucksache {drucksache}"
 								            # Collapse runs of whitespace inside the title.
 								            title = re.sub(r"\s+", " ", title)
 								            m_pdf = self._RE_PDF_HREF.search(block)
 								            # Attribute values are HTML-escaped too ("&amp;" in URLs).
 								            pdf_url = unescape(m_pdf.group(1)) if m_pdf else ""
 								            results.append(Drucksache(
 								                drucksache=drucksache,
 								                title=title,
 								                fraktionen=fraktionen,
 								                datum=datum_iso,
 								                link=pdf_url,
 								                bundesland="BY",
 								                typ=typ,
 								            ))
 								        return results
 								    def _build_search_params(self, query: str, page: int = 1) -> dict:
 								        # Bayern nutzt PHP-Style-Array-Suffix ``wahlperiodeid[]`` —
 								        # httpx codiert Listen als wiederholte Keys, wir bauen den
 								        # Param-Namen mit ``[]`` direkt in den dict-Key ein.
 								        return {
 								            "dokumentenart": "Drucksache",
 								            "wahlperiodeid[]": str(self.wahlperiode),
 								            "q": query or "",
 								            "sort": "date",
 								            "anzahl_treffer": "100",
 								            "page": str(page),
 								        }
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
-												#23 BayernAdapter — TYPO3-Solr HTML scraping (Anträge in WP19)

Stub durch echten Adapter ersetzt. Recon + Implementierung in einem
Wurf, weil das Backend deutlich freundlicher ist als bei SL/NI:

- Server-side rendered HTML, keine SPA, keine Auth, keine Cookies
- TYPO3 mit ext-solr unter /parlament/dokumente/drucksachen
- Filter direkt als URL-Query-Params (q, dokumentenart, wahlperiodeid[],
  sort, anzahl_treffer, page)
- 17.598 Drucksachen in WP19, davon ~10-15% Anträge — wir holen pro
  Page 100 Hits, paginieren bis 3 Pages und filtern client-seitig auf
  <p>Antrag …</p> (analog zu SL/HE)

Pattern-Extraktion über drei Regexen aus dem stabilen result-block:

  <div class="row result">
    <h4><a href="…pdf">Drucksache Nr. 19/<NR> vom DD.MM.YYYY</a></h4>
    <p>Antrag <FRAKTION>[, <FRAKTION2>]</p>
    <h5><strong>TITLE</strong></h5>
  </div>

Drucksachen-Lookup: q=<drucksache> matched die Nummer im Volltext und
liefert sie als einzigen Hit — wie bei SL und HB, kein dedizierter
GetById-Endpoint nötig.

Smoke-Test im Container:

  search("Schule", 5) → 5 Anträge in WP19 (SPD/FW-BAYERN+CSU/GRÜNE/AfD/AfD)
  get_document(19/11388) → match
  download_text(19/11388) → 4694 chars echter Antrags-Volltext
  search("", 5) → 5 newest Anträge mit korrektem date-DESC sort

Free-Voters-Disambiguation funktioniert über den #55 Parteinamen-Mapper:
"FREIE WÄHLER" auf Bayerns Liste wird zu "FW-BAYERN" canonicalized
(separat von "FREIE WÄHLER" in RP und "BVB-FW" in BB).

Tests: 185/185 grün.

UI-Aktivierung erfolgt separat in #35 (blockiert auf diesem Commit
und auf den BY-WP19-Wahlprogrammen — CSU, GRÜNE, AfD, SPD, FDP, FW).

Refs: #23, #49 (Roadmap Phase 3)

											
										
										
											2026-04-10 01:00:47 +02:00
+								        """Volltext-Suche über die aktuelle Wahlperiode, gefiltert auf Anträge.
 								        Sortiert newest-first (``sort=date``). Holt 1-3 Pages, je 100
 								        Hits (Antrags-Anteil ist ~10-15% des Drucksachen-Mix), client-
 								        seitig nach ``Antrag``-Typ gefiltert.
 								        """
 								        url = f"{self.base_url}/parlament/dokumente/drucksachen"
 								        results: list[Drucksache] = []
 								        seen: set[str] = set()
 								        async with httpx.AsyncClient(
 								            timeout=30,
 								            follow_redirects=True,
 								            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
 								        ) as client:
 								            for page in range(1, 4):  # max 300 raw hits → ~30-50 Anträge
 								                try:
 								                    resp = await client.get(url, params=self._build_search_params(query, page=page))
 								                except Exception:
 								                    logger.exception("BY search request error page=%d", page)
 								                    break
 								                if resp.status_code != 200:
 								                    logger.error("BY search HTTP %s page=%d", resp.status_code, page)
 								                    break
 								                page_results = self._parse_results(resp.text)
 								                if not page_results:
 								                    break
 								                for d in page_results:
 								                    if d.drucksache in seen:
 								                        continue
 								                    seen.add(d.drucksache)
 								                    results.append(d)
 								                    if len(results) >= limit:
 								                        return results
 								        return results
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
-												#23 BayernAdapter — TYPO3-Solr HTML scraping (Anträge in WP19)

Stub durch echten Adapter ersetzt. Recon + Implementierung in einem
Wurf, weil das Backend deutlich freundlicher ist als bei SL/NI:

- Server-side rendered HTML, keine SPA, keine Auth, keine Cookies
- TYPO3 mit ext-solr unter /parlament/dokumente/drucksachen
- Filter direkt als URL-Query-Params (q, dokumentenart, wahlperiodeid[],
  sort, anzahl_treffer, page)
- 17.598 Drucksachen in WP19, davon ~10-15% Anträge — wir holen pro
  Page 100 Hits, paginieren bis 3 Pages und filtern client-seitig auf
  <p>Antrag …</p> (analog zu SL/HE)

Pattern-Extraktion über drei Regexen aus dem stabilen result-block:

  <div class="row result">
    <h4><a href="…pdf">Drucksache Nr. 19/<NR> vom DD.MM.YYYY</a></h4>
    <p>Antrag <FRAKTION>[, <FRAKTION2>]</p>
    <h5><strong>TITLE</strong></h5>
  </div>

Drucksachen-Lookup: q=<drucksache> matched die Nummer im Volltext und
liefert sie als einzigen Hit — wie bei SL und HB, kein dedizierter
GetById-Endpoint nötig.

Smoke-Test im Container:

  search("Schule", 5) → 5 Anträge in WP19 (SPD/FW-BAYERN+CSU/GRÜNE/AfD/AfD)
  get_document(19/11388) → match
  download_text(19/11388) → 4694 chars echter Antrags-Volltext
  search("", 5) → 5 newest Anträge mit korrektem date-DESC sort

Free-Voters-Disambiguation funktioniert über den #55 Parteinamen-Mapper:
"FREIE WÄHLER" auf Bayerns Liste wird zu "FW-BAYERN" canonicalized
(separat von "FREIE WÄHLER" in RP und "BVB-FW" in BB).

Tests: 185/185 grün.

UI-Aktivierung erfolgt separat in #35 (blockiert auf diesem Commit
und auf den BY-WP19-Wahlprogrammen — CSU, GRÜNE, AfD, SPD, FDP, FW).

Refs: #23, #49 (Roadmap Phase 3)

											
										
										
											2026-04-10 01:00:47 +02:00
+								        """Direktes Lookup via ``q=<drucksache>``. Solr-Volltext matcht
 								        die Drucksachen-Nummer und liefert sie als einzigen Hit zurück."""
 								        url = f"{self.base_url}/parlament/dokumente/drucksachen"
 								        params = self._build_search_params(drucksache, page=1)
 								        async with httpx.AsyncClient(
 								            timeout=30, follow_redirects=True,
 								            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
 								        ) as client:
 								            try:
 								                resp = await client.get(url, params=params)
 								            except Exception:
 								                logger.exception("BY get_document request error for %s", drucksache)
 								                return None
 								        if resp.status_code != 200:
 								            return None
 								        for d in self._parse_results(resp.text):
 								            if d.drucksache == drucksache:
 								                return d
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								        return None
-												Add PortalaAdapter for PADOKA / Sachsen-Anhalt (#2)

Adds a clean-room PortalaAdapter that talks to the eUI/portala framework
behind PADOKA (Landtag Sachsen-Anhalt). Same engine powers Berlin's
PARDOK; the same adapter will serve issue #3 once activated for BE.

Reverse-engineering notes
- The "PADOKA = StarWeb" assumption from issue #1 / dokukratie's st.yml
  is outdated. The Sachsen-Anhalt portal was migrated to the same
  eUI/portala SPA framework Berlin uses. The legacy starweb URL returns
  503; the new entry point is /portal/browse.tt.html.
- Search workflow is two-stage:
  1. POST /portal/browse.tt.json with a JSON action body containing an
     Elasticsearch-style query tree under search.json. Returns a
     report_id plus hit count.
  2. POST /portal/report.tt.html with {report_id, start, chunksize}
     returns the HTML hit list. Each record carries a Perl Data::Dumper
     block in a <pre> tag with the canonical metadata.
- The query schema (sources, search.lines, search.json tree, report
  block) is taken from dokukratie/scrapers/portala.query.json (GPL-3.0)
  — only structure/selectors are reused, no Python code is ported.
- DB id is "lsa.lissh"; the server validates this and rejects unknown
  interfaces with an explicit errormsg.
- PDFs live under /files/drs/wp{N}/drs/d{nr}{xxx}.pdf and are served
  directly without any session cookie.

What the adapter does
- search() builds a date-window query (last ~24 months) for "Antrag"
  document type and returns the most recent hits. The user's free-text
  query is applied as a client-side title/Urheber filter (no fulltext
  search server-side yet — see "Limitations" below).
- Hits are parsed from the Perl record dumps in the report HTML:
  - WEV06.main → title (Perl \x{xx} hex escapes decoded)
  - WEV32.5   → relative PDF path
  - WEV32.main → "Antrag <Urheber> <DD.MM.YYYY> Drucksache <b>X/YYYY</b>"
- Fraktion strings are normalised to canonical codes (CDU, SPD, GRÜNE,
  FDP, AfD, LINKE, Landesregierung).
- get_document() looks up a single Drucksache by re-running the search.
- download_text() fetches the PDF and extracts text via PyMuPDF.
- bundeslaender.py: LSA's doku_system corrected from "StarWeb" to
  "PARDOK", anmerkung updated with the migration story.

Limitations (deliberate, MVP)
- No server-side full-text search. The portala framework's sf index
  names for LSA full-text content are not yet known; tree mutations
  with sf=alAB return 0 hits. Client-side filter is "good enough" for
  the next ~24 months of Anträge (≈few hundred per WP).
- LSA is still aktiv=False in bundeslaender.py — the adapter is dormant
  in production until issue #2's wahlprogramm ingest and frontend
  activation land.

Verified live against padoka.landtag.sachsen-anhalt.de:
- search(query="", limit=5) returned 5 current Anträge from März 2026
  (LINKE + GRÜNE) with correct dates, fractions, titles and PDF URLs.
- download_text("8/6790") returned 5051 chars of real Antragstext
  ("ICE-Halt für Salzwedel dauerhaft erhalten").

Refs #2.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 21:50:23 +02:00
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    async def download_text(self, drucksache: str) -> Optional[str]:
-												#23 BayernAdapter — TYPO3-Solr HTML scraping (Anträge in WP19)

Stub durch echten Adapter ersetzt. Recon + Implementierung in einem
Wurf, weil das Backend deutlich freundlicher ist als bei SL/NI:

- Server-side rendered HTML, keine SPA, keine Auth, keine Cookies
- TYPO3 mit ext-solr unter /parlament/dokumente/drucksachen
- Filter direkt als URL-Query-Params (q, dokumentenart, wahlperiodeid[],
  sort, anzahl_treffer, page)
- 17.598 Drucksachen in WP19, davon ~10-15% Anträge — wir holen pro
  Page 100 Hits, paginieren bis 3 Pages und filtern client-seitig auf
  <p>Antrag …</p> (analog zu SL/HE)

Pattern-Extraktion über drei Regexen aus dem stabilen result-block:

  <div class="row result">
    <h4><a href="…pdf">Drucksache Nr. 19/<NR> vom DD.MM.YYYY</a></h4>
    <p>Antrag <FRAKTION>[, <FRAKTION2>]</p>
    <h5><strong>TITLE</strong></h5>
  </div>

Drucksachen-Lookup: q=<drucksache> matched die Nummer im Volltext und
liefert sie als einzigen Hit — wie bei SL und HB, kein dedizierter
GetById-Endpoint nötig.

Smoke-Test im Container:

  search("Schule", 5) → 5 Anträge in WP19 (SPD/FW-BAYERN+CSU/GRÜNE/AfD/AfD)
  get_document(19/11388) → match
  download_text(19/11388) → 4694 chars echter Antrags-Volltext
  search("", 5) → 5 newest Anträge mit korrektem date-DESC sort

Free-Voters-Disambiguation funktioniert über den #55 Parteinamen-Mapper:
"FREIE WÄHLER" auf Bayerns Liste wird zu "FW-BAYERN" canonicalized
(separat von "FREIE WÄHLER" in RP und "BVB-FW" in BB).

Tests: 185/185 grün.

UI-Aktivierung erfolgt separat in #35 (blockiert auf diesem Commit
und auf den BY-WP19-Wahlprogrammen — CSU, GRÜNE, AfD, SPD, FDP, FW).

Refs: #23, #49 (Roadmap Phase 3)

											
										
										
											2026-04-10 01:00:47 +02:00
+								        """Download das Antrags-PDF und extrahiere Volltext."""
 								        import fitz
 								        doc = await self.get_document(drucksache)
 								        if doc is None or not doc.link:
 								            return None
 								        async with httpx.AsyncClient(
 								            timeout=60, follow_redirects=True,
 								            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
 								        ) as client:
 								            try:
 								                resp = await client.get(doc.link)
 								                if resp.status_code != 200:
 								                    logger.error("BY PDF HTTP %s for %s", resp.status_code, drucksache)
 								                    return None
 								                pdf = fitz.open(stream=resp.content, filetype="pdf")
 								                text = ""
 								                for page in pdf:
 								                    text += page.get_text()
 								                pdf.close()
 								                return text
 								            except Exception:
 								                logger.exception("BY download error for %s", drucksache)
 								                return None
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
-												Activate Baden-Württemberg via PARLISAdapter (#29, Phase 1)

PARLIS auf parlis.landtag-bw.de läuft technisch auf demselben
eUI-Backend wie LSA-PADOKA und BE-PARDOK, hat aber drei wichtige
Unterschiede, die eine eigene Klasse statt einer PortalaAdapter-
Subklasse rechtfertigen:

1. Body-Schema: minimales lines mit l1/l2/l3/l4 (statt LSA/BE
   2/3/4/10/11/20.x/90.x), serverrecordname=vorgang,
   format=suchergebnis-vorgang-full, sort=SORT01/D SORT02/D SORT03,
   keine parsed/json-Felder. Quelle: dokukratie/scrapers/portala.query.bw.json
   plus HAR-Verifikation gegen die Live-Instanz.

2. Async polling: die initiale SearchAndDisplay-Antwort liefert nur
   search_id mit status=running, KEINE report_id. Erst eine zweite
   SearchAndDisplay-Anfrage mit id=<search_id> (ohne search-Component)
   bekommt nach 1-3 Sekunden die report_id zurück. Reverse-engineered
   aus esearch-ui.main.js requestReportOK() Z. ~1268.

3. Hit-Format: report.tt.html liefert Records als JSON-in-HTML-Comments
   <!--{"WMV33":[...],"EWBV22":[...],...}-->. Komplett anderes Format
   als LSA Perl-Dump oder BE HTML-Cards. Felder:
     - EWBV22: "Drucksache 17/10323"
     - EWBD05: direkter PDF-URL
     - WMV33: Schlagworte (joined by ;)
     - WMV30: Urheber-Kurzform
     - EWBV23: "Antrag <Urheber> <DD.MM.YYYY>"

Smoke-Test (lokal):
  BW q='':       8 hits in 17s, jüngste WP17-Anträge mit Datum + Fraktion
  BW q='Schule': 8 hits, alle wirklich Schul-bezogen (Hochschule, Grundschule,
                 Schwimmunterricht, Lehrerbedarf etc.)
  BW q='Klima':  8 hits, Klimaschutz/CO2/Energieberatung
  get_document(17/10323): roundtrip funktioniert

bundeslaender.py: aktiv=True für BW; Anmerkung erweitert mit
PARLISAdapter-Verweis und drei-Unterschiede-Hinweis für künftige
Wartung. Test test_four_active_bundeslaender umbenannt zu
test_active_bundeslaender_include_phase_1_set, prüft jetzt nur
Subset-Bedingung statt exakter Count, damit Phase-1/2-Erweiterungen
keine Test-Updates brauchen.

Phase 1 (1/3) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 23:38:04 +02:00
+								class PARLISAdapter(ParlamentAdapter):
 								    """Adapter for Baden-Württemberg's PARLIS — eUI/portala-Variante mit
 								    polling und JSON-in-HTML-Comment-Records.
 								    PARLIS auf ``parlis.landtag-bw.de`` läuft technisch auf demselben
 								    eUI-Backend wie LSA-PADOKA und BE-PARDOK, aber mit drei wichtigen
 								    Unterschieden, die eine eigene Klasse statt einer PortalaAdapter-
 								    Subklasse rechtfertigen:
 								    1. **Body-Schema:** Statt der portala/LSA-typischen ``search.lines``
 								       mit ``2/3/4/10/11/20.x/90.x``-Slots nutzt PARLIS ein viel kürzeres
 								       ``l1/l2/l3/l4`` Schema (siehe ``dokukratie/scrapers/portala.query.bw.json``).
 								       ``serverrecordname`` ist ``"vorgang"`` statt ``"sr_generic1"``,
 								       ``format`` ist ``"suchergebnis-vorgang-full"``, ``sort`` ist
 								       ``"SORT01/D SORT02/D SORT03"``. Es gibt kein ``parsed`` und kein
 								       ``json``-Tree — der Server akzeptiert das minimale Schema direkt.
 								    2. **Async polling:** Im Gegensatz zu LSA/BE liefert die initiale
 								       ``Fulltext/Search``-Antwort nur eine ``search_id`` mit
 								       ``status: "running"``, KEINE ``report_id``. Erst eine zweite
 								       ``SearchAndDisplay``-Anfrage mit ``id: <search_id>`` (und ohne
 								       ``search``-Component) bekommt die fertige ``report_id`` zurück.
 								       In meinen Live-Tests reichte ein einziger 2-Sekunden-Sleep
 								       zwischen den Calls.
 								    3. **Hit-Format:** Die ``report.tt.html``-Antwort liefert keine
 								       Perl-Dump-Blöcke (LSA) und keine Bootstrap-Card-Divs (BE),
 								       sondern **JSON-Records in HTML-Kommentaren**::
 								           <!--{"WMV33":[{"main":"Schlagworte"}],
 								                "EWBV22":[{"main":"Drucksache 17/10323"}],
 								                "EWBD05":[{"main":"https://.../17_10323.pdf"}],
 								                "EWBV23":[{"main":"Antrag Felix Herkens (GRÜNE) u. a. 16.03.2026"}],
 								                ...}-->
 								       Der Parser zieht die Comments raw raus und mappt die WMV/EWBV-
 								       Felder auf das ``Drucksache``-Dataclass.
 								    Reverse-Engineering-Quelle: ``dokukratie/scrapers/portala.query.bw.json``
 								    + Live-HAR gegen ``parlis.landtag-bw.de`` (Issue #29).
 								    """
 								    # Patterns for pulling hit records and individual fields out of
 								    # report.tt.html responses (reverse-engineered from live responses).
 								    #
 								    # Records look like ``<!--{"WMV33":[...],...}-->`` and may contain
 								    # nested ``<i>...</i>`` highlight tags inside the JSON values.
 								    # _RE_RECORD captures the JSON object as group 1. The match is
 								    # non-greedy and stops at the first ``}-->``; this relies on that
 								    # delimiter not appearing inside the JSON payload itself. DOTALL lets
 								    # ``.`` match newlines, so a record may span several lines.
 								    _RE_RECORD = re.compile(r"<!--(\{.*?\})-->", re.DOTALL)
 								    # Document number "<digits>/<digits>" (group 1) following the word
 								    # "Drucksache", e.g. "Drucksache 17/10323" -> "17/10323".
 								    _RE_DRUCKSACHE = re.compile(r"Drucksache\s+(\d+/\d+)")
 								    # German-style date D.M.YYYY with a one- or two-digit day and month
 								    # and a four-digit year (group 1).
 								    _RE_DATUM = re.compile(r"(\d{1,2}\.\d{1,2}\.\d{4})")
 								    def __init__(
 								        self,
 								        *,
 								        bundesland: str,
 								        name: str,
 								        base_url: str,
 								        wahlperiode: int,
 								        prefix: str = "/parlis",
 								        document_typ: str = "Antrag",
 								        date_window_days: int = 730,
 								        poll_attempts: int = 15,
 								        poll_interval_seconds: float = 2.0,
 								    ) -> None:
 								        """Configure a PARLIS adapter for one specific parliament instance.
 								        Args:
 								            bundesland: state code, e.g. ``"BW"``.
 								            name: human-readable label.
 								            base_url: ``https://parlis.landtag-bw.de`` (no trailing slash).
 								            wahlperiode: legislative period — feeds into ``lines.l1``.
 								            prefix: app prefix where PARLIS lives. ``/parlis`` for BW.
 								            document_typ: feeds into ``lines.l4``. The server interprets
 								                this as a German document type label like ``"Antrag"``.
 								            date_window_days: look-back window for the search range,
 								                quick-win against title-only filtering — same approach
 								                as the PortalaAdapter for LSA/BE.
 								            poll_attempts: how many times to poll for ``report_id`` before
 								                giving up. ~15 × 2s = 30s upper bound.
 								            poll_interval_seconds: sleep between poll attempts.
 								        """
 								        self.bundesland = bundesland
 								        self.name = name
 								        self.base_url = base_url.rstrip("/")
 								        self.prefix = "/" + prefix.strip("/")
 								        self.wahlperiode = wahlperiode
 								        self.document_typ = document_typ
 								        self.date_window_days = date_window_days
 								        self.poll_attempts = poll_attempts
 								        self.poll_interval_seconds = poll_interval_seconds
 								    @staticmethod
 								    def _datum_de_to_iso(datum_de: str) -> str:
 								        """DD.MM.YYYY → YYYY-MM-DD; '' for empty input."""
 								        if not datum_de:
 								            return ""
 								        try:
 								            d, m, y = datum_de.split(".")
 								            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
 								        except ValueError:
 								            return ""
-												Phase B: Parteinamen-Mapper #55 (Roadmap #59)

Zentrale `app/parteien.py` als Single Source of Truth für die Partei-
Auflösung:

- `PARTEIEN`-Tabelle mit kanonischem Key, langem Display-Namen, allen
  bekannten Aliasen, optionalem `bundesland_scope` und Government-
  Marker. 14 Einträge (CDU, CSU, SPD, GRÜNE, FDP, LINKE, AfD, BSW, SSW,
  BiW + die Freie-Wähler-Familie BVB-FW, FW-BAYERN, FW-SL und der
  generische FREIE WÄHLER-Eintrag).
- `normalize_partei(raw, *, bundesland=None)` für Single-String-Lookups
  mit Government-Vorrang und FW-Familien-Disambiguierung
- `extract_fraktionen(text, *, bundesland=None)` als Funnel für die
  vier alten Adapter-Helper. Kommagetrennte Listen, MdL-mit-Klammer-
  partei, HTML-Reste — alles fließt durch eine Stelle, mit BL-Scope-
  Filter (SSW nur in SH, BVB-FW nur in BB, etc.).
- `display_name(canonical, *, long=False)` für UI/PDF — kurze Form
  bleibt der kanonische Key, lange Form ist "BÜNDNIS 90/DIE GRÜNEN"
  statt "GRÜNE" etc.

Adapter-Migration in `app/parlamente.py`:

- Vier nahezu identische `_normalize_fraktion()`-Methoden in
  PortalaAdapter, ParLDokAdapter, StarFinderCGIAdapter, PARLISAdapter
  durch einen einzeiligen Shim ersetzt, der `extract_fraktionen` mit
  `self.bundesland` aufruft. ~120 Zeilen Duplikation entfernt.
- `@staticmethod` aufgehoben, weil wir jetzt `self.bundesland` brauchen
  für die FW-Disambiguierung — alle Aufrufer waren bereits `self._...`,
  also keine Call-Site-Änderung nötig.

`app/embeddings.py:496` Workaround-Hack entfernt:

- `partei.upper() if partei != "GRÜNE" else "GRÜNE"` durch zentralen
  `normalize_partei()`-Aufruf ersetzt — der Hack war ein Kommentarzeichen
  dafür, dass die Partei-Schreibweise irgendwo zwischen Adapter und
  Embedding-Lookup driften konnte. Mit dem Mapper ist die Schreibweise
  überall garantiert kanonisch.

Tests:

- Neue `tests/test_parteien.py` mit 52 Cases — Single-Lookup, FW-
  Disambiguierung (BVB/Bayern/Saarland/RP), Volltext-Extraktion,
  Government-Marker, Tabellen-Konsistenz
- `tests/test_parlamente.py` Test-Klasse umgeschrieben: statt der 6
  statischen `PortalaAdapter._normalize_fraktion(...)`-Tests jetzt 4
  Roundtrip-Tests über echte Adapter-Instanzen, inkl. expliziter
  BB→BVB-FW vs. RP→FREIE WÄHLER-Verifikation

157 Unit-Tests grün (105 alt + 52 neu). Backwards-kompatibel — die
kanonischen Keys sind exakt die in der DB stehenden Strings, kein
Migrations-Schritt nötig.

Refs: #55, #59 (Phase B)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 11:22:13 +02:00
+								    def _normalize_fraktion(self, text: str) -> list[str]:
 								        """Thin shim — siehe ``app.parteien.extract_fraktionen``. #55.
 								        PARLIS packt den Originator in ``EWBV23`` wie
 								        ``"Antrag Felix Herkens (GRÜNE), Saskia Frank (GRÜNE)..."``.
-												Activate Baden-Württemberg via PARLISAdapter (#29, Phase 1)

PARLIS auf parlis.landtag-bw.de läuft technisch auf demselben
eUI-Backend wie LSA-PADOKA und BE-PARDOK, hat aber drei wichtige
Unterschiede, die eine eigene Klasse statt einer PortalaAdapter-
Subklasse rechtfertigen:

1. Body-Schema: minimales lines mit l1/l2/l3/l4 (statt LSA/BE
   2/3/4/10/11/20.x/90.x), serverrecordname=vorgang,
   format=suchergebnis-vorgang-full, sort=SORT01/D SORT02/D SORT03,
   keine parsed/json-Felder. Quelle: dokukratie/scrapers/portala.query.bw.json
   plus HAR-Verifikation gegen die Live-Instanz.

2. Async polling: die initiale SearchAndDisplay-Antwort liefert nur
   search_id mit status=running, KEINE report_id. Erst eine zweite
   SearchAndDisplay-Anfrage mit id=<search_id> (ohne search-Component)
   bekommt nach 1-3 Sekunden die report_id zurück. Reverse-engineered
   aus esearch-ui.main.js requestReportOK() Z. ~1268.

3. Hit-Format: report.tt.html liefert Records als JSON-in-HTML-Comments
   <!--{"WMV33":[...],"EWBV22":[...],...}-->. Komplett anderes Format
   als LSA Perl-Dump oder BE HTML-Cards. Felder:
     - EWBV22: "Drucksache 17/10323"
     - EWBD05: direkter PDF-URL
     - WMV33: Schlagworte (joined by ;)
     - WMV30: Urheber-Kurzform
     - EWBV23: "Antrag <Urheber> <DD.MM.YYYY>"

Smoke-Test (lokal):
  BW q='':       8 hits in 17s, jüngste WP17-Anträge mit Datum + Fraktion
  BW q='Schule': 8 hits, alle wirklich Schul-bezogen (Hochschule, Grundschule,
                 Schwimmunterricht, Lehrerbedarf etc.)
  BW q='Klima':  8 hits, Klimaschutz/CO2/Energieberatung
  get_document(17/10323): roundtrip funktioniert

bundeslaender.py: aktiv=True für BW; Anmerkung erweitert mit
PARLISAdapter-Verweis und drei-Unterschiede-Hinweis für künftige
Wartung. Test test_four_active_bundeslaender umbenannt zu
test_active_bundeslaender_include_phase_1_set, prüft jetzt nur
Subset-Bedingung statt exakter Count, damit Phase-1/2-Erweiterungen
keine Test-Updates brauchen.

Phase 1 (1/3) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 23:38:04 +02:00
+								        """
-												Phase B: Parteinamen-Mapper #55 (Roadmap #59)

Zentrale `app/parteien.py` als Single Source of Truth für die Partei-
Auflösung:

- `PARTEIEN`-Tabelle mit kanonischem Key, langem Display-Namen, allen
  bekannten Aliasen, optionalem `bundesland_scope` und Government-
  Marker. 14 Einträge (CDU, CSU, SPD, GRÜNE, FDP, LINKE, AfD, BSW, SSW,
  BiW + die Freie-Wähler-Familie BVB-FW, FW-BAYERN, FW-SL und der
  generische FREIE WÄHLER-Eintrag).
- `normalize_partei(raw, *, bundesland=None)` für Single-String-Lookups
  mit Government-Vorrang und FW-Familien-Disambiguierung
- `extract_fraktionen(text, *, bundesland=None)` als Funnel für die
  vier alten Adapter-Helper. Kommagetrennte Listen, MdL-mit-Klammer-
  partei, HTML-Reste — alles fließt durch eine Stelle, mit BL-Scope-
  Filter (SSW nur in SH, BVB-FW nur in BB, etc.).
- `display_name(canonical, *, long=False)` für UI/PDF — kurze Form
  bleibt der kanonische Key, lange Form ist "BÜNDNIS 90/DIE GRÜNEN"
  statt "GRÜNE" etc.

Adapter-Migration in `app/parlamente.py`:

- Vier nahezu identische `_normalize_fraktion()`-Methoden in
  PortalaAdapter, ParLDokAdapter, StarFinderCGIAdapter, PARLISAdapter
  durch einen einzeiligen Shim ersetzt, der `extract_fraktionen` mit
  `self.bundesland` aufruft. ~120 Zeilen Duplikation entfernt.
- `@staticmethod` aufgehoben, weil wir jetzt `self.bundesland` brauchen
  für die FW-Disambiguierung — alle Aufrufer waren bereits `self._...`,
  also keine Call-Site-Änderung nötig.

`app/embeddings.py:496` Workaround-Hack entfernt:

- `partei.upper() if partei != "GRÜNE" else "GRÜNE"` durch zentralen
  `normalize_partei()`-Aufruf ersetzt — der Hack war ein Kommentarzeichen
  dafür, dass die Partei-Schreibweise irgendwo zwischen Adapter und
  Embedding-Lookup driften konnte. Mit dem Mapper ist die Schreibweise
  überall garantiert kanonisch.

Tests:

- Neue `tests/test_parteien.py` mit 52 Cases — Single-Lookup, FW-
  Disambiguierung (BVB/Bayern/Saarland/RP), Volltext-Extraktion,
  Government-Marker, Tabellen-Konsistenz
- `tests/test_parlamente.py` Test-Klasse umgeschrieben: statt der 6
  statischen `PortalaAdapter._normalize_fraktion(...)`-Tests jetzt 4
  Roundtrip-Tests über echte Adapter-Instanzen, inkl. expliziter
  BB→BVB-FW vs. RP→FREIE WÄHLER-Verifikation

157 Unit-Tests grün (105 alt + 52 neu). Backwards-kompatibel — die
kanonischen Keys sind exakt die in der DB stehenden Strings, kein
Migrations-Schritt nötig.

Refs: #55, #59 (Phase B)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 11:22:13 +02:00
+								        from .parteien import extract_fraktionen
 								        return extract_fraktionen(text, bundesland=self.bundesland)
-												Activate Baden-Württemberg via PARLISAdapter (#29, Phase 1)

PARLIS auf parlis.landtag-bw.de läuft technisch auf demselben
eUI-Backend wie LSA-PADOKA und BE-PARDOK, hat aber drei wichtige
Unterschiede, die eine eigene Klasse statt einer PortalaAdapter-
Subklasse rechtfertigen:

1. Body-Schema: minimales lines mit l1/l2/l3/l4 (statt LSA/BE
   2/3/4/10/11/20.x/90.x), serverrecordname=vorgang,
   format=suchergebnis-vorgang-full, sort=SORT01/D SORT02/D SORT03,
   keine parsed/json-Felder. Quelle: dokukratie/scrapers/portala.query.bw.json
   plus HAR-Verifikation gegen die Live-Instanz.

2. Async polling: die initiale SearchAndDisplay-Antwort liefert nur
   search_id mit status=running, KEINE report_id. Erst eine zweite
   SearchAndDisplay-Anfrage mit id=<search_id> (ohne search-Component)
   bekommt nach 1-3 Sekunden die report_id zurück. Reverse-engineered
   aus esearch-ui.main.js requestReportOK() Z. ~1268.

3. Hit-Format: report.tt.html liefert Records als JSON-in-HTML-Comments
   <!--{"WMV33":[...],"EWBV22":[...],...}-->. Komplett anderes Format
   als LSA Perl-Dump oder BE HTML-Cards. Felder:
     - EWBV22: "Drucksache 17/10323"
     - EWBD05: direkter PDF-URL
     - WMV33: Schlagworte (joined by ;)
     - WMV30: Urheber-Kurzform
     - EWBV23: "Antrag <Urheber> <DD.MM.YYYY>"

Smoke-Test (lokal):
  BW q='':       8 hits in 17s, jüngste WP17-Anträge mit Datum + Fraktion
  BW q='Schule': 8 hits, alle wirklich Schul-bezogen (Hochschule, Grundschule,
                 Schwimmunterricht, Lehrerbedarf etc.)
  BW q='Klima':  8 hits, Klimaschutz/CO2/Energieberatung
  get_document(17/10323): roundtrip funktioniert

bundeslaender.py: aktiv=True für BW; Anmerkung erweitert mit
PARLISAdapter-Verweis und drei-Unterschiede-Hinweis für künftige
Wartung. Test test_four_active_bundeslaender umbenannt zu
test_active_bundeslaender_include_phase_1_set, prüft jetzt nur
Subset-Bedingung statt exakter Count, damit Phase-1/2-Erweiterungen
keine Test-Updates brauchen.

Phase 1 (1/3) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 23:38:04 +02:00
 								    def _build_initial_body(self, start_date: str, end_date: str) -> dict:
 								        """Build the first ``SearchAndDisplay`` body with the search component.
 								        The schema follows ``dokukratie/scrapers/portala.query.bw.json``
 								        verbatim — only the placeholder values are substituted.
 								        """
 								        return {
 								            "action": "SearchAndDisplay",
 								            "report": {
 								                "rhl": "main",
 								                "rhlmode": "add",
 								                "format": "suchergebnis-vorgang-full",
 								                "mime": "html",
 								                "sort": "SORT01/D SORT02/D SORT03",
 								            },
 								            "search": {
 								                "lines": {
 								                    "l1": str(self.wahlperiode),
 								                    "l2": start_date,
 								                    "l3": end_date,
 								                    "l4": self.document_typ,
 								                },
 								                "serverrecordname": "vorgang",
 								            },
 								            "sources": ["Star"],
 								        }
 								    def _build_poll_body(self, search_id: str) -> dict:
 								        """Build the polling body — same action, but with the search_id
 								        instead of a fresh search component."""
 								        return {
 								            "action": "SearchAndDisplay",
 								            "report": {
 								                "rhl": "main",
 								                "rhlmode": "add",
 								                "format": "suchergebnis-vorgang-full",
 								                "mime": "html",
 								                "sort": "SORT01/D SORT02/D SORT03",
 								            },
 								            "id": search_id,
 								            "sources": ["Star"],
 								        }
 								    def _hit_record_to_drucksache(self, record: dict) -> Optional[Drucksache]:
 								        """Map a single JSON-in-comment record to a ``Drucksache``.

 								        PARLIS record schema (reverse-engineered; every value is a list
 								        of ``{"main": ...}`` dicts):

 								        - ``EWBV22``: "Drucksache 17/10323"
 								        - ``EWBD05``: direct PDF URL
 								        - ``EWBV23``: "Antrag <Urheber> <DD.MM.YYYY>" — single combined line
 								        - ``WMV30``: short Urheber summary ("Felix Herkens (GRÜNE) u. a.")
 								        - ``WMV33``: subject keywords (Schlagworte)
 								        - ``EWBD01``: "Drucksache <X/Y> <DD.MM.YYYY>"

 								        Args:
 								            record: Decoded JSON object of one hit.

 								        Returns:
 								            The mapped ``Drucksache``, or ``None`` when ``record`` is not
 								            a dict or neither ``EWBV22`` nor ``EWBD01`` contains a
 								            "Drucksache <n>/<n>" number.
 								        """
 								        if not isinstance(record, dict):
 								            return None

 								        def first(field: str) -> str:
 								            """Return the stripped ``main`` text of the field's first entry.

 								            Yields ``""`` for a missing field and for entries that do not
 								            have the expected shape, so that one malformed record cannot
 								            raise and abort the parsing of the whole report.
 								            """
 								            block = record.get(field)
 								            if not (isinstance(block, list) and block):
 								                return ""
 								            entry = block[0]
 								            value = entry.get("main") if isinstance(entry, dict) else None
 								            return value.strip() if isinstance(value, str) else ""

 								        def without_highlight(text: str) -> str:
 								            """Drop the ``<i>``/``</i>`` highlight tags embedded in values."""
 								            return re.sub(r"</?i>", "", text).strip()

 								        ds_text = first("EWBV22") or first("EWBD01")
 								        m_ds = self._RE_DRUCKSACHE.search(ds_text)
 								        if not m_ds:
 								            return None
 								        drucksache = m_ds.group(1)
 								        ewbv23 = first("EWBV23")
 								        # The "title" we want is the Schlagworte/topic, not the
 								        # Drucksachen header: WMV33 is the closest equivalent to the
 								        # title that the LSA/BE adapters expose. Fall back to the EWBV23
 								        # line (highlight tags removed as well), then to a generic label.
 								        title = (
 								            without_highlight(first("WMV33"))
 								            or without_highlight(ewbv23)
 								            or f"Drucksache {drucksache}"
 								        )
 								        # Date out of EWBV23 ("Antrag <Urheber> <DD.MM.YYYY>").
 								        m_dat = self._RE_DATUM.search(ewbv23)
 								        datum_iso = self._datum_de_to_iso(m_dat.group(1) if m_dat else "")
 								        # Prefer the short Urheber summary; EWBV23 also names the Urheber.
 								        fraktionen = self._normalize_fraktion(first("WMV30") or ewbv23)
 								        return Drucksache(
 								            drucksache=drucksache,
 								            title=title,
 								            fraktionen=fraktionen,
 								            datum=datum_iso,
 								            link=first("EWBD05"),
 								            bundesland=self.bundesland,
 								            typ=self.document_typ,
 								        )
 								    async def _initial_search_and_poll(
 								        self, client: httpx.AsyncClient, start_date: str, end_date: str,
 								    ) -> Optional[str]:
 								        """Submit the search and wait until the server hands out a ``report_id``.

 								        Sequence: GET ``browse.tt.html`` to warm cookies, POST the
 								        initial body to ``browse.tt.json``, then — unless the answer
 								        already carries a ``report_id`` — re-POST with the returned
 								        ``search_id`` up to ``poll_attempts`` times, sleeping
 								        ``poll_interval_seconds`` before each attempt.

 								        Args:
 								            client: Open HTTP client used for all requests.
 								            start_date: Lower bound of the date range.
 								            end_date: Upper bound of the date range.

 								        Returns:
 								            The ``report_id``, or ``None`` when a POST fails or answers
 								            with a non-200 status, the initial answer has no
 								            ``search_id``, the ``Star`` source reports ``"stopped"``
 								            without a report, or the attempts are used up.
 								        """
 								        import asyncio
 								        app_root = f"{self.base_url}{self.prefix}"
 								        page_url = f"{app_root}/browse.tt.html"
 								        api_url = f"{app_root}/browse.tt.json"
 								        # Cookie warm-up. Not guarded here, so a failure propagates to
 								        # the caller.
 								        await client.get(page_url)
 								        # Kick off the search.
 								        try:
 								            initial = await client.post(
 								                api_url,
 								                json=self._build_initial_body(start_date, end_date),
 								                headers={"Referer": page_url},
 								            )
 								        except Exception:
 								            logger.exception("%s initial search request error", self.bundesland)
 								            return None
 								        if initial.status_code != 200:
 								            logger.error("%s initial search HTTP %s", self.bundesland, initial.status_code)
 								            return None
 								        payload = initial.json()
 								        report_id = payload.get("report_id")
 								        if report_id:
 								            return report_id
 								        search_id = payload.get("search_id")
 								        if not search_id:
 								            logger.error("%s no search_id in initial response: %s", self.bundesland, payload)
 								            return None
 								        # Ask again with the search_id until the report exists.
 								        for _attempt in range(self.poll_attempts):
 								            await asyncio.sleep(self.poll_interval_seconds)
 								            try:
 								                polled = await client.post(
 								                    api_url,
 								                    json=self._build_poll_body(search_id),
 								                    headers={"Referer": page_url},
 								                )
 								            except Exception:
 								                logger.exception("%s poll request error", self.bundesland)
 								                return None
 								            if polled.status_code != 200:
 								                logger.error("%s poll HTTP %s", self.bundesland, polled.status_code)
 								                return None
 								            payload = polled.json()
 								            report_id = payload.get("report_id")
 								            if report_id:
 								                return report_id
 								            star_state = payload.get("sources", {}).get("Star", {})
 								            # No report_id at this point, so a stopped search means
 								            # it finished with an empty result.
 								            if star_state.get("status") == "stopped":
 								                return None
 								        logger.warning("%s gave up polling after %d attempts", self.bundesland, self.poll_attempts)
 								        return None
 								    def _parse_report_html(self, html: str) -> list[Drucksache]:
 								        """Turn a ``report.tt.html`` response into ``Drucksache`` objects.

 								        Every hit is a JSON object inside an HTML comment. Each comment
 								        matched by ``_RE_RECORD`` is decoded and handed to
 								        ``_hit_record_to_drucksache``; comments that are not valid JSON
 								        and records that map to nothing are skipped.

 								        Args:
 								            html: Body text of the report response.

 								        Returns:
 								            The mapped documents in the order they appear in ``html``.
 								        """
 								        def decoded_records():
 								            # Yield every comment payload that parses as JSON.
 								            for match in self._RE_RECORD.finditer(html):
 								                try:
 								                    parsed = json.loads(match.group(1))
 								                except json.JSONDecodeError:
 								                    continue
 								                yield parsed

 								        mapped = (self._hit_record_to_drucksache(rec) for rec in decoded_records())
 								        return [doc for doc in mapped if doc]
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
-												Activate Baden-Württemberg via PARLISAdapter (#29, Phase 1)

PARLIS auf parlis.landtag-bw.de läuft technisch auf demselben
eUI-Backend wie LSA-PADOKA und BE-PARDOK, hat aber drei wichtige
Unterschiede, die eine eigene Klasse statt einer PortalaAdapter-
Subklasse rechtfertigen:

1. Body-Schema: minimales lines mit l1/l2/l3/l4 (statt LSA/BE
   2/3/4/10/11/20.x/90.x), serverrecordname=vorgang,
   format=suchergebnis-vorgang-full, sort=SORT01/D SORT02/D SORT03,
   keine parsed/json-Felder. Quelle: dokukratie/scrapers/portala.query.bw.json
   plus HAR-Verifikation gegen die Live-Instanz.

2. Async polling: die initiale SearchAndDisplay-Antwort liefert nur
   search_id mit status=running, KEINE report_id. Erst eine zweite
   SearchAndDisplay-Anfrage mit id=<search_id> (ohne search-Component)
   bekommt nach 1-3 Sekunden die report_id zurück. Reverse-engineered
   aus esearch-ui.main.js requestReportOK() Z. ~1268.

3. Hit-Format: report.tt.html liefert Records als JSON-in-HTML-Comments
   <!--{"WMV33":[...],"EWBV22":[...],...}-->. Komplett anderes Format
   als LSA Perl-Dump oder BE HTML-Cards. Felder:
     - EWBV22: "Drucksache 17/10323"
     - EWBD05: direkter PDF-URL
     - WMV33: Schlagworte (joined by ;)
     - WMV30: Urheber-Kurzform
     - EWBV23: "Antrag <Urheber> <DD.MM.YYYY>"

Smoke-Test (lokal):
  BW q='':       8 hits in 17s, jüngste WP17-Anträge mit Datum + Fraktion
  BW q='Schule': 8 hits, alle wirklich Schul-bezogen (Hochschule, Grundschule,
                 Schwimmunterricht, Lehrerbedarf etc.)
  BW q='Klima':  8 hits, Klimaschutz/CO2/Energieberatung
  get_document(17/10323): roundtrip funktioniert

bundeslaender.py: aktiv=True für BW; Anmerkung erweitert mit
PARLISAdapter-Verweis und drei-Unterschiede-Hinweis für künftige
Wartung. Test test_four_active_bundeslaender umbenannt zu
test_active_bundeslaender_include_phase_1_set, prüft jetzt nur
Subset-Bedingung statt exakter Count, damit Phase-1/2-Erweiterungen
keine Test-Updates brauchen.

Phase 1 (1/3) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 23:38:04 +02:00
+								        """Search recent BW Anträge with optional client-side title filter.
 								        Server-side full-text is not used (#18 — einheitliches
 								        Verhalten ohne Volltext bis alle Adapter es können). The
 								        client filter looks at title (Schlagworte) + Urheber.
 								        """
 								        from datetime import date, timedelta
 								        end = date.today()
 								        start = end - timedelta(days=self.date_window_days)
 								        async with httpx.AsyncClient(
 								            timeout=60,
 								            follow_redirects=True,
 								            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
 								        ) as client:
 								            try:
 								                report_id = await self._initial_search_and_poll(
 								                    client, start.isoformat(), end.isoformat(),
 								                )
 								                if not report_id:
 								                    return []
 								                # Pull a generous chunk so the client-side filter has
 								                # enough material to work with.
 								                chunksize = max(limit * 10, 200) if query else max(limit * 2, 50)
 								                report_url = (
 								                    f"{self.base_url}{self.prefix}/report.tt.html"
 								                    f"?report_id={report_id}&start=0&chunksize={chunksize}"
 								                )
 								                resp = await client.get(
 								                    report_url,
 								                    headers={"Referer": f"{self.base_url}{self.prefix}/browse.tt.html"},
 								                )
 								                if resp.status_code != 200:
 								                    logger.error("%s report HTTP %s", self.bundesland, resp.status_code)
 								                    return []
 								                results = self._parse_report_html(resp.text)
 								            except Exception:
 								                logger.exception("%s search error", self.bundesland)
 								                return []
 								        # Client-side filter
 								        if query:
 								            terms = [t.lower() for t in query.split() if t]
 								            results = [
 								                d for d in results
 								                if all(t in f"{d.title} {' '.join(d.fraktionen)}".lower() for t in terms)
 								            ]
 								        return results[:limit]
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
-												Activate Baden-Württemberg via PARLISAdapter (#29, Phase 1)

PARLIS auf parlis.landtag-bw.de läuft technisch auf demselben
eUI-Backend wie LSA-PADOKA und BE-PARDOK, hat aber drei wichtige
Unterschiede, die eine eigene Klasse statt einer PortalaAdapter-
Subklasse rechtfertigen:

1. Body-Schema: minimales lines mit l1/l2/l3/l4 (statt LSA/BE
   2/3/4/10/11/20.x/90.x), serverrecordname=vorgang,
   format=suchergebnis-vorgang-full, sort=SORT01/D SORT02/D SORT03,
   keine parsed/json-Felder. Quelle: dokukratie/scrapers/portala.query.bw.json
   plus HAR-Verifikation gegen die Live-Instanz.

2. Async polling: die initiale SearchAndDisplay-Antwort liefert nur
   search_id mit status=running, KEINE report_id. Erst eine zweite
   SearchAndDisplay-Anfrage mit id=<search_id> (ohne search-Component)
   bekommt nach 1-3 Sekunden die report_id zurück. Reverse-engineered
   aus esearch-ui.main.js requestReportOK() Z. ~1268.

3. Hit-Format: report.tt.html liefert Records als JSON-in-HTML-Comments
   <!--{"WMV33":[...],"EWBV22":[...],...}-->. Komplett anderes Format
   als LSA Perl-Dump oder BE HTML-Cards. Felder:
     - EWBV22: "Drucksache 17/10323"
     - EWBD05: direkter PDF-URL
     - WMV33: Schlagworte (joined by ;)
     - WMV30: Urheber-Kurzform
     - EWBV23: "Antrag <Urheber> <DD.MM.YYYY>"

Smoke-Test (lokal):
  BW q='':       8 hits in 17s, jüngste WP17-Anträge mit Datum + Fraktion
  BW q='Schule': 8 hits, alle wirklich Schul-bezogen (Hochschule, Grundschule,
                 Schwimmunterricht, Lehrerbedarf etc.)
  BW q='Klima':  8 hits, Klimaschutz/CO2/Energieberatung
  get_document(17/10323): roundtrip funktioniert

bundeslaender.py: aktiv=True für BW; Anmerkung erweitert mit
PARLISAdapter-Verweis und drei-Unterschiede-Hinweis für künftige
Wartung. Test test_four_active_bundeslaender umbenannt zu
test_active_bundeslaender_include_phase_1_set, prüft jetzt nur
Subset-Bedingung statt exakter Count, damit Phase-1/2-Erweiterungen
keine Test-Updates brauchen.

Phase 1 (1/3) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 23:38:04 +02:00
+								        """Look up a single Drucksache by ID via a broad browse."""
 								        results = await self.search(query="", limit=200)
 								        for doc in results:
 								            if doc.drucksache == drucksache:
 								                return doc
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								        return None
-												Activate Baden-Württemberg via PARLISAdapter (#29, Phase 1)

PARLIS auf parlis.landtag-bw.de läuft technisch auf demselben
eUI-Backend wie LSA-PADOKA und BE-PARDOK, hat aber drei wichtige
Unterschiede, die eine eigene Klasse statt einer PortalaAdapter-
Subklasse rechtfertigen:

1. Body-Schema: minimales lines mit l1/l2/l3/l4 (statt LSA/BE
   2/3/4/10/11/20.x/90.x), serverrecordname=vorgang,
   format=suchergebnis-vorgang-full, sort=SORT01/D SORT02/D SORT03,
   keine parsed/json-Felder. Quelle: dokukratie/scrapers/portala.query.bw.json
   plus HAR-Verifikation gegen die Live-Instanz.

2. Async polling: die initiale SearchAndDisplay-Antwort liefert nur
   search_id mit status=running, KEINE report_id. Erst eine zweite
   SearchAndDisplay-Anfrage mit id=<search_id> (ohne search-Component)
   bekommt nach 1-3 Sekunden die report_id zurück. Reverse-engineered
   aus esearch-ui.main.js requestReportOK() Z. ~1268.

3. Hit-Format: report.tt.html liefert Records als JSON-in-HTML-Comments
   <!--{"WMV33":[...],"EWBV22":[...],...}-->. Komplett anderes Format
   als LSA Perl-Dump oder BE HTML-Cards. Felder:
     - EWBV22: "Drucksache 17/10323"
     - EWBD05: direkter PDF-URL
     - WMV33: Schlagworte (joined by ;)
     - WMV30: Urheber-Kurzform
     - EWBV23: "Antrag <Urheber> <DD.MM.YYYY>"

Smoke-Test (lokal):
  BW q='':       8 hits in 17s, jüngste WP17-Anträge mit Datum + Fraktion
  BW q='Schule': 8 hits, alle wirklich Schul-bezogen (Hochschule, Grundschule,
                 Schwimmunterricht, Lehrerbedarf etc.)
  BW q='Klima':  8 hits, Klimaschutz/CO2/Energieberatung
  get_document(17/10323): roundtrip funktioniert

bundeslaender.py: aktiv=True für BW; Anmerkung erweitert mit
PARLISAdapter-Verweis und drei-Unterschiede-Hinweis für künftige
Wartung. Test test_four_active_bundeslaender umbenannt zu
test_active_bundeslaender_include_phase_1_set, prüft jetzt nur
Subset-Bedingung statt exakter Count, damit Phase-1/2-Erweiterungen
keine Test-Updates brauchen.

Phase 1 (1/3) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 23:38:04 +02:00
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    async def download_text(self, drucksache: str) -> Optional[str]:
-												Activate Baden-Württemberg via PARLISAdapter (#29, Phase 1)

PARLIS auf parlis.landtag-bw.de läuft technisch auf demselben
eUI-Backend wie LSA-PADOKA und BE-PARDOK, hat aber drei wichtige
Unterschiede, die eine eigene Klasse statt einer PortalaAdapter-
Subklasse rechtfertigen:

1. Body-Schema: minimales lines mit l1/l2/l3/l4 (statt LSA/BE
   2/3/4/10/11/20.x/90.x), serverrecordname=vorgang,
   format=suchergebnis-vorgang-full, sort=SORT01/D SORT02/D SORT03,
   keine parsed/json-Felder. Quelle: dokukratie/scrapers/portala.query.bw.json
   plus HAR-Verifikation gegen die Live-Instanz.

2. Async polling: die initiale SearchAndDisplay-Antwort liefert nur
   search_id mit status=running, KEINE report_id. Erst eine zweite
   SearchAndDisplay-Anfrage mit id=<search_id> (ohne search-Component)
   bekommt nach 1-3 Sekunden die report_id zurück. Reverse-engineered
   aus esearch-ui.main.js requestReportOK() Z. ~1268.

3. Hit-Format: report.tt.html liefert Records als JSON-in-HTML-Comments
   <!--{"WMV33":[...],"EWBV22":[...],...}-->. Komplett anderes Format
   als LSA Perl-Dump oder BE HTML-Cards. Felder:
     - EWBV22: "Drucksache 17/10323"
     - EWBD05: direkter PDF-URL
     - WMV33: Schlagworte (joined by ;)
     - WMV30: Urheber-Kurzform
     - EWBV23: "Antrag <Urheber> <DD.MM.YYYY>"

Smoke-Test (lokal):
  BW q='':       8 hits in 17s, jüngste WP17-Anträge mit Datum + Fraktion
  BW q='Schule': 8 hits, alle wirklich Schul-bezogen (Hochschule, Grundschule,
                 Schwimmunterricht, Lehrerbedarf etc.)
  BW q='Klima':  8 hits, Klimaschutz/CO2/Energieberatung
  get_document(17/10323): roundtrip funktioniert

bundeslaender.py: aktiv=True für BW; Anmerkung erweitert mit
PARLISAdapter-Verweis und drei-Unterschiede-Hinweis für künftige
Wartung. Test test_four_active_bundeslaender umbenannt zu
test_active_bundeslaender_include_phase_1_set, prüft jetzt nur
Subset-Bedingung statt exakter Count, damit Phase-1/2-Erweiterungen
keine Test-Updates brauchen.

Phase 1 (1/3) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 23:38:04 +02:00
+								        """Download the PDF for a Drucksache and extract its text."""
 								        import fitz  # PyMuPDF
 								        doc = await self.get_document(drucksache)
 								        if not doc or not doc.link:
 								            return None
 								        async with httpx.AsyncClient(
 								            timeout=60,
 								            follow_redirects=True,
 								            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
 								        ) as client:
 								            try:
 								                resp = await client.get(doc.link)
 								                if resp.status_code != 200:
 								                    logger.error(
 								                        "%s PDF HTTP %s for %s (%s)",
 								                        self.bundesland, resp.status_code, drucksache, doc.link,
 								                    )
 								                    return None
 								                pdf = fitz.open(stream=resp.content, filetype="pdf")
 								                text = ""
 								                for page in pdf:
 								                    text += page.get_text()
 								                pdf.close()
 								                return text
 								            except Exception:
 								                logger.exception("%s PDF download error for %s", self.bundesland, drucksache)
 								                return None
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
-												Phase J: SN EDAS-XML-Adapter (#26/#38) — Sachsen aktiv via XML-Export

Reaktiviert die in Phase J vertagte Adapter-Implementation: statt
ASP.NET-Postbacks zu simulieren (blockt durch __VIEWSTATE-Komplexität
plus robots.txt: Disallow: /), liest die neue ``SNEdasXmlAdapter``-
Klasse einen wöchentlich manuell aus EDAS exportierten XML-Dump.

Workflow:

1. User exportiert in der EDAS-Suchmaske mit Filter "Dokumententyp =
   Antr" einen XML-Dump (bis zu 2500 Treffer/Export, sortiert
   newest-first nach Datum)
2. Datei wird unter ``data/sn-edas-export.xml`` abgelegt (ins
   persistent volume des prod-containers)
3. ``search()``/``get_document()`` lesen die XML-Datei lokal — keine
   Server-Calls gegen edas.landtag.sachsen.de
4. ``download_text()`` resolved die echte PDF-URL on-demand über einen
   einzelnen GET gegen ``viewer_navigation.aspx`` (single GET, kein
   Postback) und holt dann das PDF von ``ws.landtag.sachsen.de/images``

XML-Schema (ISO-8859-1):

- ``<ID>`` interne EDAS-Doc-ID
- ``<Wahlperiode>``, ``<Dokumentenart>``, ``<Dokumentennummer>``
- ``<Fundstelle>`` z.B. ``"Antr CDU, BSW, SPD 01.10.2024 Drs 8/2"`` —
  enthält Typ, Urheber und Datum, parsen via Regex
- ``<Titel>`` Volltext-Titel

PDF-URL-Schema (extrahiert aus dem viewer_navigation.aspx onLoad-
Handler): ``ws.landtag.sachsen.de/images/{wp}_Drs_{nr}_{...}.pdf``
mit variablen Suffix-Komponenten — wir machen die Resolution lazy.

Mapper-Erweiterung:

- ``parteien.PARTEIEN``-Tabelle um ``BÜNDNISGRÜNE``/``Bündnisgrüne``
  ergänzt — der Sachsen-spezifische zusammengeschriebene Eigenname der
  GRÜNEN-Fraktion (sonst wären 8/2100 etc. mit leerer Fraktionen-Liste
  rausgekommen)

BL-Eintrag:

- ``SN.aktiv = True``
- ``doku_system="EDAS-XML-Export"`` (klare Klassifikation, dass es
  KEIN normaler Webcrawler ist)
- Test ``test_sn_is_eigensystem_not_parldok`` umbenannt in
  ``test_sn_uses_xml_export_not_parldok``

Live-Probe lokal:

```
search('Klima', limit=5):
  8/2100 2025-03-17 | [GRÜNE]              | Fahrradoffensive Sachsen ...
  7/192  2019-10-11 | [LINKE]              | Erste Schritte zur Klimager...
  7/2067 2020-03-19 | [CDU, SPD, GRÜNE]    | Sächsische Waldbesitzer ...
```

176 Unit-Tests grün. Container braucht beim Deploy einen XML-Upload
ins data/-Volume — separater scp-Schritt.

Refs: #26, #38, #59 (Phase J revived)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 14:39:03 +02:00
class SNEdasXmlAdapter(ParlamentAdapter):
    """Saxony adapter backed by a manual XML export from EDAS (#26/#38).

    EDAS (edas.landtag.sachsen.de) blocks autonomous crawling both via
    ``robots.txt: Disallow: /`` and via ASP.NET WebForms postbacks. The
    Saxon parliament does, however, offer an official XML export button in
    its search form that downloads up to 2500 hits as structured XML, which
    sidesteps both problems:

    - **Manual export workflow**: the user exports the document list weekly
      with the filter "Dokumententyp = Antr" and stores the file at
      ``data/sn-edas-export.xml``. The pipeline reads it locally and is
      therefore independent of the EDAS server.
    - **PDF URL extraction**: the XML provides ID, Wahlperiode,
      Dokumentennummer, Fundstelle (with faction + date) and Titel, but no
      PDF URL. The PDF URL is resolved **only in ``download_text()``** from
      the ``viewer_navigation.aspx`` frame (a single GET, no postback), so
      server load is only generated when a motion is actually analysed.

    XML schema:
    ```
    <treffer>
      <ID><![CDATA[297875]]></ID>
      <Wahlperiode><![CDATA[8]]></Wahlperiode>
      <Dokumentenart><![CDATA[Drs]]></Dokumentenart>
      <Dokumentennummer><![CDATA[2]]></Dokumentennummer>
      <Fundstelle><![CDATA[Antr CDU, BSW, SPD 01.10.2024 Drs 8/2]]></Fundstelle>
      <Titel><![CDATA[Geschäftsordnung des Sächsischen Landtags]]></Titel>
    </treffer>
    ```
    The export file is encoded as ISO-8859-1.
    """

    bundesland = "SN"
    name = "Sächsischer Landtag (EDAS-XML-Export)"
    base_url = "https://edas.landtag.sachsen.de"
    viewer_path = "/viewer/viewer_navigation.aspx"

    # Default location of the export file, relative to the webapp root. In
    # the container it is served from the mounted data/ volume, where the
    # user drops the XML file.
    DEFAULT_EXPORT_PATH = "data/sn-edas-export.xml"

    _RE_TREFFER = re.compile(r"<treffer>([\s\S]*?)</treffer>")
    _RE_FIELD = re.compile(r"<(\w+)><!\[CDATA\[(.*?)\]\]></\1>", re.DOTALL)
    _RE_FUNDSTELLE = re.compile(
        r"^(?P<typ>\S+)\s+(?P<urheber>.+?)\s+(?P<datum>\d{1,2}\.\d{1,2}\.\d{4})\s+Drs\s+\d+/\d+$"
    )
    _RE_VIEWER_PDF = re.compile(
        r"https://ws\.landtag\.sachsen\.de/images/[\w_]+\.pdf"
    )

    def __init__(self, *, export_path: Optional[str] = None):
        """Create the adapter.

        Args:
            export_path: Path to the XML export. When omitted, the file is
                looked up at ``DEFAULT_EXPORT_PATH`` relative to the parent
                directory of this module's package.
        """
        from pathlib import Path as _P

        if export_path is None:
            self.export_path = _P(__file__).resolve().parent.parent / self.DEFAULT_EXPORT_PATH
        else:
            self.export_path = _P(export_path)

    def _normalize_fraktion(self, text: str) -> list[str]:
        """Map a free-text originator string to a list of faction names."""
        from .parteien import extract_fraktionen

        return extract_fraktionen(text, bundesland=self.bundesland)

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        """Convert ``D.M.YYYY`` to ``YYYY-MM-DD``; return ``""`` if malformed."""
        try:
            d, m, y = datum_de.split(".")
            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
        except ValueError:
            return ""

    def _read_export(self) -> str:
        """Load the XML export file.

        Returns:
            The decoded file content, or an empty string when the file is
            missing or unreadable, so the adapter degrades to zero hits
            instead of raising.
        """
        try:
            return self.export_path.read_text(encoding="iso-8859-1")
        except FileNotFoundError:
            logger.warning("SN: export file not found at %s", self.export_path)
            return ""
        except OSError:
            # E.g. permission problems or the path being a directory.
            logger.exception("SN: export file not readable at %s", self.export_path)
            return ""

    def _parse_treffer(self, xml: str) -> list[Drucksache]:
        """Parse all ``<treffer>`` records of the export into Drucksache objects.

        Records without Wahlperiode or Dokumentennummer are skipped. The
        order of the export is preserved.
        """
        results: list[Drucksache] = []
        for chunk in self._RE_TREFFER.findall(xml):
            fields = dict(self._RE_FIELD.findall(chunk))
            wp = fields.get("Wahlperiode", "").strip()
            nr = fields.get("Dokumentennummer", "").strip()
            if not (wp and nr):
                continue

            # Fundstelle looks like "Antr CDU, BSW, SPD 01.10.2024 Drs 8/2";
            # originator and date are extracted from it when it matches.
            datum_iso = ""
            urheber = ""
            m = self._RE_FUNDSTELLE.match(fields.get("Fundstelle", "").strip())
            if m:
                urheber = m.group("urheber")
                datum_iso = self._datum_de_to_iso(m.group("datum"))

            # Stub link to viewer.aspx built from the record parameters. The
            # real PDF URL is resolved lazily in download_text().
            link = (
                f"{self.base_url}/parlamentsdokumentation/parlamentsarchiv/"
                f"viewer.aspx?dok_nr={nr}&dok_art=Drs&leg_per={wp}"
            )
            results.append(Drucksache(
                drucksache=f"{wp}/{nr}",
                title=fields.get("Titel", "").strip(),
                fraktionen=self._normalize_fraktion(urheber),
                datum=datum_iso,
                link=link,
                bundesland=self.bundesland,
                # Every record is reported as "Antrag"; the documented export
                # workflow filters on "Dokumententyp = Antr".
                typ="Antrag",
            ))
        return results

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Return motions from the static XML export.

        When ``query`` is non-empty, every whitespace-separated term must
        occur (case-insensitively) in the title or in the joined faction
        names. The export is already sorted newest-first (verified: first
        hit 8/2 from 01.10.2024, last hit 5/9268 from 04.06.2012).

        Args:
            query: Search terms; empty string returns unfiltered results.
            limit: Maximum number of documents to return.
        """
        xml = self._read_export()
        if not xml:
            return []
        results = self._parse_treffer(xml)
        if query:
            terms = [t.lower() for t in query.split()]

            def _matches(doc: Drucksache) -> bool:
                # Lowercase the haystacks once per document, not per term.
                title = doc.title.lower()
                fraktionen = " ".join(doc.fraktionen).lower()
                return all(t in title or t in fraktionen for t in terms)

            results = [doc for doc in results if _matches(doc)]
        return results[:limit]

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up a single document in the static export (no server call)."""
        xml = self._read_export()
        if not xml:
            return None
        for doc in self._parse_treffer(xml):
            if doc.drucksache == drucksache:
                return doc
        return None

    async def _resolve_pdf_url(
        self, client: httpx.AsyncClient, drucksache: str,
    ) -> Optional[str]:
        """Resolve the real PDF URL via the ``viewer_navigation.aspx`` frame.

        Issues a single GET (no postback) and searches the response body for
        a ``ws.landtag.sachsen.de/images/*.pdf`` URL.

        Returns:
            The PDF URL, or ``None`` if ``drucksache`` is not of the form
            ``<wp>/<nr>``, the request fails, or no URL is found.
        """
        wp, _, nr = drucksache.partition("/")
        if not (wp and nr):
            return None
        try:
            # Pass the values as params so caller-supplied IDs are
            # URL-encoded instead of being spliced into the query string.
            resp = await client.get(
                f"{self.base_url}{self.viewer_path}",
                params={"dok_nr": nr, "dok_art": "Drs", "leg_per": wp},
            )
            if resp.status_code != 200:
                return None
            m = self._RE_VIEWER_PDF.search(resp.text)
            return m.group(0) if m else None
        except Exception:
            logger.exception("SN viewer probe error for %s", drucksache)
            return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF for a Drucksache and extract its text.

        Returns:
            The concatenated text of all pages, or ``None`` if the PDF URL
            cannot be resolved, the download fails, or parsing raises.
        """
        import fitz  # PyMuPDF

        async with httpx.AsyncClient(
            timeout=60, follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            pdf_url = await self._resolve_pdf_url(client, drucksache)
            if not pdf_url:
                logger.error("SN: no PDF URL found for %s", drucksache)
                return None
            try:
                resp = await client.get(pdf_url)
                if resp.status_code != 200:
                    logger.error(
                        "SN PDF HTTP %s for %s (%s)",
                        resp.status_code, drucksache, pdf_url,
                    )
                    return None
                # The context manager closes the document even when text
                # extraction raises.
                with fitz.open(stream=resp.content, filetype="pdf") as pdf:
                    return "".join(page.get_text() for page in pdf)
            except Exception:
                logger.exception("SN PDF download error for %s", drucksache)
                return None
-												Phase I: HB PARiSHBAdapter (#21/#33) — Bremen aktiv

Schließt #21 (HB-Scraper) und #33 (UI-Aktivierung). Eigenständige
``PARiSHBAdapter``-Klasse für paris.bremische-buergerschaft.de.

Backend (HAR-Trace TEMP/paris.bremische-buergerschaft.de.har):

- Single-POST gegen ``/starweb/paris/servlet.starweb`` mit
  form-urlencoded Body
- ``path=paris/LISSHFL.web``, ``format=LISSH_BrowseVorgang_Report``
- ``01_LISSHFL_Themen=<query>`` (Volltext-Thesaurus)
- ``02_LISSHFL_PARL=S OR L`` (Stadt + Landtag in einem Rutsch)
- ``03_LISSHFL_WP=21`` (aktuelle Wahlperiode; Multi-WP-Range
  timeout-t den Server bei 60s)
- Wildcards (``*``) timeout-en ebenfalls — bei leerer Query verwenden
  wir das hochfrequente Stoppwort ``"der"`` als Catch-all

Hit-Format aus dem Single-Page-HTML:

- ``<tbody name="RecordRepeater"><tr name="Repeat_TYP">``
- Title in ``<h2><a>``
- ``Drs <b>21/730 S</b>`` mit S/L-Suffix für Stadtbürgerschaft vs
  Landtag — Drucksachen-IDs werden als ``21/730S`` (ohne Space)
  gespeichert
- ``Änderungsantrag vom 23.02.2026`` (Typ + Datum)
- Fraktionen-Liste nach ``<br/>``
- PDF-Link mit ``target="new"`` auf bremische-buergerschaft.de

Pipeline:

- ``search()`` mit client-side ``"antrag"``-Filter (analog #61),
  fängt ``"Antrag"``, ``"Änderungsantrag"`` etc.
- ``get_document()`` linearer Lookup
- ``download_text()`` PDF-via-fitz

BL-Eintrag in ``bundeslaender.py``:

- ``HB.aktiv = True``
- ``doku_system="PARiS"`` (statt der alten Klassifikation "StarWeb" —
  PARiS ist eine deutlich abweichende Servlet-Variante, kein eUI)
- ``drucksache_format="21/1234S"``
- Test ``test_hb_is_starweb_not_paris`` umbenannt in
  ``test_hb_is_paris_starweb_variant``, prüft jetzt auf "PARiS"

Live-Probe:

```
21/730S  2026-02-23 | [SPD,GRÜNE,LINKE] | Änderungsantrag | Haushaltsgesetze ...
21/1449  2025-11-05 | [SPD,GRÜNE,LINKE] | Antrag         | Finanzierung der Bremischen Häfen
21/555S  2025-06-17 | [CDU]              | Antrag         | Clima-Campus zügig beantworten
```

176 Unit-Tests grün, Live-Verifikation Sub-A im Container nach Deploy.

Refs: #21, #33, #59 (Phase I)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 14:21:49 +02:00
+								class PARiSHBAdapter(ParlamentAdapter):
 								    """Bremen-Adapter für PARiS (paris.bremische-buergerschaft.de).
 								    PARiS ist die alte Java-Servlet-Variante von StarWeb (anders als
 								    HE/starweb.hessen.de, das auf dem moderneren eUI läuft). Die Suche
 								    geht über genau einen POST-Call gegen ``/starweb/paris/servlet.starweb``
 								    mit form-urlencoded Body. Response ist ein vollständiges HTML-
 								    Ergebnis-Page mit ``<tbody name="RecordRepeater">``-Hits.
 								    Hit-Format pro ``<tr name="Repeat_TYP">``:
 								    - ``<abbr title="Bremische Stadtbürgerschaft">S</abbr>`` oder
 								      ``<abbr title="Bremischer Landtag">L</abbr>`` als Indikator
 								    - ``<h2><a>TITEL</a></h2>``
 								    - Stichworte (Thesaurus-Links, ignoriert)
 								    - ``Drs <b>21/730 S</b>`` (Drucksachen-Nr mit S/L-Suffix)
 								    - ``Änderungsantrag vom 23.02.2026`` (Typ + Datum)
 								    - ``SPD, BÜNDNIS 90/DIE GRÜNEN, Die Linke`` (Fraktionen)
 								    - ``<a href="https://www.bremische-buergerschaft.de/dokumente/...pdf">``
 								    Bremen hat zwei parallele Parlamente: Bürgerschaft (Landtag) für
 								    landespolitische Anträge und Stadtbürgerschaft für Bremens
 								    kommunale Sachen. Wir lassen beide durch (``PARL=S OR L``) — der
 								    Stadtbürgerschafts-Anteil ist für die GWÖ-Bilanzierung sogar
 								    interessanter, weil viele Entscheidungen auf kommunaler Ebene
 								    laufen.
 								    """
 								    bundesland = "HB"
 								    name = "Bremische Bürgerschaft (PARiS)"
 								    base_url = "https://paris.bremische-buergerschaft.de"
 								    servlet_path = "/starweb/paris/servlet.starweb"
 								    wahlperiode = 21
 								    # Pro-Hit-Regex über das `<tr name="Repeat_TYP">`-Pattern
 								    _RE_TR = re.compile(
 								        r'<tr\s+name="Repeat_TYP"[^>]*>([\s\S]*?)</tr\s*>',
 								        re.IGNORECASE,
 								    )
 								    _RE_TITLE = re.compile(r'<h2[^>]*>\s*<a[^>]*>(.*?)</a>', re.DOTALL)
 								    _RE_DRUCKSACHE = re.compile(r'Drs\s*<b>\s*(\d+/\d+)\s*([SL]?)\s*</b>')
 								    _RE_TYP_DATUM = re.compile(r'</b>\s*,\s*([^,<\n]+?)\s+vom\s+(\d{1,2}\.\d{1,2}\.\d{4})')
 								    _RE_FRAKTIONEN_AFTER_DATUM = re.compile(r'vom\s+\d{1,2}\.\d{1,2}\.\d{4}\s*<br\s*/?\s*>\s*([^<]+)')
 								    _RE_PDF_LINK = re.compile(
 								        r'<a\s+href="(https?://[^"]*\.pdf[^"]*)"[^>]*target="new"',
 								        re.IGNORECASE,
 								    )
 								    def _normalize_fraktion(self, text: str) -> list[str]:
 								        from .parteien import extract_fraktionen
 								        return extract_fraktionen(text, bundesland=self.bundesland)
 								    @staticmethod
 								    def _datum_de_to_iso(datum_de: str) -> str:
 								        try:
 								            d, m, y = datum_de.split(".")
 								            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
 								        except ValueError:
 								            return ""
 								    @staticmethod
 								    def _strip_html(s: str) -> str:
 								        """Entferne HTML-Tags und entities aus einem Snippet."""
 								        s = re.sub(r"<[^>]+>", "", s)
 								        s = s.replace("&ndash;", "–").replace("&nbsp;", " ")
 								        s = re.sub(r"&[a-zA-Z]+;", " ", s)
 								        return re.sub(r"\s+", " ", s).strip()
 								    def _parse_record_html(self, chunk: str) -> Optional[Drucksache]:
 								        m_ds = self._RE_DRUCKSACHE.search(chunk)
 								        if not m_ds:
 								            return None
 								        nr_only = m_ds.group(1)         # "21/730"
 								        suffix = m_ds.group(2) or ""    # "S" oder "L"
 								        # Drucksachen-ID: ohne Whitespace, mit Suffix dahinter wenn vorhanden
 								        drucksache = f"{nr_only}{suffix}" if suffix else nr_only
 								        m_t = self._RE_TITLE.search(chunk)
 								        title = self._strip_html(m_t.group(1)) if m_t else f"Drucksache {drucksache}"
 								        m_pdf = self._RE_PDF_LINK.search(chunk)
 								        pdf_url = m_pdf.group(1) if m_pdf else ""
 								        m_td = self._RE_TYP_DATUM.search(chunk)
 								        if m_td:
 								            typ = self._strip_html(m_td.group(1))
 								            datum = self._datum_de_to_iso(m_td.group(2))
 								        else:
 								            typ = "Drucksache"
 								            datum = ""
 								        m_fr = self._RE_FRAKTIONEN_AFTER_DATUM.search(chunk)
 								        urheber = self._strip_html(m_fr.group(1)) if m_fr else ""
 								        fraktionen = self._normalize_fraktion(urheber)
 								        return Drucksache(
 								            drucksache=drucksache,
 								            title=title,
 								            fraktionen=fraktionen,
 								            datum=datum,
 								            link=pdf_url,
 								            bundesland=self.bundesland,
 								            typ=typ,
 								        )
 								    def _build_form_body(self, query: str) -> dict:
 								        """Form-Body für PARiS Suche.
 								        - ``path=paris/LISSHFL.web``: die LISSH-Vorgangsdatenbank
 								        - ``format=LISSH_BrowseVorgang_Report``: Browse-Format mit
 								          allen Hits in einer Page (kein Pagination)
 								        - ``01_LISSHFL_Themen``: Thesaurus-Volltext-Suche. Der Server
 								          akzeptiert kein ``*``-Wildcard und timeout-t bei leerem Wert,
 								          deshalb verwenden wir bei leerer Query ein hochfrequentes
 								          Stoppwort als Catch-all.
 								        - ``02_LISSHFL_PARL=S OR L``: Stadtbürgerschaft + Landtag
 								        - ``03_LISSHFL_WP``: aktuelle Wahlperiode (kein Range — ein
 								          Multi-WP-Range hat im Test 60s+ gebraucht)
 								        """
 								        return {
 								            "path": "paris/LISSHFL.web",
 								            "format": "LISSH_BrowseVorgang_Report",
 								            "01_LISSHFL_Themen": query or "der",  # häufiges Stoppwort
 								            "02_LISSHFL_PARL": "S OR L",
 								            "03_LISSHFL_WP": str(self.wahlperiode),
 								        }
 								    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
 								        """Single-POST-Search gegen den PARiS-Servlet."""
 								        body = self._build_form_body(query)
 								        async with httpx.AsyncClient(
 								            timeout=60, follow_redirects=True,
 								            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
 								        ) as client:
 								            try:
 								                resp = await client.post(
 								                    f"{self.base_url}{self.servlet_path}",
 								                    data=body,
 								                    headers={"Content-Type": "application/x-www-form-urlencoded"},
 								                )
 								                if resp.status_code != 200:
 								                    logger.error("HB PARiS HTTP %s", resp.status_code)
 								                    return []
 								                results: list[Drucksache] = []
 								                for chunk in self._RE_TR.findall(resp.text):
 								                    doc = self._parse_record_html(chunk)
 								                    if not doc:
 								                        continue
 								                    if "antrag" not in (doc.typ or "").lower():
 								                        continue
 								                    results.append(doc)
 								                    if len(results) >= limit:
 								                        break
 								                return results
 								            except Exception:
 								                logger.exception("HB PARiS search error")
 								                return []
 								    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
 								        """Find one document by scanning the results of ``search()``.
 								        The ID must look like ``<digits>/<digits>`` with an optional trailing
 								        ``S`` or ``L`` (e.g. ``21/730S``); anything else yields ``None``
 								        without issuing a request. The regex acts purely as a format guard:
 								        its captured groups are not used, and the comparison below is made
 								        against the full, unmodified ``drucksache`` string.
 								        Returns:
 								            The first of up to 200 ``search("*")`` hits whose ``drucksache``
 								            equals the requested ID, otherwise ``None``.
 								        """
 								        if re.match(r"(\d+/\d+)([SL]?)$", drucksache) is None:
 								            return None
 								        candidates = await self.search("*", limit=200)
 								        return next(
 								            (hit for hit in candidates if hit.drucksache == drucksache),
 								            None,
 								        )
 								    async def download_text(self, drucksache: str) -> Optional[str]:
 								        """Download the PDF behind ``drucksache`` and return its plain text.
 								        The document is resolved via ``get_document()``; its ``link`` is
 								        fetched and the PDF text is extracted page by page with PyMuPDF.
 								        Returns:
 								            The concatenated page text, or ``None`` when the document is
 								            unknown, has no link, the download is not HTTP 200, or any
 								            exception occurs (which is logged).
 								        """
 								        import fitz
 								        doc = await self.get_document(drucksache)
 								        if not doc or not doc.link:
 								            return None
 								        async with httpx.AsyncClient(
 								            timeout=60, follow_redirects=True,
 								            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
 								        ) as client:
 								            try:
 								                resp = await client.get(doc.link)
 								                if resp.status_code != 200:
 								                    return None
 								                # The context manager closes the PDF even if a page fails to
 								                # extract; join avoids rebuilding the string for every page.
 								                with fitz.open(stream=resp.content, filetype="pdf") as pdf:
 								                    return "".join(page.get_text() for page in pdf)
 								            except Exception:
 								                logger.exception("HB PARiS PDF download error for %s", drucksache)
 								                return None
-												Phase H: HE StarWebHEAdapter (#24/#30) — Hessen aktiv

Schließt #24 (HE Card-Parser) und #36 (UI-Aktivierung). Eigenständige
``StarWebHEAdapter``-Klasse für starweb.hessen.de.

Backend-Discovery aus HAR-Trace (TEMP/starweb.hessen.de.har):

- starweb.hessen.de läuft auf einem eUI-Backend mit synchronem 2-Step-
  Flow (kein Polling wie BW PARLIS): POST ``browse.tt.json`` →
  ``report_id`` direkt in der Response → GET ``report.tt.html?
  report_id=...&start=0&chunksize=1500``
- Source: ``hlt.lis``
- Server verlangt ZWINGEND einen ``search.json``-Term-Tree, ``parsed``/
  ``sref`` allein reichen nicht. Top-NOT mit zwei Operanden:
  ``not(WP-Filter, NOWEB=X)``
- Hit-Format: Cards (``efxRecordRepeater``) mit Daten in HTML-Kommentar-
  Perl-Dumps ``<!--<pre class="dump">$VAR1 = ...</pre>-->``
- Field-Mapping: WEV01=Title, WEV02=Datum, WEV03=Typ, WEV07=PDF-URL,
  WEV08=Drucksachen-Nummer, WEV12=Urheber

Pipeline:

- ``search()`` synchron 2-Step, client-side ``"antrag"``-Filter (analog
  #61 für portala) — fängt "Dringlicher Berichtsantrag" und ähnliche
  Subtypen
- ``get_document()`` linearer Lookup über die ersten 200 Hits
- ``download_text()`` PDF-via-fitz (HE-PDF-URLs werden auf https
  upgegradet)

BL-Eintrag in ``bundeslaender.py``:

- ``HE.aktiv = True``
- ``doku_system="portala"`` (statt "StarWeb" — die /starweb/LIS-Pfade
  sind nur Legacy, das echte Backend ist /portal)
- ``doku_base_url="https://starweb.hessen.de/portal"``

ADAPTERS-Registrierung an Position vor NRW.

Live-Probe:

```
21/4157 2026-04-07 | [GRÜNE] | Dringlicher Berichtsantrag | Vorstellung, Kosten...
21/4156 2026-04-02 | [GRÜNE] | Berichtsantrag             | Schulische Prävention...
21/4136 2026-03-30 | [GRÜNE] | Dringlicher Berichtsantrag | Streichung des Schulfachs...
```

176 Unit-Tests grün, Sub-A im Container nach Deploy zu verifizieren.

Refs: #24, #30, #36, #59 (Phase H)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 14:15:35 +02:00
+								class StarWebHEAdapter(ParlamentAdapter):
 								    """Hessen-spezifischer eUI-Adapter (#24/#30).
 								    starweb.hessen.de läuft auf einem eUI-Backend mit synchronem 2-Step-
 								    Flow (anders als BW PARLIS, das asynchron pollt):
 								    1. POST ``/portal/browse.tt.json`` mit ``action=SearchAndDisplay`` →
 								       Response enthält ``report_id`` direkt
 								    2. GET ``/portal/report.tt.html?report_id=...`` → HTML mit den Hits
 								    Hit-Format: Cards mit ``efxRecordRepeater``-divs, Daten in HTML-
 								    Kommentar-Perl-Dumps (``<!--<pre class="dump">$VAR1 = ...</pre>-->``).
 								    Field-Mapping:
 								    - ``WEV01`` → Title
 								    - ``WEV02`` → Datum
 								    - ``WEV03`` → Typ
 								    - ``WEV07`` → PDF-URL
 								    - ``WEV08`` → Drucksachen-Nummer
 								    - ``WEV12`` → Urheber/Fraktion
 								    Source: ``hlt.lis`` (Hessischer Landtag), Wahlperiode 21.
 								    """
 								    # Matches one HTML comment wrapping a ``<pre class="dump">`` block and
 								    # captures everything between ``$VAR1 = `` and ``</pre>``. DOTALL lets
 								    # the capture span multiple lines.
 								    _RE_HE_COMMENT_DUMP = re.compile(
 								        r'<!--\s*<pre[^>]*class="dump"[^>]*>\s*\$VAR1 = (.*?)</pre>\s*-->',
 								        re.DOTALL,
 								    )
 								    # Each WEVnn pattern captures the quoted value of the first ``'main'``
 								    # entry of that field inside a dump. WEV02 only accepts ``D.M.YYYY``
 								    # dates and WEV08 only ``<digits>/<digits>`` numbers.
 								    # NOTE(review): ``[^\"']+`` stops at the first quote of either kind, so
 								    # a value containing an apostrophe or an embedded double quote is
 								    # captured only up to that character — confirm against real dumps.
 								    _RE_HE_WEV01 = re.compile(r"'WEV01'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
 								    _RE_HE_WEV02 = re.compile(r"'WEV02'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"'](\d{1,2}\.\d{1,2}\.\d{4})[\"']")
 								    _RE_HE_WEV03 = re.compile(r"'WEV03'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
 								    _RE_HE_WEV07 = re.compile(r"'WEV07'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
 								    _RE_HE_WEV08 = re.compile(r"'WEV08'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"'](\d+/\d+)[\"']")
 								    _RE_HE_WEV12 = re.compile(r"'WEV12'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
 								    # Adapter identity and endpoint pieces; ``portal_path`` is appended to
 								    # ``base_url`` when the request URLs are built in ``search()``.
 								    bundesland = "HE"
 								    name = "Hessischer Landtag (StarWeb)"
 								    base_url = "https://starweb.hessen.de"
 								    portal_path = "/portal"
 								    wahlperiode = 21
 								    def _normalize_fraktion(self, text: str) -> list[str]:
 								        """Delegate to ``parteien.extract_fraktionen`` using this adapter's
 								        ``bundesland`` and return its result unchanged."""
 								        from .parteien import extract_fraktionen
 								        fraktionen = extract_fraktionen(text, bundesland=self.bundesland)
 								        return fraktionen
 								    @staticmethod
 								    def _datum_de_to_iso(datum_de: str) -> str:
 								        if not datum_de:
 								            return ""
 								        try:
 								            d, m, y = datum_de.split(".")
 								            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
 								        except ValueError:
 								            return ""
 								    @staticmethod
 								    def _decode_perl_hex(text: str) -> str:
 								        """Wandle ``\\x{e9}`` → ``é`` etc. um. Robuste Hex-Substitution."""
 								        return re.sub(
 								            r"\\x\{([0-9a-fA-F]+)\}",
 								            lambda m: chr(int(m.group(1), 16)),
 								            text,
 								        )
 								    def _build_initial_body(self, query: str = "") -> dict:
 								        """HE-Server-Body. Aktuelle WP, optional Volltext-Filter.
 								        Der Server verlangt ZWINGEND einen ``search.json``-Term-Tree mit
 								        einer ``not(query, NOWEB=X)``-Wurzel. ``parsed``/``sref`` allein
 								        reichen nicht — der Server ignoriert sie und liefert nur
 								        ``facets`` zurück.
 								        """
 								        wp_str = str(self.wahlperiode)
 								        wp_term = {
 								            "tn": "term", "t": wp_str, "sf": "WP",
 								            "op": "eq", "idx": 45, "l": 3, "num": 1,
 								        }
 								        # Bauen den Top-NOT-Tree: NOT(query_subtree, NOWEB=X)
 								        if query:
 								            vtdrs_term = {
 								                "tn": "term",
 								                "t": f"\"(/VT ('\\\"{query}\\\"'))\"",
 								                "sf": "VTDRS", "op": "eq", "idx": 9, "l": 3, "num": 3,
 								            }
 								            inner = {"tn": "and", "terms": [vtdrs_term, wp_term], "num": 4}
 								            parsed = (
 								                f"((/VTDRS \"(/VT ('\\\"{query}\\\"'))\") "
 								                f"AND (/WP {wp_str})) AND NOT NOWEB=X"
 								            )
 								        else:
 								            inner = wp_term
 								            parsed = f"(/WP {wp_str}) AND NOT NOWEB=X"
 								        json_tree = [{
 								            "tn": "not",
 								            "terms": [
 								                inner,
 								                {"tn": "term", "t": "X", "sf": "NOWEB",
 								                 "op": "eq", "idx": 100, "l": 3, "num": 2},
 								            ],
 								        }]
 								        return {
 								            "action": "SearchAndDisplay",
 								            "sources": ["hlt.lis"],
 								            "report": {
 								                "rhl": "main",
 								                "rhlmode": "add",
 								                "format": "generic2-short",
 								                "mime": "html",
 								                "sort": "WPSORT/D DRSORT/D",
 								            },
 								            "search": {
 								                "lines": {"1": query, "2": wp_str},
 								                "serverrecordname": "generic2Search",
 								                "parsed": parsed,
 								                "sref": parsed,
 								                "json": json_tree,
 								            },
 								        }
 								    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
 								        """Run the synchronous two-step search against starweb.hessen.de.
 								        Step 1 POSTs the body from ``_build_initial_body(query)`` to
 								        ``browse.tt.json`` and reads ``report_id`` from the JSON response.
 								        Step 2 GETs ``report.tt.html`` for that report and parses it with
 								        ``_parse_report_html``. Hits are then filtered client-side: only
 								        documents whose ``typ`` contains "antrag" are kept, and when a query
 								        is given every whitespace-separated query term must occur in the
 								        lower-cased title plus fraktionen.
 								        Returns:
 								            At most ``limit`` documents; an empty list on a non-200
 								            response, a missing ``report_id`` or any exception (logged).
 								        """
 								        body = self._build_initial_body(query)
 								        portal = f"{self.base_url}{self.portal_path}"
 								        async with httpx.AsyncClient(
 								            timeout=60, follow_redirects=True,
 								            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
 								        ) as client:
 								            try:
 								                resp = await client.post(f"{portal}/browse.tt.json", json=body)
 								                if resp.status_code != 200:
 								                    logger.error("HE browse HTTP %s", resp.status_code)
 								                    return []
 								                data = resp.json()
 								                report_id = data.get("report_id")
 								                if not report_id:
 								                    logger.error("HE: no report_id in browse response keys=%s", sorted(data.keys()))
 								                    return []
 								                # Step 2: per the original notes the server returns only the
 								                # very first hit unless chunksize is passed. 1500 is the
 								                # floor because few hits survive the Antrag filter below.
 								                chunksize = max(limit * 30, 1500)
 								                rep = await client.get(
 								                    f"{portal}/report.tt.html",
 								                    params={
 								                        "report_id": report_id,
 								                        "start": 0,
 								                        "chunksize": chunksize,
 								                    },
 								                )
 								                if rep.status_code != 200:
 								                    logger.error("HE report HTTP %s", rep.status_code)
 								                    return []
 								                # Client-side Antrag filter
 								                results = [
 								                    d for d in self._parse_report_html(rep.text)
 								                    if "antrag" in (d.typ or "").lower()
 								                ]
 								                # Optional client-side query filter. The haystack is built
 								                # once per document instead of once per query term.
 								                qterms = query.lower().split() if query else []
 								                if qterms:
 								                    matching: list[Drucksache] = []
 								                    for d in results:
 								                        haystack = (d.title + " " + " ".join(d.fraktionen)).lower()
 								                        if all(t in haystack for t in qterms):
 								                            matching.append(d)
 								                    results = matching
 								                return results[:limit]
 								            except Exception:
 								                logger.exception("HE search error")
 								                return []
 								    def _parse_report_html(self, html: str) -> list[Drucksache]:
 								        """Turn the ``<!--<pre class="dump">$VAR1 = ...-->`` comments of a
 								        report page into ``Drucksache`` objects.
 								        Dumps without a WEV08 number are skipped. Missing fields fall back
 								        to: title ``"Drucksache <nr>"``, typ ``"Drucksache"``, empty date,
 								        empty link and no fraktionen input. ``http://`` links are rewritten
 								        to ``https://``.
 								        """
 								        from .parteien import extract_fraktionen
 								        def first_group(pattern, blob):
 								            """Return capture group 1 of ``pattern`` in ``blob`` or ``None``."""
 								            found = pattern.search(blob)
 								            return found.group(1) if found else None
 								        parsed: list[Drucksache] = []
 								        for blob in self._RE_HE_COMMENT_DUMP.findall(html):
 								            nummer = first_group(self._RE_HE_WEV08, blob)
 								            if nummer is None:
 								                continue
 								            raw_title = first_group(self._RE_HE_WEV01, blob)
 								            raw_datum = first_group(self._RE_HE_WEV02, blob)
 								            raw_typ = first_group(self._RE_HE_WEV03, blob)
 								            raw_link = first_group(self._RE_HE_WEV07, blob)
 								            raw_urheber = first_group(self._RE_HE_WEV12, blob)
 								            if raw_title is None:
 								                titel = f"Drucksache {nummer}"
 								            else:
 								                titel = self._decode_perl_hex(raw_title)
 								            link = "" if raw_link is None else raw_link
 								            if link.startswith("http://"):
 								                link = "https://" + link[len("http://"):]
 								            datum = "" if raw_datum is None else self._datum_de_to_iso(raw_datum)
 								            doc_typ = "Drucksache" if raw_typ is None else self._decode_perl_hex(raw_typ)
 								            urheber = "" if raw_urheber is None else self._decode_perl_hex(raw_urheber)
 								            parsed.append(Drucksache(
 								                drucksache=nummer,
 								                title=titel,
 								                fraktionen=extract_fraktionen(urheber, bundesland=self.bundesland),
 								                datum=datum,
 								                link=link,
 								                bundesland=self.bundesland,
 								                typ=doc_typ,
 								            ))
 								        return parsed
 								    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
 								        """Look a document up by scanning ``search("", limit=200)``.
 								        There is no direct ID filter here: the first hit whose
 								        ``drucksache`` equals the requested ID is returned, otherwise
 								        ``None``. Documents outside those hits are therefore not found.
 								        """
 								        candidates = await self.search("", limit=200)
 								        return next(
 								            (hit for hit in candidates if hit.drucksache == drucksache),
 								            None,
 								        )
 								    async def download_text(self, drucksache: str) -> Optional[str]:
 								        """Download the PDF behind ``drucksache`` and return its plain text.
 								        The document is resolved via ``get_document()``; its ``link`` is
 								        fetched and the PDF text is extracted page by page with PyMuPDF.
 								        Returns:
 								            The concatenated page text, or ``None`` when the document is
 								            unknown, has no link, the download is not HTTP 200, or any
 								            exception occurs (which is logged).
 								        """
 								        import fitz
 								        doc = await self.get_document(drucksache)
 								        if not doc or not doc.link:
 								            return None
 								        async with httpx.AsyncClient(
 								            timeout=60, follow_redirects=True,
 								            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
 								        ) as client:
 								            try:
 								                resp = await client.get(doc.link)
 								                if resp.status_code != 200:
 								                    return None
 								                # The context manager closes the PDF even if a page fails to
 								                # extract; join avoids rebuilding the string for every page.
 								                with fitz.open(stream=resp.content, filetype="pdf") as pdf:
 								                    return "".join(page.get_text() for page in pdf)
 								            except Exception:
 								                logger.exception("HE PDF download error for %s", drucksache)
 								                return None
-												Phase G: BundestagAdapter via DIP-API (#56)

Schließt #56 (Bundespolitik überprüfbar machen). Neuer
``BundestagAdapter`` in ``app/parlamente.py``, neuer ``BUND``-Eintrag in
``app/bundeslaender.py`` als 17. Parlament-Slot.

API:

- DIP-Search-API auf ``search.dip.bundestag.de/api/v1/drucksache``
- API-Key aus ``dip-config.js`` gescraped (öffentlich, klartext)
- Auth via URL-Param ``?apikey=...`` plus ``Origin: https://dip.bundestag.de``-
  Header (Origin-Locking, server-to-server-tauglich)
- Pagination via ``cursor``-Parameter, 100 Hits pro Page
- ``f.drucksachetyp=Antrag`` und ``f.wahlperiode=21`` als Server-Filter

Mapping:

- ``dokumentnummer`` → ``Drucksache.drucksache``
- ``titel`` → ``title``
- ``urheber[*].titel`` → durch ``parteien.extract_fraktionen`` zu
  ``["AfD"]``/``["GRÜNE"]``/etc. — die ``"Fraktion der AfD"``-
  Schreibweise wird vom zentralen Mapper aus #55 bereits korrekt
  geparst, kein Adapter-spezifisches Pattern nötig
- ``fundstelle.pdf_url`` → ``link``
- ``datum`` → bereits ISO ``YYYY-MM-DD``

``get_document(drucksache)`` nutzt ``f.dokumentnummer`` als direkter
Server-Filter, kein linearer Pagination-Scan.

BUND-Eintrag in ``bundeslaender.py``:

- ``code="BUND"``, ``parlament_name="Deutscher Bundestag"``,
  ``wahlperiode=21``, ``wahlperiode_start="2025-03-25"`` (Konstituierung
  21. WP nach BTW 2025), ``regierungsfraktionen=["CDU", "CSU", "SPD"]``
  (Kabinett Merz)
- ``aktiv=True`` — taucht automatisch in ``alle_bundeslaender()`` und
  ``aktive_bundeslaender()`` auf, damit die UI- und
  Auswertungs-Pipelines BUND ohne zusätzliche Sonderpfade kennen
- 17 Einträge in ``BUNDESLAENDER`` statt 16 — Tests entsprechend
  aktualisiert (``test_sixteen_bundeslaender_plus_bund``,
  ``test_alle_bundeslaender_returns_all``,
  ``test_all_wahlperioden_lists_each_bl_twice``)

Live-Probe direkt im Repo:

```
adapter: Deutscher Bundestag (DIP), wahlperiode=21
search returned 5 docs
  21/5136 2026-03-31 | ['AfD'] | Transparenz, Wirtschaftlichkeit ...
  21/5064 2026-03-27 | ['GRÜNE'] | Ausverkauf der Energieinfrastruktur ...
  21/5059 2026-03-27 | ['AfD'] | Berufsfreiheit für Selbstständige ...
get_document('21/5136') -> drucksache=21/5136
```

176 Unit-Tests grün, Live-Verifikation Sub-A im Container nach Deploy.

Refs: #56, #59 (Phase G)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 14:04:11 +02:00
+								class BundestagAdapter(ParlamentAdapter):
 								    """Adapter für den Deutschen Bundestag via DIP-API.
 								    Quelle: ``search.dip.bundestag.de/api/v1`` — die offizielle REST-API
 								    des Dokumentations- und Informationssystems (DIP). Schema dokumentiert
 								    unter https://dip.bundestag.de/über-dip/hilfe/api (SPA, Inhalt im
 								    Bundle ``main.*.chunk.js``). Auth via URL-Parameter ``apikey=...``
 								    PLUS einem ``Origin: https://dip.bundestag.de``-Header — der Server
 								    macht Origin-Locking auf seine eigene Single-Page-App.
 								    Der API-Key liegt offen in ``dip-config.js`` und wird vom DIP-Frontend
 								    bei jedem Request als URL-Parameter mitgeschickt. Solange wir den
 								    Origin-Header setzen, akzeptiert die API das auch von server-to-
 								    server-Calls.
 								    Doc-Mapping (``/api/v1/drucksache``):
 								    - ``dokumentnummer`` → ``drucksache`` (z.B. ``"21/5136"``)
 								    - ``titel`` → ``title``
 								    - ``urheber[*].bezeichnung``/``titel`` → ``fraktionen`` (durch
 								      ``parteien.extract_fraktionen`` normalisiert, deckt
 								      ``"Fraktion der AfD"`` → ``"AfD"`` ab)
 								    - ``datum`` → ``datum`` (bereits ISO YYYY-MM-DD)
 								    - ``fundstelle.pdf_url`` → ``link``
 								    - ``drucksachetyp`` → ``typ`` (Filter auf ``"Antrag"``)
 								    Pagination via ``cursor``-Parameter — der Server gibt nach jedem
 								    Result einen neuen Cursor zurück, den wir als nächsten Request
 								    mitschicken. 100 Hits pro Page, pro Wahlperiode ~600 Anträge.
 								    """
 								    bundesland = "BUND"
 								    name = "Deutscher Bundestag (DIP)"
 								    base_url = "https://search.dip.bundestag.de/api/v1"
 								    # Per the original author's note: scraped from dip-config.js (public,
 								    # plain text, sent by the DIP single-page app with every request).
 								    # Origin locking makes the key awkward to reuse elsewhere, but it is
 								    # fully functional for server-to-server calls as long as the Origin
 								    # header is set.
 								    DEFAULT_APIKEY = "SbGXhWA.3cpnNdb8rkht7iWpvSgTP8XIG88LoCrGd4"
 								    ORIGIN = "https://dip.bundestag.de"
 								    def __init__(
 								        self,
 								        *,
 								        apikey: Optional[str] = None,
 								        wahlperiode: int = 21,
 								        document_typ: str = "Antrag",
 								    ):
 								        self.apikey = apikey or self.DEFAULT_APIKEY
 								        self.wahlperiode = wahlperiode
 								        self.document_typ = document_typ
 								    def _make_client(self) -> httpx.AsyncClient:
 								        """Create an ``httpx.AsyncClient`` (30 s timeout, redirects followed)
 								        whose default headers carry the ``Origin``/``Referer`` of the DIP
 								        site plus a JSON ``Accept`` header."""
 								        default_headers = {
 								            "Origin": self.ORIGIN,
 								            "Referer": f"{self.ORIGIN}/",
 								            "User-Agent": "Mozilla/5.0 GWOE-Antragspruefer",
 								            "Accept": "application/json",
 								        }
 								        return httpx.AsyncClient(
 								            timeout=30, follow_redirects=True, headers=default_headers,
 								        )
 								    def _doc_to_drucksache(self, doc: dict) -> Optional[Drucksache]:
 								        """Map one DIP ``/drucksache`` JSON object to a ``Drucksache``.
 								        Returns ``None`` when ``dokumentnummer`` or ``fundstelle.pdf_url``
 								        is missing or empty. Optional fields are read null-safely: a key
 								        that is present with a ``null`` value is treated like a missing
 								        key, so ``title`` and ``datum`` are always strings and ``typ``
 								        falls back to ``"Antrag"``.
 								        """
 								        from .parteien import extract_fraktionen
 								        nummer = doc.get("dokumentnummer")
 								        if not nummer:
 								            return None
 								        # The PDF URL is read from the nested ``fundstelle`` object
 								        pdf_url = (doc.get("fundstelle") or {}).get("pdf_url") or ""
 								        if not pdf_url:
 								            return None
 								        # Collect the authors' display strings (``titel`` preferred over
 								        # ``bezeichnung``) and let extract_fraktionen normalise them.
 								        urheber_names = [
 								            u.get("titel") or u.get("bezeichnung") or ""
 								            for u in (doc.get("urheber") or [])
 								            if isinstance(u, dict)
 								        ]
 								        urheber_combined = ", ".join(filter(None, urheber_names))
 								        fraktionen = extract_fraktionen(urheber_combined, bundesland=self.bundesland)
 								        # ``dict.get(key, default)`` only applies the default for a missing
 								        # key; ``or`` also covers an explicit null, which would otherwise
 								        # put None into str-typed fields (search() calls title.lower()).
 								        return Drucksache(
 								            drucksache=nummer,
 								            title=doc.get("titel") or "",
 								            fraktionen=fraktionen,
 								            datum=doc.get("datum") or "",
 								            link=pdf_url,
 								            bundesland=self.bundesland,
 								            typ=doc.get("drucksachetyp") or "Antrag",
 								        )
 								    async def _fetch_page(
 								        self, client: httpx.AsyncClient, *, cursor: Optional[str] = None,
 								    ) -> tuple[list[dict], Optional[str]]:
 								        """Fetch one page from the ``/drucksache`` endpoint.
 								        The request is filtered by ``document_typ`` and ``wahlperiode``; a
 								        truthy ``cursor`` is passed through for pagination.
 								        Returns:
 								            ``(documents, cursor)`` from the JSON response, or ``([], None)``
 								            on a non-200 status or any exception (both are logged).
 								        """
 								        request_params = {
 								            "apikey": self.apikey,
 								            "f.drucksachetyp": self.document_typ,
 								            "f.wahlperiode": str(self.wahlperiode),
 								            **({"cursor": cursor} if cursor else {}),
 								        }
 								        try:
 								            response = await client.get(
 								                f"{self.base_url}/drucksache", params=request_params,
 								            )
 								            if response.status_code == 200:
 								                payload = response.json()
 								                return payload.get("documents", []), payload.get("cursor")
 								            logger.error("BUND DIP HTTP %s: %s", response.status_code, response.text[:200])
 								        except Exception:
 								            logger.exception("BUND DIP request error")
 								        return [], None
 								    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
 								        """List documents of the configured type and legislative period.
 								        Pages are fetched via ``_fetch_page`` and followed through the
 								        returned cursor until ``limit`` documents are collected, a page is
 								        empty, the cursor stops changing, or 20 pages have been read.
 								        ``query`` is applied client-side: every whitespace-separated term
 								        must occur in the lower-cased title. Duplicate document numbers
 								        are dropped.
 								        Returns:
 								            At most ``limit`` documents; an empty list for ``limit <= 0``.
 								        """
 								        # Boundary fix: the cap is only checked after an append, so limit<=0
 								        # used to fetch a page and still return one hit.
 								        if limit <= 0:
 								            return []
 								        results: list[Drucksache] = []
 								        seen: set[str] = set()
 								        # str.split() never yields empty strings, so no extra filtering
 								        query_terms = (query or "").lower().split()
 								        async with self._make_client() as client:
 								            cursor: Optional[str] = None
 								            for _ in range(20):  # hard cap: at most 20 pages per search
 								                docs, next_cursor = await self._fetch_page(client, cursor=cursor)
 								                if not docs:
 								                    break
 								                for raw in docs:
 								                    doc = self._doc_to_drucksache(raw)
 								                    if not doc or doc.drucksache in seen:
 								                        continue
 								                    seen.add(doc.drucksache)
 								                    if query_terms:
 								                        hay = doc.title.lower()
 								                        if not all(t in hay for t in query_terms):
 								                            continue
 								                    results.append(doc)
 								                    if len(results) >= limit:
 								                        return results
 								                # A missing or unchanged cursor means the last page was reached
 								                if not next_cursor or next_cursor == cursor:
 								                    break
 								                cursor = next_cursor
 								        return results
 								    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
 								        """Look up a single document by its number.
 								        Sends one request filtered by ``f.dokumentnummer`` and the
 								        configured ``f.wahlperiode`` and maps the first returned document
 								        whose ``dokumentnummer`` equals ``drucksache`` exactly.
 								        Returns:
 								            The mapped document, or ``None`` on a non-200 status, no exact
 								            match, or an exception (which is logged).
 								        """
 								        async with self._make_client() as client:
 								            try:
 								                response = await client.get(
 								                    f"{self.base_url}/drucksache",
 								                    params={
 								                        "apikey": self.apikey,
 								                        "f.dokumentnummer": drucksache,
 								                        "f.wahlperiode": str(self.wahlperiode),
 								                    },
 								                )
 								                if response.status_code != 200:
 								                    return None
 								                exact = next(
 								                    (
 								                        raw for raw in response.json().get("documents", [])
 								                        if raw.get("dokumentnummer") == drucksache
 								                    ),
 								                    None,
 								                )
 								                if exact is not None:
 								                    return self._doc_to_drucksache(exact)
 								            except Exception:
 								                logger.exception("BUND get_document error for %s", drucksache)
 								        return None
 								    async def download_text(self, drucksache: str) -> Optional[str]:
 								        """Download the document's PDF and return its plain text.
 								        The document is resolved via ``get_document()``; its ``link`` is
 								        fetched and the PDF text is extracted page by page with PyMuPDF.
 								        Returns:
 								            The concatenated page text, or ``None`` when the document is
 								            unknown, has no link, the download is not HTTP 200, or any
 								            exception occurs (which is logged).
 								        """
 								        import fitz
 								        doc = await self.get_document(drucksache)
 								        if not doc or not doc.link:
 								            return None
 								        async with httpx.AsyncClient(
 								            timeout=60, follow_redirects=True,
 								            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
 								        ) as client:
 								            try:
 								                resp = await client.get(doc.link)
 								                if resp.status_code != 200:
 								                    return None
 								                # The context manager closes the PDF even if a page fails to
 								                # extract; join avoids rebuilding the string for every page.
 								                with fitz.open(stream=resp.content, filetype="pdf") as pdf:
 								                    return "".join(page.get_text() for page in pdf)
 								            except Exception:
 								                logger.exception("BUND download error for %s", drucksache)
 								                return None
-												#19 SaarlandAdapter — Umbraco JSON-API mit Iframe-Unwrap

Reverse-Engineering aus HAR-Capture (User-Browser, /suche?searchValue=Schule):

- Endpoint: POST /umbraco/aawSearchSurfaceController/SearchSurface/GetSearchResults/
- Content-Type: application/x-www-form-urlencoded; charset=UTF-8 mit rohem
  JSON im Body (Kendo-Konvention von $.ajax ohne expliziten contentType)
- Body MUSS Sections={} und Sort={} als leere Dicts haben — sobald
  Sections.Print/etc. gesetzt sind, antwortet der Server mit HTTP 500
  (eigene Stunden in der Sackgasse, bis HAR den minimalen Body zeigte)
- Body-Schema: {Filter:{Periods:[17]}, Pageination:{Skip,Take}, Sections:{},
  Sort:{}, OnlyTitle:false, Value:<query>, CurrentSearchTab:0}

Response-Mapping (FilteredResult[*]):

- DocumentNumber → drucksache (e.g. "17/11")
- Title → title
- DocumentType → typ; client-side gefiltert auf "Antrag" (Print-Section
  enthält Anfragen + Anträge + Gesetzentwürfe gemischt, ~30-50% sind Anträge)
- Publisher (kollektive Anträge: "CDU"/"SPD") + DocumentAuthor
  (individuelle MdL: "Name, Vorname (CDU);…") via parteien.extract_fraktionen
- PublicDate (ISO mit T-Suffix) → datum (auf 10 Zeichen abgeschnitten)
- FilePath: ``/file.ashx?FileId=…&FileName=…`` ist ein HTML-Iframe-Wrapper
  (455 Bytes), nicht das PDF! Echter Binär-Endpoint ist
  ``/Downloadfile.ashx`` (Großbuchstabe!) mit denselben Query-Parametern.
  Der Wrapper ließ den ersten Smoke-Test mit "no objects found"
  scheitern; erst der Iframe-Hint im HTML hat den richtigen Endpoint verraten.

Drucksachen-Lookup nutzt ``Value=<drucksache>``: der Server matcht die
Nummer im Volltext und liefert sie zuverlässig als ersten Hit. Kein
dedizierter GetById-Endpoint vorhanden.

Smoke-Test gegen prod (im Container):
- search("Schule", limit=5) → 2 Anträge in WP17 (140 Print-Hits gesamt,
  Antrag-Filter auf 2/140 — der Rest sind Anfragen/Gesetzentwürfe):
  17/11 [CDU] "Schule als Lern- und Bildungsort weiter stärken …"
  17/419 [AfD] "Eine gute Bildungspolitik als wesentlicher Bestandteil …"
- get_document("17/11") → match
- download_text("17/11") → 3520 chars echter Antrags-Volltext (Header,
  Fraktion, Resolutionstext)

Tests: 185/185 grün (keine Regression).

UI-Aktivierung erfolgt separat in #31 (blockiert auf diesem Commit).

Refs: #19, #49 (Roadmap Phase 3)

											
										
										
											2026-04-10 00:46:02 +02:00
class SaarlandAdapter(ParlamentAdapter):
    """Adapter for the Saarland state parliament via its Umbraco JSON API (#19).

    The backend is an Umbraco/.NET SurfaceController layer behind
    ``www.landtag-saar.de``. The search page ``/suche?searchValue=…`` loads its
    results through an XHR POST against
    ``/umbraco/aawSearchSurfaceController/SearchSurface/GetSearchResults/``.

    The schema was reverse-engineered from a HAR capture (user browser, search
    for ``Schule``). Important details:

    - The Content-Type is ``application/x-www-form-urlencoded; charset=UTF-8``
      but the body is nevertheless **raw JSON** (Kendo convention of ``$.ajax``
      without an explicit ``contentType``). An ``application/json`` header works
      as well, but only with the minimal body shown below — as soon as
      ``Sections.{Print,Operations,…}`` are populated the server answers with
      HTTP 500. With ``Sections: {}`` everything is fine and the server returns
      hits across all sections.
    - Body schema::

          {
            "Filter": {"Periods": [17]},
            "Pageination": {"Skip": 0, "Take": 10},
            "Sections": {},
            "Sort": {},
            "OnlyTitle": false,
            "Value": "Schule",
            "CurrentSearchTab": 0
          }

    - Response: ``FilteredResult[]`` where each item carries ``DocumentNumber``
      (``"17/11"``), ``Legislative`` (legislative period, int), ``DocumentType``
      (``"Antrag"``/``"Anfrage"``/``"Gesetzentwurf"``/…), ``Title``,
      ``PublicDate``, ``DocumentAuthor`` (list formatted ``Name (Partei);…``),
      ``Publisher`` (parliamentary group for collective motions) and
      ``FilePath`` (relative, ``/file.ashx?FileId=…&FileName=…``).

    Filtering on ``DocumentType == "Antrag"`` happens client-side because the
    server-side sections are not granular enough (the Print section mixes
    Anfragen, Anträge and Gesetzentwürfe).

    Document lookup: ``Value="17/11"`` matches the document number directly in
    first position — a dedicated ``GetById`` endpoint does not exist.
    """

    bundesland = "SL"
    name = "Landtag des Saarlandes"
    base_url = "https://www.landtag-saar.de"

    def __init__(self, *, wahlperiode: int = 17):
        """Create an adapter bound to one legislative period (default: 17)."""
        self.wahlperiode = wahlperiode

    def _make_client(self) -> httpx.AsyncClient:
        """Return an ``httpx.AsyncClient`` preconfigured with XHR-style headers."""
        return httpx.AsyncClient(
            timeout=30,
            follow_redirects=True,
            headers={
                "User-Agent": "Mozilla/5.0 GWOE-Antragspruefer",
                "Accept": "application/json, text/javascript, */*; q=0.01",
                "X-Requested-With": "XMLHttpRequest",
                "Origin": self.base_url,
                "Referer": f"{self.base_url}/suche?searchValue=&ActiveTab=0",
            },
        )

    def _build_body(self, query: str, *, skip: int = 0, take: int = 50) -> str:
        """Build the minimal JSON request body that the server accepts.

        ``Sections={}`` and ``Sort={}`` are MANDATORY as empty objects: they
        must neither be omitted nor populated — populated sections trigger
        HTTP 500.

        Args:
            query: Search term; a falsy value is sent as the empty string.
            skip: Number of hits to skip (pagination offset).
            take: Number of hits to request.

        Returns:
            The JSON-encoded body as a string.
        """
        return json.dumps({
            "Filter": {"Periods": [self.wahlperiode]},
            # "Pageination" (sic) is the key spelling the request schema uses.
            "Pageination": {"Skip": skip, "Take": take},
            "Sections": {},
            "Sort": {},
            "OnlyTitle": False,
            "Value": query or "",
            "CurrentSearchTab": 0,
        })

    @staticmethod
    def _doc_to_drucksache(item: dict) -> Optional[Drucksache]:
        """Convert one ``FilteredResult`` item into a ``Drucksache``.

        Args:
            item: A single hit dict from the search response.

        Returns:
            The mapped ``Drucksache``, or ``None`` if the item carries no
            ``DocumentNumber``.
        """
        from .parteien import extract_fraktionen

        nummer = item.get("DocumentNumber")
        if not nummer:
            return None

        # Parliamentary groups come from Publisher (collective motions: "CDU",
        # "SPD") or DocumentAuthor (individual members:
        # "Schmitt-Lang, Jutta (CDU)"). Both are normalised through
        # extract_fraktionen.
        publisher = item.get("Publisher") or ""
        author = item.get("DocumentAuthor") or ""
        fraktionen = extract_fraktionen(
            f"{publisher} {author}".strip(), bundesland="SL",
        )

        # PublicDate has the form ``2022-05-12T00:00:00`` — keep the ISO date.
        public_date = (item.get("PublicDate") or "")[:10]

        # ``FilePath`` is ``/file.ashx?FileId=…&FileName=…`` — but that returns
        # HTML with an iframe wrapper, not the PDF itself. The real binary
        # endpoint is ``/Downloadfile.ashx`` (capital letter!) with the same
        # query parameters; the server answers there with
        # ``Content-Type: application/pdf``.
        file_path = item.get("FilePath") or ""
        if file_path.startswith("/file.ashx"):
            file_path = file_path.replace("/file.ashx", "/Downloadfile.ashx", 1)
        # Relative paths are resolved against the adapter's base URL; anything
        # else is passed through unchanged.
        if file_path.startswith("/"):
            link = f"{SaarlandAdapter.base_url}{file_path}"
        else:
            link = file_path

        return Drucksache(
            drucksache=nummer,
            # ``or ""`` also covers JSON ``null`` values, which ``.get(key, "")``
            # would pass through as ``None``.
            title=item.get("Title") or "",
            fraktionen=fraktionen,
            datum=public_date,
            link=link,
            bundesland="SL",
            typ=item.get("DocumentType") or "",
        )

    async def _post_search(
        self, client: httpx.AsyncClient, query: str, *, skip: int = 0, take: int = 50,
    ) -> list[dict]:
        """POST one search request and return the raw ``FilteredResult`` items.

        Best-effort: any non-200 status or exception is logged and results in
        an empty list instead of propagating.
        """
        url = (
            f"{self.base_url}/umbraco/aawSearchSurfaceController/"
            "SearchSurface/GetSearchResults/"
        )
        body = self._build_body(query, skip=skip, take=take)
        try:
            resp = await client.post(
                url,
                content=body,
                # The body is raw JSON despite the form-urlencoded content type.
                headers={
                    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                },
            )
            if resp.status_code != 200:
                logger.error("SL HTTP %s: %s", resp.status_code, resp.text[:200])
                return []
            data = resp.json()
            return data.get("FilteredResult", []) or []
        except Exception:
            logger.exception("SL search request error")
            return []

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Full-text search within the configured period, filtered to Anträge.

        Fetches ``max(5 * limit, 30)`` hits in a single page, filters
        client-side on ``DocumentType == "Antrag"`` (the Print section also
        contains Anfragen and Gesetzentwürfe) and truncates to ``limit``.
        The server's result order is kept as-is (relevance-based, according
        to the original implementation notes).

        Args:
            query: Search term.
            limit: Maximum number of documents to return; values ``<= 0``
                yield an empty list without contacting the server.

        Returns:
            Up to ``limit`` de-duplicated documents of type "Antrag".
        """
        if limit <= 0:
            # Without this guard the loop below would append one hit before
            # its length check fires.
            return []

        async with self._make_client() as client:
            # Request generously: the Antrag filter drops ~30-50% of the hits.
            take = max(limit * 5, 30)
            items = await self._post_search(client, query, skip=0, take=take)

        results: list[Drucksache] = []
        seen: set[str] = set()
        for item in items:
            if (item.get("DocumentType") or "").lower() != "antrag":
                continue
            doc = self._doc_to_drucksache(item)
            if doc is None or doc.drucksache in seen:
                continue
            seen.add(doc.drucksache)
            results.append(doc)
            if len(results) >= limit:
                break
        return results

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up a single document by searching for its number.

        Sends ``Value=<drucksache>`` to the search endpoint and returns the
        first of up to 20 hits whose ``DocumentNumber`` equals ``drucksache``
        exactly. Only the configured legislative period is searched.

        Returns:
            The matching document, or ``None`` if no hit matches.
        """
        async with self._make_client() as client:
            items = await self._post_search(client, drucksache, take=20)
        for item in items:
            if item.get("DocumentNumber") == drucksache:
                return self._doc_to_drucksache(item)
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Fetch the document's PDF and extract its full text.

        The PDF URL is taken from ``get_document`` (which already points at
        the ``/Downloadfile.ashx`` binary endpoint). Best-effort: HTTP errors
        and exceptions are logged and reported as ``None``.

        Returns:
            The concatenated text of all pages, or ``None`` if the document is
            unknown, has no link, or could not be downloaded/parsed.
        """
        import fitz

        doc = await self.get_document(drucksache)
        if doc is None or not doc.link:
            return None

        async with httpx.AsyncClient(
            timeout=60, follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    logger.error("SL PDF HTTP %s for %s", resp.status_code, drucksache)
                    return None
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                try:
                    return "".join(page.get_text() for page in pdf)
                finally:
                    # Release the document even if text extraction fails.
                    pdf.close()
            except Exception:
                logger.exception("SL download error for %s", drucksache)
                return None
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								# Registry of adapters
 								ADAPTERS = {
-												Phase G: BundestagAdapter via DIP-API (#56)

Schließt #56 (Bundespolitik überprüfbar machen). Neuer
``BundestagAdapter`` in ``app/parlamente.py``, neuer ``BUND``-Eintrag in
``app/bundeslaender.py`` als 17. Parlament-Slot.

API:

- DIP-Search-API auf ``search.dip.bundestag.de/api/v1/drucksache``
- API-Key aus ``dip-config.js`` gescraped (öffentlich, klartext)
- Auth via URL-Param ``?apikey=...`` plus ``Origin: https://dip.bundestag.de``-
  Header (Origin-Locking, server-to-server-tauglich)
- Pagination via ``cursor``-Parameter, 100 Hits pro Page
- ``f.drucksachetyp=Antrag`` und ``f.wahlperiode=21`` als Server-Filter

Mapping:

- ``dokumentnummer`` → ``Drucksache.drucksache``
- ``titel`` → ``title``
- ``urheber[*].titel`` → durch ``parteien.extract_fraktionen`` zu
  ``["AfD"]``/``["GRÜNE"]``/etc. — die ``"Fraktion der AfD"``-
  Schreibweise wird vom zentralen Mapper aus #55 bereits korrekt
  geparst, kein Adapter-spezifisches Pattern nötig
- ``fundstelle.pdf_url`` → ``link``
- ``datum`` → bereits ISO ``YYYY-MM-DD``

``get_document(drucksache)`` nutzt ``f.dokumentnummer`` als direkter
Server-Filter, kein linearer Pagination-Scan.

BUND-Eintrag in ``bundeslaender.py``:

- ``code="BUND"``, ``parlament_name="Deutscher Bundestag"``,
  ``wahlperiode=21``, ``wahlperiode_start="2025-03-25"`` (Konstituierung
  21. WP nach BTW 2025), ``regierungsfraktionen=["CDU", "CSU", "SPD"]``
  (Kabinett Merz)
- ``aktiv=True`` — taucht automatisch in ``alle_bundeslaender()`` und
  ``aktive_bundeslaender()`` auf, damit die UI- und
  Auswertungs-Pipelines BUND ohne zusätzliche Sonderpfade kennen
- 17 Einträge in ``BUNDESLAENDER`` statt 16 — Tests entsprechend
  aktualisiert (``test_sixteen_bundeslaender_plus_bund``,
  ``test_alle_bundeslaender_returns_all``,
  ``test_all_wahlperioden_lists_each_bl_twice``)

Live-Probe direkt im Repo:

```
adapter: Deutscher Bundestag (DIP), wahlperiode=21
search returned 5 docs
  21/5136 2026-03-31 | ['AfD'] | Transparenz, Wirtschaftlichkeit ...
  21/5064 2026-03-27 | ['GRÜNE'] | Ausverkauf der Energieinfrastruktur ...
  21/5059 2026-03-27 | ['AfD'] | Berufsfreiheit für Selbstständige ...
get_document('21/5136') -> drucksache=21/5136
```

176 Unit-Tests grün, Live-Verifikation Sub-A im Container nach Deploy.

Refs: #56, #59 (Phase G)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 14:04:11 +02:00
+								    "BUND": BundestagAdapter(),
-												Phase I: HB PARiSHBAdapter (#21/#33) — Bremen aktiv

Schließt #21 (HB-Scraper) und #33 (UI-Aktivierung). Eigenständige
``PARiSHBAdapter``-Klasse für paris.bremische-buergerschaft.de.

Backend (HAR-Trace TEMP/paris.bremische-buergerschaft.de.har):

- Single-POST gegen ``/starweb/paris/servlet.starweb`` mit
  form-urlencoded Body
- ``path=paris/LISSHFL.web``, ``format=LISSH_BrowseVorgang_Report``
- ``01_LISSHFL_Themen=<query>`` (Volltext-Thesaurus)
- ``02_LISSHFL_PARL=S OR L`` (Stadt + Landtag in einem Rutsch)
- ``03_LISSHFL_WP=21`` (aktuelle Wahlperiode; eine Multi-WP-Range
  lässt den Server nach 60s in einen Timeout laufen)
- Wildcards (``*``) führen ebenfalls zum Timeout — bei leerer Query verwenden
  wir das hochfrequente Stoppwort ``"der"`` als Catch-all

Hit-Format aus dem Single-Page-HTML:

- ``<tbody name="RecordRepeater"><tr name="Repeat_TYP">``
- Title in ``<h2><a>``
- ``Drs <b>21/730 S</b>`` mit S/L-Suffix für Stadtbürgerschaft vs
  Landtag — Drucksachen-IDs werden als ``21/730S`` (ohne Space)
  gespeichert
- ``Änderungsantrag vom 23.02.2026`` (Typ + Datum)
- Fraktionen-Liste nach ``<br/>``
- PDF-Link mit ``target="new"`` auf bremische-buergerschaft.de

Pipeline:

- ``search()`` mit client-side ``"antrag"``-Filter (analog #61),
  fängt ``"Antrag"``, ``"Änderungsantrag"`` etc.
- ``get_document()`` linearer Lookup
- ``download_text()`` PDF-via-fitz

BL-Eintrag in ``bundeslaender.py``:

- ``HB.aktiv = True``
- ``doku_system="PARiS"`` (statt der alten Klassifikation "StarWeb" —
  PARiS ist eine deutlich abweichende Servlet-Variante, kein eUI)
- ``drucksache_format="21/1234S"``
- Test ``test_hb_is_starweb_not_paris`` umbenannt in
  ``test_hb_is_paris_starweb_variant``, prüft jetzt auf "PARiS"

Live-Probe:

```
21/730S  2026-02-23 | [SPD,GRÜNE,LINKE] | Änderungsantrag | Haushaltsgesetze ...
21/1449  2025-11-05 | [SPD,GRÜNE,LINKE] | Antrag         | Finanzierung der Bremischen Häfen
21/555S  2025-06-17 | [CDU]              | Antrag         | Clima-Campus zügig beantworten
```

176 Unit-Tests grün, Live-Verifikation Sub-A im Container nach Deploy.

Refs: #21, #33, #59 (Phase I)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 14:21:49 +02:00
+								    "HB": PARiSHBAdapter(),
-												Phase H: HE StarWebHEAdapter (#24/#30) — Hessen aktiv

Schließt #24 (HE Card-Parser) und #36 (UI-Aktivierung). Eigenständige
``StarWebHEAdapter``-Klasse für starweb.hessen.de.

Backend-Discovery aus HAR-Trace (TEMP/starweb.hessen.de.har):

- starweb.hessen.de läuft auf einem eUI-Backend mit synchronem 2-Step-
  Flow (kein Polling wie BW PARLIS): POST ``browse.tt.json`` →
  ``report_id`` direkt in der Response → GET ``report.tt.html?
  report_id=...&start=0&chunksize=1500``
- Source: ``hlt.lis``
- Server verlangt ZWINGEND einen ``search.json``-Term-Tree, ``parsed``/
  ``sref`` allein reichen nicht. Top-NOT mit zwei Operanden:
  ``not(WP-Filter, NOWEB=X)``
- Hit-Format: Cards (``efxRecordRepeater``) mit Daten in HTML-Kommentar-
  Perl-Dumps ``<!--<pre class="dump">$VAR1 = ...</pre>-->``
- Field-Mapping: WEV01=Title, WEV02=Datum, WEV03=Typ, WEV07=PDF-URL,
  WEV08=Drucksachen-Nummer, WEV12=Urheber

Pipeline:

- ``search()`` synchron 2-Step, client-side ``"antrag"``-Filter (analog
  #61 für portala) — fängt "Dringlicher Berichtsantrag" und ähnliche
  Subtypen
- ``get_document()`` linearer Lookup über die ersten 200 Hits
- ``download_text()`` PDF-via-fitz (HE-PDF-URLs werden auf https
  upgegradet)

BL-Eintrag in ``bundeslaender.py``:

- ``HE.aktiv = True``
- ``doku_system="portala"`` (statt "StarWeb" — die /starweb/LIS-Pfade
  sind nur Legacy, das echte Backend ist /portal)
- ``doku_base_url="https://starweb.hessen.de/portal"``

ADAPTERS-Registrierung an Position vor NRW.

Live-Probe:

```
21/4157 2026-04-07 | [GRÜNE] | Dringlicher Berichtsantrag | Vorstellung, Kosten...
21/4156 2026-04-02 | [GRÜNE] | Berichtsantrag             | Schulische Prävention...
21/4136 2026-03-30 | [GRÜNE] | Dringlicher Berichtsantrag | Streichung des Schulfachs...
```

176 Unit-Tests grün, Sub-A im Container nach Deploy zu verifizieren.

Refs: #24, #30, #36, #59 (Phase H)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 14:15:35 +02:00
+								    "HE": StarWebHEAdapter(),
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    "NRW": NRWAdapter(),
-												Phase J: SN EDAS-XML-Adapter (#26/#38) — Sachsen aktiv via XML-Export

Reaktiviert die in Phase J vertagte Adapter-Implementation: statt
ASP.NET-Postbacks zu simulieren (blockt durch __VIEWSTATE-Komplexität
plus robots.txt: Disallow: /), liest die neue ``SNEdasXmlAdapter``-
Klasse einen wöchentlich manuell aus EDAS exportierten XML-Dump.

Workflow:

1. User exportiert in der EDAS-Suchmaske mit Filter "Dokumententyp =
   Antr" einen XML-Dump (bis zu 2500 Treffer/Export, sortiert
   newest-first nach Datum)
2. Datei wird unter ``data/sn-edas-export.xml`` abgelegt (ins
   persistent volume des prod-containers)
3. ``search()``/``get_document()`` lesen die XML-Datei lokal — keine
   Server-Calls gegen edas.landtag.sachsen.de
4. ``download_text()`` resolved die echte PDF-URL on-demand über einen
   einzelnen GET gegen ``viewer_navigation.aspx`` (single GET, kein
   Postback) und holt dann das PDF von ``ws.landtag.sachsen.de/images``

XML-Schema (ISO-8859-1):

- ``<ID>`` interne EDAS-Doc-ID
- ``<Wahlperiode>``, ``<Dokumentenart>``, ``<Dokumentennummer>``
- ``<Fundstelle>`` z.B. ``"Antr CDU, BSW, SPD 01.10.2024 Drs 8/2"`` —
  enthält Typ, Urheber und Datum, parsen via Regex
- ``<Titel>`` Volltext-Titel

PDF-URL-Schema (extrahiert aus dem viewer_navigation.aspx onLoad-
Handler): ``ws.landtag.sachsen.de/images/{wp}_Drs_{nr}_{...}.pdf``
mit variablen Suffix-Komponenten — wir machen die Resolution lazy.

Mapper-Erweiterung:

- ``parteien.PARTEIEN``-Tabelle um ``BÜNDNISGRÜNE``/``Bündnisgrüne``
  ergänzt — der Sachsen-spezifische zusammengeschriebene Eigenname der
  GRÜNEN-Fraktion (sonst wären 8/2100 etc. mit leerer Fraktionen-Liste
  rausgekommen)

BL-Eintrag:

- ``SN.aktiv = True``
- ``doku_system="EDAS-XML-Export"`` (klare Klassifikation, dass es
  KEIN normaler Webcrawler ist)
- Test ``test_sn_is_eigensystem_not_parldok`` umbenannt in
  ``test_sn_uses_xml_export_not_parldok``

Live-Probe lokal:

```
search('Klima', limit=5):
  8/2100 2025-03-17 | [GRÜNE]              | Fahrradoffensive Sachsen ...
  7/192  2019-10-11 | [LINKE]              | Erste Schritte zur Klimager...
  7/2067 2020-03-19 | [CDU, SPD, GRÜNE]    | Sächsische Waldbesitzer ...
```

176 Unit-Tests grün. Container braucht beim Deploy einen XML-Upload
ins data/-Volume — separater scp-Schritt.

Refs: #26, #38, #59 (Phase J revived)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 14:39:03 +02:00
+								    "SN": SNEdasXmlAdapter(),
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								    "LSA": PortalaAdapter(
 								        bundesland="LSA",
 								        name="Landtag von Sachsen-Anhalt (PADOKA)",
 								        base_url="https://padoka.landtag.sachsen-anhalt.de",
 								        db_id="lsa.lissh",
 								        wahlperiode=8,
 								        portala_path="/portal",
 								        document_type="Antrag",
 								        pdf_url_prefix="/files/",
 								    ),
 								    "BE": PortalaAdapter(
 								        bundesland="BE",
 								        name="Abgeordnetenhaus von Berlin (PARDOK)",
 								        base_url="https://pardok.parlament-berlin.de",
 								        db_id="lah.lissh",
 								        wahlperiode=19,
 								        portala_path="/portala",
 								        # Berlin's ETYPF index uses different value strings — drop the
 								        # document_type subtree, fall back to client-side title filter.
 								        document_type=None,
-												PortalaAdapter: quick-win bigger window + chunksize for BE/LSA (#13)

Real server-side fulltext search through the eUI sf-Index requires
reverse-engineering the LSA/BE-specific search field (the obvious
candidates VOLL, VOLL.main, WEV62 and bare-term-without-sf all
return zero hits when probed). Without browser DevTools to capture
a real fulltext request that's a multi-hour project — split out
to remain in #13 as a follow-up.

This commit ships the pragmatic interim fix from #11:

- BE date_window_days: 180 → 730
  Berlin had a tight default window because PARDOK has ~10x more
  documents than PADOKA. With the bigger window the client-side
  title/Urheber filter reaches back across most of WP19 instead
  of just the last six months.

- chunksize logic in PortalaAdapter.search() inverted from
  "small when query, big when no query" to the opposite. The
  query-filtered path now pulls up to max(limit*10, 500) records
  per page so the title-filter has enough material; the unfiltered
  browse path stays at max(limit*2, 100).

- httpx timeout 30s → 60s. LSA's report.tt.html occasionally
  takes 30+s on cold start; warm requests are <10s.

Smoke test (local):
  BE  Schule: 15 hits (was 0)
  LSA Schule: 14 hits (was N/A; same path)

Live verification follows after deploy.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 13:58:34 +02:00
+								        # Quick-win for #13: pulled the date window from the original
 								        # 180-day MVP up to 730 days so client-side title-filter searches
 								        # ("Schule" etc.) reach back across more of the WP19 corpus until
 								        # the eUI fulltext-sf is reverse-engineered. The chunksize bump
 								        # in PortalaAdapter.search() means the per-request payload stays
 								        # bounded.
 								        date_window_days=730,
-												Activate Berlin (PARDOK) — search-only MVP (#3)

PortalaAdapter is now parameterizable and serves both LSA and Berlin
from a single class. Berlin is activated as the third live bundesland
(after NRW + LSA), with the deliberate caveat that the LTW 2023
Wahlprogramme are not yet indexed.

PortalaAdapter refactor
- Class attributes (bundesland, name, base_url, db_id, wahlperiode)
  moved into the constructor. New optional parameters:
    - portala_path: "/portal" for LSA, "/portala" for Berlin
    - document_type: "Antrag" for LSA, None for Berlin (BE's ETYPF
      index uses different value strings; the document_type subtree
      is dropped from the action.search.json tree)
    - pdf_url_prefix: "/files/" by default; absolute URLs in the hit
      list are passed through unchanged (Berlin embeds full
      starweb/adis/citat/... links)
    - date_window_days: 730 for LSA, 180 for BE (BE has ~10x more
      documents per WP, narrower window keeps payloads bounded)
- _build_search_body builds the JSON tree dynamically: when
  document_type is None, the entire ETYPF/DTYPF/DART subtree is
  omitted, mirrored in the parsed/sref display strings as well.
- _parse_hit_list_html now auto-detects between two formats:
    1. LSA-style: <pre>$VAR1 = …</pre> Perl Data::Dumper records
       (existing parser, untouched).
    2. Berlin-style: production HTML cards with efxRecordRepeater
       divs, h3 titles, h6 metadata lines containing the document
       type, drucksachen-id and date, plus a direct <a href="…pdf">
       to the PDF on the same host.
- Berlin extracts originator parties from the h6 line ("Antrag CDU,
  SPD" → ["CDU","SPD"], typ "Antrag") via the new word-boundary
  _normalize_fraktion regex.
- _normalize_fraktion rewritten with regex word boundaries, fixing a
  long-standing bug where comma-separated fraction lists like
  "CDU, SPD" failed to match CDU. Also picks up BSW for the
  Brombeer/SPD-BSW Landtage and "Senat von Berlin" as Landesregierung.

bundeslaender.py
- BE flipped to aktiv=True. anmerkung documents the Wahlprogramm-
  Lücke and the auto-detected hit-list format.

Live verified against pardok.parlament-berlin.de:
- WP 19 with 180-day date window returns 2962 hits, page 1 contains
  5 records all with title, drucksache, date, PDF URL.
- 19/3107 ("Kleingewässerprogramm") correctly extracted as Antrag of
  CDU+SPD; 19/3104-3106 as Vorlagen zur Beschlussfassung; 19/3108 as
  Vorlage zur Kenntnisnahme.
- LSA still returns the same 5 current Anträge of März 2026 — no
  regression from the refactor.

Known limitation (will be tracked as a follow-up issue)
- Berlin Wahlprogramme zur LTW 2023 are not yet indexed in the
  embeddings DB. The 2023 PDFs are no longer linked from the live
  party websites (which currently feature 2026 draft programmes), and
  Wayback has no snapshots. The analyzer therefore falls back to
  bundesländer-übergreifende Grundsatzprogramme for BE Anträge until
  the 2023 PDFs are sourced manually.

Refs #3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:33:16 +02:00
+								        pdf_url_prefix="/files/",
 								    ),
-												Activate Mecklenburg-Vorpommern (ParlDok) — search-only MVP (#4)

Adds a new ParLDokAdapter for ParlDok 8.x parliament documentation
systems by J3S GmbH. MV becomes the fourth supported state alongside
NRW, LSA and BE.

Notable details:

- ParlDok 8.x is a single-page app whose backend is a JSON API rooted
  at {base}/parldok/Fulltext/{Search,Resultpage}. The legacy ParlDok
  5.x HTML POST form (parldok/formalkriterien) used by dokukratie's
  mv.yml has been deprecated by the LandtagMV upgrade to 8.3.5 and
  is no longer reachable via the old form fields — hence a new
  adapter rather than reusing the dokukratie scraper.

- Two-stage pagination: Fulltext/Search returns the first 100 hits
  + a queryid; further pages come from Fulltext/Resultpage with
  {queryid, limit:{Start,Length}}. The Search endpoint silently
  ignores any non-zero Start, so single-stage offset pagination is
  not an option.

- Server-side filter via facet_lp (type=10) on the configured WP;
  type=Antrag is filtered client-side because the facet_type value
  IDs are instance-specific and would require an extra
  Fulltext/Filter discovery call. ParlDok also returns the same
  Drucksache multiple times when it appears in several
  Vorgänge/Beratungen, so search() dedupes by lp/number.

- Wahlprogramme zur LTW 26.09.2021 are not yet indexed (follow-up
  in #4) — analyses run with the federal Grundsatzprogramm fallback,
  same as Berlin until #10 lands.

Drive-by cleanup of PortalaAdapter print() statements: switched to
the module-level logger so adapter parser bugs no longer disappear
into stdout.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 08:19:48 +02:00
+								    "MV": ParLDokAdapter(
 								        bundesland="MV",
 								        name="Landtag Mecklenburg-Vorpommern (ParlDok)",
 								        base_url="https://www.dokumentation.landtag-mv.de",
 								        wahlperiode=8,
 								        prefix="/parldok",
 								        document_typ="Antrag",
 								    ),
-												Activate Hamburg via ParLDokAdapter reuse (#28, Phase 1)

Hamburg's parldok runs ParlDok 8.3.1 (J3S GmbH) — kompatibel mit
der MV-Variante (8.3.5). Selber /parldok/Fulltext/Search-Endpoint,
selbe Body-Schema, selbes Hit-Format. Dadurch ist der existierende
ParLDokAdapter aus #4 ohne Code-Änderungen wiederverwendbar.

Eingetragen wurde nur:
- ADAPTERS["HH"] = ParLDokAdapter(base_url=buergerschaft-hh.de,
  wahlperiode=23, prefix=/parldok, document_typ="Antrag")
- bundeslaender.py::HH.aktiv = True

Smoke-Test (lokal):
  HH q="":       8 hits in 1.5s, jüngste WP23-Anträge sortiert newest-first
  HH q="Schule": 1 hit in 13.2s (HH ist klein, WP23 erst seit März 2025,
                  HH nutzt eher "Kita"/"Bildung"/"Lehrkräfte" im Titel)
  HH q="Klima":  2 hits

Verifikation HH ist 8.x:
  curl https://www.buergerschaft-hh.de/parldok/ | grep generator
  → "ParlDok 8.3.1, entwickelt von der J3S GmbH"

Dies ist der zweite Phase-1-Win — ein nahezu kostenloser Adapter-
Reuse weil das Backend identisch ist. Anders als BW (#29), das eine
eigene PARLISAdapter-Klasse brauchte, braucht HH gar keinen neuen
Code.

Phase 1 (2/3) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 23:41:23 +02:00
+								    "HH": ParLDokAdapter(
 								        bundesland="HH",
 								        name="Hamburgische Bürgerschaft (ParlDok)",
 								        base_url="https://www.buergerschaft-hh.de",
 								        wahlperiode=23,
 								        prefix="/parldok",
 								        document_typ="Antrag",
 								    ),
-												Activate Thüringen via ParLDokAdapter reuse + filter widening (#25, Phase 1)

Thüringen läuft auf parldok.thueringer-landtag.de mit ParlDok 8.3.5
(J3S GmbH) — exakt dieselbe Version wie MV. Aber TH packt seine
Anträge unter zusammengesetzten type-Strings ("Antrag gemäß § 79 GO",
"Antrag gemäß § 74 (2) GO") und kind="Vorlage" statt der MV-Variante
kind="Drucksache"/type="Antrag". Strict-Match auf "Antrag" hat 0
Treffer geliefert.

Lösung: ParLDokAdapter um zwei Konstruktor-Parameter erweitert:
- document_typ_substring=True → Substring-Match auf type-Feld
  ("Antrag" matched "Antrag gemäß § 79 GO", "Alternativantrag" usw.)
- kinds=["Drucksache", "Vorlage"] → erweiterte kind-Liste

Defaults sind backward-kompatibel (Substring-Match aus, kinds nur
Drucksache), sodass MV und HH unverändert weiterlaufen.

_hit_matches_filters() als zentraler Filter-Helper extrahiert,
search() und get_document() nutzen ihn — get_document() überspringt
ihn allerdings, weil dort beliebige Drucksachen aufrufbar sein müssen,
unabhängig vom search-Time-Filter.

Hostname-Korrektur: parldok.thueringen.de redirected per 303 auf
parldok.thueringer-landtag.de. doku_base_url in bundeslaender.py
auf den neuen Host umgestellt.

Smoke-Test (lokal):
  TH q="":       8 hits in 3.3s
  TH q="Schule": 2 hits in 25.7s (Lernmittelbeschaffung, Modernisierung
                  Bund-Länder-Vereinbarung — beide Schul-bezogen)
  TH q="Klima":  0 hits (keine in den letzten 1000 Drucksachen)

Damit ist Phase 1 (3/3) komplett. Nächstes Phase-2 Issue: #27 BB als
StarWebAdapter-Template.

Phase 1 (3/3) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 23:48:02 +02:00
+								    "TH": ParLDokAdapter(
 								        bundesland="TH",
 								        name="Thüringer Landtag (ParlDok)",
 								        base_url="https://parldok.thueringer-landtag.de",
 								        wahlperiode=8,
 								        prefix="/parldok",
 								        # TH packs Anträge under composite type strings like
 								        # "Antrag gemäß § 79 GO" with kind="Vorlage", not the
 								        # MV-style kind="Drucksache"/type="Antrag". Substring-match
 								        # on "Antrag" plus widened kind list catches them all.
 								        document_typ="Antrag",
 								        document_typ_substring=True,
 								        kinds=["Drucksache", "Vorlage"],
 								    ),
-												Activate Schleswig-Holstein via StarFinderCGIAdapter (#20, Phase 2)

SH läuft auf der ältesten der vier Backend-Familien: Starfinder-CGI
auf lissh.lvn.parlanet.de. URL-basiert (nicht stateful wie das
moderne StarWeb-Servlet von BB/HE/NI/RP/HB), Latin-1-encoding,
flat HTML-Tabelle als Hit-Format. Eigener Adapter weil das Schema
fundamental anders ist als alles andere.

Endpoint:
  http://lissh.lvn.parlanet.de/cgi-bin/starfinder/0
    ?path=lisshfl.txt&id=FASTLINK&pass=&search=WP=20+AND+dtyp=antrag
    &format=WEBKURZFL

Hit-Format pro <tr class="tabcol*">:
  <b>{TITLE}</b><br>
  Antrag {URHEBER} {DD.MM.YYYY} Drucksache <a href="{PDF}">{N/M}</a>

Quelle: dokukratie/sh.yml + Live-Probing.

Encoding: Server liefert iso-8859-1 ohne korrektes Content-Type-
Header. Adapter dekodiert resp.content explizit als latin-1.

SSW-Detection im _normalize_fraktion: SH ist das einzige BL mit
SSW-Fraktion (von der 5%-Hürde befreit), pattern ist \bSSW\b
analog zu \bAfD\b.

Free-Text-Suche client-seitig (siehe #18) — server-side query-
syntax mit (term) im starfinder-search-Param wird vom Server nicht
als Volltext interpretiert, einheitlich mit allen anderen aktiven
Adaptern.

Smoke-Test (lokal):
  SH q="":         8 hits in 14.4s
  SH q="Schule":   8 hits in 14.8s (Schulentwicklung Westküste,
                    Hochschulen, queere Vielfalt an Schule etc.)
  SH q="Klima":    8 hits (klimafreundlich, Klimafolgen,
                    Strategischer Aktionsplan)
  SH q="Bildung":  8 hits (berufliche Bildung, Holocaust-Wissen)

bundeslaender.py::SH.aktiv = True. doku_base_url auf
lissh.lvn.parlanet.de korrigiert (ehemaliger landtag.ltsh.de-
Eintrag passte nicht zum echten Endpoint).

Damit ist Phase 2 (1/6) angefangen — als Nebenpfad, weil das
StarWeb-Servlet (#27 BB als Template für 5 weitere) ohne HAR-
Trace nicht sauber reverse-engineerbar war.

Phase 2 (1/6) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 00:34:06 +02:00
+								    "SH": StarFinderCGIAdapter(
 								        bundesland="SH",
 								        name="Schleswig-Holsteinischer Landtag (LIS-SH)",
 								        base_url="http://lissh.lvn.parlanet.de",
 								        wahlperiode=20,
 								        db_path="lisshfl.txt",
 								        document_typ_code="antrag",
 								    ),
-												Activate Brandenburg + Rheinland-Pfalz via PortalaAdapter reuse (#27, #30, Phase 2)

Riesige Überraschung aus dem BB-HAR-Trace: Brandenburg ist NICHT
StarWeb wie in dokukratie und bundeslaender.py klassifiziert,
sondern läuft auch auf dem portala/eUI-Backend. Endpoint
/portal/browse.tt.json mit db_id=lbb.lissh. Das alte
/starweb/LBB/ELVIS/-Frontend ist nur Legacy.

Folgeprobing offenbarte: RP/opal.rlp.de läuft ebenfalls portala
(db_id=rlp.lissh, 46759 hits in WP18), ebenso NI/HE/BB. Damit ist
Phase 2 großteils KEIN StarWeb-Adapter-Bau, sondern PortalaAdapter-
Wiederverwendung mit konfigurierbaren Parametern.

Activated via Registry-Einträge:

- "BB" → PortalaAdapter(base_url=parlamentsdokumentation.brandenburg.de,
  db_id=lbb.lissh, wahlperiode=8). Nutzt die BE-Card-Variante des
  Hit-Parsers (efxRecordRepeater).
- "RP" → PortalaAdapter(base_url=opal.rlp.de, db_id=rlp.lissh,
  wahlperiode=18). NICHT mit dem NRW OPAL verwechseln — anderer
  Markenname, andere Engine.

PortalaAdapter erweitert um zwei neue Konstruktor-Parameter mit
backward-kompatiblen Defaults:

- typ_filter: Optional[str] = "DOKDBE"
  Wenn None, wird die TYP=<value>-Klausel weggelassen. Manche
  Instanzen (HE/hlt.lis) lehnen DOKDBE ab.

- omit_date_filter: bool = False
  Wenn True, wird der DAT/DDAT/SDAT-Term weggelassen. HE
  und ähnliche Instanzen haben andere Date-Field-Namen.

Plus _parse_hit_list_cards Date-Regex erweitert: zusätzlich zum
"vom DD.MM.YYYY"-Pattern (BE) jetzt auch "DD.MM.YYYY"-plain
(BB schreibt Datum vor Drucksachen-Nummer ohne "vom"-Marker).

Smoke-Test (lokal):
  BB q="":       5 hits in 5.9s
  BB q="Schule": 5 hits (Pflegeschulen, Genderverbot, Hochschulen)
  RP q="":       5 hits in 4.1s (Entlastung, Bildungschancen)
  RP q="Schule": 5 hits (Hochschulbau, G9-Gymnasien, Leistungsgerechtigkeit)

bundeslaender.py: BB.doku_system "StarWeb"→"portala", RP analog,
beide aktiv=True. Anmerkungen mit dem portala-Verweis und der
Klarstellung "OPAL/RLP ≠ NRW OPAL" erweitert.

NICHT in diesem Commit:
- HE: portala-Backend (hlt.lis) ist erreichbar, aber das HE-Card-
  Layout ist anders (Title direkt im <h3> statt <h3><span>, kein
  <span class="h6"> für Meta) — eigener Parser-Pfad nötig, deferred.
- NI: nilas.niedersachsen.de/portal/ ist eine Login-Page, das
  öffentliche Backend ist nicht zugänglich — deferred.
- HB: kein /portal/-Endpoint, bleibt das alte StarWeb-Servlet —
  braucht eigenen HAR-Trace, deferred.
- BB als StarWeb-Template (#27) ist hinfällig, weil BB portala ist.

Phase 2 (3/6) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 00:59:28 +02:00
+								    "BB": PortalaAdapter(
 								        bundesland="BB",
 								        name="Landtag Brandenburg (parladoku)",
 								        base_url="https://www.parlamentsdokumentation.brandenburg.de",
 								        db_id="lbb.lissh",
 								        wahlperiode=8,
 								        portala_path="/portal",
 								        document_type="Antrag",
 								        # BB packs the date BEFORE the Drucksachen-Nummer in the h6
 								        # line and uses the BE-style efxRecordRepeater HTML cards;
 								        # the auto-detect picks the card path automatically.
 								    ),
 								    "RP": PortalaAdapter(
 								        bundesland="RP",
 								        name="Landtag Rheinland-Pfalz (OPAL)",
 								        base_url="https://opal.rlp.de",
 								        db_id="rlp.lissh",
 								        wahlperiode=18,
 								        portala_path="/portal",
 								        document_type="Antrag",
 								    ),
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    "BY": BayernAdapter(),
-												#19 SaarlandAdapter — Umbraco JSON-API mit Iframe-Unwrap

Reverse-Engineering aus HAR-Capture (User-Browser, /suche?searchValue=Schule):

- Endpoint: POST /umbraco/aawSearchSurfaceController/SearchSurface/GetSearchResults/
- Content-Type: application/x-www-form-urlencoded; charset=UTF-8 mit rohem
  JSON im Body (Kendo-Konvention von $.ajax ohne expliziten contentType)
- Body MUSS Sections={} und Sort={} als leere Dicts haben — sobald
  Sections.Print/etc. gesetzt sind, antwortet der Server mit HTTP 500
  (einige Stunden in der Sackgasse, bis der HAR-Trace den minimalen Body zeigte)
- Body-Schema: {Filter:{Periods:[17]}, Pageination:{Skip,Take}, Sections:{},
  Sort:{}, OnlyTitle:false, Value:<query>, CurrentSearchTab:0}

Response-Mapping (FilteredResult[*]):

- DocumentNumber → drucksache (e.g. "17/11")
- Title → title
- DocumentType → typ; client-side gefiltert auf "Antrag" (Print-Section
  enthält Anfragen + Anträge + Gesetzentwürfe gemischt, ~30-50% sind Anträge)
- Publisher (kollektive Anträge: "CDU"/"SPD") + DocumentAuthor
  (individuelle MdL: "Name, Vorname (CDU);…") via parteien.extract_fraktionen
- PublicDate (ISO mit T-Suffix) → datum (auf 10 Zeichen abgeschnitten)
- FilePath: ``/file.ashx?FileId=…&FileName=…`` ist ein HTML-Iframe-Wrapper
  (455 Bytes), nicht das PDF! Echter Binär-Endpoint ist
  ``/Downloadfile.ashx`` (Großbuchstabe!) mit denselben Query-Parametern.
  Der Wrapper hat mich beim ersten Smoke-Test mit "no objects found"
  angeschmissen, der Iframe-Hint im HTML hat den Trick verraten.

Drucksachen-Lookup nutzt ``Value=<drucksache>``: der Server matcht die
Nummer im Volltext und liefert sie zuverlässig als ersten Hit. Kein
dedizierter GetById-Endpoint vorhanden.

Smoke-Test gegen prod (im Container):
- search("Schule", limit=5) → 2 Anträge in WP17 (140 Print-Hits gesamt,
  Antrag-Filter auf 2/140 — der Rest sind Anfragen/Gesetzentwürfe):
  17/11 [CDU] "Schule als Lern- und Bildungsort weiter stärken …"
  17/419 [AfD] "Eine gute Bildungspolitik als wesentlicher Bestandteil …"
- get_document("17/11") → match
- download_text("17/11") → 3520 chars echter Antrags-Volltext (Header,
  Fraktion, Resolutionstext)

Tests: 185/185 grün (keine Regression).

UI-Aktivierung erfolgt separat in #31 (blockiert auf diesem Commit).

Refs: #19, #49 (Roadmap Phase 3)

											
										
										
											2026-04-10 00:46:02 +02:00
+								    "SL": SaarlandAdapter(),
-												Activate Baden-Württemberg via PARLISAdapter (#29, Phase 1)

PARLIS auf parlis.landtag-bw.de läuft technisch auf demselben
eUI-Backend wie LSA-PADOKA und BE-PARDOK, hat aber drei wichtige
Unterschiede, die eine eigene Klasse statt einer PortalaAdapter-
Subklasse rechtfertigen:

1. Body-Schema: minimales lines mit l1/l2/l3/l4 (statt LSA/BE
   2/3/4/10/11/20.x/90.x), serverrecordname=vorgang,
   format=suchergebnis-vorgang-full, sort=SORT01/D SORT02/D SORT03,
   keine parsed/json-Felder. Quelle: dokukratie/scrapers/portala.query.bw.json
   plus HAR-Verifikation gegen die Live-Instanz.

2. Async polling: die initiale SearchAndDisplay-Antwort liefert nur
   search_id mit status=running, KEINE report_id. Erst eine zweite
   SearchAndDisplay-Anfrage mit id=<search_id> (ohne search-Component)
   bekommt nach 1-3 Sekunden die report_id zurück. Reverse-engineered
   aus esearch-ui.main.js requestReportOK() Z. ~1268.

3. Hit-Format: report.tt.html liefert Records als JSON-in-HTML-Comments
   <!--{"WMV33":[...],"EWBV22":[...],...}-->. Komplett anderes Format
   als LSA Perl-Dump oder BE HTML-Cards. Felder:
     - EWBV22: "Drucksache 17/10323"
     - EWBD05: direkter PDF-URL
     - WMV33: Schlagworte (joined by ;)
     - WMV30: Urheber-Kurzform
     - EWBV23: "Antrag <Urheber> <DD.MM.YYYY>"

Smoke-Test (lokal):
  BW q='':       8 hits in 17s, jüngste WP17-Anträge mit Datum + Fraktion
  BW q='Schule': 8 hits, alle wirklich Schul-bezogen (Hochschule, Grundschule,
                 Schwimmunterricht, Lehrerbedarf etc.)
  BW q='Klima':  8 hits, Klimaschutz/CO2/Energieberatung
  get_document(17/10323): roundtrip funktioniert

bundeslaender.py: aktiv=True für BW; Anmerkung erweitert mit
PARLISAdapter-Verweis und drei-Unterschiede-Hinweis für künftige
Wartung. Test test_four_active_bundeslaender umbenannt zu
test_active_bundeslaender_include_phase_1_set, prüft jetzt nur
Subset-Bedingung statt exakter Count, damit Phase-1/2-Erweiterungen
keine Test-Updates brauchen.

Phase 1 (1/3) aus Roadmap-Issue #49.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-08 23:38:04 +02:00
+								    "BW": PARLISAdapter(
 								        bundesland="BW",
 								        name="Landtag von Baden-Württemberg (PARLIS)",
 								        base_url="https://parlis.landtag-bw.de",
 								        wahlperiode=17,
 								        prefix="/parlis",
 								        document_typ="Antrag",
 								    ),
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								}
 								def get_adapter(bundesland: str) -> Optional[ParlamentAdapter]:
 								    """Return the adapter registered in ``ADAPTERS`` under *bundesland*, or ``None`` if the key is absent."""
 								    if bundesland in ADAPTERS:
 								        return ADAPTERS[bundesland]
 								    # No registry entry for this key: signal "not found" without raising.
 								    return None
 								async def search_all(query: str, bundesland: str = "NRW", limit: int = 20) -> list[Drucksache]:
 								    """Search one state's parliament documents through its registered adapter.
 								    The adapter is looked up via ``get_adapter(bundesland)``; ``query`` and
 								    ``limit`` are forwarded positionally to its ``search`` coroutine. When the
 								    lookup yields nothing usable, an empty list is returned instead.
 								    """
 								    selected = get_adapter(bundesland)
 								    if selected:
 								        return await selected.search(query, limit)
 								    # No adapter for this key: degrade to "no results" rather than raising.
 								    return []