Activate Schleswig-Holstein via StarFinderCGIAdapter (#20, Phase 2)

SH läuft auf der ältesten der vier Backend-Familien: Starfinder-CGI auf lissh.lvn.parlanet.de. URL-basiert (nicht stateful wie das moderne StarWeb-Servlet von BB/HE/NI/RP/HB), Latin-1-encoding, flat HTML-Tabelle als Hit-Format. Eigener Adapter, weil das Schema fundamental anders ist als alles andere. Endpoint: http://lissh.lvn.parlanet.de/cgi-bin/starfinder/0?path=lisshfl.txt&id=FASTLINK&pass=&search=WP=20+AND+dtyp=antrag&format=WEBKURZFL Hit-Format pro <tr class="tabcol*">: <b>{TITLE}</b><br> Antrag {URHEBER} {DD.MM.YYYY} Drucksache <a href="{PDF}">{N/M}</a> Quelle: dokukratie/sh.yml + Live-Probing. Encoding: Server liefert iso-8859-1 ohne korrekten Content-Type-Header. Adapter dekodiert resp.content explizit als latin-1. SSW-Detection im _normalize_fraktion: SH ist das einzige BL mit SSW-Fraktion (von der 5%-Hürde befreit), pattern ist \bSSW\b analog zu \bAfD\b. Free-Text-Suche client-seitig (siehe #18) — server-side query-syntax mit (term) im starfinder-search-Param wird vom Server nicht als Volltext interpretiert, einheitlich mit allen anderen aktiven Adaptern. Smoke-Test (lokal): SH q="": 8 hits in 14.4s SH q="Schule": 8 hits in 14.8s (Schulentwicklung Westküste, Hochschulen, queere Vielfalt an Schule etc.) SH q="Klima": 8 hits (klimafreundlich, Klimafolgen, Strategischer Aktionsplan) SH q="Bildung": 8 hits (berufliche Bildung, Holocaust-Wissen) bundeslaender.py::SH.aktiv = True. doku_base_url auf lissh.lvn.parlanet.de korrigiert (ehemaliger landtag.ltsh.de-Eintrag passte nicht zum echten Endpoint). Damit ist Phase 2 (1/6) angefangen — als Nebenpfad, weil das StarWeb-Servlet (#27 BB als Template für 5 weitere) ohne HAR-Trace nicht sauber reverse-engineerbar war. Phase 2 (1/6) aus Roadmap-Issue #49. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 00:34:06 +02:00 · 2026-04-09 00:34:06 +02:00 · f82c60e40d
commit f82c60e40d
parent dc0bb07c12
2 changed files with 228 additions and 2 deletions
--- a/app/bundeslaender.py
+++ b/app/bundeslaender.py
@ -364,10 +364,19 @@ BUNDESLAENDER: dict[str, Bundesland] = {
        regierungsfraktionen=["CDU", "GRÜNE"],
        landtagsfraktionen=["CDU", "GRÜNE", "SPD", "FDP", "SSW"],
        doku_system="StarWeb",
-        doku_base_url="https://www.landtag.ltsh.de",
+        doku_base_url="http://lissh.lvn.parlanet.de",
        drucksache_format="20/1234",
        dokukratie_scraper="sh",
-        anmerkung="SSW ist von der 5%-Hürde befreit.",
+        aktiv=True,
+        anmerkung=(
+            "SSW ist von der 5%-Hürde befreit. Doku-System ist die "
+            "alte Starfinder-CGI auf lissh.lvn.parlanet.de — URL-"
+            "basiert via "
+            "/cgi-bin/starfinder/0?path=lisshfl.txt&search=WP=20+AND+dtyp=antrag, "
+            "Latin-1-encoding. NICHT die moderne StarWeb-Servlet-"
+            "Variante (BB/HE/NI/RP/HB) — eigene Klasse "
+            "StarFinderCGIAdapter."
+        ),
    ),
    "TH": Bundesland(
        code="TH",
--- a/app/parlamente.py
+++ b/app/parlamente.py
@ -1269,6 +1269,215 @@ class ParLDokAdapter(ParlamentAdapter):
                return None


+class StarFinderCGIAdapter(ParlamentAdapter):
+    """Adapter for old-school CGI Starfinder instances.
+
+    Currently used by Schleswig-Holstein on
+    ``lissh.lvn.parlanet.de/cgi-bin/starfinder/0`` — the **oldest** of the
+    parliament backends we touch. Predates StarWeb's HTML form-submit
+    machinery: instead of submitting a stateful AdvancedSearch form
+    (which BB/HE/NI/RP/HB do), Starfinder accepts the entire query as
+    URL parameters and returns plain HTML with a flat ``<tr>`` table of
+    records.
+
+    Reverse-engineering source: ``dokukratie/sh.yml`` plus a probe
+    against the live endpoint. Format details:
+
+    - URL template: ``{base}/cgi-bin/starfinder/0?path={db_path}&id=FASTLINK
+      &pass=&search={starfinder_query}&format=WEBKURZFL``
+    - Query syntax: ``WP=20+AND+dtyp=antrag`` (URL-encoded). The
+      ``dtyp`` codes are lowercase short labels (``antrag``, ``kleine``).
+    - Encoding: ``iso-8859-1`` (Latin-1) — NOT UTF-8. The HTTP response
+      doesn't always declare it via Content-Type, so we explicitly
+      decode with ``latin1`` to avoid mojibake on the German umlauts.
+    - Hit-format: each record is one ``<tr class="tabcol|tabcol2|tabcol3">``
+      with the title in ``<b>``, then ``Antrag <Urheber> <DD.MM.YYYY>
+      Drucksache <a href="...pdf">XX/YYYY</a>``.
+    """
+
+    _RE_RECORD = re.compile(  # one hit row; non-greedy, so it stops at the first </tr>
+        r'<tr class="tabcol[23]?">.*?</tr>',
+        re.DOTALL,
+    )
+    _RE_TITLE = re.compile(r"<b>(.*?)</b>", re.DOTALL)  # first <b>…</b> of a row = title
+    _RE_DRUCKSACHE_LINK = re.compile(  # groups: (href ending in .pdf, "N/M" number)
+        r'<a href="([^"]+\.pdf)"[^>]*>(\d+/\d+)</a>'
+    )
+    # The line between <b>title</b> and the <a>-link looks like:
+    #   "Antrag Christian Dirschauer (SSW) 07.04.2026 Drucksache "
+    # Group 1 is the originator text after the leading type word, group 2 the date.
+    _RE_URHEBER_DATUM = re.compile(
+        r"</b>\s*<br>\s*[A-Za-zÄÖÜäöüß]+\s+(.+?)\s+(\d{1,2}\.\d{1,2}\.\d{4})\s+Drucksache",
+        re.DOTALL,
+    )
+
+    def __init__(  # keyword-only configuration; performs no I/O
+        self,
+        *,
+        bundesland: str,
+        name: str,
+        base_url: str,
+        wahlperiode: int,
+        db_path: str = "lisshfl.txt",
+        document_typ_code: str = "antrag",
+    ) -> None:
+        self.bundesland = bundesland  # copied into every Drucksache and used as log prefix
+        self.name = name
+        self.base_url = base_url.rstrip("/")  # _build_url appends "/cgi-bin/..." itself
+        self.wahlperiode = wahlperiode  # becomes the WP= term of the search expression
+        self.db_path = db_path  # becomes the path= URL parameter
+        self.document_typ_code = document_typ_code  # becomes the dtyp= term
+
+    @staticmethod
+    def _datum_de_to_iso(datum_de: str) -> str:  # "D.M.YYYY" -> "YYYY-MM-DD", "" on failure
+        if not datum_de:
+            return ""
+        try:
+            d, m, y = datum_de.split(".")
+            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"  # parts are zero-padded, not validated as numbers
+        except ValueError:  # raised by the unpacking when there are not exactly three parts
+            return ""
+
+    @staticmethod
+    def _normalize_fraktion(text: str) -> list[str]:
+        """Map an SH originator string to the known party labels found in it.
+
+        E.g. 'Christian Dirschauer (SSW), Jette Waldinger-Thiering (SSW)' -> ['SSW'].
+        """
+        if not text:
+            return []
+        u = text.upper()  # all patterns below are written against the upper-cased text
+        out: list[str] = []  # each label is appended at most once, in this fixed check order
+        if re.search(r"\bBÜNDNIS\s*90\b", u) or re.search(r"\bGR(?:Ü|UE)NE\b", u):
+            out.append("GRÜNE")
+        if re.search(r"\bCDU\b", u):
+            out.append("CDU")
+        if re.search(r"\bSPD\b", u):
+            out.append("SPD")
+        if re.search(r"\bF\.?\s*D\.?\s*P\.?\b", u):  # also accepts dotted/spaced "F.D.P."
+            out.append("FDP")
+        if re.search(r"\bAFD\b", u):
+            out.append("AfD")
+        if re.search(r"\bLINKE\b", u):
+            out.append("LINKE")
+        if re.search(r"\bSSW\b", u):
+            out.append("SSW")
+        if re.search(r"LANDESREGIERUNG|\bMINISTER|STAATSKANZLEI|MINISTERPRÄSIDENT", u):
+            out.append("Landesregierung")
+        return out
+
+    def _build_url(self) -> str:
+        """Build the Starfinder URL for the structural WP+dtyp browse.
+
+        Free-text filtering is done client-side on the parsed records
+        (consistent with #18: all adapters uniformly filter on the
+        title without server-side full-text search, because behaviour
+        would otherwise be asymmetric between adapters).
+        """
+        search_param = f"WP={self.wahlperiode}+AND+dtyp={self.document_typ_code}"  # interpolated as-is, no URL-encoding
+        return (
+            f"{self.base_url}/cgi-bin/starfinder/0"
+            f"?path={self.db_path}&id=FASTLINK&pass=&search={search_param}"
+            f"&format=WEBKURZFL"
+        )
+
+    def _parse_records(self, html: str) -> list[Drucksache]:  # one Drucksache per parsable hit row
+        results: list[Drucksache] = []
+        for record_html in self._RE_RECORD.findall(html):
+            m_link = self._RE_DRUCKSACHE_LINK.search(record_html)
+            if not m_link:
+                continue  # rows without a .pdf Drucksache link are dropped silently
+            pdf_url, drucksache = m_link.group(1), m_link.group(2)
+
+            m_title = self._RE_TITLE.search(record_html)
+            title = re.sub(r"\s+", " ", m_title.group(1)).strip() if m_title else f"Drucksache {drucksache}"  # collapse whitespace; fall back to the number
+
+            urheber = ""
+            datum_iso = ""  # both stay empty when the metadata line does not match
+            m_meta = self._RE_URHEBER_DATUM.search(record_html)
+            if m_meta:
+                urheber = m_meta.group(1).strip()
+                datum_iso = self._datum_de_to_iso(m_meta.group(2))
+
+            results.append(Drucksache(
+                drucksache=drucksache,
+                title=title,
+                fraktionen=self._normalize_fraktion(urheber),
+                datum=datum_iso,
+                link=pdf_url,  # href exactly as found in the HTML — NOTE(review): not resolved against base_url
+                bundesland=self.bundesland,
+                typ="Antrag",  # NOTE(review): hard-coded, independent of self.document_typ_code
+            ))
+        return results
+
+    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:  # query is never sent to the server
+        url = self._build_url()
+        async with httpx.AsyncClient(
+            timeout=60,
+            follow_redirects=True,
+            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
+        ) as client:
+            try:
+                resp = await client.get(url)
+                if resp.status_code != 200:
+                    logger.error("%s search HTTP %s", self.bundesland, resp.status_code)
+                    return []
+                # Force latin1 because the Starfinder server doesn't always
+                # advertise the encoding correctly.
+                html = resp.content.decode("latin-1", errors="replace")
+                results = self._parse_records(html)
+            except Exception:  # best effort: any fetch/parse failure is logged and yields []
+                logger.exception("%s search error", self.bundesland)
+                return []
+
+        # Client-side filter on title + normalized party labels (see #18); every term must match
+        if query:
+            terms = [t.lower() for t in query.split() if t]
+            results = [
+                d for d in results
+                if all(t in f"{d.title} {' '.join(d.fraktionen)}".lower() for t in terms)
+            ]
+        return results[:limit]  # limit is applied after filtering
+
+    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
+        """Look up a single Drucksache by ID.
+
+        Runs an unfiltered search() capped at 200 parsed records and scans
+        them for an exact match; returns None when nothing matches. Assumes
+        newest-first ordering and no number-only server filter (unverified).
+        """
+        results = await self.search(query="", limit=200)  # empty query skips the client-side filter
+        for doc in results:
+            if doc.drucksache == drucksache:
+                return doc
+        return None
+
+    async def download_text(self, drucksache: str) -> Optional[str]:  # PDF text, or None on any failure
+        import fitz  # PyMuPDF
+
+        doc = await self.get_document(drucksache)  # triggers a full search() round-trip first
+        if not doc or not doc.link:
+            return None
+        async with httpx.AsyncClient(
+            timeout=60,
+            follow_redirects=True,
+            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
+        ) as client:
+            try:
+                resp = await client.get(doc.link)
+                if resp.status_code != 200:
+                    return None  # non-200 is not logged here, unlike in search()
+                pdf = fitz.open(stream=resp.content, filetype="pdf")  # parse from the in-memory bytes
+                text = ""
+                for page in pdf:
+                    text += page.get_text()  # concatenate page texts in iteration order
+                pdf.close()  # NOTE(review): not reached if get_text() raises; the handler below only logs
+                return text
+            except Exception:
+                logger.exception("%s PDF download error for %s", self.bundesland, drucksache)
+                return None
+
+
 class BayernAdapter(ParlamentAdapter):
    """Adapter for Bayerischer Landtag."""

@ -1754,6 +1963,14 @@ ADAPTERS = {
        document_typ_substring=True,
        kinds=["Drucksache", "Vorlage"],
    ),
+    "SH": StarFinderCGIAdapter(
+        bundesland="SH",
+        name="Schleswig-Holsteinischer Landtag (LIS-SH)",
+        base_url="http://lissh.lvn.parlanet.de",
+        wahlperiode=20,
+        db_path="lisshfl.txt",
+        document_typ_code="antrag",
+    ),
    "BY": BayernAdapter(),
    "BW": PARLISAdapter(
        bundesland="BW",