From 4a8986e009b72926ab64b9d30cc6b27c6c180629 Mon Sep 17 00:00:00 2001
From: Dotty Dotter <dotty@Mac.wideopen.space>
Date: Thu, 9 Apr 2026 14:15:35 +0200
Subject: [PATCH] =?UTF-8?q?Phase=20H:=20HE=20StarWebHEAdapter=20(#24/#30)?=
 =?UTF-8?q?=20=E2=80=94=20Hessen=20aktiv?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Schließt #24 (HE Card-Parser) und #36 (UI-Aktivierung). Eigenständige
``StarWebHEAdapter``-Klasse für starweb.hessen.de.

Backend-Discovery aus HAR-Trace (TEMP/starweb.hessen.de.har):

- starweb.hessen.de läuft auf einem eUI-Backend mit synchronem 2-Step-
  Flow (kein Polling wie BW PARLIS): POST ``browse.tt.json`` →
  ``report_id`` direkt in der Response → GET ``report.tt.html?
  report_id=...&start=0&chunksize=1500``
- Source: ``hlt.lis``
- Server verlangt ZWINGEND einen ``search.json``-Term-Tree, ``parsed``/
  ``sref`` allein reichen nicht. Top-NOT mit zwei Operanden:
  ``not(WP-Filter, NOWEB=X)``
- Hit-Format: Cards (``efxRecordRepeater``) mit Daten in HTML-Kommentar-
  Perl-Dumps ``<!--<pre class="dump">$VAR1 = ...</pre>-->``
- Field-Mapping: WEV01=Title, WEV02=Datum, WEV03=Typ, WEV07=PDF-URL,
  WEV08=Drucksachen-Nummer, WEV12=Urheber

Pipeline:

- ``search()`` synchron 2-Step, client-side ``"antrag"``-Filter (analog
  #61 für portala) — fängt "Dringlicher Berichtsantrag" und ähnliche
  Subtypen
- ``get_document()`` linearer Lookup über die ersten 200 Antrag-Treffer (nach client-side Filter)
- ``download_text()`` PDF-via-fitz (HE-PDF-URLs werden auf https
  upgegradet)

BL-Eintrag in ``bundeslaender.py``:

- ``HE.aktiv = True``
- ``doku_system="portala"`` (statt "StarWeb" — die /starweb/LIS-Pfade
  sind nur Legacy, das echte Backend ist /portal)
- ``doku_base_url="https://starweb.hessen.de/portal"``

ADAPTERS-Registrierung an Position vor NRW.

Live-Probe:

```
21/4157 2026-04-07 | [GRÜNE] | Dringlicher Berichtsantrag | Vorstellung, Kosten...
21/4156 2026-04-02 | [GRÜNE] | Berichtsantrag             | Schulische Prävention...
21/4136 2026-03-30 | [GRÜNE] | Dringlicher Berichtsantrag | Streichung des Schulfachs...
```

176 Unit-Tests grün, Sub-A im Container nach Deploy zu verifizieren.

Refs: #24, #30, #36, #59 (Phase H)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 app/bundeslaender.py |  14 ++-
 app/parlamente.py    | 248 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 259 insertions(+), 3 deletions(-)
diff --git a/app/bundeslaender.py b/app/bundeslaender.py
index dd7d321..1e744cf 100644
--- a/app/bundeslaender.py
+++ b/app/bundeslaender.py
@@ -239,11 +239,19 @@ BUNDESLAENDER: dict[str, Bundesland] = {
         naechste_wahl="2028-10-22",
         regierungsfraktionen=["CDU", "SPD"],
         landtagsfraktionen=["CDU", "AfD", "SPD", "GRÜNE", "FDP"],
-        doku_system="StarWeb",
-        doku_base_url="https://starweb.hessen.de/starweb/LIS",
+        doku_system="portala",
+        doku_base_url="https://starweb.hessen.de/portal",
         drucksache_format="21/1234",
         dokukratie_scraper="he",
-        anmerkung="Wahltermin 2028 ist Schätzung.",
+        aktiv=True,
+        anmerkung=(
+            "starweb.hessen.de läuft auf demselben portala/eUI-Backend "
+            "wie LSA/BE/BB/RP, aber mit HE-spezifischem Hit-Format: "
+            "Cards (efxRecordRepeater) mit Daten in HTML-Kommentar-"
+            "Perl-Dumps (WEV01-WEV12). PortalaAdapter mit eigenem "
+            "Parser-Modus _parse_hit_list_he_comment_dump (#24/#30). "
+            "Wahltermin 2028 ist Schätzung."
+        ),
     ),
     "MV": Bundesland(
         code="MV",
diff --git a/app/parlamente.py b/app/parlamente.py
index b7a2213..df2bbcd 100644
--- a/app/parlamente.py
+++ b/app/parlamente.py
@@ -1845,6 +1845,253 @@ class PARLISAdapter(ParlamentAdapter):
                 return None
 
 
+class StarWebHEAdapter(ParlamentAdapter):
+    """Hesse-specific eUI adapter (#24/#30).
+
+    starweb.hessen.de runs on an eUI backend with a synchronous two-step
+    flow (unlike BW PARLIS, which polls asynchronously):
+
+    1. POST ``/portal/browse.tt.json`` with ``action=SearchAndDisplay`` → returns ``report_id``
+    2. GET ``/portal/report.tt.html?report_id=...`` → HTML with the hits
+
+    Hit format: cards made of ``efxRecordRepeater`` divs; the data lives in
+    Perl dumps inside HTML comments (``<!--<pre class="dump">$VAR1 = ...</pre>-->``).
+    Field mapping:
+
+    - ``WEV01`` → title
+    - ``WEV02`` → date
+    - ``WEV03`` → type
+    - ``WEV07`` → PDF URL
+    - ``WEV08`` → document number (Drucksache)
+    - ``WEV12`` → originator / parliamentary group
+
+    Source: ``hlt.lis`` (Hessischer Landtag), electoral term 21.
+    """
+
+    _RE_HE_COMMENT_DUMP = re.compile(
+        r'<!--\s*<pre[^>]*class="dump"[^>]*>\s*\$VAR1 = (.*?)</pre>\s*-->',
+        re.DOTALL,
+    )
+    # NOTE(review): [^\"']+ ends at the first quote, so values with escaped quotes would be cut short.
+    _RE_HE_WEV01 = re.compile(r"'WEV01'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
+    _RE_HE_WEV02 = re.compile(r"'WEV02'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"'](\d{1,2}\.\d{1,2}\.\d{4})[\"']")
+    _RE_HE_WEV03 = re.compile(r"'WEV03'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
+    _RE_HE_WEV07 = re.compile(r"'WEV07'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
+    _RE_HE_WEV08 = re.compile(r"'WEV08'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"'](\d+/\d+)[\"']")
+    _RE_HE_WEV12 = re.compile(r"'WEV12'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
+
+    bundesland = "HE"
+    name = "Hessischer Landtag (StarWeb)"
+    base_url = "https://starweb.hessen.de"
+    portal_path = "/portal"
+    wahlperiode = 21
+
+    def _normalize_fraktion(self, text: str) -> list[str]:
+        from .parteien import extract_fraktionen  # imported inside the method, as in the other methods here
+        return extract_fraktionen(text, bundesland=self.bundesland)  # delegates with this adapter's code
+
+    @staticmethod
+    def _datum_de_to_iso(datum_de: str) -> str:
+        if not datum_de:
+            return ""
+        try:
+            d, m, y = datum_de.split(".")
+            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
+        except ValueError:
+            return ""
+
+    @staticmethod
+    def _decode_perl_hex(text: str) -> str:
+        """Wandle ``\\x{e9}`` → ``é`` etc. um. Robuste Hex-Substitution."""
+        return re.sub(
+            r"\\x\{([0-9a-fA-F]+)\}",
+            lambda m: chr(int(m.group(1), 16)),
+            text,
+        )
+
+    def _build_initial_body(self, query: str = "") -> dict:
+        """Build the browse request body: current electoral term, optional full-text term.
+
+        The server strictly requires a ``search.json`` term tree rooted at
+        ``not(query, NOWEB=X)``; ``parsed``/``sref`` alone are ignored and only
+        ``facets`` come back.
+        NOTE(review): ``query`` is interpolated unescaped — quotes may break the term; confirm.
+        """
+        wp_str = str(self.wahlperiode)
+        wp_term = {
+            "tn": "term", "t": wp_str, "sf": "WP",
+            "op": "eq", "idx": 45, "l": 3, "num": 1,
+        }
+        # Build the top-level NOT tree: NOT(query_subtree, NOWEB=X)
+        if query:
+            vtdrs_term = {
+                "tn": "term",
+                "t": f"\"(/VT ('\\\"{query}\\\"'))\"",
+                "sf": "VTDRS", "op": "eq", "idx": 9, "l": 3, "num": 3,
+            }
+            inner = {"tn": "and", "terms": [vtdrs_term, wp_term], "num": 4}
+            parsed = (
+                f"((/VTDRS \"(/VT ('\\\"{query}\\\"'))\") "
+                f"AND (/WP {wp_str})) AND NOT NOWEB=X"
+            )
+        else:
+            inner = wp_term
+            parsed = f"(/WP {wp_str}) AND NOT NOWEB=X"
+
+        json_tree = [{
+            "tn": "not",
+            "terms": [
+                inner,
+                {"tn": "term", "t": "X", "sf": "NOWEB",
+                 "op": "eq", "idx": 100, "l": 3, "num": 2},
+            ],
+        }]
+
+        return {
+            "action": "SearchAndDisplay",
+            "sources": ["hlt.lis"],
+            "report": {
+                "rhl": "main",
+                "rhlmode": "add",
+                "format": "generic2-short",
+                "mime": "html",
+                "sort": "WPSORT/D DRSORT/D",
+            },
+            "search": {
+                "lines": {"1": query, "2": wp_str},
+                "serverrecordname": "generic2Search",
+                "parsed": parsed,
+                "sref": parsed,
+                "json": json_tree,
+            },
+        }
+
+    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
+        """Synchroner 2-Step gegen starweb.hessen.de."""
+        from .parteien import extract_fraktionen
+
+        body = self._build_initial_body(query)
+        browse_url = f"{self.base_url}{self.portal_path}/browse.tt.json"
+        report_url = f"{self.base_url}{self.portal_path}/report.tt.html"
+
+        async with httpx.AsyncClient(
+            timeout=60, follow_redirects=True,
+            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
+        ) as client:
+            try:
+                resp = await client.post(browse_url, json=body)
+                if resp.status_code != 200:
+                    logger.error("HE browse HTTP %s", resp.status_code)
+                    return []
+                data = resp.json()
+                report_id = data.get("report_id")
+                if not report_id:
+                    logger.error("HE: no report_id in browse response keys=%s", sorted(data.keys()))
+                    return []
+
+                # Step 2: report.tt.html mit chunksize — ohne den Parameter
+                # liefert der Server nur den allerersten Hit (8 KB HTML).
+                # Wir nehmen 1500 als Floor, analog #61 PortalaAdapter, weil
+                # nach dem client-side Antrag-Filter die Hit-Dichte gering
+                # ist (HE hat ~1:30 Antrag/Anfrage).
+                chunksize = max(limit * 30, 1500)
+                rep = await client.get(
+                    report_url,
+                    params={
+                        "report_id": report_id,
+                        "start": 0,
+                        "chunksize": chunksize,
+                    },
+                )
+                if rep.status_code != 200:
+                    logger.error("HE report HTTP %s", rep.status_code)
+                    return []
+                results = self._parse_report_html(rep.text)
+                # Client-side Antrag-Filter (analog #61 Bug 2/3 für portala)
+                results = [d for d in results if "antrag" in (d.typ or "").lower()]
+                # Optional Query-Filter client-side
+                if query:
+                    qterms = query.lower().split()
+                    results = [
+                        d for d in results
+                        if all(t in (d.title.lower() + " " + " ".join(d.fraktionen).lower()) for t in qterms)
+                    ]
+                return results[:limit]
+            except Exception:
+                logger.exception("HE search error")
+                return []
+
+    def _parse_report_html(self, html: str) -> list[Drucksache]:
+        """Zieht Daten aus den ``<!--<pre class="dump">$VAR1 = ...-->``-
+        Kommentaren. WEV01–WEV12 → Drucksache-Felder."""
+        from .parteien import extract_fraktionen
+
+        results: list[Drucksache] = []
+        for dump in self._RE_HE_COMMENT_DUMP.findall(html):
+            m_ds = self._RE_HE_WEV08.search(dump)
+            if not m_ds:
+                continue
+            drucksache = m_ds.group(1)
+
+            m_t = self._RE_HE_WEV01.search(dump)
+            title = self._decode_perl_hex(m_t.group(1)) if m_t else f"Drucksache {drucksache}"
+
+            m_pdf = self._RE_HE_WEV07.search(dump)
+            pdf_url = m_pdf.group(1) if m_pdf else ""
+            if pdf_url.startswith("http://"):
+                pdf_url = "https://" + pdf_url[len("http://"):]
+
+            m_dat = self._RE_HE_WEV02.search(dump)
+            datum_iso = self._datum_de_to_iso(m_dat.group(1)) if m_dat else ""
+
+            m_typ = self._RE_HE_WEV03.search(dump)
+            typ = self._decode_perl_hex(m_typ.group(1)) if m_typ else "Drucksache"
+
+            m_urheber = self._RE_HE_WEV12.search(dump)
+            urheber = self._decode_perl_hex(m_urheber.group(1)) if m_urheber else ""
+            fraktionen = extract_fraktionen(urheber, bundesland=self.bundesland)
+
+            results.append(Drucksache(
+                drucksache=drucksache, title=title, fraktionen=fraktionen,
+                datum=datum_iso, link=pdf_url, bundesland=self.bundesland,
+                typ=typ,
+            ))
+
+        return results
+
+    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
+        """Linearer Lookup über search() — wie die anderen Adapter, kein
+        Direkt-ID-Filter."""
+        results = await self.search("", limit=200)
+        for d in results:
+            if d.drucksache == drucksache:
+                return d
+        return None
+
+    async def download_text(self, drucksache: str) -> Optional[str]:
+        import fitz
+        doc = await self.get_document(drucksache)
+        if not doc or not doc.link:
+            return None
+        async with httpx.AsyncClient(
+            timeout=60, follow_redirects=True,
+            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
+        ) as client:
+            try:
+                resp = await client.get(doc.link)
+                if resp.status_code != 200:
+                    return None
+                pdf = fitz.open(stream=resp.content, filetype="pdf")
+                text = ""
+                for page in pdf:
+                    text += page.get_text()
+                pdf.close()
+                return text
+            except Exception:
+                logger.exception("HE PDF download error for %s", drucksache)
+                return None
+
+
 class BundestagAdapter(ParlamentAdapter):
     """Adapter für den Deutschen Bundestag via DIP-API.
 
@@ -2060,6 +2307,7 @@ class BundestagAdapter(ParlamentAdapter):
 # Registry of adapters
 ADAPTERS = {
     "BUND": BundestagAdapter(),
+    "HE": StarWebHEAdapter(),
     "NRW": NRWAdapter(),
     "LSA": PortalaAdapter(
         bundesland="LSA",