Phase I: HB PARiSHBAdapter (#21/#33) — Bremen aktiv

Schließt #21 (HB-Scraper) und #33 (UI-Aktivierung). Eigenständige ``PARiSHBAdapter``-Klasse für paris.bremische-buergerschaft.de.

Backend (HAR-Trace TEMP/paris.bremische-buergerschaft.de.har):
- Single-POST gegen ``/starweb/paris/servlet.starweb`` mit form-urlencoded Body
- ``path=paris/LISSHFL.web``, ``format=LISSH_BrowseVorgang_Report``
- ``01_LISSHFL_Themen=<query>`` (Volltext-Thesaurus)
- ``02_LISSHFL_PARL=S OR L`` (Stadt + Landtag in einem Rutsch)
- ``03_LISSHFL_WP=21`` (aktuelle Wahlperiode; Multi-WP-Range timeout-t den Server bei 60s)
- Wildcards (``*``) timeout-en ebenfalls — bei leerer Query verwenden wir das hochfrequente Stoppwort ``"der"`` als Catch-all

Hit-Format aus dem Single-Page-HTML:
- ``<tbody name="RecordRepeater"><tr name="Repeat_TYP">``
- Title in ``<h2><a>``
- ``Drs <b>21/730 S</b>`` mit S/L-Suffix für Stadtbürgerschaft vs Landtag — Drucksachen-IDs werden als ``21/730S`` (ohne Space) gespeichert
- ``Änderungsantrag vom 23.02.2026`` (Typ + Datum)
- Fraktionen-Liste nach ``<br/>``
- PDF-Link mit ``target="new"`` auf bremische-buergerschaft.de

Pipeline:
- ``search()`` mit client-side ``"antrag"``-Filter (analog #61), fängt ``"Antrag"``, ``"Änderungsantrag"`` etc.
- ``get_document()`` linearer Lookup
- ``download_text()`` PDF-via-fitz

BL-Eintrag in ``bundeslaender.py``:
- ``HB.aktiv = True``
- ``doku_system="PARiS"`` (statt der alten Klassifikation "StarWeb" — PARiS ist eine deutlich abweichende Servlet-Variante, kein eUI)
- ``drucksache_format="21/1234S"``
- Test ``test_hb_is_starweb_not_paris`` umbenannt in ``test_hb_is_paris_starweb_variant``, prüft jetzt auf "PARiS"

Live-Probe:
```
21/730S 2026-02-23 | [SPD,GRÜNE,LINKE] | Änderungsantrag | Haushaltsgesetze ...
21/1449 2025-11-05 | [SPD,GRÜNE,LINKE] | Antrag | Finanzierung der Bremischen Häfen
21/555S 2025-06-17 | [CDU] | Antrag | Clima-Campus zügig beantworten
```

176 Unit-Tests grün, Live-Verifikation Sub-A im Container nach Deploy.
Refs: #21, #33, #59 (Phase I)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 14:21:49 +02:00 · 2026-04-09 14:21:49 +02:00 · 278d74ff97
commit 278d74ff97
parent 4a8986e009
3 changed files with 215 additions and 13 deletions
--- a/app/bundeslaender.py
+++ b/app/bundeslaender.py
@ -195,18 +195,19 @@ BUNDESLAENDER: dict[str, Bundesland] = {
        naechste_wahl="2027-05-09",
        regierungsfraktionen=["SPD", "GRÜNE", "LINKE"],
        landtagsfraktionen=["SPD", "CDU", "GRÜNE", "LINKE", "AfD", "BiW"],
-        doku_system="StarWeb",
+        doku_system="PARiS",
-        doku_base_url="https://paris.bremische-buergerschaft.de",
+        doku_base_url="https://paris.bremische-buergerschaft.de/starweb/paris",
-        drucksache_format="21/1234",
+        drucksache_format="21/1234S",
        dokukratie_scraper="hb",
        aktiv=True,
        anmerkung=(
-            "PARiS ist eine StarWeb-Skin auf bremischer Hardware — kein "
+            "PARiS ist eine alte Java-Servlet-Variante von StarWeb. "
-            "eigenständiges System. Endpoint folgt dem Standard "
+            "Single-POST-Search gegen /starweb/paris/servlet.starweb mit "
-            "/starweb/paris/servlet.starweb?path=paris/LISSH.web (siehe "
+            "form-urlencoded Body, Hits in <tbody name='RecordRepeater'>. "
-            "dokukratie/hb.yml). Wiederverwendbar mit dem generischen "
+            "Drucksachen tragen einen S/L-Suffix für Stadtbürgerschaft "
-            "StarWebAdapter aus Issue #27. AfD durch Listenstreichung 2023 "
+            "vs. Landtag (z.B. 21/730S). Eigener PARiSHBAdapter (#21/#33). "
-            "nicht im Landtag, stattdessen Bürger in Wut (BiW). Wahltag 2027 "
+            "AfD durch Listenstreichung 2023 nicht im Landtag, stattdessen "
-            "noch nicht festgesetzt."
+            "BiW. Wahltag 2027 noch nicht festgesetzt."
        ),
    ),
    "HH": Bundesland(
--- a/app/parlamente.py
+++ b/app/parlamente.py
@ -1845,6 +1845,202 @@ class PARLISAdapter(ParlamentAdapter):
                return None
 class PARiSHBAdapter(ParlamentAdapter):
    """Bremen adapter for PARiS (paris.bremische-buergerschaft.de).

    PARiS is the old Java servlet variant of StarWeb (unlike
    HE/starweb.hessen.de, which runs on the more modern eUI). A search is
    exactly one POST against ``/starweb/paris/servlet.starweb`` with a
    form-urlencoded body. The response is a complete HTML result page
    with ``<tbody name="RecordRepeater">`` hits.

    Hit format per ``<tr name="Repeat_TYP">``:

    - ``<abbr title="Bremische Stadtbürgerschaft">S</abbr>`` or
      ``<abbr title="Bremischer Landtag">L</abbr>`` as indicator
    - ``<h2><a>TITLE</a></h2>``
    - keywords (thesaurus links, ignored)
    - ``Drs <b>21/730 S</b>`` (document number with S/L suffix)
    - ``Änderungsantrag vom 23.02.2026`` (type + date)
    - ``SPD, BÜNDNIS 90/DIE GRÜNEN, Die Linke`` (parliamentary groups)
    - ``<a href="https://www.bremische-buergerschaft.de/dokumente/...pdf">``

    Bremen has two parallel parliaments: the Bürgerschaft (Landtag) for
    state-level motions and the Stadtbürgerschaft for Bremen's municipal
    matters. Both are let through (``PARL=S OR L``) — the
    Stadtbürgerschaft share is even more interesting for the GWÖ
    assessment because many decisions are taken at municipal level.
    """

    bundesland = "HB"
    name = "Bremische Bürgerschaft (PARiS)"
    base_url = "https://paris.bremische-buergerschaft.de"
    servlet_path = "/starweb/paris/servlet.starweb"
    wahlperiode = 21

    # One match per hit row: the `<tr name="Repeat_TYP">` pattern.
    _RE_TR = re.compile(
        r'<tr\s+name="Repeat_TYP"[^>]*>([\s\S]*?)</tr\s*>',
        re.IGNORECASE,
    )
    # Title text inside `<h2><a>...</a>`.
    _RE_TITLE = re.compile(r'<h2[^>]*>\s*<a[^>]*>(.*?)</a>', re.DOTALL)
    # Group 1: number ("21/730"); group 2: optional S/L suffix.
    _RE_DRUCKSACHE = re.compile(r'Drs\s*<b>\s*(\d+/\d+)\s*([SL]?)\s*</b>')
    # Group 1: document type; group 2: German date (d.m.yyyy).
    _RE_TYP_DATUM = re.compile(r'</b>\s*,\s*([^,<\n]+?)\s+vom\s+(\d{1,2}\.\d{1,2}\.\d{4})')
    # Raw originator text that follows the date and a `<br>`.
    _RE_FRAKTIONEN_AFTER_DATUM = re.compile(r'vom\s+\d{1,2}\.\d{1,2}\.\d{4}\s*<br\s*/?\s*>\s*([^<]+)')
    # Absolute PDF link that opens in a new window.
    _RE_PDF_LINK = re.compile(
        r'<a\s+href="(https?://[^"]*\.pdf[^"]*)"[^>]*target="new"',
        re.IGNORECASE,
    )

    def _normalize_fraktion(self, text: str) -> list[str]:
        """Map free-form originator text to parliamentary groups.

        Delegates to ``extract_fraktionen`` from the sibling ``parteien``
        module, passing this adapter's ``bundesland``.
        """
        from .parteien import extract_fraktionen
        return extract_fraktionen(text, bundesland=self.bundesland)

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        """Convert a German ``d.m.yyyy`` date to ISO ``yyyy-mm-dd``.

        Returns an empty string when the input does not consist of
        exactly three dot-separated parts.
        """
        try:
            d, m, y = datum_de.split(".")
            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
        except ValueError:
            return ""

    @staticmethod
    def _strip_html(s: str) -> str:
        """Remove HTML tags, decode entities and collapse whitespace.

        Entities are decoded with ``html.unescape`` so that named and
        numeric references (``&amp;``, ``&auml;``, ``&#228;`` ...) keep
        their character instead of being blanked out. Named entities the
        standard library does not know are still replaced by a space.
        """
        import html

        text = re.sub(r"<[^>]+>", "", s)
        text = html.unescape(text)
        text = re.sub(r"&[a-zA-Z]+;", " ", text)
        # `\s` also matches the non-breaking space produced by `&nbsp;`.
        return re.sub(r"\s+", " ", text).strip()

    def _parse_record_html(self, chunk: str) -> Optional[Drucksache]:
        """Parse the inner HTML of one hit row into a ``Drucksache``.

        Returns ``None`` when the row carries no document number. Missing
        title, type, date, originators or PDF link fall back to
        ``"Drucksache <id>"``, ``"Drucksache"``, ``""``, ``""`` and ``""``
        respectively.
        """
        m_ds = self._RE_DRUCKSACHE.search(chunk)
        if not m_ds:
            return None
        # ID is stored without whitespace, suffix appended when present
        # ("21/730 S" -> "21/730S").
        drucksache = f"{m_ds.group(1)}{m_ds.group(2) or ''}"

        m_t = self._RE_TITLE.search(chunk)
        title = self._strip_html(m_t.group(1)) if m_t else f"Drucksache {drucksache}"

        m_pdf = self._RE_PDF_LINK.search(chunk)
        pdf_url = m_pdf.group(1) if m_pdf else ""

        m_td = self._RE_TYP_DATUM.search(chunk)
        if m_td:
            typ = self._strip_html(m_td.group(1))
            datum = self._datum_de_to_iso(m_td.group(2))
        else:
            typ = "Drucksache"
            datum = ""

        m_fr = self._RE_FRAKTIONEN_AFTER_DATUM.search(chunk)
        urheber = self._strip_html(m_fr.group(1)) if m_fr else ""
        fraktionen = self._normalize_fraktion(urheber)

        return Drucksache(
            drucksache=drucksache,
            title=title,
            fraktionen=fraktionen,
            datum=datum,
            link=pdf_url,
            bundesland=self.bundesland,
            typ=typ,
        )

    def _build_form_body(self, query: str) -> dict[str, str]:
        """Build the form body for a PARiS search.

        - ``path=paris/LISSHFL.web``: the LISSH procedure database
        - ``format=LISSH_BrowseVorgang_Report``: browse format with all
          hits on one page (no pagination)
        - ``01_LISSHFL_Themen``: thesaurus full-text search. The server
          does not accept a ``*`` wildcard and times out on an empty
          value, so an empty query is replaced by a high-frequency stop
          word as a catch-all.
        - ``02_LISSHFL_PARL=S OR L``: Stadtbürgerschaft + Landtag
        - ``03_LISSHFL_WP``: current electoral term (no range — a
          multi-term range took 60s+ in testing)
        """
        return {
            "path": "paris/LISSHFL.web",
            "format": "LISSH_BrowseVorgang_Report",
            "01_LISSHFL_Themen": query or "der",  # frequent stop word
            "02_LISSHFL_PARL": "S OR L",
            "03_LISSHFL_WP": str(self.wahlperiode),
        }

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Run a single-POST search against the PARiS servlet.

        Only hits whose type contains ``"antrag"`` (case-insensitive) are
        kept. At most ``limit`` hits are returned; a non-positive
        ``limit`` yields an empty list without contacting the server.
        HTTP errors and exceptions are logged and produce an empty list.
        """
        if limit <= 0:
            return []
        body = self._build_form_body(query)
        async with httpx.AsyncClient(
            timeout=60, follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.post(
                    f"{self.base_url}{self.servlet_path}",
                    data=body,
                    headers={"Content-Type": "application/x-www-form-urlencoded"},
                )
                if resp.status_code != 200:
                    logger.error("HB PARiS HTTP %s", resp.status_code)
                    return []
                results: list[Drucksache] = []
                # finditer stops scanning the page once the limit is hit.
                for m_row in self._RE_TR.finditer(resp.text):
                    doc = self._parse_record_html(m_row.group(1))
                    if not doc:
                        continue
                    if "antrag" not in (doc.typ or "").lower():
                        continue
                    results.append(doc)
                    if len(results) >= limit:
                        break
                return results
            except Exception:
                logger.exception("HB PARiS search error")
                return []

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up one document by ID via a linear scan of search results.

        ``drucksache`` must look like ``21/730`` or ``21/730S``; anything
        else returns ``None``. The scan uses the catch-all query (empty
        string, which ``_build_form_body`` turns into a stop word) because
        the server times out on a ``*`` wildcard. Only the first 200
        filtered hits are inspected.
        """
        if not re.match(r"(\d+/\d+)([SL]?)$", drucksache):
            return None
        results = await self.search("", limit=200)
        for d in results:
            if d.drucksache == drucksache:
                return d
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the document's PDF and return its extracted text.

        Returns ``None`` when the document or its link cannot be found,
        the download does not answer with HTTP 200, or any exception
        occurs (which is logged).
        """
        import fitz
        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None
        async with httpx.AsyncClient(
            timeout=60, follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
        ) as client:
            try:
                resp = await client.get(doc.link)
                if resp.status_code != 200:
                    return None
                pdf = fitz.open(stream=resp.content, filetype="pdf")
                try:
                    return "".join(page.get_text() for page in pdf)
                finally:
                    # Release the PDF even when text extraction fails.
                    pdf.close()
            except Exception:
                logger.exception("HB PARiS PDF download error for %s", drucksache)
                return None
 class StarWebHEAdapter(ParlamentAdapter):
    """Hessen-spezifischer eUI-Adapter (#24/#30).
@ -2307,6 +2503,7 @@ class BundestagAdapter(ParlamentAdapter):
 # Registry of adapters
 ADAPTERS = {
    "BUND": BundestagAdapter(),
    "HB": PARiSHBAdapter(),
    "HE": StarWebHEAdapter(),
    "NRW": NRWAdapter(),
    "LSA": PortalaAdapter(
--- a/tests/test_bundeslaender.py
+++ b/tests/test_bundeslaender.py
@ -69,9 +69,13 @@ class TestClassificationFix48:
    def test_th_is_parldok_not_starweb(self):
        """The TH entry must be classified with doku_system "ParlDok"."""
        assert BUNDESLAENDER["TH"].doku_system == "ParlDok"
-    def test_hb_is_starweb_not_paris(self):
+    def test_hb_is_paris_starweb_variant(self):
-        """PARiS is just a StarWeb skin — must be classified as StarWeb."""
+        """PARiS war als StarWeb-Skin klassifiziert (#48), nach #21/#33
-        assert BUNDESLAENDER["HB"].doku_system == "StarWeb"
+        differenzieren wir auf "PARiS" weil die Servlet-API-Konvention
        deutlich von der modernen StarWeb-Familie abweicht (Form-POST
        statt browse.tt.json/SearchAndDisplay).
        """
        assert BUNDESLAENDER["HB"].doku_system == "PARiS"
    def test_sn_is_eigensystem_not_parldok(self):
        """EDAS is ASP.NET-Webforms, NOT ParlDok-compatible with MV."""