Activate Thüringen via ParLDokAdapter reuse + filter widening (#25, Phase 1)

Thüringen läuft auf parldok.thueringer-landtag.de mit ParlDok 8.3.5 (J3S GmbH) — exakt dieselbe Version wie MV. Aber TH packt seine Anträge unter zusammengesetzten type-Strings ("Antrag gemäß § 79 GO", "Antrag gemäß § 74 (2) GO") und kind="Vorlage" statt der MV-Variante kind="Drucksache"/type="Antrag". Strict-Match auf "Antrag" hat 0 Treffer geliefert. Lösung: ParLDokAdapter um zwei Konstruktor-Parameter erweitert: - document_typ_substring=True → Substring-Match auf type-Feld ("Antrag" matched "Antrag gemäß § 79 GO", "Alternativantrag" usw.) - kinds=["Drucksache", "Vorlage"] → erweiterte kind-Liste Defaults sind backward-kompatibel (Substring-Match aus, kinds nur Drucksache), sodass MV und HH unverändert weiterlaufen. _hit_matches_filters() als zentraler Filter-Helper extrahiert; search() nutzt ihn. get_document() wendet ihn bewusst nicht an, weil dort beliebige Drucksachen aufrufbar sein müssen, unabhängig vom search-Time-Filter. Hostname-Korrektur: parldok.thueringen.de redirected per 303 auf parldok.thueringer-landtag.de. doku_base_url in bundeslaender.py auf den neuen Host umgestellt. Smoke-Test (lokal): TH q="": 8 hits in 3.3s TH q="Schule": 2 hits in 25.7s (Lernmittelbeschaffung, Modernisierung Bund-Länder-Vereinbarung — beide Schul-bezogen) TH q="Klima": 0 hits (keine in den letzten 1000 Drucksachen) Damit ist Phase 1 (3/3) komplett. Nächstes Phase-2-Issue: #27 BB als StarWebAdapter-Template. Phase 1 (3/3) aus Roadmap-Issue #49. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 23:48:02 +02:00 · 2026-04-08 23:48:02 +02:00 · dc0bb07c12
commit dc0bb07c12
parent 916c5d84d7
2 changed files with 60 additions and 13 deletions
--- a/app/bundeslaender.py
+++ b/app/bundeslaender.py
@ -379,18 +379,19 @@ BUNDESLAENDER: dict[str, Bundesland] = {
        regierungsfraktionen=["CDU", "BSW", "SPD"],
        landtagsfraktionen=["AfD", "CDU", "LINKE", "BSW", "SPD"],
        doku_system="ParlDok",
-        doku_base_url="https://parldok.thueringen.de",
+        doku_base_url="https://parldok.thueringer-landtag.de",
        drucksache_format="8/1234",
        dokukratie_scraper="th",
        aktiv=True,
        anmerkung=(
            "Erste Brombeer-Koalition Deutschlands (CDU+BSW+SPD) als "
-            "Minderheitsregierung mit 44 von 88 Sitzen. Mario Voigt (CDU) "
+            "Minderheitsregierung mit 44 von 88 Sitzen. Mario Voigt "
-            "seit Dezember 2024 MP. Doku-System ist ParlDok (siehe "
+            "(CDU) seit Dezember 2024 MP. ParlDok 8.3.5 (J3S GmbH) — "
-            "dokukratie/th.yml mit DokumententypId/LegislaturperiodenNummer-"
+            "EXAKT dieselbe Version wie MV. ParLDokAdapter direkt "
-            "Form-Feldern), nicht StarWeb wie ursprünglich klassifiziert. "
+            "wiederverwendbar als Registry-Eintrag (#25). Achtung: "
-            "Vor Implementierung mit `curl parldok.thueringen.de/parldok/` "
+            "alter Hostname parldok.thueringen.de redirected per 303 "
-            "verifizieren ob das Live-System ParlDok 8.x SPA wie MV ist — "
+            "auf parldok.thueringer-landtag.de — neuer Hostname ist "
-            "dann ist der ParLDokAdapter direkt wiederverwendbar."
+            "der korrekte."
        ),
    ),
 }
--- a/app/parlamente.py
+++ b/app/parlamente.py
@ -889,6 +889,8 @@ class ParLDokAdapter(ParlamentAdapter):
        wahlperiode: int,
        prefix: str = "/parldok",
        document_typ: str = "Antrag",
        document_typ_substring: bool = False,
        kinds: Optional[list[str]] = None,
    ) -> None:
        """Configure a ParlDok 8.x adapter for one specific parliament.
@ -902,6 +904,16 @@ class ParLDokAdapter(ParlamentAdapter):
            document_typ: client-side filter on the ``type`` field of
                each hit ("Antrag", "Gesetzentwurf", …). Set to empty
                string to disable type filtering.
            document_typ_substring: if True, ``document_typ`` is matched
                as a substring against the hit's ``type`` field instead
                of an exact match. Needed for instances where the
                Drucksachen-Anträge live under composite type strings
                like ``"Antrag gemäß § 79 GO"`` (Thüringen) — strict
                ``"Antrag"`` would never match.
            kinds: optional list of acceptable ``kind`` values. Defaults
                to ``["Drucksache"]`` if None — but TH packs its Anträge
                under ``kind="Vorlage"`` so the parameter has to be
                widened there.
        """
        self.bundesland = bundesland
        self.name = name
@ -909,6 +921,27 @@ class ParLDokAdapter(ParlamentAdapter):
        self.prefix = "/" + prefix.strip("/")
        self.wahlperiode = wahlperiode
        self.document_typ = document_typ
        self.document_typ_substring = document_typ_substring
        self.kinds = kinds if kinds is not None else ["Drucksache"]
    def _hit_matches_filters(self, hit: dict) -> bool:
        """Apply the kind/typ filters to a raw hit dict.
        Centralised so the search loop can short-circuit cleanly. ``hit``
        comes from ``Fulltext/Search`` or ``Fulltext/Resultpage`` JSON
        responses; both share the same record schema.
        """
        if self.kinds and hit.get("kind") not in self.kinds:
            return False
        hit_type = (hit.get("type") or "").strip()
        if self.document_typ:
            if self.document_typ_substring:
                if self.document_typ not in hit_type:
                    return False
            else:
                if hit_type != self.document_typ:
                    return False
        return True
    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
@ -1159,9 +1192,7 @@ class ParLDokAdapter(ParlamentAdapter):
        async with self._make_client() as client:
            await client.get(f"{self.base_url}{self.prefix}/")
            async for hit in self._paginated_hits(client):
-                if hit.get("kind") != "Drucksache":
+                if not self._hit_matches_filters(hit):
                    continue
                if self.document_typ and hit.get("type") != self.document_typ:
                    continue
                doc = self._hit_to_drucksache(hit)
@ -1199,8 +1230,9 @@ class ParLDokAdapter(ParlamentAdapter):
        async with self._make_client() as client:
            await client.get(f"{self.base_url}{self.prefix}/")
            async for hit in self._paginated_hits(client):
-                if hit.get("kind") != "Drucksache":
+                # Don't apply doc-type filters here — get_document is
-                    continue
+                # used to look up arbitrary Drucksachen, including ones
                # whose kind/typ doesn't match the search-time filter.
                if str(hit.get("lp")) == wanted_lp and str(hit.get("number")) == wanted_num:
                    return self._hit_to_drucksache(hit)
        return None
@ -1708,6 +1740,20 @@ ADAPTERS = {
        prefix="/parldok",
        document_typ="Antrag",
    ),
    "TH": ParLDokAdapter(
        bundesland="TH",
        name="Thüringer Landtag (ParlDok)",
        base_url="https://parldok.thueringer-landtag.de",
        wahlperiode=8,
        prefix="/parldok",
        # TH packs Anträge under composite type strings like
        # "Antrag gemäß § 79 GO" with kind="Vorlage", not the
        # MV-style kind="Drucksache"/type="Antrag". Substring-match
        # on "Antrag" plus widened kind list catches them all.
        document_typ="Antrag",
        document_typ_substring=True,
        kinds=["Drucksache", "Vorlage"],
    ),
    "BY": BayernAdapter(),
    "BW": PARLISAdapter(
        bundesland="BW",