From 6184bf8a889b619a179de195998043179e42061f Mon Sep 17 00:00:00 2001
From: Dotty Dotter <dotty@Mac.wideopen.space>
Date: Wed, 8 Apr 2026 12:57:34 +0200
Subject: [PATCH] ParLDokAdapter: server-side fulltext search via
 facet_fulltext (#12)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the client-side title/Urheber substring filter with a
real server-side full-text search through ParlDok's facet_fulltext
tag (type=0). The tag schema is reverse-engineered from
pd.addInput in the live bundle.js:

  {"type": 0,
   "id": <getFulltextId(term)>,    # every char outside [a-zA-Z0-9] → "-"
   "fulltext": <raw term>,
   "label": <raw term>,
   "field": "Alle"}                # search all indexed fields

The queryid returned by the initial Search is bound to the fulltext
filter, so Resultpage pagination works without re-sending the tag.

Smoke test (local):
  Schule → 10 hits (was 3)
  Klima  → 10 hits across multiple parties + dates
  Wohnen → 10 hits including older 2025 Anträge

The 10-page (1000-doc) safety bound still applies on top of the
fulltext-filtered result set, but since the server now narrows
to ~2k Schule-related docs WP-wide instead of the 8k+ raw WP
total, the bound is no longer the limiting factor for typical
queries.

Closes #12. BE/LSA equivalent (#13) is independent — eUI
sf-index names still need DevTools tracing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 app/parlamente.py | 93 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 65 insertions(+), 28 deletions(-)
diff --git a/app/parlamente.py b/app/parlamente.py
index db56f0d..1675dbf 100644
--- a/app/parlamente.py
+++ b/app/parlamente.py
@@ -864,6 +864,7 @@ class ParLDokAdapter(ParlamentAdapter):
     """
 
     # Reverse-engineered facet type constants from bundle.js (pd.facet_*).
+    FACET_FULLTEXT = 0
     FACET_FRACTION = 2
     FACET_KIND = 7
     FACET_TYPE = 8
@@ -941,16 +942,48 @@ class ParLDokAdapter(ParlamentAdapter):
             out.append("Landesregierung")
         return out
 
-    def _build_search_body(self, *, length: int = 100) -> dict:
+    @staticmethod
+    def _fulltext_id(term: str) -> str:
+        """Sanitize a search term to ParlDok's facet ID format.
+
+        Mirrors ``pd.getFulltextId`` from ``bundle.js``: replace every
+        character outside ASCII ``[a-zA-Z0-9]`` (umlauts, ß too) with
+        ``-``. The server uses this to deduplicate identical search facets.
+        """
+        return re.sub(r"[^a-zA-Z0-9]", "-", term)
+
+    def _build_search_body(self, *, length: int = 100, query: str = "") -> dict:
         """Build the JSON payload for the initial ``Fulltext/Search`` call.
 
-        Filters by Wahlperiode only. Type/kind filtering happens
-        client-side because the facet_type/facet_kind value IDs are
-        instance-specific and would require an extra ``Fulltext/Filter``
-        round trip to discover. Pagination beyond the first page goes
-        through ``Fulltext/Resultpage`` (see ``_post_resultpage``); the
-        ``Search`` endpoint itself ignores any non-zero ``Start``.
+        Filters by Wahlperiode + optional server-side full-text search.
+        Type/kind filtering still happens client-side because the
+        facet_type/facet_kind value IDs are instance-specific and would
+        require an extra ``Fulltext/Filter`` round trip to discover.
+
+        Pagination beyond the first page goes through
+        ``Fulltext/Resultpage`` — the ``Search`` endpoint itself
+        ignores any non-zero ``Start``.
+
+        The full-text tag schema is reverse-engineered from
+        ``pd.addInput`` in ``bundle.js`` and matches the SPA payload
+        verbatim::
+
+            {"type": 0, "id": "<sanitized>", "fulltext": "<raw>",
+             "label": "<raw>", "field": "Alle"}
+
+        ``field="Alle"`` means "search all indexed fields"
+        (``pd.currentFTSearchMode`` default). The server tokenizes
+        the term and applies AND-semantics across whitespace.
         """
+        tags: list[dict] = [{"type": self.FACET_LP, "id": self.wahlperiode}]
+        if query:
+            tags.append({
+                "type": self.FACET_FULLTEXT,
+                "id": self._fulltext_id(query),
+                "fulltext": query,
+                "label": query,
+                "field": "Alle",
+            })
         return {
             "devicekey": "",
             "max": length,
@@ -961,7 +994,7 @@ class ParLDokAdapter(ParlamentAdapter):
             "llm": 0,
             "newdocsearch": False,
             "limit": {"Start": 0, "Length": length},
-            "tags": [{"type": self.FACET_LP, "id": self.wahlperiode}],
+            "tags": tags,
             "updateFilters": [],
         }
 
@@ -1027,15 +1060,18 @@ class ParLDokAdapter(ParlamentAdapter):
             return None
 
     async def _initial_search(
-        self, client: httpx.AsyncClient, *, length: int,
+        self, client: httpx.AsyncClient, *, length: int, query: str = "",
     ) -> tuple[Optional[int], list[dict]]:
         """Run the initial ``Fulltext/Search`` and return ``(queryid, docs)``.
 
         The ``queryid`` is needed for subsequent ``Fulltext/Resultpage``
         calls. ParlDok ignores any non-zero ``Start`` on this endpoint —
         the first 100 hits are the only ones reachable via ``Search``.
+
+        ``query`` is sent server-side as a ``facet_fulltext`` tag — see
+        ``_build_search_body``.
         """
-        body = self._build_search_body(length=length)
+        body = self._build_search_body(length=length, query=query)
         inner = await self._post_json(client, "Fulltext/Search", body)
         if not inner:
             return None, []
@@ -1063,7 +1099,7 @@ class ParLDokAdapter(ParlamentAdapter):
         )
 
     async def _paginated_hits(
-        self, client: httpx.AsyncClient,
+        self, client: httpx.AsyncClient, *, query: str = "",
     ):
         """Async iterator over Drucksachen-style hits across all pages.
 
@@ -1072,8 +1108,15 @@ class ParLDokAdapter(ParlamentAdapter):
         ``Fulltext/Resultpage`` using the queryid the server returned for
         the initial call. Stops when a page comes back empty, undersized,
         or after ``MAX_PAGES`` iterations.
+
+        ``query`` is forwarded as a server-side full-text filter to
+        ``_initial_search``; the resulting ``queryid`` is bound to that
+        filter, so subsequent ``Resultpage`` calls automatically inherit
+        it without needing to repeat the tag.
         """
-        queryid, hits = await self._initial_search(client, length=self.PAGE_SIZE)
+        queryid, hits = await self._initial_search(
+            client, length=self.PAGE_SIZE, query=query,
+        )
         for hit in hits:
             yield hit
         if not queryid or len(hits) < self.PAGE_SIZE:
@@ -1105,24 +1148,23 @@ class ParLDokAdapter(ParlamentAdapter):
     async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
         """Search recent documents of the configured Wahlperiode.
 
-        ``query`` is a client-side filter on title + Urheber. The server
-        returns the configured WP sorted newest first; the client keeps
-        only ``Antrag``-typed Drucksachen and applies the title filter.
+        Server-side full-text search via the ``facet_fulltext`` tag (#12)
+        when ``query`` is non-empty; otherwise pure browse mode. The server
+        returns the WP newest-first across all document kinds; the client
+        keeps only Drucksachen matching ``document_typ`` (if set) and
+        dedupes by lp/number (ParlDok returns the same Drucksache
+        multiple times when it appears in several Vorgänge/Beratungen).
 
-        Pagination: ParlDok caps each ``Fulltext/Search`` response at 100
-        rows. Only ~3% of MV hits are real Anträge (most are Kleine
-        Anfragen + Protokolle), so we may need several pages to fill
-        ``limit``.
+        Pagination: ParlDok caps each response at 100 rows; further
+        pages come from ``Fulltext/Resultpage`` bound to the
+        server-assigned ``queryid``.
         """
         results: list[Drucksache] = []
-        query_terms = [t for t in query.lower().split() if t] if query else []
-        # ParlDok returns the same Drucksache multiple times when it
-        # appears in several Vorgänge/Beratungen — dedupe by lp/number.
         seen: set[str] = set()
 
         async with self._make_client() as client:
             await client.get(f"{self.base_url}{self.prefix}/")
-            async for hit in self._paginated_hits(client):
+            async for hit in self._paginated_hits(client, query=query):
                 if hit.get("kind") != "Drucksache":
                     continue
                 if self.document_typ and hit.get("type") != self.document_typ:
@@ -1135,11 +1177,6 @@ class ParLDokAdapter(ParlamentAdapter):
                     continue
                 seen.add(doc.drucksache)
 
-                if query_terms:
-                    hay = f"{doc.title} {hit.get('authorhtml', '')}".lower()
-                    if not all(t in hay for t in query_terms):
-                        continue
-
                 results.append(doc)
                 if len(results) >= limit:
                     return results