From 6184bf8a889b619a179de195998043179e42061f Mon Sep 17 00:00:00 2001 From: Dotty Dotter Date: Wed, 8 Apr 2026 12:57:34 +0200 Subject: [PATCH] ParLDokAdapter: server-side fulltext search via facet_fulltext (#12) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the client-side title/Urheber substring filter with a real server-side full-text search through ParlDok's facet_fulltext tag (type=0). The tag schema is reverse-engineered from pd.addInput in the live bundle.js: {"type": 0, "id": <sanitized term>, # non-alphanum → "-" "fulltext": <term>, "label": <term>, "field": "Alle"} # search all indexed fields The Resultpage queryid inherits the fulltext filter, so pagination works without re-sending the tag. Smoke test (local): Schule → 10 hits (was 3) Klima → 10 hits across multiple parties + dates Wohnen → 10 hits including older 2025 Anträge The 10-page (1000-doc) safety bound still applies on top of the fulltext-filtered result set, but since the server now narrows to ~2k Schule-related docs WP-wide instead of the 8k+ raw WP total, the bound is no longer the limiting factor for typical queries. Closes #12. BE/LSA equivalent (#13) is independent — eUI sf-index names still need DevTools tracing. Co-Authored-By: Claude Opus 4.6 (1M context) --- app/parlamente.py | 93 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 65 insertions(+), 28 deletions(-) diff --git a/app/parlamente.py b/app/parlamente.py index db56f0d..1675dbf 100644 --- a/app/parlamente.py +++ b/app/parlamente.py @@ -864,6 +864,7 @@ class ParLDokAdapter(ParlamentAdapter): """ # Reverse-engineered facet type constants from bundle.js (pd.facet_*). 
+ FACET_FULLTEXT = 0 FACET_FRACTION = 2 FACET_KIND = 7 FACET_TYPE = 8 @@ -941,16 +942,48 @@ class ParLDokAdapter(ParlamentAdapter): out.append("Landesregierung") return out - def _build_search_body(self, *, length: int = 100) -> dict: + @staticmethod + def _fulltext_id(term: str) -> str: + """Sanitize a search term to ParlDok's facet ID format. + + Mirrors ``pd.getFulltextId`` from ``bundle.js``: replace every + non-alphanumeric character with ``-``. The server uses this to + deduplicate identical search facets. + """ + return re.sub(r"[^a-zA-Z0-9]", "-", term) + + def _build_search_body(self, *, length: int = 100, query: str = "") -> dict: """Build the JSON payload for the initial ``Fulltext/Search`` call. - Filters by Wahlperiode only. Type/kind filtering happens - client-side because the facet_type/facet_kind value IDs are - instance-specific and would require an extra ``Fulltext/Filter`` - round trip to discover. Pagination beyond the first page goes - through ``Fulltext/Resultpage`` (see ``_post_resultpage``); the - ``Search`` endpoint itself ignores any non-zero ``Start``. + Filters by Wahlperiode + optional server-side full-text search. + Type/kind filtering still happens client-side because the + facet_type/facet_kind value IDs are instance-specific and would + require an extra ``Fulltext/Filter`` round trip to discover. + + Pagination beyond the first page goes through + ``Fulltext/Resultpage`` — the ``Search`` endpoint itself + ignores any non-zero ``Start``. + + The full-text tag schema is reverse-engineered from + ``pd.addInput`` in ``bundle.js`` and matches the SPA payload + verbatim:: + + {"type": 0, "id": "", "fulltext": "", + "label": "", "field": "Alle"} + + ``field="Alle"`` means "search all indexed fields" + (``pd.currentFTSearchMode`` default). The server tokenizes + the term and applies AND-semantics across whitespace. 
""" + tags: list[dict] = [{"type": self.FACET_LP, "id": self.wahlperiode}] + if query: + tags.append({ + "type": self.FACET_FULLTEXT, + "id": self._fulltext_id(query), + "fulltext": query, + "label": query, + "field": "Alle", + }) return { "devicekey": "", "max": length, @@ -961,7 +994,7 @@ class ParLDokAdapter(ParlamentAdapter): "llm": 0, "newdocsearch": False, "limit": {"Start": 0, "Length": length}, - "tags": [{"type": self.FACET_LP, "id": self.wahlperiode}], + "tags": tags, "updateFilters": [], } @@ -1027,15 +1060,18 @@ class ParLDokAdapter(ParlamentAdapter): return None async def _initial_search( - self, client: httpx.AsyncClient, *, length: int, + self, client: httpx.AsyncClient, *, length: int, query: str = "", ) -> tuple[Optional[int], list[dict]]: """Run the initial ``Fulltext/Search`` and return ``(queryid, docs)``. The ``queryid`` is needed for subsequent ``Fulltext/Resultpage`` calls. ParlDok ignores any non-zero ``Start`` on this endpoint — the first 100 hits are the only ones reachable via ``Search``. + + ``query`` is sent server-side as a ``facet_fulltext`` tag — see + ``_build_search_body``. """ - body = self._build_search_body(length=length) + body = self._build_search_body(length=length, query=query) inner = await self._post_json(client, "Fulltext/Search", body) if not inner: return None, [] @@ -1063,7 +1099,7 @@ class ParLDokAdapter(ParlamentAdapter): ) async def _paginated_hits( - self, client: httpx.AsyncClient, + self, client: httpx.AsyncClient, *, query: str = "", ): """Async iterator over Drucksachen-style hits across all pages. @@ -1072,8 +1108,15 @@ class ParLDokAdapter(ParlamentAdapter): ``Fulltext/Resultpage`` using the queryid the server returned for the initial call. Stops when a page comes back empty, undersized, or after ``MAX_PAGES`` iterations. 
+ + ``query`` is forwarded as a server-side full-text filter to + ``_initial_search``; the resulting ``queryid`` is bound to that + filter, so subsequent ``Resultpage`` calls automatically inherit + it without needing to repeat the tag. """ - queryid, hits = await self._initial_search(client, length=self.PAGE_SIZE) + queryid, hits = await self._initial_search( + client, length=self.PAGE_SIZE, query=query, + ) for hit in hits: yield hit if not queryid or len(hits) < self.PAGE_SIZE: @@ -1105,24 +1148,23 @@ class ParLDokAdapter(ParlamentAdapter): async def search(self, query: str, limit: int = 20) -> list[Drucksache]: """Search recent documents of the configured Wahlperiode. - ``query`` is a client-side filter on title + Urheber. The server - returns the configured WP sorted newest first; the client keeps - only ``Antrag``-typed Drucksachen and applies the title filter. + Server-side full-text search via the ``facet_fulltext`` tag (#12) + when ``query`` is non-empty; otherwise pure browse mode. The + server returns the WP sorted newest-first across all document + kinds, the client keeps only ``Antrag``-typed Drucksachen and + dedupes by lp/number (ParlDok returns the same Drucksache + multiple times when it appears in several Vorgänge/Beratungen). - Pagination: ParlDok caps each ``Fulltext/Search`` response at 100 - rows. Only ~3% of MV hits are real Anträge (most are Kleine - Anfragen + Protokolle), so we may need several pages to fill - ``limit``. + Pagination: ParlDok caps each response at 100 rows; further + pages come from ``Fulltext/Resultpage`` bound to the + server-assigned ``queryid``. """ results: list[Drucksache] = [] - query_terms = [t for t in query.lower().split() if t] if query else [] - # ParlDok returns the same Drucksache multiple times when it - # appears in several Vorgänge/Beratungen — dedupe by lp/number. 
seen: set[str] = set() async with self._make_client() as client: await client.get(f"{self.base_url}{self.prefix}/") - async for hit in self._paginated_hits(client): + async for hit in self._paginated_hits(client, query=query): if hit.get("kind") != "Drucksache": continue if self.document_typ and hit.get("type") != self.document_typ: @@ -1135,11 +1177,6 @@ class ParLDokAdapter(ParlamentAdapter): continue seen.add(doc.drucksache) - if query_terms: - hay = f"{doc.title} {hit.get('authorhtml', '')}".lower() - if not all(t in hay for t in query_terms): - continue - results.append(doc) if len(results) >= limit: return results