From b5ae8894d46d459d829a71de747a56f62134d503 Mon Sep 17 00:00:00 2001 From: Dotty Dotter Date: Wed, 8 Apr 2026 19:01:00 +0200 Subject: [PATCH] =?UTF-8?q?ParLDokAdapter:=20Volltext=20(#12)=20deaktivier?= =?UTF-8?q?en=20=E2=80=94=20einheitlich=20Title-Filter=20(#18)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Server-side facet_fulltext-Suche aus #12 war asymmetrisch zu BE/LSA (beide nur Title-Filter über 730d-Window aus #13). User-Entscheidung 2026-04-08: einheitliches Verhalten ist wichtiger als das beste Verhalten in 2 von 4 Adaptern. Konkrete Änderungen: - _build_search_body() schickt query nicht mehr server-side. Der query-Parameter bleibt in der Signatur als unused-mit-del, weil die Wieder-Aktivierung später ein Drop-in sein soll wenn die PortalaAdapter-Variante reverse-engineered wurde. - _initial_search() und _paginated_hits() ohne query-Parameter. - search() macht clientseitigen Title+Urheber-Filter wie der PortalaAdapter — same Codepfad, einheitliches Verhalten. - get_document() nutzt die unveränderte Pagination. - FACET_FULLTEXT-Konstante und _fulltext_id-Helper bleiben im Code als Dokumentation für die spätere Re-Aktivierung. Im Docstring ist die Tag-Form festgehalten. Folgen: - MV "Schule" ist von 20 (mit Volltext) auf 3 zurück (Title-Filter über die letzten 1000 Drucksachen). Gleiches Niveau wie BE/LSA pre-#13. - Browse-Mode (no query) ist unverändert: ~10 hits in ~25s, MAX_PAGES=10. - Wenn das später nicht reicht: #16 (UI-Split DB vs. Landtag) und ein optionaler "echter Volltext"-Toggle (#17 closed-as-deferred) bleiben als Folge-Optionen. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- app/parlamente.py | 92 ++++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 53 deletions(-) diff --git a/app/parlamente.py b/app/parlamente.py index 82c68ae..db35487 100644 --- a/app/parlamente.py +++ b/app/parlamente.py @@ -961,35 +961,27 @@ class ParLDokAdapter(ParlamentAdapter): def _build_search_body(self, *, length: int = 100, query: str = "") -> dict: """Build the JSON payload for the initial ``Fulltext/Search`` call. - Filters by Wahlperiode + optional server-side full-text search. - Type/kind filtering still happens client-side because the - facet_type/facet_kind value IDs are instance-specific and would - require an extra ``Fulltext/Filter`` round trip to discover. + Filters by Wahlperiode only — type/kind/fulltext filtering all + happen client-side after the hit list is paginated. The + ``query`` parameter is accepted for API compatibility but is + currently NOT forwarded to the server (#18: einheitliche + client-side Title-Suche, kein Server-Volltext, weil das + Verhalten zwischen Adaptern sonst asymmetrisch wird). The + ``FACET_FULLTEXT`` constant and :meth:`_fulltext_id` helper + are kept around as documentation for the previous #12 + server-side variant — when fulltext gets uniformly + re-introduced later, the dormant tag is just:: + + {"type": self.FACET_FULLTEXT, + "id": self._fulltext_id(query), + "fulltext": query, "label": query, "field": "Alle"} Pagination beyond the first page goes through ``Fulltext/Resultpage`` — the ``Search`` endpoint itself ignores any non-zero ``Start``. - - The full-text tag schema is reverse-engineered from - ``pd.addInput`` in ``bundle.js`` and matches the SPA payload - verbatim:: - - {"type": 0, "id": "", "fulltext": "", - "label": "", "field": "Alle"} - - ``field="Alle"`` means "search all indexed fields" - (``pd.currentFTSearchMode`` default). The server tokenizes - the term and applies AND-semantics across whitespace. 
""" + del query # explicitly unused — see docstring tags: list[dict] = [{"type": self.FACET_LP, "id": self.wahlperiode}] - if query: - tags.append({ - "type": self.FACET_FULLTEXT, - "id": self._fulltext_id(query), - "fulltext": query, - "label": query, - "field": "Alle", - }) return { "devicekey": "", "max": length, @@ -1066,18 +1058,15 @@ class ParLDokAdapter(ParlamentAdapter): return None async def _initial_search( - self, client: httpx.AsyncClient, *, length: int, query: str = "", + self, client: httpx.AsyncClient, *, length: int, ) -> tuple[Optional[int], list[dict]]: """Run the initial ``Fulltext/Search`` and return ``(queryid, docs)``. The ``queryid`` is needed for subsequent ``Fulltext/Resultpage`` calls. ParlDok ignores any non-zero ``Start`` on this endpoint — the first 100 hits are the only ones reachable via ``Search``. - - ``query`` is sent server-side as a ``facet_fulltext`` tag — see - ``_build_search_body``. """ - body = self._build_search_body(length=length, query=query) + body = self._build_search_body(length=length) inner = await self._post_json(client, "Fulltext/Search", body) if not inner: return None, [] @@ -1104,25 +1093,16 @@ class ParLDokAdapter(ParlamentAdapter): headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"}, ) - async def _paginated_hits( - self, client: httpx.AsyncClient, *, query: str = "", - ): - """Async iterator over Drucksachen-style hits across all pages. + async def _paginated_hits(self, client: httpx.AsyncClient): + """Async iterator over Drucksachen-style hits across pages. Yields raw hit dicts in newest-first order. The first batch comes from ``Fulltext/Search``, subsequent batches from ``Fulltext/Resultpage`` using the queryid the server returned for the initial call. Stops when a page comes back empty, undersized, - or after ``MAX_PAGES`` iterations. 
- - ``query`` is forwarded as a server-side full-text filter to - ``_initial_search``; the resulting ``queryid`` is bound to that - filter, so subsequent ``Resultpage`` calls automatically inherit - it without needing to repeat the tag. + or after :attr:`MAX_PAGES` iterations. """ - queryid, hits = await self._initial_search( - client, length=self.PAGE_SIZE, query=query, - ) + queryid, hits = await self._initial_search(client, length=self.PAGE_SIZE) for hit in hits: yield hit if not queryid or len(hits) < self.PAGE_SIZE: @@ -1148,29 +1128,30 @@ class ParLDokAdapter(ParlamentAdapter): # Anträge are ~3% of all hits in MV, so 1000 raw → ~30 Anträge, more # than enough for the typical UI request (limit 5..20). Filtered # queries that find nothing in the last 1000 docs return empty - # rather than scan the entire WP. + # rather than scan the entire WP — same trade-off as the BE/LSA + # PortalaAdapter quick-win window. MAX_PAGES = 10 async def search(self, query: str, limit: int = 20) -> list[Drucksache]: - """Search recent documents of the configured Wahlperiode. + """Search the configured Wahlperiode, sorted newest-first. - Server-side full-text search via the ``facet_fulltext`` tag (#12) - when ``query`` is non-empty; otherwise pure browse mode. The - server returns the WP sorted newest-first across all document - kinds, the client keeps only ``Antrag``-typed Drucksachen and - dedupes by lp/number (ParlDok returns the same Drucksache - multiple times when it appears in several Vorgänge/Beratungen). + #18: einheitliches Verhalten — Server filtert nur nach WP, der + Client paginiert über höchstens ``MAX_PAGES`` Seiten (die neuesten Treffer, nicht die ganze WP) und filtert lokal nach + Treffern in Titel oder Urheber. Volltext-Filter (#12) ist + zurückgebaut, weil das Verhalten zwischen Adaptern sonst + asymmetrisch wird. Sortierung kommt vom Server (newest-first + durch ``sort=2`` in :meth:`_build_search_body`). 
- Pagination: ParlDok caps each response at 100 rows; further - pages come from ``Fulltext/Resultpage`` bound to the - server-assigned ``queryid``. + Dedupe per ``lp/number`` weil ParlDok dieselbe Drucksache + mehrfach in verschiedenen Vorgängen/Beratungen liefert. """ results: list[Drucksache] = [] seen: set[str] = set() + query_terms = [t.lower() for t in query.split() if t] if query else [] async with self._make_client() as client: await client.get(f"{self.base_url}{self.prefix}/") - async for hit in self._paginated_hits(client, query=query): + async for hit in self._paginated_hits(client): if hit.get("kind") != "Drucksache": continue if self.document_typ and hit.get("type") != self.document_typ: @@ -1183,6 +1164,11 @@ class ParLDokAdapter(ParlamentAdapter): continue seen.add(doc.drucksache) + if query_terms: + hay = f"{doc.title} {hit.get('authorhtml', '')}".lower() + if not all(t in hay for t in query_terms): + continue + results.append(doc) if len(results) >= limit: return results