ParLDokAdapter: server-side fulltext search via facet_fulltext (#12)
Replaces the client-side title/Urheber substring filter with a
real server-side full-text search through ParlDok's facet_fulltext
tag (type=0). The tag schema is reverse-engineered from
pd.addInput in the live bundle.js:
    {"type": 0,
     "id": <getFulltextId(term)>,   # every char outside [a-zA-Z0-9] → "-"
     "fulltext": <raw term>,
     "label": <raw term>,
     "field": "Alle"}               # search all indexed fields
The queryid returned by the initial Fulltext/Search is bound to the
fulltext filter, so Fulltext/Resultpage pagination inherits it and
works without re-sending the tag.
Smoke test (local):
Schule → 10 hits (was 3)
Klima → 10 hits across multiple parties + dates
Wohnen → 10 hits including older 2025 Anträge
The 10-page (1000-doc) safety bound still applies on top of the
fulltext-filtered result set, but since the server now narrows
to ~2k Schule-related docs WP-wide instead of the 8k+ raw WP
total, the bound is no longer the limiting factor for typical
queries.
Closes #12. BE/LSA equivalent (#13) is independent — eUI
sf-index names still need DevTools tracing.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
bc7f4a67cb
commit
6184bf8a88
@ -864,6 +864,7 @@ class ParLDokAdapter(ParlamentAdapter):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
# Reverse-engineered facet type constants from bundle.js (pd.facet_*).
|
# Reverse-engineered facet type constants from bundle.js (pd.facet_*).
|
||||||
|
FACET_FULLTEXT = 0
|
||||||
FACET_FRACTION = 2
|
FACET_FRACTION = 2
|
||||||
FACET_KIND = 7
|
FACET_KIND = 7
|
||||||
FACET_TYPE = 8
|
FACET_TYPE = 8
|
||||||
@ -941,16 +942,48 @@ class ParLDokAdapter(ParlamentAdapter):
|
|||||||
out.append("Landesregierung")
|
out.append("Landesregierung")
|
||||||
return out
|
return out
|
||||||
|
|
||||||
def _build_search_body(self, *, length: int = 100) -> dict:
|
@staticmethod
def _fulltext_id(term: str) -> str:
    """Derive the ParlDok facet ID for a full-text search term.

    Python counterpart of ``pd.getFulltextId`` in ``bundle.js``: each
    character outside ``[a-zA-Z0-9]`` is swapped for ``-``. The
    character class is ASCII-only, so umlauts and ``ß`` are replaced
    too (``"Anträge"`` becomes ``"Antr-ge"``). Replacement is
    one-for-one; consecutive dashes are not collapsed.

    According to the reverse-engineering notes, the server uses this
    ID to deduplicate identical search facets.
    """
    # Same pattern as the JS helper; anything that is not an ASCII
    # letter or digit counts as a separator.
    non_alnum = re.compile(r"[^a-zA-Z0-9]")
    return non_alnum.sub("-", term)
|
||||||
|
|
||||||
|
def _build_search_body(self, *, length: int = 100, query: str = "") -> dict:
|
||||||
"""Build the JSON payload for the initial ``Fulltext/Search`` call.
|
"""Build the JSON payload for the initial ``Fulltext/Search`` call.
|
||||||
|
|
||||||
Filters by Wahlperiode only. Type/kind filtering happens
|
Filters by Wahlperiode + optional server-side full-text search.
|
||||||
client-side because the facet_type/facet_kind value IDs are
|
Type/kind filtering still happens client-side because the
|
||||||
instance-specific and would require an extra ``Fulltext/Filter``
|
facet_type/facet_kind value IDs are instance-specific and would
|
||||||
round trip to discover. Pagination beyond the first page goes
|
require an extra ``Fulltext/Filter`` round trip to discover.
|
||||||
through ``Fulltext/Resultpage`` (see ``_post_resultpage``); the
|
|
||||||
``Search`` endpoint itself ignores any non-zero ``Start``.
|
Pagination beyond the first page goes through
|
||||||
|
``Fulltext/Resultpage`` — the ``Search`` endpoint itself
|
||||||
|
ignores any non-zero ``Start``.
|
||||||
|
|
||||||
|
The full-text tag schema is reverse-engineered from
|
||||||
|
``pd.addInput`` in ``bundle.js`` and matches the SPA payload
|
||||||
|
verbatim::
|
||||||
|
|
||||||
|
{"type": 0, "id": "<sanitized>", "fulltext": "<raw>",
|
||||||
|
"label": "<raw>", "field": "Alle"}
|
||||||
|
|
||||||
|
``field="Alle"`` means "search all indexed fields"
|
||||||
|
(``pd.currentFTSearchMode`` default). The server tokenizes
|
||||||
|
the term and applies AND-semantics across whitespace.
|
||||||
"""
|
"""
|
||||||
|
tags: list[dict] = [{"type": self.FACET_LP, "id": self.wahlperiode}]
|
||||||
|
if query:
|
||||||
|
tags.append({
|
||||||
|
"type": self.FACET_FULLTEXT,
|
||||||
|
"id": self._fulltext_id(query),
|
||||||
|
"fulltext": query,
|
||||||
|
"label": query,
|
||||||
|
"field": "Alle",
|
||||||
|
})
|
||||||
return {
|
return {
|
||||||
"devicekey": "",
|
"devicekey": "",
|
||||||
"max": length,
|
"max": length,
|
||||||
@ -961,7 +994,7 @@ class ParLDokAdapter(ParlamentAdapter):
|
|||||||
"llm": 0,
|
"llm": 0,
|
||||||
"newdocsearch": False,
|
"newdocsearch": False,
|
||||||
"limit": {"Start": 0, "Length": length},
|
"limit": {"Start": 0, "Length": length},
|
||||||
"tags": [{"type": self.FACET_LP, "id": self.wahlperiode}],
|
"tags": tags,
|
||||||
"updateFilters": [],
|
"updateFilters": [],
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1027,15 +1060,18 @@ class ParLDokAdapter(ParlamentAdapter):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
async def _initial_search(
|
async def _initial_search(
|
||||||
self, client: httpx.AsyncClient, *, length: int,
|
self, client: httpx.AsyncClient, *, length: int, query: str = "",
|
||||||
) -> tuple[Optional[int], list[dict]]:
|
) -> tuple[Optional[int], list[dict]]:
|
||||||
"""Run the initial ``Fulltext/Search`` and return ``(queryid, docs)``.
|
"""Run the initial ``Fulltext/Search`` and return ``(queryid, docs)``.
|
||||||
|
|
||||||
The ``queryid`` is needed for subsequent ``Fulltext/Resultpage``
|
The ``queryid`` is needed for subsequent ``Fulltext/Resultpage``
|
||||||
calls. ParlDok ignores any non-zero ``Start`` on this endpoint —
|
calls. ParlDok ignores any non-zero ``Start`` on this endpoint —
|
||||||
the first 100 hits are the only ones reachable via ``Search``.
|
the first 100 hits are the only ones reachable via ``Search``.
|
||||||
|
|
||||||
|
``query`` is sent server-side as a ``facet_fulltext`` tag — see
|
||||||
|
``_build_search_body``.
|
||||||
"""
|
"""
|
||||||
body = self._build_search_body(length=length)
|
body = self._build_search_body(length=length, query=query)
|
||||||
inner = await self._post_json(client, "Fulltext/Search", body)
|
inner = await self._post_json(client, "Fulltext/Search", body)
|
||||||
if not inner:
|
if not inner:
|
||||||
return None, []
|
return None, []
|
||||||
@ -1063,7 +1099,7 @@ class ParLDokAdapter(ParlamentAdapter):
|
|||||||
)
|
)
|
||||||
|
|
||||||
async def _paginated_hits(
|
async def _paginated_hits(
|
||||||
self, client: httpx.AsyncClient,
|
self, client: httpx.AsyncClient, *, query: str = "",
|
||||||
):
|
):
|
||||||
"""Async iterator over Drucksachen-style hits across all pages.
|
"""Async iterator over Drucksachen-style hits across all pages.
|
||||||
|
|
||||||
@ -1072,8 +1108,15 @@ class ParLDokAdapter(ParlamentAdapter):
|
|||||||
``Fulltext/Resultpage`` using the queryid the server returned for
|
``Fulltext/Resultpage`` using the queryid the server returned for
|
||||||
the initial call. Stops when a page comes back empty, undersized,
|
the initial call. Stops when a page comes back empty, undersized,
|
||||||
or after ``MAX_PAGES`` iterations.
|
or after ``MAX_PAGES`` iterations.
|
||||||
|
|
||||||
|
``query`` is forwarded as a server-side full-text filter to
|
||||||
|
``_initial_search``; the resulting ``queryid`` is bound to that
|
||||||
|
filter, so subsequent ``Resultpage`` calls automatically inherit
|
||||||
|
it without needing to repeat the tag.
|
||||||
"""
|
"""
|
||||||
queryid, hits = await self._initial_search(client, length=self.PAGE_SIZE)
|
queryid, hits = await self._initial_search(
|
||||||
|
client, length=self.PAGE_SIZE, query=query,
|
||||||
|
)
|
||||||
for hit in hits:
|
for hit in hits:
|
||||||
yield hit
|
yield hit
|
||||||
if not queryid or len(hits) < self.PAGE_SIZE:
|
if not queryid or len(hits) < self.PAGE_SIZE:
|
||||||
@ -1105,24 +1148,23 @@ class ParLDokAdapter(ParlamentAdapter):
|
|||||||
async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
|
async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
|
||||||
"""Search recent documents of the configured Wahlperiode.
|
"""Search recent documents of the configured Wahlperiode.
|
||||||
|
|
||||||
``query`` is a client-side filter on title + Urheber. The server
|
Server-side full-text search via the ``facet_fulltext`` tag (#12)
|
||||||
returns the configured WP sorted newest first; the client keeps
|
when ``query`` is non-empty; otherwise pure browse mode. The
|
||||||
only ``Antrag``-typed Drucksachen and applies the title filter.
|
server returns the WP sorted newest-first across all document
|
||||||
|
kinds, the client keeps only ``Antrag``-typed Drucksachen and
|
||||||
|
dedupes by lp/number (ParlDok returns the same Drucksache
|
||||||
|
multiple times when it appears in several Vorgänge/Beratungen).
|
||||||
|
|
||||||
Pagination: ParlDok caps each ``Fulltext/Search`` response at 100
|
Pagination: ParlDok caps each response at 100 rows; further
|
||||||
rows. Only ~3% of MV hits are real Anträge (most are Kleine
|
pages come from ``Fulltext/Resultpage`` bound to the
|
||||||
Anfragen + Protokolle), so we may need several pages to fill
|
server-assigned ``queryid``.
|
||||||
``limit``.
|
|
||||||
"""
|
"""
|
||||||
results: list[Drucksache] = []
|
results: list[Drucksache] = []
|
||||||
query_terms = [t for t in query.lower().split() if t] if query else []
|
|
||||||
# ParlDok returns the same Drucksache multiple times when it
|
|
||||||
# appears in several Vorgänge/Beratungen — dedupe by lp/number.
|
|
||||||
seen: set[str] = set()
|
seen: set[str] = set()
|
||||||
|
|
||||||
async with self._make_client() as client:
|
async with self._make_client() as client:
|
||||||
await client.get(f"{self.base_url}{self.prefix}/")
|
await client.get(f"{self.base_url}{self.prefix}/")
|
||||||
async for hit in self._paginated_hits(client):
|
async for hit in self._paginated_hits(client, query=query):
|
||||||
if hit.get("kind") != "Drucksache":
|
if hit.get("kind") != "Drucksache":
|
||||||
continue
|
continue
|
||||||
if self.document_typ and hit.get("type") != self.document_typ:
|
if self.document_typ and hit.get("type") != self.document_typ:
|
||||||
@ -1135,11 +1177,6 @@ class ParLDokAdapter(ParlamentAdapter):
|
|||||||
continue
|
continue
|
||||||
seen.add(doc.drucksache)
|
seen.add(doc.drucksache)
|
||||||
|
|
||||||
if query_terms:
|
|
||||||
hay = f"{doc.title} {hit.get('authorhtml', '')}".lower()
|
|
||||||
if not all(t in hay for t in query_terms):
|
|
||||||
continue
|
|
||||||
|
|
||||||
results.append(doc)
|
results.append(doc)
|
||||||
if len(results) >= limit:
|
if len(results) >= limit:
|
||||||
return results
|
return results
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user