Server-side facet_fulltext-Suche aus #12 war asymmetrisch zu BE/LSA (beide nur Title-Filter über 730d-Window aus #13). User-Entscheidung 2026-04-08: einheitliches Verhalten ist wichtiger als das beste Verhalten in 2 von 4 Adaptern.

Konkrete Änderungen:
- _build_search_body() schickt query nicht mehr server-side. Der query-Parameter bleibt in der Signatur, wird aber per `del` explizit als unbenutzt markiert, weil die Wieder-Aktivierung später ein Drop-in sein soll, wenn die PortalaAdapter-Variante reverse-engineered wurde.
- _initial_search() und _paginated_hits() ohne query-Parameter.
- search() macht clientseitigen Title+Urheber-Filter wie der PortalaAdapter — gleicher Codepfad, einheitliches Verhalten.
- get_document() nutzt die unveränderte Pagination.
- FACET_FULLTEXT-Konstante und _fulltext_id-Helper bleiben im Code als Dokumentation für die spätere Re-Aktivierung. Im Docstring ist die Tag-Form festgehalten.

Folgen:
- Die MV-Suche nach "Schule" geht von 20 Treffern (mit Volltext) auf 3 zurück (Title-Filter über die letzten 1000 Drucksachen). Gleiches Niveau wie BE/LSA pre-#13.
- Browse-Mode (no query) ist unverändert: ~10 hits in ~25s, MAX_PAGES=10.
- Wenn das später nicht reicht: #16 (UI-Split DB vs. Landtag) und ein optionaler "echter Volltext"-Toggle (#17 closed-as-deferred) bleiben als Folge-Optionen.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
9eda6f9f36
commit
b5ae8894d4
@ -961,35 +961,27 @@ class ParLDokAdapter(ParlamentAdapter):
|
|||||||
def _build_search_body(self, *, length: int = 100, query: str = "") -> dict:
|
def _build_search_body(self, *, length: int = 100, query: str = "") -> dict:
|
||||||
"""Build the JSON payload for the initial ``Fulltext/Search`` call.
|
"""Build the JSON payload for the initial ``Fulltext/Search`` call.
|
||||||
|
|
||||||
Filters by Wahlperiode + optional server-side full-text search.
|
Filters by Wahlperiode only — type/kind/fulltext filtering all
|
||||||
Type/kind filtering still happens client-side because the
|
happen client-side after the hit list is paginated. The
|
||||||
facet_type/facet_kind value IDs are instance-specific and would
|
``query`` parameter is accepted for API compatibility but is
|
||||||
require an extra ``Fulltext/Filter`` round trip to discover.
|
currently NOT forwarded to the server (#18: einheitliche
|
||||||
|
client-side Title-Suche, kein Server-Volltext, weil das
|
||||||
|
Verhalten zwischen Adaptern sonst asymmetrisch wird). The
|
||||||
|
``FACET_FULLTEXT`` constant and :meth:`_fulltext_id` helper
|
||||||
|
are kept around as documentation for the previous #12
|
||||||
|
server-side variant — when fulltext gets uniformly
|
||||||
|
re-introduced later, the dormant tag is just::
|
||||||
|
|
||||||
|
{"type": self.FACET_FULLTEXT,
|
||||||
|
"id": self._fulltext_id(query),
|
||||||
|
"fulltext": query, "label": query, "field": "Alle"}
|
||||||
|
|
||||||
Pagination beyond the first page goes through
|
Pagination beyond the first page goes through
|
||||||
``Fulltext/Resultpage`` — the ``Search`` endpoint itself
|
``Fulltext/Resultpage`` — the ``Search`` endpoint itself
|
||||||
ignores any non-zero ``Start``.
|
ignores any non-zero ``Start``.
|
||||||
|
|
||||||
The full-text tag schema is reverse-engineered from
|
|
||||||
``pd.addInput`` in ``bundle.js`` and matches the SPA payload
|
|
||||||
verbatim::
|
|
||||||
|
|
||||||
{"type": 0, "id": "<sanitized>", "fulltext": "<raw>",
|
|
||||||
"label": "<raw>", "field": "Alle"}
|
|
||||||
|
|
||||||
``field="Alle"`` means "search all indexed fields"
|
|
||||||
(``pd.currentFTSearchMode`` default). The server tokenizes
|
|
||||||
the term and applies AND-semantics across whitespace.
|
|
||||||
"""
|
"""
|
||||||
|
del query # explicitly unused — see docstring
|
||||||
tags: list[dict] = [{"type": self.FACET_LP, "id": self.wahlperiode}]
|
tags: list[dict] = [{"type": self.FACET_LP, "id": self.wahlperiode}]
|
||||||
if query:
|
|
||||||
tags.append({
|
|
||||||
"type": self.FACET_FULLTEXT,
|
|
||||||
"id": self._fulltext_id(query),
|
|
||||||
"fulltext": query,
|
|
||||||
"label": query,
|
|
||||||
"field": "Alle",
|
|
||||||
})
|
|
||||||
return {
|
return {
|
||||||
"devicekey": "",
|
"devicekey": "",
|
||||||
"max": length,
|
"max": length,
|
||||||
@ -1066,18 +1058,15 @@ class ParLDokAdapter(ParlamentAdapter):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
async def _initial_search(
|
async def _initial_search(
|
||||||
self, client: httpx.AsyncClient, *, length: int, query: str = "",
|
self, client: httpx.AsyncClient, *, length: int,
|
||||||
) -> tuple[Optional[int], list[dict]]:
|
) -> tuple[Optional[int], list[dict]]:
|
||||||
"""Run the initial ``Fulltext/Search`` and return ``(queryid, docs)``.
|
"""Run the initial ``Fulltext/Search`` and return ``(queryid, docs)``.
|
||||||
|
|
||||||
The ``queryid`` is needed for subsequent ``Fulltext/Resultpage``
|
The ``queryid`` is needed for subsequent ``Fulltext/Resultpage``
|
||||||
calls. ParlDok ignores any non-zero ``Start`` on this endpoint —
|
calls. ParlDok ignores any non-zero ``Start`` on this endpoint —
|
||||||
the first 100 hits are the only ones reachable via ``Search``.
|
the first 100 hits are the only ones reachable via ``Search``.
|
||||||
|
|
||||||
``query`` is sent server-side as a ``facet_fulltext`` tag — see
|
|
||||||
``_build_search_body``.
|
|
||||||
"""
|
"""
|
||||||
body = self._build_search_body(length=length, query=query)
|
body = self._build_search_body(length=length)
|
||||||
inner = await self._post_json(client, "Fulltext/Search", body)
|
inner = await self._post_json(client, "Fulltext/Search", body)
|
||||||
if not inner:
|
if not inner:
|
||||||
return None, []
|
return None, []
|
||||||
@ -1104,25 +1093,16 @@ class ParLDokAdapter(ParlamentAdapter):
|
|||||||
headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
|
headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
|
||||||
)
|
)
|
||||||
|
|
||||||
async def _paginated_hits(
|
async def _paginated_hits(self, client: httpx.AsyncClient):
|
||||||
self, client: httpx.AsyncClient, *, query: str = "",
|
"""Async iterator over Drucksachen-style hits across pages.
|
||||||
):
|
|
||||||
"""Async iterator over Drucksachen-style hits across all pages.
|
|
||||||
|
|
||||||
Yields raw hit dicts in newest-first order. The first batch comes
|
Yields raw hit dicts in newest-first order. The first batch comes
|
||||||
from ``Fulltext/Search``, subsequent batches from
|
from ``Fulltext/Search``, subsequent batches from
|
||||||
``Fulltext/Resultpage`` using the queryid the server returned for
|
``Fulltext/Resultpage`` using the queryid the server returned for
|
||||||
the initial call. Stops when a page comes back empty, undersized,
|
the initial call. Stops when a page comes back empty, undersized,
|
||||||
or after ``MAX_PAGES`` iterations.
|
or after :attr:`MAX_PAGES` iterations.
|
||||||
|
|
||||||
``query`` is forwarded as a server-side full-text filter to
|
|
||||||
``_initial_search``; the resulting ``queryid`` is bound to that
|
|
||||||
filter, so subsequent ``Resultpage`` calls automatically inherit
|
|
||||||
it without needing to repeat the tag.
|
|
||||||
"""
|
"""
|
||||||
queryid, hits = await self._initial_search(
|
queryid, hits = await self._initial_search(client, length=self.PAGE_SIZE)
|
||||||
client, length=self.PAGE_SIZE, query=query,
|
|
||||||
)
|
|
||||||
for hit in hits:
|
for hit in hits:
|
||||||
yield hit
|
yield hit
|
||||||
if not queryid or len(hits) < self.PAGE_SIZE:
|
if not queryid or len(hits) < self.PAGE_SIZE:
|
||||||
@ -1148,29 +1128,30 @@ class ParLDokAdapter(ParlamentAdapter):
|
|||||||
# Anträge are ~3% of all hits in MV, so 1000 raw → ~30 Anträge, more
|
# Anträge are ~3% of all hits in MV, so 1000 raw → ~30 Anträge, more
|
||||||
# than enough for the typical UI request (limit 5..20). Filtered
|
# than enough for the typical UI request (limit 5..20). Filtered
|
||||||
# queries that find nothing in the last 1000 docs return empty
|
# queries that find nothing in the last 1000 docs return empty
|
||||||
# rather than scan the entire WP.
|
# rather than scan the entire WP — same trade-off as the BE/LSA
|
||||||
|
# PortalaAdapter quick-win window.
|
||||||
MAX_PAGES = 10
|
MAX_PAGES = 10
|
||||||
|
|
||||||
async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
|
async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
|
||||||
"""Search recent documents of the configured Wahlperiode.
|
"""Search the configured Wahlperiode, sorted newest-first.
|
||||||
|
|
||||||
Server-side full-text search via the ``facet_fulltext`` tag (#12)
|
#18: einheitliches Verhalten — Server filtert nur nach WP, der
|
||||||
when ``query`` is non-empty; otherwise pure browse mode. The
|
Client paginiert über maximal ``MAX_PAGES`` Seiten der WP und filtert lokal nach
|
||||||
server returns the WP sorted newest-first across all document
|
Treffern in Titel oder Urheber. Volltext-Filter (#12) ist
|
||||||
kinds, the client keeps only ``Antrag``-typed Drucksachen and
|
zurückgebaut, weil das Verhalten zwischen Adaptern sonst
|
||||||
dedupes by lp/number (ParlDok returns the same Drucksache
|
asymmetrisch wird. Sortierung kommt vom Server (newest-first
|
||||||
multiple times when it appears in several Vorgänge/Beratungen).
|
durch ``sort=2`` in :meth:`_build_search_body`).
|
||||||
|
|
||||||
Pagination: ParlDok caps each response at 100 rows; further
|
Dedupe per ``lp/number`` weil ParlDok dieselbe Drucksache
|
||||||
pages come from ``Fulltext/Resultpage`` bound to the
|
mehrfach in verschiedenen Vorgängen/Beratungen liefert.
|
||||||
server-assigned ``queryid``.
|
|
||||||
"""
|
"""
|
||||||
results: list[Drucksache] = []
|
results: list[Drucksache] = []
|
||||||
seen: set[str] = set()
|
seen: set[str] = set()
|
||||||
|
query_terms = [t.lower() for t in query.split() if t] if query else []
|
||||||
|
|
||||||
async with self._make_client() as client:
|
async with self._make_client() as client:
|
||||||
await client.get(f"{self.base_url}{self.prefix}/")
|
await client.get(f"{self.base_url}{self.prefix}/")
|
||||||
async for hit in self._paginated_hits(client, query=query):
|
async for hit in self._paginated_hits(client):
|
||||||
if hit.get("kind") != "Drucksache":
|
if hit.get("kind") != "Drucksache":
|
||||||
continue
|
continue
|
||||||
if self.document_typ and hit.get("type") != self.document_typ:
|
if self.document_typ and hit.get("type") != self.document_typ:
|
||||||
@ -1183,6 +1164,11 @@ class ParLDokAdapter(ParlamentAdapter):
|
|||||||
continue
|
continue
|
||||||
seen.add(doc.drucksache)
|
seen.add(doc.drucksache)
|
||||||
|
|
||||||
|
if query_terms:
|
||||||
|
hay = f"{doc.title} {hit.get('authorhtml', '')}".lower()
|
||||||
|
if not all(t in hay for t in query_terms):
|
||||||
|
continue
|
||||||
|
|
||||||
results.append(doc)
|
results.append(doc)
|
||||||
if len(results) >= limit:
|
if len(results) >= limit:
|
||||||
return results
|
return results
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user