From 9eda6f9f36045c7a1753a42ab4d36e9d4019d984 Mon Sep 17 00:00:00 2001 From: Dotty Dotter Date: Wed, 8 Apr 2026 13:58:34 +0200 Subject: [PATCH] PortalaAdapter: quick-win bigger window + chunksize for BE/LSA (#13) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real server-side fulltext search through the eUI sf-Index requires reverse-engineering the LSA/BE-specific search field (the obvious candidates VOLL, VOLL.main, WEV62 and bare-term-without-sf all return zero hits when probed). Without browser DevTools to capture a real fulltext request that's a multi-hour project — split out to remain in #13 as a follow-up. This commit ships the pragmatic interim fix from #11: - BE date_window_days: 180 → 730 Berlin had a tight default window because PARDOK has ~10x more documents than PADOKA. With the bigger window the client-side title/Urheber filter reaches back across most of WP19 instead of just the last six months. - chunksize in PortalaAdapter.search() raised on both paths (previously a fixed 100 with a query and plain `limit` without). The query-filtered path now pulls up to max(limit*10, 500) records per page so the title-filter has enough material; the unfiltered browse path now requests max(limit*2, 100). - httpx timeout 30s → 60s. LSA's report.tt.html occasionally takes 30+s on cold start; warm requests are <10s. Smoke test (local): BE Schule: 15 hits (was 0) LSA Schule: 14 hits (was N/A; same path) Live verification follows after deploy. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- app/parlamente.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/app/parlamente.py b/app/parlamente.py index 1675dbf..82c68ae 100644 --- a/app/parlamente.py +++ b/app/parlamente.py @@ -732,7 +732,9 @@ class PortalaAdapter(ParlamentAdapter): report_html = f"{self.base_url}{self.portala_path}/report.tt.html" async with httpx.AsyncClient( - timeout=30, + # Bumped from 30s for #13 quick-win: chunksize=500 against the + # LSA report.tt.html endpoint occasionally takes 30+ seconds. + timeout=60, follow_redirects=True, headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"}, ) as client: @@ -757,8 +759,12 @@ class PortalaAdapter(ParlamentAdapter): return [] # Step 3: fetch the HTML hit list - # Take a generous chunk so client-side filter still has enough - chunksize = 100 if query else limit + # Take a generous chunk so the client-side title filter + # still has enough material to work with. Quick-win for #13 + # until the eUI sf-Index for real server-side fulltext is + # reverse-engineered: request at least 500 records when a + # query is given and at least 100 when browsing unfiltered. + chunksize = max(limit * 10, 500) if query else max(limit * 2, 100) report_resp = await client.post( report_html, json={"report_id": report_id, "start": 0, "chunksize": chunksize}, @@ -1298,9 +1304,13 @@ ADAPTERS = { # Berlin's ETYPF index uses different value strings — drop the # document_type subtree, fall back to client-side title filter. document_type=None, - # Tighter date window: BE has ~10x more documents than LSA, so a - # narrower window keeps the per-request payload bounded. - date_window_days=180, + # Quick-win for #13: pulled the date window from the original + # 180-day MVP up to 730 days so client-side title-filter searches + # ("Schule" etc.) reach back across more of the WP19 corpus until + # the eUI fulltext-sf is reverse-engineered. 
The chunksize cap + # in PortalaAdapter.search() keeps the per-request payload + # bounded. + date_window_days=730, pdf_url_prefix="/files/", ), "MV": ParLDokAdapter(