diff --git a/app/parlamente.py b/app/parlamente.py index 1675dbf..82c68ae 100644 --- a/app/parlamente.py +++ b/app/parlamente.py @@ -732,7 +732,9 @@ class PortalaAdapter(ParlamentAdapter): report_html = f"{self.base_url}{self.portala_path}/report.tt.html" async with httpx.AsyncClient( - timeout=30, + # Bumped from 30s for #13 quick-win: chunksize=500 against the + # LSA report.tt.html endpoint occasionally takes 30+ seconds. + timeout=60, follow_redirects=True, headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"}, ) as client: @@ -757,8 +759,12 @@ class PortalaAdapter(ParlamentAdapter): return [] # Step 3: fetch the HTML hit list - # Take a generous chunk so client-side filter still has enough - chunksize = 100 if query else limit + # Take a generous chunk so the client-side title filter + # still has enough material to work with. Quick-win for #13 + # until the eUI sf-Index for real server-side fulltext is + # reverse-engineered: bump the chunk floor for both the + # unfiltered (100) and the query-filtered (500) request. + chunksize = max(limit * 10, 500) if query else max(limit * 2, 100) report_resp = await client.post( report_html, json={"report_id": report_id, "start": 0, "chunksize": chunksize}, @@ -1298,9 +1304,13 @@ ADAPTERS = { # Berlin's ETYPF index uses different value strings — drop the # document_type subtree, fall back to client-side title filter. document_type=None, - # Tighter date window: BE has ~10x more documents than LSA, so a - # narrower window keeps the per-request payload bounded. - date_window_days=180, + # Quick-win for #13: pulled the date window from the original + # 180-day MVP up to 730 days so client-side title-filter searches + # ("Schule" etc.) reach back across more of the WP19 corpus until + # the eUI fulltext-sf is reverse-engineered. The chunksize bump + # in PortalaAdapter.search() means the per-request payload stays + # bounded. + date_window_days=730, pdf_url_prefix="/files/", ), "MV": ParLDokAdapter(