From 06918c71eb8920eba1dd6cbf2bf0363cf066e5d3 Mon Sep 17 00:00:00 2001 From: Dotty Dotter Date: Wed, 29 Apr 2026 01:19:58 +0200 Subject: [PATCH] feat(#154): HE-Index-Scrape im Auto-Ingest-Cron URL enthaelt Datum (DD-MM-YYYY), keine Vorhersage moeglich. Daher analog HH: starweb-Index scrapen, neue PDFs einzeln ingesten. Index-URL: https://starweb.hessen.de/starweb/LIS/Pd_Eingang.htm PDF-Pattern: cache/hessen/landtag/Plenum/{wp}/Beschlussprotokoll_PL_{n}_{datum}.pdf Protokoll-ID: PlPr{wp}-{n} (z.B. PlPr21-62) Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/auto-ingest-protocols.sh | 61 ++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/scripts/auto-ingest-protocols.sh b/scripts/auto-ingest-protocols.sh index 7d56e3e..c56bd40 100755 --- a/scripts/auto-ingest-protocols.sh +++ b/scripts/auto-ingest-protocols.sh @@ -98,6 +98,67 @@ for m in matches: print(f" HH: {new_count} neue Protokolle ingestet") EOF +# ─── HE: Index-Page-Scrape (URL enthaelt Datum, nicht vorhersagbar) ─── +# Hessen-Beschlussprotokoll-URL-Pattern hat Datum-Anteil DD-MM-YYYY, +# daher Pattern via starweb-Index extrahieren statt vorherzusagen.
+echo "--- HE WP21 (Index-Scrape) ---" +docker exec "$CONTAINER" python <<'EOF' +import re, sys +import urllib.request +import sqlite3 +import asyncio + +req = urllib.request.Request( + "https://starweb.hessen.de/starweb/LIS/Pd_Eingang.htm", + headers={"User-Agent": "Mozilla/5.0 GWOeAntragspruefer"}, +) +try: + # 302 → portal/browse.tt.html, urllib folgt automatisch + html = urllib.request.urlopen(req, timeout=20).read().decode("utf-8", errors="replace") +except Exception as e: + print(f" Index-Scrape fehlgeschlagen: {e}") + sys.exit(0) + +# href="http://starweb.hessen.de/cache/hessen/landtag/Plenum/{wp}/Beschlussprotokoll_PL_{n}_{datum}.pdf" +pdf_re = re.compile( + r'href="(https?://starweb\.hessen\.de/cache/hessen/landtag/Plenum/(\d+)/' + r'Beschlussprotokoll_PL_(\d+)_(\d{2}-\d{2}-\d{4})\.pdf)"' +) +matches = list(pdf_re.finditer(html)) +print(f" {len(matches)} HE-Beschlussprotokolle in Index gefunden") + +db = sqlite3.connect("/app/data/gwoe-antraege.db") +existing = {row[0] for row in db.execute( + "SELECT quelle_protokoll FROM plenum_vote_results WHERE bundesland='HE'" +)} + +from app.ingest_votes import ingest_pdf +from pathlib import Path +import tempfile + +new_count = 0 +for m in matches: + url, wp, sitzung, datum = m.groups() + pid = f"PlPr{wp}-{sitzung}" + if pid in existing: + continue + print(f" → neu: {pid} ({datum})") + with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp: + tmp_path = Path(tmp.name) + try: + urllib.request.urlretrieve(url, tmp_path) + stats = asyncio.run(ingest_pdf( + tmp_path, bundesland="HE", protokoll_id=pid, quelle_url=url, + )) + print(f" parsed: {stats['parsed']}, written: {stats['written']}") + new_count += 1 + except Exception as e: + print(f" Fehler: {e}") + finally: + tmp_path.unlink(missing_ok=True) +print(f" HE: {new_count} neue Protokolle ingestet") +EOF + for entry in "${PROTO_TARGETS[@]}"; do IFS='|' read -r bl wp prefix pattern <<< "$entry" echo "--- ${bl} WP${wp} (prefix=${prefix}) ---"