feat(#154): HE-Index-Scrape im Auto-Ingest-Cron

URL enthaelt Datum (DD-MM-YYYY), keine Vorhersage moeglich. Daher analog HH: starweb-Index scrapen, neue PDFs einzeln ingesten.

Index-URL: https://starweb.hessen.de/starweb/LIS/Pd_Eingang.htm
PDF-Pattern: cache/hessen/landtag/Plenum/{wp}/Beschlussprotokoll_PL_{n}_{datum}.pdf
Protokoll-ID: PlPr{wp}-{n} (z.B. PlPr21-62)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 01:19:58 +02:00 · 2026-04-29 01:19:58 +02:00 · 06918c71eb
commit 06918c71eb
parent 8125dbb731
1 changed files with 61 additions and 0 deletions
--- a/scripts/auto-ingest-protocols.sh
+++ b/scripts/auto-ingest-protocols.sh
@@ -98,6 +98,67 @@ for m in matches:
 print(f"  HH: {new_count} neue Protokolle ingestet")
 EOF

+# ─── HE: index-page scrape (URL contains the date, not predictable) ───
+# Hessen's Beschlussprotokoll URL embeds DD-MM-YYYY, so links are read from the
+# starweb index. -i is required: without it the heredoc never reaches python.
+echo "--- HE WP21 (Index-Scrape) ---"
+docker exec -i "$CONTAINER" python <<'EOF'
+import asyncio, re, sqlite3, sys, tempfile, urllib.request
+from pathlib import Path
+
+def fetch(url, timeout):
+    """Return the body of url as bytes, sending the scraper's User-Agent."""
+    headers = {"User-Agent": "Mozilla/5.0 GWOeAntragspruefer"}
+    req = urllib.request.Request(url, headers=headers)
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        return resp.read()
+
+try:
+    # 302 → portal/browse.tt.html; urllib follows the redirect automatically
+    index_page = fetch("https://starweb.hessen.de/starweb/LIS/Pd_Eingang.htm", 20)
+except Exception as e:
+    print(f"  Index-Scrape fehlgeschlagen: {e}")
+    sys.exit(0)  # best effort: exit 0 so the rest of the cron script still runs
+
+# Groups: full URL, wp, session number, date (DD-MM-YYYY).
+pdf_re = re.compile(
+    r'href="(https?://starweb\.hessen\.de/cache/hessen/landtag/Plenum/(\d+)/'
+    r'Beschlussprotokoll_PL_(\d+)_(\d{2}-\d{2}-\d{4})\.pdf)"'
+)
+matches = list(pdf_re.finditer(index_page.decode("utf-8", errors="replace")))
+print(f"  {len(matches)} HE-Beschlussprotokolle in Index gefunden")
+
+db = sqlite3.connect("/app/data/gwoe-antraege.db")
+existing = {row[0] for row in db.execute(
+    "SELECT quelle_protokoll FROM plenum_vote_results WHERE bundesland='HE'"
+)}
+db.close()  # only needed for the lookup of already-ingested protocol IDs
+
+from app.ingest_votes import ingest_pdf
+new_count = 0
+for m in matches:
+    url, wp, sitzung, datum = m.groups()
+    pid = f"PlPr{wp}-{sitzung}"
+    if pid in existing:
+        continue
+    existing.add(pid)  # a protocol linked twice in the index is tried once per run
+    print(f"  → neu: {pid} ({datum})")
+    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
+        tmp_path = Path(tmp.name)
+    try:
+        tmp_path.write_bytes(fetch(url, 60))  # bounded, unlike urlretrieve()
+        stats = asyncio.run(ingest_pdf(
+            tmp_path, bundesland="HE", protokoll_id=pid, quelle_url=url,
+        ))
+        print(f"    parsed: {stats['parsed']}, written: {stats['written']}")
+        new_count += 1
+    except Exception as e:
+        print(f"    Fehler: {e}")
+    finally:
+        tmp_path.unlink(missing_ok=True)
+print(f"  HE: {new_count} neue Protokolle ingestet")
+EOF
+
 for entry in "${PROTO_TARGETS[@]}"; do
  IFS='|' read -r bl wp prefix pattern <<< "$entry"
  echo "--- ${bl} WP${wp} (prefix=${prefix}) ---"