From 67092d05b56fd32cc9d35acd8be61a7f5a793111 Mon Sep 17 00:00:00 2001
From: Dotty Dotter <dotty@Mac.wideopen.space>
Date: Wed, 29 Apr 2026 01:01:52 +0200
Subject: [PATCH] feat(#155): HH-Index-Scrape im Auto-Ingest-Cron
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hamburg hat kein vorhersagbares URL-Pattern (Blob-IDs + Hashes pro PDF).
Stattdessen: HH-Branch im Cron scraped die Protokoll-Liste auf
hamburgische-buergerschaft.de und ingestet jedes gefundene PDF, das
noch nicht in plenum_vote_results steht (idempotent).

Cron-Lauf morgens 06:30 zieht damit auch HH-Sitzungen automatisch nach,
sobald die Buergerschaft sie veroeffentlicht (typisch Tag nach der
Sitzung).

Phase-2-BL mit aehnlich nicht vorhersagbaren URLs (z.B. SN, ggf. NI)
koennen diese Index-Scrape-Logik als URL-Discovery-Pattern wiederverwenden.
---
 scripts/auto-ingest-protocols.sh | 62 ++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/scripts/auto-ingest-protocols.sh b/scripts/auto-ingest-protocols.sh
index d672156..745d33f 100755
--- a/scripts/auto-ingest-protocols.sh
+++ b/scripts/auto-ingest-protocols.sh
@@ -33,6 +33,68 @@ PROTO_TARGETS=(
 
 echo "=== auto-ingest-protocols $(date -Iseconds) ==="
 
+# ─── HH: index-page scrape instead of a URL pattern ───────────────────
+# Hamburg has no predictable URL pattern (blob IDs + hashes), so scrape the index
+# page and ingest each new PDF. -i pipes the heredoc in; 'EOF' blocks expansion.
+echo "--- HH WP23 (Index-Scrape) ---"
+docker exec -i "$CONTAINER" python <<'EOF'
+import asyncio, re, sqlite3, sys, tempfile
+import urllib.request
+from pathlib import Path
+
+BASE = "https://www.hamburgische-buergerschaft.de"
+UA = {"User-Agent": "Mozilla/5.0 GWOeAntragspruefer"}  # sent with every request
+
+# Scrape the index page; on failure log and exit 0 so the shell script carries on.
+req = urllib.request.Request(BASE + "/recherche-info/protokolle", headers=UA)
+try:
+    html = urllib.request.urlopen(req, timeout=20).read().decode("utf-8", errors="replace")
+except Exception as e:
+    print(f"  Index-Scrape fehlgeschlagen: {e}")
+    sys.exit(0)
+
+# PDF links: /resource/blob/{ID}/{HASH}/{N}-vorl-beschlussprotokoll-DD-MM-YYYY-data.pdf
+pdf_re = re.compile(
+    r'href="(/resource/blob/(\d+)/([a-f0-9]+)/(\d+)-vorl-beschlussprotokoll-(\d{2}-\d{2}-\d{4})[^"]*\.pdf)"'
+)
+matches = list(pdf_re.finditer(html))
+print(f"  {len(matches)} HH-Protokolle in Index gefunden")
+
+# Protocol IDs already ingested; the connection is only needed for this lookup.
+db = sqlite3.connect("/app/data/gwoe-antraege.db")
+existing = {row[0] for row in db.execute(
+    "SELECT quelle_protokoll FROM plenum_vote_results WHERE bundesland='HH'"
+)}
+db.close()
+
+from app.ingest_votes import ingest_pdf
+new_count = 0
+for m in matches:
+    href, _blob_id, _hash, sitzung, datum = m.groups()
+    pid = f"PlPr23-{sitzung}"
+    if pid in existing:
+        continue
+    existing.add(pid)  # a PDF linked twice on the index is attempted only once
+    url = BASE + href
+    print(f"  → neu: {pid} ({datum})")
+    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
+        tmp_path = Path(tmp.name)
+    try:
+        dl = urllib.request.Request(url, headers=UA)
+        with urllib.request.urlopen(dl, timeout=60) as resp:  # timeout: a stalled download must not hang cron
+            tmp_path.write_bytes(resp.read())
+        stats = asyncio.run(ingest_pdf(
+            tmp_path, bundesland="HH", protokoll_id=pid, quelle_url=url,
+        ))
+        print(f"    parsed: {stats['parsed']}, written: {stats['written']}")
+        new_count += 1
+    except Exception as e:
+        print(f"    Fehler: {e}")
+    finally:
+        tmp_path.unlink(missing_ok=True)
+print(f"  HH: {new_count} neue Protokolle ingestet")
+EOF
+
 for entry in "${PROTO_TARGETS[@]}"; do
   IFS='|' read -r bl wp prefix pattern <<< "$entry"
   echo "--- ${bl} WP${wp} (prefix=${prefix}) ---"