feat(#155): HH-Index-Scrape im Auto-Ingest-Cron
Hamburg hat kein vorhersagbares URL-Pattern (Blob-IDs + Hashes pro PDF). Stattdessen: HH-Branch im Cron scrapt die Protokoll-Liste auf hamburgische-buergerschaft.de und ingestet jedes gefundene PDF, das noch nicht in plenum_vote_results steht (idempotent). Der Cron-Lauf morgens um 06:30 zieht damit auch HH-Sitzungen automatisch nach, sobald die Buergerschaft sie veroeffentlicht (typisch am Tag nach der Sitzung). Phase-2-Bundeslaender mit aehnlich nicht-vorhersagbaren URLs (z.B. SN, ggf. NI) koennen diese Index-Scrape-Logik zur URL-Discovery wiederverwenden.
This commit is contained in:
parent
5f97ae9fc3
commit
67092d05b5
@ -33,6 +33,68 @@ PROTO_TARGETS=(
|
|||||||
|
|
||||||
echo "=== auto-ingest-protocols $(date -Iseconds) ==="
|
echo "=== auto-ingest-protocols $(date -Iseconds) ==="
|
||||||
|
|
||||||
|
# ─── HH: Index-Page-Scrape statt URL-Pattern ──────────────────────────
|
||||||
|
# Hamburg hat keine vorhersagbare URL-Pattern (Blob-IDs + Hashes).
|
||||||
|
# Stattdessen: Index-Seite scrapen, jedes gefundene PDF einzeln ingesten.
|
||||||
|
echo "--- HH WP23 (Index-Scrape) ---"
|
||||||
|
docker exec "$CONTAINER" python <<EOF
|
||||||
|
import re, sys
|
||||||
|
import urllib.request
|
||||||
|
import sqlite3
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
# Index-Seite scrapen
|
||||||
|
req = urllib.request.Request(
|
||||||
|
"https://www.hamburgische-buergerschaft.de/recherche-info/protokolle",
|
||||||
|
headers={"User-Agent": "Mozilla/5.0 GWOeAntragspruefer"},
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
html = urllib.request.urlopen(req, timeout=20).read().decode("utf-8", errors="replace")
|
||||||
|
except Exception as e:
|
||||||
|
print(f" Index-Scrape fehlgeschlagen: {e}")
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
# PDFs extrahieren: /resource/blob/{ID}/{HASH}/{N}-vorl-beschlussprotokoll-DD-MM-YYYY-data.pdf
|
||||||
|
pdf_re = re.compile(
|
||||||
|
r'href="(/resource/blob/(\d+)/([a-f0-9]+)/(\d+)-vorl-beschlussprotokoll-(\d{2}-\d{2}-\d{4})[^"]*\.pdf)"'
|
||||||
|
)
|
||||||
|
matches = list(pdf_re.finditer(html))
|
||||||
|
print(f" {len(matches)} HH-Protokolle in Index gefunden")
|
||||||
|
|
||||||
|
# Bereits ingestete Protokolle holen
|
||||||
|
db = sqlite3.connect("/app/data/gwoe-antraege.db")
|
||||||
|
existing = {row[0] for row in db.execute(
|
||||||
|
"SELECT quelle_protokoll FROM plenum_vote_results WHERE bundesland='HH'"
|
||||||
|
)}
|
||||||
|
|
||||||
|
from app.ingest_votes import ingest_pdf
|
||||||
|
from pathlib import Path
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
new_count = 0
|
||||||
|
for m in matches:
|
||||||
|
href, blob_id, h, sitzung, datum = m.groups()
|
||||||
|
pid = f"PlPr23-{sitzung}"
|
||||||
|
if pid in existing:
|
||||||
|
continue
|
||||||
|
url = "https://www.hamburgische-buergerschaft.de" + href
|
||||||
|
print(f" → neu: {pid} ({datum})")
|
||||||
|
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
|
||||||
|
tmp_path = Path(tmp.name)
|
||||||
|
try:
|
||||||
|
urllib.request.urlretrieve(url, tmp_path)
|
||||||
|
stats = asyncio.run(ingest_pdf(
|
||||||
|
tmp_path, bundesland="HH", protokoll_id=pid, quelle_url=url,
|
||||||
|
))
|
||||||
|
print(f" parsed: {stats['parsed']}, written: {stats['written']}")
|
||||||
|
new_count += 1
|
||||||
|
except Exception as e:
|
||||||
|
print(f" Fehler: {e}")
|
||||||
|
finally:
|
||||||
|
tmp_path.unlink(missing_ok=True)
|
||||||
|
print(f" HH: {new_count} neue Protokolle ingestet")
|
||||||
|
EOF
|
||||||
|
|
||||||
for entry in "${PROTO_TARGETS[@]}"; do
|
for entry in "${PROTO_TARGETS[@]}"; do
|
||||||
IFS='|' read -r bl wp prefix pattern <<< "$entry"
|
IFS='|' read -r bl wp prefix pattern <<< "$entry"
|
||||||
echo "--- ${bl} WP${wp} (prefix=${prefix}) ---"
|
echo "--- ${bl} WP${wp} (prefix=${prefix}) ---"
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user