feat(#154): HE-Index-Scrape im Auto-Ingest-Cron
URL enthaelt Datum (DD-MM-YYYY), keine Vorhersage moeglich. Daher analog HH:
starweb-Index scrapen, neue PDFs einzeln ingesten.

Index-URL: https://starweb.hessen.de/starweb/LIS/Pd_Eingang.htm
PDF-Pattern: cache/hessen/landtag/Plenum/{wp}/Beschlussprotokoll_PL_{n}_{datum}.pdf
Protokoll-ID: PlPr{wp}-{n} (z.B. PlPr21-62)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
8125dbb731
commit
06918c71eb
@ -98,6 +98,67 @@ for m in matches:
|
||||
print(f" HH: {new_count} neue Protokolle ingestet")
|
||||
EOF
|
||||
|
||||
# ─── HE: index page scrape (URL contains a date, not predictable) ───
# The Hessen Beschlussprotokoll URL pattern has a DD-MM-YYYY date component,
# so the PDF URLs are extracted from the starweb index page instead of being
# predicted. Every protocol not yet present in the database is downloaded and
# ingested individually.
echo "--- HE WP21 (Index-Scrape) ---"
# -i keeps stdin open so the heredoc below actually reaches the interpreter
# inside the container; without it `docker exec` does not forward stdin.
docker exec -i "$CONTAINER" python <<'EOF'
import asyncio
import re
import shutil
import sqlite3
import sys
import tempfile
import urllib.request
from contextlib import closing
from pathlib import Path

# Same User-Agent for the index page and the PDF downloads.
HEADERS = {"User-Agent": "Mozilla/5.0 GWOeAntragspruefer"}
INDEX_TIMEOUT = 20  # seconds
PDF_TIMEOUT = 60    # seconds; bounds a stalled download so the cron run cannot hang

req = urllib.request.Request(
    "https://starweb.hessen.de/starweb/LIS/Pd_Eingang.htm",
    headers=HEADERS,
)
try:
    # 302 → portal/browse.tt.html; urllib follows the redirect automatically.
    with urllib.request.urlopen(req, timeout=INDEX_TIMEOUT) as resp:
        html = resp.read().decode("utf-8", errors="replace")
except Exception as e:
    # Best effort: report the failure but exit 0.
    # NOTE(review): presumably so a failed scrape does not abort the
    # surrounding cron script — confirm.
    print(f" Index-Scrape fehlgeschlagen: {e}")
    sys.exit(0)

# href="http://starweb.hessen.de/cache/hessen/landtag/Plenum/{wp}/Beschlussprotokoll_PL_{n}_{datum}.pdf"
# Groups: full URL, legislative period (wp), session number, date.
pdf_re = re.compile(
    r'href="(https?://starweb\.hessen\.de/cache/hessen/landtag/Plenum/(\d+)/'
    r'Beschlussprotokoll_PL_(\d+)_(\d{2}-\d{2}-\d{4})\.pdf)"'
)
matches = list(pdf_re.finditer(html))
print(f" {len(matches)} HE-Beschlussprotokolle in Index gefunden")

# Protocol IDs already stored for HE. The connection is only needed for this
# one read, so it is closed before the ingest loop starts.
with closing(sqlite3.connect("/app/data/gwoe-antraege.db")) as db:
    existing = {row[0] for row in db.execute(
        "SELECT quelle_protokoll FROM plenum_vote_results WHERE bundesland='HE'"
    )}

from app.ingest_votes import ingest_pdf

new_count = 0
for m in matches:
    url, wp, sitzung, datum = m.groups()
    pid = f"PlPr{wp}-{sitzung}"
    if pid in existing:
        continue
    # Mark as seen immediately so a PDF that is linked more than once in the
    # index is attempted only once per run. A failed attempt is retried on
    # the next run because nothing was written to the database.
    existing.add(pid)
    print(f" → neu: {pid} ({datum})")
    # Reserve a temp file name; the file is removed in the finally clause.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp_path = Path(tmp.name)
    try:
        pdf_req = urllib.request.Request(url, headers=HEADERS)
        with urllib.request.urlopen(pdf_req, timeout=PDF_TIMEOUT) as resp, \
                open(tmp_path, "wb") as out:
            shutil.copyfileobj(resp, out)
        stats = asyncio.run(ingest_pdf(
            tmp_path, bundesland="HE", protokoll_id=pid, quelle_url=url,
        ))
        print(f" parsed: {stats['parsed']}, written: {stats['written']}")
        new_count += 1
    except Exception as e:
        # One broken protocol must not stop the remaining ones.
        print(f" Fehler: {e}")
    finally:
        tmp_path.unlink(missing_ok=True)
print(f" HE: {new_count} neue Protokolle ingestet")
EOF
|
||||
|
||||
for entry in "${PROTO_TARGETS[@]}"; do
|
||||
IFS='|' read -r bl wp prefix pattern <<< "$entry"
|
||||
echo "--- ${bl} WP${wp} (prefix=${prefix}) ---"
|
||||
|
||||
Loading…
Reference in New Issue
Block a user