ops(#106): Backfill- + Auto-Ingest-Skripte fuer Plenum-Votes
scripts/backfill-nrw-protocols.sh:
Probiert MMP{wp}-1.pdf bis MMP{wp}-200.pdf durch und ingestet alle, die mit HTTP 200 antworten.
Bei 3 aufeinanderfolgenden 404 Abbruch.
Usage: backfill-nrw-protocols.sh [WP=18] [CONTAINER=gwoe-antragspruefer-dev]
Idempotent ueber plenum_vote_results-Compound-PK.
scripts/auto-ingest-protocols.sh:
BL-uebergreifend, Cron-tauglich. Liest fuer jeden konfigurierten
BL/WP das letzte ingestete Protokoll aus der DB, probiert die
naechste Sitzungsnummer, ingestet bis zur naechsten Luecke.
Aktuell konfiguriert: NRW WP18, NRW WP17 (Pattern leicht erweiterbar).
Beide rein deterministisch — keine LLM-Calls, keine Embedding-Calls,
keine Kosten. Reines PDF-Download + Regex-Parsing + SQLite-Insert.
This commit is contained in:
parent
1769c9f349
commit
05b6b45e1b
65
scripts/auto-ingest-protocols.sh
Executable file
65
scripts/auto-ingest-protocols.sh
Executable file
@ -0,0 +1,65 @@
|
|||||||
|
#!/bin/bash
# Cross-state (BL) auto-ingest for plenary protocols (#106 / #126 phase 3).
#
# For every registered BL/WP target: read the highest protocol number already
# present in the DB, probe the following numbers, ingest each one that answers
# HTTP 200, and stop after GAP_TOLERANCE consecutive 404s.
# Idempotent (compound PK in plenum_vote_results); no state besides the DB.
#
# Meant to be called by cron every morning, with output going to
# /var/log/gwoe-ingest-protocols.log.
#
# Usage:
#   auto-ingest-protocols.sh [CONTAINER]
set -euo pipefail

CONTAINER="${1:-gwoe-antragspruefer}"
GAP_TOLERANCE=3   # 3 consecutive 404s -> done for this BL/WP
MAX_LOOKAHEAD=50  # probe at most this many numbers past the last ingested one
DB_PATH="/app/data/gwoe-antraege.db"

# Per BL: URL pattern + list of legislative periods (WP).
# Listed here: the current WP plus its predecessor, because old sessions are
# occasionally digitised after the fact.
#
# Format: BL_CODE|WAHLPERIODE|URL_PATTERN_WITH_{n}_PLACEHOLDER
PROTO_TARGETS=(
  "NRW|18|https://www.landtag.nrw.de/portal/WWW/dokumentenarchiv/Dokument/MMP18-{n}.pdf"
  "NRW|17|https://www.landtag.nrw.de/portal/WWW/dokumentenarchiv/Dokument/MMP17-{n}.pdf"
)

echo "=== auto-ingest-protocols $(date -Iseconds) ==="

for entry in "${PROTO_TARGETS[@]}"; do
  IFS='|' read -r bl wp pattern <<< "$entry"
  echo "--- ${bl} WP${wp} ---"

  # Highest session number ingested so far for this BL/WP prefix.
  # "MMP<wp>-" is the NRW naming convention; other BL need their own prefix.
  prefix="MMP${wp}-"
  sql="SELECT COALESCE(MAX(CAST(SUBSTR(quelle_protokoll, ${#prefix} + 1) AS INTEGER)), 0) \
FROM plenum_vote_results \
WHERE bundesland='${bl}' AND quelle_protokoll LIKE '${prefix}%'"

  # Fall back to 0 when the query fails or returns something non-numeric,
  # but say so instead of hiding the error: the value feeds arithmetic below.
  last_n=$(docker exec "$CONTAINER" sqlite3 "$DB_PATH" "$sql") || last_n=""
  if [[ ! "$last_n" =~ ^[0-9]+$ ]]; then
    echo " !! DB-Abfrage fuer ${prefix} fehlgeschlagen, starte bei 0"
    last_n=0
  fi

  start_n=$((last_n + 1))
  echo "Letztes ingestes ${prefix}: ${last_n}, probiere ab ${start_n}"

  consecutive_404=0
  for ((n = start_n; n <= last_n + MAX_LOOKAHEAD; n++)); do
    url="${pattern//\{n\}/$n}"

    # curl prints the -w write-out even when the transfer fails, so appending
    # a fallback inside the substitution would produce "000000" (or "200000"
    # on a body timeout). Replace the value instead of appending to it.
    http=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 15 "$url") || http="000"

    case "$http" in
      200)
        consecutive_404=0
        pid="${prefix}${n}"
        echo " → ingest ${pid}"
        # pipefail (set above) makes the pipeline fail when docker exec fails.
        if ! docker exec "$CONTAINER" python -m app.ingest_votes \
            --url "$url" --bundesland "$bl" --protokoll-id "$pid" 2>&1 \
            | tail -3 | sed 's/^/ /'; then
          echo " !! ingest fehlgeschlagen"
          # The resume point is MAX(ingested number). Continuing would let a
          # later success move it past this session, which would then never
          # be retried. Stop here so the next run starts at the same number.
          echo " !! Abbruch fuer ${bl} WP${wp} bei ${pid}, naechster Lauf versucht es erneut"
          break
        fi
        ;;
      404)
        consecutive_404=$((consecutive_404 + 1))
        if ((consecutive_404 >= GAP_TOLERANCE)); then
          break
        fi
        ;;
      *)
        # Timeout, 5xx, redirect, ...: neither proof of a gap nor a document.
        # Same reasoning as for a failed ingest: do not skip past it.
        echo " !! HTTP ${http} fuer ${url} — Abbruch fuer ${bl} WP${wp}, naechster Lauf versucht es erneut"
        break
        ;;
    esac
  done
done

echo "=== auto-ingest done $(date -Iseconds) ==="
|
||||||
54
scripts/backfill-nrw-protocols.sh
Executable file
54
scripts/backfill-nrw-protocols.sh
Executable file
@ -0,0 +1,54 @@
|
|||||||
|
#!/bin/bash
# NRW plenary protocol backfill (#106 / #126).
#
# Probes MMP{wp}-1.pdf through MMP{wp}-MAX.pdf and ingests every document
# that answers HTTP 200. Idempotent (compound PK in plenum_vote_results).
#
# Usage:
#   backfill-nrw-protocols.sh [WP] [CONTAINER]
#     WP        = legislative period (default: 18)
#     CONTAINER = container name (default: gwoe-antragspruefer-dev)
#
# Example: WP18 on prod
#   ./scripts/backfill-nrw-protocols.sh 18 gwoe-antragspruefer
set -euo pipefail

WP="${1:-18}"
CONTAINER="${2:-gwoe-antragspruefer-dev}"
BASE_URL="https://www.landtag.nrw.de/portal/WWW/dokumentenarchiv/Dokument"
MAX_SITZUNGEN=200  # WP18 has ~150 sessions, plus padding
GAP_TOLERANCE=3    # stop after N consecutive 404s

# WP ends up in the URL and in the protocol id; reject anything but digits.
if [[ ! "$WP" =~ ^[0-9]+$ ]]; then
  echo "Usage: ${0##*/} [WP] [CONTAINER] — WP muss eine Zahl sein (war: '${WP}')" >&2
  exit 2
fi

echo "=== NRW-WP${WP} Backfill in ${CONTAINER} ==="
consecutive_404=0
ok_count=0
fail_count=0

for ((n = 1; n <= MAX_SITZUNGEN; n++)); do
  url="${BASE_URL}/MMP${WP}-${n}.pdf"

  # curl prints the -w write-out even when the transfer fails, so appending a
  # fallback inside the substitution would produce "000000" (or "200000" on a
  # body timeout). Replace the value instead of appending to it.
  http=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 15 "$url") || http="000"

  case "$http" in
    200)
      consecutive_404=0
      pid="MMP${WP}-${n}"
      echo "[$n] ${pid} → ingest …"
      # pipefail (set above) makes the pipeline fail when docker exec fails.
      if docker exec "$CONTAINER" python -m app.ingest_votes \
          --url "$url" --bundesland NRW --protokoll-id "$pid" 2>&1 \
          | tail -3 | sed 's/^/ /'; then
        ok_count=$((ok_count + 1))
      else
        fail_count=$((fail_count + 1))
        echo " !! ingest fehlgeschlagen"
      fi
      ;;
    404)
      consecutive_404=$((consecutive_404 + 1))
      if ((consecutive_404 >= GAP_TOLERANCE)); then
        echo "[$n] ${GAP_TOLERANCE} aufeinanderfolgende 404 — Backfill beendet."
        break
      fi
      ;;
    *)
      # Timeout, 5xx, redirect, ...: log and move on. The backfill walks the
      # whole range on every run, so a re-run picks this number up again.
      echo "[$n] HTTP $http (skip)"
      ;;
  esac
done

echo
echo "=== Summary: ${ok_count} ingested, ${fail_count} failed ==="
|
||||||
Loading…
Reference in New Issue
Block a user