55 lines
1.7 KiB
Bash
55 lines
1.7 KiB
Bash
|
|
#!/bin/bash
# NRW plenary protocol backfill (#106 / #126).
#
# Probes MMP{wp}-1.pdf through MMP{wp}-MAX.pdf and ingests every document
# that answers with HTTP 200. Idempotent (compound PK in plenum_vote_results).
#
# Usage:
#   backfill-nrw-protocols.sh [WP] [CONTAINER]
#     WP        = legislative period, digits only (default: 18)
#     CONTAINER = container name (default: gwoe-antragspruefer-dev)
#
# Example: WP18 on prod
#   ./scripts/backfill-nrw-protocols.sh 18 gwoe-antragspruefer
set -euo pipefail

WP="${1:-18}"
CONTAINER="${2:-gwoe-antragspruefer-dev}"
readonly BASE_URL="https://www.landtag.nrw.de/portal/WWW/dokumentenarchiv/Dokument"
readonly MAX_SITZUNGEN=200 # WP18 has ~150 sessions, plus padding
readonly GAP_TOLERANCE=3   # stop after N consecutive 404 responses

# WP is interpolated into the URL and the protocol id, so reject anything
# that is not a plain number before doing any work.
if [[ ! "$WP" =~ ^[0-9]+$ ]]; then
  printf 'Usage: %s [WP] [CONTAINER] (WP must be numeric, got: %s)\n' \
    "${0##*/}" "$WP" >&2
  exit 2
fi

echo "=== NRW-WP${WP} Backfill in ${CONTAINER} ==="
consecutive_404=0
ok_count=0
fail_count=0

for ((n = 1; n <= MAX_SITZUNGEN; n++)); do
  url="${BASE_URL}/MMP${WP}-${n}.pdf"

  # curl prints the status via -w even when the transfer itself fails
  # (e.g. "000" on connection errors, "200" on a timeout while the body is
  # still downloading). Keep whatever it reported and only tolerate the
  # non-zero exit; appending a second fallback code inside the substitution
  # would produce values like "000000" or "200000".
  http=$(curl -sS -o /dev/null -w '%{http_code}' --max-time 15 "$url") || true
  http="${http:-000}" # curl missing or printed nothing at all

  case "$http" in
    200)
      consecutive_404=0
      pid="MMP${WP}-${n}"
      echo "[$n] ${pid} → ingest …"
      # pipefail (set above) makes a failing `docker exec` fail the whole
      # pipeline, so the else branch fires even though tail/sed succeed.
      if docker exec "$CONTAINER" python -m app.ingest_votes \
        --url "$url" --bundesland NRW --protokoll-id "$pid" 2>&1 \
        | tail -3 | sed 's/^/ /'; then
        ok_count=$((ok_count + 1))
      else
        fail_count=$((fail_count + 1))
        echo " !! ingest fehlgeschlagen"
      fi
      ;;
    404)
      consecutive_404=$((consecutive_404 + 1))
      if ((consecutive_404 >= GAP_TOLERANCE)); then
        echo "[$n] ${GAP_TOLERANCE} aufeinanderfolgende 404 — Backfill beendet."
        break
      fi
      ;;
    *)
      # Any other status is reported and skipped; it neither resets nor
      # increments the 404 streak.
      echo "[$n] HTTP $http (skip)"
      ;;
  esac
done

echo
echo "=== Summary: ${ok_count} ingested, ${fail_count} failed ==="