154 lines
5.0 KiB
Python
154 lines
5.0 KiB
Python
|
"""Ingest CLI for NRW plenary protocols (#106).

Pipeline:
  1. Load the PDF (path or URL)
  2. protokoll_parser_nrw.parse_protocol() returns a list of votes
  3. upsert_plenum_vote() writes each vote to the DB

CLI:
  python -m app.ingest_votes_nrw --pdf /pfad/zu/MMP18-119.pdf
  python -m app.ingest_votes_nrw --url https://landtag.nrw.de/.../MMP18-119.pdf
  python -m app.ingest_votes_nrw --pdf MMP18-119.pdf --protokoll-id MMP18-119

If no protocol ID is passed, it is derived from the file stem.
"""
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import asyncio
|
||
|
|
import logging
|
||
|
|
import sys
|
||
|
|
import tempfile
|
||
|
|
import urllib.request
|
||
|
|
from pathlib import Path
|
||
|
|
from typing import Optional
|
||
|
|
|
||
|
|
from .protokoll_parser_nrw import parse_protocol
|
||
|
|
from .database import upsert_plenum_vote
|
||
|
|
|
||
|
|
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
||
|
|
|
||
|
|
|
||
|
|
def _derive_protokoll_id(pdf_path: Path) -> str:
    """Return the protocol ID implied by the file name.

    The ID is the file name without its final suffix,
    e.g. ``MMP18-119.pdf`` -> ``MMP18-119``.
    """
    protokoll_id = pdf_path.stem
    return protokoll_id
|
||
|
|
|
||
|
|
|
||
|
|
def _download_pdf(url: str, dest: Path) -> Path:
    """Fetch ``url`` and store the response body at ``dest``.

    The request carries a fixed User-Agent header and uses a 60 second
    timeout. Errors raised by ``urllib.request.urlopen`` (e.g. HTTP errors)
    propagate to the caller.

    Returns:
        ``dest``, unchanged, for convenient chaining.
    """
    request_headers = {
        "User-Agent": "GWOeAntragspruefer/1.0 (+https://gwoe.toppyr.de)",
    }
    request = urllib.request.Request(url, headers=request_headers)
    with urllib.request.urlopen(request, timeout=60) as response:
        payload = response.read()
    dest.write_bytes(payload)
    return dest
|
||
|
|
|
||
|
|
|
||
|
|
async def ingest_pdf(
    pdf_path: Path,
    *,
    bundesland: str = "NRW",
    protokoll_id: Optional[str] = None,
    quelle_url: Optional[str] = None,
) -> dict:
    """Parse a protocol PDF and write every vote found in it to the DB.

    Entries without a ``drucksache`` value are counted and skipped. A failing
    upsert is logged with its traceback, recorded in the error list, and
    processing continues with the next entry.

    Args:
        pdf_path: PDF location; passed to ``parse_protocol`` as a string.
        bundesland: State code forwarded to ``upsert_plenum_vote``.
        protokoll_id: Source protocol ID. When empty or ``None`` it is
            derived from ``pdf_path`` via ``_derive_protokoll_id``.
        quelle_url: Optional source URL forwarded to ``upsert_plenum_vote``.

    Returns:
        Statistics dict with the keys ``parsed``, ``written``,
        ``skipped_no_drucksache``, ``errors`` (list of
        ``"<drucksache>: <exception>"`` strings), ``protokoll_id`` and
        ``bundesland``.
    """
    if protokoll_id:
        source_id = protokoll_id
    else:
        source_id = _derive_protokoll_id(pdf_path)
    votes_found = parse_protocol(str(pdf_path))

    stored = 0
    without_ds = 0
    failures: list[str] = []

    for item in votes_found:
        drucksache = item.get("drucksache")
        if drucksache:
            try:
                # Field lookups stay inside the try so that a malformed
                # entry is reported as an error instead of aborting the run.
                ergebnis = item["ergebnis"]
                einstimmig = bool(item.get("einstimmig", False))
                votes = item.get("votes", {})
                await upsert_plenum_vote(
                    bundesland=bundesland,
                    drucksache=drucksache,
                    ergebnis=ergebnis,
                    einstimmig=einstimmig,
                    fraktionen_ja=votes.get("ja", []),
                    fraktionen_nein=votes.get("nein", []),
                    fraktionen_enthaltung=votes.get("enthaltung", []),
                    quelle_protokoll=source_id,
                    quelle_url=quelle_url,
                )
            except Exception as exc:
                logger.exception("Upsert fehlgeschlagen fuer %s", drucksache)
                failures.append(f"{drucksache}: {exc}")
            else:
                stored += 1
        else:
            without_ds += 1

    return {
        "parsed": len(votes_found),
        "written": stored,
        "skipped_no_drucksache": without_ds,
        "errors": failures,
        "protokoll_id": source_id,
        "bundesland": bundesland,
    }
|
||
|
|
|
||
|
|
|
||
|
|
def _cli() -> None:
    """Command-line entry point: ingest one plenary protocol PDF.

    Exactly one of ``--pdf`` (local file) or ``--url`` (downloaded into a
    temporary file that is removed afterwards) must be given. The PDF is
    handed to ``ingest_pdf`` and a short summary is printed.

    Exit status:
        0: at least one vote was written and no upsert failed.
        1: the input could not be read (missing file, failed download) or
           at least one upsert failed.
        2: the run finished without errors but nothing was written.
    """
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")

    parser = argparse.ArgumentParser(
        description="Plenarprotokoll → plenum_vote_results-Tabelle (#106)",
    )
    src = parser.add_mutually_exclusive_group(required=True)
    src.add_argument("--pdf", help="Pfad zu lokalem PDF")
    src.add_argument("--url", help="HTTP(S)-URL zum PDF")
    parser.add_argument("--bundesland", default="NRW",
                        help="Bundesland-Code (default: NRW)")
    parser.add_argument("--protokoll-id",
                        help="Protokoll-ID (default: aus Datei-Stem)")
    args = parser.parse_args()

    # ``is not None`` so that an empty --url is reported as a failed download
    # instead of falling through to the --pdf branch with ``args.pdf`` unset.
    if args.url is not None:
        # Download into a temp file and delete it again after the run.
        # delete=False: only the reserved name is needed here; the file is
        # closed right away and removed in the ``finally`` below.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            tmp_path = Path(tmp.name)
        try:
            print(f"Lade {args.url} → {tmp_path} …")
            try:
                _download_pdf(args.url, tmp_path)
            except (OSError, ValueError) as exc:
                # URLError/HTTPError and timeouts are OSError subclasses;
                # urlopen raises ValueError for a malformed/unknown URL type.
                print(f"FEHLER: Download fehlgeschlagen: {exc}", file=sys.stderr)
                sys.exit(1)
            # Default ID: last URL path segment without its extension.
            pid = args.protokoll_id or args.url.rsplit("/", 1)[-1].rsplit(".", 1)[0]
            stats = asyncio.run(ingest_pdf(
                tmp_path, bundesland=args.bundesland,
                protokoll_id=pid, quelle_url=args.url,
            ))
        finally:
            tmp_path.unlink(missing_ok=True)
    else:
        pdf_path = Path(args.pdf)
        # is_file() also rejects directories, which exists() would let through.
        if not pdf_path.is_file():
            print(f"FEHLER: PDF nicht gefunden: {pdf_path}", file=sys.stderr)
            sys.exit(1)
        stats = asyncio.run(ingest_pdf(
            pdf_path, bundesland=args.bundesland,
            protokoll_id=args.protokoll_id,
        ))

    print()
    print(f"Protokoll {stats['protokoll_id']} ({stats['bundesland']})")
    print(f" parsed: {stats['parsed']}")
    print(f" written: {stats['written']}")
    if stats["skipped_no_drucksache"]:
        print(f" ohne DS: {stats['skipped_no_drucksache']}")
    if stats["errors"]:
        print(f" errors: {len(stats['errors'])}")
        # Only the first few errors are echoed; the full tracebacks were
        # already logged by ingest_pdf.
        for e in stats["errors"][:5]:
            print(f" {e}")
        # Failed upserts must not look like a successful run to the caller.
        sys.exit(1)
    if stats["written"] == 0:
        sys.exit(2)
|
||
|
|
|
||
|
|
|
||
|
|
# Run the CLI only when this module is executed directly
# (e.g. ``python -m app.ingest_votes_nrw``), not when it is imported.
if __name__ == "__main__":
    _cli()
|