feat(#106): Ingest-CLI fuer NRW-Plenarprotokolle
app/ingest_votes_nrw.py: Pipeline PDF → protokoll_parser_nrw → DB. CLI: python -m app.ingest_votes_nrw --pdf /pfad/MMP18-119.pdf python -m app.ingest_votes_nrw --url https://landtag.nrw.de/.../MMP18-119.pdf python -m app.ingest_votes_nrw --pdf x.pdf --protokoll-id MMP18-119 --bundesland NRW Protokoll-ID wird default aus Datei-Stem abgeleitet (MMP18-119.pdf → MMP18-119), URL-Mode parst sie aus dem letzten Pfadsegment. ingest_pdf() ist die programmatische API (auch fuer Folge-Cron, falls spaeter automatisch Plenarprotokoll-Sammelinges nachgeruestet wird). Statistik-Dict: parsed/written/skipped_no_drucksache/errors. 6 Tests: Roundtrip, skip-bei-fehlender-Drucksache, default + override fuer Protokoll-ID, BL-Override (fuer #126-Folge), idempotenter Re-Ingest.
This commit is contained in:
parent
ae3f48be41
commit
e26607854f
153
app/ingest_votes_nrw.py
Normal file
153
app/ingest_votes_nrw.py
Normal file
@ -0,0 +1,153 @@
|
|||||||
|
"""Ingest-CLI fuer NRW-Plenarprotokolle (#106).
|
||||||
|
|
||||||
|
Pipeline:
|
||||||
|
1. PDF laden (Pfad oder URL)
|
||||||
|
2. protokoll_parser_nrw.parse_protocol() liefert Liste von Abstimmungen
|
||||||
|
3. upsert_plenum_vote() schreibt jede Abstimmung in die DB
|
||||||
|
|
||||||
|
CLI:
|
||||||
|
python -m app.ingest_votes_nrw --pdf /pfad/zu/MMP18-119.pdf
|
||||||
|
python -m app.ingest_votes_nrw --url https://landtag.nrw.de/.../MMP18-119.pdf
|
||||||
|
python -m app.ingest_votes_nrw --pdf MMP18-119.pdf --protokoll-id MMP18-119
|
||||||
|
|
||||||
|
Die Protokoll-ID wird, wenn nicht uebergeben, aus dem Datei-Stem abgeleitet.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
import urllib.request
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from .protokoll_parser_nrw import parse_protocol
|
||||||
|
from .database import upsert_plenum_vote
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _derive_protokoll_id(pdf_path: Path) -> str:
|
||||||
|
"""Ermittle Protokoll-ID aus dem Datei-Stem (z.B. 'MMP18-119.pdf' → 'MMP18-119')."""
|
||||||
|
return pdf_path.stem
|
||||||
|
|
||||||
|
|
||||||
|
def _download_pdf(url: str, dest: Path) -> Path:
|
||||||
|
"""Lade ein PDF von einer URL in einen Pfad. Wirft bei HTTP-Fehlern."""
|
||||||
|
req = urllib.request.Request(
|
||||||
|
url,
|
||||||
|
headers={"User-Agent": "GWOeAntragspruefer/1.0 (+https://gwoe.toppyr.de)"},
|
||||||
|
)
|
||||||
|
with urllib.request.urlopen(req, timeout=60) as resp:
|
||||||
|
dest.write_bytes(resp.read())
|
||||||
|
return dest
|
||||||
|
|
||||||
|
|
||||||
|
async def ingest_pdf(
|
||||||
|
pdf_path: Path,
|
||||||
|
*,
|
||||||
|
bundesland: str = "NRW",
|
||||||
|
protokoll_id: Optional[str] = None,
|
||||||
|
quelle_url: Optional[str] = None,
|
||||||
|
) -> dict:
|
||||||
|
"""Parse das PDF und schreibe alle gefundenen Abstimmungen in die DB.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Statistik-Dict ``{parsed, written, skipped_no_drucksache, errors}``.
|
||||||
|
"""
|
||||||
|
pid = protokoll_id or _derive_protokoll_id(pdf_path)
|
||||||
|
parsed = parse_protocol(str(pdf_path))
|
||||||
|
|
||||||
|
written = 0
|
||||||
|
skipped_no_ds = 0
|
||||||
|
errors: list[str] = []
|
||||||
|
|
||||||
|
for entry in parsed:
|
||||||
|
ds = entry.get("drucksache")
|
||||||
|
if not ds:
|
||||||
|
skipped_no_ds += 1
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
await upsert_plenum_vote(
|
||||||
|
bundesland=bundesland,
|
||||||
|
drucksache=ds,
|
||||||
|
ergebnis=entry["ergebnis"],
|
||||||
|
einstimmig=bool(entry.get("einstimmig", False)),
|
||||||
|
fraktionen_ja=entry.get("votes", {}).get("ja", []),
|
||||||
|
fraktionen_nein=entry.get("votes", {}).get("nein", []),
|
||||||
|
fraktionen_enthaltung=entry.get("votes", {}).get("enthaltung", []),
|
||||||
|
quelle_protokoll=pid,
|
||||||
|
quelle_url=quelle_url,
|
||||||
|
)
|
||||||
|
written += 1
|
||||||
|
except Exception as exc:
|
||||||
|
logger.exception("Upsert fehlgeschlagen fuer %s", ds)
|
||||||
|
errors.append(f"{ds}: {exc}")
|
||||||
|
|
||||||
|
return {
|
||||||
|
"parsed": len(parsed),
|
||||||
|
"written": written,
|
||||||
|
"skipped_no_drucksache": skipped_no_ds,
|
||||||
|
"errors": errors,
|
||||||
|
"protokoll_id": pid,
|
||||||
|
"bundesland": bundesland,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _cli() -> None:
|
||||||
|
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Plenarprotokoll → plenum_vote_results-Tabelle (#106)",
|
||||||
|
)
|
||||||
|
src = parser.add_mutually_exclusive_group(required=True)
|
||||||
|
src.add_argument("--pdf", help="Pfad zu lokalem PDF")
|
||||||
|
src.add_argument("--url", help="HTTP(S)-URL zum PDF")
|
||||||
|
parser.add_argument("--bundesland", default="NRW",
|
||||||
|
help="Bundesland-Code (default: NRW)")
|
||||||
|
parser.add_argument("--protokoll-id",
|
||||||
|
help="Protokoll-ID (default: aus Datei-Stem)")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if args.url:
|
||||||
|
# Download in tmp und nach dem Run wieder loeschen
|
||||||
|
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
|
||||||
|
tmp_path = Path(tmp.name)
|
||||||
|
try:
|
||||||
|
print(f"Lade {args.url} → {tmp_path} …")
|
||||||
|
_download_pdf(args.url, tmp_path)
|
||||||
|
pid = args.protokoll_id or args.url.rsplit("/", 1)[-1].rsplit(".", 1)[0]
|
||||||
|
stats = asyncio.run(ingest_pdf(
|
||||||
|
tmp_path, bundesland=args.bundesland,
|
||||||
|
protokoll_id=pid, quelle_url=args.url,
|
||||||
|
))
|
||||||
|
finally:
|
||||||
|
tmp_path.unlink(missing_ok=True)
|
||||||
|
else:
|
||||||
|
pdf_path = Path(args.pdf)
|
||||||
|
if not pdf_path.exists():
|
||||||
|
print(f"FEHLER: PDF nicht gefunden: {pdf_path}", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
stats = asyncio.run(ingest_pdf(
|
||||||
|
pdf_path, bundesland=args.bundesland,
|
||||||
|
protokoll_id=args.protokoll_id,
|
||||||
|
))
|
||||||
|
|
||||||
|
print()
|
||||||
|
print(f"Protokoll {stats['protokoll_id']} ({stats['bundesland']})")
|
||||||
|
print(f" parsed: {stats['parsed']}")
|
||||||
|
print(f" written: {stats['written']}")
|
||||||
|
if stats["skipped_no_drucksache"]:
|
||||||
|
print(f" ohne DS: {stats['skipped_no_drucksache']}")
|
||||||
|
if stats["errors"]:
|
||||||
|
print(f" errors: {len(stats['errors'])}")
|
||||||
|
for e in stats["errors"][:5]:
|
||||||
|
print(f" {e}")
|
||||||
|
if stats["written"] == 0 and not stats["errors"]:
|
||||||
|
sys.exit(2)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
_cli()
|
||||||
160
tests/test_ingest_votes_nrw.py
Normal file
160
tests/test_ingest_votes_nrw.py
Normal file
@ -0,0 +1,160 @@
|
|||||||
|
"""Tests fuer app/ingest_votes_nrw.py — PDF → plenum_vote_results Pipeline (#106)."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
# Gleiches aiosqlite-Setup-Problem wie in test_database.py — dort fix
|
||||||
|
# importieren, damit hier nichts gestubbed ist.
|
||||||
|
_aio = sys.modules.get("aiosqlite")
|
||||||
|
if _aio is not None and not hasattr(_aio, "connect"):
|
||||||
|
del sys.modules["aiosqlite"]
|
||||||
|
|
||||||
|
import aiosqlite # noqa: E402
|
||||||
|
import importlib # noqa: E402
|
||||||
|
|
||||||
|
if "app.database" in sys.modules:
|
||||||
|
if not hasattr(getattr(sys.modules["app.database"], "aiosqlite", None), "connect"):
|
||||||
|
del sys.modules["app.database"]
|
||||||
|
importlib.import_module("app.database")
|
||||||
|
else:
|
||||||
|
importlib.import_module("app.database")
|
||||||
|
|
||||||
|
|
||||||
|
def run(coro):
|
||||||
|
return asyncio.get_event_loop().run_until_complete(coro)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def db_path(tmp_path, monkeypatch):
|
||||||
|
path = tmp_path / "test.db"
|
||||||
|
from app.config import settings
|
||||||
|
monkeypatch.setattr(settings, "db_path", str(path))
|
||||||
|
return str(path)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def initialized_db(db_path):
|
||||||
|
from app import database
|
||||||
|
run(database.init_db())
|
||||||
|
return db_path
|
||||||
|
|
||||||
|
|
||||||
|
def _fake_parse_result(drucksache: str, ergebnis: str = "angenommen",
|
||||||
|
einstimmig: bool = False,
|
||||||
|
ja: list[str] = None, nein: list[str] = None,
|
||||||
|
enth: list[str] = None) -> dict:
|
||||||
|
return {
|
||||||
|
"drucksache": drucksache,
|
||||||
|
"ergebnis": ergebnis,
|
||||||
|
"einstimmig": einstimmig,
|
||||||
|
"votes": {
|
||||||
|
"ja": ja or [],
|
||||||
|
"nein": nein or [],
|
||||||
|
"enthaltung": enth or [],
|
||||||
|
},
|
||||||
|
"kind": "direct",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class TestIngestPdf:
|
||||||
|
def test_writes_each_parsed_vote(self, initialized_db, tmp_path):
|
||||||
|
from app import ingest_votes_nrw, database
|
||||||
|
fake_pdf = tmp_path / "MMP18-119.pdf"
|
||||||
|
fake_pdf.write_bytes(b"%PDF-1.4 fake")
|
||||||
|
|
||||||
|
parser_results = [
|
||||||
|
_fake_parse_result("18/100", "angenommen", ja=["CDU", "SPD"], nein=["AfD"]),
|
||||||
|
_fake_parse_result("18/200", "abgelehnt", ja=["AfD"], nein=["CDU", "SPD"]),
|
||||||
|
]
|
||||||
|
|
||||||
|
with patch("app.ingest_votes_nrw.parse_protocol", return_value=parser_results):
|
||||||
|
stats = run(ingest_votes_nrw.ingest_pdf(fake_pdf))
|
||||||
|
|
||||||
|
assert stats["parsed"] == 2
|
||||||
|
assert stats["written"] == 2
|
||||||
|
|
||||||
|
votes_100 = run(database.get_plenum_votes("NRW", "18/100"))
|
||||||
|
assert len(votes_100) == 1
|
||||||
|
assert votes_100[0]["fraktionen_ja"] == ["CDU", "SPD"]
|
||||||
|
assert votes_100[0]["quelle_protokoll"] == "MMP18-119"
|
||||||
|
|
||||||
|
def test_skips_entries_without_drucksache(self, initialized_db, tmp_path):
|
||||||
|
"""Anchors ohne aufloesbare Drucksache werden gezaehlt aber nicht
|
||||||
|
geschrieben (sonst muellt der Import die DB voll)."""
|
||||||
|
from app import ingest_votes_nrw
|
||||||
|
fake_pdf = tmp_path / "MMP18-50.pdf"
|
||||||
|
fake_pdf.write_bytes(b"%PDF")
|
||||||
|
|
||||||
|
parser_results = [
|
||||||
|
_fake_parse_result("18/300", "angenommen"),
|
||||||
|
{"drucksache": None, "ergebnis": "angenommen", "votes": {"ja": [], "nein": [], "enthaltung": []}},
|
||||||
|
]
|
||||||
|
with patch("app.ingest_votes_nrw.parse_protocol", return_value=parser_results):
|
||||||
|
stats = run(ingest_votes_nrw.ingest_pdf(fake_pdf))
|
||||||
|
|
||||||
|
assert stats["parsed"] == 2
|
||||||
|
assert stats["written"] == 1
|
||||||
|
assert stats["skipped_no_drucksache"] == 1
|
||||||
|
|
||||||
|
def test_protokoll_id_default_from_stem(self, initialized_db, tmp_path):
|
||||||
|
from app import ingest_votes_nrw, database
|
||||||
|
fake_pdf = tmp_path / "MMP18-77.pdf"
|
||||||
|
fake_pdf.write_bytes(b"%PDF")
|
||||||
|
with patch("app.ingest_votes_nrw.parse_protocol",
|
||||||
|
return_value=[_fake_parse_result("18/500")]):
|
||||||
|
stats = run(ingest_votes_nrw.ingest_pdf(fake_pdf))
|
||||||
|
assert stats["protokoll_id"] == "MMP18-77"
|
||||||
|
votes = run(database.get_plenum_votes("NRW", "18/500"))
|
||||||
|
assert votes[0]["quelle_protokoll"] == "MMP18-77"
|
||||||
|
|
||||||
|
def test_protokoll_id_override(self, initialized_db, tmp_path):
|
||||||
|
from app import ingest_votes_nrw, database
|
||||||
|
fake_pdf = tmp_path / "scan.pdf"
|
||||||
|
fake_pdf.write_bytes(b"%PDF")
|
||||||
|
with patch("app.ingest_votes_nrw.parse_protocol",
|
||||||
|
return_value=[_fake_parse_result("18/600")]):
|
||||||
|
run(ingest_votes_nrw.ingest_pdf(
|
||||||
|
fake_pdf, protokoll_id="MMP18-99", quelle_url="https://example.com/x.pdf",
|
||||||
|
))
|
||||||
|
votes = run(database.get_plenum_votes("NRW", "18/600"))
|
||||||
|
assert votes[0]["quelle_protokoll"] == "MMP18-99"
|
||||||
|
assert votes[0]["quelle_url"] == "https://example.com/x.pdf"
|
||||||
|
|
||||||
|
def test_bundesland_override(self, initialized_db, tmp_path):
|
||||||
|
"""Adapter fuer andere BL koennten denselben Ingest-Helper nutzen."""
|
||||||
|
from app import ingest_votes_nrw, database
|
||||||
|
fake_pdf = tmp_path / "MV-MP1.pdf"
|
||||||
|
fake_pdf.write_bytes(b"%PDF")
|
||||||
|
with patch("app.ingest_votes_nrw.parse_protocol",
|
||||||
|
return_value=[_fake_parse_result("8/100")]):
|
||||||
|
run(ingest_votes_nrw.ingest_pdf(fake_pdf, bundesland="MV"))
|
||||||
|
# Lookup unter dem richtigen BL
|
||||||
|
votes_mv = run(database.get_plenum_votes("MV", "8/100"))
|
||||||
|
assert len(votes_mv) == 1
|
||||||
|
votes_nrw = run(database.get_plenum_votes("NRW", "8/100"))
|
||||||
|
assert votes_nrw == []
|
||||||
|
|
||||||
|
def test_re_ingest_overwrites_same_protokoll(self, initialized_db, tmp_path):
|
||||||
|
"""Erneuter Ingest desselben Protokolls aktualisiert die Eintraege
|
||||||
|
(idempotent), kein Duplikat."""
|
||||||
|
from app import ingest_votes_nrw, database
|
||||||
|
fake_pdf = tmp_path / "MMP18-1.pdf"
|
||||||
|
fake_pdf.write_bytes(b"%PDF")
|
||||||
|
|
||||||
|
with patch("app.ingest_votes_nrw.parse_protocol",
|
||||||
|
return_value=[_fake_parse_result("18/700", "angenommen", ja=["CDU"])]):
|
||||||
|
run(ingest_votes_nrw.ingest_pdf(fake_pdf))
|
||||||
|
# Re-Ingest mit korrigiertem Ergebnis (z.B. Parser-Fix)
|
||||||
|
with patch("app.ingest_votes_nrw.parse_protocol",
|
||||||
|
return_value=[_fake_parse_result("18/700", "abgelehnt", ja=[], nein=["CDU"])]):
|
||||||
|
run(ingest_votes_nrw.ingest_pdf(fake_pdf))
|
||||||
|
|
||||||
|
votes = run(database.get_plenum_votes("NRW", "18/700"))
|
||||||
|
assert len(votes) == 1
|
||||||
|
assert votes[0]["ergebnis"] == "abgelehnt"
|
||||||
|
assert votes[0]["fraktionen_nein"] == ["CDU"]
|
||||||
Loading…
Reference in New Issue
Block a user