From e26607854f289ac0e709e34060d18bdab1cb9c08 Mon Sep 17 00:00:00 2001 From: Dotty Dotter Date: Tue, 28 Apr 2026 08:03:18 +0200 Subject: [PATCH] feat(#106): Ingest-CLI fuer NRW-Plenarprotokolle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit app/ingest_votes_nrw.py: Pipeline PDF → protokoll_parser_nrw → DB. CLI: python -m app.ingest_votes_nrw --pdf /pfad/MMP18-119.pdf python -m app.ingest_votes_nrw --url https://landtag.nrw.de/.../MMP18-119.pdf python -m app.ingest_votes_nrw --pdf x.pdf --protokoll-id MMP18-119 --bundesland NRW Protokoll-ID wird default aus Datei-Stem abgeleitet (MMP18-119.pdf → MMP18-119), URL-Mode parst sie aus dem letzten Pfadsegment. ingest_pdf() ist die programmatische API (auch fuer Folge-Cron, falls spaeter automatisch Plenarprotokoll-Sammelinges nachgeruestet wird). Statistik-Dict: parsed/written/skipped_no_drucksache/errors. 6 Tests: Roundtrip, skip-bei-fehlender-Drucksache, default + override fuer Protokoll-ID, BL-Override (fuer #126-Folge), idempotenter Re-Ingest. --- app/ingest_votes_nrw.py | 153 +++++++++++++++++++++++++++++++ tests/test_ingest_votes_nrw.py | 160 +++++++++++++++++++++++++++++++++ 2 files changed, 313 insertions(+) create mode 100644 app/ingest_votes_nrw.py create mode 100644 tests/test_ingest_votes_nrw.py diff --git a/app/ingest_votes_nrw.py b/app/ingest_votes_nrw.py new file mode 100644 index 0000000..2adbd05 --- /dev/null +++ b/app/ingest_votes_nrw.py @@ -0,0 +1,153 @@ +"""Ingest-CLI fuer NRW-Plenarprotokolle (#106). + +Pipeline: + 1. PDF laden (Pfad oder URL) + 2. protokoll_parser_nrw.parse_protocol() liefert Liste von Abstimmungen + 3. upsert_plenum_vote() schreibt jede Abstimmung in die DB + +CLI: + python -m app.ingest_votes_nrw --pdf /pfad/zu/MMP18-119.pdf + python -m app.ingest_votes_nrw --url https://landtag.nrw.de/.../MMP18-119.pdf + python -m app.ingest_votes_nrw --pdf MMP18-119.pdf --protokoll-id MMP18-119 + +Die Protokoll-ID wird, wenn nicht uebergeben, aus dem Datei-Stem abgeleitet. +""" +from __future__ import annotations + +import argparse +import asyncio +import logging +import sys +import tempfile +import urllib.request +from pathlib import Path +from typing import Optional + +from .protokoll_parser_nrw import parse_protocol +from .database import upsert_plenum_vote + +logger = logging.getLogger(__name__) + + +def _derive_protokoll_id(pdf_path: Path) -> str: + """Ermittle Protokoll-ID aus dem Datei-Stem (z.B. 'MMP18-119.pdf' → 'MMP18-119').""" + return pdf_path.stem + + +def _download_pdf(url: str, dest: Path) -> Path: + """Lade ein PDF von einer URL in einen Pfad. Wirft bei HTTP-Fehlern.""" + req = urllib.request.Request( + url, + headers={"User-Agent": "GWOeAntragspruefer/1.0 (+https://gwoe.toppyr.de)"}, + ) + with urllib.request.urlopen(req, timeout=60) as resp: + dest.write_bytes(resp.read()) + return dest + + +async def ingest_pdf( + pdf_path: Path, + *, + bundesland: str = "NRW", + protokoll_id: Optional[str] = None, + quelle_url: Optional[str] = None, +) -> dict: + """Parse das PDF und schreibe alle gefundenen Abstimmungen in die DB. + + Returns: + Statistik-Dict ``{parsed, written, skipped_no_drucksache, errors}``. + """ + pid = protokoll_id or _derive_protokoll_id(pdf_path) + parsed = parse_protocol(str(pdf_path)) + + written = 0 + skipped_no_ds = 0 + errors: list[str] = [] + + for entry in parsed: + ds = entry.get("drucksache") + if not ds: + skipped_no_ds += 1 + continue + try: + await upsert_plenum_vote( + bundesland=bundesland, + drucksache=ds, + ergebnis=entry["ergebnis"], + einstimmig=bool(entry.get("einstimmig", False)), + fraktionen_ja=entry.get("votes", {}).get("ja", []), + fraktionen_nein=entry.get("votes", {}).get("nein", []), + fraktionen_enthaltung=entry.get("votes", {}).get("enthaltung", []), + quelle_protokoll=pid, + quelle_url=quelle_url, + ) + written += 1 + except Exception as exc: + logger.exception("Upsert fehlgeschlagen fuer %s", ds) + errors.append(f"{ds}: {exc}") + + return { + "parsed": len(parsed), + "written": written, + "skipped_no_drucksache": skipped_no_ds, + "errors": errors, + "protokoll_id": pid, + "bundesland": bundesland, + } + + +def _cli() -> None: + logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s") + + parser = argparse.ArgumentParser( + description="Plenarprotokoll → plenum_vote_results-Tabelle (#106)", + ) + src = parser.add_mutually_exclusive_group(required=True) + src.add_argument("--pdf", help="Pfad zu lokalem PDF") + src.add_argument("--url", help="HTTP(S)-URL zum PDF") + parser.add_argument("--bundesland", default="NRW", + help="Bundesland-Code (default: NRW)") + parser.add_argument("--protokoll-id", + help="Protokoll-ID (default: aus Datei-Stem)") + args = parser.parse_args() + + if args.url: + # Download in tmp und nach dem Run wieder loeschen + with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp: + tmp_path = Path(tmp.name) + try: + print(f"Lade {args.url} → {tmp_path} …") + _download_pdf(args.url, tmp_path) + pid = args.protokoll_id or args.url.rsplit("/", 1)[-1].rsplit(".", 1)[0] + stats = asyncio.run(ingest_pdf( + tmp_path, bundesland=args.bundesland, + protokoll_id=pid, quelle_url=args.url, + )) + finally: + tmp_path.unlink(missing_ok=True) + else: + pdf_path = Path(args.pdf) + if not pdf_path.exists(): + print(f"FEHLER: PDF nicht gefunden: {pdf_path}", file=sys.stderr) + sys.exit(1) + stats = asyncio.run(ingest_pdf( + pdf_path, bundesland=args.bundesland, + protokoll_id=args.protokoll_id, + )) + + print() + print(f"Protokoll {stats['protokoll_id']} ({stats['bundesland']})") + print(f" parsed: {stats['parsed']}") + print(f" written: {stats['written']}") + if stats["skipped_no_drucksache"]: + print(f" ohne DS: {stats['skipped_no_drucksache']}") + if stats["errors"]: + print(f" errors: {len(stats['errors'])}") + for e in stats["errors"][:5]: + print(f" {e}") + if stats["written"] == 0 and not stats["errors"]: + sys.exit(2) + + +if __name__ == "__main__": + _cli() diff --git a/tests/test_ingest_votes_nrw.py b/tests/test_ingest_votes_nrw.py new file mode 100644 index 0000000..6501bc2 --- /dev/null +++ b/tests/test_ingest_votes_nrw.py @@ -0,0 +1,160 @@ +"""Tests fuer app/ingest_votes_nrw.py — PDF → plenum_vote_results Pipeline (#106).""" +from __future__ import annotations + +import asyncio +import sys +from pathlib import Path +from unittest.mock import patch + +import pytest + +# Gleiches aiosqlite-Setup-Problem wie in test_database.py — dort fix +# importieren, damit hier nichts gestubbed ist. +_aio = sys.modules.get("aiosqlite") +if _aio is not None and not hasattr(_aio, "connect"): + del sys.modules["aiosqlite"] + +import aiosqlite # noqa: E402 +import importlib # noqa: E402 + +if "app.database" in sys.modules: + if not hasattr(getattr(sys.modules["app.database"], "aiosqlite", None), "connect"): + del sys.modules["app.database"] + importlib.import_module("app.database") +else: + importlib.import_module("app.database") + + +def run(coro): + return asyncio.get_event_loop().run_until_complete(coro) + + +@pytest.fixture() +def db_path(tmp_path, monkeypatch): + path = tmp_path / "test.db" + from app.config import settings + monkeypatch.setattr(settings, "db_path", str(path)) + return str(path) + + +@pytest.fixture() +def initialized_db(db_path): + from app import database + run(database.init_db()) + return db_path + + +def _fake_parse_result(drucksache: str, ergebnis: str = "angenommen", + einstimmig: bool = False, + ja: list[str] = None, nein: list[str] = None, + enth: list[str] = None) -> dict: + return { + "drucksache": drucksache, + "ergebnis": ergebnis, + "einstimmig": einstimmig, + "votes": { + "ja": ja or [], + "nein": nein or [], + "enthaltung": enth or [], + }, + "kind": "direct", + } + + +class TestIngestPdf: + def test_writes_each_parsed_vote(self, initialized_db, tmp_path): + from app import ingest_votes_nrw, database + fake_pdf = tmp_path / "MMP18-119.pdf" + fake_pdf.write_bytes(b"%PDF-1.4 fake") + + parser_results = [ + _fake_parse_result("18/100", "angenommen", ja=["CDU", "SPD"], nein=["AfD"]), + _fake_parse_result("18/200", "abgelehnt", ja=["AfD"], nein=["CDU", "SPD"]), + ] + + with patch("app.ingest_votes_nrw.parse_protocol", return_value=parser_results): + stats = run(ingest_votes_nrw.ingest_pdf(fake_pdf)) + + assert stats["parsed"] == 2 + assert stats["written"] == 2 + + votes_100 = run(database.get_plenum_votes("NRW", "18/100")) + assert len(votes_100) == 1 + assert votes_100[0]["fraktionen_ja"] == ["CDU", "SPD"] + assert votes_100[0]["quelle_protokoll"] == "MMP18-119" + + def test_skips_entries_without_drucksache(self, initialized_db, tmp_path): + """Anchors ohne aufloesbare Drucksache werden gezaehlt aber nicht + geschrieben (sonst muellt der Import die DB voll).""" + from app import ingest_votes_nrw + fake_pdf = tmp_path / "MMP18-50.pdf" + fake_pdf.write_bytes(b"%PDF") + + parser_results = [ + _fake_parse_result("18/300", "angenommen"), + {"drucksache": None, "ergebnis": "angenommen", "votes": {"ja": [], "nein": [], "enthaltung": []}}, + ] + with patch("app.ingest_votes_nrw.parse_protocol", return_value=parser_results): + stats = run(ingest_votes_nrw.ingest_pdf(fake_pdf)) + + assert stats["parsed"] == 2 + assert stats["written"] == 1 + assert stats["skipped_no_drucksache"] == 1 + + def test_protokoll_id_default_from_stem(self, initialized_db, tmp_path): + from app import ingest_votes_nrw, database + fake_pdf = tmp_path / "MMP18-77.pdf" + fake_pdf.write_bytes(b"%PDF") + with patch("app.ingest_votes_nrw.parse_protocol", + return_value=[_fake_parse_result("18/500")]): + stats = run(ingest_votes_nrw.ingest_pdf(fake_pdf)) + assert stats["protokoll_id"] == "MMP18-77" + votes = run(database.get_plenum_votes("NRW", "18/500")) + assert votes[0]["quelle_protokoll"] == "MMP18-77" + + def test_protokoll_id_override(self, initialized_db, tmp_path): + from app import ingest_votes_nrw, database + fake_pdf = tmp_path / "scan.pdf" + fake_pdf.write_bytes(b"%PDF") + with patch("app.ingest_votes_nrw.parse_protocol", + return_value=[_fake_parse_result("18/600")]): + run(ingest_votes_nrw.ingest_pdf( + fake_pdf, protokoll_id="MMP18-99", quelle_url="https://example.com/x.pdf", + )) + votes = run(database.get_plenum_votes("NRW", "18/600")) + assert votes[0]["quelle_protokoll"] == "MMP18-99" + assert votes[0]["quelle_url"] == "https://example.com/x.pdf" + + def test_bundesland_override(self, initialized_db, tmp_path): + """Adapter fuer andere BL koennten denselben Ingest-Helper nutzen.""" + from app import ingest_votes_nrw, database + fake_pdf = tmp_path / "MV-MP1.pdf" + fake_pdf.write_bytes(b"%PDF") + with patch("app.ingest_votes_nrw.parse_protocol", + return_value=[_fake_parse_result("8/100")]): + run(ingest_votes_nrw.ingest_pdf(fake_pdf, bundesland="MV")) + # Lookup unter dem richtigen BL + votes_mv = run(database.get_plenum_votes("MV", "8/100")) + assert len(votes_mv) == 1 + votes_nrw = run(database.get_plenum_votes("NRW", "8/100")) + assert votes_nrw == [] + + def test_re_ingest_overwrites_same_protokoll(self, initialized_db, tmp_path): + """Erneuter Ingest desselben Protokolls aktualisiert die Eintraege + (idempotent), kein Duplikat.""" + from app import ingest_votes_nrw, database + fake_pdf = tmp_path / "MMP18-1.pdf" + fake_pdf.write_bytes(b"%PDF") + + with patch("app.ingest_votes_nrw.parse_protocol", + return_value=[_fake_parse_result("18/700", "angenommen", ja=["CDU"])]): + run(ingest_votes_nrw.ingest_pdf(fake_pdf)) + # Re-Ingest mit korrigiertem Ergebnis (z.B. Parser-Fix) + with patch("app.ingest_votes_nrw.parse_protocol", + return_value=[_fake_parse_result("18/700", "abgelehnt", ja=[], nein=["CDU"])]): + run(ingest_votes_nrw.ingest_pdf(fake_pdf)) + + votes = run(database.get_plenum_votes("NRW", "18/700")) + assert len(votes) == 1 + assert votes[0]["ergebnis"] == "abgelehnt" + assert votes[0]["fraktionen_nein"] == ["CDU"]