"""Tests fuer app/ingest_votes.py — PDF → plenum_vote_results Pipeline (#106 / #126).""" from __future__ import annotations import asyncio import sys from pathlib import Path from unittest.mock import patch import pytest # Gleiches aiosqlite-Setup-Problem wie in test_database.py — dort fix # importieren, damit hier nichts gestubbed ist. _aio = sys.modules.get("aiosqlite") if _aio is not None and not hasattr(_aio, "connect"): del sys.modules["aiosqlite"] import aiosqlite # noqa: E402 import importlib # noqa: E402 if "app.database" in sys.modules: if not hasattr(getattr(sys.modules["app.database"], "aiosqlite", None), "connect"): del sys.modules["app.database"] importlib.import_module("app.database") else: importlib.import_module("app.database") def run(coro): return asyncio.get_event_loop().run_until_complete(coro) @pytest.fixture() def db_path(tmp_path, monkeypatch): path = tmp_path / "test.db" from app.config import settings monkeypatch.setattr(settings, "db_path", str(path)) return str(path) @pytest.fixture() def initialized_db(db_path): from app import database run(database.init_db()) return db_path def _fake_parse_result(drucksache: str, ergebnis: str = "angenommen", einstimmig: bool = False, ja: list[str] = None, nein: list[str] = None, enth: list[str] = None) -> dict: return { "drucksache": drucksache, "ergebnis": ergebnis, "einstimmig": einstimmig, "votes": { "ja": ja or [], "nein": nein or [], "enthaltung": enth or [], }, "kind": "direct", } class TestIngestPdf: def test_writes_each_parsed_vote(self, initialized_db, tmp_path): from app import ingest_votes, database fake_pdf = tmp_path / "MMP18-119.pdf" fake_pdf.write_bytes(b"%PDF-1.4 fake") parser_results = [ _fake_parse_result("18/100", "angenommen", ja=["CDU", "SPD"], nein=["AfD"]), _fake_parse_result("18/200", "abgelehnt", ja=["AfD"], nein=["CDU", "SPD"]), ] with patch("app.ingest_votes.parse_protocol", return_value=parser_results): stats = run(ingest_votes.ingest_pdf(fake_pdf)) assert stats["parsed"] == 2 assert stats["written"] == 2 votes_100 = run(database.get_plenum_votes("NRW", "18/100")) assert len(votes_100) == 1 assert votes_100[0]["fraktionen_ja"] == ["CDU", "SPD"] assert votes_100[0]["quelle_protokoll"] == "MMP18-119" def test_skips_entries_without_drucksache(self, initialized_db, tmp_path): """Anchors ohne aufloesbare Drucksache werden gezaehlt aber nicht geschrieben (sonst muellt der Import die DB voll).""" from app import ingest_votes fake_pdf = tmp_path / "MMP18-50.pdf" fake_pdf.write_bytes(b"%PDF") parser_results = [ _fake_parse_result("18/300", "angenommen"), {"drucksache": None, "ergebnis": "angenommen", "votes": {"ja": [], "nein": [], "enthaltung": []}}, ] with patch("app.ingest_votes.parse_protocol", return_value=parser_results): stats = run(ingest_votes.ingest_pdf(fake_pdf)) assert stats["parsed"] == 2 assert stats["written"] == 1 assert stats["skipped_no_drucksache"] == 1 def test_protokoll_id_default_from_stem(self, initialized_db, tmp_path): from app import ingest_votes, database fake_pdf = tmp_path / "MMP18-77.pdf" fake_pdf.write_bytes(b"%PDF") with patch("app.ingest_votes.parse_protocol", return_value=[_fake_parse_result("18/500")]): stats = run(ingest_votes.ingest_pdf(fake_pdf)) assert stats["protokoll_id"] == "MMP18-77" votes = run(database.get_plenum_votes("NRW", "18/500")) assert votes[0]["quelle_protokoll"] == "MMP18-77" def test_protokoll_id_override(self, initialized_db, tmp_path): from app import ingest_votes, database fake_pdf = tmp_path / "scan.pdf" fake_pdf.write_bytes(b"%PDF") with patch("app.ingest_votes.parse_protocol", return_value=[_fake_parse_result("18/600")]): run(ingest_votes.ingest_pdf( fake_pdf, protokoll_id="MMP18-99", quelle_url="https://example.com/x.pdf", )) votes = run(database.get_plenum_votes("NRW", "18/600")) assert votes[0]["quelle_protokoll"] == "MMP18-99" assert votes[0]["quelle_url"] == "https://example.com/x.pdf" def test_bundesland_override(self, initialized_db, tmp_path): """Adapter fuer andere BL koennten denselben Ingest-Helper nutzen.""" from app import ingest_votes, database fake_pdf = tmp_path / "MV-MP1.pdf" fake_pdf.write_bytes(b"%PDF") with patch("app.ingest_votes.parse_protocol", return_value=[_fake_parse_result("8/100")]): run(ingest_votes.ingest_pdf(fake_pdf, bundesland="MV")) # Lookup unter dem richtigen BL votes_mv = run(database.get_plenum_votes("MV", "8/100")) assert len(votes_mv) == 1 votes_nrw = run(database.get_plenum_votes("NRW", "8/100")) assert votes_nrw == [] def test_re_ingest_overwrites_same_protokoll(self, initialized_db, tmp_path): """Erneuter Ingest desselben Protokolls aktualisiert die Eintraege (idempotent), kein Duplikat.""" from app import ingest_votes, database fake_pdf = tmp_path / "MMP18-1.pdf" fake_pdf.write_bytes(b"%PDF") with patch("app.ingest_votes.parse_protocol", return_value=[_fake_parse_result("18/700", "angenommen", ja=["CDU"])]): run(ingest_votes.ingest_pdf(fake_pdf)) # Re-Ingest mit korrigiertem Ergebnis (z.B. Parser-Fix) with patch("app.ingest_votes.parse_protocol", return_value=[_fake_parse_result("18/700", "abgelehnt", ja=[], nein=["CDU"])]): run(ingest_votes.ingest_pdf(fake_pdf)) votes = run(database.get_plenum_votes("NRW", "18/700")) assert len(votes) == 1 assert votes[0]["ergebnis"] == "abgelehnt" assert votes[0]["fraktionen_nein"] == ["CDU"] def test_db_error_collected_not_raised(self, initialized_db, tmp_path): """Wenn upsert fehlschlaegt, sollte der Fehler in errors-Liste landen, nicht propagieren — der Rest des Protokolls soll trotzdem verarbeitet werden.""" from app import ingest_votes fake_pdf = tmp_path / "MMP18-2.pdf" fake_pdf.write_bytes(b"%PDF") async def _failing_upsert(**kw): raise RuntimeError("simulated DB error") parser_results = [ _fake_parse_result("18/800", "angenommen"), _fake_parse_result("18/801", "abgelehnt"), ] with patch("app.ingest_votes.parse_protocol", return_value=parser_results), \ patch("app.ingest_votes.upsert_plenum_vote", side_effect=_failing_upsert): stats = run(ingest_votes.ingest_pdf(fake_pdf)) assert stats["written"] == 0 assert len(stats["errors"]) == 2 assert "18/800" in stats["errors"][0] assert "simulated DB error" in stats["errors"][0] class TestDownloadPdf: def test_writes_response_bytes(self, tmp_path): from app.ingest_votes import _download_pdf class _FakeResp: def read(self): return b"%PDF downloaded content" def __enter__(self): return self def __exit__(self, *a): return False dest = tmp_path / "out.pdf" with patch("urllib.request.urlopen", return_value=_FakeResp()): _download_pdf("https://example.com/x.pdf", dest) assert dest.read_bytes() == b"%PDF downloaded content" def test_propagates_http_error(self, tmp_path): """HTTP-Fehler beim Download propagieren — der Caller (CLI) soll mit Stack-Trace abbrechen, nicht still weitergehen.""" from app.ingest_votes import _download_pdf def _raise(*a, **kw): raise OSError("Connection refused") with patch("urllib.request.urlopen", side_effect=_raise): with pytest.raises(OSError): _download_pdf("https://example.com/x.pdf", tmp_path / "out.pdf") class TestCli: """Tests fuer die CLI-Wrapper-Funktion _cli — argv-basiert.""" def test_supported_lists_bl(self, capsys): """--supported gibt registrierte BL aus und exitet mit 0.""" from app import ingest_votes with patch.object(ingest_votes.sys, "argv", ["ingest_votes", "--supported"]): with pytest.raises(SystemExit) as exc: ingest_votes._cli() assert exc.value.code == 0 out = capsys.readouterr().out assert "NRW" in out def test_no_args_errors(self, capsys): """Ohne --pdf und --url muss CLI mit klarer Fehlermeldung exiten.""" from app import ingest_votes with patch.object(ingest_votes.sys, "argv", ["ingest_votes"]): with pytest.raises(SystemExit): ingest_votes._cli() def test_pdf_path_missing_errors(self, capsys, tmp_path): """--pdf mit nicht-existentem Pfad exitet 1.""" from app import ingest_votes nonexistent = tmp_path / "missing.pdf" with patch.object(ingest_votes.sys, "argv", ["ingest_votes", "--pdf", str(nonexistent)]): with pytest.raises(SystemExit) as exc: ingest_votes._cli() assert exc.value.code == 1 err = capsys.readouterr().err assert "nicht gefunden" in err def test_pdf_path_calls_ingest(self, tmp_path, capsys): """--pdf mit existentem Pfad ruft ingest_pdf und gibt Statistik aus.""" from app import ingest_votes pdf = tmp_path / "MMP18-X.pdf" pdf.write_bytes(b"%PDF") fake_stats = { "parsed": 3, "written": 2, "skipped_no_drucksache": 1, "errors": [], "protokoll_id": "MMP18-X", "bundesland": "NRW", } with patch("app.ingest_votes.asyncio.run", return_value=fake_stats), \ patch.object(ingest_votes.sys, "argv", ["ingest_votes", "--pdf", str(pdf)]): ingest_votes._cli() out = capsys.readouterr().out assert "MMP18-X" in out assert "parsed: 3" in out assert "written: 2" in out assert "ohne DS: 1" in out def test_url_downloads_then_ingests(self, capsys): """--url path: Download in tmp, dann ingest_pdf.""" from app import ingest_votes fake_stats = { "parsed": 1, "written": 1, "skipped_no_drucksache": 0, "errors": [], "protokoll_id": "MMP18-Y", "bundesland": "NRW", } class _FakeResp: def read(self): return b"%PDF downloaded" def __enter__(self): return self def __exit__(self, *a): return False with patch("app.ingest_votes.asyncio.run", return_value=fake_stats), \ patch("urllib.request.urlopen", return_value=_FakeResp()), \ patch.object(ingest_votes.sys, "argv", ["ingest_votes", "--url", "https://example.com/MMP18-Y.pdf"]): ingest_votes._cli() out = capsys.readouterr().out assert "MMP18-Y" in out def test_zero_results_exits_2(self, tmp_path, capsys): """Wenn weder geschrieben noch Fehler: exit code 2 (= 'no signal').""" from app import ingest_votes pdf = tmp_path / "leer.pdf" pdf.write_bytes(b"%PDF") fake_stats = { "parsed": 0, "written": 0, "skipped_no_drucksache": 0, "errors": [], "protokoll_id": "leer", "bundesland": "NRW", } with patch("app.ingest_votes.asyncio.run", return_value=fake_stats), \ patch.object(ingest_votes.sys, "argv", ["ingest_votes", "--pdf", str(pdf)]): with pytest.raises(SystemExit) as exc: ingest_votes._cli() assert exc.value.code == 2 def test_errors_listed_in_output(self, tmp_path, capsys): """Wenn errors gefuellt sind, erscheint die Errors-Zeile + erste 5.""" from app import ingest_votes pdf = tmp_path / "x.pdf" pdf.write_bytes(b"%PDF") fake_stats = { "parsed": 2, "written": 0, "skipped_no_drucksache": 0, "errors": ["18/1: oops", "18/2: nope"], "protokoll_id": "x", "bundesland": "NRW", } with patch("app.ingest_votes.asyncio.run", return_value=fake_stats), \ patch.object(ingest_votes.sys, "argv", ["ingest_votes", "--pdf", str(pdf)]): ingest_votes._cli() out = capsys.readouterr().out assert "errors: 2" in out assert "18/1: oops" in out assert "18/2: nope" in out