"""Tests for app/ingest_votes.py — the PDF → plenum_vote_results pipeline (#106 / #126)."""

from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
import asyncio
|
|
|
|
|
import sys
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
from unittest.mock import patch
|
|
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
|
|
|
|
# Same aiosqlite setup problem as in test_database.py: make sure the real
# package is imported so that nothing in this module runs against a stub.
# An entry in ``sys.modules`` without a ``connect`` attribute is treated as
# a stub and evicted before the import below.
_aio = sys.modules.get("aiosqlite")
_aio_is_stub = _aio is not None and not hasattr(_aio, "connect")
if _aio_is_stub:
    del sys.modules["aiosqlite"]

import aiosqlite  # noqa: E402
import importlib  # noqa: E402

# An already-imported ``app.database`` whose ``aiosqlite`` attribute lacks
# ``connect`` is evicted too, so the import below rebuilds it from scratch.
# When the module is cached and healthy, the import is a no-op.
_db_is_stale = "app.database" in sys.modules and not hasattr(
    getattr(sys.modules["app.database"], "aiosqlite", None), "connect"
)
if _db_is_stale:
    del sys.modules["app.database"]
importlib.import_module("app.database")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def run(coro):
    """Run ``coro`` to completion on a private event loop and return its result.

    The loop is created on first use and cached on the function object, so
    successive calls share one loop. ``asyncio.get_event_loop()`` is avoided
    on purpose: outside a running loop it is deprecated, and it raises
    ``RuntimeError`` once other code has called ``asyncio.run()`` (which
    clears the current loop) or, on recent Python versions, whenever no loop
    has been set.

    Args:
        coro: The awaitable to drive to completion.

    Returns:
        Whatever ``coro`` returns.

    Raises:
        Any exception raised inside ``coro`` propagates unchanged.
    """
    loop = getattr(run, "_loop", None)
    if loop is None or loop.is_closed():
        loop = asyncio.new_event_loop()
        run._loop = loop
    return loop.run_until_complete(coro)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture()
def db_path(tmp_path, monkeypatch):
    """Point ``settings.db_path`` at a throw-away database file and return that path.

    The file name is ``test.db`` inside pytest's ``tmp_path``. The override is
    applied through ``monkeypatch``, so it is undone when the test finishes.
    The path is returned as a string.
    """
    from app.config import settings

    db_file = str(tmp_path / "test.db")
    monkeypatch.setattr(settings, "db_path", db_file)
    return db_file
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture()
def initialized_db(db_path):
    """Run ``database.init_db()`` with the temporary DB configured; return its path."""
    from app import database

    pending_init = database.init_db()
    run(pending_init)
    return db_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _fake_parse_result(drucksache: str, ergebnis: str = "angenommen",
|
|
|
|
|
einstimmig: bool = False,
|
|
|
|
|
ja: list[str] = None, nein: list[str] = None,
|
|
|
|
|
enth: list[str] = None) -> dict:
|
|
|
|
|
return {
|
|
|
|
|
"drucksache": drucksache,
|
|
|
|
|
"ergebnis": ergebnis,
|
|
|
|
|
"einstimmig": einstimmig,
|
|
|
|
|
"votes": {
|
|
|
|
|
"ja": ja or [],
|
|
|
|
|
"nein": nein or [],
|
|
|
|
|
"enthaltung": enth or [],
|
|
|
|
|
},
|
|
|
|
|
"kind": "direct",
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestIngestPdf:
    """``ingest_pdf`` with ``parse_protocol`` patched: what gets counted and stored."""

    def test_writes_each_parsed_vote(self, initialized_db, tmp_path):
        """Each parsed vote is written and can be read back by its Drucksache."""
        from app import ingest_votes, database

        pdf_file = tmp_path / "MMP18-119.pdf"
        pdf_file.write_bytes(b"%PDF-1.4 fake")

        accepted = _fake_parse_result("18/100", "angenommen", ja=["CDU", "SPD"], nein=["AfD"])
        rejected = _fake_parse_result("18/200", "abgelehnt", ja=["AfD"], nein=["CDU", "SPD"])

        with patch("app.ingest_votes.parse_protocol", return_value=[accepted, rejected]):
            result = run(ingest_votes.ingest_pdf(pdf_file))

        assert result["parsed"] == 2
        assert result["written"] == 2

        stored = run(database.get_plenum_votes("NRW", "18/100"))
        assert len(stored) == 1
        row = stored[0]
        assert row["fraktionen_ja"] == ["CDU", "SPD"]
        assert row["quelle_protokoll"] == "MMP18-119"

    def test_skips_entries_without_drucksache(self, initialized_db, tmp_path):
        """Anchors without a resolvable Drucksache are counted but not written
        (otherwise the import would litter the DB)."""
        from app import ingest_votes

        pdf_file = tmp_path / "MMP18-50.pdf"
        pdf_file.write_bytes(b"%PDF")

        unresolved = {
            "drucksache": None,
            "ergebnis": "angenommen",
            "votes": {"ja": [], "nein": [], "enthaltung": []},
        }
        parsed = [_fake_parse_result("18/300", "angenommen"), unresolved]

        with patch("app.ingest_votes.parse_protocol", return_value=parsed):
            result = run(ingest_votes.ingest_pdf(pdf_file))

        assert result["parsed"] == 2
        assert result["written"] == 1
        assert result["skipped_no_drucksache"] == 1

    def test_protokoll_id_default_from_stem(self, initialized_db, tmp_path):
        """Without an explicit id, the protocol id is the PDF's file stem."""
        from app import ingest_votes, database

        pdf_file = tmp_path / "MMP18-77.pdf"
        pdf_file.write_bytes(b"%PDF")

        parsed = [_fake_parse_result("18/500")]
        with patch("app.ingest_votes.parse_protocol", return_value=parsed):
            result = run(ingest_votes.ingest_pdf(pdf_file))

        assert result["protokoll_id"] == "MMP18-77"
        stored = run(database.get_plenum_votes("NRW", "18/500"))
        assert stored[0]["quelle_protokoll"] == "MMP18-77"

    def test_protokoll_id_override(self, initialized_db, tmp_path):
        """Explicit ``protokoll_id`` and ``quelle_url`` end up on the stored row."""
        from app import ingest_votes, database

        pdf_file = tmp_path / "scan.pdf"
        pdf_file.write_bytes(b"%PDF")

        parsed = [_fake_parse_result("18/600")]
        with patch("app.ingest_votes.parse_protocol", return_value=parsed):
            pending = ingest_votes.ingest_pdf(
                pdf_file, protokoll_id="MMP18-99", quelle_url="https://example.com/x.pdf",
            )
            run(pending)

        row = run(database.get_plenum_votes("NRW", "18/600"))[0]
        assert row["quelle_protokoll"] == "MMP18-99"
        assert row["quelle_url"] == "https://example.com/x.pdf"

    def test_bundesland_override(self, initialized_db, tmp_path):
        """Adapters for other Bundeslaender could reuse the same ingest helper."""
        from app import ingest_votes, database

        pdf_file = tmp_path / "MV-MP1.pdf"
        pdf_file.write_bytes(b"%PDF")

        parsed = [_fake_parse_result("8/100")]
        with patch("app.ingest_votes.parse_protocol", return_value=parsed):
            run(ingest_votes.ingest_pdf(pdf_file, bundesland="MV"))

        # The row is found under the requested Bundesland ...
        under_mv = run(database.get_plenum_votes("MV", "8/100"))
        assert len(under_mv) == 1
        # ... and not under NRW.
        under_nrw = run(database.get_plenum_votes("NRW", "8/100"))
        assert under_nrw == []

    def test_re_ingest_overwrites_same_protokoll(self, initialized_db, tmp_path):
        """Re-ingesting the same protocol updates the existing entries
        (idempotent); no duplicate is created."""
        from app import ingest_votes, database

        pdf_file = tmp_path / "MMP18-1.pdf"
        pdf_file.write_bytes(b"%PDF")

        first_pass = [_fake_parse_result("18/700", "angenommen", ja=["CDU"])]
        with patch("app.ingest_votes.parse_protocol", return_value=first_pass):
            run(ingest_votes.ingest_pdf(pdf_file))

        # Second ingest with a corrected result (e.g. after a parser fix).
        second_pass = [_fake_parse_result("18/700", "abgelehnt", ja=[], nein=["CDU"])]
        with patch("app.ingest_votes.parse_protocol", return_value=second_pass):
            run(ingest_votes.ingest_pdf(pdf_file))

        stored = run(database.get_plenum_votes("NRW", "18/700"))
        assert len(stored) == 1
        row = stored[0]
        assert row["ergebnis"] == "abgelehnt"
        assert row["fraktionen_nein"] == ["CDU"]

    def test_db_error_collected_not_raised(self, initialized_db, tmp_path):
        """If the upsert fails, the error should land in the ``errors`` list
        instead of propagating, so the rest of the protocol is still
        processed."""
        from app import ingest_votes

        pdf_file = tmp_path / "MMP18-2.pdf"
        pdf_file.write_bytes(b"%PDF")

        async def _always_fails(**kw):
            raise RuntimeError("simulated DB error")

        parsed = [
            _fake_parse_result("18/800", "angenommen"),
            _fake_parse_result("18/801", "abgelehnt"),
        ]

        with patch("app.ingest_votes.parse_protocol", return_value=parsed):
            with patch("app.ingest_votes.upsert_plenum_vote", side_effect=_always_fails):
                result = run(ingest_votes.ingest_pdf(pdf_file))

        assert result["written"] == 0
        failures = result["errors"]
        assert len(failures) == 2
        assert "18/800" in failures[0]
        assert "simulated DB error" in failures[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestDownloadPdf:
    """``_download_pdf`` with ``urllib.request.urlopen`` patched out."""

    def test_writes_response_bytes(self, tmp_path):
        """The bytes read from the response are written to the destination file."""
        from app.ingest_votes import _download_pdf

        payload = b"%PDF downloaded content"

        class _StubResponse:
            """Minimal context-manager response offering only ``read()``."""

            def __enter__(self):
                return self

            def __exit__(self, *exc_info):
                return False

            def read(self):
                return payload

        target = tmp_path / "out.pdf"
        with patch("urllib.request.urlopen", return_value=_StubResponse()):
            _download_pdf("https://example.com/x.pdf", target)

        assert target.read_bytes() == payload

    def test_propagates_http_error(self, tmp_path):
        """Download errors propagate: the caller (the CLI) should abort with a
        stack trace instead of silently carrying on."""
        from app.ingest_votes import _download_pdf

        refused = OSError("Connection refused")
        with patch("urllib.request.urlopen", side_effect=refused), \
                pytest.raises(OSError):
            _download_pdf("https://example.com/x.pdf", tmp_path / "out.pdf")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestCli:
    """Tests for the CLI wrapper function ``_cli``, driven through ``sys.argv``."""

    def test_supported_lists_bl(self, capsys):
        """``--supported`` prints the registered Bundeslaender and exits with 0."""
        from app import ingest_votes

        argv = ["ingest_votes", "--supported"]
        with patch.object(ingest_votes.sys, "argv", argv), \
                pytest.raises(SystemExit) as exit_info:
            ingest_votes._cli()

        assert exit_info.value.code == 0
        printed = capsys.readouterr().out
        assert "NRW" in printed

    def test_no_args_errors(self, capsys):
        """Without ``--pdf`` and ``--url`` the CLI must exit (only the
        ``SystemExit`` itself is asserted here)."""
        from app import ingest_votes

        argv = ["ingest_votes"]
        with patch.object(ingest_votes.sys, "argv", argv), \
                pytest.raises(SystemExit):
            ingest_votes._cli()

    def test_pdf_path_missing_errors(self, capsys, tmp_path):
        """``--pdf`` with a non-existent path exits with code 1."""
        from app import ingest_votes

        missing = tmp_path / "missing.pdf"
        argv = ["ingest_votes", "--pdf", str(missing)]
        with patch.object(ingest_votes.sys, "argv", argv), \
                pytest.raises(SystemExit) as exit_info:
            ingest_votes._cli()

        assert exit_info.value.code == 1
        complaint = capsys.readouterr().err
        assert "nicht gefunden" in complaint

    def test_pdf_path_calls_ingest(self, tmp_path, capsys):
        """``--pdf`` with an existing path runs the ingest and prints the stats."""
        from app import ingest_votes

        pdf_file = tmp_path / "MMP18-X.pdf"
        pdf_file.write_bytes(b"%PDF")

        canned = {
            "parsed": 3, "written": 2,
            "skipped_no_drucksache": 1, "errors": [],
            "protokoll_id": "MMP18-X", "bundesland": "NRW",
        }
        argv = ["ingest_votes", "--pdf", str(pdf_file)]
        # ``asyncio.run`` is replaced so that ``_cli`` receives the canned stats.
        with patch("app.ingest_votes.asyncio.run", return_value=canned), \
                patch.object(ingest_votes.sys, "argv", argv):
            ingest_votes._cli()

        printed = capsys.readouterr().out
        for fragment in ("MMP18-X", "parsed: 3", "written: 2", "ohne DS: 1"):
            assert fragment in printed

    def test_url_downloads_then_ingests(self, capsys):
        """``--url`` path: download into a temp location, then ingest."""
        from app import ingest_votes

        canned = {
            "parsed": 1, "written": 1, "skipped_no_drucksache": 0,
            "errors": [], "protokoll_id": "MMP18-Y",
            "bundesland": "NRW",
        }

        class _StubResponse:
            """Minimal context-manager response offering only ``read()``."""

            def __enter__(self):
                return self

            def __exit__(self, *exc_info):
                return False

            def read(self):
                return b"%PDF downloaded"

        argv = ["ingest_votes", "--url", "https://example.com/MMP18-Y.pdf"]
        with patch("app.ingest_votes.asyncio.run", return_value=canned), \
                patch("urllib.request.urlopen", return_value=_StubResponse()), \
                patch.object(ingest_votes.sys, "argv", argv):
            ingest_votes._cli()

        printed = capsys.readouterr().out
        assert "MMP18-Y" in printed

    def test_zero_results_exits_2(self, tmp_path, capsys):
        """Nothing written and no errors: exit code 2 (= 'no signal')."""
        from app import ingest_votes

        pdf_file = tmp_path / "leer.pdf"
        pdf_file.write_bytes(b"%PDF")

        canned = {
            "parsed": 0, "written": 0, "skipped_no_drucksache": 0,
            "errors": [], "protokoll_id": "leer", "bundesland": "NRW",
        }
        argv = ["ingest_votes", "--pdf", str(pdf_file)]
        with patch("app.ingest_votes.asyncio.run", return_value=canned), \
                patch.object(ingest_votes.sys, "argv", argv), \
                pytest.raises(SystemExit) as exit_info:
            ingest_votes._cli()

        assert exit_info.value.code == 2

    def test_errors_listed_in_output(self, tmp_path, capsys):
        """When ``errors`` is non-empty, the errors line and the individual
        entries appear in the output (per the original note: the first 5)."""
        from app import ingest_votes

        pdf_file = tmp_path / "x.pdf"
        pdf_file.write_bytes(b"%PDF")

        canned = {
            "parsed": 2, "written": 0, "skipped_no_drucksache": 0,
            "errors": ["18/1: oops", "18/2: nope"],
            "protokoll_id": "x", "bundesland": "NRW",
        }
        argv = ["ingest_votes", "--pdf", str(pdf_file)]
        with patch("app.ingest_votes.asyncio.run", return_value=canned), \
                patch.object(ingest_votes.sys, "argv", argv):
            ingest_votes._cli()

        printed = capsys.readouterr().out
        for fragment in ("errors: 2", "18/1: oops", "18/2: nope"):
            assert fragment in printed
|