# gwoe-antragspruefer/tests/test_ingest_votes.py

"""Tests fuer app/ingest_votes.py — PDF → plenum_vote_results Pipeline (#106 / #126)."""
from __future__ import annotations

import asyncio
import sys
from pathlib import Path
from unittest.mock import patch

import pytest

# Same aiosqlite setup problem as in test_database.py: force a fresh import
# here so that nothing in this module ends up working against a stub.
# If sys.modules already holds an "aiosqlite" entry that has no ``connect``
# attribute (presumably a stub registered by another test module — confirm),
# drop it so the import below loads the module again.
_aio = sys.modules.get("aiosqlite")
if _aio is not None and not hasattr(_aio, "connect"):
    del sys.modules["aiosqlite"]

import aiosqlite  # noqa: E402
import importlib  # noqa: E402

# Same treatment for app.database: when it was already imported and its
# ``aiosqlite`` attribute is missing or lacks ``connect``, discard the cached
# module and import it again; when it was never imported, import it now.
if "app.database" in sys.modules:
    if not hasattr(getattr(sys.modules["app.database"], "aiosqlite", None), "connect"):
        del sys.modules["app.database"]
        importlib.import_module("app.database")
else:
    importlib.import_module("app.database")


def run(coro):
    """Run ``coro`` to completion on a private event loop and return its result.

    A fresh loop is created for every call instead of going through
    ``asyncio.get_event_loop()``. That call is deprecated when no loop is
    current, and it raises ``RuntimeError`` as soon as earlier code (for
    example a real ``asyncio.run``) has reset the thread's current loop to
    ``None`` — which made this helper depend on test execution order.

    The thread's current-loop setting is deliberately left untouched so other
    test modules are not affected by calls made here.

    Exceptions raised by ``coro`` propagate to the caller unchanged.
    """
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(coro)
    finally:
        try:
            # Finalize async generators the coroutine may have left open
            # before the loop goes away.
            loop.run_until_complete(loop.shutdown_asyncgens())
        finally:
            loop.close()


@pytest.fixture()
def db_path(tmp_path, monkeypatch):
    """Redirect ``settings.db_path`` to a per-test location.

    The location is ``test.db`` inside pytest's ``tmp_path``; only the path
    is computed here, no file is created. Returns the path as ``str``.
    """
    from app.config import settings

    location = str(tmp_path / "test.db")
    monkeypatch.setattr(settings, "db_path", location)
    return location


@pytest.fixture()
def initialized_db(db_path):
    """Run ``database.init_db()`` against the patched path and return that path."""
    from app import database

    pending_init = database.init_db()
    run(pending_init)
    return db_path


def _fake_parse_result(drucksache: str, ergebnis: str = "angenommen",
                       einstimmig: bool = False,
                       ja: list[str] | None = None,
                       nein: list[str] | None = None,
                       enth: list[str] | None = None) -> dict:
    """Build one parser-result dict as the tests feed it to ``ingest_pdf``.

    Args:
        drucksache: Value stored under ``"drucksache"``.
        ergebnis: Value stored under ``"ergebnis"``.
        einstimmig: Value stored under ``"einstimmig"``.
        ja: Entries for ``votes["ja"]``; ``None`` or empty yields ``[]``.
        nein: Entries for ``votes["nein"]``; ``None`` or empty yields ``[]``.
        enth: Entries for ``votes["enthaltung"]``; ``None`` or empty yields ``[]``.

    Returns:
        A dict with the keys ``drucksache``, ``ergebnis``, ``einstimmig``,
        ``votes`` and ``kind`` (always ``"direct"``). Non-empty lists are
        passed through as-is, not copied.
    """
    return {
        "drucksache": drucksache,
        "ergebnis": ergebnis,
        "einstimmig": einstimmig,
        "votes": {
            "ja": ja or [],
            "nein": nein or [],
            "enthaltung": enth or [],
        },
        "kind": "direct",
    }


class TestIngestPdf:
    """Checks of ``ingest_votes.ingest_pdf`` against an initialized temp DB.

    ``parse_protocol`` is patched in every test, so the PDF files written
    here are placeholders; the parser output is supplied by the test.
    """

    def test_writes_each_parsed_vote(self, initialized_db, tmp_path):
        """Two parsed votes are both counted as written and can be read back."""
        from app import ingest_votes, database

        pdf_file = tmp_path / "MMP18-119.pdf"
        pdf_file.write_bytes(b"%PDF-1.4 fake")

        accepted = _fake_parse_result("18/100", "angenommen", ja=["CDU", "SPD"], nein=["AfD"])
        rejected = _fake_parse_result("18/200", "abgelehnt", ja=["AfD"], nein=["CDU", "SPD"])

        with patch("app.ingest_votes.parse_protocol", return_value=[accepted, rejected]):
            summary = run(ingest_votes.ingest_pdf(pdf_file))

        assert summary["parsed"] == 2
        assert summary["written"] == 2

        stored = run(database.get_plenum_votes("NRW", "18/100"))
        assert len(stored) == 1
        row = stored[0]
        assert row["fraktionen_ja"] == ["CDU", "SPD"]
        assert row["quelle_protokoll"] == "MMP18-119"

    def test_skips_entries_without_drucksache(self, initialized_db, tmp_path):
        """Anchors without a resolvable Drucksache are counted but not written
        (otherwise the import would clutter the DB)."""
        from app import ingest_votes

        pdf_file = tmp_path / "MMP18-50.pdf"
        pdf_file.write_bytes(b"%PDF")

        with_ds = _fake_parse_result("18/300", "angenommen")
        without_ds = {
            "drucksache": None,
            "ergebnis": "angenommen",
            "votes": {"ja": [], "nein": [], "enthaltung": []},
        }

        with patch("app.ingest_votes.parse_protocol", return_value=[with_ds, without_ds]):
            summary = run(ingest_votes.ingest_pdf(pdf_file))

        assert summary["parsed"] == 2
        assert summary["written"] == 1
        assert summary["skipped_no_drucksache"] == 1

    def test_protokoll_id_default_from_stem(self, initialized_db, tmp_path):
        """Without an explicit id, the PDF file stem is used as protokoll_id."""
        from app import ingest_votes, database

        pdf_file = tmp_path / "MMP18-77.pdf"
        pdf_file.write_bytes(b"%PDF")

        parsed = [_fake_parse_result("18/500")]
        with patch("app.ingest_votes.parse_protocol", return_value=parsed):
            summary = run(ingest_votes.ingest_pdf(pdf_file))

        assert summary["protokoll_id"] == "MMP18-77"
        stored = run(database.get_plenum_votes("NRW", "18/500"))
        assert stored[0]["quelle_protokoll"] == "MMP18-77"

    def test_protokoll_id_override(self, initialized_db, tmp_path):
        """Explicit ``protokoll_id`` and ``quelle_url`` end up in the stored row."""
        from app import ingest_votes, database

        pdf_file = tmp_path / "scan.pdf"
        pdf_file.write_bytes(b"%PDF")

        parsed = [_fake_parse_result("18/600")]
        with patch("app.ingest_votes.parse_protocol", return_value=parsed):
            run(
                ingest_votes.ingest_pdf(
                    pdf_file,
                    protokoll_id="MMP18-99",
                    quelle_url="https://example.com/x.pdf",
                )
            )

        row = run(database.get_plenum_votes("NRW", "18/600"))[0]
        assert row["quelle_protokoll"] == "MMP18-99"
        assert row["quelle_url"] == "https://example.com/x.pdf"

    def test_bundesland_override(self, initialized_db, tmp_path):
        """Adapters for other Bundeslaender could reuse the same ingest helper."""
        from app import ingest_votes, database

        pdf_file = tmp_path / "MV-MP1.pdf"
        pdf_file.write_bytes(b"%PDF")

        parsed = [_fake_parse_result("8/100")]
        with patch("app.ingest_votes.parse_protocol", return_value=parsed):
            run(ingest_votes.ingest_pdf(pdf_file, bundesland="MV"))

        # The row must be found under the Bundesland it was ingested for ...
        in_mv = run(database.get_plenum_votes("MV", "8/100"))
        assert len(in_mv) == 1
        # ... and must not show up under the default one.
        in_nrw = run(database.get_plenum_votes("NRW", "8/100"))
        assert in_nrw == []

    def test_re_ingest_overwrites_same_protokoll(self, initialized_db, tmp_path):
        """Ingesting the same protocol again updates the existing entry
        (idempotent) instead of creating a duplicate."""
        from app import ingest_votes, database

        pdf_file = tmp_path / "MMP18-1.pdf"
        pdf_file.write_bytes(b"%PDF")

        initial = _fake_parse_result("18/700", "angenommen", ja=["CDU"])
        # Second pass carries a corrected result (e.g. after a parser fix).
        corrected = _fake_parse_result("18/700", "abgelehnt", ja=[], nein=["CDU"])

        for parsed in (initial, corrected):
            with patch("app.ingest_votes.parse_protocol", return_value=[parsed]):
                run(ingest_votes.ingest_pdf(pdf_file))

        stored = run(database.get_plenum_votes("NRW", "18/700"))
        assert len(stored) == 1
        row = stored[0]
        assert row["ergebnis"] == "abgelehnt"
        assert row["fraktionen_nein"] == ["CDU"]

    def test_db_error_collected_not_raised(self, initialized_db, tmp_path):
        """A failing upsert must land in the ``errors`` list rather than
        propagate, so the rest of the protocol is still processed."""
        from app import ingest_votes

        pdf_file = tmp_path / "MMP18-2.pdf"
        pdf_file.write_bytes(b"%PDF")

        async def _explode(**kw):
            raise RuntimeError("simulated DB error")

        parsed = [
            _fake_parse_result("18/800", "angenommen"),
            _fake_parse_result("18/801", "abgelehnt"),
        ]

        with patch("app.ingest_votes.parse_protocol", return_value=parsed):
            with patch("app.ingest_votes.upsert_plenum_vote", side_effect=_explode):
                summary = run(ingest_votes.ingest_pdf(pdf_file))

        assert summary["written"] == 0
        assert len(summary["errors"]) == 2
        first_error = summary["errors"][0]
        assert "18/800" in first_error
        assert "simulated DB error" in first_error


class TestDownloadPdf:
    """Tests for ``_download_pdf`` with ``urllib.request.urlopen`` patched out."""

    def test_writes_response_bytes(self, tmp_path):
        """The bytes read from the response are written to the destination."""
        from app.ingest_votes import _download_pdf

        class _StubResponse:
            # Minimal context-manager response: exposes read() only.
            def __enter__(self):
                return self

            def __exit__(self, *exc_info):
                return False

            def read(self):
                return b"%PDF downloaded content"

        target = tmp_path / "out.pdf"
        with patch("urllib.request.urlopen", return_value=_StubResponse()):
            _download_pdf("https://example.com/x.pdf", target)

        assert target.read_bytes() == b"%PDF downloaded content"

    def test_propagates_http_error(self, tmp_path):
        """Download errors propagate: the caller (CLI) should abort with a
        stack trace instead of silently carrying on."""
        from app.ingest_votes import _download_pdf

        refused = OSError("Connection refused")
        with patch("urllib.request.urlopen", side_effect=refused), pytest.raises(OSError):
            _download_pdf("https://example.com/x.pdf", tmp_path / "out.pdf")


class TestCli:
    """Tests for the argv-driven CLI wrapper ``ingest_votes._cli``."""

    @staticmethod
    def _fake_asyncio_run(stats):
        """Return a stand-in for ``asyncio.run`` that yields ``stats``.

        A plain ``return_value`` mock would drop the coroutine that the CLI
        hands to ``asyncio.run`` without ever awaiting it, which produces a
        "coroutine ... was never awaited" RuntimeWarning. The stand-in closes
        the object it receives (when it has a ``close`` method) before
        returning the canned statistics.
        """
        def _run(main, *args, **kwargs):
            close = getattr(main, "close", None)
            if callable(close):
                close()
            return stats

        return _run

    def test_supported_lists_bl(self, capsys):
        """``--supported`` prints the registered Bundeslaender and exits with 0."""
        from app import ingest_votes
        with patch.object(ingest_votes.sys, "argv", ["ingest_votes", "--supported"]):
            with pytest.raises(SystemExit) as exc:
                ingest_votes._cli()
        assert exc.value.code == 0
        out = capsys.readouterr().out
        assert "NRW" in out

    def test_no_args_errors(self, capsys):
        """Without ``--pdf`` or ``--url`` the CLI must exit.

        Only the ``SystemExit`` itself is asserted here; exit code and
        message are not checked.
        """
        from app import ingest_votes
        with patch.object(ingest_votes.sys, "argv", ["ingest_votes"]):
            with pytest.raises(SystemExit):
                ingest_votes._cli()

    def test_pdf_path_missing_errors(self, capsys, tmp_path):
        """``--pdf`` with a non-existent path exits with code 1 and reports
        the missing file on stderr."""
        from app import ingest_votes
        nonexistent = tmp_path / "missing.pdf"
        with patch.object(ingest_votes.sys, "argv",
                          ["ingest_votes", "--pdf", str(nonexistent)]):
            with pytest.raises(SystemExit) as exc:
                ingest_votes._cli()
        assert exc.value.code == 1
        err = capsys.readouterr().err
        assert "nicht gefunden" in err

    def test_pdf_path_calls_ingest(self, tmp_path, capsys):
        """``--pdf`` with an existing path runs the ingest and prints the stats."""
        from app import ingest_votes
        pdf = tmp_path / "MMP18-X.pdf"
        pdf.write_bytes(b"%PDF")

        fake_stats = {
            "parsed": 3, "written": 2,
            "skipped_no_drucksache": 1, "errors": [],
            "protokoll_id": "MMP18-X", "bundesland": "NRW",
        }
        with patch("app.ingest_votes.asyncio.run",
                   side_effect=self._fake_asyncio_run(fake_stats)), \
             patch.object(ingest_votes.sys, "argv",
                          ["ingest_votes", "--pdf", str(pdf)]):
            ingest_votes._cli()
        out = capsys.readouterr().out
        assert "MMP18-X" in out
        assert "parsed:  3" in out
        assert "written: 2" in out
        assert "ohne DS: 1" in out

    def test_url_downloads_then_ingests(self, capsys):
        """``--url`` path: the download is served by a fake response, then
        the (patched) ingest result is printed."""
        from app import ingest_votes

        fake_stats = {
            "parsed": 1, "written": 1, "skipped_no_drucksache": 0,
            "errors": [], "protokoll_id": "MMP18-Y",
            "bundesland": "NRW",
        }

        class _FakeResp:
            # Minimal context-manager response object for urlopen().
            def read(self):
                return b"%PDF downloaded"

            def __enter__(self):
                return self

            def __exit__(self, *a):
                return False

        with patch("app.ingest_votes.asyncio.run",
                   side_effect=self._fake_asyncio_run(fake_stats)), \
             patch("urllib.request.urlopen", return_value=_FakeResp()), \
             patch.object(ingest_votes.sys, "argv",
                          ["ingest_votes", "--url",
                           "https://example.com/MMP18-Y.pdf"]):
            ingest_votes._cli()
        out = capsys.readouterr().out
        assert "MMP18-Y" in out

    def test_zero_results_exits_2(self, tmp_path, capsys):
        """Nothing written and no errors: exit code 2 (= 'no signal')."""
        from app import ingest_votes
        pdf = tmp_path / "leer.pdf"
        pdf.write_bytes(b"%PDF")

        fake_stats = {
            "parsed": 0, "written": 0, "skipped_no_drucksache": 0,
            "errors": [], "protokoll_id": "leer", "bundesland": "NRW",
        }
        with patch("app.ingest_votes.asyncio.run",
                   side_effect=self._fake_asyncio_run(fake_stats)), \
             patch.object(ingest_votes.sys, "argv",
                          ["ingest_votes", "--pdf", str(pdf)]):
            with pytest.raises(SystemExit) as exc:
                ingest_votes._cli()
        assert exc.value.code == 2

    def test_errors_listed_in_output(self, tmp_path, capsys):
        """When ``errors`` is non-empty, the errors line and the individual
        entries are printed (per the original note: the first five)."""
        from app import ingest_votes
        pdf = tmp_path / "x.pdf"
        pdf.write_bytes(b"%PDF")
        fake_stats = {
            "parsed": 2, "written": 0, "skipped_no_drucksache": 0,
            "errors": ["18/1: oops", "18/2: nope"],
            "protokoll_id": "x", "bundesland": "NRW",
        }
        with patch("app.ingest_votes.asyncio.run",
                   side_effect=self._fake_asyncio_run(fake_stats)), \
             patch.object(ingest_votes.sys, "argv",
                          ["ingest_votes", "--pdf", str(pdf)]):
            ingest_votes._cli()
        out = capsys.readouterr().out
        assert "errors:  2" in out
        assert "18/1: oops" in out
        assert "18/2: nope" in out