gwoe-antragspruefer/app/protokoll_parsers/__init__.py

"""BL-uebergreifende Plenarprotokoll-Abstimmungsparser (#126).

Architektur (vgl. ADR 0009): pro Bundesland eine Modul-Datei
``app/protokoll_parsers/<bl-code>.py``, die mindestens eine Funktion
``parse_protocol(pdf_path: str) -> list[dict]`` exportiert. Die Registry
``PROTOKOLL_PARSERS`` mappt BL-Code → Parser-Funktion.

Erwartetes Result-Schema pro Eintrag in der Liste::

    {
        "drucksache": str | None,    # z.B. "18/1234"; None bei nicht aufloesbar
        "ergebnis": str,             # angenommen | abgelehnt | ueberwiesen | ...
        "einstimmig": bool,          # explizit als einstimmig markiert
        "kind": str,                 # parser-intern, fuer Debug
        "votes": {                   # Fraktions-Listen pro Vote-Kategorie
            "ja": list[str],
            "nein": list[str],
            "enthaltung": list[str],
        },
    }

NRW ist die Referenz-Implementierung. Alle weiteren BL (bereits registrierte
siehe ``PROTOKOLL_PARSERS``; noch offen z.B. BB/MV) bekommen eigene Module
mit demselben Funktions-Vertrag — neue Eintraege in der Registry sind reine
Tippelarbeit, das Reverse-Engineering pro Landtag ist die eigentliche Arbeit.
"""
from __future__ import annotations

from typing import Callable

from .nrw import parse_protocol as _parse_nrw
from .bund import parse_protocol as _parse_bund
from .be import parse_protocol as _parse_be
from .hh import parse_protocol as _parse_hh
from .th import parse_protocol as _parse_th
from .he import parse_protocol as _parse_he

# Type alias for readability; the parser signature is deliberately minimal:
# one string argument (the PDF path) in, a list of result dicts out.
ProtokollParser = Callable[[str], list[dict]]

# Registry mapping a Bundesland code to the ``parse_protocol`` function
# imported from the corresponding module above. ``parse_protocol`` in this
# module looks parsers up here by exact key.
PROTOKOLL_PARSERS: dict[str, ProtokollParser] = {
    "NRW": _parse_nrw,
    "BUND": _parse_bund,
    "BE": _parse_be,
    "HH": _parse_hh,
    "TH": _parse_th,
    "HE": _parse_he,
}


def parse_protocol(bundesland: str, pdf_path: str) -> list[dict]:
    """Dispatch *pdf_path* to the parser registered for *bundesland*.

    Args:
        bundesland: Key into ``PROTOKOLL_PARSERS``; the lookup is an exact
            match on the key as given.
        pdf_path: Handed unchanged to the selected parser.

    Returns:
        Whatever the selected parser returns for ``pdf_path``.

    Raises:
        NotImplementedError: If no parser is registered (yet) for the given
            Bundesland. Follow-up work: add a dedicated module plus a
            registry entry in this file.
    """
    handler = PROTOKOLL_PARSERS.get(bundesland)
    if handler is not None:
        return handler(pdf_path)

    # No parser found: list the registered codes so the caller sees what is
    # available; "(keine)" covers an empty registry.
    known_codes = ", ".join(sorted(PROTOKOLL_PARSERS.keys())) or "(keine)"
    message = (
        f"Kein Plenarprotokoll-Parser fuer {bundesland!r}. "
        f"Unterstuetzt: {known_codes}. Siehe #126."
    )
    raise NotImplementedError(message)


def supported_bundeslaender() -> list[str]:
    """Return the keys of ``PROTOKOLL_PARSERS`` as a new, sorted list."""
    codes = list(PROTOKOLL_PARSERS)
    codes.sort()
    return codes


# Public names of this package: the parser type alias, the registry, the
# dispatching entry point and the helper listing the registered codes.
__all__ = [
    "ProtokollParser",
    "PROTOKOLL_PARSERS",
    "parse_protocol",
    "supported_bundeslaender",
]