# gwoe-antragspruefer/app/protokoll_parsers/sl.py

"""Saarland (SL) — Abstimmungsergebnisse-Parser (#106 / #161, ADR 0009).

**Spezialfall:** Saarland publiziert keine Wortprotokolle, sondern eigene
Abstimmungsergebnisse-HTML-Seiten pro Sitzung mit strukturiertem Vote-Block:

```
<p>...Drucksache 17/2076...
in Erster Lesung mit Stimmenmehrheit angenommen und an den Ausschuss [...]
[SPD: dafür; CDU und AfD: dagegen]</p>
```

Daher Input ist HTML, nicht PDF. ``parse_protocol(html_path)`` liest die
HTML-Seite und extrahiert pro <li> einen Vote.

URL-Pattern (nicht direkt vorhersagbar, daher Index-Scrape):
``https://www.landtag-saar.de/aktuelles/mitteilungen/abstimmungsergebnisse-der-{n}-landtagssitzung-vom-{datum}/``

Index-Seite: https://www.landtag-saar.de (Front-Listing der Mitteilungen).

## Vote-Block-Format

Strukturierte Klammer-Notation pro Drucksache:
- ``[SPD: dafür; CDU und AfD: dagegen]`` → JA=[SPD], NEIN=[CDU,AfD]
- ``[SPD: dafür; CDU: dagegen; AfD: Enthaltung]`` → JA=[SPD], NEIN=[CDU], ENTH=[AfD]
- ``[SPD und CDU: dafür; AfD: Enthaltung]`` → JA=[SPD,CDU], NEIN=[], ENTH=[AfD]

## Ergebnis-Mapping

- ``angenommen`` (mit oder ohne ``mit Stimmenmehrheit|mit Mehrheit|einstimmig``) → angenommen
- ``abgelehnt`` → abgelehnt
- ``abgesetzt`` → abgesetzt
- ``zur Kenntnis genommen`` → uebersprungen (kein Vote)

## Fraktions-Mapping WP17 (ab 2022)

WP17 Konstellation: SPD-Alleinregierung (43 Sitze), CDU + AfD Opposition.
- ``SPD``, ``CDU``, ``AfD``
"""
from __future__ import annotations

import re
from typing import Optional


# Parliamentary groups of the Saarland Landtag in WP17 (see module
# docstring). parse_protocol falls back to this list for unanimous
# decisions that carry no explicit vote bracket.
ALLE_FRAKTIONEN_SL = ["SPD", "CDU", "AfD"]


# <li>...</li> blocks of a session page; each block typically contains
# one Drucksache reference, one status phrase and one vote bracket.
# DOTALL lets the lazy body match across line breaks.
LI_BLOCK_RE = re.compile(
    r"<li[^>]*>(.*?)</li>",
    re.DOTALL,
)

# Drucksache reference such as "Drucksache 17/2076"; group 1 captures the
# "<1-2 digits>/<2-5 digits>" number.
DS_RE_SL = re.compile(r"Drucksache\s+(\d{1,2}/\d{2,5})")

# Status phrase: an optional "in <word> Lesung" prefix, an optional
# majority qualifier, then the outcome keyword in the named group
# "ergebnis". Matching is case-insensitive.
STATUS_RE = re.compile(
    r"(?:in\s+\w+\s+Lesung\s+)?"
    r"(?:mit\s+Stimmenmehrheit|einstimmig|mit\s+Mehrheit)?\s*"
    r"(?P<ergebnis>angenommen|abgelehnt|abgesetzt|zur\s+Kenntnis\s+genommen)",
    re.IGNORECASE,
)

# Vote bracket, e.g. [SPD: dafür; CDU und AfD: dagegen]. The named group
# "inner" is the text between the brackets; it may not contain brackets.
VOTE_BRACKET_RE = re.compile(r"\[(?P<inner>[^\[\]]+)\]")


def _normalize_fraktionen_sl(phrase: str) -> list[str]:
    """Return the known parliamentary groups mentioned in *phrase*, sorted.

    Each name from ``ALLE_FRAKTIONEN_SL`` is looked up as a whole word,
    ignoring case. The result uses the canonical spelling from that list
    and contains every group at most once, e.g. ``"SPD und CDU"`` gives
    ``["CDU", "SPD"]``.
    """
    mentioned = {
        fr
        for fr in ALLE_FRAKTIONEN_SL
        if re.search(rf"\b{re.escape(fr)}\b", phrase, re.IGNORECASE)
    }
    return sorted(mentioned)


def _parse_vote_bracket(bracket_inner: str) -> dict:
    """Turn the inside of a vote bracket into per-position group lists.

    The text (without the surrounding brackets) is split on ``;``. Each
    piece is expected to look like ``<groups>: <position>``; pieces without
    a colon, or with a position that matches no keyword, are ignored.

    Returns:
        A dict with the keys ``ja``, ``nein`` and ``enthaltung``, each
        holding a sorted list of group names without duplicates.
    """
    # Checked in order; the first position whose keyword occurs in the
    # lower-cased status text wins.
    keyword_table = (
        ("ja", ("dafür", "ja", "zustimm")),
        ("nein", ("dagegen", "nein", "ablehn")),
        ("enthaltung", ("enthalt",)),
    )
    buckets = {"ja": set(), "nein": set(), "enthaltung": set()}

    for piece in bracket_inner.split(";"):
        # Split on the last colon; an empty separator means there was none.
        names, colon, stance = piece.rpartition(":")
        if not colon:
            continue
        stance = stance.strip().lower()
        for position, keywords in keyword_table:
            if any(keyword in stance for keyword in keywords):
                buckets[position].update(_normalize_fraktionen_sl(names))
                break

    return {position: sorted(groups) for position, groups in buckets.items()}


def _strip_html(text: str) -> str:
    text = re.sub(r"<[^>]+>", " ", text)
    text = text.replace("&amp;", "&").replace("&nbsp;", " ")
    return re.sub(r"\s+", " ", text).strip()


def parse_protocol(html_path: str) -> list[dict]:
    """Parse a Saarland voting-results HTML page into per-item vote records.

    Every ``<li>...</li>`` block is examined on its own. A block yields a
    record only if it names a Drucksache and contains a recognised result
    keyword. Blocks whose result is "zur Kenntnis genommen" are skipped
    because no vote took place.

    For a unanimous decision ("einstimmig") the position matching the
    result is filled in when the vote bracket left it empty: ``nein`` for
    "abgelehnt", ``ja`` for every other result. Groups already recorded
    under another position are left out of that fallback.

    Args:
        html_path: Path to the saved HTML page. It is read as UTF-8;
            undecodable bytes are replaced instead of raising.

    Returns:
        One dict per accepted block, in document order, with the keys
        ``drucksache``, ``ergebnis`` (lower-cased result keyword),
        ``einstimmig`` (bool), ``kind`` (always ``"direct"``), ``votes``
        (dict with ``ja``/``nein``/``enthaltung`` lists) and ``anchor_pos``
        (offset of the ``<li>`` block in the raw HTML).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(html_path, "r", encoding="utf-8", errors="replace") as f:
        html = f.read()

    results = []
    for m in LI_BLOCK_RE.finditer(html):
        block_text = _strip_html(m.group(1))

        ds_m = DS_RE_SL.search(block_text)
        if not ds_m:
            continue

        status_m = STATUS_RE.search(block_text)
        if not status_m:
            continue
        ergebnis = status_m.group("ergebnis").lower()
        if "kenntnis" in ergebnis:
            # Taken note of only: no vote happened.
            continue

        einstimmig = re.search(r"einstimmig", block_text, re.IGNORECASE) is not None

        vote_m = VOTE_BRACKET_RE.search(block_text)
        if vote_m:
            votes = _parse_vote_bracket(vote_m.group("inner"))
        else:
            votes = {"ja": [], "nein": [], "enthaltung": []}

        if einstimmig:
            # A unanimous rejection means everybody voted against, so the
            # fallback has to fill "nein" rather than "ja" in that case.
            side = "nein" if ergebnis == "abgelehnt" else "ja"
            if not votes[side]:
                recorded_elsewhere = {
                    fraktion
                    for position, fraktionen in votes.items()
                    if position != side
                    for fraktion in fraktionen
                }
                votes[side] = [
                    fraktion
                    for fraktion in ALLE_FRAKTIONEN_SL
                    if fraktion not in recorded_elsewhere
                ]

        results.append({
            "drucksache": ds_m.group(1),
            "ergebnis": ergebnis,
            "einstimmig": einstimmig,
            "kind": "direct",
            "votes": votes,
            "anchor_pos": m.start(),
        })

    # Each <li> match starts at a distinct offset, so the pair
    # (drucksache, anchor_pos) is unique and needs no de-duplication.
    return results