# antragstracker/scripts/scrape_beratungsfolge.py

#!/usr/bin/env python3
"""
Scraped Beratungsfolge und Beschlüsse von ALLRIS Vorlagen-Seiten.
Extrahiert: Sitzungen, Beschlussart, Beschlusstext aus verlinkten TOs.
"""

import argparse
import re
import sqlite3
import time
from pathlib import Path

import httpx
from bs4 import BeautifulSoup

# Directory two levels above this file; the data paths below hang off it.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DB_PATH = PROJECT_ROOT.joinpath("data", "tracker_remote.db")
LOG_FILE = PROJECT_ROOT.joinpath("data", "beratungsfolge.log")

# Rate limiting: pause (in seconds) inserted before HTTP requests.
DELAY_SECONDS = 1.0


def log(msg: str) -> None:
    """Print a timestamped message and append it to the log file.

    Args:
        msg: Text to record. It is prefixed with the local time as
            ``[HH:MM:SS]`` before being printed and written to ``LOG_FILE``.
    """
    timestamp = time.strftime("%H:%M:%S")
    line = f"[{timestamp}] {msg}"
    print(line)
    # Explicit UTF-8: messages contain non-ASCII characters (e.g. "✓" and
    # umlauts) that the platform's default encoding may not be able to encode.
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(line + "\n")


def get_db():
    """Open the SQLite database at ``DB_PATH`` and return the connection.

    The connection uses ``sqlite3.Row`` as row factory, so result columns can
    be read by name as well as by index.
    """
    connection = sqlite3.connect(str(DB_PATH))
    connection.row_factory = sqlite3.Row
    return connection


def init_tables(conn):
    """Add the scraper's columns to the ``beratungen`` table where missing.

    Every column that is not present yet is added with type TEXT and the
    addition is logged. The changes are committed at the end.
    """
    table_info = conn.execute('PRAGMA table_info(beratungen)').fetchall()
    # Index 1 of each table_info row holds the column name.
    present = {row[1] for row in table_info}

    wanted = ('to_url', 'tolfdnr', 'beschlussart', 'beschlusstext', 'wortprotokoll', 'scraped_at')
    missing = [name for name in wanted if name not in present]

    for name in missing:
        conn.execute(f'ALTER TABLE beratungen ADD COLUMN {name} TEXT')
        log(f"  Schema: +{name}")

    conn.commit()


def scrape_vorlage_page(url: str) -> list[dict]:
    """Scrape the consultation sequence (Beratungsfolge) from a Vorlage page.

    Every link whose href contains ``to020`` and a numeric ``TOLFDNR``
    parameter is treated as one consultation entry; the link text is taken as
    the decision type. The nearest preceding ``to010`` link carrying
    ``SILFDNR``, if there is one, supplies the session name and URL.

    Args:
        url: Address of the Vorlage page to fetch.

    Returns:
        One dict per entry with the keys ``tolfdnr``, ``beschlussart``,
        ``sitzung_name``, ``sitzung_url`` and ``to_url``. The session fields
        are None when no session link precedes the entry. An empty list is
        returned when no entry is found or when any error occurs (the error
        is logged, not raised).
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    try:
        resp = httpx.get(url, timeout=30, follow_redirects=True)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')

        # Resolve hrefs against the URL that was actually served (after
        # redirects). Plain concatenation with a fixed host produced broken
        # URLs for page-relative hrefs such as "to020.asp?TOLFDNR=1".
        base_url = str(resp.url)
        sitzung_href = re.compile(r'to010.*SILFDNR=')

        beratungen = []
        for link in soup.find_all('a', href=True):
            href = link['href']

            # Only agenda-item (to020) links with a numeric TOLFDNR count.
            if 'to020' not in href:
                continue
            tolfdnr_match = re.search(r'TOLFDNR=(\d+)', href)
            if not tolfdnr_match:
                continue

            # Session info comes from the closest earlier to010 link.
            # NOTE(review): if an entry has no session link of its own, this
            # picks up the session link of an earlier entry - confirm that
            # this is acceptable for the page layout.
            sitzung_name = None
            sitzung_url = None
            prev = link.find_previous('a', href=sitzung_href)
            if prev:
                sitzung_name = prev.get_text(strip=True)
                sitzung_url = urljoin(base_url, prev['href'])

            beratungen.append({
                'tolfdnr': tolfdnr_match.group(1),
                'beschlussart': link.get_text(strip=True),
                'sitzung_name': sitzung_name,
                'sitzung_url': sitzung_url,
                'to_url': urljoin(base_url, href),
            })

        return beratungen

    except Exception as e:
        # Deliberate best effort: one bad page must not abort the whole run.
        log(f"  Fehler beim Scrapen: {e}")
        return []


def scrape_to_page(url: str) -> dict:
    """Fetch an agenda-item (TO) page and extract decision text, minutes and date.

    The date is the first ``DD.MM.YYYY`` pattern in the ``h1.title`` heading.
    The texts are the non-empty contents of all ``<span>`` elements whose
    style mentions the Arial font family: the last one is stored as the
    decision text, all earlier ones (joined by blank lines) as the minutes.

    Returns:
        Dict with the keys ``beschlusstext``, ``wortprotokoll`` and
        ``sitzung_datum``, each None when nothing was found. On any error the
        error is logged and an empty dict is returned.
    """
    try:
        response = httpx.get(url, timeout=30, follow_redirects=True)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')

        # Date from the heading, e.g. "30.01.2025 - 6.4 Mündlicher..."
        sitzung_datum = None
        heading = page.find('h1', class_='title')
        if heading:
            found = re.search(r'(\d{2}\.\d{2}\.\d{4})', heading.get_text())
            if found:
                sitzung_datum = found.group(1)

        # Collect the non-empty text of every Arial-styled span.
        fragments = []
        for span in page.find_all('span', style=re.compile(r'font-family.*Arial')):
            content = span.get_text(strip=True)
            if content:
                fragments.append(content)

        beschlusstext = None
        wortprotokoll = None
        if fragments:
            # Heuristic: the last fragment is usually the decision itself,
            # everything before it is treated as the minutes.
            *earlier, beschlusstext = fragments
            if earlier:
                wortprotokoll = '\n\n'.join(earlier)

        return {
            'beschlusstext': beschlusstext,
            'wortprotokoll': wortprotokoll,
            'sitzung_datum': sitzung_datum,
        }

    except Exception as exc:
        log(f"  TO-Fehler: {exc}")
        return {}


def _save_beratung(conn, vorlage_id, beratung: dict, to_details: dict) -> None:
    """Store one scraped entry: update its row, else claim one unassigned row, else insert."""
    values = (
        beratung['to_url'],
        beratung['tolfdnr'],
        beratung['beschlussart'],
        to_details.get('beschlusstext'),
        to_details.get('wortprotokoll'),
    )
    update_head = """
        UPDATE beratungen
        SET to_url = ?, tolfdnr = ?, beschlussart = ?,
            beschlusstext = ?, wortprotokoll = ?, scraped_at = CURRENT_TIMESTAMP
    """

    # 1) A row that already belongs to this agenda item (e.g. on a re-run).
    cur = conn.execute(
        update_head + " WHERE vorlage_id = ? AND tolfdnr = ?",
        values + (vorlage_id, beratung['tolfdnr']),
    )
    if cur.rowcount > 0:
        return

    # 2) Otherwise claim exactly ONE row of this Vorlage without a TOLFDNR.
    #    Updating all such rows at once would stamp the same entry onto
    #    every unassigned row.
    #    NOTE(review): relies on the implicit rowid - confirm beratungen is
    #    not declared WITHOUT ROWID.
    cur = conn.execute(
        update_head + """
        WHERE rowid = (
            SELECT rowid FROM beratungen
            WHERE vorlage_id = ? AND tolfdnr IS NULL
            ORDER BY rowid
            LIMIT 1
        )
        """,
        values + (vorlage_id,),
    )
    if cur.rowcount > 0:
        return

    # 3) Nothing to update: create a new row.
    conn.execute("""
        INSERT INTO beratungen
        (vorlage_id, to_url, tolfdnr, beschlussart, beschlusstext, wortprotokoll, scraped_at)
        VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
    """, (vorlage_id,) + values)


def process_vorlage(conn, vorlage: dict) -> int:
    """Scrape one Vorlage and store its consultation sequence.

    Fetches the Vorlage page, then each linked agenda-item page (pausing
    ``DELAY_SECONDS`` before every agenda-item request), and writes one row
    per entry to ``beratungen``. Each entry is committed on its own.

    Args:
        conn: Open SQLite connection.
        vorlage: Mapping with at least the keys ``id`` and ``web_url``.

    Returns:
        Number of entries stored. 0 if ``web_url`` is empty or the Vorlage
        page yielded no entries (which includes scrape errors).
    """
    vorlage_id = vorlage['id']
    web_url = vorlage['web_url']

    if not web_url:
        return 0

    beratungen = scrape_vorlage_page(web_url)
    if not beratungen:
        return 0

    saved = 0
    for b in beratungen:
        time.sleep(DELAY_SECONDS)

        # Details (decision text, minutes) live on the agenda-item page.
        to_details = scrape_to_page(b['to_url'])

        try:
            # Whether a row was updated is decided from the statement's own
            # rowcount. Connection.total_changes is cumulative for the whole
            # connection and is therefore non-zero after the first write.
            _save_beratung(conn, vorlage_id, b, to_details)
            conn.commit()
            saved += 1
        except sqlite3.Error as e:
            # Discard the half-applied entry so a later commit cannot
            # persist it.
            conn.rollback()
            log(f"  DB-Fehler: {e}")

    return saved


def main() -> int:
    """Scrape the consultation sequence for all Vorlagen not yet processed.

    Command line:
        --limit N   Maximum number of Vorlagen to process (default 50).

    Selects Vorlagen that have a ``web_url`` and no ``beratung_status``,
    newest ``datum_eingang`` first, processes each one and then sets its
    ``beratung_status`` to ``'done'``.

    Returns:
        Always 0, used as the process exit code.
    """
    parser = argparse.ArgumentParser(description="Beratungsfolge scrapen")
    parser.add_argument("--limit", type=int, default=50, help="Max. Vorlagen")
    args = parser.parse_args()

    log("=== Beratungsfolge-Scraper ===")
    log(f"Limit: {args.limit}")

    conn = get_db()
    total_beratungen = 0
    try:
        init_tables(conn)

        # Vorlagen with a web_url that have not been scraped yet
        # (tracked via beratung_status).
        vorlagen = conn.execute("""
            SELECT id, aktenzeichen, web_url
            FROM vorlagen
            WHERE web_url IS NOT NULL
              AND beratung_status IS NULL
            ORDER BY datum_eingang DESC
            LIMIT ?
        """, (args.limit,)).fetchall()

        log(f"Zu verarbeiten: {len(vorlagen)}")

        for i, v in enumerate(vorlagen, start=1):
            log(f"[{i}/{len(vorlagen)}] {v['aktenzeichen']}...")
            time.sleep(DELAY_SECONDS)

            count = process_vorlage(conn, dict(v))
            total_beratungen += count

            # The status is set even when nothing was found.
            # NOTE(review): process_vorlage also returns 0 when scraping
            # failed, so a transient network error marks the Vorlage as done
            # and it is never retried - confirm this is intended.
            conn.execute("UPDATE vorlagen SET beratung_status = 'done' WHERE id = ?", (v['id'],))
            conn.commit()

            if count > 0:
                log(f"  ✓ {count} Beratungen")
            else:
                log("  - Keine Beratungsfolge")
    finally:
        # Release the connection even if processing raised.
        conn.close()

    log("\n=== Fertig ===")
    log(f"Beratungen gespeichert: {total_beratungen}")

    # Exit code for the batch runner.
    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())