260 lines
8.2 KiB
Python
260 lines
8.2 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
Scraped Beratungsfolge und Beschlüsse von ALLRIS Vorlagen-Seiten.
|
||
|
|
Extrahiert: Sitzungen, Beschlussart, Beschlusstext aus verlinkten TOs.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import re
|
||
|
|
import sqlite3
|
||
|
|
import time
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import httpx
|
||
|
|
from bs4 import BeautifulSoup
|
||
|
|
|
||
|
|
# Repository root: this script is expected to live one directory below it.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# SQLite database the scraper reads from and writes to.
DB_PATH = PROJECT_ROOT / "data" / "tracker_remote.db"
# Append-only log file mirroring console output.
LOG_FILE = PROJECT_ROOT / "data" / "beratungsfolge.log"

# Rate limiting: pause between HTTP requests to the ALLRIS server (seconds).
DELAY_SECONDS = 1.0
|
||
|
|
|
||
|
|
|
||
|
|
def log(msg: str) -> None:
    """Print *msg* with a timestamp prefix and append it to LOG_FILE."""
    timestamp = time.strftime("%H:%M:%S")
    line = f"[{timestamp}] {msg}"
    print(line)
    # BUGFIX: open with an explicit encoding.  Log messages contain
    # non-ASCII characters ("✓", German umlauts); without encoding= the
    # platform default applies and can raise UnicodeEncodeError (e.g. on
    # Windows with a cp1252/ASCII locale).
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(line + "\n")
|
||
|
|
|
||
|
|
|
||
|
|
def get_db():
    """Open the tracker database; rows support name-based access."""
    connection = sqlite3.connect(str(DB_PATH))
    connection.row_factory = sqlite3.Row
    return connection
|
||
|
|
|
||
|
|
|
||
|
|
def init_tables(conn):
    """Add the scraper's columns to the beratungen table when missing."""
    schema_rows = conn.execute('PRAGMA table_info(beratungen)').fetchall()
    present = {row[1] for row in schema_rows}

    wanted = ['to_url', 'tolfdnr', 'beschlussart', 'beschlusstext', 'wortprotokoll', 'scraped_at']
    for column in wanted:
        if column in present:
            continue
        conn.execute(f'ALTER TABLE beratungen ADD COLUMN {column} TEXT')
        log(f" Schema: +{column}")

    conn.commit()
|
||
|
|
|
||
|
|
|
||
|
|
def scrape_vorlage_page(url: str) -> list[dict]:
    """Collect the consultation sequence (Beratungsfolge) from a Vorlage page.

    Returns one dict per decision link found, with the keys
    tolfdnr, beschlussart, sitzung_name, sitzung_url and to_url.
    Network or parse failures are logged and yield an empty list.
    """
    try:
        response = httpx.get(url, timeout=30, follow_redirects=True)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        entries: list[dict] = []

        # Agenda-item links (to020) carry the decision; the session link
        # (to010) preceding them in document order names the meeting.
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']

            if 'to020' not in href or 'TOLFDNR=' not in href:
                continue
            id_match = re.search(r'TOLFDNR=(\d+)', href)
            if not id_match:
                continue

            # Look backwards for the session (to010) link, if any.
            session_name = None
            session_url = None
            session_link = anchor.find_previous('a', href=re.compile(r'to010.*SILFDNR='))
            if session_link:
                session_name = session_link.get_text(strip=True)
                session_url = session_link['href']
                if not session_url.startswith('http'):
                    session_url = 'https://allris.hagen.de' + session_url

            absolute_to_url = href if href.startswith('http') else 'https://allris.hagen.de' + href

            entries.append({
                'tolfdnr': id_match.group(1),
                'beschlussart': anchor.get_text(strip=True),
                'sitzung_name': session_name,
                'sitzung_url': session_url,
                'to_url': absolute_to_url,
            })

        return entries

    except Exception as e:
        log(f" Fehler beim Scrapen: {e}")
        return []
|
||
|
|
|
||
|
|
|
||
|
|
def scrape_to_page(url: str) -> dict:
    """Fetch decision text, verbatim record and session date from a TO page.

    Returns a dict with beschlusstext / wortprotokoll / sitzung_datum
    (values may be None); on any failure logs and returns {}.
    """
    try:
        response = httpx.get(url, timeout=30, follow_redirects=True)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        details = {
            'beschlusstext': None,
            'wortprotokoll': None,
            'sitzung_datum': None,
        }

        # The page title embeds the session date, e.g. "30.01.2025 - 6.4 ...".
        heading = soup.find('h1', class_='title')
        if heading:
            date_hit = re.search(r'(\d{2}\.\d{2}\.\d{4})', heading.get_text())
            if date_hit:
                details['sitzung_datum'] = date_hit.group(1)

        # Decision text and verbatim record both sit in
        # <span style="font-family:Arial"> elements.
        spans = soup.find_all('span', style=re.compile(r'font-family.*Arial'))
        texts = [t for t in (s.get_text(strip=True) for s in spans) if t]

        if texts:
            # By page convention, the final span holds the decision ...
            details['beschlusstext'] = texts[-1]
            # ... and everything before it is the verbatim record.
            if len(texts) > 1:
                details['wortprotokoll'] = '\n\n'.join(texts[:-1])

        return details

    except Exception as e:
        log(f" TO-Fehler: {e}")
        return {}
|
||
|
|
|
||
|
|
|
||
|
|
def process_vorlage(conn, vorlage: dict) -> int:
    """Scrape and persist the Beratungsfolge for one Vorlage.

    Fetches the Vorlage page, then one TO page per consultation step,
    and upserts each step into the beratungen table.

    Returns the number of rows written (0 when there is no web_url or
    no Beratungsfolge was found).
    """
    vorlage_id = vorlage['id']
    web_url = vorlage['web_url']

    if not web_url:
        return 0

    # Consultation steps from the Vorlage page itself.
    beratungen = scrape_vorlage_page(web_url)
    if not beratungen:
        return 0

    saved = 0
    for b in beratungen:
        time.sleep(DELAY_SECONDS)  # be polite to the ALLRIS server

        # Per-step details (decision text / verbatim record).
        to_details = scrape_to_page(b['to_url'])

        try:
            # Update an existing row first; insert only when nothing matched.
            cur = conn.execute("""
                UPDATE beratungen
                SET to_url = ?, tolfdnr = ?, beschlussart = ?,
                    beschlusstext = ?, wortprotokoll = ?, scraped_at = CURRENT_TIMESTAMP
                WHERE vorlage_id = ? AND (tolfdnr = ? OR tolfdnr IS NULL)
            """, (
                b['to_url'],
                b['tolfdnr'],
                b['beschlussart'],
                to_details.get('beschlusstext'),
                to_details.get('wortprotokoll'),
                vorlage_id,
                b['tolfdnr'],
            ))

            # BUGFIX: the original tested conn.total_changes == 0, but
            # total_changes is cumulative over the connection's lifetime —
            # after the first committed write it is always > 0, so new rows
            # were never inserted again.  Cursor.rowcount reports only the
            # rows affected by this UPDATE.
            if cur.rowcount == 0:
                conn.execute("""
                    INSERT INTO beratungen
                    (vorlage_id, to_url, tolfdnr, beschlussart, beschlusstext, wortprotokoll, scraped_at)
                    VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
                """, (
                    vorlage_id,
                    b['to_url'],
                    b['tolfdnr'],
                    b['beschlussart'],
                    to_details.get('beschlusstext'),
                    to_details.get('wortprotokoll'),
                ))

            conn.commit()
            saved += 1
        except Exception as e:
            log(f" DB-Fehler: {e}")

    return saved
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Scrape the Beratungsfolge for pending Vorlagen; returns exit code 0."""
    parser = argparse.ArgumentParser(description="Beratungsfolge scrapen")
    parser.add_argument("--limit", type=int, default=50, help="Max. Vorlagen")
    args = parser.parse_args()

    log("=== Beratungsfolge-Scraper ===")
    log(f"Limit: {args.limit}")

    conn = get_db()
    init_tables(conn)

    # Pending Vorlagen: those with a web URL that were not yet marked
    # processed via beratung_status.
    vorlagen = conn.execute("""
        SELECT id, aktenzeichen, web_url
        FROM vorlagen
        WHERE web_url IS NOT NULL
          AND beratung_status IS NULL
        ORDER BY datum_eingang DESC
        LIMIT ?
    """, (args.limit,)).fetchall()

    log(f"Zu verarbeiten: {len(vorlagen)}")

    total_beratungen = 0
    for i, v in enumerate(vorlagen):
        log(f"[{i+1}/{len(vorlagen)}] {v['aktenzeichen']}...")
        time.sleep(DELAY_SECONDS)

        count = process_vorlage(conn, dict(v))
        total_beratungen += count

        # Mark as processed even when no Beratungsfolge was found, so the
        # next run does not retry the same page forever.
        conn.execute("UPDATE vorlagen SET beratung_status = 'done' WHERE id = ?", (v['id'],))
        conn.commit()

        if count > 0:
            log(f" ✓ {count} Beratungen")
        else:
            log(" - Keine Beratungsfolge")

    conn.close()

    log("\n=== Fertig ===")
    log(f"Beratungen gespeichert: {total_beratungen}")

    # BUGFIX: removed the dead "remaining" computation.  It was disabled
    # via "... if False else 0", its value was never used, and if ever
    # enabled it would have queried the connection AFTER conn.close(),
    # raising sqlite3.ProgrammingError.

    return 0
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code;
    # SystemExit is exactly what sys.exit() raises.
    raise SystemExit(main())
|