Vollständige Pipeline zur Analyse kommunaler Vorlagen aus ALLRIS:

- OParl-Import: 20.149 Vorlagen
- PDF-Extraktion: 10.045 Volltexte (adaptives Throttling)
- KI-Zusammenfassungen: 10.026 via Qwen Plus (parallelisiert)
- Beratungsfolge-Scraper: Beschlusstexte + Wortprotokolle
- Abstimmungs-Analyse mit Koalitionsmatrix
- Georeferenzierung (Nominatim)

Stack: FastAPI + SvelteKit + SQLite
Deployment: Docker + Traefik auf VServer

Daten (DB, Logs) nicht im Repo — siehe Restic-Backup.
Repo-Setup: scripts/setup.sh für Neuaufbau aus OParl-API.
122 lines
3.3 KiB
Python
122 lines
3.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Importiert PDF-URLs aus OParl in die anlagen-Tabelle.
|
|
"""
|
|
|
|
import sqlite3
|
|
from pathlib import Path
|
|
import httpx
|
|
|
|
# Parent of the directory that contains this script.
PROJECT_ROOT = Path(__file__).resolve().parent.parent

# SQLite database file opened by get_db().
DB_PATH = PROJECT_ROOT.joinpath("data", "tracker_remote.db")

# OParl paper-list endpoint; the page number is appended as "&page=N".
OPARL_BASE = "https://allris.hagen.de/public/oparl/papers?body=1"
|
|
|
|
|
|
def get_db():
    """Open the SQLite database at ``DB_PATH`` and return the connection.

    The connection's row factory is ``sqlite3.Row``, so columns of fetched
    rows can be read by name as well as by position.
    """
    connection = sqlite3.connect(str(DB_PATH))
    connection.row_factory = sqlite3.Row
    return connection
|
|
|
|
|
|
def ensure_anlagen_table(conn):
    """Create the ``anlagen`` table and its ``vorlage_id`` index if missing.

    Both statements use ``IF NOT EXISTS``, so calling this on a database that
    already contains them changes nothing. The transaction is committed
    before returning.
    """
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS anlagen (
            id INTEGER PRIMARY KEY,
            vorlage_id INTEGER NOT NULL,
            name TEXT,
            url TEXT,
            mime_type TEXT,
            size INTEGER,
            downloaded INTEGER DEFAULT 0,
            FOREIGN KEY (vorlage_id) REFERENCES vorlagen(id)
        )
    """
    create_index_sql = "CREATE INDEX IF NOT EXISTS idx_anlagen_vorlage ON anlagen(vorlage_id)"

    # Table first: the index statement refers to it.
    for statement in (create_table_sql, create_index_sql):
        conn.execute(statement)
    conn.commit()
|
|
|
|
|
|
def _store_main_files(conn, papers, vorlage_map, vorlagen_with_anlage):
    """Insert one ``anlagen`` row for each paper that still needs one.

    Args:
        conn: Open SQLite connection; rows are inserted but not committed.
        papers: Paper objects from one page of the OParl list response.
        vorlage_map: Mapping of OParl paper id to local ``vorlagen.id``.
        vorlagen_with_anlage: Set of ``vorlagen.id`` values that already have
            an ``anlagen`` row; updated in place for every insert.

    Returns:
        ``(inserted, unmatched)``: the number of rows inserted and the number
        of papers whose OParl id is not in ``vorlage_map``.
    """
    inserted = 0
    unmatched = 0

    for paper in papers:
        oparl_id = paper.get('id')
        main_file = paper.get('mainFile')

        if not oparl_id or not main_file:
            continue
        # NOTE(review): mainFile is expected to be an embedded file object;
        # anything else (e.g. a bare URL string) has none of the fields read
        # below, so it is skipped instead of failing on ``.get``.
        if not isinstance(main_file, dict):
            continue

        vorlage_id = vorlage_map.get(oparl_id)
        # Compare against None so a row id of 0 is not treated as "missing".
        if vorlage_id is None:
            unmatched += 1
            continue

        # Only one attachment row per Vorlage is ever imported.
        if vorlage_id in vorlagen_with_anlage:
            continue

        url = main_file.get('accessUrl') or main_file.get('downloadUrl')
        if not url:
            continue

        conn.execute("""
            INSERT INTO anlagen (vorlage_id, name, url, mime_type, size)
            VALUES (?, ?, ?, ?, ?)
        """, (
            vorlage_id,
            main_file.get('name') or main_file.get('fileName'),
            url,
            main_file.get('mimeType'),
            main_file.get('size'),
        ))
        vorlagen_with_anlage.add(vorlage_id)
        inserted += 1

    return inserted, unmatched


def import_pdf_urls():
    """Import each OParl paper's main-file URL into the ``anlagen`` table.

    Requests the paper list page by page (``&page=N`` appended to
    ``OPARL_BASE``, starting at 1). For every paper whose OParl id matches a
    row in ``vorlagen``, one ``anlagen`` row is inserted from the paper's
    ``mainFile``. A Vorlage that already has an ``anlagen`` row is left
    untouched, so re-running the import does not create duplicates.

    The loop ends when a request or JSON decoding fails, when a page carries
    no ``data``, or when the response has no ``links.next``. Inserts are
    committed once per page, so a run that stops early keeps every completed
    page. Progress and a final summary are printed to stdout.
    """
    conn = get_db()
    imported = 0
    skipped = 0

    # try/finally: the connection is closed even if a query or the handling
    # of a response raises part-way through the import.
    try:
        ensure_anlagen_table(conn)

        # Mapping oparl_id -> vorlagen.id
        vorlage_map = {
            row['oparl_id']: row['id']
            for row in conn.execute(
                "SELECT id, oparl_id FROM vorlagen WHERE oparl_id IS NOT NULL"
            )
        }
        print(f"Vorlagen mit OParl-ID: {len(vorlage_map)}")

        # Loaded once instead of issuing one SELECT per paper.
        vorlagen_with_anlage = {
            row[0] for row in conn.execute("SELECT DISTINCT vorlage_id FROM anlagen")
        }

        # A single client lets all page requests share one connection pool.
        with httpx.Client(timeout=30) as client:
            page = 1
            while True:
                print(f"Seite {page}...", end=" ", flush=True)

                try:
                    resp = client.get(f"{OPARL_BASE}&page={page}")
                    resp.raise_for_status()
                    data = resp.json()
                except (httpx.HTTPError, ValueError) as e:
                    # Transport/status errors and undecodable JSON end the
                    # run; pages committed so far are kept.
                    print(f"Fehler: {e}")
                    break

                if not data.get('data'):
                    print("keine Daten")
                    break

                page_imported, page_skipped = _store_main_files(
                    conn, data['data'], vorlage_map, vorlagen_with_anlage
                )
                imported += page_imported
                skipped += page_skipped

                conn.commit()
                print(f"{page_imported} importiert")

                if not data.get('links', {}).get('next'):
                    break
                page += 1
    finally:
        conn.close()

    print("\n=== Fertig ===")
    print(f"Importiert: {imported}")
    print(f"Übersprungen (keine Vorlage): {skipped}")
|
|
|
|
|
|
# Run the import only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    import_pdf_urls()
|