#!/usr/bin/env python3
"""Import PDF URLs from the OParl API into the `anlagen` table.

Walks the paginated OParl papers endpoint, matches each paper to a row in
`vorlagen` via its OParl id, and stores the paper's main file metadata in
`anlagen`. Papers already imported (any `anlagen` row for that vorlage) are
skipped, so the script is safe to re-run.
"""

import sqlite3
from pathlib import Path

import httpx

PROJECT_ROOT = Path(__file__).resolve().parent.parent
DB_PATH = PROJECT_ROOT / "data" / "tracker_remote.db"
# body=1 restricts the paper list to the one body this tracker covers.
OPARL_BASE = "https://allris.hagen.de/public/oparl/papers?body=1"


def get_db():
    """Return a connection to the tracker DB with dict-like row access."""
    conn = sqlite3.connect(str(DB_PATH))
    conn.row_factory = sqlite3.Row
    return conn


def ensure_anlagen_table(conn):
    """Create the `anlagen` table and its index if not present yet."""
    conn.execute("""
        CREATE TABLE IF NOT EXISTS anlagen (
            id INTEGER PRIMARY KEY,
            vorlage_id INTEGER NOT NULL,
            name TEXT,
            url TEXT,
            mime_type TEXT,
            size INTEGER,
            downloaded INTEGER DEFAULT 0,
            FOREIGN KEY (vorlage_id) REFERENCES vorlagen(id)
        )
    """)
    conn.execute(
        "CREATE INDEX IF NOT EXISTS idx_anlagen_vorlage ON anlagen(vorlage_id)"
    )
    conn.commit()


def import_pdf_urls():
    """Import the main-file URL of every matching OParl paper into `anlagen`.

    Pages through the OParl papers endpoint until `links.next` is absent or a
    request fails. Progress and a final summary are printed to stdout.
    """
    imported = 0
    skipped = 0

    conn = get_db()
    try:
        ensure_anlagen_table(conn)

        # Map OParl paper id -> local vorlagen.id for O(1) matching.
        vorlage_map = {
            row['oparl_id']: row['id']
            for row in conn.execute(
                "SELECT id, oparl_id FROM vorlagen WHERE oparl_id IS NOT NULL"
            )
        }
        print(f"Vorlagen mit OParl-ID: {len(vorlage_map)}")

        page = 1
        # One client for all pages so keep-alive connections are pooled
        # instead of a fresh TCP/TLS handshake per request.
        with httpx.Client(timeout=30) as client:
            while True:
                print(f"Seite {page}...", end=" ", flush=True)
                try:
                    resp = client.get(f"{OPARL_BASE}&page={page}")
                    resp.raise_for_status()
                    data = resp.json()
                except Exception as e:
                    # Network/HTTP/JSON failure: report and stop paging;
                    # everything committed so far is kept.
                    print(f"Fehler: {e}")
                    break

                if not data.get('data'):
                    print("keine Daten")
                    break

                page_imported = 0
                for paper in data['data']:
                    # OParl flags removed objects with deleted=true;
                    # never import those stubs.
                    if paper.get('deleted'):
                        continue

                    oparl_id = paper.get('id')
                    main_file = paper.get('mainFile')
                    if not main_file or not oparl_id:
                        continue

                    vorlage_id = vorlage_map.get(oparl_id)
                    if not vorlage_id:
                        skipped += 1
                        continue

                    # Already imported on a previous run?
                    existing = conn.execute(
                        "SELECT id FROM anlagen WHERE vorlage_id = ?",
                        (vorlage_id,)
                    ).fetchone()
                    if existing:
                        continue

                    url = main_file.get('accessUrl') or main_file.get('downloadUrl')
                    if not url:
                        continue

                    conn.execute(
                        """
                        INSERT INTO anlagen (vorlage_id, name, url, mime_type, size)
                        VALUES (?, ?, ?, ?, ?)
                        """,
                        (
                            vorlage_id,
                            main_file.get('name') or main_file.get('fileName'),
                            url,
                            main_file.get('mimeType'),
                            main_file.get('size'),
                        ),
                    )
                    imported += 1
                    page_imported += 1

                # Commit per page so a later failure loses at most one page.
                conn.commit()
                print(f"{page_imported} importiert")

                if not data.get('links', {}).get('next'):
                    break
                page += 1
    finally:
        # Close even if an unexpected error escapes the loop above.
        conn.close()

    print("\n=== Fertig ===")
    print(f"Importiert: {imported}")
    print(f"Übersprungen (keine Vorlage): {skipped}")


if __name__ == "__main__":
    import_pdf_urls()