# antragstracker/scripts/import_pdf_urls.py

#!/usr/bin/env python3
"""
Import PDF URLs from OParl into the ``anlagen`` table.
"""

import sqlite3
from pathlib import Path
import httpx

# Parent of the directory that contains this script file.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# SQLite database file opened by get_db().
DB_PATH = PROJECT_ROOT / "data" / "tracker_remote.db"
# OParl paper-list URL (body=1); import_pdf_urls() appends "&page=N" to it.
OPARL_BASE = "https://allris.hagen.de/public/oparl/papers?body=1"


def get_db():
    """Open the SQLite database at ``DB_PATH`` and return the connection.

    The connection uses ``sqlite3.Row`` as its row factory, so result
    columns can be read by name as well as by index.
    """
    database_file = str(DB_PATH)
    connection = sqlite3.connect(database_file)
    connection.row_factory = sqlite3.Row
    return connection


def ensure_anlagen_table(conn):
    """Create the ``anlagen`` table and its ``vorlage_id`` index if missing.

    Both statements use ``IF NOT EXISTS``, so calling this on a database
    that already has them changes nothing. The connection is committed
    before returning.
    """
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS anlagen (
            id INTEGER PRIMARY KEY,
            vorlage_id INTEGER NOT NULL,
            name TEXT,
            url TEXT,
            mime_type TEXT,
            size INTEGER,
            downloaded INTEGER DEFAULT 0,
            FOREIGN KEY (vorlage_id) REFERENCES vorlagen(id)
        )
    """
    create_index_sql = "CREATE INDEX IF NOT EXISTS idx_anlagen_vorlage ON anlagen(vorlage_id)"

    # The table must exist before the index on it can be created.
    for statement in (create_table_sql, create_index_sql):
        conn.execute(statement)
    conn.commit()


def _import_papers(conn, papers, vorlage_map, with_anlage):
    """Insert one ``anlagen`` row per eligible paper of a single result page.

    Args:
        conn: Open SQLite connection; rows are inserted but not committed.
        papers: The page's ``data`` list of OParl paper objects.
        vorlage_map: Mapping of OParl paper id to ``vorlagen.id``.
        with_anlage: Set of ``vorlage_id`` values that already have an
            ``anlagen`` row; every newly inserted id is added to it.

    Returns:
        Tuple ``(imported, skipped)``: rows inserted, and papers that have
        a main file but no matching Vorlage.
    """
    insert_sql = (
        "INSERT INTO anlagen (vorlage_id, name, url, mime_type, size) "
        "VALUES (?, ?, ?, ?, ?)"
    )
    imported = 0
    skipped = 0

    for paper in papers:
        oparl_id = paper.get('id')
        main_file = paper.get('mainFile')

        # Only papers with an id and an embedded file object can be imported;
        # anything else (missing, empty, or a non-dict value) is ignored.
        if not oparl_id or not main_file or not isinstance(main_file, dict):
            continue

        vorlage_id = vorlage_map.get(oparl_id)
        if vorlage_id is None:
            skipped += 1
            continue

        # Already has an attachment row (from an earlier run or this one).
        if vorlage_id in with_anlage:
            continue

        url = main_file.get('accessUrl') or main_file.get('downloadUrl')
        if not url:
            continue

        conn.execute(insert_sql, (
            vorlage_id,
            main_file.get('name') or main_file.get('fileName'),
            url,
            main_file.get('mimeType'),
            main_file.get('size'),
        ))
        with_anlage.add(vorlage_id)
        imported += 1

    return imported, skipped


def import_pdf_urls():
    """Import the main-file URL of each OParl paper into the ``anlagen`` table.

    Requests the paper list at ``OPARL_BASE`` page by page (``&page=N``)
    until a page has no ``data``, has no ``links.next``, or a request fails.
    For every paper whose OParl ``id`` matches a ``vorlagen.oparl_id`` and
    whose Vorlage has no ``anlagen`` row yet, one row is inserted from the
    paper's ``mainFile`` (``accessUrl``, falling back to ``downloadUrl``).

    Inserts are committed once per page, so pages completed before a failure
    stay stored. Progress and a final summary are printed to stdout. The
    database connection is closed even when an unexpected error propagates.
    """
    conn = get_db()
    imported = 0
    skipped = 0

    try:
        ensure_anlagen_table(conn)

        # Map OParl paper id -> vorlagen.id
        vorlage_map = {
            row['oparl_id']: row['id']
            for row in conn.execute(
                "SELECT id, oparl_id FROM vorlagen WHERE oparl_id IS NOT NULL"
            )
        }
        print(f"Vorlagen mit OParl-ID: {len(vorlage_map)}")

        # Loaded once so the import loop needs no SELECT per paper.
        with_anlage = {
            row['vorlage_id']
            for row in conn.execute("SELECT DISTINCT vorlage_id FROM anlagen")
        }

        page = 1
        # A single client lets all page requests reuse one HTTP connection.
        with httpx.Client(timeout=30) as client:
            while True:
                print(f"Seite {page}...", end=" ", flush=True)

                try:
                    resp = client.get(f"{OPARL_BASE}&page={page}")
                    resp.raise_for_status()
                    data = resp.json()
                except (httpx.HTTPError, ValueError) as e:
                    # Transport/status errors or a non-JSON body end the run;
                    # pages committed so far are kept.
                    print(f"Fehler: {e}")
                    break

                papers = data.get('data') if isinstance(data, dict) else None
                if not papers:
                    print("keine Daten")
                    break

                page_imported, page_skipped = _import_papers(
                    conn, papers, vorlage_map, with_anlage
                )
                imported += page_imported
                skipped += page_skipped

                conn.commit()
                print(f"{page_imported} importiert")

                # "links" may be absent or null on the last page.
                if not (data.get('links') or {}).get('next'):
                    break
                page += 1
    finally:
        conn.close()

    print("\n=== Fertig ===")
    print(f"Importiert: {imported}")
    print(f"Übersprungen (keine Vorlage): {skipped}")


# Run the import only when this file is executed as a script, not on import.
if __name__ == "__main__":
    import_pdf_urls()