antragstracker/scripts/import_pdf_urls.py

122 lines
3.3 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
"""
Importiert PDF-URLs aus OParl in die anlagen-Tabelle.
"""
import sqlite3
from pathlib import Path
import httpx
# Directory above the one that contains this script.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# SQLite database file that get_db() opens.
DB_PATH = PROJECT_ROOT / "data" / "tracker_remote.db"
# OParl paper-list URL for body 1; the importer appends "&page=N" to it.
OPARL_BASE = "https://allris.hagen.de/public/oparl/papers?body=1"
def get_db():
    """Open the database at ``DB_PATH`` and return the connection.

    The connection's row factory is ``sqlite3.Row``, so result columns can
    be read by name (``row['id']``) as well as by position.
    """
    connection = sqlite3.connect(str(DB_PATH))
    connection.row_factory = sqlite3.Row
    return connection
def ensure_anlagen_table(conn):
    """Create the ``anlagen`` table and its ``vorlage_id`` index if missing.

    Both statements use ``IF NOT EXISTS``, so running this against an
    already initialised database changes nothing. The connection is
    committed before returning.
    """
    create_table = (
        "\n"
        "CREATE TABLE IF NOT EXISTS anlagen (\n"
        "id INTEGER PRIMARY KEY,\n"
        "vorlage_id INTEGER NOT NULL,\n"
        "name TEXT,\n"
        "url TEXT,\n"
        "mime_type TEXT,\n"
        "size INTEGER,\n"
        "downloaded INTEGER DEFAULT 0,\n"
        "FOREIGN KEY (vorlage_id) REFERENCES vorlagen(id)\n"
        ")\n"
    )
    create_index = "CREATE INDEX IF NOT EXISTS idx_anlagen_vorlage ON anlagen(vorlage_id)"

    for statement in (create_table, create_index):
        conn.execute(statement)
    conn.commit()
def _import_page(conn, papers, vorlage_map):
    """Insert main-file rows for one page of OParl papers.

    Nothing is committed here; the caller commits once per page.

    Args:
        conn: Open SQLite connection that already has the ``anlagen`` table.
        papers: The list found under the page's ``data`` key.
        vorlage_map: Mapping of OParl paper id to local ``vorlagen.id``.

    Returns:
        A ``(imported, skipped)`` tuple: the number of rows inserted, and the
        number of papers that have a main file but no entry in
        ``vorlage_map``.
    """
    imported = 0
    skipped = 0
    for paper in papers:
        oparl_id = paper.get('id')
        main_file = paper.get('mainFile')
        if not main_file or not oparl_id:
            continue

        vorlage_id = vorlage_map.get(oparl_id)
        # Compare against None so a valid id of 0 is not treated as unknown.
        if vorlage_id is None:
            skipped += 1
            continue

        # Leave a Vorlage alone once it has any row, so re-runs do not
        # insert duplicates.
        existing = conn.execute(
            "SELECT id FROM anlagen WHERE vorlage_id = ?", (vorlage_id,)
        ).fetchone()
        if existing:
            continue

        url = main_file.get('accessUrl') or main_file.get('downloadUrl')
        if not url:
            continue

        conn.execute(
            "INSERT INTO anlagen (vorlage_id, name, url, mime_type, size) "
            "VALUES (?, ?, ?, ?, ?)",
            (
                vorlage_id,
                main_file.get('name') or main_file.get('fileName'),
                url,
                main_file.get('mimeType'),
                main_file.get('size'),
            ),
        )
        imported += 1
    return imported, skipped


def import_pdf_urls():
    """Import the main-file URL of every known paper into ``anlagen``.

    Walks the paginated OParl paper list starting at page 1. For each paper
    whose OParl id matches a row in ``vorlagen`` and that has no ``anlagen``
    row yet, one row is inserted from the paper's ``mainFile`` object. Each
    page is committed on its own, so pages imported before a failure are
    kept.

    A request or JSON decoding error stops the pagination and is reported on
    stdout; the summary is still printed. The database connection is closed
    in all cases, including when an unexpected exception propagates.
    """
    conn = get_db()
    imported = 0
    skipped = 0
    try:
        ensure_anlagen_table(conn)

        # Mapping oparl_id -> vorlage.id
        vorlage_map = {
            row['oparl_id']: row['id']
            for row in conn.execute(
                "SELECT id, oparl_id FROM vorlagen WHERE oparl_id IS NOT NULL"
            )
        }
        print(f"Vorlagen mit OParl-ID: {len(vorlage_map)}")

        page = 1
        # One client for the whole run so the HTTP connection is reused
        # across pages instead of being re-established for every request.
        with httpx.Client(timeout=30) as client:
            while True:
                print(f"Seite {page}...", end=" ", flush=True)

                try:
                    resp = client.get(f"{OPARL_BASE}&page={page}")
                    resp.raise_for_status()
                    data = resp.json()
                except (httpx.HTTPError, ValueError) as e:
                    # httpx.HTTPError covers transport and status errors;
                    # ValueError covers a body that is not valid JSON.
                    print(f"Fehler: {e}")
                    break

                if not data.get('data'):
                    print("keine Daten")
                    break

                page_imported, page_skipped = _import_page(
                    conn, data['data'], vorlage_map
                )
                conn.commit()
                imported += page_imported
                skipped += page_skipped
                print(f"{page_imported} importiert")

                # "links" may be absent or null; either way there is no next page.
                if not (data.get('links') or {}).get('next'):
                    break
                page += 1
    finally:
        conn.close()

    print("\n=== Fertig ===")
    print(f"Importiert: {imported}")
    print(f"Übersprungen (keine Vorlage): {skipped}")
# Run the import only when this file is executed as a script, not on import.
if __name__ == "__main__":
    import_pdf_urls()