133 lines
3.7 KiB
Python
133 lines
3.7 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
PDF-Volltext-Extraktion für Antragstracker Hagen.
|
||
|
|
Lädt PDFs von ALLRIS und extrahiert den Text.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import sqlite3
|
||
|
|
import time
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import fitz # PyMuPDF
|
||
|
|
import httpx
|
||
|
|
|
||
|
|
# Project root: the parent of the directory that contains this script.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# SQLite database file that this script reads and updates.
DB_PATH = PROJECT_ROOT.joinpath("data", "tracker_remote.db")
|
||
|
|
|
||
|
|
|
||
|
|
def get_db():
    """Open the tracker database at DB_PATH.

    Returns:
        A sqlite3 connection whose rows are ``sqlite3.Row`` objects, so
        columns can be read by name as well as by index.
    """
    connection = sqlite3.connect(str(DB_PATH))
    connection.row_factory = sqlite3.Row
    return connection
|
||
|
|
|
||
|
|
|
||
|
|
def extract_text_from_pdf(pdf_bytes: bytes) -> str:
    """Extract the plain text of a PDF given as raw bytes.

    Args:
        pdf_bytes: Complete content of a PDF file.

    Returns:
        The text of all pages joined by newlines, with surrounding
        whitespace stripped. An empty string is returned when the document
        cannot be opened or read; the error is printed, not raised.
    """
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            page_texts = [page.get_text() for page in doc]
        finally:
            # Always release the document, also when a page fails to extract.
            doc.close()
        return "\n".join(page_texts).strip()
    except Exception as e:
        # Deliberate best-effort: one broken PDF must not abort a batch run.
        print(f" PDF-Fehler: {e}")
        return ""
|
||
|
|
|
||
|
|
|
||
|
|
def clean_text(text: str) -> str:
    """Normalise whitespace in extracted text.

    Each line is stripped of leading and trailing whitespace, runs of blank
    lines are reduced to a single blank line, and whitespace at the very
    start and end of the text is removed.

    Args:
        text: Raw text, e.g. as extracted from a PDF.

    Returns:
        The cleaned text.
    """
    import re  # local import: re is not imported at file level

    stripped = "\n".join(line.strip() for line in text.split("\n"))
    # Collapse only after stripping, so that lines consisting solely of
    # whitespace (or a stray "\r") count as blank lines too.
    return re.sub(r"\n{3,}", "\n\n", stripped).strip()
|
||
|
|
|
||
|
|
|
||
|
|
def process_vorlage(conn: sqlite3.Connection, client: httpx.Client, vorlage: dict) -> bool:
    """Download the PDF of one Vorlage and store its extracted full text.

    Args:
        conn: Open database connection; on success the matching ``vorlagen``
            row is updated and the change committed.
        client: HTTP client used to fetch the PDF.
        vorlage: Mapping with the keys ``id``, ``aktenzeichen`` and
            ``pdf_url``.

    Returns:
        True when text was extracted and saved. False when the URL is
        missing, the response is not a PDF, no text could be extracted, the
        request timed out, or any other error occurred (a message is printed
        in each case).
    """
    row_id = vorlage['id']
    # Fall back to the numeric id when there is no file reference to show.
    label = vorlage['aktenzeichen'] or f"#{row_id}"
    url = vorlage['pdf_url']

    if not url:
        print(f" {label}: Keine PDF-URL")
        return False

    try:
        response = client.get(url, timeout=30, follow_redirects=True)
        response.raise_for_status()

        content_type = response.headers.get('content-type')
        if 'application/pdf' not in (content_type or ''):
            print(f" {label}: Kein PDF ({content_type})")
            return False

        raw_text = extract_text_from_pdf(response.content)
        if not raw_text:
            print(f" {label}: Kein Text extrahiert")
            return False

        cleaned = clean_text(raw_text)

        # Store both the raw and the cleaned variant, then commit per row.
        conn.execute(
            "UPDATE vorlagen SET volltext = ?, volltext_clean = ? WHERE id = ?",
            (raw_text, cleaned, row_id),
        )
        conn.commit()

        print(f" {label}: {len(cleaned)} Zeichen")
        return True

    except httpx.TimeoutException:
        print(f" {label}: Timeout")
        return False
    except Exception as exc:
        print(f" {label}: Fehler {exc}")
        return False
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Command-line entry point: extract full text for Vorlagen lacking it.

    Selects rows from ``vorlagen`` that have a ``pdf_url`` but no
    ``volltext``, newest ``datum_eingang`` first, and runs
    ``process_vorlage`` on each, pausing briefly between requests.

    Options:
        --limit N   Maximum number of rows to process (default 10).
        --typ T     Only process rows of this type (default "antrag");
                    an empty value disables the filter.
        --all       Process every matching row, ignoring --limit.
    """
    parser = argparse.ArgumentParser(description="PDF-Volltext-Extraktion")
    parser.add_argument("--limit", type=int, default=10, help="Max. Anzahl (default: 10)")
    parser.add_argument("--typ", type=str, default="antrag", help="Vorlagen-Typ (default: antrag)")
    parser.add_argument("--all", action="store_true", help="Alle ohne Volltext")
    args = parser.parse_args()

    print("=== PDF-Volltext-Extraktion ===\n")

    # Find Vorlagen without full text. Command-line values are passed as
    # bound parameters and never spliced into the SQL text.
    query = """
        SELECT id, aktenzeichen, pdf_url
        FROM vorlagen
        WHERE volltext IS NULL
        AND pdf_url IS NOT NULL
    """
    params = []
    if args.typ:
        query += " AND typ = ?"
        params.append(args.typ)
    query += " ORDER BY datum_eingang DESC"
    if not args.all:
        query += " LIMIT ?"
        params.append(args.limit)

    conn = get_db()
    try:
        # The context manager closes the HTTP client even if processing fails.
        with httpx.Client() as client:
            vorlagen = conn.execute(query, params).fetchall()
            print(f"Verarbeite {len(vorlagen)} Vorlagen (Typ: {args.typ or 'alle'})\n")

            success = 0
            for v in vorlagen:
                if process_vorlage(conn, client, dict(v)):
                    success += 1
                time.sleep(0.5)  # Rate limiting
    finally:
        conn.close()

    print(f"\n=== Fertig: {success}/{len(vorlagen)} erfolgreich ===")
|
||
|
|
|
||
|
|
|
||
|
|
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|