#!/usr/bin/env python3
"""
OCR for PDFs without extractable text.

Uses the Apple Vision framework via ocrmac.
"""

import argparse
import sqlite3
import tempfile
import time
from contextlib import closing
from pathlib import Path

import httpx
import pymupdf
from ocrmac import ocrmac

PROJECT_ROOT = Path(__file__).resolve().parent.parent
DB_PATH = PROJECT_ROOT / "data" / "tracker_remote.db"
LOG_FILE = PROJECT_ROOT / "data" / "ocr.log"


def log(msg: str) -> None:
    """Print *msg* with an HH:MM:SS prefix and append the same line to LOG_FILE."""
    timestamp = time.strftime("%H:%M:%S")
    line = f"[{timestamp}] {msg}"
    print(line)
    # Explicit encoding: messages contain non-ASCII characters (umlauts, ✓/✗),
    # so the log must not depend on the platform's default encoding.
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(line + "\n")


def get_db() -> sqlite3.Connection:
    """Open the SQLite database at DB_PATH with name-based row access."""
    conn = sqlite3.connect(str(DB_PATH))
    conn.row_factory = sqlite3.Row
    return conn


def get_pdfs_without_text(limit: int) -> list[dict]:
    """Return up to *limit* rows (``id``, ``url``) that still lack full text.

    A row qualifies when the attachment has a URL, is flagged as downloaded,
    and the Vorlage's ``volltext_clean`` is NULL, empty, or shorter than
    50 characters. Results are ordered by ``datum_eingang``, newest first.
    """
    # NOTE(review): the JOIN yields one row per attachment, so a Vorlage with
    # several attachments can appear more than once - confirm this is intended.
    query = """
        SELECT v.id, a.url
        FROM vorlagen v
        JOIN anlagen a ON v.id = a.vorlage_id
        WHERE a.url IS NOT NULL
          AND a.downloaded = 1
          AND (v.volltext_clean IS NULL
               OR v.volltext_clean = ''
               OR LENGTH(v.volltext_clean) < 50)
        ORDER BY v.datum_eingang DESC
        LIMIT ?
    """
    # closing() releases the connection even if the query raises.
    with closing(get_db()) as conn:
        rows = conn.execute(query, (limit,)).fetchall()
    return [dict(r) for r in rows]


def ocr_pdf(url: str) -> tuple[str | None, str | None]:
    """Download the PDF at *url*, render every page to an image and OCR it.

    Returns:
        ``(text, None)`` on success, or ``(None, error_message)`` when the
        download fails, the payload is smaller than 100 bytes, fewer than
        50 characters were recognized, or any other exception occurs.
        Exception messages are truncated to 100 characters.
    """
    try:
        # Download
        resp = httpx.get(url, timeout=60, follow_redirects=True)
        resp.raise_for_status()

        if len(resp.content) < 100:
            return None, "PDF zu klein"

        all_text = []
        # All scratch files live in one temporary directory, so they are
        # removed on every exit path, including exceptions while opening or
        # rendering the PDF.
        with tempfile.TemporaryDirectory() as tmp_dir:
            pdf_path = Path(tmp_dir) / "input.pdf"
            img_path = Path(tmp_dir) / "page.png"
            pdf_path.write_bytes(resp.content)

            doc = pymupdf.open(str(pdf_path))
            try:
                for page_num, page in enumerate(doc):
                    # Render the page as an image (higher DPI for better OCR).
                    pix = page.get_pixmap(dpi=200)
                    pix.save(str(img_path))

                    # Apple Vision OCR; a failing page is logged and skipped
                    # so the remaining pages can still be processed.
                    try:
                        results = ocrmac.OCR(str(img_path)).recognize()
                        page_text = " ".join(r[0] for r in results)
                        all_text.append(page_text)
                    except Exception as e:
                        log(f" OCR-Fehler Seite {page_num}: {e}")
            finally:
                # Close the document before the directory is cleaned up.
                doc.close()

        text = "\n\n".join(all_text).strip()
        if len(text) < 50:
            return None, "Kein Text erkannt"

        return text, None

    except Exception as e:
        # Some exceptions carry no message; fall back to the class name so the
        # caller never logs an empty error.
        return None, str(e)[:100] or type(e).__name__


def main() -> None:
    """Parse CLI arguments, OCR every pending PDF and store the text in the DB."""
    parser = argparse.ArgumentParser(description="OCR für Scan-PDFs")
    parser.add_argument("--limit", type=int, default=100, help="Max. Anzahl")
    args = parser.parse_args()

    log("=== OCR für Scans gestartet ===")
    log(f"Limit: {args.limit}")

    pdfs = get_pdfs_without_text(args.limit)
    log(f"Gefunden: {len(pdfs)} PDFs ohne Text")

    if not pdfs:
        log("Nichts zu tun!")
        return

    success = 0
    failed = 0

    # closing() guarantees the connection is released even if an UPDATE or
    # commit raises midway through the batch.
    with closing(get_db()) as conn:
        for i, pdf in enumerate(pdfs, start=1):
            log(f"[{i}/{len(pdfs)}] Vorlage #{pdf['id']}...")

            text, error = ocr_pdf(pdf['url'])

            if text:
                # Persist immediately so finished work survives a later crash.
                conn.execute(
                    """
                    UPDATE vorlagen
                    SET volltext = ?, volltext_clean = ?
                    WHERE id = ?
                    """,
                    (text, text, pdf['id']),
                )
                conn.commit()
                success += 1
                log(f" ✓ {len(text)} Zeichen via OCR")
            else:
                failed += 1
                log(f" ✗ {error}")

    log("\n=== Fertig ===")
    log(f"Erfolgreich: {success}")
    log(f"Fehlgeschlagen: {failed}")


if __name__ == "__main__":
    main()