150 lines
4.2 KiB
Python
150 lines
4.2 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
OCR für PDFs ohne extrahierbaren Text.
|
||
|
|
Nutzt Apple Vision Framework via ocrmac.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import sqlite3
|
||
|
|
import tempfile
|
||
|
|
import time
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import httpx
|
||
|
|
import pymupdf
|
||
|
|
from ocrmac import ocrmac
|
||
|
|
|
||
|
|
# Project root: the parent of the directory that contains this file.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# SQLite database that is queried and updated by this script.
DB_PATH = PROJECT_ROOT.joinpath("data", "tracker_remote.db")
# Plain-text log file that log() appends to.
LOG_FILE = PROJECT_ROOT.joinpath("data", "ocr.log")
|
||
|
|
|
||
|
|
|
||
|
|
def log(msg: str) -> None:
    """Print a timestamped message and append it to ``LOG_FILE``.

    Args:
        msg: Text to log; it is prefixed with the local time as ``[HH:MM:SS]``.
    """
    timestamp = time.strftime("%H:%M:%S")
    line = f"[{timestamp}] {msg}"
    print(line)
    # Explicit UTF-8: messages contain non-ASCII characters (check marks,
    # umlauts), and the platform default encoding is locale-dependent.
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(line + "\n")
|
||
|
|
|
||
|
|
|
||
|
|
def get_db():
    """Open a connection to the database at ``DB_PATH``.

    Returns:
        A ``sqlite3.Connection`` whose row factory is ``sqlite3.Row``.
    """
    connection = sqlite3.connect(str(DB_PATH))
    connection.row_factory = sqlite3.Row
    return connection
|
||
|
|
|
||
|
|
|
||
|
|
def get_pdfs_without_text(limit: int) -> list[dict]:
    """Find records that have a downloaded attachment URL but no usable text.

    Args:
        limit: Maximum number of rows to return.

    Returns:
        A list of dicts with the keys ``id`` (from ``vorlagen``) and ``url``
        (from ``anlagen``), ordered by ``datum_eingang`` descending.
    """
    conn = get_db()
    try:
        # Records with an attachment URL whose cleaned full text is missing,
        # empty, or shorter than 50 characters.
        rows = conn.execute("""
            SELECT v.id, a.url
            FROM vorlagen v
            JOIN anlagen a ON v.id = a.vorlage_id
            WHERE a.url IS NOT NULL
            AND a.downloaded = 1
            AND (v.volltext_clean IS NULL OR v.volltext_clean = '' OR LENGTH(v.volltext_clean) < 50)
            ORDER BY v.datum_eingang DESC
            LIMIT ?
        """, (limit,)).fetchall()
    finally:
        # Close the connection even when the query raises.
        conn.close()

    return [dict(r) for r in rows]
|
||
|
|
|
||
|
|
|
||
|
|
def ocr_pdf(url: str) -> tuple[str | None, str | None]:
    """Download a PDF, render each page to an image and run OCR on it.

    Args:
        url: Address of the PDF to fetch (redirects are followed).

    Returns:
        ``(text, None)`` on success, where ``text`` is the recognized text of
        all pages joined by blank lines. ``(None, reason)`` when the download
        is shorter than 100 bytes, when fewer than 50 characters were
        recognized, or when any exception occurred; in the last case
        ``reason`` is the exception message truncated to 100 characters.
    """
    try:
        # Download
        resp = httpx.get(url, timeout=60, follow_redirects=True)
        resp.raise_for_status()

        if len(resp.content) < 100:
            return None, "PDF zu klein"

        all_text = []

        # Work inside a temporary directory so that the PDF and the page
        # image are removed even when rendering or OCR raises.
        with tempfile.TemporaryDirectory() as tmp_dir:
            pdf_path = Path(tmp_dir) / "document.pdf"
            img_path = Path(tmp_dir) / "page.png"
            pdf_path.write_bytes(resp.content)

            doc = pymupdf.open(str(pdf_path))
            try:
                for page_num, page in enumerate(doc):
                    # Render the page as an image (higher DPI for better OCR).
                    # The same image file is overwritten for every page.
                    pix = page.get_pixmap(dpi=200)
                    pix.save(str(img_path))

                    # Apple Vision OCR; a failure on one page is logged and
                    # the remaining pages are still processed.
                    try:
                        results = ocrmac.OCR(str(img_path)).recognize()
                        page_text = " ".join(r[0] for r in results)
                        all_text.append(page_text)
                    except Exception as e:
                        log(f" OCR-Fehler Seite {page_num}: {e}")
            finally:
                # Release the document before the directory is deleted.
                doc.close()

        text = "\n\n".join(all_text).strip()

        if len(text) < 50:
            return None, "Kein Text erkannt"

        return text, None

    except Exception as e:
        # Top-level boundary: report the failure to the caller as a short
        # message instead of raising.
        return None, str(e)[:100]
|
||
|
|
|
||
|
|
|
||
|
|
def main() -> None:
    """Run OCR for records without text and store the results.

    Parses ``--limit`` (default 100), fetches the matching records, runs
    ``ocr_pdf`` on each URL and writes recognized text to both ``volltext``
    and ``volltext_clean`` of the ``vorlagen`` row. Progress and a final
    success/failure count are written through ``log``.
    """
    parser = argparse.ArgumentParser(description="OCR für Scan-PDFs")
    parser.add_argument("--limit", type=int, default=100, help="Max. Anzahl")
    args = parser.parse_args()

    log("=== OCR für Scans gestartet ===")
    log(f"Limit: {args.limit}")

    pdfs = get_pdfs_without_text(args.limit)
    log(f"Gefunden: {len(pdfs)} PDFs ohne Text")

    if not pdfs:
        log("Nichts zu tun!")
        return

    success = 0
    failed = 0

    conn = get_db()
    try:
        for i, pdf in enumerate(pdfs, start=1):
            log(f"[{i}/{len(pdfs)}] Vorlage #{pdf['id']}...")

            text, error = ocr_pdf(pdf['url'])

            if text:
                # Store in the database; commit per record so that finished
                # work survives a later failure.
                conn.execute("""
                    UPDATE vorlagen SET volltext = ?, volltext_clean = ?
                    WHERE id = ?
                """, (text, text, pdf['id']))
                conn.commit()
                success += 1
                log(f" ✓ {len(text)} Zeichen via OCR")
            else:
                failed += 1
                log(f" ✗ {error}")
    finally:
        # Close the connection even when an update or commit raises.
        conn.close()

    log("\n=== Fertig ===")
    log(f"Erfolgreich: {success}")
    log(f"Fehlgeschlagen: {failed}")
|
||
|
|
|
||
|
|
|
||
|
|
# Run the OCR batch only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|