528 lines
21 KiB
Python
528 lines
21 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
Adaptive PDF-Extraktion mit Throttle-Detection.
|
|||
|
|
|
|||
|
|
Startet konservativ und erhöht Geschwindigkeit bis zum Limit.
|
|||
|
|
Robustes Logging für Wiederaufnahme nach Abbruch.
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import argparse
|
|||
|
|
import json
|
|||
|
|
import os
|
|||
|
|
import sqlite3
|
|||
|
|
import tempfile
|
|||
|
|
import time
|
|||
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|||
|
|
from dataclasses import dataclass, field
|
|||
|
|
from datetime import datetime
|
|||
|
|
from pathlib import Path
|
|||
|
|
from threading import Lock
|
|||
|
|
|
|||
|
|
import httpx
|
|||
|
|
import pymupdf
|
|||
|
|
|
|||
|
|
# Netdata metrics HTTP endpoint (VServer); override via METRICS_URL env var
METRICS_URL = os.environ.get("METRICS_URL", "http://152.53.119.77:8127")

# All data paths live under <project root>/data (parent of this script's dir)
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DB_PATH = PROJECT_ROOT / "data" / "tracker_remote.db"           # SQLite tracker database
STATE_FILE = PROJECT_ROOT / "data" / "extract_state.json"       # resume state (JSON)
LOG_FILE = PROJECT_ROOT / "data" / "extract.log"                # append-only run log
METRICS_FILE = PROJECT_ROOT / "data" / "extract_metrics.jsonl"  # one JSON metric per batch
|||
|
|
@dataclass
class AdaptiveConfig:
    """Tuning knobs plus runtime state for adaptive throttling.

    Starts conservatively, then explores shorter delays and more workers,
    remembering per-worker-level history so saturation can be detected.
    """

    delay: float = 0.2                  # current inter-request delay; starts near the optimum
    workers: int = 4                    # current worker count (~35% below observed optimum of ~6)
    min_delay: float = 0.1              # floor for delay reduction
    max_workers: int = 15               # hard upper bound on workers
    success_streak: int = 0             # consecutive successful extractions
    streak_threshold: int = 30          # successes required before the next speedup
    cooldown_until: float = 0           # epoch timestamp until which work is paused
    best_delay_per_worker: dict = field(default_factory=dict)    # worker count -> lowest stable delay
    delay_fully_explored: bool = False  # delay already at minimum for this worker level
    throughput_per_worker: dict = field(default_factory=dict)    # worker count -> best throughput seen
    saturation_detected: bool = False   # adding workers no longer improves throughput
    saturation_threshold: float = 0.1   # relative gain (10%) required to justify another worker
|||
|
|
@dataclass
class State:
    """Persistent progress state, serialized to JSON so a run can resume."""

    processed: set = field(default_factory=set)         # vorlage_ids finished successfully
    failed: dict = field(default_factory=dict)          # vorlage_id (str) -> retry count so far
    failed_permanent: set = field(default_factory=set)  # vorlage_ids given up on after max retries
    started_at: str = ""                                # ISO timestamp of the first start
    last_update: str = ""                               # ISO timestamp of the last save
    stats: dict = field(default_factory=lambda: {
        "success": 0, "failed": 0, "retried": 0, "total": 0
    })                                                  # running counters for reporting
|||
|
|
class AdaptiveExtractor:
    """Downloads PDFs, extracts their text, and adapts speed to server limits."""

    def __init__(self, state_file: Path = STATE_FILE, notify: bool = True):
        """Set up locks, adaptive config and notification settings, then restore state."""
        self.state_file = state_file
        self.config = AdaptiveConfig()

        # One lock serializes DB writes, the other serializes log-file appends.
        self.db_lock = Lock()
        self.log_lock = Lock()

        # Notification pacing: at most one update every notify_interval seconds.
        self.notify = notify
        self.last_notify = 0
        self.notify_interval = 300  # 5 minutes

        # Per-run batch bookkeeping.
        self.batch_start_time = None
        self.batch_metrics = []

        # _load_state() logs, so it must run after log_lock exists.
        self.state = self._load_state()
|||
|
|
def _load_state(self) -> State:
|
|||
|
|
"""Lädt Zustand aus Datei oder erstellt neuen."""
|
|||
|
|
if self.state_file.exists():
|
|||
|
|
try:
|
|||
|
|
data = json.loads(self.state_file.read_text())
|
|||
|
|
state = State(
|
|||
|
|
processed=set(data.get("processed", [])),
|
|||
|
|
failed=data.get("failed", {}),
|
|||
|
|
failed_permanent=set(data.get("failed_permanent", [])),
|
|||
|
|
started_at=data.get("started_at", ""),
|
|||
|
|
last_update=data.get("last_update", ""),
|
|||
|
|
stats=data.get("stats", State().stats)
|
|||
|
|
)
|
|||
|
|
self._log(f"State geladen: {len(state.processed)} verarbeitet, {len(state.failed)} pending retries")
|
|||
|
|
return state
|
|||
|
|
except Exception as e:
|
|||
|
|
self._log(f"State-Laden fehlgeschlagen: {e}")
|
|||
|
|
|
|||
|
|
return State(started_at=datetime.now().isoformat())
|
|||
|
|
|
|||
|
|
def _save_state(self):
|
|||
|
|
"""Speichert Zustand."""
|
|||
|
|
self.state.last_update = datetime.now().isoformat()
|
|||
|
|
data = {
|
|||
|
|
"processed": list(self.state.processed),
|
|||
|
|
"failed": self.state.failed,
|
|||
|
|
"failed_permanent": list(self.state.failed_permanent),
|
|||
|
|
"started_at": self.state.started_at,
|
|||
|
|
"last_update": self.state.last_update,
|
|||
|
|
"stats": self.state.stats
|
|||
|
|
}
|
|||
|
|
self.state_file.write_text(json.dumps(data, indent=2))
|
|||
|
|
|
|||
|
|
def _log(self, msg: str):
|
|||
|
|
"""Thread-safe Logging."""
|
|||
|
|
timestamp = datetime.now().strftime("%H:%M:%S")
|
|||
|
|
line = f"[{timestamp}] {msg}"
|
|||
|
|
print(line)
|
|||
|
|
with self.log_lock:
|
|||
|
|
with open(LOG_FILE, "a") as f:
|
|||
|
|
f.write(line + "\n")
|
|||
|
|
|
|||
|
|
def _record_metric(self, batch_num: int, batch_time: float, success: int, failed: int, bytes_downloaded: int = 0):
|
|||
|
|
"""Speichert Metriken für Visualisierung."""
|
|||
|
|
mb_downloaded = bytes_downloaded / (1024 * 1024)
|
|||
|
|
mb_per_sec = mb_downloaded / max(batch_time, 0.1)
|
|||
|
|
|
|||
|
|
metric = {
|
|||
|
|
"timestamp": datetime.now().isoformat(),
|
|||
|
|
"batch": batch_num,
|
|||
|
|
"batch_time_sec": round(batch_time, 2),
|
|||
|
|
"success": success,
|
|||
|
|
"failed": failed,
|
|||
|
|
"delay": round(self.config.delay, 3),
|
|||
|
|
"workers": self.config.workers,
|
|||
|
|
"throughput": round(success / max(batch_time, 0.1), 2), # docs/sec
|
|||
|
|
"total_success": self.state.stats["success"],
|
|||
|
|
"total_failed": len(self.state.failed_permanent),
|
|||
|
|
"pending_retries": len(self.state.failed),
|
|||
|
|
"mb_downloaded": round(mb_downloaded, 2),
|
|||
|
|
"mb_per_sec": round(mb_per_sec, 2),
|
|||
|
|
}
|
|||
|
|
self.batch_metrics.append(metric)
|
|||
|
|
|
|||
|
|
with open(METRICS_FILE, "a") as f:
|
|||
|
|
f.write(json.dumps(metric) + "\n")
|
|||
|
|
|
|||
|
|
return metric
|
|||
|
|
|
|||
|
|
def _push_metrics(self, metric: dict):
|
|||
|
|
"""Pusht Metriken per HTTP an VServer → Netdata Statsd."""
|
|||
|
|
try:
|
|||
|
|
payload = {
|
|||
|
|
"throughput": metric["throughput"],
|
|||
|
|
"delay": metric["delay"],
|
|||
|
|
"workers": metric["workers"],
|
|||
|
|
"success_total": metric["total_success"],
|
|||
|
|
"failed_total": metric["total_failed"],
|
|||
|
|
"batch_time": metric["batch_time_sec"],
|
|||
|
|
"pending_retries": metric["pending_retries"],
|
|||
|
|
"items_per_sec": metric["throughput"], # Alias
|
|||
|
|
"mb_per_sec": metric.get("mb_per_sec", 0),
|
|||
|
|
"mb_downloaded": metric.get("mb_downloaded", 0),
|
|||
|
|
}
|
|||
|
|
httpx.post(METRICS_URL, json=payload, timeout=5)
|
|||
|
|
except Exception as e:
|
|||
|
|
pass # Silent fail, don't block extraction
|
|||
|
|
|
|||
|
|
def _send_telegram(self, message: str):
|
|||
|
|
"""Loggt Update."""
|
|||
|
|
self._log(f"[NOTIFY] {message[:100]}...")
|
|||
|
|
|
|||
|
|
    def _maybe_notify(self, force: bool = False):
        """Emit a periodic progress notification.

        Rate-limited to one message per ``notify_interval`` seconds unless
        ``force=True`` (used for the final summary). No-ops when notifications
        are disabled or no batch metrics exist yet.
        """
        if not self.notify:
            return

        now = time.time()
        if not force and (now - self.last_notify) < self.notify_interval:
            return

        self.last_notify = now

        # Need at least one recorded batch to report on.
        if not self.batch_metrics:
            return

        recent = self.batch_metrics[-1]
        # Minutes elapsed since the (possibly restored) run start.
        elapsed = (datetime.now() - datetime.fromisoformat(self.state.started_at)).total_seconds() / 60

        # Throughput trend: mean over up to the last 5 batches.
        recent_throughputs = [m["throughput"] for m in self.batch_metrics[-5:]]
        avg_throughput = sum(recent_throughputs) / len(recent_throughputs)

        # ETA in minutes = remaining docs / (docs per minute); max() avoids div-by-zero.
        remaining = self.state.stats["total"] - self.state.stats["success"] - len(self.state.failed_permanent)
        eta_min = remaining / max(avg_throughput * 60, 0.1)

        msg = f"""📊 *PDF-Extraktion Update*

✓ Erfolg: {self.state.stats['success']:,}
✗ Fehler: {len(self.state.failed_permanent)}
↻ Retries: {len(self.state.failed)}

⚡ Config: {self.config.workers} workers, {self.config.delay:.2f}s delay
📈 Throughput: {avg_throughput:.1f} docs/sec
⏱️ Laufzeit: {elapsed:.0f} min
🎯 ETA: ~{eta_min:.0f} min

Batch {recent['batch']}: {recent['success']}✓ {recent['failed']}✗ in {recent['batch_time_sec']}s"""

        self._send_telegram(msg)
|
|||
|
|
def _get_db(self):
|
|||
|
|
conn = sqlite3.connect(str(DB_PATH), check_same_thread=False)
|
|||
|
|
conn.row_factory = sqlite3.Row
|
|||
|
|
return conn
|
|||
|
|
|
|||
|
|
def _download_and_extract(self, vorlage_id: int, url: str) -> tuple[int, str | None, str | None, int]:
|
|||
|
|
"""Lädt PDF und extrahiert Text. Returns: (vorlage_id, text, error, bytes_downloaded)"""
|
|||
|
|
try:
|
|||
|
|
resp = httpx.get(url, timeout=60, follow_redirects=True)
|
|||
|
|
|
|||
|
|
# Throttling-Detection
|
|||
|
|
if resp.status_code in (429, 503):
|
|||
|
|
return (vorlage_id, None, f"THROTTLED:{resp.status_code}", 0)
|
|||
|
|
|
|||
|
|
resp.raise_for_status()
|
|||
|
|
|
|||
|
|
content_size = len(resp.content)
|
|||
|
|
|
|||
|
|
if content_size < 100:
|
|||
|
|
return (vorlage_id, None, "PDF zu klein", content_size)
|
|||
|
|
|
|||
|
|
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) as tmp:
|
|||
|
|
tmp.write(resp.content)
|
|||
|
|
tmp.flush()
|
|||
|
|
|
|||
|
|
doc = pymupdf.open(tmp.name)
|
|||
|
|
text_parts = []
|
|||
|
|
for page in doc:
|
|||
|
|
text_parts.append(page.get_text())
|
|||
|
|
doc.close()
|
|||
|
|
|
|||
|
|
text = "\n".join(text_parts).strip()
|
|||
|
|
|
|||
|
|
if len(text) < 50:
|
|||
|
|
return (vorlage_id, None, "Kein Text", content_size)
|
|||
|
|
|
|||
|
|
# Bereinigen
|
|||
|
|
import re
|
|||
|
|
text = re.sub(r'\n{3,}', '\n\n', text)
|
|||
|
|
text = re.sub(r' {2,}', ' ', text)
|
|||
|
|
|
|||
|
|
return (vorlage_id, text, None, content_size)
|
|||
|
|
|
|||
|
|
except httpx.HTTPStatusError as e:
|
|||
|
|
return (vorlage_id, None, f"HTTP:{e.response.status_code}", 0)
|
|||
|
|
except Exception as e:
|
|||
|
|
return (vorlage_id, None, str(e)[:80], 0)
|
|||
|
|
|
|||
|
|
def _handle_success(self, vorlage_id: int, text: str):
|
|||
|
|
"""Verarbeitet erfolgreiche Extraktion."""
|
|||
|
|
with self.db_lock:
|
|||
|
|
conn = self._get_db()
|
|||
|
|
conn.execute("""
|
|||
|
|
UPDATE vorlagen SET volltext = ?, volltext_clean = ?
|
|||
|
|
WHERE id = ?
|
|||
|
|
""", (text, text, vorlage_id))
|
|||
|
|
conn.execute("UPDATE anlagen SET downloaded = 1 WHERE vorlage_id = ?", (vorlage_id,))
|
|||
|
|
conn.commit()
|
|||
|
|
conn.close()
|
|||
|
|
|
|||
|
|
self.state.processed.add(vorlage_id)
|
|||
|
|
self.state.stats["success"] += 1
|
|||
|
|
self.config.success_streak += 1
|
|||
|
|
|
|||
|
|
# Adaptive Speedup
|
|||
|
|
if self.config.success_streak >= self.config.streak_threshold:
|
|||
|
|
self._speedup()
|
|||
|
|
self.config.success_streak = 0
|
|||
|
|
|
|||
|
|
def _handle_failure(self, vorlage_id: int, error: str):
|
|||
|
|
"""Verarbeitet Fehler mit Retry-Logik."""
|
|||
|
|
retry_count = self.state.failed.get(str(vorlage_id), 0) + 1
|
|||
|
|
|
|||
|
|
if "THROTTLED" in error:
|
|||
|
|
# Cooldown aktivieren
|
|||
|
|
self._slowdown(severe=True)
|
|||
|
|
self.state.failed[str(vorlage_id)] = retry_count
|
|||
|
|
return
|
|||
|
|
|
|||
|
|
if retry_count >= 3:
|
|||
|
|
self.state.failed_permanent.add(vorlage_id)
|
|||
|
|
if str(vorlage_id) in self.state.failed:
|
|||
|
|
del self.state.failed[str(vorlage_id)]
|
|||
|
|
self._log(f" ✗ #{vorlage_id} permanent failed: {error}")
|
|||
|
|
else:
|
|||
|
|
self.state.failed[str(vorlage_id)] = retry_count
|
|||
|
|
self._log(f" ↻ #{vorlage_id} retry {retry_count}/3: {error}")
|
|||
|
|
|
|||
|
|
self.state.stats["failed"] += 1
|
|||
|
|
self.config.success_streak = 0
|
|||
|
|
|
|||
|
|
    def _speedup(self):
        """Increase speed via per-worker-level exploration with saturation detection.

        Strategy: first shrink the delay toward ``min_delay`` at the current
        worker count; only once the delay is at its floor consider adding a
        worker (resetting the delay for a fresh exploration). If the previous
        worker increase improved throughput by less than
        ``saturation_threshold``, mark saturation and stop scaling up.
        """
        old_delay = self.config.delay
        old_workers = self.config.workers

        # Remember the current delay as stable for this worker level.
        w = self.config.workers
        if w not in self.config.best_delay_per_worker:
            self.config.best_delay_per_worker[w] = self.config.delay
        else:
            self.config.best_delay_per_worker[w] = min(
                self.config.best_delay_per_worker[w],
                self.config.delay
            )

        # Track throughput for the current worker level.
        if self.batch_metrics:
            # Mean over (up to) the last 3 batches.
            recent_throughput = sum(m["throughput"] for m in self.batch_metrics[-3:]) / min(3, len(self.batch_metrics))
            if w not in self.config.throughput_per_worker:
                self.config.throughput_per_worker[w] = recent_throughput
            else:
                # Exponential moving average: 70% history, 30% new sample.
                self.config.throughput_per_worker[w] = (
                    self.config.throughput_per_worker[w] * 0.7 + recent_throughput * 0.3
                )

        if self.config.delay > self.config.min_delay:
            # Delay not yet at the minimum → keep reducing (×0.8 per step).
            self.config.delay = max(self.config.min_delay, self.config.delay * 0.8)
            self.config.delay_fully_explored = False
        elif self.config.saturation_detected:
            # Saturation already detected → do not scale further.
            self._log(f"📊 Sättigung bei {self.config.workers} Workers — mehr bringt nichts")
        elif self.config.workers < self.config.max_workers:
            # Check whether the last worker increase actually improved throughput.
            prev_throughput = self.config.throughput_per_worker.get(w - 1, 0)
            curr_throughput = self.config.throughput_per_worker.get(w, 0)

            if prev_throughput > 0 and curr_throughput > 0:
                improvement = (curr_throughput - prev_throughput) / prev_throughput
                if improvement < self.config.saturation_threshold:
                    # Less than the required relative gain (10%) → saturation.
                    self.config.saturation_detected = True
                    self._log(f"📊 Sättigung erkannt: {w-1}→{w} Workers nur +{improvement*100:.1f}% Throughput")
                    return

            # Add a worker and restart delay exploration from a safe value.
            self.config.workers += 1
            prev_best = self.config.best_delay_per_worker.get(w, 0.5)
            self.config.delay = max(prev_best, 0.3)
            self.config.delay_fully_explored = False
            self._log(f"🔄 Neuer Worker-Level: reset delay auf {self.config.delay:.2f}s für Exploration")
        else:
            # At min delay and max workers: nothing left to explore.
            self.config.delay_fully_explored = True

        if old_delay != self.config.delay or old_workers != self.config.workers:
            self._log(f"⚡ Speedup: delay={self.config.delay:.2f}s, workers={self.config.workers}")
|
|||
|
|
def _slowdown(self, severe: bool = False):
|
|||
|
|
"""Verlangsamt bei Problemen."""
|
|||
|
|
if severe:
|
|||
|
|
self.config.cooldown_until = time.time() + 30 # 30s Pause
|
|||
|
|
self.config.delay = min(2.0, self.config.delay * 2)
|
|||
|
|
self.config.workers = max(1, self.config.workers - 1)
|
|||
|
|
self.config.delay_fully_explored = False # Exploration resetten
|
|||
|
|
self._log(f"🛑 Throttled! Cooldown 30s, delay={self.config.delay:.2f}s, workers={self.config.workers}")
|
|||
|
|
else:
|
|||
|
|
self.config.delay = min(2.0, self.config.delay * 1.2)
|
|||
|
|
self._log(f"⚠️ Slowdown: delay={self.config.delay:.2f}s")
|
|||
|
|
|
|||
|
|
def _wait_cooldown(self):
|
|||
|
|
"""Wartet Cooldown ab."""
|
|||
|
|
if self.config.cooldown_until > time.time():
|
|||
|
|
wait = self.config.cooldown_until - time.time()
|
|||
|
|
self._log(f"⏳ Cooldown: {wait:.0f}s warten...")
|
|||
|
|
time.sleep(wait)
|
|||
|
|
|
|||
|
|
def get_pending(self, limit: int) -> list[dict]:
|
|||
|
|
"""Holt zu verarbeitende Vorlagen."""
|
|||
|
|
conn = self._get_db()
|
|||
|
|
|
|||
|
|
# Alle mit URL aber ohne Volltext, die nicht schon verarbeitet sind
|
|||
|
|
processed_ids = self.state.processed | self.state.failed_permanent
|
|||
|
|
|
|||
|
|
query = """
|
|||
|
|
SELECT a.vorlage_id, a.url
|
|||
|
|
FROM anlagen a
|
|||
|
|
JOIN vorlagen v ON a.vorlage_id = v.id
|
|||
|
|
WHERE a.url IS NOT NULL
|
|||
|
|
AND a.downloaded = 0
|
|||
|
|
AND (v.volltext_clean IS NULL OR v.volltext_clean = '')
|
|||
|
|
ORDER BY v.datum_eingang DESC
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
all_pending = conn.execute(query).fetchall()
|
|||
|
|
conn.close()
|
|||
|
|
|
|||
|
|
# Filtern
|
|||
|
|
result = []
|
|||
|
|
for row in all_pending:
|
|||
|
|
if row['vorlage_id'] not in processed_ids:
|
|||
|
|
result.append(dict(row))
|
|||
|
|
if len(result) >= limit:
|
|||
|
|
break
|
|||
|
|
|
|||
|
|
# Retries hinzufügen (am Ende)
|
|||
|
|
for vid_str, count in list(self.state.failed.items()):
|
|||
|
|
if len(result) >= limit:
|
|||
|
|
break
|
|||
|
|
vid = int(vid_str)
|
|||
|
|
# URL nochmal holen
|
|||
|
|
conn = self._get_db()
|
|||
|
|
row = conn.execute("SELECT vorlage_id, url FROM anlagen WHERE vorlage_id = ?", (vid,)).fetchone()
|
|||
|
|
conn.close()
|
|||
|
|
if row:
|
|||
|
|
result.append(dict(row))
|
|||
|
|
self.state.stats["retried"] += 1
|
|||
|
|
|
|||
|
|
return result
|
|||
|
|
|
|||
|
|
    def run(self, limit: int = 1000):
        """Main loop: process up to *limit* pending documents in batches of 50.

        Each batch runs in a ThreadPoolExecutor sized by the current adaptive
        config; task submission is paced by sleeping ``config.delay`` before
        each submit. After every batch: metrics are recorded and pushed, state
        is persisted (resume safety), and a progress notification may be sent.
        """
        self._log(f"=== Adaptive Extraktion gestartet ===")
        self._log(f"Limit: {limit}, Start-Config: delay={self.config.delay}s, workers={self.config.workers}")

        pending = self.get_pending(limit)
        self.state.stats["total"] = len(pending)
        self._log(f"Zu verarbeiten: {len(pending)}")

        if not pending:
            self._log("Nichts zu tun!")
            return

        batch_size = 50
        processed_count = 0
        batch_num = 0

        for i in range(0, len(pending), batch_size):
            # Respect any cooldown triggered by throttling in a previous batch.
            self._wait_cooldown()

            batch = pending[i:i+batch_size]
            batch_num += 1
            batch_start = time.time()
            batch_success = 0
            batch_failed = 0
            batch_bytes = 0

            self._log(f"\n--- Batch {batch_num}: {len(batch)} Vorlagen ---")
            self._log(f"Config: delay={self.config.delay:.2f}s, workers={self.config.workers}")

            with ThreadPoolExecutor(max_workers=self.config.workers) as executor:
                futures = {}
                for item in batch:
                    # Pace submissions: the adaptive delay throttles request starts.
                    time.sleep(self.config.delay)
                    future = executor.submit(
                        self._download_and_extract,
                        item['vorlage_id'],
                        item['url']
                    )
                    futures[future] = item

                # Handle results as they finish, in completion order.
                for future in as_completed(futures):
                    vorlage_id, text, error, bytes_dl = future.result()
                    batch_bytes += bytes_dl

                    if text:
                        self._handle_success(vorlage_id, text)
                        self._log(f" ✓ #{vorlage_id}: {len(text)} Zeichen")
                        batch_success += 1
                    else:
                        self._handle_failure(vorlage_id, error)
                        batch_failed += 1

                    processed_count += 1

            # Record metrics for this batch.
            batch_time = time.time() - batch_start
            metric = self._record_metric(batch_num, batch_time, batch_success, batch_failed, batch_bytes)

            # Push to Netdata statsd (best-effort).
            self._push_metrics(metric)

            # Persist state after every batch so an abort can resume here.
            self._save_state()

            # Progress line.
            stats = self.state.stats
            self._log(f"Progress: {processed_count}/{len(pending)} | ✓{stats['success']} ✗{len(self.state.failed_permanent)} | {batch_success/max(batch_time,0.1):.1f} docs/sec")

            # Periodic notification (rate-limited, default every 5 min).
            self._maybe_notify()

        self._log(f"\n=== Fertig ===")
        self._log(f"Erfolgreich: {self.state.stats['success']}")
        self._log(f"Fehlgeschlagen: {len(self.state.failed_permanent)}")
        self._log(f"State gespeichert: {self.state_file}")

        # Final notification, bypassing the rate limit.
        self._maybe_notify(force=True)

        if self.notify:
            self._send_telegram(f"✅ *PDF-Extraktion abgeschlossen*\n\n✓ {self.state.stats['success']:,} erfolgreich\n✗ {len(self.state.failed_permanent)} fehlgeschlagen")
|||
|
|
def main():
    """CLI entry point: parse arguments, optionally reset state, run extraction."""
    parser = argparse.ArgumentParser(description="Adaptive PDF-Extraktion")
    parser.add_argument("--limit", type=int, default=1000, help="Max. Anzahl")
    parser.add_argument("--reset", action="store_true", help="State zurücksetzen")
    parser.add_argument("--no-notify", action="store_true", help="Keine Telegram-Updates")
    parser.add_argument("--notify-interval", type=int, default=300, help="Sekunden zwischen Updates")
    args = parser.parse_args()

    if args.reset:
        # Drop both the resume state and the recorded metrics.
        if STATE_FILE.exists():
            STATE_FILE.unlink()
            print("State zurückgesetzt")
        if METRICS_FILE.exists():
            METRICS_FILE.unlink()

    extractor = AdaptiveExtractor(notify=not args.no_notify)
    extractor.notify_interval = args.notify_interval
    extractor.run(limit=args.limit)


if __name__ == "__main__":
    main()
|