#!/usr/bin/env python3
"""Re-run the failed records in `debates` and `argument_links` with a robust JSON parser.

Approach:
- Load all records with topic='error' (debates) or relation='error' (argument_links).
- Fetch the source/target paragraphs from the DB.
- Send them to qwen-plus again, parse the reply with json_utils.parse_llm_json.
- UPDATE the existing row in place.

Usage:
    DASHSCOPE_API_KEY=... python3 rerun_errors.py [db-path]
"""
import json
import os
import sys
import time
import sqlite3

from openai import OpenAI

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from json_utils import parse_llm_json

DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"
API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
MODEL = "qwen-plus"

# qwen-plus pricing (DashScope intl, as of 2025): $0.0008 / 1k input, $0.002 / 1k output
COST_IN = 0.0008 / 1000
COST_OUT = 0.002 / 1000

DEBATES_SYSTEM = """Du bist ein Diskursanalyst. Du erhältst zwei Textabschnitte aus VERSCHIEDENEN Podcasts, die dasselbe Thema behandeln. Erstelle eine kurze Gegenüberstellung.
Antworte NUR mit JSON:
{
  "topic": "Das gemeinsame Thema in 3-5 Wörtern",
  "agreement": "Worin stimmen beide überein? (1-2 Sätze)",
  "divergence": "Worin unterscheiden sie sich? (1-2 Sätze, oder 'keine wesentliche Divergenz')",
  "insight": "Was lernt man durch die Gegenüberstellung, das man aus keinem der beiden allein lernen würde? (1 Satz)"
}"""

ARGS_SYSTEM = """Du bist ein Diskursanalyst. Du erhältst zwei Textabschnitte aus Podcast-Transkripten. Klassifiziere die logische Relation zwischen ihnen.
Antworte NUR mit einem JSON-Objekt:
{"relation": "...", "confidence": 0.0-1.0, "explanation": "Ein Satz Begruendung"}

Moegliche Relationen:
- "erweitert": B baut auf A auf, ergaenzt, vertieft
- "widerspricht": B widerspricht A, nennt Gegenargument
- "belegt": B liefert Evidenz/Daten fuer A's These
- "relativiert": B schraenkt A ein, nennt Ausnahmen/Bedingungen
- "gleicher_punkt": A und B sagen im Kern dasselbe
- "kein_bezug": Trotz thematischer Naehe kein logischer Bezug"""


class Budget:
    """Tracks token usage and estimated cost against a hard USD limit."""

    def __init__(self, hard_limit_usd):
        self.hard_limit = hard_limit_usd  # max spend in USD before over() trips
        self.tokens_in = 0
        self.tokens_out = 0

    def add(self, usage):
        """Accumulate the token counts of one API response (usage may be None)."""
        if usage:
            self.tokens_in += getattr(usage, "prompt_tokens", 0) or 0
            self.tokens_out += getattr(usage, "completion_tokens", 0) or 0

    def cost(self):
        """Estimated spend so far in USD."""
        return self.tokens_in * COST_IN + self.tokens_out * COST_OUT

    def over(self):
        """True once the estimated spend exceeds the hard limit."""
        return self.cost() > self.hard_limit


def _safe_float(value, default=0.0):
    """Coerce an LLM-supplied confidence value to float.

    The model occasionally emits non-numeric confidence values; a bare
    float(...) would crash the whole repair run, so fall back to *default*.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def call_llm(client, system, user, max_tokens, budget):
    """Send one chat request and parse the reply as a JSON object.

    Transport errors are retried once (after a 2 s pause); JSON parse
    failures are not retried.  Token usage is booked into *budget*.

    Returns:
        (parsed_dict, None) on success, or (None, error_string) on failure.
    """
    last_err = None
    for attempt in range(2):
        try:
            resp = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": user},
                ],
                # classification prompts get a lower temperature than summaries
                temperature=0.1 if "Klassifiziere" in system else 0.2,
                max_tokens=max_tokens,
            )
            budget.add(getattr(resp, "usage", None))
            content = resp.choices[0].message.content
            try:
                return parse_llm_json(content, expect="object"), None
            except ValueError as pe:
                last_err = f"parse: {pe}"
                # A retry with a different temperature would be possible;
                # we accept the parse failure instead.
                break
        except Exception as e:
            last_err = str(e)
            if attempt < 1:
                time.sleep(2)
                continue
    return None, last_err


def rerun_debates(db, client, budget):
    """Re-run all `debates` rows whose topic is 'error'.

    Returns (fixed, still_err) counts.  On a repeated failure the row keeps
    topic='error' (so it stays eligible for another re-run) and the error
    text is stored in `insight`.
    """
    rows = db.execute("""
        SELECT d.id as did,
               d.source_podcast, d.source_episode, d.source_idx,
               d.target_podcast, d.target_episode, d.target_idx,
               p1.text as source_text, p2.text as target_text,
               pc1.name as source_pname, pc2.name as target_pname,
               e1.title as source_title, e1.guest as source_guest,
               e2.title as target_title, e2.guest as target_guest
        FROM debates d
        JOIN paragraphs p1 ON d.source_podcast = p1.podcast_id AND d.source_episode = p1.episode_id AND d.source_idx = p1.idx
        JOIN paragraphs p2 ON d.target_podcast = p2.podcast_id AND d.target_episode = p2.episode_id AND d.target_idx = p2.idx
        JOIN episodes e1 ON d.source_podcast = e1.podcast_id AND d.source_episode = e1.id
        JOIN episodes e2 ON d.target_podcast = e2.podcast_id AND d.target_episode = e2.id
        JOIN podcasts pc1 ON d.source_podcast = pc1.id
        JOIN podcasts pc2 ON d.target_podcast = pc2.id
        WHERE d.topic = 'error'
    """).fetchall()
    print(f"[debates] {len(rows)} Error-Records zu reparieren")
    fixed = 0
    still_err = 0
    for i, r in enumerate(rows):
        if budget.over():
            print(f"[debates] Kosten-Limit erreicht bei {i}/{len(rows)} (cost=${budget.cost():.4f})")
            break
        meta_a = f"{r['source_pname']} / {r['source_episode']}: {r['source_title']} ({r['source_guest']})"
        meta_b = f"{r['target_pname']} / {r['target_episode']}: {r['target_title']} ({r['target_guest']})"
        user = (f"Podcast A — {meta_a}:\n\"{r['source_text'][:800]}\"\n\n"
                f"Podcast B — {meta_b}:\n\"{r['target_text'][:800]}\"\n\n"
                "Erstelle die Gegenueberstellung.")
        result, err = call_llm(client, DEBATES_SYSTEM, user, 600, budget)
        if result is not None and result.get("topic"):
            # str() coercion mirrors rerun_args: the model may return
            # non-string JSON values, and slicing those would crash the run.
            db.execute(
                "UPDATE debates SET topic=?, agreement=?, divergence=?, insight=? WHERE id=?",
                (str(result.get("topic", ""))[:120],
                 str(result.get("agreement", ""))[:1000],
                 str(result.get("divergence", ""))[:1000],
                 str(result.get("insight", ""))[:1000],
                 r["did"]),
            )
            fixed += 1
        else:
            still_err += 1
            # topic stays 'error' on purpose; only record why the re-run failed
            db.execute(
                "UPDATE debates SET insight=? WHERE id=?",
                (f"rerun-failed: {err}"[:500], r["did"]),
            )
        if (i + 1) % 10 == 0:
            db.commit()
            print(f" [debates] {i+1}/{len(rows)} gefixt={fixed} still_err={still_err} cost=${budget.cost():.4f}")
        time.sleep(0.3)
    db.commit()
    print(f"[debates] fertig: gefixt={fixed} still_err={still_err}")
    return fixed, still_err


def rerun_args(db, client, budget):
    """Re-run all `argument_links` rows whose relation is 'error'.

    Returns (fixed, still_err) counts.  On a repeated failure the row keeps
    relation='error' and the error text is stored in `explanation`.
    """
    rows = db.execute("""
        SELECT a.id as aid,
               a.source_podcast, a.source_episode, a.source_idx,
               a.target_podcast, a.target_episode, a.target_idx,
               p1.text as source_text, p2.text as target_text,
               e1.title as source_title, e1.guest as source_guest,
               e2.title as target_title, e2.guest as target_guest
        FROM argument_links a
        JOIN paragraphs p1 ON a.source_podcast = p1.podcast_id AND a.source_episode = p1.episode_id AND a.source_idx = p1.idx
        JOIN paragraphs p2 ON a.target_podcast = p2.podcast_id AND a.target_episode = p2.episode_id AND a.target_idx = p2.idx
        JOIN episodes e1 ON a.source_podcast = e1.podcast_id AND a.source_episode = e1.id
        JOIN episodes e2 ON a.target_podcast = e2.podcast_id AND a.target_episode = e2.id
        WHERE a.relation = 'error'
    """).fetchall()
    print(f"[argument_links] {len(rows)} Error-Records zu reparieren")
    fixed = 0
    still_err = 0
    for i, r in enumerate(rows):
        if budget.over():
            print(f"[args] Kosten-Limit erreicht bei {i}/{len(rows)} (cost=${budget.cost():.4f})")
            break
        meta_a = f"{r['source_episode']}: {r['source_title']} — {r['source_guest']}"
        meta_b = f"{r['target_episode']}: {r['target_title']} — {r['target_guest']}"
        user = (f"Absatz A ({meta_a}):\n\"{r['source_text'][:800]}\"\n\n"
                f"Absatz B ({meta_b}):\n\"{r['target_text'][:800]}\"\n\n"
                "Welche logische Relation besteht von A zu B?")
        result, err = call_llm(client, ARGS_SYSTEM, user, 350, budget)
        if result is not None and result.get("relation") and result.get("relation") != "error":
            db.execute(
                "UPDATE argument_links SET relation=?, confidence=?, explanation=? WHERE id=?",
                (str(result.get("relation", ""))[:60],
                 # _safe_float: a bare float() crashes on non-numeric output
                 _safe_float(result.get("confidence", 0) or 0),
                 str(result.get("explanation", ""))[:1000],
                 r["aid"]),
            )
            fixed += 1
        else:
            still_err += 1
            db.execute(
                "UPDATE argument_links SET explanation=? WHERE id=?",
                (f"rerun-failed: {err}"[:500], r["aid"]),
            )
        if (i + 1) % 20 == 0:
            db.commit()
            print(f" [args] {i+1}/{len(rows)} gefixt={fixed} still_err={still_err} cost=${budget.cost():.4f}")
        time.sleep(0.25)
    db.commit()
    print(f"[argument_links] fertig: gefixt={fixed} still_err={still_err}")
    return fixed, still_err


def main():
    """Entry point: open DB, repair both tables, print a cost summary."""
    if not API_KEY:
        print("DASHSCOPE_API_KEY nicht gesetzt.")
        sys.exit(1)
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL, timeout=30.0, max_retries=1)
    db = sqlite3.connect(DB_PATH, timeout=30)
    db.row_factory = sqlite3.Row
    db.execute("PRAGMA busy_timeout=30000")
    # Task-A budget: 1 USD hard limit
    budget = Budget(hard_limit_usd=1.0)
    print(f"DB: {DB_PATH}, Modell: {MODEL}")
    try:
        # try/finally guarantees the connection is closed even if a rerun raises
        d_fixed, d_err = rerun_debates(db, client, budget)
        a_fixed, a_err = rerun_args(db, client, budget)
        print()
        print("=== Zusammenfassung Aufgabe A ===")
        print(f" debates gefixt={d_fixed} still_err={d_err}")
        print(f" argument_links gefixt={a_fixed} still_err={a_err}")
        print(f" Tokens in={budget.tokens_in} out={budget.tokens_out}")
        print(f" Kosten ~${budget.cost():.4f}")
    finally:
        db.close()


if __name__ == "__main__":
    main()