#!/usr/bin/env python3
"""Automatic quote extraction for a podcast (e.g. LdN).

For each episode the (truncated) transcript is sent to Qwen as a list of
paragraphs. The model answers with 3-5 striking quotes as JSON carrying
para_idx, text, verbatim, speaker, is_top and themes. Audio timestamps are
taken from the `paragraphs` table via `para_idx`.

Usage:
    DASHSCOPE_API_KEY=... python3 extract_quotes.py [db-path] [podcast_id]

On repeated runs, episodes that already have quotes are skipped.
"""
import json
import os
import re
import sqlite3
import sys
import time

from openai import OpenAI

# Make the sibling helper module importable when run as a script.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from json_utils import parse_llm_json

DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"
PODCAST_ID = sys.argv[2] if len(sys.argv) > 2 else "ldn"
API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
MODEL = "qwen-plus"

# Conservative pricing used for budget tracking (qwen-plus intl), USD per token.
COST_IN = 0.0008 / 1000
COST_OUT = 0.002 / 1000

PARA_CHAR_LIMIT = 600  # characters kept per paragraph
HARD_BUDGET_USD = 1.50

# NOTE(review): the value placeholders in the JSON schema below are empty in
# the reviewed copy (possibly stripped during extraction) - confirm against
# the original prompt.
# NOTE(review): the theme IDs are hard-coded here although the allowed themes
# are loaded per podcast from the database - verify they agree for podcasts
# other than the default.
SYSTEM_PROMPT = """Du bist Diskursanalyst. Du erhaeltst ein Podcast-Transkript als Paragraphen-Liste mit Index-Markern [P0], [P1], ...

Waehle 3 bis 5 markante Zitate, die fuer diese Episode/diesen Diskurs charakteristisch sind. Praeferenz fuer:
- Pointierte Aussagen, Thesen, Schluesselformulierungen
- Konkrete Beispiele, die ein groesseres Argument verdichten
- Aussagen mit klarer Sprecher-Position

KEINE Floskeln, KEINE Begruessungen, KEINE Werbeblock-Zitate.

Antworte NUR mit einem JSON-Array. Jedes Element:
{
  "para_idx": , // Index des Paragraphen (P-Marker)
  "text": "",
  "verbatim": "",
  "speaker": "",
  "is_top": ,
  "themes": ["", ...] // ZWINGEND nur aus den erlaubten IDs (siehe unten); leeres Array wenn unklar
}

ERLAUBTE THEME-IDs (NUR diese verwenden, sonst leeres Array):
gaza-nahost, haushalt-investitionen, klima-verkehr, krieg-ukraine, migration-asyl, parteienlandschaft, trump-usa
"""

# Quote IDs have the form "q<number>".
_QUOTE_ID_RE = re.compile(r"q(\d+)$")
# Paragraph markers as shown to the model: "P12" or "[P12]".
_PARA_MARKER_RE = re.compile(r"\[?P(\d+)\]?$")
# String spellings of a true `is_top` value.
_TRUE_STRINGS = ("true", "1", "yes", "ja")


class Budget:
    """Accumulate token usage and compare the estimated cost to a hard limit."""

    def __init__(self, hard_limit_usd):
        self.hard_limit = hard_limit_usd
        self.tokens_in = 0
        self.tokens_out = 0

    def add(self, usage):
        """Add the prompt/completion token counts of one response, if any."""
        if usage:
            self.tokens_in += getattr(usage, "prompt_tokens", 0) or 0
            self.tokens_out += getattr(usage, "completion_tokens", 0) or 0

    def cost(self):
        """Return the estimated cost in USD of all tokens recorded so far."""
        return self.tokens_in * COST_IN + self.tokens_out * COST_OUT

    def over(self):
        """Return True once the estimated cost exceeds the hard limit."""
        return self.cost() > self.hard_limit


def load_themes(db, podcast_id):
    """Return the list of theme IDs stored for *podcast_id*."""
    rows = db.execute(
        "SELECT id FROM themes WHERE podcast_id=?", (podcast_id,)
    ).fetchall()
    return [row["id"] for row in rows]


def build_user_msg(episode, paragraphs):
    """Build the user message: an episode header plus `[P<idx>]` blocks.

    *episode* is a dict with `id`, `title` and optionally `guest`;
    *paragraphs* is a sequence of dicts with `idx` and `text`. Each paragraph
    is truncated to PARA_CHAR_LIMIT characters. A missing or NULL title or
    paragraph text is treated as an empty string.
    """
    title = episode.get("title") or ""
    head = f"Episode {episode['id']}: {title[:200]}"
    if episode.get("guest"):
        head += f" (Gast: {episode['guest']})"
    blocks = [
        f"[P{p['idx']}] {(p['text'] or '')[:PARA_CHAR_LIMIT]}"
        for p in paragraphs
    ]
    return head + "\n\n" + "\n\n".join(blocks)


def call_llm(client, user_msg, budget):
    """Send *user_msg* to the model and parse the answer as a JSON array.

    Returns `(result, None)` on success and `(None, error_text)` on failure.
    A failed request is retried once after a short pause; an answer that
    cannot be parsed (ValueError from the parser) is not retried. Token usage
    of every response is recorded in *budget*.
    """
    max_attempts = 2
    last_err = None
    for attempt in range(max_attempts):
        try:
            resp = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_msg},
                ],
                temperature=0.2,
                max_tokens=1500,
            )
            budget.add(getattr(resp, "usage", None))
            content = resp.choices[0].message.content
            try:
                return parse_llm_json(content, expect="array"), None
            except ValueError as pe:
                # Guard the slice so a None content cannot mask the parse error.
                head = (content or "")[:200]
                last_err = f"parse: {pe} :: head={head}"
                break
        except Exception as e:
            last_err = str(e)
            if attempt + 1 < max_attempts:
                time.sleep(2)
    return None, last_err


def next_quote_id(db, podcast_id):
    """Return the next free numeric suffix for quote IDs of *podcast_id*.

    Existing IDs of the form `q<number>` are scanned; the result is the
    largest number found plus one, or 1 when there are none.
    """
    rows = db.execute(
        "SELECT id FROM quotes WHERE podcast_id=? AND id LIKE 'q%'",
        (podcast_id,),
    ).fetchall()
    matches = (_QUOTE_ID_RE.match(row["id"]) for row in rows)
    return max((int(m.group(1)) for m in matches if m), default=0) + 1


def _as_text(value):
    """Return *value* stripped if it is a string, otherwise an empty string."""
    return value.strip() if isinstance(value, str) else ""


def _as_bool(value):
    """Interpret a JSON value as a flag; strings are judged by their content."""
    if isinstance(value, str):
        return value.strip().lower() in _TRUE_STRINGS
    return bool(value)


def _parse_para_idx(raw):
    """Return the paragraph index in *raw*, or None if it cannot be read.

    Accepts numbers, numeric strings and marker strings such as "P12"/"[P12]".
    """
    if isinstance(raw, str):
        marker = _PARA_MARKER_RE.match(raw.strip())
        if marker:
            return int(marker.group(1))
    try:
        return int(raw)
    except (TypeError, ValueError, OverflowError):
        return None


def process_episode(db, client, episode, allowed_themes, budget, next_id):
    """Extract quotes for one episode and insert them into `quotes`.

    Returns `(inserted, next_id, error)`, where *error* is None on success
    and a short text otherwise. Nothing is committed here; the caller decides
    whether to commit or roll back.
    """
    rows = db.execute(
        "SELECT idx, start_time, end_time, text FROM paragraphs "
        "WHERE podcast_id=? AND episode_id=? ORDER BY idx",
        (episode["podcast_id"], episode["id"]),
    ).fetchall()
    if not rows:
        return 0, next_id, "no-paragraphs"

    paragraphs = [dict(row) for row in rows]
    para_lookup = {p["idx"]: p for p in paragraphs}

    user_msg = build_user_msg(dict(episode), paragraphs)
    result, err = call_llm(client, user_msg, budget)
    if result is None:
        return 0, next_id, f"llm-fail: {err}"
    if not isinstance(result, list):
        return 0, next_id, "llm: no array"

    inserted = 0
    top_assigned = False
    for item in result:
        if not isinstance(item, dict):
            continue
        para = para_lookup.get(_parse_para_idx(item.get("para_idx")))
        if para is None:
            continue

        text = _as_text(item.get("text"))
        verbatim = _as_text(item.get("verbatim"))
        speaker = _as_text(item.get("speaker"))
        if not text and not verbatim:
            continue

        themes_raw = item.get("themes")
        if not isinstance(themes_raw, list):
            themes_raw = []
        themes = [t for t in themes_raw if t in allowed_themes]

        # At most one quote per episode may be flagged as the top quote.
        is_top = _as_bool(item.get("is_top")) and not top_assigned

        qid = f"q{next_id}"
        next_id += 1
        try:
            db.execute(
                "INSERT INTO quotes (id, podcast_id, episode_id, text, verbatim, speaker, "
                "start_time, end_time, is_top_quote, themes_json) "
                "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                (qid, episode["podcast_id"], episode["id"],
                 text[:1000], verbatim[:2000], speaker[:200],
                 para["start_time"], para["end_time"],
                 1 if is_top else 0,
                 json.dumps(themes, ensure_ascii=False)),
            )
        except sqlite3.IntegrityError:
            # Duplicate key - deliberately skipped; the ID stays consumed.
            continue
        inserted += 1
        # Only a successfully stored quote uses up the top-quote slot.
        top_assigned = top_assigned or is_top
    return inserted, next_id, None


def main():
    """Run quote extraction for every episode of PODCAST_ID lacking quotes."""
    if not API_KEY:
        print("DASHSCOPE_API_KEY nicht gesetzt.")
        sys.exit(1)

    client = OpenAI(api_key=API_KEY, base_url=BASE_URL, timeout=60.0, max_retries=1)
    db = sqlite3.connect(DB_PATH, timeout=30.0)
    try:
        db.execute("PRAGMA busy_timeout=30000")
        db.row_factory = sqlite3.Row

        # Make sure the quotes table exists (it normally does already).
        db.executescript("""
            CREATE TABLE IF NOT EXISTS quotes (
                id TEXT,
                podcast_id TEXT,
                episode_id TEXT,
                text TEXT,
                verbatim TEXT,
                speaker TEXT,
                start_time REAL,
                end_time REAL,
                is_top_quote BOOLEAN,
                themes_json TEXT,
                PRIMARY KEY (podcast_id, id)
            );
            CREATE INDEX IF NOT EXISTS idx_quotes_episode ON quotes(podcast_id, episode_id);
        """)

        allowed_themes = load_themes(db, PODCAST_ID)
        print(f"Erlaubte Themes ({PODCAST_ID}): {allowed_themes}")

        episodes = db.execute(
            "SELECT id, podcast_id, title, guest FROM episodes WHERE podcast_id=? ORDER BY id",
            (PODCAST_ID,),
        ).fetchall()
        print(f"Verarbeite {len(episodes)} Episoden fuer {PODCAST_ID}…")

        # Episodes that already have quotes are skipped.
        done = {
            row["episode_id"]
            for row in db.execute(
                "SELECT DISTINCT episode_id FROM quotes WHERE podcast_id=?",
                (PODCAST_ID,),
            ).fetchall()
        }
        print(f" {len(done)} Episoden haben bereits Quotes — werden uebersprungen")

        next_id = next_quote_id(db, PODCAST_ID)
        print(f" Naechste Quote-ID: q{next_id}")

        budget = Budget(hard_limit_usd=HARD_BUDGET_USD)
        total_inserted = 0
        failures = []

        for i, ep in enumerate(episodes):
            if ep["id"] in done:
                continue
            if budget.over():
                print(f"!! Budget ueberschritten ({budget.cost():.4f} USD) — Abbruch")
                break
            try:
                n, next_id, err = process_episode(
                    db, client, ep, allowed_themes, budget, next_id
                )
            except Exception as e:
                # Discard the half-written episode: `next_id` was not advanced,
                # so keeping its rows would make later IDs collide, and the
                # episode would wrongly count as done on the next run.
                db.rollback()
                n, err = 0, f"{type(e).__name__}: {e}"
            total_inserted += n
            if err:
                failures.append((ep["id"], err))
            # Commit after every episode so a crash loses at most one episode.
            db.commit()
            print(f" [{i+1}/{len(episodes)}] {ep['id']}: +{n} quotes "
                  f"(total={total_inserted}, cost=${budget.cost():.4f}, err={'-' if not err else err[:80]})",
                  flush=True)
            time.sleep(0.4)

        db.commit()
        print()
        print("=== Zusammenfassung Aufgabe B ===")
        print(f" Quotes inserted: {total_inserted}")
        print(f" Tokens in={budget.tokens_in} out={budget.tokens_out}")
        print(f" Kosten ~${budget.cost():.4f}")
        if failures:
            print(f" Fehler in {len(failures)} Episoden, erste 5:")
            for ep_id, err in failures[:5]:
                print(f" {ep_id}: {err[:120]}")

        # Sanity check: quotes per episode.
        counts = db.execute(
            "SELECT episode_id, COUNT(*) c FROM quotes WHERE podcast_id=? GROUP BY episode_id ORDER BY c",
            (PODCAST_ID,),
        ).fetchall()
        print(f" Episoden mit Quotes: {len(counts)}")
        if counts:
            cs = [c["c"] for c in counts]
            print(f" Quotes/Episode: min={min(cs)} max={max(cs)} mean={sum(cs)/len(cs):.1f}")
    finally:
        db.close()


if __name__ == "__main__":
    main()