From f9d8c677ae949e1c47559ea5e3923679bd42e027 Mon Sep 17 00:00:00 2001
From: Dotty Dotter
Date: Tue, 28 Apr 2026 00:30:54 +0200
Subject: [PATCH] extract_quotes.py: Auto-Quote-Extraktion je Episode via Qwen-plus

- Pro Episode: Paragraphen mit [P0]-Markern an qwen-plus, Antwort 3-5
  markante Zitate als JSON-Array (para_idx, text, verbatim, speaker,
  is_top, themes).
- Theme-IDs werden gegen die in der DB hinterlegten themes-Liste
  validiert; unbekannte Themes fallen auf das leere Array zurueck.
- Audio-Timestamps kommen aus der paragraphs-Tabelle ueber para_idx,
  dadurch keine SRT-Reparsing-Schritte noetig.
- Hard-Budget 1,50 USD je Lauf, Skip vorhandener Episoden,
  Crash-Sicherheit durch Commit nach jeder Episode.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 scripts/extract_quotes.py | 393 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 393 insertions(+)
 create mode 100644 scripts/extract_quotes.py

diff --git a/scripts/extract_quotes.py b/scripts/extract_quotes.py
new file mode 100644
index 0000000..e9fced0
--- /dev/null
+++ b/scripts/extract_quotes.py
@@ -0,0 +1,393 @@
+#!/usr/bin/env python3
+"""Automatic quote extraction for one podcast (e.g. LdN).
+
+For every episode the transcript is sent to Qwen as a list of paragraphs
+(each truncated to PARA_CHAR_LIMIT characters) tagged [P0], [P1], ...
+The model answers with 3-5 notable quotes as a JSON array whose items carry
+para_idx, text, verbatim, speaker, is_top and themes.
+
+Audio timestamps are taken from the `paragraphs` table via `para_idx`.
+
+Usage:
+    DASHSCOPE_API_KEY=... python3 extract_quotes.py [db-path] [podcast_id]
+
+Re-running is safe: episodes that already have quotes are skipped.
+"""
+
+import json
+import os
+import re
+import sys
+import time
+import sqlite3
+
+from openai import OpenAI
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from json_utils import parse_llm_json
+
+DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"
+PODCAST_ID = sys.argv[2] if len(sys.argv) > 2 else "ldn"
+API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
+BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
+MODEL = "qwen-plus"
+
+# Conservative per-token prices (USD) for budget tracking (qwen-plus intl).
+COST_IN = 0.0008 / 1000
+COST_OUT = 0.002 / 1000
+
+PARA_CHAR_LIMIT = 600  # per paragraph
+HARD_BUDGET_USD = 1.50
+MAX_LLM_ATTEMPTS = 2  # first call plus one retry
+
+_QUOTE_ID_RE = re.compile(r"q(\d+)$")
+# Matches a paragraph reference such as "12", "P12" or "[P12]".
+_PARA_REF_RE = re.compile(r"\[?P?(\d+)\]?$", re.IGNORECASE)
+
+# Theme IDs offered to the model when no explicit list is supplied.
+DEFAULT_THEME_IDS = (
+    "gaza-nahost",
+    "haushalt-investitionen",
+    "klima-verkehr",
+    "krieg-ukraine",
+    "migration-asyl",
+    "parteienlandschaft",
+    "trump-usa",
+)
+
+# System prompt up to, but not including, the list of allowed theme IDs.
+SYSTEM_PROMPT_HEAD = """Du bist Diskursanalyst. Du erhaeltst ein Podcast-Transkript als Paragraphen-Liste mit Index-Markern [P0], [P1], ...
+
+Waehle 3 bis 5 markante Zitate, die fuer diese Episode/diesen Diskurs charakteristisch sind. Praeferenz fuer:
+- Pointierte Aussagen, Thesen, Schluesselformulierungen
+- Konkrete Beispiele, die ein groesseres Argument verdichten
+- Aussagen mit klarer Sprecher-Position
+
+KEINE Floskeln, KEINE Begruessungen, KEINE Werbeblock-Zitate.
+
+Antworte NUR mit einem JSON-Array. Jedes Element:
+{
+  "para_idx": <int>,  // Index des Paragraphen (P-Marker)
+  "text": "<string>",
+  "verbatim": "<string>",
+  "speaker": "<string>",
+  "is_top": <bool>,
+  "themes": ["<theme-id>", ...]  // ZWINGEND nur aus den erlaubten IDs (siehe unten); leeres Array wenn unklar
+}
+
+ERLAUBTE THEME-IDs (NUR diese verwenden, sonst leeres Array):
+"""
+
+
+def build_system_prompt(theme_ids):
+    """Return the system prompt listing *theme_ids* as the allowed themes."""
+    return SYSTEM_PROMPT_HEAD + (", ".join(theme_ids) or "(keine)") + "\n"
+
+
+# Prompt with the default theme list; call_llm uses it when given no override.
+SYSTEM_PROMPT = build_system_prompt(DEFAULT_THEME_IDS)
+
+
+class Budget:
+    """Accumulate token usage and compare the estimated cost to a hard limit."""
+
+    def __init__(self, hard_limit_usd):
+        self.hard_limit = hard_limit_usd
+        self.tokens_in = 0
+        self.tokens_out = 0
+
+    def add(self, usage):
+        """Add one response's token counts; a missing *usage* is ignored."""
+        if usage:
+            self.tokens_in += getattr(usage, "prompt_tokens", 0) or 0
+            self.tokens_out += getattr(usage, "completion_tokens", 0) or 0
+
+    def cost(self):
+        """Return the estimated spend so far, priced with COST_IN / COST_OUT."""
+        return self.tokens_in * COST_IN + self.tokens_out * COST_OUT
+
+    def over(self):
+        """Return True once the estimated spend exceeds the hard limit."""
+        return self.cost() > self.hard_limit
+
+
+def load_themes(db, podcast_id):
+    """Return the theme IDs stored in the `themes` table for *podcast_id*."""
+    rows = db.execute("SELECT id FROM themes WHERE podcast_id=?", (podcast_id,))
+    return [r["id"] for r in rows.fetchall()]
+
+
+def build_user_msg(episode, paragraphs):
+    """Build the user message: an episode header followed by [P<idx>] blocks.
+
+    *episode* is a dict with `id`, `title` and optionally `guest`; *paragraphs*
+    is a list of dicts with `idx` and `text`.  A missing title or paragraph
+    text is treated as an empty string instead of raising.
+    """
+    head = f"Episode {episode['id']}: {(episode.get('title') or '')[:200]}"
+    if episode.get("guest"):
+        head += f" (Gast: {episode['guest']})"
+    blocks = [
+        f"[P{p['idx']}] {(p['text'] or '')[:PARA_CHAR_LIMIT]}" for p in paragraphs
+    ]
+    return head + "\n\n" + "\n\n".join(blocks)
+
+
+def call_llm(client, user_msg, budget, system_prompt=None):
+    """Ask the model for quotes and return `(parsed_json, error)`.
+
+    On success the error is None; on failure the first element is None and
+    the second describes the last error.  Exceptions raised by the API call
+    are retried (MAX_LLM_ATTEMPTS attempts in total); a ValueError from
+    parse_llm_json ends the loop without a retry.  Token usage of every
+    response is added to *budget*.  *system_prompt* defaults to SYSTEM_PROMPT.
+    """
+    if system_prompt is None:
+        system_prompt = SYSTEM_PROMPT
+    last_err = None
+    for attempt in range(MAX_LLM_ATTEMPTS):
+        try:
+            resp = client.chat.completions.create(
+                model=MODEL,
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_msg},
+                ],
+                temperature=0.2,
+                max_tokens=1500,
+            )
+            budget.add(getattr(resp, "usage", None))
+            content = resp.choices[0].message.content
+            try:
+                return parse_llm_json(content, expect="array"), None
+            except ValueError as pe:
+                last_err = f"parse: {pe} :: head={(content or '')[:200]}"
+                break
+        except Exception as e:
+            # Keep the message non-empty even for exceptions without text.
+            last_err = str(e) or type(e).__name__
+            if attempt < MAX_LLM_ATTEMPTS - 1:
+                time.sleep(2)
+    return None, last_err
+
+
+def next_quote_id(db, podcast_id):
+    """Return one more than the highest N among the `q<N>` IDs of the podcast."""
+    rows = db.execute(
+        "SELECT id FROM quotes WHERE podcast_id=? AND id LIKE 'q%'", (podcast_id,)
+    ).fetchall()
+    matches = (_QUOTE_ID_RE.match(r["id"]) for r in rows)
+    return max((int(m.group(1)) for m in matches if m), default=0) + 1
+
+
+def _parse_para_idx(raw):
+    """Return the paragraph index in *raw*, or None if it cannot be read.
+
+    Besides plain numbers, echoed markers such as "P12" or "[P12]" are accepted.
+    """
+    if isinstance(raw, bool):  # bool is an int subclass; True must not mean P1
+        return None
+    if isinstance(raw, str):
+        m = _PARA_REF_RE.match(raw.strip())
+        return int(m.group(1)) if m else None
+    try:
+        return int(raw)
+    except (TypeError, ValueError, OverflowError):
+        return None
+
+
+def _as_bool(value):
+    """Interpret a JSON value as a flag; the string "false" counts as False."""
+    if isinstance(value, str):
+        return value.strip().lower() in ("true", "1", "yes", "ja")
+    return bool(value)
+
+
+def _clean_str(value):
+    """Return *value* as a stripped string; None and other falsy values give ""."""
+    return str(value).strip() if value else ""
+
+
+def process_episode(db, client, episode, allowed_themes, budget, next_id,
+                    system_prompt=None):
+    """Extract quotes for one episode and insert them into `quotes`.
+
+    Returns `(inserted, next_id, error)`: the number of rows inserted, the next
+    unused numeric quote ID, and None or a short error string.  Nothing is
+    committed here; the caller commits or rolls back.  Items that reference an
+    unknown paragraph or carry no text are skipped, themes outside
+    *allowed_themes* are dropped, and only the first item flagged `is_top` is
+    stored as top quote.
+    """
+    paras = db.execute(
+        "SELECT idx, start_time, end_time, text FROM paragraphs "
+        "WHERE podcast_id=? AND episode_id=? ORDER BY idx",
+        (episode["podcast_id"], episode["id"]),
+    ).fetchall()
+    if not paras:
+        return 0, next_id, "no-paragraphs"
+
+    paragraph_dicts = [dict(p) for p in paras]
+    para_lookup = {p["idx"]: p for p in paragraph_dicts}
+
+    user_msg = build_user_msg(dict(episode), paragraph_dicts)
+    result, err = call_llm(client, user_msg, budget, system_prompt)
+    if result is None:
+        return 0, next_id, f"llm-fail: {err}"
+    if not isinstance(result, list):
+        return 0, next_id, "llm: no array"
+
+    inserted = 0
+    top_taken = False
+    for item in result:
+        if not isinstance(item, dict):
+            continue
+        idx = _parse_para_idx(item.get("para_idx"))
+        para = para_lookup.get(idx) if idx is not None else None
+        if para is None:
+            continue
+        text = _clean_str(item.get("text"))
+        verbatim = _clean_str(item.get("verbatim"))
+        speaker = _clean_str(item.get("speaker"))
+        if not text and not verbatim:
+            continue
+
+        themes_raw = item.get("themes")
+        if not isinstance(themes_raw, list):
+            themes_raw = []
+        themes = [t for t in themes_raw if isinstance(t, str) and t in allowed_themes]
+
+        is_top = _as_bool(item.get("is_top")) and not top_taken
+        top_taken = top_taken or is_top
+
+        qid = f"q{next_id}"
+        next_id += 1
+        try:
+            db.execute(
+                "INSERT INTO quotes (id, podcast_id, episode_id, text, verbatim, speaker, "
+                "start_time, end_time, is_top_quote, themes_json) "
+                "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+                (qid, episode["podcast_id"], episode["id"],
+                 text[:1000], verbatim[:2000], speaker[:200],
+                 para["start_time"], para["end_time"], 1 if is_top else 0,
+                 json.dumps(themes, ensure_ascii=False)),
+            )
+            inserted += 1
+        except sqlite3.IntegrityError:
+            # Duplicate - skip
+            pass
+    return inserted, next_id, None
+
+
+def _print_summary(db, budget, total_inserted, failures):
+    """Print totals, the first failures and a quotes-per-episode sanity check."""
+    print()
+    print("=== Zusammenfassung Aufgabe B ===")
+    print(f" Quotes inserted: {total_inserted}")
+    print(f" Tokens in={budget.tokens_in} out={budget.tokens_out}")
+    print(f" Kosten ~${budget.cost():.4f}")
+    if failures:
+        print(f" Fehler in {len(failures)} Episoden, erste 5:")
+        for ep_id, err in failures[:5]:
+            print(f" {ep_id}: {err[:120]}")
+
+    # Sanity check: quotes per episode
+    counts = db.execute(
+        "SELECT episode_id, COUNT(*) c FROM quotes WHERE podcast_id=? "
+        "GROUP BY episode_id ORDER BY c",
+        (PODCAST_ID,),
+    ).fetchall()
+    print(f" Episoden mit Quotes: {len(counts)}")
+    if counts:
+        cs = [c["c"] for c in counts]
+        print(f" Quotes/Episode: min={min(cs)} max={max(cs)} mean={sum(cs)/len(cs):.1f}")
+
+
+def _run(db, client):
+    """Process all pending episodes of PODCAST_ID, committing after each one."""
+    db.execute("PRAGMA busy_timeout=30000")
+    db.row_factory = sqlite3.Row
+
+    # Make sure the quotes table exists (it normally does already).
+    db.executescript("""
+        CREATE TABLE IF NOT EXISTS quotes (
+            id TEXT, podcast_id TEXT, episode_id TEXT,
+            text TEXT, verbatim TEXT, speaker TEXT,
+            start_time REAL, end_time REAL,
+            is_top_quote BOOLEAN, themes_json TEXT,
+            PRIMARY KEY (podcast_id, id)
+        );
+        CREATE INDEX IF NOT EXISTS idx_quotes_episode ON quotes(podcast_id, episode_id);
+    """)
+
+    allowed_themes = load_themes(db, PODCAST_ID)
+    print(f"Erlaubte Themes ({PODCAST_ID}): {allowed_themes}")
+    # Advertise exactly the IDs the answers are validated against.
+    system_prompt = build_system_prompt(allowed_themes)
+
+    episodes = db.execute(
+        "SELECT id, podcast_id, title, guest FROM episodes WHERE podcast_id=? ORDER BY id",
+        (PODCAST_ID,),
+    ).fetchall()
+    print(f"Verarbeite {len(episodes)} Episoden fuer {PODCAST_ID}…")
+
+    # Skip episodes with existing quotes
+    done = {
+        r["episode_id"]
+        for r in db.execute(
+            "SELECT DISTINCT episode_id FROM quotes WHERE podcast_id=?", (PODCAST_ID,)
+        )
+    }
+    print(f" {len(done)} Episoden haben bereits Quotes — werden uebersprungen")
+
+    next_id = next_quote_id(db, PODCAST_ID)
+    print(f" Naechste Quote-ID: q{next_id}")
+
+    budget = Budget(hard_limit_usd=HARD_BUDGET_USD)
+    total_inserted = 0
+    failures = []
+    for i, ep in enumerate(episodes):
+        if ep["id"] in done:
+            continue
+        if budget.over():
+            print(f"!! Budget ueberschritten ({budget.cost():.4f} USD) — Abbruch")
+            break
+        try:
+            n, next_id, err = process_episode(
+                db, client, ep, allowed_themes, budget, next_id, system_prompt
+            )
+        except Exception as e:
+            # next_id was not advanced here, so committing this episode's partial
+            # inserts would make the next episode reuse the same IDs and lose its
+            # quotes to IntegrityError.  Discard the partial inserts instead.
+            db.rollback()
+            n, err = 0, f"{type(e).__name__}: {e}"
+        total_inserted += n
+        if err:
+            failures.append((ep["id"], err))
+        # Commit after every episode (crash safety)
+        db.commit()
+        print(f" [{i+1}/{len(episodes)}] {ep['id']}: +{n} quotes "
+              f"(total={total_inserted}, cost=${budget.cost():.4f}, err={'-' if not err else err[:80]})",
+              flush=True)
+        time.sleep(0.4)
+
+    db.commit()
+    _print_summary(db, budget, total_inserted, failures)
+
+
+def main():
+    """Check the API key, open client and database, and run the extraction."""
+    if not API_KEY:
+        print("DASHSCOPE_API_KEY nicht gesetzt.")
+        sys.exit(1)
+    client = OpenAI(api_key=API_KEY, base_url=BASE_URL, timeout=60.0, max_retries=1)
+    db = sqlite3.connect(DB_PATH, timeout=30.0)
+    try:
+        _run(db, client)
+    finally:
+        db.close()
+
+
+if __name__ == "__main__":
+    main()