podcast-mindmap/scripts/extract_quotes.py

#!/usr/bin/env python3
"""Auto-Quote-Extraktion fuer einen Podcast (z.B. LdN).

Pro Episode: Qwen erhaelt das (gekuerzte) Transkript als Paragraphen-Array.
Output: 3-5 markante Zitate als JSON, mit para_idx, text, verbatim, speaker, is_top, themes.

Audio-Timestamps werden aus der `paragraphs`-Tabelle ueber `para_idx` zugeordnet.

Nutzung:
    DASHSCOPE_API_KEY=... python3 extract_quotes.py [db-pfad] [podcast_id]

Bei wiederholtem Aufruf: Episoden mit bestehenden Quotes werden uebersprungen.
"""

import json
import os
import re
import sys
import time
import sqlite3

from openai import OpenAI

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from json_utils import parse_llm_json

# Runtime configuration, resolved once at import time: the first two
# positional CLI arguments (with fallbacks) and the API key from the environment.
DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"
PODCAST_ID = sys.argv[2] if len(sys.argv) > 2 else "ldn"
API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")  # empty string when unset
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
MODEL = "qwen-plus"

# Conservative pricing for budget tracking (qwen-plus intl).
# Both values are multiplied by raw token counts in Budget.cost(), i.e. they
# are cost per single token (presumably a per-1K-token price divided by 1000).
COST_IN = 0.0008 / 1000
COST_OUT = 0.002 / 1000

PARA_CHAR_LIMIT = 600    # per paragraph: max characters sent to the model
HARD_BUDGET_USD = 1.50   # no further episodes are started once cost exceeds this

# System prompt (in German) sent with every request: it asks the model for
# 3 to 5 characteristic quotes per episode, returned as a bare JSON array whose
# elements carry para_idx, text, verbatim, speaker, is_top and themes.
# NOTE(review): the allowed theme IDs are hard-coded at the end of the prompt,
# while the IDs actually accepted at insert time are loaded from the `themes`
# table for the selected podcast - keep the two in sync.
SYSTEM_PROMPT = """Du bist Diskursanalyst. Du erhaeltst ein Podcast-Transkript als Paragraphen-Liste mit Index-Markern [P0], [P1], ...

Waehle 3 bis 5 markante Zitate, die fuer diese Episode/diesen Diskurs charakteristisch sind. Praeferenz fuer:
- Pointierte Aussagen, Thesen, Schluesselformulierungen
- Konkrete Beispiele, die ein groesseres Argument verdichten
- Aussagen mit klarer Sprecher-Position

KEINE Floskeln, KEINE Begruessungen, KEINE Werbeblock-Zitate.

Antworte NUR mit einem JSON-Array. Jedes Element:
{
  "para_idx": <int>,                          // Index des Paragraphen (P-Marker)
  "text": "<geglaettete Form, ohne Fuellwoerter, ein Satz>",
  "verbatim": "<woertlicher Snippet aus dem Transkript, max. 2 Saetze>",
  "speaker": "<Name oder leerer String>",
  "is_top": <true wenn dies das prominenteste Zitat der Episode ist, sonst false; max. 1x true pro Antwort>,
  "themes": ["<theme-id>", ...]                // ZWINGEND nur aus den erlaubten IDs (siehe unten); leeres Array wenn unklar
}

ERLAUBTE THEME-IDs (NUR diese verwenden, sonst leeres Array):
gaza-nahost, haushalt-investitionen, klima-verkehr, krieg-ukraine, migration-asyl, parteienlandschaft, trump-usa
"""


class Budget:
    """Running tally of prompt/completion tokens checked against a USD cap."""

    def __init__(self, hard_limit_usd):
        """Start with zero tokens counted and remember the spending cap."""
        self.tokens_in = 0
        self.tokens_out = 0
        self.hard_limit = hard_limit_usd

    def add(self, usage):
        """Accumulate the token counts of *usage*; a falsy *usage* is ignored.

        Missing or ``None`` counters on the usage object count as zero.
        """
        if not usage:
            return
        prompt_tokens = getattr(usage, "prompt_tokens", 0)
        self.tokens_in += prompt_tokens if prompt_tokens else 0
        completion_tokens = getattr(usage, "completion_tokens", 0)
        self.tokens_out += completion_tokens if completion_tokens else 0

    def cost(self):
        """Return the estimated spend so far, priced with COST_IN / COST_OUT."""
        input_cost = self.tokens_in * COST_IN
        output_cost = self.tokens_out * COST_OUT
        return input_cost + output_cost

    def over(self):
        """Return True once the estimated spend is strictly above the cap."""
        return self.hard_limit < self.cost()


def load_themes(db, podcast_id):
    """Return the ids of all ``themes`` rows belonging to *podcast_id*.

    Rows are read by column name, so *db* must yield name-addressable rows.
    The ids come back as a list in the order the query yields them.
    """
    cursor = db.execute("SELECT id FROM themes WHERE podcast_id=?", (podcast_id,))
    theme_ids = []
    for row in cursor.fetchall():
        theme_ids.append(row["id"])
    return theme_ids


def build_user_msg(episode, paragraphs, para_char_limit=None):
    """Build the user message: an episode header followed by indexed paragraphs.

    Args:
        episode: Mapping (must support ``.get``) with ``id`` and ``title``;
            a truthy ``guest`` entry is appended to the header.
        paragraphs: Iterable of mappings with ``idx`` and ``text``.
        para_char_limit: Maximum number of characters kept per paragraph.
            ``None`` (the default) uses the module-level ``PARA_CHAR_LIMIT``.

    Returns:
        The header line, a blank line, then one ``[P<idx>] <text>`` block per
        paragraph, blocks separated by blank lines.
    """
    if para_char_limit is None:
        para_char_limit = PARA_CHAR_LIMIT

    # A NULL title/text from the database must not abort the whole episode.
    title = episode["title"] or ""
    head = f"Episode {episode['id']}: {title[:200]}"
    if episode.get("guest"):
        head += f" (Gast: {episode['guest']})"

    blocks = []
    for p in paragraphs:
        snippet = (p["text"] or "")[:para_char_limit]
        blocks.append(f"[P{p['idx']}] {snippet}")
    return head + "\n\n" + "\n\n".join(blocks)


def call_llm(client, user_msg, budget):
    """Send one chat-completion request and parse the reply as a JSON array.

    Request failures are retried once after a two-second pause. A reply that
    arrives but cannot be parsed is not retried.

    Args:
        client: Object exposing ``chat.completions.create(...)``.
        user_msg: Content of the user message.
        budget: Accumulator whose ``add`` receives the response's ``usage``.

    Returns:
        ``(parsed, None)`` on success, otherwise ``(None, error_text)``.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_msg},
    ]
    max_attempts = 2
    last_err = None
    for attempt in range(max_attempts):
        try:
            resp = client.chat.completions.create(
                model=MODEL,
                messages=messages,
                temperature=0.2,
                max_tokens=1500,
            )
            budget.add(getattr(resp, "usage", None))
            content = resp.choices[0].message.content
            if content is None:
                # The API may return no text at all; report that explicitly
                # instead of failing with an opaque error inside the parser.
                raise RuntimeError("empty response: message.content is None")
            try:
                return parse_llm_json(content, expect="array"), None
            except ValueError as pe:
                # The reply was received but is not usable JSON: give up
                # without paying for a second request.
                last_err = f"parse: {pe} :: head={content[:200]}"
                break
        except Exception as e:
            last_err = str(e)
            if attempt < max_attempts - 1:
                time.sleep(2)
    return None, last_err


def next_quote_id(db, podcast_id):
    """Return the next free numeric suffix for ``q<N>`` quote ids.

    Looks at all quote ids of *podcast_id* that consist of ``q`` followed only
    by digits and returns the largest number plus one (1 when none exist).
    """
    id_pattern = re.compile(r"q(\d+)$")
    cursor = db.execute(
        "SELECT id FROM quotes WHERE podcast_id=? AND id LIKE 'q%'", (podcast_id,)
    )
    matches = (id_pattern.match(row["id"]) for row in cursor.fetchall())
    highest = max((int(m.group(1)) for m in matches if m), default=0)
    return highest + 1


def _clean_str(value):
    """Return *value* stripped when it is a string, otherwise an empty string."""
    return value.strip() if isinstance(value, str) else ""


def _as_flag(value):
    """Interpret a boolean that the LLM may have emitted as a string ("false")."""
    if isinstance(value, str):
        return value.strip().lower() in ("true", "1", "yes")
    return bool(value)


def process_episode(db, client, episode, allowed_themes, budget, next_id):
    """Extract quotes for one episode via the LLM and insert them into ``quotes``.

    The episode's paragraphs are loaded, sent to the model, and every returned
    item that points at an existing paragraph and has some text is inserted.
    Start/end times are taken from the referenced paragraph. Nothing is
    committed here; that is left to the caller.

    Args:
        db: SQLite connection whose rows can be read by column name.
        client: Client object handed through to ``call_llm``.
        episode: Row/mapping with at least ``id`` and ``podcast_id``.
        allowed_themes: Container of theme ids; other themes are dropped.
        budget: Usage accumulator handed through to ``call_llm``.
        next_id: Numeric suffix to use for the next ``q<N>`` quote id.

    Returns:
        ``(inserted, next_id, err)``: number of inserted quotes, the advanced
        id counter, and an error text (``None`` when the episode was handled).
    """
    paras = db.execute(
        "SELECT idx, start_time, end_time, text FROM paragraphs "
        "WHERE podcast_id=? AND episode_id=? ORDER BY idx",
        (episode["podcast_id"], episode["id"]),
    ).fetchall()
    if not paras:
        return 0, next_id, "no-paragraphs"

    paragraph_dicts = [dict(p) for p in paras]
    para_lookup = {p["idx"]: p for p in paragraph_dicts}

    user_msg = build_user_msg(dict(episode), paragraph_dicts)
    result, err = call_llm(client, user_msg, budget)
    if result is None:
        return 0, next_id, f"llm-fail: {err}"
    if not isinstance(result, list):
        return 0, next_id, "llm: no array"

    inserted = 0
    top_assigned = False
    for item in result:
        if not isinstance(item, dict):
            continue
        try:
            idx = int(item.get("para_idx", -1))
        except (TypeError, ValueError, OverflowError):
            continue
        para = para_lookup.get(idx)
        if not para:
            continue

        # Malformed (non-string) fields are treated as empty instead of
        # raising halfway through the episode, after earlier quotes of the
        # same episode have already been inserted.
        text = _clean_str(item.get("text"))
        verbatim = _clean_str(item.get("verbatim"))
        speaker = _clean_str(item.get("speaker"))
        if not text and not verbatim:
            continue

        themes_raw = item.get("themes")
        if not isinstance(themes_raw, list):
            themes_raw = []
        themes = [t for t in themes_raw if t in allowed_themes]

        # At most one top quote per episode.
        is_top = _as_flag(item.get("is_top")) and not top_assigned

        qid = f"q{next_id}"
        next_id += 1
        try:
            db.execute(
                "INSERT INTO quotes (id, podcast_id, episode_id, text, verbatim, speaker, "
                "start_time, end_time, is_top_quote, themes_json) "
                "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                (qid, episode["podcast_id"], episode["id"],
                 text[:1000], verbatim[:2000], speaker[:200],
                 para["start_time"], para["end_time"], 1 if is_top else 0,
                 json.dumps(themes, ensure_ascii=False)),
            )
        except sqlite3.IntegrityError:
            # Duplicate id - skip. The top slot is only consumed by a quote
            # that was actually stored.
            continue
        inserted += 1
        top_assigned = top_assigned or is_top
    return inserted, next_id, None


def _run_extraction(db, client):
    """Process all pending episodes on an open connection and print a summary."""
    db.execute("PRAGMA busy_timeout=30000")
    db.row_factory = sqlite3.Row

    # Make sure the quotes table exists (it normally already does).
    db.executescript("""
    CREATE TABLE IF NOT EXISTS quotes (
        id TEXT, podcast_id TEXT, episode_id TEXT,
        text TEXT, verbatim TEXT, speaker TEXT,
        start_time REAL, end_time REAL,
        is_top_quote BOOLEAN, themes_json TEXT,
        PRIMARY KEY (podcast_id, id)
    );
    CREATE INDEX IF NOT EXISTS idx_quotes_episode ON quotes(podcast_id, episode_id);
    """)

    allowed_themes = load_themes(db, PODCAST_ID)
    print(f"Erlaubte Themes ({PODCAST_ID}): {allowed_themes}")

    episodes = db.execute(
        "SELECT id, podcast_id, title, guest FROM episodes WHERE podcast_id=? ORDER BY id",
        (PODCAST_ID,),
    ).fetchall()
    print(f"Verarbeite {len(episodes)} Episoden fuer {PODCAST_ID}…")

    # Episodes that already have quotes are skipped on repeated runs.
    done = {
        r["episode_id"]
        for r in db.execute(
            "SELECT DISTINCT episode_id FROM quotes WHERE podcast_id=?", (PODCAST_ID,)
        ).fetchall()
    }
    print(f"  {len(done)} Episoden haben bereits Quotes — werden uebersprungen")

    next_id = next_quote_id(db, PODCAST_ID)
    print(f"  Naechste Quote-ID: q{next_id}")

    budget = Budget(hard_limit_usd=HARD_BUDGET_USD)

    total_inserted = 0
    failures = []
    for i, ep in enumerate(episodes):
        if ep["id"] in done:
            continue
        if budget.over():
            print(f"!! Budget ueberschritten ({budget.cost():.4f} USD) — Abbruch")
            break
        try:
            n, next_id, err = process_episode(db, client, ep, allowed_themes, budget, next_id)
        except Exception as e:
            # Discard the episode's partial inserts. next_id was not advanced
            # here, so keeping those rows would make the following quotes
            # collide on (podcast_id, id) and be dropped silently as
            # duplicates. The episode is picked up again on the next run.
            db.rollback()
            # Include the type so an exception without a message still
            # counts as a failure below.
            n, err = 0, f"{type(e).__name__}: {e}"
        total_inserted += n
        if err:
            failures.append((ep["id"], err))
        # Commit after every episode (crash safety).
        db.commit()
        print(f"  [{i+1}/{len(episodes)}] {ep['id']}: +{n} quotes "
              f"(total={total_inserted}, cost=${budget.cost():.4f}, err={'-' if not err else err[:80]})", flush=True)
        time.sleep(0.4)

    db.commit()

    print()
    print("=== Zusammenfassung Aufgabe B ===")
    print(f"  Quotes inserted: {total_inserted}")
    print(f"  Tokens in={budget.tokens_in} out={budget.tokens_out}")
    print(f"  Kosten ~${budget.cost():.4f}")
    if failures:
        print(f"  Fehler in {len(failures)} Episoden, erste 5:")
        for ep_id, err in failures[:5]:
            print(f"    {ep_id}: {err[:120]}")

    # Sanity check: quotes per episode.
    counts = db.execute(
        "SELECT episode_id, COUNT(*) c FROM quotes WHERE podcast_id=? GROUP BY episode_id ORDER BY c",
        (PODCAST_ID,),
    ).fetchall()
    print(f"  Episoden mit Quotes: {len(counts)}")
    if counts:
        cs = [c["c"] for c in counts]
        print(f"  Quotes/Episode: min={min(cs)} max={max(cs)} mean={sum(cs)/len(cs):.1f}")


def main():
    """Extract quotes for every not-yet-processed episode of ``PODCAST_ID``.

    Exits with status 1 when no API key is configured. Otherwise opens the
    database at ``DB_PATH``, processes the episodes one by one (committing
    after each), stops early once the budget is exceeded, and prints a
    summary. The connection is closed even when processing fails.
    """
    if not API_KEY:
        print("DASHSCOPE_API_KEY nicht gesetzt.")
        sys.exit(1)
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL, timeout=60.0, max_retries=1)
    db = sqlite3.connect(DB_PATH, timeout=30.0)
    try:
        _run_extraction(db, client)
    finally:
        db.close()


# Run the extraction only when this file is executed as a script.
if __name__ == "__main__":
    main()