- Pro Episode: Paragraphen mit [P0]-Markern an qwen-plus, Antwort 3-5 markante Zitate als JSON-Array (para_idx, text, verbatim, speaker, is_top, themes). - Theme-IDs werden gegen die in der DB hinterlegten themes-Liste validiert; unbekannte Themes fallen auf das leere Array zurueck. - Audio-Timestamps kommen aus der paragraphs-Tabelle ueber para_idx, dadurch keine SRT-Reparsing-Schritte noetig. - Hard-Budget 1,50 USD je Lauf, Skip vorhandener Episoden, Crash-Sicherheit durch Commit nach jeder Episode. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
295 lines
9.9 KiB
Python
295 lines
9.9 KiB
Python
#!/usr/bin/env python3
|
|
"""Auto-Quote-Extraktion fuer einen Podcast (z.B. LdN).
|
|
|
|
Pro Episode: Qwen erhaelt das (gekuerzte) Transkript als Paragraphen-Array.
|
|
Output: 3-5 markante Zitate als JSON, mit para_idx, text, verbatim, speaker, is_top, themes.
|
|
|
|
Audio-Timestamps werden aus der `paragraphs`-Tabelle ueber `para_idx` zugeordnet.
|
|
|
|
Nutzung:
|
|
DASHSCOPE_API_KEY=... python3 extract_quotes.py [db-pfad] [podcast_id]
|
|
|
|
Bei wiederholtem Aufruf: Episoden mit bestehenden Quotes werden uebersprungen.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
import time
|
|
import sqlite3
|
|
|
|
from openai import OpenAI
|
|
|
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
from json_utils import parse_llm_json
|
|
|
|
# Positional command-line arguments override the defaults:
# argv[1] is the SQLite database path, argv[2] the podcast id.
DB_PATH = "data/db.sqlite" if len(sys.argv) <= 1 else sys.argv[1]
PODCAST_ID = "ldn" if len(sys.argv) <= 2 else sys.argv[2]

# API access; an empty key makes main() exit early.
API_KEY = os.getenv("DASHSCOPE_API_KEY", "")
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
MODEL = "qwen-plus"

# Conservative per-token prices used for budget tracking (qwen-plus intl).
COST_IN = 0.0008 / 1000
COST_OUT = 0.002 / 1000

# Maximum number of characters sent to the model per paragraph.
PARA_CHAR_LIMIT = 600
# Spending cap per run; processing stops once the estimate exceeds it.
HARD_BUDGET_USD = 1.50
|
|
|
|
SYSTEM_PROMPT = """Du bist Diskursanalyst. Du erhaeltst ein Podcast-Transkript als Paragraphen-Liste mit Index-Markern [P0], [P1], ...
|
|
|
|
Waehle 3 bis 5 markante Zitate, die fuer diese Episode/diesen Diskurs charakteristisch sind. Praeferenz fuer:
|
|
- Pointierte Aussagen, Thesen, Schluesselformulierungen
|
|
- Konkrete Beispiele, die ein groesseres Argument verdichten
|
|
- Aussagen mit klarer Sprecher-Position
|
|
|
|
KEINE Floskeln, KEINE Begruessungen, KEINE Werbeblock-Zitate.
|
|
|
|
Antworte NUR mit einem JSON-Array. Jedes Element:
|
|
{
|
|
"para_idx": <int>, // Index des Paragraphen (P-Marker)
|
|
"text": "<geglaettete Form, ohne Fuellwoerter, ein Satz>",
|
|
"verbatim": "<woertlicher Snippet aus dem Transkript, max. 2 Saetze>",
|
|
"speaker": "<Name oder leerer String>",
|
|
"is_top": <true wenn dies das prominenteste Zitat der Episode ist, sonst false; max. 1x true pro Antwort>,
|
|
"themes": ["<theme-id>", ...] // ZWINGEND nur aus den erlaubten IDs (siehe unten); leeres Array wenn unklar
|
|
}
|
|
|
|
ERLAUBTE THEME-IDs (NUR diese verwenden, sonst leeres Array):
|
|
gaza-nahost, haushalt-investitionen, klima-verkehr, krieg-ukraine, migration-asyl, parteienlandschaft, trump-usa
|
|
"""
|
|
|
|
|
|
class Budget:
    """Running tally of LLM token usage, checked against a hard spending cap."""

    def __init__(self, hard_limit_usd):
        """Start with zero tokens counted and remember the cap."""
        self.tokens_in = 0
        self.tokens_out = 0
        self.hard_limit = hard_limit_usd

    def add(self, usage):
        """Accumulate token counts from an API usage object.

        A falsy ``usage`` is ignored; missing or ``None`` counters count as 0.
        """
        if not usage:
            return
        prompt = getattr(usage, "prompt_tokens", 0)
        completion = getattr(usage, "completion_tokens", 0)
        self.tokens_in += prompt if prompt else 0
        self.tokens_out += completion if completion else 0

    def cost(self):
        """Return the estimated spend from the module-level per-token prices."""
        spent_in = self.tokens_in * COST_IN
        spent_out = self.tokens_out * COST_OUT
        return spent_in + spent_out

    def over(self):
        """Return True once the estimated spend is strictly above the cap."""
        return self.hard_limit < self.cost()
|
|
|
|
|
|
def load_themes(db, podcast_id):
    """Return the theme ids stored in the ``themes`` table for ``podcast_id``.

    ``db`` must yield rows that support access by column name.
    """
    cursor = db.execute("SELECT id FROM themes WHERE podcast_id=?", (podcast_id,))
    theme_ids = []
    for row in cursor.fetchall():
        theme_ids.append(row["id"])
    return theme_ids
|
|
|
|
|
|
def build_user_msg(episode, paragraphs):
|
|
head = f"Episode {episode['id']}: {episode['title'][:200]}"
|
|
if episode.get("guest"):
|
|
head += f" (Gast: {episode['guest']})"
|
|
blocks = []
|
|
for p in paragraphs:
|
|
snippet = p["text"][:PARA_CHAR_LIMIT]
|
|
blocks.append(f"[P{p['idx']}] {snippet}")
|
|
return head + "\n\n" + "\n\n".join(blocks)
|
|
|
|
|
|
def call_llm(client, user_msg, budget):
    """Ask the chat model for quotes and parse its reply as a JSON array.

    The request is attempted at most twice. Request errors and replies without
    content are retried once after a 2 second pause. A reply that
    ``parse_llm_json`` rejects with ``ValueError`` is not retried.

    Args:
        client: OpenAI-compatible client exposing ``chat.completions.create``.
        user_msg: User message with the episode header and paragraphs.
        budget: Object whose ``add`` receives the token usage of each reply.

    Returns:
        ``(parsed, None)`` on success, otherwise ``(None, error_text)``.
    """
    # The messages are identical for both attempts, so build them once.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_msg},
    ]
    last_err = None
    for attempt in range(2):
        try:
            resp = client.chat.completions.create(
                model=MODEL,
                messages=messages,
                temperature=0.2,
                max_tokens=1500,
            )
            # Count usage even if the reply later fails to parse.
            budget.add(getattr(resp, "usage", None))
            content = resp.choices[0].message.content
            if content is None:
                # Without this guard the parse-error formatter below would
                # raise TypeError on ``content[:200]`` and hide the real cause.
                raise RuntimeError("LLM response has no message content")
            try:
                return parse_llm_json(content, expect="array"), None
            except ValueError as pe:
                last_err = f"parse: {pe} :: head={content[:200]}"
                break
        except Exception as e:
            # Boundary around the remote call: record the error, retry once.
            last_err = str(e)
            if attempt < 1:
                time.sleep(2)
    return None, last_err
|
|
|
|
|
|
def next_quote_id(db, podcast_id):
    """Return the next free numeric suffix for quote ids of the form ``q<N>``.

    Only ids of ``podcast_id`` that consist of ``q`` followed by digits are
    considered; the result is one more than the largest number found, or 1
    when there is none.
    """
    cursor = db.execute(
        "SELECT id FROM quotes WHERE podcast_id=? AND id LIKE 'q%'", (podcast_id,)
    )
    # Seed with 0 so an empty table yields 1.
    numbers = [0]
    for row in cursor.fetchall():
        found = re.match(r"q(\d+)$", row["id"])
        if found is not None:
            numbers.append(int(found.group(1)))
    return max(numbers) + 1
|
|
|
|
|
|
def _clean_str(value):
    """Return ``value`` stripped if it is a string, otherwise an empty string."""
    return value.strip() if isinstance(value, str) else ""


def _parse_para_idx(raw):
    """Convert the model's ``para_idx`` to an int, or None if it is unusable.

    Accepts numbers, numeric strings and the echoed marker forms ``"P12"`` /
    ``"[P12]"``. Booleans are rejected so ``true`` is not read as paragraph 1.
    """
    if isinstance(raw, bool):
        return None
    if isinstance(raw, str):
        m = re.fullmatch(r"\s*\[?\s*[Pp]?\s*(-?\d+)\s*\]?\s*", raw)
        return int(m.group(1)) if m else None
    try:
        return int(raw)
    except (TypeError, ValueError, OverflowError):
        # None, other non-numeric types, NaN and infinite floats.
        return None


def process_episode(db, client, episode, allowed_themes, budget, next_id):
    """Extract quotes for one episode via the LLM and insert them into ``quotes``.

    Args:
        db: SQLite connection whose rows support access by column name.
        client: OpenAI-compatible client passed through to ``call_llm``.
        episode: Row/mapping with ``id``, ``podcast_id``, ``title``, ``guest``.
        allowed_themes: Theme ids a quote may carry; other themes are dropped.
        budget: Budget tracker passed through to ``call_llm``.
        next_id: Numeric suffix to use for the next quote id (``q<next_id>``).

    Returns:
        ``(inserted, next_id, error)``: number of inserted quotes, the updated
        id counter, and an error description or ``None``.

    Nothing is committed here; the caller decides when to commit.
    """
    paras = db.execute(
        "SELECT idx, start_time, end_time, text FROM paragraphs "
        "WHERE podcast_id=? AND episode_id=? ORDER BY idx",
        (episode["podcast_id"], episode["id"]),
    ).fetchall()
    if not paras:
        return 0, next_id, "no-paragraphs"

    paragraph_dicts = [dict(p) for p in paras]
    para_lookup = {p["idx"]: p for p in paragraph_dicts}

    user_msg = build_user_msg(dict(episode), paragraph_dicts)
    result, err = call_llm(client, user_msg, budget)
    if result is None:
        return 0, next_id, f"llm-fail: {err}"
    if not isinstance(result, list):
        return 0, next_id, "llm: no array"

    inserted = 0
    top_count = 0
    for item in result:
        if not isinstance(item, dict):
            continue

        # Timestamps come from the referenced paragraph, so an item without a
        # resolvable paragraph index cannot be stored.
        idx = _parse_para_idx(item.get("para_idx"))
        para = para_lookup.get(idx) if idx is not None else None
        if para is None:
            continue

        # Model output is untrusted: non-string fields are treated as empty
        # instead of raising halfway through the episode.
        text = _clean_str(item.get("text"))
        verbatim = _clean_str(item.get("verbatim"))
        speaker = _clean_str(item.get("speaker"))
        if not text and not verbatim:
            continue

        themes_raw = item.get("themes") or []
        if not isinstance(themes_raw, list):
            themes_raw = []
        themes = [t for t in themes_raw if t in allowed_themes]

        # Only the first quote flagged by the model keeps the top flag.
        is_top = bool(item.get("is_top")) and top_count == 0
        if is_top:
            top_count += 1

        qid = f"q{next_id}"
        next_id += 1
        try:
            db.execute(
                "INSERT INTO quotes (id, podcast_id, episode_id, text, verbatim, speaker, "
                "start_time, end_time, is_top_quote, themes_json) "
                "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                (qid, episode["podcast_id"], episode["id"],
                 text[:1000], verbatim[:2000], speaker[:200],
                 para["start_time"], para["end_time"], 1 if is_top else 0,
                 json.dumps(themes, ensure_ascii=False)),
            )
            inserted += 1
        except sqlite3.IntegrityError:
            # Quote id already taken: skip this quote; the id stays consumed.
            pass
    return inserted, next_id, None
|
|
|
|
|
|
def main():
    """Extract quotes for every not-yet-processed episode of ``PODCAST_ID``.

    Episodes that already have rows in ``quotes`` are skipped. Processing
    stops once the estimated spend exceeds ``HARD_BUDGET_USD``. Each episode
    is committed on its own, so an interrupted run keeps finished episodes.
    Exits with status 1 when no API key is configured.
    """
    if not API_KEY:
        print("DASHSCOPE_API_KEY nicht gesetzt.")
        sys.exit(1)
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL, timeout=60.0, max_retries=1)
    db = sqlite3.connect(DB_PATH, timeout=30.0)
    try:
        db.execute("PRAGMA busy_timeout=30000")
        db.row_factory = sqlite3.Row

        # Make sure the quotes table exists (it normally does already).
        db.executescript("""
            CREATE TABLE IF NOT EXISTS quotes (
                id TEXT, podcast_id TEXT, episode_id TEXT,
                text TEXT, verbatim TEXT, speaker TEXT,
                start_time REAL, end_time REAL,
                is_top_quote BOOLEAN, themes_json TEXT,
                PRIMARY KEY (podcast_id, id)
            );
            CREATE INDEX IF NOT EXISTS idx_quotes_episode ON quotes(podcast_id, episode_id);
        """)

        allowed_themes = load_themes(db, PODCAST_ID)
        print(f"Erlaubte Themes ({PODCAST_ID}): {allowed_themes}")

        episodes = db.execute(
            "SELECT id, podcast_id, title, guest FROM episodes WHERE podcast_id=? ORDER BY id",
            (PODCAST_ID,),
        ).fetchall()
        print(f"Verarbeite {len(episodes)} Episoden fuer {PODCAST_ID}…")

        # Episodes that already have quotes are skipped on repeated runs.
        done = {
            r["episode_id"]
            for r in db.execute(
                "SELECT DISTINCT episode_id FROM quotes WHERE podcast_id=?", (PODCAST_ID,)
            ).fetchall()
        }
        print(f" {len(done)} Episoden haben bereits Quotes — werden uebersprungen")

        next_id = next_quote_id(db, PODCAST_ID)
        print(f" Naechste Quote-ID: q{next_id}")

        budget = Budget(hard_limit_usd=HARD_BUDGET_USD)

        total_inserted = 0
        failures = []
        for i, ep in enumerate(episodes):
            if ep["id"] in done:
                continue
            if budget.over():
                print(f"!! Budget ueberschritten ({budget.cost():.4f} USD) — Abbruch")
                break
            try:
                n, next_id, err = process_episode(db, client, ep, allowed_themes, budget, next_id)
            except Exception as e:
                # Discard partial inserts of the failed episode. ``next_id``
                # was not advanced, so keeping those rows would make later
                # quotes collide with their ids and be dropped as duplicates.
                db.rollback()
                # Fall back to the type name so an exception with an empty
                # message is still recorded as a failure.
                n, err = 0, str(e) or type(e).__name__
            total_inserted += n
            if err:
                failures.append((ep["id"], err))
            # Commit after every episode (crash safety).
            db.commit()
            print(f" [{i+1}/{len(episodes)}] {ep['id']}: +{n} quotes "
                  f"(total={total_inserted}, cost=${budget.cost():.4f}, err={'-' if not err else err[:80]})", flush=True)
            time.sleep(0.4)

        db.commit()

        print()
        print("=== Zusammenfassung Aufgabe B ===")
        print(f" Quotes inserted: {total_inserted}")
        print(f" Tokens in={budget.tokens_in} out={budget.tokens_out}")
        print(f" Kosten ~${budget.cost():.4f}")
        if failures:
            print(f" Fehler in {len(failures)} Episoden, erste 5:")
            for ep_id, err in failures[:5]:
                print(f" {ep_id}: {err[:120]}")

        # Sanity check: quotes per episode.
        counts = db.execute(
            "SELECT episode_id, COUNT(*) c FROM quotes WHERE podcast_id=? GROUP BY episode_id ORDER BY c",
            (PODCAST_ID,),
        ).fetchall()
        print(f" Episoden mit Quotes: {len(counts)}")
        if counts:
            cs = [c["c"] for c in counts]
            print(f" Quotes/Episode: min={min(cs)} max={max(cs)} mean={sum(cs)/len(cs):.1f}")
    finally:
        # Release the connection on error paths too.
        db.close()


if __name__ == "__main__":
    main()
|