#!/usr/bin/env python3
"""#17 Question-answer asymmetry: extract questions from podcast transcripts.

Reads paragraphs from a SQLite DB, sends them in small batches to a
DashScope-hosted chat model, and stores every extracted question
(classified by type and whether it was answered) in a `questions` table.
Re-runs are incremental: paragraphs already present in `questions` are skipped.

Usage: DASHSCOPE_API_KEY=... python3 extract_questions.py [db-path]
"""
import json
import os
import sqlite3
import sys
import time

from openai import OpenAI

DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"
API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
MODEL = "qwen-turbo"
BATCH_SIZE = 3  # paragraphs per LLM request

# Runtime prompt sent to the model. Kept verbatim in German because the
# transcripts are German and the model is instructed in that language.
SYSTEM_PROMPT = """Du bist ein Diskursanalyst. Du erhältst Podcast-Transkript-Absätze. Extrahiere ALLE Fragen (explizite und implizite). Klassifiziere jede Frage. Antworte NUR mit einem JSON-Array: [{"paragraph_idx": 0, "questions": [{"text": "Die Frage", "type": "genuine|rhetorical|follow_up|implicit", "answered": "yes|partial|no|self_answered"}]}] - genuine: Echte Frage, die eine Antwort erwartet - rhetorical: Rhetorische Frage zur Betonung - follow_up: Rückfrage/Nachfrage - implicit: Implizite Frage (z.B. "Da fragt man sich natürlich…") - answered: Wird die Frage im selben Absatz beantwortet? 
Wenn keine Fragen: {"paragraph_idx": 0, "questions": []}"""


def _strip_code_fence(content: str) -> str:
    """Remove a Markdown ``` / ```json fence the model sometimes wraps around JSON."""
    if content.startswith("```"):
        content = content.split("```")[1].strip()
        if content.startswith("json"):
            content = content[4:].strip()
    return content


def extract_batch(client, paragraphs: list[dict]) -> list[dict]:
    """Ask the model to extract questions from a batch of paragraph dicts.

    Each paragraph dict needs ``episode_id``, ``start_time`` and ``text``.
    Returns the parsed JSON array of ``{"paragraph_idx": int, "questions": [...]}``
    objects. On any API or parse error the batch degrades to empty question
    lists (best-effort: one failed batch must not abort the whole run), and
    the error is logged to stderr instead of being silently swallowed.
    """
    parts = []
    for idx, para in enumerate(paragraphs):
        # Truncate to 600 chars to bound token usage per paragraph.
        parts.append(
            f"\n--- Absatz {idx} ({para['episode_id']}, {para['start_time']:.0f}s) ---\n{para['text'][:600]}\n"
        )
    user_msg = "".join(parts)
    try:
        resp = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_msg},
            ],
            temperature=0.1,  # near-deterministic extraction
            max_tokens=1000,
        )
        content = resp.choices[0].message.content.strip()
        return json.loads(_strip_code_fence(content))
    except Exception as e:  # noqa: BLE001 - deliberate best-effort boundary
        print(f"  Batch-Fehler: {e}", file=sys.stderr)
        return [{"paragraph_idx": k, "questions": []} for k in range(len(paragraphs))]


def _ensure_schema(db: sqlite3.Connection) -> None:
    """Create the `questions` table and its indexes if they do not exist."""
    db.executescript("""
    CREATE TABLE IF NOT EXISTS questions (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        podcast_id TEXT,
        episode_id TEXT,
        paragraph_idx INTEGER,
        question_text TEXT,
        question_type TEXT,
        answered TEXT DEFAULT 'unknown',
        answered_by_podcast TEXT,
        answered_by_episode TEXT,
        answered_by_idx INTEGER,
        start_time REAL
    );
    CREATE INDEX IF NOT EXISTS idx_questions_episode ON questions(podcast_id, episode_id);
    CREATE INDEX IF NOT EXISTS idx_questions_type ON questions(question_type);
    CREATE INDEX IF NOT EXISTS idx_questions_answered ON questions(answered);
    """)


def _load_processed_keys(db: sqlite3.Connection) -> set[str]:
    """Return 'podcast/episode/idx' keys of paragraphs already processed."""
    keys: set[str] = set()
    try:
        rows = db.execute(
            "SELECT DISTINCT podcast_id||'/'||episode_id||'/'||paragraph_idx as k FROM questions"
        ).fetchall()
        keys.update(r["k"] for r in rows)
    except Exception:
        # Table may not exist yet on a fresh DB; treat as "nothing processed".
        pass
    return keys


def main() -> None:
    """Walk all unprocessed paragraphs, extract questions, persist, report stats."""
    if not API_KEY:
        print("DASHSCOPE_API_KEY nicht gesetzt.")
        sys.exit(1)

    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
    db = sqlite3.connect(DB_PATH)
    db.row_factory = sqlite3.Row
    try:
        _ensure_schema(db)
        processed_keys = _load_processed_keys(db)

        rows = db.execute(
            "SELECT id, podcast_id, episode_id, idx, start_time, text FROM paragraphs ORDER BY podcast_id, episode_id, idx"
        ).fetchall()
        todo = [
            r for r in rows
            if f"{r['podcast_id']}/{r['episode_id']}/{r['idx']}" not in processed_keys
        ]
        print(f"Extrahiere Fragen: {len(todo)} Absätze ({len(rows) - len(todo)} bereits fertig)")

        total_questions = 0
        for i in range(0, len(todo), BATCH_SIZE):
            batch = todo[i:i + BATCH_SIZE]
            paras = [
                {"episode_id": r["episode_id"], "start_time": r["start_time"] or 0, "text": r["text"]}
                for r in batch
            ]
            results = extract_batch(client, paras)
            for j, result in enumerate(results):
                # BUGFIX: honor the paragraph_idx the prompt asks the model to
                # return instead of mapping results to rows purely by position;
                # a skipped or reordered entry would otherwise misattribute
                # questions to the wrong paragraph. Fall back to position.
                idx = result.get("paragraph_idx", j)
                if not isinstance(idx, int) or not 0 <= idx < len(batch):
                    continue
                row = batch[idx]
                for q in result.get("questions", []):
                    db.execute(
                        "INSERT INTO questions (podcast_id, episode_id, paragraph_idx, question_text, question_type, answered, start_time) "
                        "VALUES (?, ?, ?, ?, ?, ?, ?)",
                        (row["podcast_id"], row["episode_id"], row["idx"],
                         q.get("text", ""), q.get("type", "unknown"),
                         q.get("answered", "unknown"), row["start_time"]),
                    )
                    total_questions += 1
            # Commit + progress line every 20 batches (including the first).
            if (i // BATCH_SIZE) % 20 == 0:
                db.commit()
                print(f"  {min(i + BATCH_SIZE, len(todo))}/{len(todo)} Absätze, {total_questions} Fragen bisher")
            time.sleep(0.2)  # gentle rate limiting between API calls

        db.commit()

        stats = db.execute(
            "SELECT question_type, COUNT(*) as c FROM questions GROUP BY question_type ORDER BY c DESC"
        ).fetchall()
        answered_stats = db.execute(
            "SELECT answered, COUNT(*) as c FROM questions GROUP BY answered ORDER BY c DESC"
        ).fetchall()
        print(f"\nFertig: {total_questions} Fragen extrahiert.")
        print("Nach Typ:")
        for s in stats:
            print(f"  {s['question_type']}: {s['c']}")
        print("Beantwortet:")
        for s in answered_stats:
            print(f"  {s['answered']}: {s['c']}")
    finally:
        # Close the connection even if an API/DB error aborts the run.
        db.close()


if __name__ == "__main__":
    main()