#!/usr/bin/env python3
|
||
|
|
"""#17 Frage-Antwort-Asymmetrie: Extrahiere Fragen aus Transkripten.
|
||
|
|
|
||
|
|
Nutzung:
|
||
|
|
DASHSCOPE_API_KEY=... python3 extract_questions.py [db-pfad]
|
||
|
|
"""
|
||
|
|
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
import sys
|
||
|
|
import time
|
||
|
|
import sqlite3
|
||
|
|
|
||
|
|
from openai import OpenAI
|
||
|
|
|
||
|
|
# --- Runtime configuration ---------------------------------------------------
# Target SQLite database; the first CLI argument overrides the default path.
DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"
# DashScope credentials and OpenAI-compatible endpoint.
API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
MODEL = "qwen-turbo"
# How many paragraphs are packed into a single model request.
BATCH_SIZE = 3

# Instruction prompt (German): extract and classify every question per
# paragraph, answering strictly as a JSON array. Runtime string — verbatim.
SYSTEM_PROMPT = """Du bist ein Diskursanalyst. Du erhältst Podcast-Transkript-Absätze.
Extrahiere ALLE Fragen (explizite und implizite). Klassifiziere jede Frage.

Antworte NUR mit einem JSON-Array:
[{"paragraph_idx": 0, "questions": [{"text": "Die Frage", "type": "genuine|rhetorical|follow_up|implicit", "answered": "yes|partial|no|self_answered"}]}]

- genuine: Echte Frage, die eine Antwort erwartet
- rhetorical: Rhetorische Frage zur Betonung
- follow_up: Rückfrage/Nachfrage
- implicit: Implizite Frage (z.B. "Da fragt man sich natürlich…")

- answered: Wird die Frage im selben Absatz beantwortet?

Wenn keine Fragen: {"paragraph_idx": 0, "questions": []}"""
|
||
|
|
|
||
|
|
|
||
|
|
def extract_batch(client, paragraphs):
    """Ask the LLM to extract and classify questions for a batch of paragraphs.

    Args:
        client: OpenAI-compatible client (DashScope compatible mode).
        paragraphs: list of dicts with keys ``episode_id``, ``start_time`` and
            ``text``; only the first 600 characters of each text are sent.

    Returns:
        A list of ``{"paragraph_idx": int, "questions": [...]}`` dicts parsed
        from the model reply. On any API or parse error, one empty result per
        paragraph is returned so the caller can keep going; the error is
        reported on stderr instead of being silently swallowed.
    """
    user_msg = "".join(
        f"\n--- Absatz {i} ({p['episode_id']}, {p['start_time']:.0f}s) ---\n{p['text'][:600]}\n"
        for i, p in enumerate(paragraphs)
    )

    try:
        resp = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_msg},
            ],
            temperature=0.1,
            max_tokens=1000,
        )
        content = resp.choices[0].message.content.strip()
        # Strip an optional ```json ... ``` code fence around the reply.
        if content.startswith("```"):
            content = content.split("```")[1].strip()
            if content.startswith("json"):
                content = content[4:].strip()
        parsed = json.loads(content)
        # The prompt demands a JSON array; tolerate a single-object reply.
        if isinstance(parsed, dict):
            parsed = [parsed]
        return parsed
    except Exception as e:
        # Bug fix: the exception was bound but never reported, hiding API
        # outages and malformed JSON. Log it, then degrade gracefully.
        print(f"  WARN extract_batch: {e}", file=sys.stderr)
        return [{"paragraph_idx": i, "questions": []} for i in range(len(paragraphs))]
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Extract and classify questions from all transcript paragraphs.

    Ensures the ``questions`` table exists, skips (podcast, episode, paragraph)
    keys that were already processed (resume support), sends the remaining
    paragraphs to the model in batches of ``BATCH_SIZE``, stores the results,
    and prints per-type / per-answered summary statistics.
    """
    if not API_KEY:
        print("DASHSCOPE_API_KEY nicht gesetzt.")
        sys.exit(1)

    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
    db = sqlite3.connect(DB_PATH)
    db.row_factory = sqlite3.Row

    # Bug fix: the connection was never closed when an exception escaped the
    # extraction loop; try/finally guarantees cleanup.
    try:
        db.executescript("""
        CREATE TABLE IF NOT EXISTS questions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            podcast_id TEXT, episode_id TEXT, paragraph_idx INTEGER,
            question_text TEXT, question_type TEXT,
            answered TEXT DEFAULT 'unknown',
            answered_by_podcast TEXT, answered_by_episode TEXT, answered_by_idx INTEGER,
            start_time REAL
        );
        CREATE INDEX IF NOT EXISTS idx_questions_episode ON questions(podcast_id, episode_id);
        CREATE INDEX IF NOT EXISTS idx_questions_type ON questions(question_type);
        CREATE INDEX IF NOT EXISTS idx_questions_answered ON questions(answered);
        """)

        # Resume support: collect keys already present in `questions`.
        processed_keys = set()
        try:
            for r in db.execute("SELECT DISTINCT podcast_id||'/'||episode_id||'/'||paragraph_idx as k FROM questions").fetchall():
                processed_keys.add(r["k"])
        except sqlite3.Error:
            # Deliberate best effort: the table may be missing on a fresh DB.
            pass

        rows = db.execute(
            "SELECT id, podcast_id, episode_id, idx, start_time, text FROM paragraphs ORDER BY podcast_id, episode_id, idx"
        ).fetchall()

        todo = [r for r in rows if f"{r['podcast_id']}/{r['episode_id']}/{r['idx']}" not in processed_keys]
        print(f"Extrahiere Fragen: {len(todo)} Absätze ({len(rows) - len(todo)} bereits fertig)")

        total_questions = 0
        for i in range(0, len(todo), BATCH_SIZE):
            batch = todo[i:i + BATCH_SIZE]
            paras = [{"episode_id": r["episode_id"], "start_time": r["start_time"] or 0, "text": r["text"]} for r in batch]

            results = extract_batch(client, paras)

            for j, result in enumerate(results):
                if not isinstance(result, dict):
                    continue  # guard against malformed model output
                # Bug fix: the model reports which paragraph each result
                # belongs to; prefer that over positional order so a skipped
                # paragraph does not shift attribution of all later ones.
                # Fall back to the positional index when invalid/absent.
                idx = result.get("paragraph_idx", j)
                if not isinstance(idx, int) or not 0 <= idx < len(batch):
                    if j >= len(batch):
                        break
                    idx = j
                row = batch[idx]
                for q in result.get("questions", []):
                    db.execute(
                        "INSERT INTO questions (podcast_id, episode_id, paragraph_idx, question_text, question_type, answered, start_time) "
                        "VALUES (?, ?, ?, ?, ?, ?, ?)",
                        (row["podcast_id"], row["episode_id"], row["idx"],
                         q.get("text", ""), q.get("type", "unknown"),
                         q.get("answered", "unknown"), row["start_time"])
                    )
                    total_questions += 1

            # Commit (and report progress) every 20 batches.
            if (i // BATCH_SIZE) % 20 == 0:
                db.commit()
                print(f" {min(i + BATCH_SIZE, len(todo))}/{len(todo)} Absätze, {total_questions} Fragen bisher")

            time.sleep(0.2)  # gentle rate limiting between API calls

        db.commit()

        stats = db.execute("SELECT question_type, COUNT(*) as c FROM questions GROUP BY question_type ORDER BY c DESC").fetchall()
        answered_stats = db.execute("SELECT answered, COUNT(*) as c FROM questions GROUP BY answered ORDER BY c DESC").fetchall()
        print(f"\nFertig: {total_questions} Fragen extrahiert.")
        print("Nach Typ:")
        for s in stats:
            print(f" {s['question_type']}: {s['c']}")
        print("Beantwortet:")
        for s in answered_stats:
            print(f" {s['answered']}: {s['c']}")
    finally:
        db.close()
|
||
|
|
|
||
|
|
|
||
|
|
# Entry point: run the extractor only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|