#!/usr/bin/env python3
|
||
|
|
"""#17 Frage-Antwort-Asymmetrie: Extrahiere Fragen aus Transkripten.
|
||
|
|
|
||
|
|
Nutzung:
|
||
|
|
DASHSCOPE_API_KEY=... python3 extract_questions.py [db-pfad]
|
||
|
|
"""
|
||
|
|
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
import sys
|
||
|
|
import time
|
||
|
|
import sqlite3
|
||
|
|
|
||
|
|
from openai import OpenAI
|
||
|
|
|
||
|
|
# --- Runtime configuration ---------------------------------------------------
# Target SQLite database; the first CLI argument overrides the default path.
DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"
# DashScope credentials and OpenAI-compatible endpoint.
API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
MODEL = "qwen-turbo"
# How many paragraphs are packed into a single model request.
BATCH_SIZE = 3

# Instruction prompt (German): extract and classify every question per
# paragraph, answering strictly as a JSON array. Runtime string — verbatim.
SYSTEM_PROMPT = """Du bist ein Diskursanalyst. Du erhältst Podcast-Transkript-Absätze.
Extrahiere ALLE Fragen (explizite und implizite). Klassifiziere jede Frage.

Antworte NUR mit einem JSON-Array:
[{"paragraph_idx": 0, "questions": [{"text": "Die Frage", "type": "genuine|rhetorical|follow_up|implicit", "answered": "yes|partial|no|self_answered"}]}]

- genuine: Echte Frage, die eine Antwort erwartet
- rhetorical: Rhetorische Frage zur Betonung
- follow_up: Rückfrage/Nachfrage
- implicit: Implizite Frage (z.B. "Da fragt man sich natürlich…")

- answered: Wird die Frage im selben Absatz beantwortet?

Wenn keine Fragen: {"paragraph_idx": 0, "questions": []}"""
|
||
|
|
|
||
|
|
|
||
|
|
def extract_batch(client, paragraphs):
    """Ask the LLM to extract and classify questions for a batch of paragraphs.

    Args:
        client: OpenAI-compatible client (DashScope compatible mode).
        paragraphs: list of dicts with keys ``episode_id``, ``start_time`` and
            ``text``; only the first 600 characters of each text are sent.

    Returns:
        A list of ``{"paragraph_idx": int, "questions": [...]}`` dicts parsed
        from the model reply. On any API or parse error, one empty result per
        paragraph is returned so the caller can keep going; the error is
        reported on stderr instead of being silently swallowed.
    """
    user_msg = "".join(
        f"\n--- Absatz {i} ({p['episode_id']}, {p['start_time']:.0f}s) ---\n{p['text'][:600]}\n"
        for i, p in enumerate(paragraphs)
    )

    try:
        resp = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_msg},
            ],
            temperature=0.1,
            max_tokens=1000,
        )
        content = resp.choices[0].message.content.strip()
        # Strip an optional ```json ... ``` code fence around the reply.
        if content.startswith("```"):
            content = content.split("```")[1].strip()
            if content.startswith("json"):
                content = content[4:].strip()
        parsed = json.loads(content)
        # The prompt demands a JSON array; tolerate a single-object reply.
        if isinstance(parsed, dict):
            parsed = [parsed]
        return parsed
    except Exception as e:
        # Bug fix: the exception was bound but never reported, hiding API
        # outages and malformed JSON. Log it, then degrade gracefully.
        print(f"  WARN extract_batch: {e}", file=sys.stderr)
        return [{"paragraph_idx": i, "questions": []} for i in range(len(paragraphs))]
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Extract and classify questions from all transcript paragraphs.

    Ensures the ``questions`` table exists, skips (podcast, episode, paragraph)
    keys that were already processed (resume support), sends the remaining
    paragraphs to the model in batches of ``BATCH_SIZE``, stores the results,
    and prints per-type / per-answered summary statistics.
    """
    if not API_KEY:
        print("DASHSCOPE_API_KEY nicht gesetzt.")
        sys.exit(1)

    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
    db = sqlite3.connect(DB_PATH)
    db.row_factory = sqlite3.Row

    # Bug fix: the connection was never closed when an exception escaped the
    # extraction loop; try/finally guarantees cleanup.
    try:
        db.executescript("""
        CREATE TABLE IF NOT EXISTS questions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            podcast_id TEXT, episode_id TEXT, paragraph_idx INTEGER,
            question_text TEXT, question_type TEXT,
            answered TEXT DEFAULT 'unknown',
            answered_by_podcast TEXT, answered_by_episode TEXT, answered_by_idx INTEGER,
            start_time REAL
        );
        CREATE INDEX IF NOT EXISTS idx_questions_episode ON questions(podcast_id, episode_id);
        CREATE INDEX IF NOT EXISTS idx_questions_type ON questions(question_type);
        CREATE INDEX IF NOT EXISTS idx_questions_answered ON questions(answered);
        """)

        # Resume support: collect keys already present in `questions`.
        processed_keys = set()
        try:
            for r in db.execute("SELECT DISTINCT podcast_id||'/'||episode_id||'/'||paragraph_idx as k FROM questions").fetchall():
                processed_keys.add(r["k"])
        except sqlite3.Error:
            # Deliberate best effort: the table may be missing on a fresh DB.
            pass

        rows = db.execute(
            "SELECT id, podcast_id, episode_id, idx, start_time, text FROM paragraphs ORDER BY podcast_id, episode_id, idx"
        ).fetchall()

        todo = [r for r in rows if f"{r['podcast_id']}/{r['episode_id']}/{r['idx']}" not in processed_keys]
        print(f"Extrahiere Fragen: {len(todo)} Absätze ({len(rows) - len(todo)} bereits fertig)")

        total_questions = 0
        for i in range(0, len(todo), BATCH_SIZE):
            batch = todo[i:i + BATCH_SIZE]
            paras = [{"episode_id": r["episode_id"], "start_time": r["start_time"] or 0, "text": r["text"]} for r in batch]

            results = extract_batch(client, paras)

            for j, result in enumerate(results):
                if not isinstance(result, dict):
                    continue  # guard against malformed model output
                # Bug fix: the model reports which paragraph each result
                # belongs to; prefer that over positional order so a skipped
                # paragraph does not shift attribution of all later ones.
                # Fall back to the positional index when invalid/absent.
                idx = result.get("paragraph_idx", j)
                if not isinstance(idx, int) or not 0 <= idx < len(batch):
                    if j >= len(batch):
                        break
                    idx = j
                row = batch[idx]
                for q in result.get("questions", []):
                    db.execute(
                        "INSERT INTO questions (podcast_id, episode_id, paragraph_idx, question_text, question_type, answered, start_time) "
                        "VALUES (?, ?, ?, ?, ?, ?, ?)",
                        (row["podcast_id"], row["episode_id"], row["idx"],
                         q.get("text", ""), q.get("type", "unknown"),
                         q.get("answered", "unknown"), row["start_time"])
                    )
                    total_questions += 1

            # Commit (and report progress) every 20 batches.
            if (i // BATCH_SIZE) % 20 == 0:
                db.commit()
                print(f" {min(i + BATCH_SIZE, len(todo))}/{len(todo)} Absätze, {total_questions} Fragen bisher")

            time.sleep(0.2)  # gentle rate limiting between API calls

        db.commit()

        stats = db.execute("SELECT question_type, COUNT(*) as c FROM questions GROUP BY question_type ORDER BY c DESC").fetchall()
        answered_stats = db.execute("SELECT answered, COUNT(*) as c FROM questions GROUP BY answered ORDER BY c DESC").fetchall()
        print(f"\nFertig: {total_questions} Fragen extrahiert.")
        print("Nach Typ:")
        for s in stats:
            print(f" {s['question_type']}: {s['c']}")
        print("Beantwortet:")
        for s in answered_stats:
            print(f" {s['answered']}: {s['c']}")
    finally:
        db.close()
|
||
|
|
|
||
|
|
|
||
|
|
# Entry point: run the extractor only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|