# podcast-mindmap/scripts/match_quotes.py

#!/usr/bin/env python3
"""Match quotes from a markdown file to SRT timestamps and build mindmap_data.json."""
import json
import os
import re
import sys
from difflib import SequenceMatcher
from config import load_project
def parse_quotes_md(filepath):
    """Parse a quotes markdown file.

    Expected format:
        ### Section Title
        > "Quote text" -- Speaker (Episode-ID)

    Returns:
        list[dict]: one {text, speaker, episode, section} per quote line.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    quotes = []
    current_section = ""
    for line in content.split('\n'):
        if line.startswith('### '):
            current_section = line[4:].strip()
        elif line.startswith('> '):
            # Quote text: up to the closing double quote, or end of line when
            # the quote is not wrapped in quotes.
            text_match = re.match(r'>\s*"?(.+?)(?:"|$)', line)
            if not text_match:
                continue
            text = text_match.group(1).strip().rstrip('"')

            # Attribution: `-- Speaker (Episode)`.
            attr_match = re.search(r'--\s*(.+?)\s*\((\w+)\)', line)
            if attr_match:
                speaker = attr_match.group(1).strip()
                episode = attr_match.group(2).strip()
            else:
                # Fall back to `-- Speaker` with a separate (SxEy) reference.
                attr_match = re.search(r'--\s*(.+?)$', line)
                speaker = attr_match.group(1).strip() if attr_match else "Unknown"
                ep_match = re.search(r'\((S\d+E\d+)\)', line)
                episode = ep_match.group(1) if ep_match else ""

            # Bug fix: when the quote is NOT wrapped in double quotes, the
            # text capture above runs to end of line and keeps the
            # attribution inside `text` — strip it off.
            if not line[2:].lstrip().startswith('"'):
                text = re.sub(r'\s*--.*$', '', text).strip()

            quotes.append({
                "text": text,
                "speaker": speaker,
                "episode": episode,
                "section": current_section
            })
    return quotes
def parse_themes_md(filepath):
    """Parse a themes markdown file.

    Expected format:
        ### 1. Theme Title -- Description
        Text mentioning episodes like S1E1, S2E3...

    Returns:
        list[dict]: one {id, label, description, episodes, color} per theme.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    themes = []
    # Default color palette, cycled per theme index.
    colors = ["#e63946", "#457b9d", "#f4a261", "#2a9d8f", "#264653", "#e9c46a", "#9b5de5",
              "#ef476f", "#06d6a0", "#118ab2"]
    sections = re.split(r'###\s+\d+\.\s+', content)[1:]
    for i, section in enumerate(sections):
        lines = section.strip().split('\n')
        title_line = lines[0]
        # Title/description may be separated by ' -- ' or an em dash.
        # Bug fix: the second separator had degraded to an empty string
        # (mangled em dash), so str.split('') raised ValueError for any
        # title without ' -- '. Restore the em-dash separator.
        if ' -- ' in title_line:
            label, description = title_line.split(' -- ', 1)
        elif '\u2014' in title_line:
            label, description = title_line.split('\u2014', 1)
        else:
            label = title_line.rstrip()
            description = ""
        label = label.strip()
        description = description.strip()
        # Collect every SxEy reference anywhere in the section (title included).
        full_text = '\n'.join(lines)
        episodes = sorted(set(re.findall(r'S\d+E\d+', full_text)))
        # Stable slug id: lowercase alphanumerics of the label, max 20 chars.
        theme_id = re.sub(r'[^a-z0-9]', '', label.lower())[:20]
        themes.append({
            "id": theme_id,
            "label": label,
            "description": description,
            "episodes": episodes,
            "color": colors[i % len(colors)]
        })
    return themes
def normalize(text):
    """Lower-case *text* and reduce it to space-separated bare words.

    Used to make quote text and SRT text comparable: bracketed stage
    directions (e.g. ``[laughs]``) are dropped, double dashes and all
    punctuation become spaces, and whitespace is collapsed.
    """
    lowered = text.lower()
    lowered = re.sub(r'\[.*?\]', '', lowered)   # drop [bracketed] asides
    lowered = re.sub(r'--', ' ', lowered)       # double dash -> separator
    lowered = re.sub(r'[^\w\s]', ' ', lowered)  # punctuation -> separator
    return re.sub(r'\s+', ' ', lowered).strip()
def find_best_window(quote_text, entries, max_window=6):
    """Find the contiguous run of SRT entries best matching *quote_text*.

    entries is a list of (start_sec, end_sec, text) tuples. Windows of
    1..max_window consecutive entries are scored with SequenceMatcher
    after a cheap keyword pre-filter; windows shorter than 30 seconds
    get a small score bonus.

    Returns (start, end, score); start/end are None when nothing scored
    above zero.
    """
    target = normalize(quote_text)
    keywords = [word for word in target.split() if len(word) > 4][:8]
    required_hits = len(keywords) * 0.4
    best = (None, None, 0)

    for size in range(1, min(max_window, len(entries)) + 1):
        for offset in range(len(entries) - size + 1):
            window = entries[offset:offset + size]
            candidate = normalize(' '.join(part[2] for part in window))
            # Skip the expensive ratio() unless enough keywords appear.
            if keywords and sum(kw in candidate for kw in keywords) < required_hits:
                continue
            score = SequenceMatcher(None, target, candidate).ratio()
            # Prefer tighter (shorter-duration) windows.
            if window[-1][1] - window[0][0] < 30:
                score *= 1.05
            if score > best[2]:
                best = (window[0][0], window[-1][1], score)

    return best
def parse_srt(filepath):
    """Parse an SRT subtitle file into (start_sec, end_sec, text) entries.

    Blocks without a numeric index line or a valid timestamp line are
    skipped; a leading 'Speaker N:' prefix is stripped from the text and
    entries with empty text are dropped.
    """
    # Fix: helper and timestamp regex were re-created inside the per-block
    # loop; define them once up front (behavior is identical).
    def to_sec(ts):
        # 'HH:MM:SS,mmm' (or '.' as the millisecond separator) -> seconds.
        hours, minutes, rest = ts.replace(',', '.').split(':')
        return float(hours) * 3600 + float(minutes) * 60 + float(rest)

    ts_re = re.compile(
        r'(\d{2}:\d{2}:\d{2}[,.]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[,.]\d{3})'
    )

    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    entries = []
    for block in re.split(r'\n\n+', content.strip()):
        lines = block.strip().split('\n')
        if len(lines) < 2:
            continue
        try:
            int(lines[0].strip())  # SRT blocks start with a numeric index
        except ValueError:
            continue
        ts_match = ts_re.match(lines[1])
        if not ts_match:
            continue
        start = to_sec(ts_match.group(1))
        end = to_sec(ts_match.group(2))
        text = ' '.join(lines[2:]).strip()
        text = re.sub(r'^Speaker \d+:\s*', '', text)
        if text:
            entries.append((start, end, text))
    return entries
def main():
    """Build <data_dir>/mindmap_data.json for a podcast project directory.

    Usage: match_quotes.py [project_dir]   (defaults to ".")

    Loads the project config, parses the quotes and (optional) themes
    markdown files, fuzzy-matches each quote against its episode's SRT
    transcript, and writes the combined mindmap data structure as JSON.
    """
    project_dir = sys.argv[1] if len(sys.argv) > 1 else "."
    config = load_project(project_dir)
    audio_dir = config["_audio_dir"]
    data_dir = config["_data_dir"]
    os.makedirs(data_dir, exist_ok=True)

    # Parse quotes (required file).
    quotes_path = os.path.join(project_dir, config.get("quotes_file", "quotes.md"))
    quotes = parse_quotes_md(quotes_path)
    print(f"Parsed {len(quotes)} quotes from {quotes_path}")

    # Parse themes (optional file).
    themes_path = os.path.join(project_dir, config.get("themes_file", "themes.md"))
    themes = []
    if os.path.exists(themes_path):
        themes = parse_themes_md(themes_path)
        print(f"Parsed {len(themes)} themes from {themes_path}")

    # Load SRT transcripts keyed by episode id; missing files are skipped.
    srt_data = {}
    for ep in config["episodes"]:
        srt_key = config["_srt_keys"][ep["id"]]
        srt_path = os.path.join(audio_dir, f"{srt_key}.srt")
        if os.path.exists(srt_path):
            srt_data[ep["id"]] = parse_srt(srt_path)

    # Build episodes list; audioFile is None when the file is missing on disk.
    episodes_out = []
    for ep in config["episodes"]:
        audio_file = config["_audio_files"].get(ep["id"])
        audio_path = os.path.join(audio_dir, audio_file) if audio_file else None
        episodes_out.append({
            "id": ep["id"],
            "title": ep["title"],
            "guest": ep["guest"],
            "staffel": ep["staffel"],
            "audioFile": audio_file if audio_path and os.path.exists(audio_path) else None
        })

    # Match quotes to timestamps.
    quotes_out = []
    matched = 0
    for i, q in enumerate(quotes):
        ep_id = q["episode"]
        quote_data = {
            "id": f"q{i+1}",
            "text": q["text"],
            "speaker": q["speaker"],
            "episode": ep_id,
            # A quote inherits every theme that references its episode.
            "themes": [t["id"] for t in themes if ep_id in t["episodes"]],
            "startTime": None,
            "endTime": None,
            "audioFile": config["_audio_files"].get(ep_id),
            "isTopQuote": False,
            "verbatim": None,
        }
        if ep_id in srt_data:
            entries = srt_data[ep_id]
            start, end, score = find_best_window(q["text"], entries)
            if start is not None and score > 0.3:
                # Pad the clip slightly for playback. Bug fix: clamp the
                # padded start at 0 so quotes matched within the first
                # 1.5 s of the audio can't produce a negative seek time.
                quote_data["startTime"] = round(max(0.0, start - 1.5), 1)
                quote_data["endTime"] = round(end + 1.0, 1)
                matched += 1
                # Verbatim transcript text from the (unpadded) matched window.
                nearby = [e for e in entries if e[0] >= start - 1 and e[1] <= end + 1]
                if nearby:
                    verbatim = ' '.join(e[2] for e in nearby).strip()
                    # Capitalize first letter
                    if verbatim and verbatim[0].islower():
                        verbatim = verbatim[0].upper() + verbatim[1:]
                    # Ensure ends with punctuation
                    if verbatim and verbatim[-1] not in '.!?':
                        verbatim += '.'
                    quote_data["verbatim"] = verbatim
        quotes_out.append(quote_data)

    # Assemble and write the final JSON document.
    output = {
        "name": config.get("name", "Podcast"),
        "description": config.get("description", ""),
        "host": config.get("host", ""),
        "themes": themes,
        "episodes": episodes_out,
        "quotes": quotes_out,
        "staffeln": config["staffeln"],
    }
    output_path = os.path.join(data_dir, "mindmap_data.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"\nMatched: {matched}/{len(quotes_out)} quotes with timestamps")
    print(f"Output: {output_path}")


if __name__ == "__main__":
    main()