# Generic tool for building interactive mindmap visualizations from podcast
# transcripts. Includes: audio download, SRT conversion, quote-timestamp
# matching, D3.js mindmap webapp. Configurable via project.yaml — no
# podcast-specific content.
#!/usr/bin/env python3
|
|
"""Match quotes from a markdown file to SRT timestamps and build mindmap_data.json."""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from difflib import SequenceMatcher
|
|
from config import load_project
|
|
|
|
|
|
def parse_quotes_md(filepath):
    """Parse a quotes markdown file into a list of quote dicts.

    Expected format:

        ### Section Title

        > "Quote text" -- Speaker (Episode-ID)

    Args:
        filepath: Path to the markdown file (read as UTF-8).

    Returns:
        List of dicts with keys ``text``, ``speaker``, ``episode`` and
        ``section``. When an attribution cannot be parsed, ``speaker``
        falls back to "Unknown" and ``episode`` to "".
    """
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    quotes = []
    current_section = ""

    for line in content.split('\n'):
        if line.startswith('### '):
            current_section = line[4:].strip()
        elif line.startswith('> '):
            # Extract quote text (up to the closing quote, if one exists).
            text_match = re.match(r'>\s*"?(.+?)(?:"|$)', line)
            if not text_match:
                continue
            text = text_match.group(1).strip().rstrip('"')
            # Bug fix: on unquoted lines the capture runs to end-of-line and
            # would include the "-- Speaker (Episode)" attribution; strip it.
            if '"' not in line:
                text = re.split(r'\s+--\s+', text)[0].strip()

            # Extract speaker and episode from the attribution.
            attr_match = re.search(r'--\s*(.+?)\s*\((\w+)\)', line)
            if attr_match:
                speaker = attr_match.group(1).strip()
                episode = attr_match.group(2).strip()
            else:
                # Fallback: attribution without a simple "(word)" episode tag.
                attr_match = re.search(r'--\s*(.+?)$', line)
                speaker = attr_match.group(1).strip() if attr_match else "Unknown"
                # Bug fix: drop a trailing "(...)" marker (e.g. a hyphenated
                # episode id that \w+ could not match) from the speaker name.
                speaker = re.sub(r'\s*\([^)]*\)\s*$', '', speaker) or "Unknown"
                ep_match = re.search(r'\((S\d+E\d+)\)', line)
                episode = ep_match.group(1) if ep_match else ""

            quotes.append({
                "text": text,
                "speaker": speaker,
                "episode": episode,
                "section": current_section
            })

    return quotes
|
|
|
|
|
|
def parse_themes_md(filepath):
    """Parse a themes markdown file.

    Expected format:

        ### 1. Theme Title -- Description
        Text mentioning episodes like S1E1, S2E3...

    Returns:
        List of {id, label, description, episodes, color} dicts, one per
        numbered "###" section, with colors cycled from a fixed palette.
    """
    with open(filepath, "r", encoding="utf-8") as fh:
        raw = fh.read()

    # Fixed palette, assigned to themes in order of appearance (cycled).
    palette = ["#e63946", "#457b9d", "#f4a261", "#2a9d8f", "#264653",
               "#e9c46a", "#9b5de5", "#ef476f", "#06d6a0", "#118ab2"]

    parsed = []
    chunks = re.split(r'###\s+\d+\.\s+', raw)[1:]

    for idx, chunk in enumerate(chunks):
        body_lines = chunk.strip().split('\n')
        header = body_lines[0]

        # The header may carry a description after "--" or an em dash.
        label, description = header.rstrip(), ""
        for sep in (' -- ', ' — '):
            if sep in header:
                label, description = header.split(sep, 1)
                break

        label = label.strip()
        description = description.strip()

        # Collect every SxEy reference anywhere in the section body.
        episodes = sorted(set(re.findall(r'S\d+E\d+', '\n'.join(body_lines))))

        parsed.append({
            # Slug id: lowercase alphanumerics of the label, capped at 20 chars.
            "id": re.sub(r'[^a-z0-9]', '', label.lower())[:20],
            "label": label,
            "description": description,
            "episodes": episodes,
            "color": palette[idx % len(palette)],
        })

    return parsed
|
|
|
|
|
|
def normalize(text):
    """Lowercase *text* and reduce it to bare words for fuzzy comparison."""
    cleaned = text.lower()
    # Order matters: strip bracketed annotations (e.g. "[music]") first,
    # then treat double dashes and remaining punctuation as separators.
    for pattern, repl in ((r'\[.*?\]', ''), (r'--', ' '), (r'[^\w\s]', ' ')):
        cleaned = re.sub(pattern, repl, cleaned)
    return re.sub(r'\s+', ' ', cleaned).strip()
|
|
|
|
|
|
def find_best_window(quote_text, entries, max_window=6):
    """Locate the consecutive SRT-entry window that best matches a quote.

    Args:
        quote_text: The quote to locate.
        entries: List of (start_sec, end_sec, text) tuples from parse_srt().
        max_window: Largest number of consecutive entries to consider.

    Returns:
        (start, end, score) of the best-scoring window, or (None, None, 0)
        when no window scores above zero.
    """
    target = normalize(quote_text)
    # Up to 8 longer words act as a cheap pre-filter before fuzzy scoring.
    keywords = [w for w in target.split() if len(w) > 4][:8]

    best = (None, None, 0)

    upper = min(max_window + 1, len(entries) + 1)
    for size in range(1, upper):
        for offset in range(len(entries) - size + 1):
            span = entries[offset:offset + size]
            candidate = normalize(' '.join(part[2] for part in span))

            # Skip windows sharing too few keywords with the quote.
            if keywords:
                hits = sum(kw in candidate for kw in keywords)
                if hits < len(keywords) * 0.4:
                    continue

            ratio = SequenceMatcher(None, target, candidate).ratio()

            # Small bonus nudges the choice toward tight (<30s) windows.
            if span[-1][1] - span[0][0] < 30:
                ratio *= 1.05

            if ratio > best[2]:
                best = (span[0][0], span[-1][1], ratio)

    return best
|
|
|
|
|
|
def parse_srt(filepath):
    """Parse an SRT subtitle file into timed text entries.

    Args:
        filepath: Path to the .srt file (read as UTF-8). Both comma and
            dot millisecond separators are accepted.

    Returns:
        List of (start_seconds, end_seconds, text) tuples. Blocks lacking
        a numeric index line, a parsable timestamp line, or any text are
        skipped. A leading "Speaker N:" diarization prefix is stripped.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    def to_sec(ts):
        # "HH:MM:SS,mmm" (or "."-separated) -> float seconds.
        ts = ts.replace(',', '.')
        h, m, s = ts.split(':')
        return float(h) * 3600 + float(m) * 60 + float(s)

    # Hoisted out of the loop: compile the timestamp pattern once instead
    # of re-parsing it per block. \d{2,} also tolerates 3+ digit hours.
    ts_re = re.compile(
        r'(\d{2,}:\d{2}:\d{2}[,.]\d{3})\s*-->\s*(\d{2,}:\d{2}:\d{2}[,.]\d{3})'
    )

    entries = []
    for block in re.split(r'\n\n+', content.strip()):
        lines = block.strip().split('\n')
        if len(lines) < 2:
            continue
        try:
            int(lines[0].strip())
        except ValueError:
            # First line is not a subtitle index; skip malformed block.
            continue
        ts_match = ts_re.match(lines[1])
        if not ts_match:
            continue
        start = to_sec(ts_match.group(1))
        end = to_sec(ts_match.group(2))
        text = ' '.join(lines[2:]).strip()
        # Drop diarization prefixes like "Speaker 1: ".
        text = re.sub(r'^Speaker \d+:\s*', '', text)
        if text:
            entries.append((start, end, text))
    return entries
|
|
|
|
|
|
def main():
    """Build ``mindmap_data.json`` for a project directory.

    Usage: ``match_quotes.py [project_dir]`` (defaults to ".").

    Loads the project config, parses the quotes/themes markdown files,
    fuzzy-matches each quote against its episode's SRT transcript, and
    writes the combined JSON into the project's data directory. Prints
    a short summary of how many quotes received timestamps.
    """
    project_dir = sys.argv[1] if len(sys.argv) > 1 else "."
    config = load_project(project_dir)
    audio_dir = config["_audio_dir"]
    data_dir = config["_data_dir"]
    os.makedirs(data_dir, exist_ok=True)

    # Parse quotes (required file).
    quotes_path = os.path.join(project_dir, config.get("quotes_file", "quotes.md"))
    quotes = parse_quotes_md(quotes_path)
    print(f"Parsed {len(quotes)} quotes from {quotes_path}")

    # Parse themes (optional file).
    themes_path = os.path.join(project_dir, config.get("themes_file", "themes.md"))
    themes = []
    if os.path.exists(themes_path):
        themes = parse_themes_md(themes_path)
        print(f"Parsed {len(themes)} themes from {themes_path}")

    # Load SRT data per episode.
    srt_data = {}
    for ep in config["episodes"]:
        # Robustness fix: an episode without an SRT key no longer raises
        # KeyError (consistent with the _audio_files .get() lookups below).
        srt_key = config["_srt_keys"].get(ep["id"])
        if srt_key is None:
            continue
        srt_path = os.path.join(audio_dir, f"{srt_key}.srt")
        if os.path.exists(srt_path):
            srt_data[ep["id"]] = parse_srt(srt_path)

    # Build episodes list; audioFile is only kept when it exists on disk.
    episodes_out = []
    for ep in config["episodes"]:
        audio_file = config["_audio_files"].get(ep["id"])
        audio_path = os.path.join(audio_dir, audio_file) if audio_file else None
        episodes_out.append({
            "id": ep["id"],
            "title": ep["title"],
            "guest": ep["guest"],
            "staffel": ep["staffel"],
            "audioFile": audio_file if audio_path and os.path.exists(audio_path) else None
        })

    # Match quotes to timestamps.
    quotes_out = []
    matched = 0
    for i, q in enumerate(quotes):
        ep_id = q["episode"]
        quote_data = {
            "id": f"q{i+1}",
            "text": q["text"],
            "speaker": q["speaker"],
            "episode": ep_id,
            # A quote inherits every theme that references its episode.
            "themes": [t["id"] for t in themes if ep_id in t["episodes"]],
            "startTime": None,
            "endTime": None,
            "audioFile": config["_audio_files"].get(ep_id),
            "isTopQuote": False,
            "verbatim": None,
        }

        if ep_id in srt_data:
            entries = srt_data[ep_id]
            start, end, score = find_best_window(q["text"], entries)
            # 0.3 is the minimum fuzzy-match score to accept a window.
            if start is not None and score > 0.3:
                # Pad the window slightly so playback does not clip words.
                quote_data["startTime"] = round(start - 1.5, 1)
                quote_data["endTime"] = round(end + 1.0, 1)
                matched += 1

                # Collect the verbatim transcript text inside the window.
                nearby = [e for e in entries if e[0] >= start - 1 and e[1] <= end + 1]
                if nearby:
                    verbatim = ' '.join(e[2] for e in nearby).strip()
                    # Capitalize first letter.
                    if verbatim and verbatim[0].islower():
                        verbatim = verbatim[0].upper() + verbatim[1:]
                    # Ensure it ends with punctuation.
                    if verbatim and verbatim[-1] not in '.!?':
                        verbatim += '.'
                    quote_data["verbatim"] = verbatim

        quotes_out.append(quote_data)

    # Staffeln (seasons) pass through from config unchanged.
    staffeln_out = config["staffeln"]

    # Assemble and write the output JSON.
    output = {
        "name": config.get("name", "Podcast"),
        "description": config.get("description", ""),
        "host": config.get("host", ""),
        "themes": themes,
        "episodes": episodes_out,
        "quotes": quotes_out,
        "staffeln": staffeln_out,
    }

    output_path = os.path.join(data_dir, "mindmap_data.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    print(f"\nMatched: {matched}/{len(quotes_out)} quotes with timestamps")
    print(f"Output: {output_path}")
|
|
|
|
|
|
# Script entry point: run the full quote-matching pipeline.
if __name__ == "__main__":
    main()
|