#!/usr/bin/env python3
"""Match quotes from a markdown file to SRT timestamps and build mindmap_data.json."""
import json
import os
import re
import sys
from difflib import SequenceMatcher

from config import load_project


def parse_quotes_md(filepath):
    """Parse quotes markdown file.

    Expected format:
        ### Section Title
        > "Quote text" -- Speaker (Episode-ID)

    Returns:
        list of dicts with keys {text, speaker, episode, section}.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    quotes = []
    current_section = ""
    for line in content.split('\n'):
        if line.startswith('### '):
            current_section = line[4:].strip()
        elif line.startswith('> '):
            # Extract quote text: everything after '> ' up to the first
            # closing double quote (or end of line if unquoted).
            text_match = re.match(r'>\s*"?(.+?)(?:"|$)', line)
            if not text_match:
                continue
            text = text_match.group(1).strip().rstrip('"')
            # FIX: on unquoted lines the pattern above captures through the
            # end of the line, so the "-- Speaker (Episode)" attribution
            # leaked into the quote text. Cut off a trailing attribution.
            # Quoted lines already stop at the closing quote and are
            # unaffected by this split.
            text = re.split(r'\s+--\s*', text, maxsplit=1)[0].strip()

            # Extract speaker and episode, e.g. `-- Jane Doe (S1E2)`.
            attr_match = re.search(r'--\s*(.+?)\s*\((\w+)\)', line)
            if attr_match:
                speaker = attr_match.group(1).strip()
                episode = attr_match.group(2).strip()
            else:
                # Fallback: attribution without parenthesized episode id;
                # look for a stand-alone (SxEy) reference anywhere in line.
                attr_match = re.search(r'--\s*(.+?)$', line)
                speaker = attr_match.group(1).strip() if attr_match else "Unknown"
                ep_match = re.search(r'\((S\d+E\d+)\)', line)
                episode = ep_match.group(1) if ep_match else ""

            quotes.append({
                "text": text,
                "speaker": speaker,
                "episode": episode,
                "section": current_section
            })
    return quotes


def parse_themes_md(filepath):
    """Parse themes markdown file.

    Expected format:
        ### 1. Theme Title -- Description
        Text mentioning episodes like S1E1, S2E3...

    Returns:
        list of dicts with keys {id, label, description, episodes, color}.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    themes = []
    # Default colors, assigned round-robin by theme index.
    colors = ["#e63946", "#457b9d", "#f4a261", "#2a9d8f", "#264653",
              "#e9c46a", "#9b5de5", "#ef476f", "#06d6a0", "#118ab2"]

    # Split on numbered headings; [1:] drops any preamble before the first.
    sections = re.split(r'###\s+\d+\.\s+', content)[1:]
    for i, section in enumerate(sections):
        lines = section.strip().split('\n')
        title_line = lines[0]
        # Parse title and description, separated by "--" or an em dash.
        if ' -- ' in title_line:
            label, description = title_line.split(' -- ', 1)
        elif ' — ' in title_line:
            label, description = title_line.split(' — ', 1)
        else:
            label = title_line.rstrip()
            description = ""
        label = label.strip()
        description = description.strip()

        # Extract episode references (SxEy) from the whole section body.
        full_text = '\n'.join(lines)
        episodes = sorted(set(re.findall(r'S\d+E\d+', full_text)))

        # Slug id: lowercase alphanumerics of the label, capped at 20 chars.
        theme_id = re.sub(r'[^a-z0-9]', '', label.lower())[:20]
        themes.append({
            "id": theme_id,
            "label": label,
            "description": description,
            "episodes": episodes,
            "color": colors[i % len(colors)]
        })
    return themes


def normalize(text):
    """Normalize text for fuzzy comparison.

    Lowercases, strips [bracketed] annotations, collapses dashes and
    punctuation to spaces, and squeezes whitespace.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'--', ' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def find_best_window(quote_text, entries, max_window=6):
    """Find best matching window of consecutive SRT entries for a quote.

    Args:
        quote_text: the quote to locate.
        entries: list of (start_sec, end_sec, text) tuples from parse_srt().
        max_window: largest number of consecutive entries to consider.

    Returns:
        (start_sec, end_sec, score) of the best window, or (None, None, 0)
        when entries is empty or nothing passes the keyword pre-filter.
    """
    norm_quote = normalize(quote_text)
    # Up to 8 "long" keywords used as a cheap pre-filter before the
    # expensive SequenceMatcher comparison.
    keywords = [w for w in norm_quote.split() if len(w) > 4][:8]

    best_score = 0
    best_start = None
    best_end = None

    for window_size in range(1, min(max_window + 1, len(entries) + 1)):
        for i in range(len(entries) - window_size + 1):
            window = entries[i:i + window_size]
            window_text = ' '.join(e[2] for e in window)
            norm_window = normalize(window_text)

            # Quick keyword filter: skip windows containing fewer than
            # 40% of the quote's keywords.
            if keywords:
                hits = sum(1 for kw in keywords if kw in norm_window)
                if hits < len(keywords) * 0.4:
                    continue

            score = SequenceMatcher(None, norm_quote, norm_window).ratio()
            # Prefer tighter matches: small bonus for windows under 30s.
            duration = window[-1][1] - window[0][0]
            if duration < 30:
                score *= 1.05

            if score > best_score:
                best_score = score
                best_start = window[0][0]
                best_end = window[-1][1]

    return best_start, best_end, best_score


def _srt_timestamp_to_seconds(ts):
    """Convert an SRT timestamp (HH:MM:SS,mmm or HH:MM:SS.mmm) to seconds."""
    ts = ts.replace(',', '.')
    p = ts.split(':')
    return float(p[0]) * 3600 + float(p[1]) * 60 + float(p[2])


def parse_srt(filepath):
    """Parse SRT file into a list of (start_sec, end_sec, text) entries.

    Blocks without a numeric index or a valid timestamp line are skipped;
    "Speaker N:" prefixes are stripped from the subtitle text.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    blocks = re.split(r'\n\n+', content.strip())
    entries = []
    for block in blocks:
        lines = block.strip().split('\n')
        if len(lines) < 2:
            continue
        # First line must be the numeric cue index.
        try:
            int(lines[0].strip())
        except ValueError:
            continue
        ts_match = re.match(
            r'(\d{2}:\d{2}:\d{2}[,.]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[,.]\d{3})',
            lines[1]
        )
        if not ts_match:
            continue
        # NOTE: timestamp conversion hoisted to _srt_timestamp_to_seconds()
        # instead of being re-defined for every block.
        start = _srt_timestamp_to_seconds(ts_match.group(1))
        end = _srt_timestamp_to_seconds(ts_match.group(2))
        text = ' '.join(lines[2:]).strip()
        text = re.sub(r'^Speaker \d+:\s*', '', text)
        if text:
            entries.append((start, end, text))
    return entries


def main():
    """Build mindmap_data.json from quotes, themes, and SRT transcripts."""
    project_dir = sys.argv[1] if len(sys.argv) > 1 else "."
    config = load_project(project_dir)
    audio_dir = config["_audio_dir"]
    data_dir = config["_data_dir"]
    os.makedirs(data_dir, exist_ok=True)

    # Parse quotes
    quotes_path = os.path.join(project_dir, config.get("quotes_file", "quotes.md"))
    quotes = parse_quotes_md(quotes_path)
    print(f"Parsed {len(quotes)} quotes from {quotes_path}")

    # Parse themes (optional file)
    themes_path = os.path.join(project_dir, config.get("themes_file", "themes.md"))
    themes = []
    if os.path.exists(themes_path):
        themes = parse_themes_md(themes_path)
        print(f"Parsed {len(themes)} themes from {themes_path}")

    # Load SRT data, keyed by episode id.
    srt_data = {}
    for ep in config["episodes"]:
        srt_key = config["_srt_keys"][ep["id"]]
        srt_path = os.path.join(audio_dir, f"{srt_key}.srt")
        if os.path.exists(srt_path):
            srt_data[ep["id"]] = parse_srt(srt_path)

    # Build episodes list; audioFile is null when the file is missing on disk.
    episodes_out = []
    for ep in config["episodes"]:
        audio_file = config["_audio_files"].get(ep["id"])
        audio_path = os.path.join(audio_dir, audio_file) if audio_file else None
        episodes_out.append({
            "id": ep["id"],
            "title": ep["title"],
            "guest": ep["guest"],
            "staffel": ep["staffel"],
            "audioFile": audio_file if audio_path and os.path.exists(audio_path) else None
        })

    # Match quotes to timestamps via fuzzy window search over SRT entries.
    quotes_out = []
    matched = 0
    for i, q in enumerate(quotes):
        ep_id = q["episode"]
        quote_data = {
            "id": f"q{i+1}",
            "text": q["text"],
            "speaker": q["speaker"],
            "episode": ep_id,
            "themes": [t["id"] for t in themes if ep_id in t["episodes"]],
            "startTime": None,
            "endTime": None,
            "audioFile": config["_audio_files"].get(ep_id),
            "isTopQuote": False,
            "verbatim": None,
        }
        if ep_id in srt_data:
            entries = srt_data[ep_id]
            start, end, score = find_best_window(q["text"], entries)
            if start is not None and score > 0.3:
                # Pad the clip by 1.5s before / 1.0s after the match.
                # FIX: clamp to 0 so a match in the first 1.5 seconds
                # cannot produce a negative start time.
                quote_data["startTime"] = round(max(0.0, start - 1.5), 1)
                quote_data["endTime"] = round(end + 1.0, 1)
                matched += 1
                # Get verbatim text from entries within the matched span.
                nearby = [e for e in entries if e[0] >= start - 1 and e[1] <= end + 1]
                if nearby:
                    verbatim = ' '.join(e[2] for e in nearby).strip()
                    # Capitalize first letter
                    if verbatim and verbatim[0].islower():
                        verbatim = verbatim[0].upper() + verbatim[1:]
                    # Ensure ends with punctuation
                    if verbatim and verbatim[-1] not in '.!?':
                        verbatim += '.'
                    quote_data["verbatim"] = verbatim
        quotes_out.append(quote_data)

    # Build staffeln (seasons) pass-through.
    staffeln_out = config["staffeln"]

    # Output
    output = {
        "name": config.get("name", "Podcast"),
        "description": config.get("description", ""),
        "host": config.get("host", ""),
        "themes": themes,
        "episodes": episodes_out,
        "quotes": quotes_out,
        "staffeln": staffeln_out,
    }
    output_path = os.path.join(data_dir, "mindmap_data.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"\nMatched: {matched}/{len(quotes_out)} quotes with timestamps")
    print(f"Output: {output_path}")


if __name__ == "__main__":
    main()