# podcast-mindmap/scripts/match_quotes.py

#!/usr/bin/env python3
"""Match quotes from a markdown file to SRT timestamps and build mindmap_data.json."""
import json
import os
import re
import sys
from difflib import SequenceMatcher
from config import load_project
def parse_quotes_md(filepath):
    """Parse a quotes markdown file.

    Expected format:
        ### Section Title
        > "Quote text" -- Speaker (Episode-ID)

    Returns:
        list[dict]: one {text, speaker, episode, section} per quote line.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    quotes = []
    current_section = ""
    for line in content.split('\n'):
        if line.startswith('### '):
            current_section = line[4:].strip()
        elif line.startswith('> '):
            # Quote text: up to the closing double quote, or end of line when
            # the quote is not wrapped in quotes.
            text_match = re.match(r'>\s*"?(.+?)(?:"|$)', line)
            if not text_match:
                continue
            text = text_match.group(1).strip().rstrip('"')

            # Attribution: `-- Speaker (Episode)`.
            attr_match = re.search(r'--\s*(.+?)\s*\((\w+)\)', line)
            if attr_match:
                speaker = attr_match.group(1).strip()
                episode = attr_match.group(2).strip()
            else:
                # Fall back to `-- Speaker` with a separate (SxEy) reference.
                attr_match = re.search(r'--\s*(.+?)$', line)
                speaker = attr_match.group(1).strip() if attr_match else "Unknown"
                ep_match = re.search(r'\((S\d+E\d+)\)', line)
                episode = ep_match.group(1) if ep_match else ""

            # Bug fix: when the quote is NOT wrapped in double quotes, the
            # text capture above runs to end of line and keeps the
            # attribution inside `text` — strip it off.
            if not line[2:].lstrip().startswith('"'):
                text = re.sub(r'\s*--.*$', '', text).strip()

            quotes.append({
                "text": text,
                "speaker": speaker,
                "episode": episode,
                "section": current_section
            })
    return quotes
def parse_themes_md(filepath):
    """Parse a themes markdown file.

    Expected format:
        ### 1. Theme Title -- Description
        Text mentioning episodes like S1E1, S2E3...

    Returns:
        list[dict]: one {id, label, description, episodes, color} per theme.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    themes = []
    # Default color palette, cycled per theme index.
    colors = ["#e63946", "#457b9d", "#f4a261", "#2a9d8f", "#264653", "#e9c46a", "#9b5de5",
              "#ef476f", "#06d6a0", "#118ab2"]
    sections = re.split(r'###\s+\d+\.\s+', content)[1:]
    for i, section in enumerate(sections):
        lines = section.strip().split('\n')
        title_line = lines[0]
        # Title/description may be separated by ' -- ' or an em dash.
        # Bug fix: the second separator had degraded to an empty string
        # (mangled em dash), so str.split('') raised ValueError for any
        # title without ' -- '. Restore the em-dash separator.
        if ' -- ' in title_line:
            label, description = title_line.split(' -- ', 1)
        elif '\u2014' in title_line:
            label, description = title_line.split('\u2014', 1)
        else:
            label = title_line.rstrip()
            description = ""
        label = label.strip()
        description = description.strip()
        # Collect every SxEy reference anywhere in the section (title included).
        full_text = '\n'.join(lines)
        episodes = sorted(set(re.findall(r'S\d+E\d+', full_text)))
        # Stable slug id: lowercase alphanumerics of the label, max 20 chars.
        theme_id = re.sub(r'[^a-z0-9]', '', label.lower())[:20]
        themes.append({
            "id": theme_id,
            "label": label,
            "description": description,
            "episodes": episodes,
            "color": colors[i % len(colors)]
        })
    return themes
def normalize(text):
    """Lower-case *text* and reduce it to space-separated bare words.

    Used to make quote text and SRT text comparable: bracketed stage
    directions (e.g. ``[laughs]``) are dropped, double dashes and all
    punctuation become spaces, and whitespace is collapsed.
    """
    lowered = text.lower()
    lowered = re.sub(r'\[.*?\]', '', lowered)   # drop [bracketed] asides
    lowered = re.sub(r'--', ' ', lowered)       # double dash -> separator
    lowered = re.sub(r'[^\w\s]', ' ', lowered)  # punctuation -> separator
    return re.sub(r'\s+', ' ', lowered).strip()
def find_best_window(quote_text, entries, max_window=6):
    """Find the contiguous run of SRT entries best matching *quote_text*.

    entries is a list of (start_sec, end_sec, text) tuples. Windows of
    1..max_window consecutive entries are scored with SequenceMatcher
    after a cheap keyword pre-filter; windows shorter than 30 seconds
    get a small score bonus.

    Returns (start, end, score); start/end are None when nothing scored
    above zero.
    """
    target = normalize(quote_text)
    keywords = [word for word in target.split() if len(word) > 4][:8]
    required_hits = len(keywords) * 0.4
    best = (None, None, 0)

    for size in range(1, min(max_window, len(entries)) + 1):
        for offset in range(len(entries) - size + 1):
            window = entries[offset:offset + size]
            candidate = normalize(' '.join(part[2] for part in window))
            # Skip the expensive ratio() unless enough keywords appear.
            if keywords and sum(kw in candidate for kw in keywords) < required_hits:
                continue
            score = SequenceMatcher(None, target, candidate).ratio()
            # Prefer tighter (shorter-duration) windows.
            if window[-1][1] - window[0][0] < 30:
                score *= 1.05
            if score > best[2]:
                best = (window[0][0], window[-1][1], score)

    return best
def parse_srt(filepath):
    """Parse an SRT subtitle file into (start_sec, end_sec, text) entries.

    Blocks without a numeric index line or a valid timestamp line are
    skipped; a leading 'Speaker N:' prefix is stripped from the text and
    entries with empty text are dropped.
    """
    # Fix: helper and timestamp regex were re-created inside the per-block
    # loop; define them once up front (behavior is identical).
    def to_sec(ts):
        # 'HH:MM:SS,mmm' (or '.' as the millisecond separator) -> seconds.
        hours, minutes, rest = ts.replace(',', '.').split(':')
        return float(hours) * 3600 + float(minutes) * 60 + float(rest)

    ts_re = re.compile(
        r'(\d{2}:\d{2}:\d{2}[,.]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[,.]\d{3})'
    )

    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    entries = []
    for block in re.split(r'\n\n+', content.strip()):
        lines = block.strip().split('\n')
        if len(lines) < 2:
            continue
        try:
            int(lines[0].strip())  # SRT blocks start with a numeric index
        except ValueError:
            continue
        ts_match = ts_re.match(lines[1])
        if not ts_match:
            continue
        start = to_sec(ts_match.group(1))
        end = to_sec(ts_match.group(2))
        text = ' '.join(lines[2:]).strip()
        text = re.sub(r'^Speaker \d+:\s*', '', text)
        if text:
            entries.append((start, end, text))
    return entries
def main():
    """Build <data_dir>/mindmap_data.json for a podcast project directory.

    Usage: match_quotes.py [project_dir]   (defaults to ".")

    Loads the project config, parses the quotes and (optional) themes
    markdown files, fuzzy-matches each quote against its episode's SRT
    transcript, and writes the combined mindmap data structure as JSON.
    """
    project_dir = sys.argv[1] if len(sys.argv) > 1 else "."
    config = load_project(project_dir)
    audio_dir = config["_audio_dir"]
    data_dir = config["_data_dir"]
    os.makedirs(data_dir, exist_ok=True)

    # Parse quotes (required file).
    quotes_path = os.path.join(project_dir, config.get("quotes_file", "quotes.md"))
    quotes = parse_quotes_md(quotes_path)
    print(f"Parsed {len(quotes)} quotes from {quotes_path}")

    # Parse themes (optional file).
    themes_path = os.path.join(project_dir, config.get("themes_file", "themes.md"))
    themes = []
    if os.path.exists(themes_path):
        themes = parse_themes_md(themes_path)
        print(f"Parsed {len(themes)} themes from {themes_path}")

    # Load SRT transcripts keyed by episode id; missing files are skipped.
    srt_data = {}
    for ep in config["episodes"]:
        srt_key = config["_srt_keys"][ep["id"]]
        srt_path = os.path.join(audio_dir, f"{srt_key}.srt")
        if os.path.exists(srt_path):
            srt_data[ep["id"]] = parse_srt(srt_path)

    # Build episodes list; audioFile is None when the file is missing on disk.
    episodes_out = []
    for ep in config["episodes"]:
        audio_file = config["_audio_files"].get(ep["id"])
        audio_path = os.path.join(audio_dir, audio_file) if audio_file else None
        episodes_out.append({
            "id": ep["id"],
            "title": ep["title"],
            "guest": ep["guest"],
            "staffel": ep["staffel"],
            "audioFile": audio_file if audio_path and os.path.exists(audio_path) else None
        })

    # Match quotes to timestamps.
    quotes_out = []
    matched = 0
    for i, q in enumerate(quotes):
        ep_id = q["episode"]
        quote_data = {
            "id": f"q{i+1}",
            "text": q["text"],
            "speaker": q["speaker"],
            "episode": ep_id,
            # A quote inherits every theme that references its episode.
            "themes": [t["id"] for t in themes if ep_id in t["episodes"]],
            "startTime": None,
            "endTime": None,
            "audioFile": config["_audio_files"].get(ep_id),
            "isTopQuote": False,
            "verbatim": None,
        }
        if ep_id in srt_data:
            entries = srt_data[ep_id]
            start, end, score = find_best_window(q["text"], entries)
            if start is not None and score > 0.3:
                # Pad the clip slightly for playback. Bug fix: clamp the
                # padded start at 0 so quotes matched within the first
                # 1.5 s of the audio can't produce a negative seek time.
                quote_data["startTime"] = round(max(0.0, start - 1.5), 1)
                quote_data["endTime"] = round(end + 1.0, 1)
                matched += 1
                # Verbatim transcript text from the (unpadded) matched window.
                nearby = [e for e in entries if e[0] >= start - 1 and e[1] <= end + 1]
                if nearby:
                    verbatim = ' '.join(e[2] for e in nearby).strip()
                    # Capitalize first letter
                    if verbatim and verbatim[0].islower():
                        verbatim = verbatim[0].upper() + verbatim[1:]
                    # Ensure ends with punctuation
                    if verbatim and verbatim[-1] not in '.!?':
                        verbatim += '.'
                    quote_data["verbatim"] = verbatim
        quotes_out.append(quote_data)

    # Assemble and write the final JSON document.
    output = {
        "name": config.get("name", "Podcast"),
        "description": config.get("description", ""),
        "host": config.get("host", ""),
        "themes": themes,
        "episodes": episodes_out,
        "quotes": quotes_out,
        "staffeln": config["staffeln"],
    }
    output_path = os.path.join(data_dir, "mindmap_data.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"\nMatched: {matched}/{len(quotes_out)} quotes with timestamps")
    print(f"Output: {output_path}")


if __name__ == "__main__":
    main()