# Generic tool for building interactive mindmap visualizations from podcast
# transcripts. Includes: audio download, SRT conversion, quote-timestamp
# matching, D3.js mindmap webapp. Configurable via project.yaml — no
# podcast-specific content.
#!/usr/bin/env python3
|
|
"""Match quotes from a markdown file to SRT timestamps and build mindmap_data.json."""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from difflib import SequenceMatcher
|
|
from config import load_project
|
|
|
|
|
|
def parse_quotes_md(filepath):
    """Parse a quotes markdown file into a list of quote dicts.

    Expected format:

        ### Section Title

        > "Quote text" -- Speaker (Episode-ID)

    Args:
        filepath: Path to the markdown file (read as UTF-8).

    Returns:
        List of dicts with keys ``text``, ``speaker``, ``episode`` and
        ``section``. When an attribution cannot be parsed, ``speaker``
        falls back to "Unknown" and ``episode`` to "".
    """
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    quotes = []
    current_section = ""

    for line in content.split('\n'):
        if line.startswith('### '):
            current_section = line[4:].strip()
        elif line.startswith('> '):
            # Extract quote text (up to the closing quote, if one exists).
            text_match = re.match(r'>\s*"?(.+?)(?:"|$)', line)
            if not text_match:
                continue
            text = text_match.group(1).strip().rstrip('"')
            # Bug fix: on unquoted lines the capture runs to end-of-line and
            # would include the "-- Speaker (Episode)" attribution; strip it.
            if '"' not in line:
                text = re.split(r'\s+--\s+', text)[0].strip()

            # Extract speaker and episode from the attribution.
            attr_match = re.search(r'--\s*(.+?)\s*\((\w+)\)', line)
            if attr_match:
                speaker = attr_match.group(1).strip()
                episode = attr_match.group(2).strip()
            else:
                # Fallback: attribution without a simple "(word)" episode tag.
                attr_match = re.search(r'--\s*(.+?)$', line)
                speaker = attr_match.group(1).strip() if attr_match else "Unknown"
                # Bug fix: drop a trailing "(...)" marker (e.g. a hyphenated
                # episode id that \w+ could not match) from the speaker name.
                speaker = re.sub(r'\s*\([^)]*\)\s*$', '', speaker) or "Unknown"
                ep_match = re.search(r'\((S\d+E\d+)\)', line)
                episode = ep_match.group(1) if ep_match else ""

            quotes.append({
                "text": text,
                "speaker": speaker,
                "episode": episode,
                "section": current_section
            })

    return quotes
|
|
|
|
|
|
def parse_themes_md(filepath):
    """Parse a themes markdown file.

    Expected format:

        ### 1. Theme Title -- Description
        Text mentioning episodes like S1E1, S2E3...

    Returns:
        List of {id, label, description, episodes, color} dicts, one per
        numbered "###" section, with colors cycled from a fixed palette.
    """
    with open(filepath, "r", encoding="utf-8") as fh:
        raw = fh.read()

    # Fixed palette, assigned to themes in order of appearance (cycled).
    palette = ["#e63946", "#457b9d", "#f4a261", "#2a9d8f", "#264653",
               "#e9c46a", "#9b5de5", "#ef476f", "#06d6a0", "#118ab2"]

    parsed = []
    chunks = re.split(r'###\s+\d+\.\s+', raw)[1:]

    for idx, chunk in enumerate(chunks):
        body_lines = chunk.strip().split('\n')
        header = body_lines[0]

        # The header may carry a description after "--" or an em dash.
        label, description = header.rstrip(), ""
        for sep in (' -- ', ' — '):
            if sep in header:
                label, description = header.split(sep, 1)
                break

        label = label.strip()
        description = description.strip()

        # Collect every SxEy reference anywhere in the section body.
        episodes = sorted(set(re.findall(r'S\d+E\d+', '\n'.join(body_lines))))

        parsed.append({
            # Slug id: lowercase alphanumerics of the label, capped at 20 chars.
            "id": re.sub(r'[^a-z0-9]', '', label.lower())[:20],
            "label": label,
            "description": description,
            "episodes": episodes,
            "color": palette[idx % len(palette)],
        })

    return parsed
|
|
|
|
|
|
def normalize(text):
    """Lowercase *text* and reduce it to bare words for fuzzy comparison."""
    cleaned = text.lower()
    # Order matters: strip bracketed annotations (e.g. "[music]") first,
    # then treat double dashes and remaining punctuation as separators.
    for pattern, repl in ((r'\[.*?\]', ''), (r'--', ' '), (r'[^\w\s]', ' ')):
        cleaned = re.sub(pattern, repl, cleaned)
    return re.sub(r'\s+', ' ', cleaned).strip()
|
|
|
|
|
|
def find_best_window(quote_text, entries, max_window=6):
    """Locate the consecutive SRT-entry window that best matches a quote.

    Args:
        quote_text: The quote to locate.
        entries: List of (start_sec, end_sec, text) tuples from parse_srt().
        max_window: Largest number of consecutive entries to consider.

    Returns:
        (start, end, score) of the best-scoring window, or (None, None, 0)
        when no window scores above zero.
    """
    target = normalize(quote_text)
    # Up to 8 longer words act as a cheap pre-filter before fuzzy scoring.
    keywords = [w for w in target.split() if len(w) > 4][:8]

    best = (None, None, 0)

    upper = min(max_window + 1, len(entries) + 1)
    for size in range(1, upper):
        for offset in range(len(entries) - size + 1):
            span = entries[offset:offset + size]
            candidate = normalize(' '.join(part[2] for part in span))

            # Skip windows sharing too few keywords with the quote.
            if keywords:
                hits = sum(kw in candidate for kw in keywords)
                if hits < len(keywords) * 0.4:
                    continue

            ratio = SequenceMatcher(None, target, candidate).ratio()

            # Small bonus nudges the choice toward tight (<30s) windows.
            if span[-1][1] - span[0][0] < 30:
                ratio *= 1.05

            if ratio > best[2]:
                best = (span[0][0], span[-1][1], ratio)

    return best
|
|
|
|
|
|
def parse_srt(filepath):
    """Parse an SRT subtitle file into timed text entries.

    Args:
        filepath: Path to the .srt file (read as UTF-8). Both comma and
            dot millisecond separators are accepted.

    Returns:
        List of (start_seconds, end_seconds, text) tuples. Blocks lacking
        a numeric index line, a parsable timestamp line, or any text are
        skipped. A leading "Speaker N:" diarization prefix is stripped.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    def to_sec(ts):
        # "HH:MM:SS,mmm" (or "."-separated) -> float seconds.
        ts = ts.replace(',', '.')
        h, m, s = ts.split(':')
        return float(h) * 3600 + float(m) * 60 + float(s)

    # Hoisted out of the loop: compile the timestamp pattern once instead
    # of re-parsing it per block. \d{2,} also tolerates 3+ digit hours.
    ts_re = re.compile(
        r'(\d{2,}:\d{2}:\d{2}[,.]\d{3})\s*-->\s*(\d{2,}:\d{2}:\d{2}[,.]\d{3})'
    )

    entries = []
    for block in re.split(r'\n\n+', content.strip()):
        lines = block.strip().split('\n')
        if len(lines) < 2:
            continue
        try:
            int(lines[0].strip())
        except ValueError:
            # First line is not a subtitle index; skip malformed block.
            continue
        ts_match = ts_re.match(lines[1])
        if not ts_match:
            continue
        start = to_sec(ts_match.group(1))
        end = to_sec(ts_match.group(2))
        text = ' '.join(lines[2:]).strip()
        # Drop diarization prefixes like "Speaker 1: ".
        text = re.sub(r'^Speaker \d+:\s*', '', text)
        if text:
            entries.append((start, end, text))
    return entries
|
|
|
|
|
|
def main():
    """Build ``mindmap_data.json`` for a project directory.

    Usage: ``match_quotes.py [project_dir]`` (defaults to ".").

    Loads the project config, parses the quotes/themes markdown files,
    fuzzy-matches each quote against its episode's SRT transcript, and
    writes the combined JSON into the project's data directory. Prints
    a short summary of how many quotes received timestamps.
    """
    project_dir = sys.argv[1] if len(sys.argv) > 1 else "."
    config = load_project(project_dir)
    audio_dir = config["_audio_dir"]
    data_dir = config["_data_dir"]
    os.makedirs(data_dir, exist_ok=True)

    # Parse quotes (required file).
    quotes_path = os.path.join(project_dir, config.get("quotes_file", "quotes.md"))
    quotes = parse_quotes_md(quotes_path)
    print(f"Parsed {len(quotes)} quotes from {quotes_path}")

    # Parse themes (optional file).
    themes_path = os.path.join(project_dir, config.get("themes_file", "themes.md"))
    themes = []
    if os.path.exists(themes_path):
        themes = parse_themes_md(themes_path)
        print(f"Parsed {len(themes)} themes from {themes_path}")

    # Load SRT data per episode.
    srt_data = {}
    for ep in config["episodes"]:
        # Robustness fix: an episode without an SRT key no longer raises
        # KeyError (consistent with the _audio_files .get() lookups below).
        srt_key = config["_srt_keys"].get(ep["id"])
        if srt_key is None:
            continue
        srt_path = os.path.join(audio_dir, f"{srt_key}.srt")
        if os.path.exists(srt_path):
            srt_data[ep["id"]] = parse_srt(srt_path)

    # Build episodes list; audioFile is only kept when it exists on disk.
    episodes_out = []
    for ep in config["episodes"]:
        audio_file = config["_audio_files"].get(ep["id"])
        audio_path = os.path.join(audio_dir, audio_file) if audio_file else None
        episodes_out.append({
            "id": ep["id"],
            "title": ep["title"],
            "guest": ep["guest"],
            "staffel": ep["staffel"],
            "audioFile": audio_file if audio_path and os.path.exists(audio_path) else None
        })

    # Match quotes to timestamps.
    quotes_out = []
    matched = 0
    for i, q in enumerate(quotes):
        ep_id = q["episode"]
        quote_data = {
            "id": f"q{i+1}",
            "text": q["text"],
            "speaker": q["speaker"],
            "episode": ep_id,
            # A quote inherits every theme that references its episode.
            "themes": [t["id"] for t in themes if ep_id in t["episodes"]],
            "startTime": None,
            "endTime": None,
            "audioFile": config["_audio_files"].get(ep_id),
            "isTopQuote": False,
            "verbatim": None,
        }

        if ep_id in srt_data:
            entries = srt_data[ep_id]
            start, end, score = find_best_window(q["text"], entries)
            # 0.3 is the minimum fuzzy-match score to accept a window.
            if start is not None and score > 0.3:
                # Pad the window slightly so playback does not clip words.
                quote_data["startTime"] = round(start - 1.5, 1)
                quote_data["endTime"] = round(end + 1.0, 1)
                matched += 1

                # Collect the verbatim transcript text inside the window.
                nearby = [e for e in entries if e[0] >= start - 1 and e[1] <= end + 1]
                if nearby:
                    verbatim = ' '.join(e[2] for e in nearby).strip()
                    # Capitalize first letter.
                    if verbatim and verbatim[0].islower():
                        verbatim = verbatim[0].upper() + verbatim[1:]
                    # Ensure it ends with punctuation.
                    if verbatim and verbatim[-1] not in '.!?':
                        verbatim += '.'
                    quote_data["verbatim"] = verbatim

        quotes_out.append(quote_data)

    # Staffeln (seasons) pass through from config unchanged.
    staffeln_out = config["staffeln"]

    # Assemble and write the output JSON.
    output = {
        "name": config.get("name", "Podcast"),
        "description": config.get("description", ""),
        "host": config.get("host", ""),
        "themes": themes,
        "episodes": episodes_out,
        "quotes": quotes_out,
        "staffeln": staffeln_out,
    }

    output_path = os.path.join(data_dir, "mindmap_data.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    print(f"\nMatched: {matched}/{len(quotes_out)} quotes with timestamps")
    print(f"Output: {output_path}")
|
|
|
|
|
|
# Script entry point: run the full quote-matching pipeline.
if __name__ == "__main__":
    main()
|