podcast-mindmap/scripts/convert_srt.py

132 lines
3.8 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""Convert MacWhisper SRT files to clean transcripts + SRT index JSON."""
import json
import os
import re
import sys
from config import load_project
def parse_srt(filepath):
"""Parse SRT into list of (start_sec, end_sec, text)."""
with open(filepath, "r", encoding="utf-8") as f:
content = f.read()
blocks = re.split(r'\n\n+', content.strip())
entries = []
for block in blocks:
lines = block.strip().split('\n')
if len(lines) < 2:
continue
try:
int(lines[0].strip())
except ValueError:
continue
ts_match = re.match(
r'(\d{2}:\d{2}:\d{2}[,.]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[,.]\d{3})',
lines[1]
)
if not ts_match:
continue
start = _ts_to_sec(ts_match.group(1))
end = _ts_to_sec(ts_match.group(2))
text = ' '.join(lines[2:]).strip()
text = re.sub(r'^Speaker \d+:\s*', '', text)
if text:
entries.append((start, end, text))
return entries
def _ts_to_sec(ts):
ts = ts.replace(',', '.')
parts = ts.split(':')
return float(parts[0]) * 3600 + float(parts[1]) * 60 + float(parts[2])
def _fmt_ts(sec):
m = int(sec) // 60
s = int(sec) % 60
return f"{m:02d}:{s:02d}"
def merge_to_paragraphs(entries, pause_threshold=2.0, max_para_duration=120):
    """Group (start, end, text) entries into paragraphs.

    A new paragraph begins when the silence before an entry exceeds
    ``pause_threshold`` seconds, or when the running paragraph (measured
    from its start to the current entry's start) exceeds
    ``max_para_duration`` seconds. Returns a list of merged
    (start, end, text) tuples; empty input yields an empty list.
    """
    if not entries:
        return []
    paragraphs = []
    cur_start, cur_end, first_text = entries[0]
    cur_texts = [first_text]
    for start, end, text in entries[1:]:
        pause = start - cur_end
        elapsed = start - cur_start
        if pause > pause_threshold or elapsed > max_para_duration:
            # Flush the current paragraph and begin a fresh one here.
            paragraphs.append((cur_start, cur_end, ' '.join(cur_texts)))
            cur_start = start
            cur_texts = [text]
        else:
            cur_texts.append(text)
        cur_end = end
    # Flush the final (always non-empty) paragraph.
    paragraphs.append((cur_start, cur_end, ' '.join(cur_texts)))
    return paragraphs
def main():
    """Convert each configured episode's SRT into a transcript + JSON index.

    Usage: convert_srt.py [project_dir]

    Reads episode metadata via load_project(), writes one
    "<key>-Transcript.txt" per episode into the transcripts dir, and a
    combined "srt_index.json" into the data dir. Episodes without an SRT
    file are skipped with a message.
    """
    project_dir = sys.argv[1] if len(sys.argv) > 1 else "."
    config = load_project(project_dir)
    audio_dir = config["_audio_dir"]
    transcripts_dir = config["_transcripts_dir"]
    data_dir = config["_data_dir"]
    os.makedirs(transcripts_dir, exist_ok=True)
    os.makedirs(data_dir, exist_ok=True)

    all_indices = {}
    for ep in config["episodes"]:
        srt_key = config["_srt_keys"][ep["id"]]
        srt_path = os.path.join(audio_dir, f"{srt_key}.srt")
        if not os.path.exists(srt_path):
            print(f"SKIP: {ep['id']} — no SRT file")
            continue
        entries = parse_srt(srt_path)
        paragraphs = merge_to_paragraphs(entries)

        # Write the human-readable transcript.
        output_path = os.path.join(transcripts_dir, f"{srt_key}-Transcript.txt")
        with open(output_path, "w", encoding="utf-8") as f:
            # Fix: title and host were run together with no separator;
            # use " — " to match the SKIP message style above.
            f.write(f"{ep['title']} — {config['host']} mit {ep['guest']}\n")
            f.write(f"{'=' * 60}\n\n")
            for start, _end, text in paragraphs:
                f.write(f"[{_fmt_ts(start)}]\n{text}\n\n")

        # Store the machine-readable index entry for this episode.
        all_indices[srt_key] = {
            "meta": {"host": config["host"], "guest": ep["guest"],
                     "theme": ep["title"], "staffel": ep["staffel"]},
            "paragraphs": [{"start": s, "end": e, "text": t} for s, e, t in paragraphs],
        }
        # Fix: episode id and paragraph count were run together; add separator.
        print(f"OK: {ep['id']} — {len(paragraphs)} Absätze → {output_path}")

    # Save the combined index across all episodes.
    index_path = os.path.join(data_dir, "srt_index.json")
    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(all_indices, f, ensure_ascii=False, indent=2)
    print(f"\nIndex: {index_path} ({len(all_indices)} Episoden)")


if __name__ == "__main__":
    main()