#!/usr/bin/env python3
"""Convert MacWhisper SRT files to clean transcripts + SRT index JSON."""

import json
import os
import re
import sys

from config import load_project


def parse_srt(filepath):
    """Parse SRT into list of (start_sec, end_sec, text).

    Blocks without a leading numeric cue index or without a valid
    timestamp line are skipped; MacWhisper "Speaker N:" prefixes are
    stripped and empty cues are dropped.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    # SRT cues are separated by one or more blank lines.
    blocks = re.split(r'\n\n+', content.strip())
    entries = []
    for block in blocks:
        lines = block.strip().split('\n')
        if len(lines) < 2:
            continue
        # First line must be the numeric cue index; otherwise skip the block.
        try:
            int(lines[0].strip())
        except ValueError:
            continue
        # Accept both ',' (SRT) and '.' as the millisecond separator.
        ts_match = re.match(
            r'(\d{2}:\d{2}:\d{2}[,.]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[,.]\d{3})',
            lines[1]
        )
        if not ts_match:
            continue
        start = _ts_to_sec(ts_match.group(1))
        end = _ts_to_sec(ts_match.group(2))
        # Multi-line cue text is joined into a single line.
        text = ' '.join(lines[2:]).strip()
        # Drop MacWhisper speaker labels such as "Speaker 1: ".
        text = re.sub(r'^Speaker \d+:\s*', '', text)
        if text:
            entries.append((start, end, text))
    return entries


def _ts_to_sec(ts):
    """Convert an 'HH:MM:SS,mmm' (or '.mmm') timestamp to float seconds."""
    ts = ts.replace(',', '.')
    parts = ts.split(':')
    return float(parts[0]) * 3600 + float(parts[1]) * 60 + float(parts[2])


def _fmt_ts(sec):
    """Format seconds as 'MM:SS' with total minutes.

    NOTE(review): minutes are not rolled into hours, so 5400 s prints as
    '90:00' — presumably intentional for transcript stamps; confirm.
    """
    m = int(sec) // 60
    s = int(sec) % 60
    return f"{m:02d}:{s:02d}"


def merge_to_paragraphs(entries, pause_threshold=2.0, max_para_duration=120):
    """Merge entries into paragraphs based on pauses.

    A new paragraph starts when the gap to the previous entry exceeds
    ``pause_threshold`` seconds, or when the running paragraph would grow
    past ``max_para_duration`` seconds.  Returns a list of
    (start_sec, end_sec, text) tuples.
    """
    if not entries:
        return []

    paragraphs = []
    para_start = entries[0][0]
    para_end = entries[0][1]
    para_texts = [entries[0][2]]

    for i in range(1, len(entries)):
        start, end, text = entries[i]
        gap = start - para_end
        # Duration measured to the next entry's start, not its end.
        duration = start - para_start
        if gap > pause_threshold or duration > max_para_duration:
            paragraphs.append((para_start, para_end, ' '.join(para_texts)))
            para_start = start
            para_end = end
            para_texts = [text]
        else:
            para_end = end
            para_texts.append(text)

    # Flush the final open paragraph.
    paragraphs.append((para_start, para_end, ' '.join(para_texts)))
    return paragraphs


def main():
    """Build one transcript file per episode and a combined JSON index.

    Usage: pass the project directory as the first CLI argument
    (defaults to the current directory).  Episodes without an SRT file
    in the audio directory are skipped with a message.
    """
    project_dir = sys.argv[1] if len(sys.argv) > 1 else "."
    config = load_project(project_dir)
    # Directory layout is resolved by load_project into "_"-prefixed keys.
    audio_dir = config["_audio_dir"]
    transcripts_dir = config["_transcripts_dir"]
    data_dir = config["_data_dir"]
    os.makedirs(transcripts_dir, exist_ok=True)
    os.makedirs(data_dir, exist_ok=True)

    all_indices = {}
    for ep in config["episodes"]:
        srt_key = config["_srt_keys"][ep["id"]]
        srt_path = os.path.join(audio_dir, f"{srt_key}.srt")
        if not os.path.exists(srt_path):
            print(f"SKIP: {ep['id']} — no SRT file")
            continue
        entries = parse_srt(srt_path)
        paragraphs = merge_to_paragraphs(entries)

        # Write transcript
        output_path = os.path.join(transcripts_dir, f"{srt_key}-Transcript.txt")
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(f"{ep['title']} — {config['host']} mit {ep['guest']}\n")
            f.write(f"{'=' * 60}\n\n")
            for start, end, text in paragraphs:
                f.write(f"[{_fmt_ts(start)}]\n{text}\n\n")

        # Store index
        all_indices[srt_key] = {
            "meta": {"host": config["host"], "guest": ep["guest"],
                     "theme": ep["title"], "staffel": ep["staffel"]},
            "paragraphs": [{"start": s, "end": e, "text": t}
                           for s, e, t in paragraphs],
        }
        print(f"OK: {ep['id']} — {len(paragraphs)} Absätze → {output_path}")

    # Save index
    index_path = os.path.join(data_dir, "srt_index.json")
    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(all_indices, f, ensure_ascii=False, indent=2)
    print(f"\nIndex: {index_path} ({len(all_indices)} Episoden)")


if __name__ == "__main__":
    main()