podcast-mindmap/scripts/convert_srt.py

132 lines
3.8 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""Convert MacWhisper SRT files to clean transcripts + SRT index JSON."""
import json
import os
import re
import sys
from config import load_project
def parse_srt(filepath):
"""Parse SRT into list of (start_sec, end_sec, text)."""
with open(filepath, "r", encoding="utf-8") as f:
content = f.read()
blocks = re.split(r'\n\n+', content.strip())
entries = []
for block in blocks:
lines = block.strip().split('\n')
if len(lines) < 2:
continue
try:
int(lines[0].strip())
except ValueError:
continue
ts_match = re.match(
r'(\d{2}:\d{2}:\d{2}[,.]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[,.]\d{3})',
lines[1]
)
if not ts_match:
continue
start = _ts_to_sec(ts_match.group(1))
end = _ts_to_sec(ts_match.group(2))
text = ' '.join(lines[2:]).strip()
text = re.sub(r'^Speaker \d+:\s*', '', text)
if text:
entries.append((start, end, text))
return entries
def _ts_to_sec(ts):
ts = ts.replace(',', '.')
parts = ts.split(':')
return float(parts[0]) * 3600 + float(parts[1]) * 60 + float(parts[2])
def _fmt_ts(sec):
m = int(sec) // 60
s = int(sec) % 60
return f"{m:02d}:{s:02d}"
def merge_to_paragraphs(entries, pause_threshold=2.0, max_para_duration=120):
    """Group (start, end, text) entries into paragraphs.

    A new paragraph begins when the silence before an entry exceeds
    ``pause_threshold`` seconds, or when the running paragraph (measured
    from its start to the current entry's start) exceeds
    ``max_para_duration`` seconds. Returns a list of merged
    (start, end, text) tuples; empty input yields an empty list.
    """
    if not entries:
        return []
    paragraphs = []
    cur_start, cur_end, first_text = entries[0]
    cur_texts = [first_text]
    for start, end, text in entries[1:]:
        pause = start - cur_end
        elapsed = start - cur_start
        if pause > pause_threshold or elapsed > max_para_duration:
            # Flush the current paragraph and begin a fresh one here.
            paragraphs.append((cur_start, cur_end, ' '.join(cur_texts)))
            cur_start = start
            cur_texts = [text]
        else:
            cur_texts.append(text)
        cur_end = end
    # Flush the final (always non-empty) paragraph.
    paragraphs.append((cur_start, cur_end, ' '.join(cur_texts)))
    return paragraphs
def main():
    """Convert each configured episode's SRT into a transcript + JSON index.

    Usage: convert_srt.py [project_dir]

    Reads episode metadata via load_project(), writes one
    "<key>-Transcript.txt" per episode into the transcripts dir, and a
    combined "srt_index.json" into the data dir. Episodes without an SRT
    file are skipped with a message.
    """
    project_dir = sys.argv[1] if len(sys.argv) > 1 else "."
    config = load_project(project_dir)
    audio_dir = config["_audio_dir"]
    transcripts_dir = config["_transcripts_dir"]
    data_dir = config["_data_dir"]
    os.makedirs(transcripts_dir, exist_ok=True)
    os.makedirs(data_dir, exist_ok=True)

    all_indices = {}
    for ep in config["episodes"]:
        srt_key = config["_srt_keys"][ep["id"]]
        srt_path = os.path.join(audio_dir, f"{srt_key}.srt")
        if not os.path.exists(srt_path):
            print(f"SKIP: {ep['id']} — no SRT file")
            continue
        entries = parse_srt(srt_path)
        paragraphs = merge_to_paragraphs(entries)

        # Write the human-readable transcript.
        output_path = os.path.join(transcripts_dir, f"{srt_key}-Transcript.txt")
        with open(output_path, "w", encoding="utf-8") as f:
            # Fix: title and host were run together with no separator;
            # use " — " to match the SKIP message style above.
            f.write(f"{ep['title']} — {config['host']} mit {ep['guest']}\n")
            f.write(f"{'=' * 60}\n\n")
            for start, _end, text in paragraphs:
                f.write(f"[{_fmt_ts(start)}]\n{text}\n\n")

        # Store the machine-readable index entry for this episode.
        all_indices[srt_key] = {
            "meta": {"host": config["host"], "guest": ep["guest"],
                     "theme": ep["title"], "staffel": ep["staffel"]},
            "paragraphs": [{"start": s, "end": e, "text": t} for s, e, t in paragraphs],
        }
        # Fix: episode id and paragraph count were run together; add separator.
        print(f"OK: {ep['id']} — {len(paragraphs)} Absätze → {output_path}")

    # Save the combined index across all episodes.
    index_path = os.path.join(data_dir, "srt_index.json")
    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(all_indices, f, ensure_ascii=False, indent=2)
    print(f"\nIndex: {index_path} ({len(all_indices)} Episoden)")


if __name__ == "__main__":
    main()