132 lines
3.8 KiB
Python
132 lines
3.8 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""Convert MacWhisper SRT files to clean transcripts + SRT index JSON."""
|
||
|
|
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
import re
|
||
|
|
import sys
|
||
|
|
from config import load_project
|
||
|
|
|
||
|
|
|
||
|
|
def parse_srt(filepath):
    """Parse an SRT file into a list of (start_sec, end_sec, text) tuples.

    Blocks that lack a numeric index line or a valid timestamp line are
    skipped, as are blocks whose text is empty after cleanup.  A leading
    "Speaker N:" label (MacWhisper diarization output) is stripped from
    each block's text.

    Args:
        filepath: Path to the .srt file (UTF-8, with or without BOM).

    Returns:
        List of (start_sec, end_sec, text) tuples in file order.
    """
    # "utf-8-sig" transparently drops a leading BOM.  With plain "utf-8",
    # '\ufeff' stays glued to the first index line, int() raises
    # ValueError, and the first subtitle block is silently dropped.
    with open(filepath, "r", encoding="utf-8-sig") as f:
        content = f.read()

    blocks = re.split(r'\n\n+', content.strip())
    entries = []

    for block in blocks:
        lines = block.strip().split('\n')
        if len(lines) < 2:
            continue
        # A valid block starts with a purely numeric index line.
        try:
            int(lines[0].strip())
        except ValueError:
            continue
        # Accept both the SRT-standard comma and a dot as the
        # millisecond separator.
        ts_match = re.match(
            r'(\d{2}:\d{2}:\d{2}[,.]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[,.]\d{3})',
            lines[1]
        )
        if not ts_match:
            continue
        start = _ts_to_sec(ts_match.group(1))
        end = _ts_to_sec(ts_match.group(2))
        text = ' '.join(lines[2:]).strip()
        text = re.sub(r'^Speaker \d+:\s*', '', text)
        if text:
            entries.append((start, end, text))

    return entries


def _ts_to_sec(ts):
    """Convert an SRT timestamp ("HH:MM:SS,mmm" or "HH:MM:SS.mmm") to seconds."""
    ts = ts.replace(',', '.')
    parts = ts.split(':')
    return float(parts[0]) * 3600 + float(parts[1]) * 60 + float(parts[2])
|
||
|
|
|
||
|
|
|
||
|
|
def _fmt_ts(sec):
    """Format a second count as a zero-padded "MM:SS" string.

    Minutes are not wrapped at the hour, so 3661 seconds renders as
    "61:01" — matching transcript markers that never show hours.
    """
    minutes, seconds = divmod(int(sec), 60)
    return f"{minutes:02d}:{seconds:02d}"
|
||
|
|
|
||
|
|
|
||
|
|
def merge_to_paragraphs(entries, pause_threshold=2.0, max_para_duration=120):
    """Merge subtitle entries into paragraphs based on pauses.

    A new paragraph begins when the silence before an entry exceeds
    ``pause_threshold`` seconds, or when the time from the running
    paragraph's start to the entry's start exceeds ``max_para_duration``.

    Args:
        entries: list of (start_sec, end_sec, text) tuples in time order.
        pause_threshold: minimum gap in seconds that forces a break.
        max_para_duration: maximum paragraph span in seconds before a break.

    Returns:
        List of (start_sec, end_sec, merged_text) tuples; empty for no input.
    """
    if not entries:
        return []

    paragraphs = []
    cur_start, cur_end, first_text = entries[0]
    cur_texts = [first_text]

    for start, end, text in entries[1:]:
        pause = start - cur_end
        elapsed = start - cur_start
        if pause > pause_threshold or elapsed > max_para_duration:
            # Close the running paragraph and start a fresh one here.
            paragraphs.append((cur_start, cur_end, ' '.join(cur_texts)))
            cur_start, cur_end = start, end
            cur_texts = [text]
        else:
            # Extend the running paragraph.
            cur_end = end
            cur_texts.append(text)

    # Flush the final open paragraph.
    paragraphs.append((cur_start, cur_end, ' '.join(cur_texts)))
    return paragraphs
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Convert each configured episode's SRT into a transcript and index entry.

    Reads the project directory from argv[1] (default: current directory),
    writes one "<key>-Transcript.txt" per episode plus a combined
    data/srt_index.json, and prints a status line per episode.
    """
    project_dir = sys.argv[1] if len(sys.argv) > 1 else "."
    config = load_project(project_dir)

    audio_dir = config["_audio_dir"]
    transcripts_dir = config["_transcripts_dir"]
    data_dir = config["_data_dir"]
    for directory in (transcripts_dir, data_dir):
        os.makedirs(directory, exist_ok=True)

    all_indices = {}

    for ep in config["episodes"]:
        srt_key = config["_srt_keys"][ep["id"]]
        srt_path = os.path.join(audio_dir, f"{srt_key}.srt")

        if not os.path.exists(srt_path):
            print(f"SKIP: {ep['id']} — no SRT file")
            continue

        paragraphs = merge_to_paragraphs(parse_srt(srt_path))

        # Human-readable transcript: title header, then timestamped paragraphs.
        output_path = os.path.join(transcripts_dir, f"{srt_key}-Transcript.txt")
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(f"{ep['title']} — {config['host']} mit {ep['guest']}\n")
            f.write(f"{'=' * 60}\n\n")
            for start, end, text in paragraphs:
                f.write(f"[{_fmt_ts(start)}]\n{text}\n\n")

        # Machine-readable index entry for this episode.
        all_indices[srt_key] = {
            "meta": {"host": config["host"], "guest": ep["guest"],
                     "theme": ep["title"], "staffel": ep["staffel"]},
            "paragraphs": [{"start": s, "end": e, "text": t} for s, e, t in paragraphs],
        }

        print(f"OK: {ep['id']} — {len(paragraphs)} Absätze → {output_path}")

    # Persist the combined index for all processed episodes.
    index_path = os.path.join(data_dir, "srt_index.json")
    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(all_indices, f, ensure_ascii=False, indent=2)

    print(f"\nIndex: {index_path} ({len(all_indices)} Episoden)")
|
||
|
|
|
||
|
|
|
||
|
|
# Script entry point: run the full SRT → transcript/index conversion.
if __name__ == "__main__":
    main()
|