This commit is contained in:
Alvin Tang 2026-03-23 17:31:54 -04:00 committed by GitHub
commit d929aa3884
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 22 additions and 17 deletions

View File

@ -729,27 +729,32 @@ def _extract_text_from_vtt(vtt_bytes: bytes) -> str:
# Merge consecutive utterances by the same speaker
merged_results = []
if raw_results:
current_speaker, current_text = raw_results[0]
current_speaker = None
current_text = ""
for i in range(1, len(raw_results)):
spk, txt = raw_results[i]
if spk is None:
merged_results.append((None, current_text))
continue
if spk == current_speaker:
# If it is the same speaker, merge the utterances (joined by space)
current_text += " " + txt
else:
# If the speaker changes, register the utterance so far and move on
for spk, txt in raw_results:
if spk is None:
# Flush the current speaker's accumulated text before the speakerless caption
if current_speaker is not None or current_text:
merged_results.append((current_speaker, current_text))
current_speaker, current_text = spk, txt
merged_results.append((None, txt))
# Reset: next caption with a speaker starts fresh
current_speaker = None
current_text = ""
elif current_speaker is None:
# Previous caption was speakerless (or start of stream); begin a new speaker run
current_speaker, current_text = spk, txt
elif spk == current_speaker:
# If it is the same speaker, merge the utterances (joined by space)
current_text += " " + txt
else:
# If the speaker changes, register the utterance so far and move on
merged_results.append((current_speaker, current_text))
current_speaker, current_text = spk, txt
# Add the last element
# Add the last element (skip if already flushed by a trailing speakerless caption)
if current_speaker is not None or current_text:
merged_results.append((current_speaker, current_text))
else:
merged_results = raw_results
# Return the result in the specified format: Speaker "text" style
formatted = [f'{spk or ""} "{txt}"' for spk, txt in merged_results]