Merge fe7af44b06 into 49a1fae555

2026-03-23 17:31:54 -04:00 · 2026-03-23 17:31:54 -04:00 · d929aa3884
parent 49a1fae555 fe7af44b06
commit d929aa3884
1 changed file with 22 additions and 17 deletions
--- a/api/dify_graph/nodes/document_extractor/node.py
+++ b/api/dify_graph/nodes/document_extractor/node.py
@@ -729,27 +729,32 @@ def _extract_text_from_vtt(vtt_bytes: bytes) -> str:

    # Merge consecutive utterances by the same speaker
    merged_results = []
-    if raw_results:
-        current_speaker, current_text = raw_results[0]
+    current_speaker = None
+    current_text = ""

-        for i in range(1, len(raw_results)):
-            spk, txt = raw_results[i]
-            if spk is None:
-                merged_results.append((None, current_text))
-                continue
-
-            if spk == current_speaker:
-                # If it is the same speaker, merge the utterances (joined by space)
-                current_text += " " + txt
-            else:
-                # If the speaker changes, register the utterance so far and move on
+    for spk, txt in raw_results:
+        if spk is None:
+            # Flush the current speaker's accumulated text before the speakerless caption
+            if current_speaker is not None or current_text:
                merged_results.append((current_speaker, current_text))
-                current_speaker, current_text = spk, txt
+            merged_results.append((None, txt))
+            # Reset: next caption with a speaker starts fresh
+            current_speaker = None
+            current_text = ""
+        elif current_speaker is None:
+            # Previous caption was speakerless (or start of stream); begin a new speaker run
+            current_speaker, current_text = spk, txt
+        elif spk == current_speaker:
+            # If it is the same speaker, merge the utterances (joined by space)
+            current_text += " " + txt
+        else:
+            # If the speaker changes, register the utterance so far and move on
+            merged_results.append((current_speaker, current_text))
+            current_speaker, current_text = spk, txt

-        # Add the last element
+    # Add the last element (skip if already flushed by a trailing speakerless caption)
+    if current_speaker is not None or current_text:
        merged_results.append((current_speaker, current_text))
-    else:
-        merged_results = raw_results

    # Return the result in the specified format: Speaker "text" style
    formatted = [f'{spk or ""} "{txt}"' for spk, txt in merged_results]