diff --git a/api/dify_graph/nodes/document_extractor/node.py b/api/dify_graph/nodes/document_extractor/node.py
index 27196f1aca..f78221f678 100644
--- a/api/dify_graph/nodes/document_extractor/node.py
+++ b/api/dify_graph/nodes/document_extractor/node.py
@@ -729,27 +729,32 @@ def _extract_text_from_vtt(vtt_bytes: bytes) -> str:
 
     # Merge consecutive utterances by the same speaker
     merged_results = []
-    if raw_results:
-        current_speaker, current_text = raw_results[0]
+    current_speaker = None
+    current_text = ""
 
-        for i in range(1, len(raw_results)):
-            spk, txt = raw_results[i]
-            if spk is None:
-                merged_results.append((None, current_text))
-                continue
-
-            if spk == current_speaker:
-                # If it is the same speaker, merge the utterances (joined by space)
-                current_text += " " + txt
-            else:
-                # If the speaker changes, register the utterance so far and move on
+    for spk, txt in raw_results:
+        if spk is None:
+            # Flush the current speaker's accumulated text before the speakerless caption
+            if current_speaker is not None or current_text:
                 merged_results.append((current_speaker, current_text))
-                current_speaker, current_text = spk, txt
+            merged_results.append((None, txt))
+            # Reset: next caption with a speaker starts fresh
+            current_speaker = None
+            current_text = ""
+        elif current_speaker is None:
+            # Previous caption was speakerless (or start of stream); begin a new speaker run
+            current_speaker, current_text = spk, txt
+        elif spk == current_speaker:
+            # If it is the same speaker, merge the utterances (joined by space)
+            current_text += " " + txt
+        else:
+            # If the speaker changes, register the utterance so far and move on
+            merged_results.append((current_speaker, current_text))
+            current_speaker, current_text = spk, txt
 
-        # Add the last element
+    # Add the last element (skip if already flushed by a trailing speakerless caption)
+    if current_speaker is not None or current_text:
         merged_results.append((current_speaker, current_text))
-    else:
-        merged_results = raw_results
 
     # Return the result in the specified format: Speaker "text" style
     formatted = [f'{spk or ""} "{txt}"' for spk, txt in merged_results]