From 07e37769567e258f3b9c623b66e5085758ff6cb4 Mon Sep 17 00:00:00 2001 From: gambletan Date: Wed, 11 Mar 2026 10:19:51 +0800 Subject: [PATCH] fix: prevent temp file leak in document extractor when partition_via_api fails The _extract_text_from_doc, _extract_text_from_ppt, _extract_text_from_pptx, and _extract_text_from_epub functions create temporary files with delete=False but only call os.unlink() on the success path. If partition_via_api() or the file open raises an exception, the temporary file is never cleaned up, causing a resource leak on disk. Move os.unlink() into a finally block so the temp file is always removed regardless of whether the API call succeeds or fails. Co-Authored-By: Claude Opus 4.6 --- api/dify_graph/nodes/document_extractor/node.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/api/dify_graph/nodes/document_extractor/node.py b/api/dify_graph/nodes/document_extractor/node.py index c26b18aac9..0f93f89783 100644 --- a/api/dify_graph/nodes/document_extractor/node.py +++ b/api/dify_graph/nodes/document_extractor/node.py @@ -377,6 +377,7 @@ def _extract_text_from_doc(file_content: bytes, *, unstructured_api_config: Unst with tempfile.NamedTemporaryFile(suffix=".doc", delete=False) as temp_file: temp_file.write(file_content) temp_file.flush() + try: with open(temp_file.name, "rb") as file: elements = partition_via_api( file=file, @@ -384,6 +385,7 @@ def _extract_text_from_doc(file_content: bytes, *, unstructured_api_config: Unst api_url=unstructured_api_config.api_url, api_key=api_key, ) + finally: os.unlink(temp_file.name) return "\n".join([getattr(element, "text", "") for element in elements]) except Exception as e: @@ -624,6 +626,7 @@ def _extract_text_from_ppt(file_content: bytes, *, unstructured_api_config: Unst with tempfile.NamedTemporaryFile(suffix=".ppt", delete=False) as temp_file: temp_file.write(file_content) temp_file.flush() + try: with open(temp_file.name, "rb") as file: elements = partition_via_api( file=file, @@ -631,6 +634,7 @@ def _extract_text_from_ppt(file_content: bytes, *, unstructured_api_config: Unst api_url=unstructured_api_config.api_url, api_key=api_key, ) + finally: os.unlink(temp_file.name) else: with io.BytesIO(file_content) as file: @@ -652,6 +656,7 @@ def _extract_text_from_pptx(file_content: bytes, *, unstructured_api_config: Uns with tempfile.NamedTemporaryFile(suffix=".pptx", delete=False) as temp_file: temp_file.write(file_content) temp_file.flush() + try: with open(temp_file.name, "rb") as file: elements = partition_via_api( file=file, @@ -659,6 +664,7 @@ def _extract_text_from_pptx(file_content: bytes, *, unstructured_api_config: Uns api_url=unstructured_api_config.api_url, api_key=api_key, ) + finally: os.unlink(temp_file.name) else: with io.BytesIO(file_content) as file: @@ -679,6 +685,7 @@ def _extract_text_from_epub(file_content: bytes, *, unstructured_api_config: Uns with tempfile.NamedTemporaryFile(suffix=".epub", delete=False) as temp_file: temp_file.write(file_content) temp_file.flush() + try: with open(temp_file.name, "rb") as file: elements = partition_via_api( file=file, @@ -686,6 +693,7 @@ def _extract_text_from_epub(file_content: bytes, *, unstructured_api_config: Uns api_url=unstructured_api_config.api_url, api_key=api_key, ) + finally: os.unlink(temp_file.name) else: pypandoc.download_pandoc()