From 322cd37de10b01aa44a1ce4abe463788311cf512 Mon Sep 17 00:00:00 2001 From: Jiaquan Yi <2018298805@qq.com> Date: Mon, 9 Mar 2026 10:49:42 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20handle=20backslash=20path=20separators?= =?UTF-8?q?=20in=20DOCX=20ZIP=20entries=20exported=20on=E2=80=A6(#33129)?= =?UTF-8?q?=20(#33131)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> --- .../nodes/document_extractor/node.py | 37 +++++++++++- .../nodes/test_document_extractor_node.py | 56 +++++++++++++++++++ 2 files changed, 92 insertions(+), 1 deletion(-) diff --git a/api/dify_graph/nodes/document_extractor/node.py b/api/dify_graph/nodes/document_extractor/node.py index 5945e57926..e1ae6f0199 100644 --- a/api/dify_graph/nodes/document_extractor/node.py +++ b/api/dify_graph/nodes/document_extractor/node.py @@ -4,6 +4,7 @@ import json import logging import os import tempfile +import zipfile from collections.abc import Mapping, Sequence from typing import TYPE_CHECKING, Any @@ -385,6 +386,32 @@ def parser_docx_part(block, doc: Document, content_items, i): content_items.append((i, "table", Table(block, doc))) +def _normalize_docx_zip(file_content: bytes) -> bytes: + """ + Some DOCX files (e.g. exported by Evernote on Windows) are malformed: + ZIP entry names use backslash (\\) as path separator instead of the forward + slash (/) required by both the ZIP spec and OOXML. On Linux/Mac the entry + "word\\document.xml" is never found when python-docx looks for + "word/document.xml", which triggers a KeyError about a missing relationship. + + This function rewrites the ZIP in-memory, normalizing all entry names to + use forward slashes without touching any actual document content. + """ + try: + with zipfile.ZipFile(io.BytesIO(file_content), "r") as zin: + out_buf = io.BytesIO() + with zipfile.ZipFile(out_buf, "w", compression=zipfile.ZIP_DEFLATED) as zout: + for item in zin.infolist(): + data = zin.read(item.filename) + # Normalize backslash path separators to forward slash + item.filename = item.filename.replace("\\", "/") + zout.writestr(item, data) + return out_buf.getvalue() + except zipfile.BadZipFile: + # Not a valid zip — return as-is and let python-docx report the real error + return file_content + + def _extract_text_from_docx(file_content: bytes) -> str: """ Extract text from a DOCX file. @@ -392,7 +419,15 @@ def _extract_text_from_docx(file_content: bytes) -> str: """ try: doc_file = io.BytesIO(file_content) - doc = docx.Document(doc_file) + try: + doc = docx.Document(doc_file) + except Exception as e: + logger.warning("Failed to parse DOCX, attempting to normalize ZIP entry paths: %s", e) + # Some DOCX files exported by tools like Evernote on Windows use + # backslash path separators in ZIP entries and/or single-quoted XML + # attributes, both of which break python-docx on Linux. Normalize and retry. + file_content = _normalize_docx_zip(file_content) + doc = docx.Document(io.BytesIO(file_content)) text = [] # Keep track of paragraph and table positions diff --git a/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py b/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py index 5e20b1e12f..abdf0954c4 100644 --- a/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py +++ b/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py @@ -16,6 +16,7 @@ from dify_graph.nodes.document_extractor.node import ( _extract_text_from_excel, _extract_text_from_pdf, _extract_text_from_plain_text, + _normalize_docx_zip, ) from dify_graph.variables import ArrayFileSegment from dify_graph.variables.segments import ArrayStringSegment @@ -385,3 +386,58 @@ def test_extract_text_from_excel_numeric_type_column(mock_excel_file): expected_manual = "| 1.0 | 1.1 |\n| --- | --- |\n| Test | Test |\n\n" assert expected_manual == result + + +def _make_docx_zip(use_backslash: bool) -> bytes: + """Helper to build a minimal in-memory DOCX zip. + + When use_backslash=True the ZIP entry names use backslash separators + (as produced by Evernote on Windows), otherwise forward slashes are used. + """ + import zipfile + + sep = "\\" if use_backslash else "/" + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_DEFLATED) as zf: + zf.writestr("[Content_Types].xml", b"") + zf.writestr(f"_rels{sep}.rels", b"") + zf.writestr(f"word{sep}document.xml", b"") + zf.writestr(f"word{sep}_rels{sep}document.xml.rels", b"") + return buf.getvalue() + + +def test_normalize_docx_zip_replaces_backslashes(): + """ZIP entries with backslash separators must be rewritten to forward slashes.""" + import zipfile + + malformed = _make_docx_zip(use_backslash=True) + fixed = _normalize_docx_zip(malformed) + + with zipfile.ZipFile(io.BytesIO(fixed)) as zf: + names = zf.namelist() + + assert "word/document.xml" in names + assert "word/_rels/document.xml.rels" in names + # No entry should contain a backslash after normalization + assert all("\\" not in name for name in names) + + +def test_normalize_docx_zip_leaves_forward_slash_unchanged(): + """ZIP entries that already use forward slashes must not be modified.""" + import zipfile + + normal = _make_docx_zip(use_backslash=False) + fixed = _normalize_docx_zip(normal) + + with zipfile.ZipFile(io.BytesIO(fixed)) as zf: + names = zf.namelist() + + assert "word/document.xml" in names + assert "word/_rels/document.xml.rels" in names + + +def test_normalize_docx_zip_returns_original_on_bad_zip(): + """Non-zip bytes must be returned as-is without raising.""" + garbage = b"not a zip file at all" + result = _normalize_docx_zip(garbage) + assert result == garbage