mirror of https://github.com/langgenius/dify.git
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent
2cc0de9c1b
commit
322cd37de1
|
|
@ -4,6 +4,7 @@ import json
|
|||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
import zipfile
|
||||
from collections.abc import Mapping, Sequence
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
|
|
@ -385,6 +386,32 @@ def parser_docx_part(block, doc: Document, content_items, i):
|
|||
content_items.append((i, "table", Table(block, doc)))
|
||||
|
||||
|
||||
def _normalize_docx_zip(file_content: bytes) -> bytes:
|
||||
"""
|
||||
Some DOCX files (e.g. exported by Evernote on Windows) are malformed:
|
||||
ZIP entry names use backslash (\\) as path separator instead of the forward
|
||||
slash (/) required by both the ZIP spec and OOXML. On Linux/Mac the entry
|
||||
"word\\document.xml" is never found when python-docx looks for
|
||||
"word/document.xml", which triggers a KeyError about a missing relationship.
|
||||
|
||||
This function rewrites the ZIP in-memory, normalizing all entry names to
|
||||
use forward slashes without touching any actual document content.
|
||||
"""
|
||||
try:
|
||||
with zipfile.ZipFile(io.BytesIO(file_content), "r") as zin:
|
||||
out_buf = io.BytesIO()
|
||||
with zipfile.ZipFile(out_buf, "w", compression=zipfile.ZIP_DEFLATED) as zout:
|
||||
for item in zin.infolist():
|
||||
data = zin.read(item.filename)
|
||||
# Normalize backslash path separators to forward slash
|
||||
item.filename = item.filename.replace("\\", "/")
|
||||
zout.writestr(item, data)
|
||||
return out_buf.getvalue()
|
||||
except zipfile.BadZipFile:
|
||||
# Not a valid zip — return as-is and let python-docx report the real error
|
||||
return file_content
|
||||
|
||||
|
||||
def _extract_text_from_docx(file_content: bytes) -> str:
|
||||
"""
|
||||
Extract text from a DOCX file.
|
||||
|
|
@ -392,7 +419,15 @@ def _extract_text_from_docx(file_content: bytes) -> str:
|
|||
"""
|
||||
try:
|
||||
doc_file = io.BytesIO(file_content)
|
||||
doc = docx.Document(doc_file)
|
||||
try:
|
||||
doc = docx.Document(doc_file)
|
||||
except Exception as e:
|
||||
logger.warning("Failed to parse DOCX, attempting to normalize ZIP entry paths: %s", e)
|
||||
# Some DOCX files exported by tools like Evernote on Windows use
|
||||
# backslash path separators in ZIP entries and/or single-quoted XML
|
||||
# attributes, both of which break python-docx on Linux. Normalize and retry.
|
||||
file_content = _normalize_docx_zip(file_content)
|
||||
doc = docx.Document(io.BytesIO(file_content))
|
||||
text = []
|
||||
|
||||
# Keep track of paragraph and table positions
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ from dify_graph.nodes.document_extractor.node import (
|
|||
_extract_text_from_excel,
|
||||
_extract_text_from_pdf,
|
||||
_extract_text_from_plain_text,
|
||||
_normalize_docx_zip,
|
||||
)
|
||||
from dify_graph.variables import ArrayFileSegment
|
||||
from dify_graph.variables.segments import ArrayStringSegment
|
||||
|
|
@ -385,3 +386,58 @@ def test_extract_text_from_excel_numeric_type_column(mock_excel_file):
|
|||
expected_manual = "| 1.0 | 1.1 |\n| --- | --- |\n| Test | Test |\n\n"
|
||||
|
||||
assert expected_manual == result
|
||||
|
||||
|
||||
def _make_docx_zip(use_backslash: bool) -> bytes:
|
||||
"""Helper to build a minimal in-memory DOCX zip.
|
||||
|
||||
When use_backslash=True the ZIP entry names use backslash separators
|
||||
(as produced by Evernote on Windows), otherwise forward slashes are used.
|
||||
"""
|
||||
import zipfile
|
||||
|
||||
sep = "\\" if use_backslash else "/"
|
||||
buf = io.BytesIO()
|
||||
with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:
|
||||
zf.writestr("[Content_Types].xml", b"<Types/>")
|
||||
zf.writestr(f"_rels{sep}.rels", b"<Relationships/>")
|
||||
zf.writestr(f"word{sep}document.xml", b"<w:document/>")
|
||||
zf.writestr(f"word{sep}_rels{sep}document.xml.rels", b"<Relationships/>")
|
||||
return buf.getvalue()
|
||||
|
||||
|
||||
def test_normalize_docx_zip_replaces_backslashes():
|
||||
"""ZIP entries with backslash separators must be rewritten to forward slashes."""
|
||||
import zipfile
|
||||
|
||||
malformed = _make_docx_zip(use_backslash=True)
|
||||
fixed = _normalize_docx_zip(malformed)
|
||||
|
||||
with zipfile.ZipFile(io.BytesIO(fixed)) as zf:
|
||||
names = zf.namelist()
|
||||
|
||||
assert "word/document.xml" in names
|
||||
assert "word/_rels/document.xml.rels" in names
|
||||
# No entry should contain a backslash after normalization
|
||||
assert all("\\" not in name for name in names)
|
||||
|
||||
|
||||
def test_normalize_docx_zip_leaves_forward_slash_unchanged():
|
||||
"""ZIP entries that already use forward slashes must not be modified."""
|
||||
import zipfile
|
||||
|
||||
normal = _make_docx_zip(use_backslash=False)
|
||||
fixed = _normalize_docx_zip(normal)
|
||||
|
||||
with zipfile.ZipFile(io.BytesIO(fixed)) as zf:
|
||||
names = zf.namelist()
|
||||
|
||||
assert "word/document.xml" in names
|
||||
assert "word/_rels/document.xml.rels" in names
|
||||
|
||||
|
||||
def test_normalize_docx_zip_returns_original_on_bad_zip():
|
||||
"""Non-zip bytes must be returned as-is without raising."""
|
||||
garbage = b"not a zip file at all"
|
||||
result = _normalize_docx_zip(garbage)
|
||||
assert result == garbage
|
||||
|
|
|
|||
Loading…
Reference in New Issue