fix: trust Content-Type for known binary MIME types in HTTP Request node

PDFs generated by Copper PDF lack the optional binary marker comment on
line 2, so all bytes in the first 1024-byte sample are valid UTF-8.  The
`<` character in PDF dictionary syntax (`<< /Type /Catalog`) then matches
the text_markers heuristic, causing `is_file` to return False and the
response to be treated as text instead of a file.

For well-known binary application subtypes (pdf, zip, gzip, x-gzip,
octet-stream, msword, vnd.openxmlformats, vnd.ms-excel, vnd.ms-powerpoint,
x-tar, x-rar, x-7z-compressed, wasm), skip the heuristic byte-sampling
entirely and trust the Content-Type header.

Closes langgenius/dify#33897

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: gambletan <tan@echooo.com>
This commit is contained in:
gambletan 2026-03-23 16:50:14 +08:00
parent 35caa04fe7
commit 2b06b7dce1
2 changed files with 56 additions and 0 deletions

View File

@ -155,6 +155,25 @@ class Response:
):
return False
# Known binary application types — trust Content-Type, skip heuristic byte-sampling
known_binary_subtypes = (
"pdf",
"zip",
"gzip",
"x-gzip",
"octet-stream",
"msword",
"vnd.openxmlformats",
"vnd.ms-excel",
"vnd.ms-powerpoint",
"x-tar",
"x-rar",
"x-7z-compressed",
"wasm",
)
if any(subtype in content_type for subtype in known_binary_subtypes):
return True
# Try to detect if content is text-based by sampling first few bytes
try:
# Sample first 1024 bytes for text detection

View File

@ -231,3 +231,40 @@ def test_text_property_with_escaped_unicode(mock_response, json_content, descrip
# The text should be valid JSON that can be parsed back to proper Unicode
parsed = json.loads(response.text)
assert isinstance(parsed, dict), f"Invalid JSON for {description}"
@pytest.mark.parametrize(
    "content_type",
    [
        "application/pdf",
        "application/zip",
        "application/gzip",
        "application/x-gzip",
        "application/octet-stream",
        "application/msword",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-excel",
        "application/vnd.ms-powerpoint",
        "application/x-tar",
        "application/x-rar",
        "application/x-7z-compressed",
        "application/wasm",
    ],
)
def test_known_binary_types_skip_heuristic(mock_response, content_type):
    """Check that ``is_file`` is truthy for each known binary Content-Type.

    Regression test for https://github.com/langgenius/dify/issues/33897.

    The payload is a PDF fragment with no binary marker comment: every byte
    is valid UTF-8 and the dictionary syntax (``<< /Type /Catalog``) contains
    ``<``. For each parametrized Content-Type the response must still be
    reported as a file.
    """
    # UTF-8-decodable PDF fragment that contains '<' and no binary marker.
    utf8_safe_pdf = b"%PDF-1.5\r\n1 0 obj\r\n<< /Type /Catalog /Pages 2 0 R >>\r\nendobj\r\n"

    mock_response.headers = {"content-type": content_type}
    # ``content`` is patched on the mock's type as a property.
    type(mock_response).content = PropertyMock(return_value=utf8_safe_pdf)

    wrapped = Response(mock_response)

    failure_message = (
        f"Known binary type {content_type} should be identified as a file "
        "even when content is UTF-8 decodable and contains text markers"
    )
    assert wrapped.is_file, failure_message