From 2b06b7dce1ebd067aecbce17b4bc7e8c41a82f6a Mon Sep 17 00:00:00 2001 From: gambletan Date: Mon, 23 Mar 2026 16:50:14 +0800 Subject: [PATCH] fix: trust Content-Type for known binary MIME types in HTTP Request node PDFs generated by Copper PDF lack the optional binary marker comment on line 2, so all bytes in the first 1024-byte sample are valid UTF-8. The `<` character in PDF dictionary syntax (`<< /Type /Catalog`) then matches the text_markers heuristic, causing `is_file` to return False and the response to be treated as text instead of a file. For well-known binary application subtypes (pdf, zip, gzip, x-gzip, octet-stream, msword, vnd.openxmlformats, vnd.ms-excel, vnd.ms-powerpoint, x-tar, x-rar, x-7z-compressed, wasm), skip the heuristic byte-sampling entirely and trust the Content-Type header. Closes langgenius/dify#33897 Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: gambletan --- api/dify_graph/nodes/http_request/entities.py | 19 ++++++++++ .../nodes/http_request/test_entities.py | 37 +++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/api/dify_graph/nodes/http_request/entities.py b/api/dify_graph/nodes/http_request/entities.py index a5564689f8..986fb80dac 100644 --- a/api/dify_graph/nodes/http_request/entities.py +++ b/api/dify_graph/nodes/http_request/entities.py @@ -155,6 +155,25 @@ class Response: ): return False + # Known binary application types — trust Content-Type, skip heuristic byte-sampling + known_binary_subtypes = ( + "pdf", + "zip", + "gzip", + "x-gzip", + "octet-stream", + "msword", + "vnd.openxmlformats", + "vnd.ms-excel", + "vnd.ms-powerpoint", + "x-tar", + "x-rar", + "x-7z-compressed", + "wasm", + ) + if any(subtype in content_type for subtype in known_binary_subtypes): + return True + # Try to detect if content is text-based by sampling first few bytes try: # Sample first 1024 bytes for text detection diff --git a/api/tests/unit_tests/core/workflow/nodes/http_request/test_entities.py 
b/api/tests/unit_tests/core/workflow/nodes/http_request/test_entities.py index fec6ad90eb..a21acc1d75 100644 --- a/api/tests/unit_tests/core/workflow/nodes/http_request/test_entities.py +++ b/api/tests/unit_tests/core/workflow/nodes/http_request/test_entities.py @@ -231,3 +231,40 @@ def test_text_property_with_escaped_unicode(mock_response, json_content, descrip # The text should be valid JSON that can be parsed back to proper Unicode parsed = json.loads(response.text) assert isinstance(parsed, dict), f"Invalid JSON for {description}" + + +@pytest.mark.parametrize( + "content_type", + [ + "application/pdf", + "application/zip", + "application/gzip", + "application/x-gzip", + "application/octet-stream", + "application/msword", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/vnd.ms-excel", + "application/vnd.ms-powerpoint", + "application/x-tar", + "application/x-rar", + "application/x-7z-compressed", + "application/wasm", + ], +) +def test_known_binary_types_skip_heuristic(mock_response, content_type): + """Test that known binary MIME types are identified as files even when content is UTF-8 decodable. + + Regression test for https://github.com/langgenius/dify/issues/33897 + PDFs without a binary marker comment (e.g. Copper PDF) have UTF-8-decodable content + containing '<' in dictionary syntax (<< /Type /Catalog), which previously triggered + the text_markers heuristic and caused the response to be misidentified as text. + """ + # Simulate a Copper PDF without binary marker — valid UTF-8, contains '<' + pdf_content = b"%PDF-1.5\r\n1 0 obj\r\n<< /Type /Catalog /Pages 2 0 R >>\r\nendobj\r\n" + mock_response.headers = {"content-type": content_type} + type(mock_response).content = PropertyMock(return_value=pdf_content) + response = Response(mock_response) + assert response.is_file, ( + f"Known binary type {content_type} should be identified as a file " + "even when content is UTF-8 decodable and contains text markers" + )