fix: trust Content-Type for known binary MIME types in HTTP Request node

PDFs generated by Copper PDF lack the optional binary marker comment on line 2, so every byte in the first 1024-byte sample is valid UTF-8. The `<` character in PDF dictionary syntax (`<< /Type /Catalog`) then matches the text_markers heuristic, causing `is_file` to return False, so the response is treated as text instead of a file.

For well-known binary application subtypes (pdf, zip, gzip, x-gzip, octet-stream, msword, vnd.openxmlformats, vnd.ms-excel, vnd.ms-powerpoint, x-tar, x-rar, x-7z-compressed, wasm), skip the heuristic byte-sampling entirely and trust the Content-Type header.

Closes langgenius/dify#33897

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: gambletan <tan@echooo.com>
2026-03-23 16:50:14 +08:00 · 2026-03-23 16:50:14 +08:00 · 2b06b7dce1
parent 35caa04fe7
commit 2b06b7dce1
2 changed files with 56 additions and 0 deletions
--- a/api/dify_graph/nodes/http_request/entities.py
+++ b/api/dify_graph/nodes/http_request/entities.py
@ -155,6 +155,25 @@ class Response:
            ):
                return False

+            # Known binary application types — trust Content-Type, skip heuristic byte-sampling
+            known_binary_subtypes = (
+                "pdf",
+                "zip",
+                "gzip",
+                "x-gzip",
+                "octet-stream",
+                "msword",
+                "vnd.openxmlformats",
+                "vnd.ms-excel",
+                "vnd.ms-powerpoint",
+                "x-tar",
+                "x-rar",
+                "x-7z-compressed",
+                "wasm",
+            )
+            if any(subtype in content_type for subtype in known_binary_subtypes):
+                return True
+
            # Try to detect if content is text-based by sampling first few bytes
            try:
                # Sample first 1024 bytes for text detection
--- a/api/tests/unit_tests/core/workflow/nodes/http_request/test_entities.py
+++ b/api/tests/unit_tests/core/workflow/nodes/http_request/test_entities.py
@ -231,3 +231,40 @@ def test_text_property_with_escaped_unicode(mock_response, json_content, descrip
    # The text should be valid JSON that can be parsed back to proper Unicode
    parsed = json.loads(response.text)
    assert isinstance(parsed, dict), f"Invalid JSON for {description}"
+
+
+@pytest.mark.parametrize(
+    "content_type",
+    [
+        "application/pdf",
+        "application/zip",
+        "application/gzip",
+        "application/x-gzip",
+        "application/octet-stream",
+        "application/msword",
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        "application/vnd.ms-excel",
+        "application/vnd.ms-powerpoint",
+        "application/x-tar",
+        "application/x-rar",
+        "application/x-7z-compressed",
+        "application/wasm",
+    ],
+)
+def test_known_binary_types_skip_heuristic(mock_response, content_type):
+    """Test that known binary MIME types are identified as files even when content is UTF-8 decodable.
+
+    Regression test for https://github.com/langgenius/dify/issues/33897
+    PDFs without a binary marker comment (e.g. Copper PDF) have UTF-8-decodable content
+    containing '<' in dictionary syntax (<< /Type /Catalog), which previously triggered
+    the text_markers heuristic and caused the response to be misidentified as text.
+    """
+    # Simulate a Copper PDF without binary marker — valid UTF-8, contains '<'
+    pdf_content = b"%PDF-1.5\r\n1 0 obj\r\n<< /Type /Catalog /Pages 2 0 R >>\r\nendobj\r\n"
+    mock_response.headers = {"content-type": content_type}
+    type(mock_response).content = PropertyMock(return_value=pdf_content)
+    response = Response(mock_response)
+    assert response.is_file, (
+        f"Known binary type {content_type} should be identified as a file "
+        "even when content is UTF-8 decodable and contains text markers"
+    )