Merge 2b06b7dce1 into defb982c3e

2026-03-24 23:02:13 +09:00 · 2026-03-24 23:02:13 +09:00 · f547238458
parent defb982c3e 2b06b7dce1
commit f547238458
2 changed files with 56 additions and 0 deletions
--- a/api/dify_graph/nodes/http_request/entities.py
+++ b/api/dify_graph/nodes/http_request/entities.py
@@ -157,6 +157,25 @@ class Response:
            ):
                return False

+            # Known binary application types — trust Content-Type, skip heuristic byte-sampling
+            known_binary_subtypes = (
+                "pdf",
+                "zip",
+                "gzip",
+                "x-gzip",
+                "octet-stream",
+                "msword",
+                "vnd.openxmlformats",
+                "vnd.ms-excel",
+                "vnd.ms-powerpoint",
+                "x-tar",
+                "x-rar",
+                "x-7z-compressed",
+                "wasm",
+            )
+            if any(subtype in content_type for subtype in known_binary_subtypes):
+                return True
+
            # Try to detect if content is text-based by sampling first few bytes
            try:
                # Sample first 1024 bytes for text detection
--- a/api/tests/unit_tests/core/workflow/nodes/http_request/test_entities.py
+++ b/api/tests/unit_tests/core/workflow/nodes/http_request/test_entities.py
@@ -231,3 +231,40 @@ def test_text_property_with_escaped_unicode(mock_response, json_content, descrip
    # The text should be valid JSON that can be parsed back to proper Unicode
    parsed = json.loads(response.text)
    assert isinstance(parsed, dict), f"Invalid JSON for {description}"
+
+
+@pytest.mark.parametrize(
+    "content_type",
+    [
+        "application/pdf",
+        "application/zip",
+        "application/gzip",
+        "application/x-gzip",
+        "application/octet-stream",
+        "application/msword",
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        "application/vnd.ms-excel",
+        "application/vnd.ms-powerpoint",
+        "application/x-tar",
+        "application/x-rar",
+        "application/x-7z-compressed",
+        "application/wasm",
+    ],
+)
+def test_known_binary_types_skip_heuristic(mock_response, content_type):
+    """Check that each known binary MIME type is reported as a file even when the body decodes as UTF-8.
+
+    Regression test for https://github.com/langgenius/dify/issues/33897
+    PDFs without a binary marker comment (e.g. Copper PDF) have UTF-8-decodable content
+    containing '<' in dictionary syntax (<< /Type /Catalog), which previously triggered
+    the text_markers heuristic and caused the response to be misidentified as text.
+    """
+    # Same text-like PDF body for every case (pure ASCII, contains '<'); only the Content-Type header varies
+    pdf_content = b"%PDF-1.5\r\n1 0 obj\r\n<< /Type /Catalog /Pages 2 0 R >>\r\nendobj\r\n"
+    mock_response.headers = {"content-type": content_type}
+    type(mock_response).content = PropertyMock(return_value=pdf_content)
+    response = Response(mock_response)
+    assert response.is_file, (
+        f"Known binary type {content_type} should be identified as a file "
+        "even when content is UTF-8 decodable and contains text markers"
+    )