diff --git a/api/dify_graph/nodes/http_request/entities.py b/api/dify_graph/nodes/http_request/entities.py index f594d58ae6..da56dd0242 100644 --- a/api/dify_graph/nodes/http_request/entities.py +++ b/api/dify_graph/nodes/http_request/entities.py @@ -157,6 +157,25 @@ class Response: ): return False + # Known binary application types — trust Content-Type, skip heuristic byte-sampling + known_binary_subtypes = ( + "pdf", + "zip", + "gzip", + "x-gzip", + "octet-stream", + "msword", + "vnd.openxmlformats", + "vnd.ms-excel", + "vnd.ms-powerpoint", + "x-tar", + "x-rar", + "x-7z-compressed", + "wasm", + ) + if any(subtype in content_type for subtype in known_binary_subtypes): + return True + # Try to detect if content is text-based by sampling first few bytes try: # Sample first 1024 bytes for text detection diff --git a/api/tests/unit_tests/core/workflow/nodes/http_request/test_entities.py b/api/tests/unit_tests/core/workflow/nodes/http_request/test_entities.py index fec6ad90eb..a21acc1d75 100644 --- a/api/tests/unit_tests/core/workflow/nodes/http_request/test_entities.py +++ b/api/tests/unit_tests/core/workflow/nodes/http_request/test_entities.py @@ -231,3 +231,40 @@ def test_text_property_with_escaped_unicode(mock_response, json_content, descrip # The text should be valid JSON that can be parsed back to proper Unicode parsed = json.loads(response.text) assert isinstance(parsed, dict), f"Invalid JSON for {description}" + + +@pytest.mark.parametrize( + "content_type", + [ + "application/pdf", + "application/zip", + "application/gzip", + "application/x-gzip", + "application/octet-stream", + "application/msword", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/vnd.ms-excel", + "application/vnd.ms-powerpoint", + "application/x-tar", + "application/x-rar", + "application/x-7z-compressed", + "application/wasm", + ], +) +def test_known_binary_types_skip_heuristic(mock_response, content_type): + """Test that known binary MIME types are identified as files even when content is UTF-8 decodable. + + Regression test for https://github.com/langgenius/dify/issues/33897 + PDFs without a binary marker comment (e.g. Copper PDF) have UTF-8-decodable content + containing '<' in dictionary syntax (<< /Type /Catalog), which previously triggered + the text_markers heuristic and caused the response to be misidentified as text. + """ + # Simulate a Copper PDF without binary marker — valid UTF-8, contains '<' + pdf_content = b"%PDF-1.5\r\n1 0 obj\r\n<< /Type /Catalog /Pages 2 0 R >>\r\nendobj\r\n" + mock_response.headers = {"content-type": content_type} + type(mock_response).content = PropertyMock(return_value=pdf_content) + response = Response(mock_response) + assert response.is_file, ( + f"Known binary type {content_type} should be identified as a file " + "even when content is UTF-8 decodable and contains text markers" + )