mirror of https://github.com/langgenius/dify.git
feat(firecrawl): follow pagination when crawl status is completed (#33864)
Co-authored-by: Crazywoola <100913391+crazywoola@users.noreply.github.com>
This commit is contained in:
parent
30dd36505c
commit
30deeb6f1c
|
|
@ -95,15 +95,11 @@ class FirecrawlApp:
|
|||
if response.status_code == 200:
|
||||
crawl_status_response = response.json()
|
||||
if crawl_status_response.get("status") == "completed":
|
||||
total = crawl_status_response.get("total", 0)
|
||||
if total == 0:
|
||||
# Normalize to avoid None bypassing the zero-guard when the API returns null.
|
||||
total = crawl_status_response.get("total") or 0
|
||||
if total <= 0:
|
||||
raise Exception("Failed to check crawl status. Error: No page found")
|
||||
data = crawl_status_response.get("data", [])
|
||||
url_data_list: list[FirecrawlDocumentData] = []
|
||||
for item in data:
|
||||
if isinstance(item, dict) and "metadata" in item and "markdown" in item:
|
||||
url_data = self._extract_common_fields(item)
|
||||
url_data_list.append(url_data)
|
||||
url_data_list = self._collect_all_crawl_pages(crawl_status_response, headers)
|
||||
if url_data_list:
|
||||
file_key = "website_files/" + job_id + ".txt"
|
||||
try:
|
||||
|
|
@ -120,6 +116,36 @@ class FirecrawlApp:
|
|||
self._handle_error(response, "check crawl status")
|
||||
raise RuntimeError("unreachable: _handle_error always raises")
|
||||
|
||||
def _collect_all_crawl_pages(
|
||||
self, first_page: dict[str, Any], headers: dict[str, str]
|
||||
) -> list[FirecrawlDocumentData]:
|
||||
"""Collect all crawl result pages by following pagination links.
|
||||
|
||||
Raises an exception if any paginated request fails, to avoid returning
|
||||
partial data that is inconsistent with the reported total.
|
||||
|
||||
The number of pages processed is capped at ``total`` (the
|
||||
server-reported page count) to guard against infinite loops caused by
|
||||
a misbehaving server that keeps returning a ``next`` URL.
|
||||
"""
|
||||
total: int = first_page.get("total") or 0
|
||||
url_data_list: list[FirecrawlDocumentData] = []
|
||||
current_page = first_page
|
||||
pages_processed = 0
|
||||
while True:
|
||||
for item in current_page.get("data", []):
|
||||
if isinstance(item, dict) and "metadata" in item and "markdown" in item:
|
||||
url_data_list.append(self._extract_common_fields(item))
|
||||
next_url: str | None = current_page.get("next")
|
||||
pages_processed += 1
|
||||
if not next_url or pages_processed >= total:
|
||||
break
|
||||
response = self._get_request(next_url, headers)
|
||||
if response.status_code != 200:
|
||||
self._handle_error(response, "fetch next crawl page")
|
||||
current_page = response.json()
|
||||
return url_data_list
|
||||
|
||||
def _format_crawl_status_response(
|
||||
self,
|
||||
status: str,
|
||||
|
|
|
|||
|
|
@ -164,6 +164,13 @@ class TestFirecrawlApp:
|
|||
with pytest.raises(Exception, match="No page found"):
|
||||
app.check_crawl_status("job-1")
|
||||
|
||||
def test_check_crawl_status_completed_with_null_total_raises(self, mocker: MockerFixture):
|
||||
app = FirecrawlApp(api_key="fc-key", base_url="https://custom.firecrawl.dev")
|
||||
mocker.patch("httpx.get", return_value=_response(200, {"status": "completed", "total": None, "data": []}))
|
||||
|
||||
with pytest.raises(Exception, match="No page found"):
|
||||
app.check_crawl_status("job-1")
|
||||
|
||||
def test_check_crawl_status_non_completed(self, mocker: MockerFixture):
|
||||
app = FirecrawlApp(api_key="fc-key", base_url="https://custom.firecrawl.dev")
|
||||
payload = {"status": "processing", "total": 5, "completed": 1, "data": []}
|
||||
|
|
@ -203,6 +210,77 @@ class TestFirecrawlApp:
|
|||
with pytest.raises(Exception, match="Error saving crawl data"):
|
||||
app.check_crawl_status("job-err")
|
||||
|
||||
def test_check_crawl_status_follows_pagination(self, mocker: MockerFixture):
|
||||
"""When status is completed and next is present, follow pagination to collect all pages."""
|
||||
app = FirecrawlApp(api_key="fc-key", base_url="https://custom.firecrawl.dev")
|
||||
page1 = {
|
||||
"status": "completed",
|
||||
"total": 3,
|
||||
"completed": 3,
|
||||
"next": "https://custom.firecrawl.dev/v2/crawl/job-42?skip=1",
|
||||
"data": [{"metadata": {"title": "p1", "description": "", "sourceURL": "https://p1"}, "markdown": "m1"}],
|
||||
}
|
||||
page2 = {
|
||||
"status": "completed",
|
||||
"total": 3,
|
||||
"completed": 3,
|
||||
"next": "https://custom.firecrawl.dev/v2/crawl/job-42?skip=2",
|
||||
"data": [{"metadata": {"title": "p2", "description": "", "sourceURL": "https://p2"}, "markdown": "m2"}],
|
||||
}
|
||||
page3 = {
|
||||
"status": "completed",
|
||||
"total": 3,
|
||||
"completed": 3,
|
||||
"data": [{"metadata": {"title": "p3", "description": "", "sourceURL": "https://p3"}, "markdown": "m3"}],
|
||||
}
|
||||
mocker.patch("httpx.get", side_effect=[_response(200, page1), _response(200, page2), _response(200, page3)])
|
||||
mock_storage = MagicMock()
|
||||
mock_storage.exists.return_value = False
|
||||
mocker.patch.object(firecrawl_module, "storage", mock_storage)
|
||||
|
||||
result = app.check_crawl_status("job-42")
|
||||
|
||||
assert result["status"] == "completed"
|
||||
assert result["total"] == 3
|
||||
assert len(result["data"]) == 3
|
||||
assert [d["title"] for d in result["data"]] == ["p1", "p2", "p3"]
|
||||
|
||||
def test_check_crawl_status_pagination_error_raises(self, mocker: MockerFixture):
|
||||
"""An error while fetching a paginated page raises an exception; no partial data is returned."""
|
||||
app = FirecrawlApp(api_key="fc-key", base_url="https://custom.firecrawl.dev")
|
||||
page1 = {
|
||||
"status": "completed",
|
||||
"total": 2,
|
||||
"completed": 2,
|
||||
"next": "https://custom.firecrawl.dev/v2/crawl/job-99?skip=1",
|
||||
"data": [{"metadata": {"title": "p1", "description": "", "sourceURL": "https://p1"}, "markdown": "m1"}],
|
||||
}
|
||||
mocker.patch("httpx.get", side_effect=[_response(200, page1), _response(500, {"error": "server error"})])
|
||||
|
||||
with pytest.raises(Exception, match="fetch next crawl page"):
|
||||
app.check_crawl_status("job-99")
|
||||
|
||||
def test_check_crawl_status_pagination_capped_at_total(self, mocker: MockerFixture):
|
||||
"""Pagination stops once pages_processed reaches total, even if next is present."""
|
||||
app = FirecrawlApp(api_key="fc-key", base_url="https://custom.firecrawl.dev")
|
||||
# total=1: only the first page should be processed; next must not be followed
|
||||
page1 = {
|
||||
"status": "completed",
|
||||
"total": 1,
|
||||
"completed": 1,
|
||||
"next": "https://custom.firecrawl.dev/v2/crawl/job-cap?skip=1",
|
||||
"data": [{"metadata": {"title": "p1", "description": "", "sourceURL": "https://p1"}, "markdown": "m1"}],
|
||||
}
|
||||
mock_get = mocker.patch("httpx.get", return_value=_response(200, page1))
|
||||
mock_storage = MagicMock()
|
||||
mock_storage.exists.return_value = False
|
||||
mocker.patch.object(firecrawl_module, "storage", mock_storage)
|
||||
|
||||
result = app.check_crawl_status("job-cap")
|
||||
|
||||
assert len(result["data"]) == 1
|
||||
mock_get.assert_called_once() # initial fetch only; next URL is not followed due to cap
|
||||
|
||||
def test_extract_common_fields_and_status_formatter(self):
|
||||
app = FirecrawlApp(api_key="fc-key", base_url="https://custom.firecrawl.dev")
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue