feat(firecrawl): follow pagination when crawl status is completed (#33864)

Co-authored-by: Crazywoola <100913391+crazywoola@users.noreply.github.com>
This commit is contained in:
kurokobo 2026-03-23 22:19:32 +09:00 committed by GitHub
parent 30dd36505c
commit 30deeb6f1c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 112 additions and 8 deletions

View File

@ -95,15 +95,11 @@ class FirecrawlApp:
if response.status_code == 200:
crawl_status_response = response.json()
if crawl_status_response.get("status") == "completed":
total = crawl_status_response.get("total", 0)
if total == 0:
# Normalize to avoid None bypassing the zero-guard when the API returns null.
total = crawl_status_response.get("total") or 0
if total <= 0:
raise Exception("Failed to check crawl status. Error: No page found")
data = crawl_status_response.get("data", [])
url_data_list: list[FirecrawlDocumentData] = []
for item in data:
if isinstance(item, dict) and "metadata" in item and "markdown" in item:
url_data = self._extract_common_fields(item)
url_data_list.append(url_data)
url_data_list = self._collect_all_crawl_pages(crawl_status_response, headers)
if url_data_list:
file_key = "website_files/" + job_id + ".txt"
try:
@ -120,6 +116,36 @@ class FirecrawlApp:
self._handle_error(response, "check crawl status")
raise RuntimeError("unreachable: _handle_error always raises")
def _collect_all_crawl_pages(
    self, first_page: dict[str, Any], headers: dict[str, str]
) -> list[FirecrawlDocumentData]:
    """Follow pagination links and gather documents from every crawl result page.

    A failed paginated request raises (via ``_handle_error``), so callers never
    receive partial data that disagrees with the server-reported total. As a
    safety net against a misbehaving server that keeps emitting a ``next`` URL,
    at most ``total`` pages are ever processed.
    """
    # Normalize a null total to 0 so the cap comparison stays well-defined.
    page_cap: int = first_page.get("total") or 0
    collected: list[FirecrawlDocumentData] = []
    page = first_page
    fetched = 0
    while True:
        # Only well-formed entries (dicts carrying metadata + markdown) are kept.
        collected.extend(
            self._extract_common_fields(entry)
            for entry in page.get("data", [])
            if isinstance(entry, dict) and "metadata" in entry and "markdown" in entry
        )
        fetched += 1
        link: str | None = page.get("next")
        # Stop on the last page, or once the cap is hit (infinite-loop guard).
        if not link or fetched >= page_cap:
            break
        resp = self._get_request(link, headers)
        if resp.status_code != 200:
            # Raises; no partial result escapes this method.
            self._handle_error(resp, "fetch next crawl page")
        page = resp.json()
    return collected
def _format_crawl_status_response(
self,
status: str,

View File

@ -164,6 +164,13 @@ class TestFirecrawlApp:
with pytest.raises(Exception, match="No page found"):
app.check_crawl_status("job-1")
def test_check_crawl_status_completed_with_null_total_raises(self, mocker: MockerFixture):
    """A completed crawl whose ``total`` is null must be rejected like a zero-page crawl."""
    app = FirecrawlApp(api_key="fc-key", base_url="https://custom.firecrawl.dev")
    body = {"status": "completed", "total": None, "data": []}
    mocker.patch("httpx.get", return_value=_response(200, body))
    with pytest.raises(Exception, match="No page found"):
        app.check_crawl_status("job-1")
def test_check_crawl_status_non_completed(self, mocker: MockerFixture):
app = FirecrawlApp(api_key="fc-key", base_url="https://custom.firecrawl.dev")
payload = {"status": "processing", "total": 5, "completed": 1, "data": []}
@ -203,6 +210,77 @@ class TestFirecrawlApp:
with pytest.raises(Exception, match="Error saving crawl data"):
app.check_crawl_status("job-err")
def test_check_crawl_status_follows_pagination(self, mocker: MockerFixture):
    """When status is completed and next is present, follow pagination to collect all pages."""
    app = FirecrawlApp(api_key="fc-key", base_url="https://custom.firecrawl.dev")

    def make_page(idx: int, next_url: str | None) -> dict:
        # One document per page; p1/p2/p3 let the assertion check ordering.
        page: dict = {
            "status": "completed",
            "total": 3,
            "completed": 3,
            "data": [
                {
                    "metadata": {"title": f"p{idx}", "description": "", "sourceURL": f"https://p{idx}"},
                    "markdown": f"m{idx}",
                }
            ],
        }
        if next_url is not None:
            page["next"] = next_url
        return page

    pages = [
        make_page(1, "https://custom.firecrawl.dev/v2/crawl/job-42?skip=1"),
        make_page(2, "https://custom.firecrawl.dev/v2/crawl/job-42?skip=2"),
        make_page(3, None),  # final page carries no "next" link
    ]
    mocker.patch("httpx.get", side_effect=[_response(200, p) for p in pages])
    storage_stub = MagicMock()
    storage_stub.exists.return_value = False
    mocker.patch.object(firecrawl_module, "storage", storage_stub)

    outcome = app.check_crawl_status("job-42")

    assert outcome["status"] == "completed"
    assert outcome["total"] == 3
    assert len(outcome["data"]) == 3
    assert [entry["title"] for entry in outcome["data"]] == ["p1", "p2", "p3"]
def test_check_crawl_status_pagination_error_raises(self, mocker: MockerFixture):
    """An error while fetching a paginated page raises an exception; no partial data is returned."""
    app = FirecrawlApp(api_key="fc-key", base_url="https://custom.firecrawl.dev")
    first_page = {
        "status": "completed",
        "total": 2,
        "completed": 2,
        "next": "https://custom.firecrawl.dev/v2/crawl/job-99?skip=1",
        "data": [{"metadata": {"title": "p1", "description": "", "sourceURL": "https://p1"}, "markdown": "m1"}],
    }
    # First fetch succeeds; the paginated follow-up fails with a 500.
    replies = [_response(200, first_page), _response(500, {"error": "server error"})]
    mocker.patch("httpx.get", side_effect=replies)
    with pytest.raises(Exception, match="fetch next crawl page"):
        app.check_crawl_status("job-99")
def test_check_crawl_status_pagination_capped_at_total(self, mocker: MockerFixture):
    """Pagination stops once pages_processed reaches total, even if next is present."""
    app = FirecrawlApp(api_key="fc-key", base_url="https://custom.firecrawl.dev")
    # total=1 with a dangling "next": the cap must prevent a second fetch.
    sole_page = {
        "status": "completed",
        "total": 1,
        "completed": 1,
        "next": "https://custom.firecrawl.dev/v2/crawl/job-cap?skip=1",
        "data": [{"metadata": {"title": "p1", "description": "", "sourceURL": "https://p1"}, "markdown": "m1"}],
    }
    get_spy = mocker.patch("httpx.get", return_value=_response(200, sole_page))
    storage_stub = MagicMock()
    storage_stub.exists.return_value = False
    mocker.patch.object(firecrawl_module, "storage", storage_stub)

    outcome = app.check_crawl_status("job-cap")

    assert len(outcome["data"]) == 1
    # Only the initial status fetch happened; the next URL was never followed.
    get_spy.assert_called_once()
def test_extract_common_fields_and_status_formatter(self):
app = FirecrawlApp(api_key="fc-key", base_url="https://custom.firecrawl.dev")