From 9970f4449a93acd670a27cda1301c41ee17aec67 Mon Sep 17 00:00:00 2001 From: wangxiaolei Date: Mon, 9 Mar 2026 15:53:21 +0800 Subject: [PATCH] refactor: reuse redis connection instead of create new one (#32678) Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> --- api/schedule/queue_monitor_task.py | 4 + api/schedule/trigger_provider_refresh_task.py | 20 +- api/schedule/workflow_schedule_task.py | 34 +- api/tasks/document_indexing_task.py | 39 +- .../rag_pipeline/rag_pipeline_run_task.py | 38 +- .../tasks/test_dataset_indexing_task.py | 24 +- .../tasks/test_document_indexing_task.py | 27 +- .../tasks/test_rag_pipeline_run_tasks.py | 91 +- .../trigger/conftest.py | 12 +- .../tasks/test_dataset_indexing_task.py | 1183 ++++++++++++++++- 10 files changed, 1360 insertions(+), 112 deletions(-) diff --git a/api/schedule/queue_monitor_task.py b/api/schedule/queue_monitor_task.py index 77d6b5a138..01642e397e 100644 --- a/api/schedule/queue_monitor_task.py +++ b/api/schedule/queue_monitor_task.py @@ -21,6 +21,10 @@ celery_redis = Redis( ssl_cert_reqs=getattr(dify_config, "REDIS_SSL_CERT_REQS", None) if dify_config.BROKER_USE_SSL else None, ssl_certfile=getattr(dify_config, "REDIS_SSL_CERTFILE", None) if dify_config.BROKER_USE_SSL else None, ssl_keyfile=getattr(dify_config, "REDIS_SSL_KEYFILE", None) if dify_config.BROKER_USE_SSL else None, + # Add conservative socket timeouts and health checks to avoid long-lived half-open sockets + socket_timeout=5, + socket_connect_timeout=5, + health_check_interval=30, ) logger = logging.getLogger(__name__) diff --git a/api/schedule/trigger_provider_refresh_task.py b/api/schedule/trigger_provider_refresh_task.py index 3b3e478793..df5058d70a 100644 --- a/api/schedule/trigger_provider_refresh_task.py +++ b/api/schedule/trigger_provider_refresh_task.py @@ -3,6 +3,7 @@ import math import time from collections.abc import Iterable, Sequence +from celery import group from sqlalchemy import ColumnElement, and_, func, or_, select from sqlalchemy.engine.row import Row from sqlalchemy.orm import Session @@ -85,20 +86,25 @@ def trigger_provider_refresh() -> None: lock_keys: list[str] = build_trigger_refresh_lock_keys(subscriptions) acquired: list[bool] = _acquire_locks(keys=lock_keys, ttl_seconds=lock_ttl) - enqueued: int = 0 - for (tenant_id, subscription_id), is_locked in zip(subscriptions, acquired): - if not is_locked: - continue - trigger_subscription_refresh.delay(tenant_id=tenant_id, subscription_id=subscription_id) - enqueued += 1 + if not any(acquired): + continue + + jobs = [ + trigger_subscription_refresh.s(tenant_id=tenant_id, subscription_id=subscription_id) + for (tenant_id, subscription_id), is_locked in zip(subscriptions, acquired) + if is_locked + ] + result = group(jobs).apply_async() + enqueued = len(jobs) logger.info( - "Trigger refresh page %d/%d: scanned=%d locks_acquired=%d enqueued=%d", + "Trigger refresh page %d/%d: scanned=%d locks_acquired=%d enqueued=%d result=%s", page + 1, pages, len(subscriptions), sum(1 for x in acquired if x), enqueued, + result, ) logger.info("Trigger refresh scan done: due=%d", total_due) diff --git a/api/schedule/workflow_schedule_task.py b/api/schedule/workflow_schedule_task.py index d68b9565ec..2fee9e467d 100644 --- a/api/schedule/workflow_schedule_task.py +++ b/api/schedule/workflow_schedule_task.py @@ -1,6 +1,6 @@ import logging -from celery import group, shared_task +from celery import current_app, group, shared_task from sqlalchemy import and_, select from sqlalchemy.orm import Session, sessionmaker @@ -29,31 +29,27 @@ def poll_workflow_schedules() -> None: with session_factory() as session: total_dispatched = 0 - # Process in batches until we've handled all due schedules or hit the limit while True: due_schedules = _fetch_due_schedules(session) if not due_schedules: break - dispatched_count = _process_schedules(session, due_schedules) - total_dispatched += dispatched_count + with current_app.producer_or_acquire() as producer: # type: ignore + dispatched_count = _process_schedules(session, due_schedules, producer) + total_dispatched += dispatched_count - logger.debug("Batch processed: %d dispatched", dispatched_count) - - # Circuit breaker: check if we've hit the per-tick limit (if enabled) - if ( - dify_config.WORKFLOW_SCHEDULE_MAX_DISPATCH_PER_TICK > 0 - and total_dispatched >= dify_config.WORKFLOW_SCHEDULE_MAX_DISPATCH_PER_TICK - ): - logger.warning( - "Circuit breaker activated: reached dispatch limit (%d), will continue next tick", - dify_config.WORKFLOW_SCHEDULE_MAX_DISPATCH_PER_TICK, - ) - break + logger.debug("Batch processed: %d dispatched", dispatched_count) + # Circuit breaker: check if we've hit the per-tick limit (if enabled) + if 0 < dify_config.WORKFLOW_SCHEDULE_MAX_DISPATCH_PER_TICK <= total_dispatched: + logger.warning( + "Circuit breaker activated: reached dispatch limit (%d), will continue next tick", + dify_config.WORKFLOW_SCHEDULE_MAX_DISPATCH_PER_TICK, + ) + break if total_dispatched > 0: - logger.info("Total processed: %d dispatched", total_dispatched) + logger.info("Total processed: %d workflow schedule(s) dispatched", total_dispatched) def _fetch_due_schedules(session: Session) -> list[WorkflowSchedulePlan]: @@ -90,7 +86,7 @@ def _fetch_due_schedules(session: Session) -> list[WorkflowSchedulePlan]: return list(due_schedules) -def _process_schedules(session: Session, schedules: list[WorkflowSchedulePlan]) -> int: +def _process_schedules(session: Session, schedules: list[WorkflowSchedulePlan], producer=None) -> int: """Process schedules: check quota, update next run time and dispatch to Celery in parallel.""" if not schedules: return 0 @@ -107,7 +103,7 @@ def _process_schedules(session: Session, schedules: list[WorkflowSchedulePlan]) if tasks_to_dispatch: job = group(run_schedule_trigger.s(schedule_id) for schedule_id in tasks_to_dispatch) - job.apply_async() + job.apply_async(producer=producer) logger.debug("Dispatched %d tasks in parallel", len(tasks_to_dispatch)) diff --git a/api/tasks/document_indexing_task.py b/api/tasks/document_indexing_task.py index 11edcf151f..b3f36d8f44 100644 --- a/api/tasks/document_indexing_task.py +++ b/api/tasks/document_indexing_task.py @@ -1,9 +1,10 @@ import logging import time -from collections.abc import Callable, Sequence +from collections.abc import Sequence +from typing import Any, Protocol import click -from celery import shared_task +from celery import current_app, shared_task from configs import dify_config from core.db.session_factory import session_factory @@ -19,6 +20,12 @@ from tasks.generate_summary_index_task import generate_summary_index_task logger = logging.getLogger(__name__) +class CeleryTaskLike(Protocol): + def delay(self, *args: Any, **kwargs: Any) -> Any: ... + + def apply_async(self, *args: Any, **kwargs: Any) -> Any: ... + + @shared_task(queue="dataset") def document_indexing_task(dataset_id: str, document_ids: list): """ @@ -179,8 +186,8 @@ def _document_indexing(dataset_id: str, document_ids: Sequence[str]): def _document_indexing_with_tenant_queue( - tenant_id: str, dataset_id: str, document_ids: Sequence[str], task_func: Callable[[str, str, Sequence[str]], None] -): + tenant_id: str, dataset_id: str, document_ids: Sequence[str], task_func: CeleryTaskLike +) -> None: try: _document_indexing(dataset_id, document_ids) except Exception: @@ -201,16 +208,20 @@ def _document_indexing_with_tenant_queue( logger.info("document indexing tenant isolation queue %s next tasks: %s", tenant_id, next_tasks) if next_tasks: - for next_task in next_tasks: - document_task = DocumentTask(**next_task) - # Process the next waiting task - # Keep the flag set to indicate a task is running - tenant_isolated_task_queue.set_task_waiting_time() - task_func.delay( # type: ignore - tenant_id=document_task.tenant_id, - dataset_id=document_task.dataset_id, - document_ids=document_task.document_ids, - ) + with current_app.producer_or_acquire() as producer: # type: ignore + for next_task in next_tasks: + document_task = DocumentTask(**next_task) + # Keep the flag set to indicate a task is running + tenant_isolated_task_queue.set_task_waiting_time() + task_func.apply_async( + kwargs={ + "tenant_id": document_task.tenant_id, + "dataset_id": document_task.dataset_id, + "document_ids": document_task.document_ids, + }, + producer=producer, + ) + else: # No more waiting tasks, clear the flag tenant_isolated_task_queue.delete_task_key() diff --git a/api/tasks/rag_pipeline/rag_pipeline_run_task.py b/api/tasks/rag_pipeline/rag_pipeline_run_task.py index 093342d1a3..52f66dddb8 100644 --- a/api/tasks/rag_pipeline/rag_pipeline_run_task.py +++ b/api/tasks/rag_pipeline/rag_pipeline_run_task.py @@ -3,12 +3,13 @@ import json import logging import time import uuid -from collections.abc import Mapping +from collections.abc import Mapping, Sequence from concurrent.futures import ThreadPoolExecutor +from itertools import islice from typing import Any import click -from celery import shared_task # type: ignore +from celery import group, shared_task from flask import current_app, g from sqlalchemy.orm import Session, sessionmaker @@ -27,6 +28,11 @@ from services.file_service import FileService logger = logging.getLogger(__name__) +def chunked(iterable: Sequence, size: int): + it = iter(iterable) + return iter(lambda: list(islice(it, size)), []) + + @shared_task(queue="pipeline") def rag_pipeline_run_task( rag_pipeline_invoke_entities_file_id: str, @@ -83,16 +89,24 @@ def rag_pipeline_run_task( logger.info("rag pipeline tenant isolation queue %s next files: %s", tenant_id, next_file_ids) if next_file_ids: - for next_file_id in next_file_ids: - # Process the next waiting task - # Keep the flag set to indicate a task is running - tenant_isolated_task_queue.set_task_waiting_time() - rag_pipeline_run_task.delay( # type: ignore - rag_pipeline_invoke_entities_file_id=next_file_id.decode("utf-8") - if isinstance(next_file_id, bytes) - else next_file_id, - tenant_id=tenant_id, - ) + for batch in chunked(next_file_ids, 100): + jobs = [] + for next_file_id in batch: + tenant_isolated_task_queue.set_task_waiting_time() + + file_id = ( + next_file_id.decode("utf-8") if isinstance(next_file_id, (bytes, bytearray)) else next_file_id + ) + + jobs.append( + rag_pipeline_run_task.s( + rag_pipeline_invoke_entities_file_id=file_id, + tenant_id=tenant_id, + ) + ) + + if jobs: + group(jobs).apply_async() else: # No more waiting tasks, clear the flag tenant_isolated_task_queue.delete_task_key() diff --git a/api/tests/test_containers_integration_tests/tasks/test_dataset_indexing_task.py b/api/tests/test_containers_integration_tests/tasks/test_dataset_indexing_task.py index 207bdad751..4a62383590 100644 --- a/api/tests/test_containers_integration_tests/tasks/test_dataset_indexing_task.py +++ b/api/tests/test_containers_integration_tests/tasks/test_dataset_indexing_task.py @@ -322,11 +322,14 @@ class TestDatasetIndexingTaskIntegration: _document_indexing_with_tenant_queue(dataset.tenant_id, dataset.id, document_ids, task_dispatch_spy) # Assert - task_dispatch_spy.delay.assert_called_once_with( - tenant_id=next_task["tenant_id"], - dataset_id=next_task["dataset_id"], - document_ids=next_task["document_ids"], - ) + # apply_async is used by implementation; assert it was called once with expected kwargs + assert task_dispatch_spy.apply_async.call_count == 1 + call_kwargs = task_dispatch_spy.apply_async.call_args.kwargs.get("kwargs", {}) + assert call_kwargs == { + "tenant_id": next_task["tenant_id"], + "dataset_id": next_task["dataset_id"], + "document_ids": next_task["document_ids"], + } set_waiting_spy.assert_called_once() delete_key_spy.assert_not_called() @@ -352,7 +355,7 @@ class TestDatasetIndexingTaskIntegration: _document_indexing_with_tenant_queue(dataset.tenant_id, dataset.id, document_ids, task_dispatch_spy) # Assert - task_dispatch_spy.delay.assert_not_called() + task_dispatch_spy.apply_async.assert_not_called() delete_key_spy.assert_called_once() def test_validation_failure_sets_error_status_when_vector_space_at_limit( @@ -447,7 +450,7 @@ class TestDatasetIndexingTaskIntegration: _document_indexing_with_tenant_queue(dataset.tenant_id, dataset.id, document_ids, task_dispatch_spy) # Assert - task_dispatch_spy.delay.assert_called_once() + task_dispatch_spy.apply_async.assert_called_once() def test_sessions_close_on_successful_indexing( self, @@ -534,7 +537,7 @@ class TestDatasetIndexingTaskIntegration: _document_indexing_with_tenant_queue(dataset.tenant_id, dataset.id, document_ids, task_dispatch_spy) # Assert - assert task_dispatch_spy.delay.call_count == concurrency_limit + assert task_dispatch_spy.apply_async.call_count == concurrency_limit assert set_waiting_spy.call_count == concurrency_limit def test_task_queue_fifo_ordering(self, db_session_with_containers, patched_external_dependencies): @@ -565,9 +568,10 @@ class TestDatasetIndexingTaskIntegration: _document_indexing_with_tenant_queue(dataset.tenant_id, dataset.id, document_ids, task_dispatch_spy) # Assert - assert task_dispatch_spy.delay.call_count == 3 + assert task_dispatch_spy.apply_async.call_count == 3 for index, expected_task in enumerate(ordered_tasks): - assert task_dispatch_spy.delay.call_args_list[index].kwargs["document_ids"] == expected_task["document_ids"] + call_kwargs = task_dispatch_spy.apply_async.call_args_list[index].kwargs.get("kwargs", {}) + assert call_kwargs.get("document_ids") == expected_task["document_ids"] def test_billing_disabled_skips_limit_checks(self, db_session_with_containers, patched_external_dependencies): """Skip limit checks when billing feature is disabled.""" diff --git a/api/tests/test_containers_integration_tests/tasks/test_document_indexing_task.py b/api/tests/test_containers_integration_tests/tasks/test_document_indexing_task.py index 4be1180c73..5dc1f6bee0 100644 --- a/api/tests/test_containers_integration_tests/tasks/test_document_indexing_task.py +++ b/api/tests/test_containers_integration_tests/tasks/test_document_indexing_task.py @@ -762,11 +762,12 @@ class TestDocumentIndexingTasks: mock_external_service_dependencies["indexing_runner_instance"].run.assert_called_once() # Verify task function was called for each waiting task - assert mock_task_func.delay.call_count == 1 + assert mock_task_func.apply_async.call_count == 1 # Verify correct parameters for each call - calls = mock_task_func.delay.call_args_list - assert calls[0][1] == {"tenant_id": tenant_id, "dataset_id": dataset_id, "document_ids": ["waiting-doc-1"]} + calls = mock_task_func.apply_async.call_args_list + sent_kwargs = calls[0][1]["kwargs"] + assert sent_kwargs == {"tenant_id": tenant_id, "dataset_id": dataset_id, "document_ids": ["waiting-doc-1"]} # Verify queue is empty after processing (tasks were pulled) remaining_tasks = queue.pull_tasks(count=10) # Pull more than we added @@ -830,11 +831,15 @@ class TestDocumentIndexingTasks: assert updated_document.processing_started_at is not None # Verify waiting task was still processed despite core processing error - mock_task_func.delay.assert_called_once() + mock_task_func.apply_async.assert_called_once() # Verify correct parameters for the call - call = mock_task_func.delay.call_args - assert call[1] == {"tenant_id": tenant_id, "dataset_id": dataset_id, "document_ids": ["waiting-doc-1"]} + call = mock_task_func.apply_async.call_args + assert call[1]["kwargs"] == { + "tenant_id": tenant_id, + "dataset_id": dataset_id, + "document_ids": ["waiting-doc-1"], + } # Verify queue is empty after processing (task was pulled) remaining_tasks = queue.pull_tasks(count=10) @@ -896,9 +901,13 @@ class TestDocumentIndexingTasks: mock_external_service_dependencies["indexing_runner_instance"].run.assert_called_once() # Verify only tenant1's waiting task was processed - mock_task_func.delay.assert_called_once() - call = mock_task_func.delay.call_args - assert call[1] == {"tenant_id": tenant1_id, "dataset_id": dataset1_id, "document_ids": ["tenant1-doc-1"]} + mock_task_func.apply_async.assert_called_once() + call = mock_task_func.apply_async.call_args + assert call[1]["kwargs"] == { + "tenant_id": tenant1_id, + "dataset_id": dataset1_id, + "document_ids": ["tenant1-doc-1"], + } # Verify tenant1's queue is empty remaining_tasks1 = queue1.pull_tasks(count=10) diff --git a/api/tests/test_containers_integration_tests/tasks/test_rag_pipeline_run_tasks.py b/api/tests/test_containers_integration_tests/tasks/test_rag_pipeline_run_tasks.py index ef7191299a..f01fcc1742 100644 --- a/api/tests/test_containers_integration_tests/tasks/test_rag_pipeline_run_tasks.py +++ b/api/tests/test_containers_integration_tests/tasks/test_rag_pipeline_run_tasks.py @@ -1,6 +1,6 @@ import json import uuid -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest from faker import Faker @@ -388,8 +388,10 @@ class TestRagPipelineRunTasks: # Set the task key to indicate there are waiting tasks (legacy behavior) redis_client.set(legacy_task_key, 1, ex=60 * 60) - # Mock the task function calls - with patch("tasks.rag_pipeline.rag_pipeline_run_task.rag_pipeline_run_task.delay") as mock_delay: + # Mock the Celery group scheduling used by the implementation + with patch("tasks.rag_pipeline.rag_pipeline_run_task.group") as mock_group: + mock_group.return_value.apply_async = MagicMock() + # Act: Execute the priority task with new code but legacy queue data rag_pipeline_run_task(file_id, tenant.id) @@ -398,13 +400,14 @@ class TestRagPipelineRunTasks: mock_file_service["delete_file"].assert_called_once_with(file_id) assert mock_pipeline_generator.call_count == 1 - # Verify waiting tasks were processed, pull 1 task a time by default - assert mock_delay.call_count == 1 + # Verify waiting tasks were processed via group, pull 1 task a time by default + assert mock_group.return_value.apply_async.called - # Verify correct parameters for the call - call_kwargs = mock_delay.call_args[1] if mock_delay.call_args else {} - assert call_kwargs.get("rag_pipeline_invoke_entities_file_id") == legacy_file_ids[0] - assert call_kwargs.get("tenant_id") == tenant.id + # Verify correct parameters for the first scheduled job signature + jobs = mock_group.call_args.args[0] if mock_group.call_args else [] + first_kwargs = jobs[0].kwargs if jobs else {} + assert first_kwargs.get("rag_pipeline_invoke_entities_file_id") == legacy_file_ids[0] + assert first_kwargs.get("tenant_id") == tenant.id # Verify that new code can process legacy queue entries # The new TenantIsolatedTaskQueue should be able to read from the legacy format @@ -446,8 +449,10 @@ class TestRagPipelineRunTasks: waiting_file_ids = [str(uuid.uuid4()) for _ in range(3)] queue.push_tasks(waiting_file_ids) - # Mock the task function calls - with patch("tasks.rag_pipeline.rag_pipeline_run_task.rag_pipeline_run_task.delay") as mock_delay: + # Mock the Celery group scheduling used by the implementation + with patch("tasks.rag_pipeline.rag_pipeline_run_task.group") as mock_group: + mock_group.return_value.apply_async = MagicMock() + # Act: Execute the regular task rag_pipeline_run_task(file_id, tenant.id) @@ -456,13 +461,14 @@ class TestRagPipelineRunTasks: mock_file_service["delete_file"].assert_called_once_with(file_id) assert mock_pipeline_generator.call_count == 1 - # Verify waiting tasks were processed, pull 1 task a time by default - assert mock_delay.call_count == 1 + # Verify waiting tasks were processed via group.apply_async + assert mock_group.return_value.apply_async.called - # Verify correct parameters for the call - call_kwargs = mock_delay.call_args[1] if mock_delay.call_args else {} - assert call_kwargs.get("rag_pipeline_invoke_entities_file_id") == waiting_file_ids[0] - assert call_kwargs.get("tenant_id") == tenant.id + # Verify correct parameters for the first scheduled job signature + jobs = mock_group.call_args.args[0] if mock_group.call_args else [] + first_kwargs = jobs[0].kwargs if jobs else {} + assert first_kwargs.get("rag_pipeline_invoke_entities_file_id") == waiting_file_ids[0] + assert first_kwargs.get("tenant_id") == tenant.id # Verify queue still has remaining tasks (only 1 was pulled) remaining_tasks = queue.pull_tasks(count=10) @@ -557,8 +563,10 @@ class TestRagPipelineRunTasks: waiting_file_id = str(uuid.uuid4()) queue.push_tasks([waiting_file_id]) - # Mock the task function calls - with patch("tasks.rag_pipeline.rag_pipeline_run_task.rag_pipeline_run_task.delay") as mock_delay: + # Mock the Celery group scheduling used by the implementation + with patch("tasks.rag_pipeline.rag_pipeline_run_task.group") as mock_group: + mock_group.return_value.apply_async = MagicMock() + # Act: Execute the regular task (should not raise exception) rag_pipeline_run_task(file_id, tenant.id) @@ -569,12 +577,13 @@ class TestRagPipelineRunTasks: assert mock_pipeline_generator.call_count == 1 # Verify waiting task was still processed despite core processing error - mock_delay.assert_called_once() + assert mock_group.return_value.apply_async.called - # Verify correct parameters for the call - call_kwargs = mock_delay.call_args[1] if mock_delay.call_args else {} - assert call_kwargs.get("rag_pipeline_invoke_entities_file_id") == waiting_file_id - assert call_kwargs.get("tenant_id") == tenant.id + # Verify correct parameters for the first scheduled job signature + jobs = mock_group.call_args.args[0] if mock_group.call_args else [] + first_kwargs = jobs[0].kwargs if jobs else {} + assert first_kwargs.get("rag_pipeline_invoke_entities_file_id") == waiting_file_id + assert first_kwargs.get("tenant_id") == tenant.id # Verify queue is empty after processing (task was pulled) remaining_tasks = queue.pull_tasks(count=10) @@ -684,8 +693,10 @@ class TestRagPipelineRunTasks: queue1.push_tasks([waiting_file_id1]) queue2.push_tasks([waiting_file_id2]) - # Mock the task function calls - with patch("tasks.rag_pipeline.rag_pipeline_run_task.rag_pipeline_run_task.delay") as mock_delay: + # Mock the Celery group scheduling used by the implementation + with patch("tasks.rag_pipeline.rag_pipeline_run_task.group") as mock_group: + mock_group.return_value.apply_async = MagicMock() + # Act: Execute the regular task for tenant1 only rag_pipeline_run_task(file_id1, tenant1.id) @@ -694,11 +705,12 @@ class TestRagPipelineRunTasks: assert mock_file_service["delete_file"].call_count == 1 assert mock_pipeline_generator.call_count == 1 - # Verify only tenant1's waiting task was processed - mock_delay.assert_called_once() - call_kwargs = mock_delay.call_args[1] if mock_delay.call_args else {} - assert call_kwargs.get("rag_pipeline_invoke_entities_file_id") == waiting_file_id1 - assert call_kwargs.get("tenant_id") == tenant1.id + # Verify only tenant1's waiting task was processed (via group) + assert mock_group.return_value.apply_async.called + jobs = mock_group.call_args.args[0] if mock_group.call_args else [] + first_kwargs = jobs[0].kwargs if jobs else {} + assert first_kwargs.get("rag_pipeline_invoke_entities_file_id") == waiting_file_id1 + assert first_kwargs.get("tenant_id") == tenant1.id # Verify tenant1's queue is empty remaining_tasks1 = queue1.pull_tasks(count=10) @@ -913,8 +925,10 @@ class TestRagPipelineRunTasks: waiting_file_id = str(uuid.uuid4()) queue.push_tasks([waiting_file_id]) - # Mock the task function calls - with patch("tasks.rag_pipeline.rag_pipeline_run_task.rag_pipeline_run_task.delay") as mock_delay: + # Mock the Celery group scheduling used by the implementation + with patch("tasks.rag_pipeline.rag_pipeline_run_task.group") as mock_group: + mock_group.return_value.apply_async = MagicMock() + # Act & Assert: Execute the regular task (should raise Exception) with pytest.raises(Exception, match="File not found"): rag_pipeline_run_task(file_id, tenant.id) @@ -924,12 +938,13 @@ class TestRagPipelineRunTasks: mock_pipeline_generator.assert_not_called() # Verify waiting task was still processed despite file error - mock_delay.assert_called_once() + assert mock_group.return_value.apply_async.called - # Verify correct parameters for the call - call_kwargs = mock_delay.call_args[1] if mock_delay.call_args else {} - assert call_kwargs.get("rag_pipeline_invoke_entities_file_id") == waiting_file_id - assert call_kwargs.get("tenant_id") == tenant.id + # Verify correct parameters for the first scheduled job signature + jobs = mock_group.call_args.args[0] if mock_group.call_args else [] + first_kwargs = jobs[0].kwargs if jobs else {} + assert first_kwargs.get("rag_pipeline_invoke_entities_file_id") == waiting_file_id + assert first_kwargs.get("tenant_id") == tenant.id # Verify queue is empty after processing (task was pulled) remaining_tasks = queue.pull_tasks(count=10) diff --git a/api/tests/test_containers_integration_tests/trigger/conftest.py b/api/tests/test_containers_integration_tests/trigger/conftest.py index 9c1fd5e0ec..e3832fb2ef 100644 --- a/api/tests/test_containers_integration_tests/trigger/conftest.py +++ b/api/tests/test_containers_integration_tests/trigger/conftest.py @@ -105,18 +105,26 @@ def app_model( class MockCeleryGroup: - """Mock for celery group() function that collects dispatched tasks.""" + """Mock for celery group() function that collects dispatched tasks. + + Matches the Celery group API loosely, accepting arbitrary kwargs on apply_async + (e.g. producer) so production code can pass broker-related options without + breaking tests. + """ def __init__(self) -> None: self.collected: list[dict[str, Any]] = [] self._applied = False + self.last_apply_async_kwargs: dict[str, Any] | None = None def __call__(self, items: Any) -> MockCeleryGroup: self.collected = list(items) return self - def apply_async(self) -> None: + def apply_async(self, **kwargs: Any) -> None: + # Accept arbitrary kwargs like producer to be compatible with Celery self._applied = True + self.last_apply_async_kwargs = kwargs @property def applied(self) -> bool: diff --git a/api/tests/unit_tests/tasks/test_dataset_indexing_task.py b/api/tests/unit_tests/tasks/test_dataset_indexing_task.py index 11b4663187..67e0a8efaf 100644 --- a/api/tests/unit_tests/tasks/test_dataset_indexing_task.py +++ b/api/tests/unit_tests/tasks/test_dataset_indexing_task.py @@ -10,14 +10,23 @@ This module tests the document indexing task functionality including: """ import uuid -from unittest.mock import Mock, patch +from unittest.mock import MagicMock, Mock, patch import pytest +from core.indexing_runner import DocumentIsPausedError from core.rag.pipeline.queue import TenantIsolatedTaskQueue from enums.cloud_plan import CloudPlan from extensions.ext_redis import redis_client +from models.dataset import Dataset, Document from services.document_indexing_proxy.document_indexing_task_proxy import DocumentIndexingTaskProxy +from tasks.document_indexing_task import ( + _document_indexing, + _document_indexing_with_tenant_queue, + document_indexing_task, + normal_document_indexing_task, + priority_document_indexing_task, +) # ============================================================================ # Fixtures @@ -56,6 +65,190 @@ def mock_redis(): return redis_client +# Additional fixtures required by tests in this module + + +@pytest.fixture +def mock_db_session(): + """Mock session_factory.create_session() to return a session whose queries use shared test data. + + Tests set session._shared_data = {"dataset": , "documents": [, ...]} + This fixture makes session.query(Dataset).first() return the shared dataset, + and session.query(Document).all()/first() return from the shared documents. + """ + with patch("tasks.document_indexing_task.session_factory") as mock_sf: + session = MagicMock() + session._shared_data = {"dataset": None, "documents": []} + + # Keep a pointer so repeated Document.first() calls iterate across provided docs + session._doc_first_idx = 0 + + def _query_side_effect(model): + q = MagicMock() + + # Capture filters passed via where(...) so first()/all() can honor them. + q._filters = {} + + def _extract_filters(*conds, **kw): + # Support both SQLAlchemy expressions (BinaryExpression) and kwargs + # We only need the simple fields used by production code: id, dataset_id, and id.in_(...) + for cond in conds: + left = getattr(cond, "left", None) + right = getattr(cond, "right", None) + key = None + if left is not None: + key = getattr(left, "key", None) or getattr(left, "name", None) + if not key: + continue + # Right side might be a BindParameter with .value, or a raw value/sequence + val = getattr(right, "value", right) + q._filters[key] = val + # Also accept kwargs (e.g., where(id=...)) just in case + for k, v in kw.items(): + q._filters[k] = v + + def _where_side_effect(*conds, **kw): + _extract_filters(*conds, **kw) + return q + + q.where.side_effect = _where_side_effect + + # Dataset queries + if model.__name__ == "Dataset": + + def _dataset_first(): + ds = session._shared_data.get("dataset") + if not ds: + return None + if "id" in q._filters: + val = q._filters["id"] + if isinstance(val, (list, tuple, set)): + return ds if ds.id in val else None + return ds if ds.id == val else None + return ds + + def _dataset_all(): + ds = session._shared_data.get("dataset") + if not ds: + return [] + first = _dataset_first() + return [first] if first else [] + + q.first.side_effect = _dataset_first + q.all.side_effect = _dataset_all + return q + + # Document queries + if model.__name__ == "Document": + + def _apply_doc_filters(docs): + result = list(docs) + for key in ("id", "dataset_id"): + if key in q._filters: + val = q._filters[key] + if isinstance(val, (list, tuple, set)): + result = [d for d in result if getattr(d, key, None) in val] + else: + result = [d for d in result if getattr(d, key, None) == val] + return result + + def _docs_all(): + docs = session._shared_data.get("documents", []) + return _apply_doc_filters(docs) + + def _docs_first(): + docs = _docs_all() + return docs[0] if docs else None + + q.all.side_effect = _docs_all + q.first.side_effect = _docs_first + return q + + # Default fallback + q.first.return_value = None + q.all.return_value = [] + return q + + session.query.side_effect = _query_side_effect + + # Implement session.begin() context manager that commits on exit + session.commit = MagicMock() + bm = MagicMock() + bm.__enter__.return_value = session + + def _bm_exit_side_effect(*args, **kwargs): + session.commit() + + bm.__exit__.side_effect = _bm_exit_side_effect + session.begin.return_value = bm + + # Context manager behavior for create_session(): ensure close() is called on exit + session.close = MagicMock() + cm = MagicMock() + cm.__enter__.return_value = session + + def _exit_side_effect(*args, **kwargs): + session.close() + + cm.__exit__.side_effect = _exit_side_effect + mock_sf.create_session.return_value = cm + + yield session + + +@pytest.fixture +def mock_dataset(dataset_id, tenant_id): + """Create a mock Dataset object.""" + dataset = Mock(spec=Dataset) + dataset.id = dataset_id + dataset.tenant_id = tenant_id + dataset.indexing_technique = "high_quality" + dataset.embedding_model_provider = "openai" + dataset.embedding_model = "text-embedding-ada-002" + return dataset + + +@pytest.fixture +def mock_documents(document_ids, dataset_id): + """Create mock Document objects.""" + documents = [] + for doc_id in document_ids: + doc = Mock(spec=Document) + doc.id = doc_id + doc.dataset_id = dataset_id + doc.indexing_status = "waiting" + doc.error = None + doc.stopped_at = None + doc.processing_started_at = None + # optional attribute used in some code paths + doc.doc_form = "text_model" + documents.append(doc) + return documents + + +@pytest.fixture +def mock_indexing_runner(): + """Mock IndexingRunner for document_indexing_task module.""" + with patch("tasks.document_indexing_task.IndexingRunner") as mock_runner_class: + mock_runner = MagicMock() + mock_runner_class.return_value = mock_runner + yield mock_runner + + +@pytest.fixture +def mock_feature_service(): + """Mock FeatureService for document_indexing_task module.""" + with patch("tasks.document_indexing_task.FeatureService") as mock_service: + mock_features = Mock() + mock_features.billing = Mock() + mock_features.billing.enabled = False + mock_features.vector_space = Mock() + mock_features.vector_space.size = 0 + mock_features.vector_space.limit = 1000 + mock_service.get_features.return_value = mock_features + yield mock_service + + # ============================================================================ # Test Task Enqueuing # ============================================================================ @@ -166,6 +359,492 @@ class TestTaskEnqueuing: assert mock_redis.lpush.called mock_task.delay.assert_not_called() + def test_legacy_document_indexing_task_still_works( + self, dataset_id, document_ids, mock_db_session, mock_dataset, mock_documents, mock_indexing_runner + ): + """ + Test that the legacy document_indexing_task function still works. + + This ensures backward compatibility for existing code that may still + use the deprecated function. + """ + # Arrange + # Set shared mock data so all sessions can access it + mock_db_session._shared_data["dataset"] = mock_dataset + mock_db_session._shared_data["documents"] = mock_documents + + with patch("tasks.document_indexing_task.FeatureService.get_features") as mock_features: + mock_features.return_value.billing.enabled = False + + # Act + document_indexing_task(dataset_id, document_ids) + + # Assert + mock_indexing_runner.run.assert_called_once() + + +# ============================================================================ +# Test Batch Processing +# ============================================================================ + + +class TestBatchProcessing: + """Test cases for batch processing of multiple documents.""" + + def test_batch_processing_multiple_documents( + self, dataset_id, document_ids, mock_db_session, mock_dataset, mock_indexing_runner + ): + """ + Test batch processing of multiple documents. + + All documents in the batch should be processed together and their + status should be updated to 'parsing'. + """ + # Arrange - Create actual document objects that can be modified + mock_documents = [] + for doc_id in document_ids: + doc = MagicMock(spec=Document) + doc.id = doc_id + doc.dataset_id = dataset_id + doc.indexing_status = "waiting" + doc.error = None + doc.stopped_at = None + doc.processing_started_at = None + mock_documents.append(doc) + + # Set shared mock data so all sessions can access it + mock_db_session._shared_data["dataset"] = mock_dataset + mock_db_session._shared_data["documents"] = mock_documents + + with patch("tasks.document_indexing_task.FeatureService.get_features") as mock_features: + mock_features.return_value.billing.enabled = False + + # Act + _document_indexing(dataset_id, document_ids) + + # Assert - All documents should be set to 'parsing' status + for doc in mock_documents: + assert doc.indexing_status == "parsing" + assert doc.processing_started_at is not None + + # IndexingRunner should be called with all documents + mock_indexing_runner.run.assert_called_once() + call_args = mock_indexing_runner.run.call_args[0][0] + assert len(call_args) == len(document_ids) + + def test_batch_processing_with_limit_check(self, dataset_id, mock_db_session, mock_dataset, mock_feature_service): + """ + Test batch processing respects upload limits. + + When the number of documents exceeds the batch upload limit, + an error should be raised and all documents should be marked as error. + """ + # Arrange + batch_limit = 10 + document_ids = [str(uuid.uuid4()) for _ in range(batch_limit + 1)] + + mock_documents = [] + for doc_id in document_ids: + doc = MagicMock(spec=Document) + doc.id = doc_id + doc.dataset_id = dataset_id + doc.indexing_status = "waiting" + doc.error = None + doc.stopped_at = None + mock_documents.append(doc) + + # Set shared mock data so all sessions can access it + mock_db_session._shared_data["dataset"] = mock_dataset + mock_db_session._shared_data["documents"] = mock_documents + + mock_feature_service.get_features.return_value.billing.enabled = True + mock_feature_service.get_features.return_value.billing.subscription.plan = CloudPlan.PROFESSIONAL + mock_feature_service.get_features.return_value.vector_space.limit = 1000 + mock_feature_service.get_features.return_value.vector_space.size = 0 + + with patch("tasks.document_indexing_task.dify_config.BATCH_UPLOAD_LIMIT", str(batch_limit)): + # Act + _document_indexing(dataset_id, document_ids) + + # Assert - All documents should have error status + for doc in mock_documents: + assert doc.indexing_status == "error" + assert doc.error is not None + assert "batch upload limit" in doc.error + + def test_batch_processing_sandbox_plan_single_document_only( + self, dataset_id, mock_db_session, mock_dataset, mock_feature_service + ): + """ + Test that sandbox plan only allows single document upload. + + Sandbox plan should reject batch uploads (more than 1 document). + """ + # Arrange + document_ids = [str(uuid.uuid4()) for _ in range(2)] + + mock_documents = [] + for doc_id in document_ids: + doc = MagicMock(spec=Document) + doc.id = doc_id + doc.dataset_id = dataset_id + doc.indexing_status = "waiting" + doc.error = None + doc.stopped_at = None + mock_documents.append(doc) + + # Set shared mock data so all sessions can access it + mock_db_session._shared_data["dataset"] = mock_dataset + mock_db_session._shared_data["documents"] = mock_documents + + mock_feature_service.get_features.return_value.billing.enabled = True + mock_feature_service.get_features.return_value.billing.subscription.plan = CloudPlan.SANDBOX + mock_feature_service.get_features.return_value.vector_space.limit = 1000 + mock_feature_service.get_features.return_value.vector_space.size = 0 + + # Act + _document_indexing(dataset_id, document_ids) + + # Assert - All documents should have error status + for doc in mock_documents: + assert doc.indexing_status == "error" + assert "does not support batch upload" in doc.error + + def test_batch_processing_empty_document_list( + self, dataset_id, mock_db_session, mock_dataset, mock_indexing_runner + ): + """ + Test batch processing with empty document list. + + Should handle empty list gracefully without errors. + """ + # Arrange + document_ids = [] + + # Set shared mock data with empty documents list + mock_db_session._shared_data["dataset"] = mock_dataset + mock_db_session._shared_data["documents"] = [] + + with patch("tasks.document_indexing_task.FeatureService.get_features") as mock_features: + mock_features.return_value.billing.enabled = False + + # Act + _document_indexing(dataset_id, document_ids) + + # Assert - IndexingRunner should still be called with empty list + mock_indexing_runner.run.assert_called_once_with([]) + + +# ============================================================================ +# Test Progress Tracking +# ============================================================================ + + +class TestProgressTracking: + """Test cases for progress tracking through task lifecycle.""" + + def test_document_status_progression( + self, dataset_id, document_ids, mock_db_session, mock_dataset, mock_indexing_runner + ): + """ + Test document status progresses correctly through lifecycle. + + Documents should transition from 'waiting' -> 'parsing' -> processed. + """ + # Arrange - Create actual document objects + mock_documents = [] + for doc_id in document_ids: + doc = MagicMock(spec=Document) + doc.id = doc_id + doc.dataset_id = dataset_id + doc.indexing_status = "waiting" + doc.processing_started_at = None + mock_documents.append(doc) + + # Set shared mock data so all sessions can access it + mock_db_session._shared_data["dataset"] = mock_dataset + mock_db_session._shared_data["documents"] = mock_documents + + with patch("tasks.document_indexing_task.FeatureService.get_features") as mock_features: + mock_features.return_value.billing.enabled = False + + # Act + _document_indexing(dataset_id, document_ids) + + # Assert - Status should be 'parsing' + for doc in mock_documents: + assert doc.indexing_status == "parsing" + assert doc.processing_started_at is not None + + # Verify commit was called to persist status + assert mock_db_session.commit.called + + def test_processing_started_timestamp_set( + self, dataset_id, document_ids, mock_db_session, mock_dataset, mock_indexing_runner + ): + """ + Test that processing_started_at timestamp is set correctly. + + When documents start processing, the timestamp should be recorded. + """ + # Arrange - Create actual document objects + mock_documents = [] + for doc_id in document_ids: + doc = MagicMock(spec=Document) + doc.id = doc_id + doc.dataset_id = dataset_id + doc.indexing_status = "waiting" + doc.processing_started_at = None + mock_documents.append(doc) + + # Set shared mock data so all sessions can access it + mock_db_session._shared_data["dataset"] = mock_dataset + mock_db_session._shared_data["documents"] = mock_documents + + with patch("tasks.document_indexing_task.FeatureService.get_features") as mock_features: + mock_features.return_value.billing.enabled = False + + # Act + _document_indexing(dataset_id, document_ids) + + # Assert + for doc in mock_documents: + assert doc.processing_started_at is not None + + def test_tenant_queue_processes_next_task_after_completion( + self, tenant_id, dataset_id, document_ids, mock_redis, mock_db_session, mock_dataset, mock_indexing_runner + ): + """ + Test that tenant queue processes next waiting task after completion. + + After a task completes, the system should check for waiting tasks + and process the next one. + """ + # Arrange + next_task_data = {"tenant_id": tenant_id, "dataset_id": dataset_id, "document_ids": ["next_doc_id"]} + + # Simulate next task in queue + from core.rag.pipeline.queue import TaskWrapper + + wrapper = TaskWrapper(data=next_task_data) + mock_redis.rpop.return_value = wrapper.serialize() + + mock_db_session.query.return_value.where.return_value.first.return_value = mock_dataset + + with patch("tasks.document_indexing_task.FeatureService.get_features") as mock_features: + mock_features.return_value.billing.enabled = False + + with patch("tasks.document_indexing_task.normal_document_indexing_task") as mock_task: + # Act + _document_indexing_with_tenant_queue(tenant_id, dataset_id, document_ids, mock_task) + + # Assert - Next task should be enqueued + mock_task.apply_async.assert_called() + # Task key should be set for next task + assert mock_redis.setex.called + + def test_tenant_queue_clears_flag_when_no_more_tasks( + self, tenant_id, dataset_id, document_ids, mock_redis, mock_db_session, mock_dataset, mock_indexing_runner + ): + """ + Test that tenant queue clears flag when no more tasks are waiting. + + When there are no more tasks in the queue, the task key should be deleted. + """ + # Arrange + mock_redis.rpop.return_value = None # No more tasks + mock_db_session.query.return_value.where.return_value.first.return_value = mock_dataset + + with patch("tasks.document_indexing_task.FeatureService.get_features") as mock_features: + mock_features.return_value.billing.enabled = False + + with patch("tasks.document_indexing_task.normal_document_indexing_task") as mock_task: + # Act + _document_indexing_with_tenant_queue(tenant_id, dataset_id, document_ids, mock_task) + + # Assert - Task key should be deleted + assert mock_redis.delete.called + + +# ============================================================================ +# Test Error Handling and Retries +# ============================================================================ + + +class TestErrorHandling: + """Test cases for error handling and retry mechanisms.""" + + def test_error_handling_sets_document_error_status( + self, dataset_id, document_ids, mock_db_session, mock_dataset, mock_feature_service + ): + """ + Test that errors during validation set document error status. + + When validation fails (e.g., limit exceeded), documents should be + marked with error status and error message. + """ + # Arrange - Create actual document objects + mock_documents = [] + for doc_id in document_ids: + doc = MagicMock(spec=Document) + doc.id = doc_id + doc.dataset_id = dataset_id + doc.indexing_status = "waiting" + doc.error = None + doc.stopped_at = None + mock_documents.append(doc) + + # Set shared mock data so all sessions can access it + mock_db_session._shared_data["dataset"] = mock_dataset + mock_db_session._shared_data["documents"] = mock_documents + + # Set up to trigger vector space limit error + mock_feature_service.get_features.return_value.billing.enabled = True + mock_feature_service.get_features.return_value.billing.subscription.plan = CloudPlan.PROFESSIONAL + mock_feature_service.get_features.return_value.vector_space.limit = 100 + mock_feature_service.get_features.return_value.vector_space.size = 100 # At limit + + # Act + _document_indexing(dataset_id, document_ids) + + # Assert + for doc in mock_documents: + assert doc.indexing_status == "error" + assert doc.error is not None + assert "over the limit" in doc.error + assert doc.stopped_at is not None + + def test_error_handling_during_indexing_runner( + self, dataset_id, document_ids, mock_db_session, mock_dataset, mock_documents, mock_indexing_runner + ): + """ + Test error handling when IndexingRunner raises an exception. + + Errors during indexing should be caught and logged, but not crash the task. + """ + # Arrange + # Set shared mock data so all sessions can access it + mock_db_session._shared_data["dataset"] = mock_dataset + mock_db_session._shared_data["documents"] = mock_documents + + # Make IndexingRunner raise an exception + mock_indexing_runner.run.side_effect = Exception("Indexing failed") + + with patch("tasks.document_indexing_task.FeatureService.get_features") as mock_features: + mock_features.return_value.billing.enabled = False + + # Act - Should not raise exception + _document_indexing(dataset_id, document_ids) + + # Assert - Session should be closed even after error + assert mock_db_session.close.called + + def test_document_paused_error_handling( + self, dataset_id, document_ids, mock_db_session, mock_dataset, mock_documents, mock_indexing_runner + ): + """ + Test handling of DocumentIsPausedError. + + When a document is paused, the error should be caught and logged + but not treated as a failure. + """ + # Arrange + # Set shared mock data so all sessions can access it + mock_db_session._shared_data["dataset"] = mock_dataset + mock_db_session._shared_data["documents"] = mock_documents + + # Make IndexingRunner raise DocumentIsPausedError + mock_indexing_runner.run.side_effect = DocumentIsPausedError("Document is paused") + + with patch("tasks.document_indexing_task.FeatureService.get_features") as mock_features: + mock_features.return_value.billing.enabled = False + + # Act - Should not raise exception + _document_indexing(dataset_id, document_ids) + + # Assert - Session should be closed + assert mock_db_session.close.called + + def test_dataset_not_found_error_handling(self, dataset_id, document_ids, mock_db_session): + """ + Test handling when dataset is not found. + + If the dataset doesn't exist, the task should exit gracefully. + """ + # Arrange + mock_db_session.query.return_value.where.return_value.first.return_value = None + + # Act + _document_indexing(dataset_id, document_ids) + + # Assert - Session should be closed + assert mock_db_session.close.called + + def test_tenant_queue_error_handling_still_processes_next_task( + self, tenant_id, dataset_id, document_ids, mock_redis, mock_db_session, mock_dataset, mock_indexing_runner + ): + """ + Test that errors don't prevent processing next task in tenant queue. + + Even if the current task fails, the next task should still be processed. + """ + # Arrange + next_task_data = {"tenant_id": tenant_id, "dataset_id": dataset_id, "document_ids": ["next_doc_id"]} + + from core.rag.pipeline.queue import TaskWrapper + + wrapper = TaskWrapper(data=next_task_data) + # Set up rpop to return task once for concurrency check + mock_redis.rpop.side_effect = [wrapper.serialize(), None] + + mock_db_session.query.return_value.where.return_value.first.return_value = mock_dataset + + # Make _document_indexing raise an error + with patch("tasks.document_indexing_task._document_indexing") as mock_indexing: + mock_indexing.side_effect = Exception("Processing failed") + + # Patch logger to avoid format string issue in actual code + with patch("tasks.document_indexing_task.logger"): + with patch("tasks.document_indexing_task.normal_document_indexing_task") as mock_task: + # Act + _document_indexing_with_tenant_queue(tenant_id, dataset_id, document_ids, mock_task) + + # Assert - Next task should still be enqueued despite error + mock_task.apply_async.assert_called() + + def test_concurrent_task_limit_respected( + self, tenant_id, dataset_id, document_ids, mock_redis, mock_db_session, mock_dataset + ): + """ + Test that tenant isolated task concurrency limit is respected. + + Should pull only TENANT_ISOLATED_TASK_CONCURRENCY tasks at a time. + """ + # Arrange + concurrency_limit = 2 + + # Create multiple tasks in queue + tasks = [] + for i in range(5): + task_data = {"tenant_id": tenant_id, "dataset_id": dataset_id, "document_ids": [f"doc_{i}"]} + from core.rag.pipeline.queue import TaskWrapper + + wrapper = TaskWrapper(data=task_data) + tasks.append(wrapper.serialize()) + + # Mock rpop to return tasks one by one + mock_redis.rpop.side_effect = tasks[:concurrency_limit] + [None] + + mock_db_session.query.return_value.where.return_value.first.return_value = mock_dataset + + with patch("tasks.document_indexing_task.dify_config.TENANT_ISOLATED_TASK_CONCURRENCY", concurrency_limit): + with patch("tasks.document_indexing_task.normal_document_indexing_task") as mock_task: + # Act + _document_indexing_with_tenant_queue(tenant_id, dataset_id, document_ids, mock_task) + + # Assert - Should enqueue exactly concurrency_limit tasks + assert mock_task.apply_async.call_count == concurrency_limit + # ============================================================================ # Test Task Cancellation @@ -198,6 +877,407 @@ class TestTaskCancellation: assert tenant_2 in queue_2._queue +# ============================================================================ +# Integration Tests +# ============================================================================ + + +class TestAdvancedScenarios: + """Advanced test scenarios for edge cases and complex workflows.""" + + def test_multiple_documents_with_mixed_success_and_failure( + self, dataset_id, mock_db_session, mock_dataset, mock_indexing_runner + ): + """ + Test handling of mixed success and failure scenarios in batch processing. + + When processing multiple documents, some may succeed while others fail. + This tests that the system handles partial failures gracefully. + + Scenario: + - Process 3 documents in a batch + - First document succeeds + - Second document is not found (skipped) + - Third document succeeds + + Expected behavior: + - Only found documents are processed + - Missing documents are skipped without crashing + - IndexingRunner receives only valid documents + """ + # Arrange - Create document IDs with one missing + document_ids = [str(uuid.uuid4()) for _ in range(3)] + + # Create only 2 documents (simulate one missing) + # The new code uses .all() which will only return existing documents + mock_documents = [] + for i, doc_id in enumerate([document_ids[0], document_ids[2]]): # Skip middle one + doc = MagicMock(spec=Document) + doc.id = doc_id + doc.dataset_id = dataset_id + doc.indexing_status = "waiting" + doc.processing_started_at = None + mock_documents.append(doc) + + # Set shared mock data - .all() will only return existing documents + mock_db_session._shared_data["dataset"] = mock_dataset + mock_db_session._shared_data["documents"] = mock_documents + + with patch("tasks.document_indexing_task.FeatureService.get_features") as mock_features: + mock_features.return_value.billing.enabled = False + + # Act + _document_indexing(dataset_id, document_ids) + + # Assert - Only 2 documents should be processed (missing one skipped) + mock_indexing_runner.run.assert_called_once() + call_args = mock_indexing_runner.run.call_args[0][0] + assert len(call_args) == 2 # Only found documents + + def test_tenant_queue_with_multiple_concurrent_tasks( + self, tenant_id, dataset_id, mock_redis, mock_db_session, mock_dataset + ): + """ + Test concurrent task processing with tenant isolation. + + This tests the scenario where multiple tasks are queued for the same tenant + and need to be processed respecting the concurrency limit. + + Scenario: + - 5 tasks are waiting in the queue + - Concurrency limit is 2 + - After current task completes, pull and enqueue next 2 tasks + + Expected behavior: + - Exactly 2 tasks are pulled from queue (respecting concurrency) + - Each task is enqueued with correct parameters + - Task waiting time is set for each new task + """ + # Arrange + concurrency_limit = 2 + document_ids = [str(uuid.uuid4())] + + # Create multiple waiting tasks + waiting_tasks = [] + for i in range(5): + task_data = {"tenant_id": tenant_id, "dataset_id": dataset_id, "document_ids": [f"doc_{i}"]} + from core.rag.pipeline.queue import TaskWrapper + + wrapper = TaskWrapper(data=task_data) + waiting_tasks.append(wrapper.serialize()) + + # Mock rpop to return tasks up to concurrency limit + mock_redis.rpop.side_effect = waiting_tasks[:concurrency_limit] + [None] + mock_db_session.query.return_value.where.return_value.first.return_value = mock_dataset + + with patch("tasks.document_indexing_task.dify_config.TENANT_ISOLATED_TASK_CONCURRENCY", concurrency_limit): + with patch("tasks.document_indexing_task.normal_document_indexing_task") as mock_task: + # Act + _document_indexing_with_tenant_queue(tenant_id, dataset_id, document_ids, mock_task) + + # Assert + # Should enqueue exactly concurrency_limit tasks + assert mock_task.apply_async.call_count == concurrency_limit + + # Verify task waiting time was set for each task + assert mock_redis.setex.call_count >= concurrency_limit + + def test_vector_space_limit_edge_case_at_exact_limit( + self, dataset_id, document_ids, mock_db_session, mock_dataset, mock_feature_service + ): + """ + Test vector space limit validation at exact boundary. + + Edge case: When vector space is exactly at the limit (not over), + the upload should still be rejected. + + Scenario: + - Vector space limit: 100 + - Current size: 100 (exactly at limit) + - Try to upload 3 documents + + Expected behavior: + - Upload is rejected with appropriate error message + - All documents are marked with error status + """ + # Arrange + mock_documents = [] + for doc_id in document_ids: + doc = MagicMock(spec=Document) + doc.id = doc_id + doc.dataset_id = dataset_id + doc.indexing_status = "waiting" + doc.error = None + doc.stopped_at = None + mock_documents.append(doc) + + # Set shared mock data so all sessions can access it + mock_db_session._shared_data["dataset"] = mock_dataset + mock_db_session._shared_data["documents"] = mock_documents + + # Set vector space exactly at limit + mock_feature_service.get_features.return_value.billing.enabled = True + mock_feature_service.get_features.return_value.billing.subscription.plan = CloudPlan.PROFESSIONAL + mock_feature_service.get_features.return_value.vector_space.limit = 100 + mock_feature_service.get_features.return_value.vector_space.size = 100 # Exactly at limit + + # Act + _document_indexing(dataset_id, document_ids) + + # Assert - All documents should have error status + for doc in mock_documents: + assert doc.indexing_status == "error" + assert "over the limit" in doc.error + + def test_task_queue_fifo_ordering(self, tenant_id, dataset_id, mock_redis, mock_db_session, mock_dataset): + """ + Test that tasks are processed in FIFO (First-In-First-Out) order. + + The tenant isolated queue should maintain task order, ensuring + that tasks are processed in the sequence they were added. + + Scenario: + - Task A added first + - Task B added second + - Task C added third + - When pulling tasks, should get A, then B, then C + + Expected behavior: + - Tasks are retrieved in the order they were added + - FIFO ordering is maintained throughout processing + """ + # Arrange + document_ids = [str(uuid.uuid4())] + + # Create tasks with identifiable document IDs to track order + task_order = ["task_A", "task_B", "task_C"] + tasks = [] + for task_name in task_order: + task_data = {"tenant_id": tenant_id, "dataset_id": dataset_id, "document_ids": [task_name]} + from core.rag.pipeline.queue import TaskWrapper + + wrapper = TaskWrapper(data=task_data) + tasks.append(wrapper.serialize()) + + # Mock rpop to return tasks in FIFO order + mock_redis.rpop.side_effect = tasks + [None] + mock_db_session.query.return_value.where.return_value.first.return_value = mock_dataset + + with patch("tasks.document_indexing_task.dify_config.TENANT_ISOLATED_TASK_CONCURRENCY", 3): + with patch("tasks.document_indexing_task.normal_document_indexing_task") as mock_task: + # Act + _document_indexing_with_tenant_queue(tenant_id, dataset_id, document_ids, mock_task) + + # Assert - Verify tasks were enqueued in correct order + assert mock_task.apply_async.call_count == 3 + + # Check that document_ids in calls match expected order + for i, call_obj in enumerate(mock_task.apply_async.call_args_list): + called_doc_ids = call_obj[1]["kwargs"]["document_ids"] + assert called_doc_ids == [task_order[i]] + + def test_empty_queue_after_task_completion_cleans_up( + self, tenant_id, dataset_id, document_ids, mock_redis, mock_db_session, mock_dataset + ): + """ + Test cleanup behavior when queue becomes empty after task completion. + + After processing the last task in the queue, the system should: + 1. Detect that no more tasks are waiting + 2. Delete the task key to indicate tenant is idle + 3. Allow new tasks to start fresh processing + + Scenario: + - Process a task + - Check queue for next tasks + - Queue is empty + - Task key should be deleted + + Expected behavior: + - Task key is deleted when queue is empty + - Tenant is marked as idle (no active tasks) + """ + # Arrange + mock_redis.rpop.return_value = None # Empty queue + mock_db_session.query.return_value.where.return_value.first.return_value = mock_dataset + + with patch("tasks.document_indexing_task.normal_document_indexing_task") as mock_task: + # Act + _document_indexing_with_tenant_queue(tenant_id, dataset_id, document_ids, mock_task) + + # Assert + # Verify delete was called to clean up task key + mock_redis.delete.assert_called_once() + + # Verify the correct key was deleted (contains tenant_id and "document_indexing") + delete_call_args = mock_redis.delete.call_args[0][0] + assert tenant_id in delete_call_args + assert "document_indexing" in delete_call_args + + def test_billing_disabled_skips_limit_checks( + self, dataset_id, document_ids, mock_db_session, mock_dataset, mock_indexing_runner, mock_feature_service + ): + """ + Test that billing limit checks are skipped when billing is disabled. + + For self-hosted or enterprise deployments where billing is disabled, + the system should not enforce vector space or batch upload limits. + + Scenario: + - Billing is disabled + - Upload 100 documents (would normally exceed limits) + - No limit checks should be performed + + Expected behavior: + - Documents are processed without limit validation + - No errors related to limits + - All documents proceed to indexing + """ + # Arrange - Create many documents + large_batch_ids = [str(uuid.uuid4()) for _ in range(100)] + + mock_documents = [] + for doc_id in large_batch_ids: + doc = MagicMock(spec=Document) + doc.id = doc_id + doc.dataset_id = dataset_id + doc.indexing_status = "waiting" + doc.processing_started_at = None + mock_documents.append(doc) + + # Set shared mock data so all sessions can access it + mock_db_session._shared_data["dataset"] = mock_dataset + mock_db_session._shared_data["documents"] = mock_documents + + # Billing disabled - limits should not be checked + mock_feature_service.get_features.return_value.billing.enabled = False + + # Act + _document_indexing(dataset_id, large_batch_ids) + + # Assert + # All documents should be set to parsing (no limit errors) + for doc in mock_documents: + assert doc.indexing_status == "parsing" + + # IndexingRunner should be called with all documents + mock_indexing_runner.run.assert_called_once() + call_args = mock_indexing_runner.run.call_args[0][0] + assert len(call_args) == 100 + + +class TestIntegration: + """Integration tests for complete task workflows.""" + + def test_complete_workflow_normal_task( + self, tenant_id, dataset_id, document_ids, mock_redis, mock_db_session, mock_dataset, mock_indexing_runner + ): + """ + Test complete workflow for normal document indexing task. + + This tests the full flow from task receipt to completion. + """ + # Arrange - Create actual document objects + mock_documents = [] + for doc_id in document_ids: + doc = MagicMock(spec=Document) + doc.id = doc_id + doc.dataset_id = dataset_id + doc.indexing_status = "waiting" + doc.processing_started_at = None + mock_documents.append(doc) + + # Set up rpop to return None for concurrency check (no more tasks) + mock_redis.rpop.side_effect = [None] + # Set shared mock data so all sessions can access it + mock_db_session._shared_data["dataset"] = mock_dataset + mock_db_session._shared_data["documents"] = mock_documents + + with patch("tasks.document_indexing_task.FeatureService.get_features") as mock_features: + mock_features.return_value.billing.enabled = False + + # Act + normal_document_indexing_task(tenant_id, dataset_id, document_ids) + + # Assert + # Documents should be processed + mock_indexing_runner.run.assert_called_once() + # Session should be closed + assert mock_db_session.close.called + # Task key should be deleted (no more tasks) + assert mock_redis.delete.called + + def test_complete_workflow_priority_task( + self, tenant_id, dataset_id, document_ids, mock_redis, mock_db_session, mock_dataset, mock_indexing_runner + ): + """ + Test complete workflow for priority document indexing task. + + Priority tasks should follow the same flow as normal tasks. + """ + # Arrange - Create actual document objects + mock_documents = [] + for doc_id in document_ids: + doc = MagicMock(spec=Document) + doc.id = doc_id + doc.dataset_id = dataset_id + doc.indexing_status = "waiting" + doc.processing_started_at = None + mock_documents.append(doc) + + # Set up rpop to return None for concurrency check (no more tasks) + mock_redis.rpop.side_effect = [None] + # Set shared mock data so all sessions can access it + mock_db_session._shared_data["dataset"] = mock_dataset + mock_db_session._shared_data["documents"] = mock_documents + + with patch("tasks.document_indexing_task.FeatureService.get_features") as mock_features: + mock_features.return_value.billing.enabled = False + + # Act + priority_document_indexing_task(tenant_id, dataset_id, document_ids) + + # Assert + mock_indexing_runner.run.assert_called_once() + assert mock_db_session.close.called + assert mock_redis.delete.called + + def test_queue_chain_processing( + self, tenant_id, dataset_id, mock_redis, mock_db_session, mock_dataset, mock_indexing_runner + ): + """ + Test that multiple tasks in queue are processed in sequence. + + When tasks are queued, they should be processed one after another. + """ + # Arrange + task_1_docs = [str(uuid.uuid4())] + task_2_docs = [str(uuid.uuid4())] + + task_2_data = {"tenant_id": tenant_id, "dataset_id": dataset_id, "document_ids": task_2_docs} + + from core.rag.pipeline.queue import TaskWrapper + + wrapper = TaskWrapper(data=task_2_data) + + # First call returns task 2, second call returns None + mock_redis.rpop.side_effect = [wrapper.serialize(), None] + + mock_db_session.query.return_value.where.return_value.first.return_value = mock_dataset + + with patch("tasks.document_indexing_task.FeatureService.get_features") as mock_features: + mock_features.return_value.billing.enabled = False + + with patch("tasks.document_indexing_task.normal_document_indexing_task") as mock_task: + # Act - Process first task + _document_indexing_with_tenant_queue(tenant_id, dataset_id, task_1_docs, mock_task) + + # Assert - Second task should be enqueued + assert mock_task.apply_async.called + call_args = mock_task.apply_async.call_args + assert call_args[1]["kwargs"]["document_ids"] == task_2_docs + + # ============================================================================ # Additional Edge Case Tests # ============================================================================ @@ -249,6 +1329,107 @@ class TestEdgeCases: class TestPerformanceScenarios: """Test performance-related scenarios and optimizations.""" + def test_large_document_batch_processing( + self, dataset_id, mock_db_session, mock_dataset, mock_indexing_runner, mock_feature_service + ): + """ + Test processing a large batch of documents at batch limit. + + When processing the maximum allowed batch size, the system + should handle it efficiently without errors. + + Scenario: + - Process exactly batch_upload_limit documents (e.g., 50) + - All documents are valid + - Billing is enabled + + Expected behavior: + - All documents are processed successfully + - No timeout or memory issues + - Batch limit is not exceeded + """ + # Arrange + batch_limit = 50 + document_ids = [str(uuid.uuid4()) for _ in range(batch_limit)] + + mock_documents = [] + for doc_id in document_ids: + doc = MagicMock(spec=Document) + doc.id = doc_id + doc.dataset_id = dataset_id + doc.indexing_status = "waiting" + doc.processing_started_at = None + mock_documents.append(doc) + + # Set shared mock data so all sessions can access it + mock_db_session._shared_data["dataset"] = mock_dataset + mock_db_session._shared_data["documents"] = mock_documents + + # Configure billing with sufficient limits + mock_feature_service.get_features.return_value.billing.enabled = True + mock_feature_service.get_features.return_value.billing.subscription.plan = CloudPlan.PROFESSIONAL + mock_feature_service.get_features.return_value.vector_space.limit = 10000 + mock_feature_service.get_features.return_value.vector_space.size = 0 + + with patch("tasks.document_indexing_task.dify_config.BATCH_UPLOAD_LIMIT", str(batch_limit)): + # Act + _document_indexing(dataset_id, document_ids) + + # Assert + for doc in mock_documents: + assert doc.indexing_status == "parsing" + + mock_indexing_runner.run.assert_called_once() + call_args = mock_indexing_runner.run.call_args[0][0] + assert len(call_args) == batch_limit + + def test_tenant_queue_handles_burst_traffic(self, tenant_id, dataset_id, mock_redis, mock_db_session, mock_dataset): + """ + Test tenant queue handling burst traffic scenarios. + + When many tasks arrive in a burst for the same tenant, + the queue should handle them efficiently without dropping tasks. + + Scenario: + - 20 tasks arrive rapidly + - Concurrency limit is 3 + - Tasks should be queued and processed in batches + + Expected behavior: + - First 3 tasks are processed immediately + - Remaining tasks wait in queue + - No tasks are lost + """ + # Arrange + num_tasks = 20 + concurrency_limit = 3 + document_ids = [str(uuid.uuid4())] + + # Create waiting tasks + waiting_tasks = [] + for i in range(num_tasks): + task_data = { + "tenant_id": tenant_id, + "dataset_id": dataset_id, + "document_ids": [f"doc_{i}"], + } + from core.rag.pipeline.queue import TaskWrapper + + wrapper = TaskWrapper(data=task_data) + waiting_tasks.append(wrapper.serialize()) + + # Mock rpop to return tasks up to concurrency limit + mock_redis.rpop.side_effect = waiting_tasks[:concurrency_limit] + [None] + mock_db_session.query.return_value.where.return_value.first.return_value = mock_dataset + + with patch("tasks.document_indexing_task.dify_config.TENANT_ISOLATED_TASK_CONCURRENCY", concurrency_limit): + with patch("tasks.document_indexing_task.normal_document_indexing_task") as mock_task: + # Act + _document_indexing_with_tenant_queue(tenant_id, dataset_id, document_ids, mock_task) + + # Assert - Should process exactly concurrency_limit tasks + assert mock_task.apply_async.call_count == concurrency_limit + def test_multiple_tenants_isolated_processing(self, mock_redis): """ Test that multiple tenants process tasks in isolation.