test(api): split dataset_service unit tests by service area

Rajat Agarwal 2026-03-24 12:57:43 +05:30
parent 90b28a6d17
commit 9263442a4e
5 changed files with 5301 additions and 5129 deletions

@@ -0,0 +1,455 @@
"""Shared helpers for dataset_service unit tests.
These factories and lightweight builders are reused across the dataset,
document, and segment service test modules that exercise
``api/services/dataset_service.py``.
"""
import json
from types import SimpleNamespace
from unittest.mock import MagicMock, Mock, create_autospec, patch
import pytest
from werkzeug.exceptions import Forbidden, NotFound
from core.errors.error import LLMBadRequestError, ProviderTokenNotInitError
from core.rag.index_processor.constant.built_in_field import BuiltInField
from core.rag.index_processor.constant.index_type import IndexStructureType
from core.rag.retrieval.retrieval_methods import RetrievalMethod
from dify_graph.model_runtime.entities.model_entities import ModelFeature, ModelType
from enums.cloud_plan import CloudPlan
from models import Account, TenantAccountRole
from models.dataset import (
ChildChunk,
Dataset,
DatasetPermissionEnum,
DatasetProcessRule,
Document,
DocumentSegment,
)
from models.model import UploadFile
from services.dataset_service import (
DatasetCollectionBindingService,
DatasetPermissionService,
DatasetService,
DocumentService,
SegmentService,
)
from services.entities.knowledge_entities.knowledge_entities import (
ChildChunkUpdateArgs,
DataSource,
FileInfo,
InfoList,
KnowledgeConfig,
NotionIcon,
NotionInfo,
NotionPage,
PreProcessingRule,
ProcessRule,
RerankingModel,
RetrievalModel,
Rule,
Segmentation,
SegmentUpdateArgs,
WebsiteInfo,
)
from services.entities.knowledge_entities.rag_pipeline_entities import (
IconInfo as PipelineIconInfo,
)
from services.entities.knowledge_entities.rag_pipeline_entities import (
KnowledgeConfiguration,
RagPipelineDatasetCreateEntity,
)
from services.entities.knowledge_entities.rag_pipeline_entities import (
RerankingModelConfig as RagPipelineRerankingModelConfig,
)
from services.entities.knowledge_entities.rag_pipeline_entities import (
RetrievalSetting as RagPipelineRetrievalSetting,
)
from services.errors.account import NoPermissionError
from services.errors.chunk import ChildChunkDeleteIndexError, ChildChunkIndexingError
from services.errors.dataset import DatasetNameDuplicateError
from services.errors.document import DocumentIndexingError
from services.errors.file import FileNotExistsError

__all__ = [
    "Account",
    "BuiltInField",
    "ChildChunk",
    "ChildChunkDeleteIndexError",
    "ChildChunkIndexingError",
    "ChildChunkUpdateArgs",
    "CloudPlan",
    "DataSource",
    "Dataset",
    "DatasetCollectionBindingService",
    "DatasetNameDuplicateError",
    "DatasetPermissionEnum",
    "DatasetPermissionService",
    "DatasetProcessRule",
    "DatasetService",
    "DatasetServiceUnitDataFactory",
    "Document",
    "DocumentIndexingError",
    "DocumentSegment",
    "DocumentService",
    "FileInfo",
    "FileNotExistsError",
    "Forbidden",
    "IndexStructureType",
    "InfoList",
    "KnowledgeConfig",
    "KnowledgeConfiguration",
    "LLMBadRequestError",
    "MagicMock",
    "Mock",
    "ModelFeature",
    "ModelType",
    "NoPermissionError",
    "NotFound",
    "NotionIcon",
    "NotionInfo",
    "NotionPage",
    "PipelineIconInfo",
    "PreProcessingRule",
    "ProcessRule",
    "ProviderTokenNotInitError",
    "RagPipelineDatasetCreateEntity",
    "RagPipelineRerankingModelConfig",
    "RagPipelineRetrievalSetting",
    "RerankingModel",
    "RetrievalMethod",
    "RetrievalModel",
    "Rule",
    "SegmentService",
    "SegmentUpdateArgs",
    "Segmentation",
    "SimpleNamespace",
    "TenantAccountRole",
    "WebsiteInfo",
    "_make_child_chunk",
    "_make_dataset",
    "_make_document",
    "_make_features",
    "_make_knowledge_configuration",
    "_make_lock_context",
    "_make_retrieval_model",
    "_make_segment",
    "_make_session_context",
    "_make_upload_knowledge_config",
    "create_autospec",
    "json",
    "patch",
    "pytest",
]
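
# ``__all__`` is explicit so the per-area test modules can pull everything in
# with a single star import; the module name below is illustrative, not part of
# this diff:
#
#     from .dataset_service_test_helpers import *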


def _make_session_context(session: MagicMock) -> MagicMock:
    """Wrap a mocked session in a context manager."""
    context_manager = MagicMock()
    context_manager.__enter__.return_value = session
    context_manager.__exit__.return_value = False
    return context_manager
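
# Quick sanity sketch (hypothetical test snippet) of the helper's behaviour:
#
#     session = MagicMock()
#     with _make_session_context(session) as s:
#         assert s is session  # __enter__ yields the wrapped mock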


class DatasetServiceUnitDataFactory:
    """Factory for lightweight doubles used across dataset service tests."""

    @staticmethod
    def create_dataset_mock(
        dataset_id: str = "dataset-123",
        tenant_id: str = "tenant-123",
        *,
        permission: str = DatasetPermissionEnum.ALL_TEAM,
        created_by: str = "user-123",
        indexing_technique: str = "economy",
        embedding_model_provider: str = "provider",
        embedding_model: str = "model",
        built_in_field_enabled: bool = False,
        doc_form: str | None = "text_model",
        enable_api: bool = False,
        summary_index_setting: dict | None = None,
        **kwargs,
    ) -> Mock:
        dataset = Mock(spec=Dataset)
        dataset.id = dataset_id
        dataset.tenant_id = tenant_id
        dataset.permission = permission
        dataset.created_by = created_by
        dataset.indexing_technique = indexing_technique
        dataset.embedding_model_provider = embedding_model_provider
        dataset.embedding_model = embedding_model
        dataset.built_in_field_enabled = built_in_field_enabled
        dataset.doc_form = doc_form
        dataset.enable_api = enable_api
        dataset.updated_by = None
        dataset.updated_at = None
        dataset.summary_index_setting = summary_index_setting
        for key, value in kwargs.items():
            setattr(dataset, key, value)
        return dataset

    @staticmethod
    def create_user_mock(
        user_id: str = "user-123",
        tenant_id: str = "tenant-123",
        role: str = TenantAccountRole.OWNER,
        **kwargs,
    ) -> SimpleNamespace:
        user = SimpleNamespace(
            id=user_id,
            current_tenant_id=tenant_id,
            current_role=role,
        )
        for key, value in kwargs.items():
            setattr(user, key, value)
        return user

    @staticmethod
    def create_document_mock(
        document_id: str = "doc-123",
        dataset_id: str = "dataset-123",
        tenant_id: str = "tenant-123",
        *,
        indexing_status: str = "completed",
        is_paused: bool = False,
        archived: bool = False,
        enabled: bool = True,
        data_source_type: str = "upload_file",
        data_source_info_dict: dict | None = None,
        data_source_info: str | None = None,
        doc_form: str = "text_model",
        need_summary: bool = True,
        position: int = 0,
        doc_metadata: dict | None = None,
        name: str = "Document",
        **kwargs,
    ) -> Mock:
        document = Mock(spec=Document)
        document.id = document_id
        document.dataset_id = dataset_id
        document.tenant_id = tenant_id
        document.indexing_status = indexing_status
        document.is_paused = is_paused
        document.paused_by = None
        document.paused_at = None
        document.archived = archived
        document.enabled = enabled
        document.data_source_type = data_source_type
        document.data_source_info_dict = data_source_info_dict or {}
        document.data_source_info = data_source_info
        document.doc_form = doc_form
        document.need_summary = need_summary
        document.position = position
        document.doc_metadata = doc_metadata
        document.name = name
        for key, value in kwargs.items():
            setattr(document, key, value)
        return document

    @staticmethod
    def create_upload_file_mock(file_id: str = "file-123", name: str = "upload.txt") -> Mock:
        upload_file = Mock(spec=UploadFile)
        upload_file.id = file_id
        upload_file.name = name
        return upload_file
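
# Typical pairing (hypothetical test body): the factory's dataset and user
# doubles feed the permission checks in ``DatasetService``; the exact service
# call shown here is illustrative.
#
#     dataset = DatasetServiceUnitDataFactory.create_dataset_mock(
#         permission=DatasetPermissionEnum.ONLY_ME, created_by="user-123"
#     )
#     user = DatasetServiceUnitDataFactory.create_user_mock(user_id="user-123")
#     DatasetService.check_dataset_permission(dataset, user)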


_UNSET = object()
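# The sentinel lets ``_make_upload_knowledge_config`` below distinguish
# "argument omitted" (build a default upload-file ``DataSource``) from an
# explicit ``data_source=None``.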


def _make_lock_context() -> MagicMock:
    context_manager = MagicMock()
    context_manager.__enter__.return_value = None
    context_manager.__exit__.return_value = False
    return context_manager
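
# Example (patch target named only for illustration, not taken from this
# diff): stand in for a lock the service acquires via ``with redis_client.lock(...):``.
#
#     with patch("services.dataset_service.redis_client") as redis_client:
#         redis_client.lock.return_value = _make_lock_context()
#         ...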


def _make_features(*, enabled: bool, plan: str = CloudPlan.PROFESSIONAL) -> SimpleNamespace:
    return SimpleNamespace(
        billing=SimpleNamespace(
            enabled=enabled,
            subscription=SimpleNamespace(plan=plan),
        ),
        documents_upload_quota=SimpleNamespace(limit=1000, size=0),
    )
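
# Example (hypothetical): emulate a billing-enabled tenant so quota branches
# run; the patch target is an assumption, not part of this diff.
#
#     with patch(
#         "services.dataset_service.FeatureService.get_features",
#         return_value=_make_features(enabled=True),
#     ):
#         ...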


def _make_dataset(
    *,
    dataset_id: str = "dataset-1",
    tenant_id: str = "tenant-1",
    data_source_type: str | None = None,
    indexing_technique: str | None = "economy",
    latest_process_rule=None,
) -> Mock:
    dataset = Mock(spec=Dataset)
    dataset.id = dataset_id
    dataset.tenant_id = tenant_id
    dataset.data_source_type = data_source_type
    dataset.indexing_technique = indexing_technique
    dataset.latest_process_rule = latest_process_rule
    dataset.embedding_model_provider = "provider"
    dataset.embedding_model = "embedding-model"
    dataset.summary_index_setting = None
    dataset.retrieval_model = None
    dataset.collection_binding_id = None
    return dataset
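
# ``_make_dataset`` is the slimmer, keyword-only sibling of
# ``DatasetServiceUnitDataFactory.create_dataset_mock``: it pre-seeds the
# embedding/retrieval/collection-binding attributes that the update and delete
# paths read.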


def _make_document(
    *,
    document_id: str = "doc-1",
    dataset_id: str = "dataset-1",
    tenant_id: str = "tenant-1",
    batch: str = "batch-1",
    doc_form: str = IndexStructureType.PARAGRAPH_INDEX,
    word_count: int = 0,
    name: str = "Document 1",
    enabled: bool = True,
    archived: bool = False,
    indexing_status: str = "completed",
    display_status: str = "available",
) -> Mock:
    document = Mock(spec=Document)
    document.id = document_id
    document.dataset_id = dataset_id
    document.tenant_id = tenant_id
    document.batch = batch
    document.doc_form = doc_form
    document.word_count = word_count
    document.name = name
    document.enabled = enabled
    document.archived = archived
    document.indexing_status = indexing_status
    document.display_status = display_status
    document.data_source_type = "upload_file"
    document.data_source_info = "{}"
    document.completed_at = SimpleNamespace()
    document.processing_started_at = "started"
    document.parsing_completed_at = "parsed"
    document.cleaning_completed_at = "cleaned"
    document.splitting_completed_at = "split"
    document.updated_at = None
    document.created_from = None
    document.dataset_process_rule_id = "process-rule-1"
    return document
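
# The timestamp-like attributes above are deliberately plain strings and
# namespaces: the tests only assert that they are passed through, not that
# they are real datetimes.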


def _make_segment(
    *,
    segment_id: str = "segment-1",
    content: str = "segment content",
    word_count: int = 15,
    enabled: bool = True,
    keywords: list[str] | None = None,
    index_node_id: str = "node-1",
    dataset_id: str = "dataset-1",
    document_id: str = "doc-1",
) -> Mock:
    segment = Mock(spec=DocumentSegment)
    segment.id = segment_id
    segment.dataset_id = dataset_id
    segment.document_id = document_id
    segment.content = content
    segment.word_count = word_count
    segment.enabled = enabled
    segment.keywords = keywords or []
    segment.answer = None
    segment.index_node_id = index_node_id
    segment.disabled_at = None
    segment.disabled_by = None
    segment.status = "completed"
    segment.error = None
    return segment
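
# Overrides land as plain attributes on the mock (hypothetical snippet):
#
#     segment = _make_segment(enabled=False, keywords=["alpha"])
#     assert segment.keywords == ["alpha"]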


def _make_child_chunk() -> ChildChunk:
    return ChildChunk(
        id="child-a",
        tenant_id="tenant-1",
        dataset_id="dataset-1",
        document_id="doc-1",
        segment_id="segment-1",
        position=1,
        content="old content",
        word_count=11,
        created_by="user-1",
    )


def _make_upload_knowledge_config(
    *,
    original_document_id: str | None = None,
    file_ids: list[str] | None = None,
    process_rule: ProcessRule | None = None,
    data_source: DataSource | object | None = _UNSET,
) -> KnowledgeConfig:
    if data_source is _UNSET:
        info_list = InfoList(
            data_source_type="upload_file",
            file_info_list=FileInfo(file_ids=file_ids) if file_ids is not None else None,
        )
        data_source = DataSource(info_list=info_list)
    return KnowledgeConfig(
        original_document_id=original_document_id,
        indexing_technique="economy",
        data_source=data_source,
        process_rule=process_rule,
        doc_form=IndexStructureType.PARAGRAPH_INDEX,
        doc_language="English",
    )
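
# Because of the ``_UNSET`` default, these calls differ (illustrative):
#
#     _make_upload_knowledge_config()                  # default upload_file source
#     _make_upload_knowledge_config(file_ids=["f-1"])  # default source with files
#     _make_upload_knowledge_config(data_source=None)  # explicitly no data source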


def _make_retrieval_model(
    *,
    reranking_provider_name: str = "rerank-provider",
    reranking_model_name: str = "rerank-model",
) -> RetrievalModel:
    return RetrievalModel(
        search_method=RetrievalMethod.SEMANTIC_SEARCH,
        reranking_enable=True,
        reranking_model=RerankingModel(
            reranking_provider_name=reranking_provider_name,
            reranking_model_name=reranking_model_name,
        ),
        reranking_mode="reranking_model",
        top_k=4,
        score_threshold_enabled=False,
    )


def _make_rag_pipeline_retrieval_setting() -> RagPipelineRetrievalSetting:
    return RagPipelineRetrievalSetting(
        search_method=RetrievalMethod.SEMANTIC_SEARCH,
        top_k=4,
        score_threshold=0.5,
        score_threshold_enabled=True,
        reranking_mode="reranking_model",
        reranking_enable=True,
        reranking_model=RagPipelineRerankingModelConfig(
            reranking_provider_name="rerank-provider",
            reranking_model_name="rerank-model",
        ),
    )
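
# Not exported via ``__all__``: this setting object is only consumed by
# ``_make_knowledge_configuration`` directly below.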


def _make_knowledge_configuration(
    *,
    chunk_structure: str = "paragraph",
    indexing_technique: str = "high_quality",
    embedding_model_provider: str = "provider",
    embedding_model: str = "embedding-model",
    keyword_number: int = 8,
    summary_index_setting: dict | None = None,
) -> KnowledgeConfiguration:
    return KnowledgeConfiguration(
        chunk_structure=chunk_structure,
        indexing_technique=indexing_technique,
        embedding_model_provider=embedding_model_provider,
        embedding_model=embedding_model,
        keyword_number=keyword_number,
        retrieval_model=_make_rag_pipeline_retrieval_setting(),
        summary_index_setting=summary_index_setting,
    )

The diffs for the remaining four files are suppressed because they are too large.