mirror of https://github.com/langgenius/dify.git
Merge 89980f01e6 into a813b9f103
This commit is contained in:
commit
a9184872a3
|
|
@@ -38,7 +38,14 @@ from extensions.ext_storage import storage
|
|||
from libs import helper
|
||||
from libs.datetime_utils import naive_utc_now
|
||||
from models import Account
|
||||
from models.dataset import AutomaticRulesConfig, ChildChunk, Dataset, DatasetProcessRule, DocumentSegment
|
||||
from models.dataset import (
|
||||
AutomaticRulesConfig,
|
||||
ChildChunk,
|
||||
Dataset,
|
||||
DatasetProcessRule,
|
||||
DocumentSegment,
|
||||
ProcessRuleDict,
|
||||
)
|
||||
from models.dataset import Document as DatasetDocument
|
||||
from models.enums import DataSourceType, IndexingStatus, ProcessRuleMode, SegmentStatus
|
||||
from models.model import UploadFile
|
||||
|
|
@@ -318,10 +325,11 @@ class IndexingRunner:
|
|||
index_processor = IndexProcessorFactory(index_type).init_index_processor()
|
||||
# one extract_setting is one source document
|
||||
for extract_setting in extract_settings:
|
||||
# extract
|
||||
processing_rule = DatasetProcessRule(
|
||||
mode=tmp_processing_rule["mode"], rules=json.dumps(tmp_processing_rule["rules"])
|
||||
)
|
||||
# Preview only needs the rule payload, not a persisted ORM record.
|
||||
processing_rule: ProcessRuleDict = {
|
||||
"mode": tmp_processing_rule["mode"],
|
||||
"rules": tmp_processing_rule.get("rules"),
|
||||
}
|
||||
# Extract document content
|
||||
text_docs = index_processor.extract(extract_setting, process_rule_mode=tmp_processing_rule["mode"])
|
||||
# Cleaning and segmentation
|
||||
|
|
@@ -329,7 +337,7 @@ class IndexingRunner:
|
|||
text_docs,
|
||||
current_user=None,
|
||||
embedding_model_instance=embedding_model_instance,
|
||||
process_rule=processing_rule.to_dict(),
|
||||
process_rule=processing_rule,
|
||||
tenant_id=tenant_id,
|
||||
doc_language=doc_language,
|
||||
preview=True,
|
||||
|
|
|
|||
|
|
@@ -11,7 +11,7 @@ import time
|
|||
from collections.abc import Sequence
|
||||
from datetime import datetime
|
||||
from json import JSONDecodeError
|
||||
from typing import Any, TypedDict, cast
|
||||
from typing import Any, NotRequired, TypedDict, cast
|
||||
from uuid import uuid4
|
||||
|
||||
import sqlalchemy as sa
|
||||
|
|
@@ -70,8 +70,8 @@ class AutomaticRulesConfig(TypedDict):
|
|||
|
||||
|
||||
class ProcessRuleDict(TypedDict):
|
||||
id: str
|
||||
dataset_id: str
|
||||
id: NotRequired[str]
|
||||
dataset_id: NotRequired[str]
|
||||
mode: str
|
||||
rules: dict[str, Any] | None
|
||||
|
||||
|
|
|
|||
|
|
@@ -1431,6 +1431,44 @@ class TestIndexingRunnerEstimate:
|
|||
doc_form=IndexStructureType.PARAGRAPH_INDEX,
|
||||
)
|
||||
|
||||
def test_indexing_estimate_uses_lightweight_preview_process_rule(self, mock_dependencies):
    """Test preview estimation avoids constructing a persisted DatasetProcessRule ORM object.

    ``DatasetProcessRule`` is patched inside ``core.indexing_runner`` so that any
    attempt to instantiate it raises ``AssertionError``. The estimate must still
    succeed, and the index processor's ``transform`` must receive the process rule
    as a plain ``{"mode": ..., "rules": ...}`` mapping.
    """
    # Arrange
    runner = IndexingRunner()
    tenant_id = str(uuid.uuid4())
    extract_setting = MagicMock()
    tmp_processing_rule = create_mock_process_rule(mode="automatic")

    # Billing is switched off on the mocked feature set for this scenario.
    mock_features = MagicMock()
    mock_features.billing.enabled = False
    mock_dependencies["feature_service"].get_features.return_value = mock_features

    # One extracted source document that is transformed into one preview chunk.
    mock_processor = MagicMock()
    mock_processor.extract.return_value = [Document(page_content="source", metadata={"doc_id": "source-1"})]
    mock_processor.transform.return_value = [Document(page_content="preview chunk", metadata={"doc_id": "chunk-1"})]
    mock_dependencies["factory"].return_value.init_index_processor.return_value = mock_processor

    # Any ORM construction inside the runner fails the test immediately.
    with patch("core.indexing_runner.DatasetProcessRule", side_effect=AssertionError("ORM should not be created")):
        # Act
        estimate = runner.indexing_estimate(
            tenant_id=tenant_id,
            extract_settings=[extract_setting],
            tmp_processing_rule=tmp_processing_rule,
            doc_form=IndexStructureType.PARAGRAPH_INDEX,
        )

    # Assert
    assert estimate.total_segments == 1
    assert len(estimate.preview) == 1
    assert estimate.preview[0].content == "preview chunk"

    # Confirm transform ran before reading call_args; otherwise call_args is None
    # and the failure would be an opaque AttributeError instead of a clear message.
    mock_processor.transform.assert_called()
    transform_kwargs = mock_processor.transform.call_args.kwargs
    assert transform_kwargs["preview"] is True
    assert transform_kwargs["process_rule"] == {
        "mode": tmp_processing_rule["mode"],
        "rules": tmp_processing_rule.get("rules"),
    }
|
||||
|
||||
|
||||
class TestIndexingRunnerProcessChunk:
|
||||
"""Unit tests for chunk processing in parallel.
|
||||
|
|
|
|||
Loading…
Reference in New Issue