This commit is contained in:
wangji0923 2026-03-24 11:12:12 +01:00 committed by GitHub
commit a9184872a3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 55 additions and 9 deletions

View File

@ -38,7 +38,14 @@ from extensions.ext_storage import storage
from libs import helper
from libs.datetime_utils import naive_utc_now
from models import Account
from models.dataset import AutomaticRulesConfig, ChildChunk, Dataset, DatasetProcessRule, DocumentSegment
from models.dataset import (
AutomaticRulesConfig,
ChildChunk,
Dataset,
DatasetProcessRule,
DocumentSegment,
ProcessRuleDict,
)
from models.dataset import Document as DatasetDocument
from models.enums import DataSourceType, IndexingStatus, ProcessRuleMode, SegmentStatus
from models.model import UploadFile
@ -318,10 +325,11 @@ class IndexingRunner:
index_processor = IndexProcessorFactory(index_type).init_index_processor()
# one extract_setting is one source document
for extract_setting in extract_settings:
# extract
processing_rule = DatasetProcessRule(
mode=tmp_processing_rule["mode"], rules=json.dumps(tmp_processing_rule["rules"])
)
# Preview only needs the rule payload, not a persisted ORM record.
processing_rule: ProcessRuleDict = {
"mode": tmp_processing_rule["mode"],
"rules": tmp_processing_rule.get("rules"),
}
# Extract document content
text_docs = index_processor.extract(extract_setting, process_rule_mode=tmp_processing_rule["mode"])
# Cleaning and segmentation
@ -329,7 +337,7 @@ class IndexingRunner:
text_docs,
current_user=None,
embedding_model_instance=embedding_model_instance,
process_rule=processing_rule.to_dict(),
process_rule=processing_rule,
tenant_id=tenant_id,
doc_language=doc_language,
preview=True,

View File

@ -11,7 +11,7 @@ import time
from collections.abc import Sequence
from datetime import datetime
from json import JSONDecodeError
from typing import Any, TypedDict, cast
from typing import Any, NotRequired, TypedDict, cast
from uuid import uuid4
import sqlalchemy as sa
@ -70,8 +70,8 @@ class AutomaticRulesConfig(TypedDict):
class ProcessRuleDict(TypedDict):
id: str
dataset_id: str
id: NotRequired[str]
dataset_id: NotRequired[str]
mode: str
rules: dict[str, Any] | None

View File

@ -1431,6 +1431,44 @@ class TestIndexingRunnerEstimate:
doc_form=IndexStructureType.PARAGRAPH_INDEX,
)
def test_indexing_estimate_uses_lightweight_preview_process_rule(self, mock_dependencies):
    """Test preview estimation avoids constructing a persisted DatasetProcessRule ORM object."""
    # --- Arrange ---------------------------------------------------------
    runner = IndexingRunner()
    tenant_id = str(uuid.uuid4())
    extract_setting = MagicMock()
    tmp_processing_rule = create_mock_process_rule(mode="automatic")

    # Billing disabled so the estimate path is not short-circuited.
    features = MagicMock()
    features.billing.enabled = False
    mock_dependencies["feature_service"].get_features.return_value = features

    # Index processor stub: one source doc in, one preview chunk out.
    processor = MagicMock()
    processor.extract.return_value = [
        Document(page_content="source", metadata={"doc_id": "source-1"})
    ]
    processor.transform.return_value = [
        Document(page_content="preview chunk", metadata={"doc_id": "chunk-1"})
    ]
    mock_dependencies["factory"].return_value.init_index_processor.return_value = processor

    # Any attempt to build the ORM object during preview must blow up the test.
    with patch(
        "core.indexing_runner.DatasetProcessRule",
        side_effect=AssertionError("ORM should not be created"),
    ):
        # --- Act ---------------------------------------------------------
        estimate = runner.indexing_estimate(
            tenant_id=tenant_id,
            extract_settings=[extract_setting],
            tmp_processing_rule=tmp_processing_rule,
            doc_form=IndexStructureType.PARAGRAPH_INDEX,
        )

    # --- Assert ----------------------------------------------------------
    assert estimate.total_segments == 1
    assert len(estimate.preview) == 1
    assert estimate.preview[0].content == "preview chunk"

    # The transform call must receive the lightweight dict, not an ORM dump.
    transform_kwargs = processor.transform.call_args.kwargs
    assert transform_kwargs["preview"] is True
    assert transform_kwargs["process_rule"] == {
        "mode": tmp_processing_rule["mode"],
        "rules": tmp_processing_rule.get("rules"),
    }
class TestIndexingRunnerProcessChunk:
"""Unit tests for chunk processing in parallel.