From 6535613419df0a024f5d5310610d776b59bf4baf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A4=8D=E8=AF=95=E8=B5=84=E6=96=99?= Date: Mon, 23 Mar 2026 14:54:43 +0800 Subject: [PATCH 1/3] fix: avoid persisted process rules in indexing preview --- api/core/indexing_runner.py | 13 ++++--- api/models/dataset.py | 6 +-- .../core/rag/indexing/test_indexing_runner.py | 38 +++++++++++++++++++ 3 files changed, 48 insertions(+), 9 deletions(-) diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py index 52776ee626..86e6964b08 100644 --- a/api/core/indexing_runner.py +++ b/api/core/indexing_runner.py @@ -38,7 +38,7 @@ from extensions.ext_storage import storage from libs import helper from libs.datetime_utils import naive_utc_now from models import Account -from models.dataset import AutomaticRulesConfig, ChildChunk, Dataset, DatasetProcessRule, DocumentSegment +from models.dataset import AutomaticRulesConfig, ChildChunk, Dataset, DatasetProcessRule, DocumentSegment, ProcessRuleDict from models.dataset import Document as DatasetDocument from models.enums import DataSourceType, IndexingStatus, ProcessRuleMode, SegmentStatus from models.model import UploadFile @@ -318,10 +318,11 @@ class IndexingRunner: index_processor = IndexProcessorFactory(index_type).init_index_processor() # one extract_setting is one source document for extract_setting in extract_settings: - # extract - processing_rule = DatasetProcessRule( - mode=tmp_processing_rule["mode"], rules=json.dumps(tmp_processing_rule["rules"]) - ) + # Preview only needs the rule payload, not a persisted ORM record. + processing_rule: ProcessRuleDict = { + "mode": tmp_processing_rule["mode"], + "rules": tmp_processing_rule.get("rules"), + } # Extract document content text_docs = index_processor.extract(extract_setting, process_rule_mode=tmp_processing_rule["mode"]) # Cleaning and segmentation @@ -329,7 +330,7 @@ class IndexingRunner: text_docs, current_user=None, embedding_model_instance=embedding_model_instance, - process_rule=processing_rule.to_dict(), + process_rule=processing_rule, tenant_id=tenant_id, doc_language=doc_language, preview=True, diff --git a/api/models/dataset.py b/api/models/dataset.py index d0163e6984..e180b63ab0 100644 --- a/api/models/dataset.py +++ b/api/models/dataset.py @@ -11,7 +11,7 @@ import time from collections.abc import Sequence from datetime import datetime from json import JSONDecodeError -from typing import Any, TypedDict, cast +from typing import Any, NotRequired, TypedDict, cast from uuid import uuid4 import sqlalchemy as sa @@ -68,8 +68,8 @@ class AutomaticRulesConfig(TypedDict): class ProcessRuleDict(TypedDict): - id: str - dataset_id: str + id: NotRequired[str] + dataset_id: NotRequired[str] mode: str rules: dict[str, Any] | None diff --git a/api/tests/unit_tests/core/rag/indexing/test_indexing_runner.py b/api/tests/unit_tests/core/rag/indexing/test_indexing_runner.py index b011ade884..70b5e462ec 100644 --- a/api/tests/unit_tests/core/rag/indexing/test_indexing_runner.py +++ b/api/tests/unit_tests/core/rag/indexing/test_indexing_runner.py @@ -1431,6 +1431,44 @@ class TestIndexingRunnerEstimate: doc_form=IndexStructureType.PARAGRAPH_INDEX, ) + def test_indexing_estimate_uses_lightweight_preview_process_rule(self, mock_dependencies): + """Test preview estimation avoids constructing a persisted DatasetProcessRule ORM object.""" + # Arrange + runner = IndexingRunner() + tenant_id = str(uuid.uuid4()) + extract_setting = MagicMock() + tmp_processing_rule = create_mock_process_rule(mode="automatic") + + mock_features = MagicMock() + mock_features.billing.enabled = False + mock_dependencies["feature_service"].get_features.return_value = mock_features + + mock_processor = MagicMock() + mock_processor.extract.return_value = [Document(page_content="source", metadata={"doc_id": "source-1"})] + mock_processor.transform.return_value = [Document(page_content="preview chunk", metadata={"doc_id": "chunk-1"})] + mock_dependencies["factory"].return_value.init_index_processor.return_value = mock_processor + + with patch("core.indexing_runner.DatasetProcessRule", side_effect=AssertionError("ORM should not be created")): + # Act + estimate = runner.indexing_estimate( + tenant_id=tenant_id, + extract_settings=[extract_setting], + tmp_processing_rule=tmp_processing_rule, + doc_form=IndexStructureType.PARAGRAPH_INDEX, + ) + + # Assert + assert estimate.total_segments == 1 + assert len(estimate.preview) == 1 + assert estimate.preview[0].content == "preview chunk" + + transform_kwargs = mock_processor.transform.call_args.kwargs + assert transform_kwargs["preview"] is True + assert transform_kwargs["process_rule"] == { + "mode": tmp_processing_rule["mode"], + "rules": tmp_processing_rule["rules"], + } + class TestIndexingRunnerProcessChunk: """Unit tests for chunk processing in parallel. From c50c3b574e0146a98d6e0280dc5f77a93b7fc43f Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 06:58:54 +0000 Subject: [PATCH 2/3] [autofix.ci] apply automated fixes --- api/core/indexing_runner.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py index 86e6964b08..04d8293ccb 100644 --- a/api/core/indexing_runner.py +++ b/api/core/indexing_runner.py @@ -38,7 +38,14 @@ from extensions.ext_storage import storage from libs import helper from libs.datetime_utils import naive_utc_now from models import Account -from models.dataset import AutomaticRulesConfig, ChildChunk, Dataset, DatasetProcessRule, DocumentSegment, ProcessRuleDict +from models.dataset import ( + AutomaticRulesConfig, + ChildChunk, + Dataset, + DatasetProcessRule, + DocumentSegment, + ProcessRuleDict, +) from models.dataset import Document as DatasetDocument from models.enums import DataSourceType, IndexingStatus, ProcessRuleMode, SegmentStatus from models.model import UploadFile From 89980f01e64c6da6790da7f7293436ce04a4d78e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A4=8D=E8=AF=95=E8=B5=84=E6=96=99?= Date: Mon, 23 Mar 2026 15:13:17 +0800 Subject: [PATCH 3/3] test: align indexing preview assertion with implementation --- api/tests/unit_tests/core/rag/indexing/test_indexing_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/tests/unit_tests/core/rag/indexing/test_indexing_runner.py b/api/tests/unit_tests/core/rag/indexing/test_indexing_runner.py index 70b5e462ec..cf24dcf544 100644 --- a/api/tests/unit_tests/core/rag/indexing/test_indexing_runner.py +++ b/api/tests/unit_tests/core/rag/indexing/test_indexing_runner.py @@ -1466,7 +1466,7 @@ class TestIndexingRunnerEstimate: assert transform_kwargs["preview"] is True assert transform_kwargs["process_rule"] == { "mode": tmp_processing_rule["mode"], - "rules": tmp_processing_rule["rules"], + "rules": tmp_processing_rule.get("rules"), }