From 3454224ff9c74c48a01175a098386a77b5a3928d Mon Sep 17 00:00:00 2001 From: BitToby <218712309+bittoby@users.noreply.github.com> Date: Wed, 18 Mar 2026 06:26:49 +0200 Subject: [PATCH] refactor(api): replace dict with SummaryIndexSettingDict TypedDict in core/rag (#33633) --- api/core/rag/index_processor/index_processor.py | 10 ++++++++-- api/core/rag/index_processor/index_processor_base.py | 12 ++++++++++-- .../processor/paragraph_index_processor.py | 6 +++--- .../processor/parent_child_index_processor.py | 4 ++-- .../index_processor/processor/qa_index_processor.py | 4 ++-- api/core/rag/summary_index/summary_index.py | 7 ++++++- api/core/workflow/nodes/knowledge_index/entities.py | 3 ++- .../nodes/knowledge_index/knowledge_index_node.py | 3 ++- api/services/summary_index_service.py | 7 ++++--- 9 files changed, 39 insertions(+), 17 deletions(-) diff --git a/api/core/rag/index_processor/index_processor.py b/api/core/rag/index_processor/index_processor.py index a7c42c5a4e..d9145023ac 100644 --- a/api/core/rag/index_processor/index_processor.py +++ b/api/core/rag/index_processor/index_processor.py @@ -9,6 +9,7 @@ from flask import current_app from sqlalchemy import delete, func, select from core.db.session_factory import session_factory +from core.rag.index_processor.index_processor_base import SummaryIndexSettingDict from core.workflow.nodes.knowledge_index.exc import KnowledgeIndexNodeError from core.workflow.nodes.knowledge_index.protocols import Preview, PreviewItem, QaPreview from models.dataset import Dataset, Document, DocumentSegment @@ -51,7 +52,7 @@ class IndexProcessor: original_document_id: str, chunks: Mapping[str, Any], batch: Any, - summary_index_setting: dict | None = None, + summary_index_setting: SummaryIndexSettingDict | None = None, ): with session_factory.create_session() as session: document = session.query(Document).filter_by(id=document_id).first() @@ -131,7 +132,12 @@ class IndexProcessor: } def get_preview_output( - self, chunks: Any, dataset_id: str, document_id: str, chunk_structure: str, summary_index_setting: dict | None + self, + chunks: Any, + dataset_id: str, + document_id: str, + chunk_structure: str, + summary_index_setting: SummaryIndexSettingDict | None, ) -> Preview: doc_language = None with session_factory.create_session() as session: diff --git a/api/core/rag/index_processor/index_processor_base.py b/api/core/rag/index_processor/index_processor_base.py index 9e0557e1ff..a435dfc46a 100644 --- a/api/core/rag/index_processor/index_processor_base.py +++ b/api/core/rag/index_processor/index_processor_base.py @@ -7,10 +7,11 @@ import os import re from abc import ABC, abstractmethod from collections.abc import Mapping -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any, NotRequired, Optional from urllib.parse import unquote, urlparse import httpx +from typing_extensions import TypedDict from configs import dify_config from core.entities.knowledge_entities import PreviewDetail @@ -36,6 +37,13 @@ if TYPE_CHECKING: from core.model_manager import ModelInstance +class SummaryIndexSettingDict(TypedDict): + enable: bool + model_name: NotRequired[str] + model_provider_name: NotRequired[str] + summary_prompt: NotRequired[str] + + class BaseIndexProcessor(ABC): """Interface for extract files.""" @@ -52,7 +60,7 @@ class BaseIndexProcessor(ABC): self, tenant_id: str, preview_texts: list[PreviewDetail], - summary_index_setting: dict, + summary_index_setting: SummaryIndexSettingDict, doc_language: str | None = None, ) -> list[PreviewDetail]: """ diff --git a/api/core/rag/index_processor/processor/paragraph_index_processor.py b/api/core/rag/index_processor/processor/paragraph_index_processor.py index 4b767860dc..80163b1707 100644 --- a/api/core/rag/index_processor/processor/paragraph_index_processor.py +++ b/api/core/rag/index_processor/processor/paragraph_index_processor.py @@ -23,7 +23,7 @@ from core.rag.extractor.entity.extract_setting import ExtractSetting from core.rag.extractor.extract_processor import ExtractProcessor from core.rag.index_processor.constant.doc_type import DocType from core.rag.index_processor.constant.index_type import IndexStructureType -from core.rag.index_processor.index_processor_base import BaseIndexProcessor +from core.rag.index_processor.index_processor_base import BaseIndexProcessor, SummaryIndexSettingDict from core.rag.models.document import AttachmentDocument, Document, MultimodalGeneralStructureChunk from core.rag.retrieval.retrieval_methods import RetrievalMethod from core.tools.utils.text_processing_utils import remove_leading_symbols @@ -279,7 +279,7 @@ class ParagraphIndexProcessor(BaseIndexProcessor): self, tenant_id: str, preview_texts: list[PreviewDetail], - summary_index_setting: dict, + summary_index_setting: SummaryIndexSettingDict, doc_language: str | None = None, ) -> list[PreviewDetail]: """ @@ -363,7 +363,7 @@ class ParagraphIndexProcessor(BaseIndexProcessor): def generate_summary( tenant_id: str, text: str, - summary_index_setting: dict | None = None, + summary_index_setting: SummaryIndexSettingDict | None = None, segment_id: str | None = None, document_language: str | None = None, ) -> tuple[str, LLMUsage]: diff --git a/api/core/rag/index_processor/processor/parent_child_index_processor.py b/api/core/rag/index_processor/processor/parent_child_index_processor.py index 702a63b561..df0761ca73 100644 --- a/api/core/rag/index_processor/processor/parent_child_index_processor.py +++ b/api/core/rag/index_processor/processor/parent_child_index_processor.py @@ -19,7 +19,7 @@ from core.rag.extractor.entity.extract_setting import ExtractSetting from core.rag.extractor.extract_processor import ExtractProcessor from core.rag.index_processor.constant.doc_type import DocType from core.rag.index_processor.constant.index_type import IndexStructureType -from core.rag.index_processor.index_processor_base import BaseIndexProcessor +from core.rag.index_processor.index_processor_base import BaseIndexProcessor, SummaryIndexSettingDict from core.rag.models.document import AttachmentDocument, ChildDocument, Document, ParentChildStructureChunk from core.rag.retrieval.retrieval_methods import RetrievalMethod from extensions.ext_database import db @@ -362,7 +362,7 @@ class ParentChildIndexProcessor(BaseIndexProcessor): self, tenant_id: str, preview_texts: list[PreviewDetail], - summary_index_setting: dict, + summary_index_setting: SummaryIndexSettingDict, doc_language: str | None = None, ) -> list[PreviewDetail]: """ diff --git a/api/core/rag/index_processor/processor/qa_index_processor.py b/api/core/rag/index_processor/processor/qa_index_processor.py index d56f69ca75..62f88b7760 100644 --- a/api/core/rag/index_processor/processor/qa_index_processor.py +++ b/api/core/rag/index_processor/processor/qa_index_processor.py @@ -22,7 +22,7 @@ from core.rag.docstore.dataset_docstore import DatasetDocumentStore from core.rag.extractor.entity.extract_setting import ExtractSetting from core.rag.extractor.extract_processor import ExtractProcessor from core.rag.index_processor.constant.index_type import IndexStructureType -from core.rag.index_processor.index_processor_base import BaseIndexProcessor +from core.rag.index_processor.index_processor_base import BaseIndexProcessor, SummaryIndexSettingDict from core.rag.models.document import AttachmentDocument, Document, QAStructureChunk from core.rag.retrieval.retrieval_methods import RetrievalMethod from core.tools.utils.text_processing_utils import remove_leading_symbols @@ -245,7 +245,7 @@ class QAIndexProcessor(BaseIndexProcessor): self, tenant_id: str, preview_texts: list[PreviewDetail], - summary_index_setting: dict, + summary_index_setting: SummaryIndexSettingDict, doc_language: str | None = None, ) -> list[PreviewDetail]: """ diff --git a/api/core/rag/summary_index/summary_index.py b/api/core/rag/summary_index/summary_index.py index 79d7821b4e..31d21dbeee 100644 --- a/api/core/rag/summary_index/summary_index.py +++ b/api/core/rag/summary_index/summary_index.py @@ -2,6 +2,7 @@ import concurrent.futures import logging from core.db.session_factory import session_factory +from core.rag.index_processor.index_processor_base import SummaryIndexSettingDict from models.dataset import Dataset, Document, DocumentSegment, DocumentSegmentSummary from services.summary_index_service import SummaryIndexService from tasks.generate_summary_index_task import generate_summary_index_task @@ -11,7 +12,11 @@ logger = logging.getLogger(__name__) class SummaryIndex: def generate_and_vectorize_summary( - self, dataset_id: str, document_id: str, is_preview: bool, summary_index_setting: dict | None = None + self, + dataset_id: str, + document_id: str, + is_preview: bool, + summary_index_setting: SummaryIndexSettingDict | None = None, ) -> None: if is_preview: with session_factory.create_session() as session: diff --git a/api/core/workflow/nodes/knowledge_index/entities.py b/api/core/workflow/nodes/knowledge_index/entities.py index 8b00746268..8d2e9bf3cb 100644 --- a/api/core/workflow/nodes/knowledge_index/entities.py +++ b/api/core/workflow/nodes/knowledge_index/entities.py @@ -2,6 +2,7 @@ from typing import Literal, Union from pydantic import BaseModel +from core.rag.index_processor.index_processor_base import SummaryIndexSettingDict from core.rag.retrieval.retrieval_methods import RetrievalMethod from core.workflow.nodes.knowledge_index import KNOWLEDGE_INDEX_NODE_TYPE from dify_graph.entities.base_node_data import BaseNodeData @@ -161,4 +162,4 @@ class KnowledgeIndexNodeData(BaseNodeData): chunk_structure: str index_chunk_variable_selector: list[str] indexing_technique: str | None = None - summary_index_setting: dict | None = None + summary_index_setting: SummaryIndexSettingDict | None = None diff --git a/api/core/workflow/nodes/knowledge_index/knowledge_index_node.py b/api/core/workflow/nodes/knowledge_index/knowledge_index_node.py index 0a74847bc1..4ea9091c5b 100644 --- a/api/core/workflow/nodes/knowledge_index/knowledge_index_node.py +++ b/api/core/workflow/nodes/knowledge_index/knowledge_index_node.py @@ -3,6 +3,7 @@ from collections.abc import Mapping from typing import TYPE_CHECKING, Any from core.rag.index_processor.index_processor import IndexProcessor +from core.rag.index_processor.index_processor_base import SummaryIndexSettingDict from core.rag.summary_index.summary_index import SummaryIndex from core.workflow.nodes.knowledge_index import KNOWLEDGE_INDEX_NODE_TYPE from dify_graph.entities.graph_config import NodeConfigDict @@ -127,7 +128,7 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]): is_preview: bool, batch: Any, chunks: Mapping[str, Any], - summary_index_setting: dict | None = None, + summary_index_setting: SummaryIndexSettingDict | None = None, ): if not document_id: raise KnowledgeIndexNodeError("document_id is required.") diff --git a/api/services/summary_index_service.py b/api/services/summary_index_service.py index 13a6363bc3..943dfc972b 100644 --- a/api/services/summary_index_service.py +++ b/api/services/summary_index_service.py @@ -12,6 +12,7 @@ from core.db.session_factory import session_factory from core.model_manager import ModelManager from core.rag.datasource.vdb.vector_factory import Vector from core.rag.index_processor.constant.doc_type import DocType +from core.rag.index_processor.index_processor_base import SummaryIndexSettingDict from core.rag.models.document import Document from dify_graph.model_runtime.entities.llm_entities import LLMUsage from dify_graph.model_runtime.entities.model_entities import ModelType @@ -30,7 +31,7 @@ class SummaryIndexService: def generate_summary_for_segment( segment: DocumentSegment, dataset: Dataset, - summary_index_setting: dict, + summary_index_setting: SummaryIndexSettingDict, ) -> tuple[str, LLMUsage]: """ Generate summary for a single segment. @@ -600,7 +601,7 @@ class SummaryIndexService: def generate_and_vectorize_summary( segment: DocumentSegment, dataset: Dataset, - summary_index_setting: dict, + summary_index_setting: SummaryIndexSettingDict, ) -> DocumentSegmentSummary: """ Generate summary for a segment and vectorize it. @@ -705,7 +706,7 @@ class SummaryIndexService: def generate_summaries_for_document( dataset: Dataset, document: DatasetDocument, - summary_index_setting: dict, + summary_index_setting: SummaryIndexSettingDict, segment_ids: list[str] | None = None, only_parent_chunks: bool = False, ) -> list[DocumentSegmentSummary]: