dify/api/tasks/batch_create_segment_to_ind...

import logging
import tempfile
import time
import uuid
from pathlib import Path

import click
import pandas as pd
from celery import shared_task
from sqlalchemy import func

from core.db.session_factory import session_factory
from core.model_manager import ModelManager
from core.rag.index_processor.constant.index_type import IndexStructureType
from dify_graph.model_runtime.entities.model_entities import ModelType
from extensions.ext_redis import redis_client
from extensions.ext_storage import storage
from libs import helper
from libs.datetime_utils import naive_utc_now
from models.dataset import Dataset, Document, DocumentSegment
from models.model import UploadFile
from services.vector_service import VectorService

# Module-level logger named after this module (via __name__).
logger = logging.getLogger(__name__)


@shared_task(queue="dataset")
def batch_create_segment_to_index_task(
    job_id: str,
    upload_file_id: str,
    dataset_id: str,
    document_id: str,
    tenant_id: str,
    user_id: str,
):
    """
    Async batch create segment to index.

    Downloads the uploaded CSV file, creates one ``DocumentSegment`` per CSV row,
    adds the new words to the document's word count and finally creates the
    vectors for the new segments.

    The outcome is written to the Redis key ``segment_batch_import_{job_id}`` as
    either ``"completed"`` or ``"error"`` with a 600 second expiry. Every failure
    is logged and reported through that key instead of being raised, so the key
    always reaches a terminal value.

    :param job_id: identifier of the import job; used to build the Redis status key
    :param upload_file_id: primary key of the ``UploadFile`` whose storage key points at the CSV
    :param dataset_id: primary key of the ``Dataset`` the segments are added to
    :param document_id: primary key of the ``Document`` the segments are added to
    :param tenant_id: tenant stored on every created segment
    :param user_id: stored as ``created_by`` on every created segment

    Usage: batch_create_segment_to_index_task.delay(job_id, upload_file_id, dataset_id, document_id, tenant_id, user_id)
    """
    logger.info(click.style(f"Start batch create segment jobId: {job_id}", fg="green"))
    start_at = time.perf_counter()

    indexing_cache_key = f"segment_batch_import_{job_id}"

    # Initialize variables with default values
    upload_file_key: str | None = None
    dataset_config: dict | None = None
    document_config: dict | None = None

    # Phase 1: validate the inputs and copy the plain values needed later, so no
    # ORM instance has to outlive this short session.
    with session_factory.create_session() as session:
        try:
            dataset = session.get(Dataset, dataset_id)
            if not dataset:
                raise ValueError("Dataset not exist.")

            dataset_document = session.get(Document, document_id)
            if not dataset_document:
                raise ValueError("Document not exist.")

            if (
                not dataset_document.enabled
                or dataset_document.archived
                or dataset_document.indexing_status != "completed"
            ):
                raise ValueError("Document is not available.")

            upload_file = session.get(UploadFile, upload_file_id)
            if not upload_file:
                raise ValueError("UploadFile not found.")

            dataset_config = {
                "id": dataset.id,
                "indexing_technique": dataset.indexing_technique,
                "tenant_id": dataset.tenant_id,
                "embedding_model_provider": dataset.embedding_model_provider,
                "embedding_model": dataset.embedding_model,
            }

            document_config = {
                "id": dataset_document.id,
                "doc_form": dataset_document.doc_form,
                "word_count": dataset_document.word_count or 0,
            }

            upload_file_key = upload_file.key

        except Exception:
            logger.exception("Segments batch created index failed")
            redis_client.setex(indexing_cache_key, 600, "error")
            return

    # Ensure required variables are set before proceeding
    if upload_file_key is None or dataset_config is None or document_config is None:
        logger.error("Required configuration not set due to session error")
        redis_client.setex(indexing_cache_key, 600, "error")
        return

    is_qa_form = document_config["doc_form"] == IndexStructureType.QA_INDEX

    # Phase 2: everything below can fail (storage, CSV parsing, model lookup, DB,
    # vector store). It is wrapped as a whole so that a failure is always
    # reported through the Redis key rather than leaving the job without a
    # terminal status.
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            suffix = Path(upload_file_key).suffix
            # A random name is enough here: the directory is private to this task.
            file_path = str(Path(temp_dir) / f"{uuid.uuid4().hex}{suffix}")
            storage.download(upload_file_key, file_path)
            # The frame is fully loaded in memory, so the file may be removed
            # as soon as the temporary directory is left.
            df = pd.read_csv(file_path)

        # First column is the segment content; for QA documents the second
        # column is the answer.
        rows: list[dict] = []
        for _, row in df.iterrows():
            if is_qa_form:
                rows.append({"content": row.iloc[0], "answer": row.iloc[1]})
            else:
                rows.append({"content": row.iloc[0]})
        if not rows:
            raise ValueError("The CSV file is empty.")

        embedding_model = None
        if dataset_config["indexing_technique"] == "high_quality":
            embedding_model = ModelManager().get_model_instance(
                tenant_id=dataset_config["tenant_id"],
                provider=dataset_config["embedding_model_provider"],
                model_type=ModelType.TEXT_EMBEDDING,
                model=dataset_config["embedding_model"],
            )

        if embedding_model:
            tokens_list = embedding_model.get_text_embedding_num_tokens(
                texts=[item["content"] for item in rows],
            )
        else:
            # No embedding model is used for this dataset: record zero tokens.
            tokens_list = [0] * len(rows)

        document_segments = []
        word_count_change = 0
        with session_factory.create_session() as session, session.begin():
            # Look up the current highest position once and count in memory,
            # instead of re-querying (and relying on autoflush) for every row.
            max_position = (
                session.query(func.max(DocumentSegment.position))
                .where(DocumentSegment.document_id == document_config["id"])
                .scalar()
            )
            next_position = (max_position or 0) + 1

            for item, tokens in zip(rows, tokens_list):
                text = item["content"]
                segment_document = DocumentSegment(
                    tenant_id=tenant_id,
                    dataset_id=dataset_id,
                    document_id=document_id,
                    index_node_id=str(uuid.uuid4()),
                    index_node_hash=helper.generate_text_hash(text),
                    position=next_position,
                    content=text,
                    word_count=len(text),
                    tokens=tokens,
                    created_by=user_id,
                    indexing_at=naive_utc_now(),
                    status="completed",
                    completed_at=naive_utc_now(),
                )
                next_position += 1
                if is_qa_form:
                    segment_document.answer = item["answer"]
                    segment_document.word_count += len(item["answer"])
                word_count_change += segment_document.word_count
                session.add(segment_document)
                document_segments.append(segment_document)

        with session_factory.create_session() as session, session.begin():
            dataset_document = session.get(Document, document_id)
            if dataset_document:
                # Treat a missing word count as 0, as done when reading it above.
                dataset_document.word_count = (dataset_document.word_count or 0) + word_count_change
                session.add(dataset_document)

        # The segments and the word count are already committed at this point;
        # a failure while creating vectors is still reported as "error".
        with session_factory.create_session() as session:
            dataset = session.get(Dataset, dataset_id)
            if dataset:
                VectorService.create_segments_vector(None, document_segments, dataset, document_config["doc_form"])

    except Exception:
        logger.exception("Segments batch created index failed")
        redis_client.setex(indexing_cache_key, 600, "error")
        return

    redis_client.setex(indexing_cache_key, 600, "completed")
    end_at = time.perf_counter()
    logger.info(
        click.style(
            f"Segment batch created job: {job_id} latency: {end_at - start_at}",
            fg="green",
        )
    )
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 09:37:31 +00:00			`import logging`
fix(dataset): CELERY_BROKER uses amqp rabbitmq. When adding document segments in batches and uploading large files, the status will always remain stuck at "In batch processing" #22709 (#23038) 2025-07-28 06:24:13 +00:00			`import tempfile`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 09:37:31 +00:00			`import time`
			`import uuid`
fix(dataset): CELERY_BROKER uses amqp rabbitmq. When adding document segments in batches and uploading large files, the status will always remain stuck at "In batch processing" #22709 (#23038) 2025-07-28 06:24:13 +00:00			`from pathlib import Path`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 09:37:31 +00:00
			`import click`
fix(dataset): CELERY_BROKER uses amqp rabbitmq. When adding document segments in batches and uploading large files, the status will always remain stuck at "In batch processing" #22709 (#23038) 2025-07-28 06:24:13 +00:00			`import pandas as pd`
chore: apply static type checks on celery async task dispatches and imports (#24418) 2025-08-24 15:07:22 +00:00			`from celery import shared_task`
Feat/queue monitor (#20647) 2025-06-04 11:56:34 +00:00			`from sqlalchemy import func`
enhancement: introduce Ruff for Python linter for reordering and removing unused imports with automated pre-commit and sytle check (#2366) 2024-02-06 05:21:13 +00:00
refactor: use session factory instead of call db.session directly (#31198) Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> 2026-01-21 05:43:06 +00:00			`from core.db.session_factory import session_factory`
Model Runtime (#1858) Co-authored-by: StyleZhang <jasonapring2015@outlook.com> Co-authored-by: Garfield Dai <dai.hai@foxmail.com> Co-authored-by: chenhe <guchenhe@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: Joel <iamjoel007@gmail.com> Co-authored-by: Yeuoly <admin@srmxy.cn> 2024-01-02 15:42:00 +00:00			`from core.model_manager import ModelManager`
refactor: use EnumText for ApiToolProvider.schema_type_str and Docume… (#33983) 2026-03-24 04:27:50 +00:00			`from core.rag.index_processor.constant.index_type import IndexStructureType`
refactor(api): move model_runtime into dify_graph (#32858) 2026-03-02 12:15:32 +00:00			`from dify_graph.model_runtime.entities.model_entities import ModelType`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 09:37:31 +00:00			`from extensions.ext_redis import redis_client`
fix(dataset): CELERY_BROKER uses amqp rabbitmq. When adding document segments in batches and uploading large files, the status will always remain stuck at "In batch processing" #22709 (#23038) 2025-07-28 06:24:13 +00:00			`from extensions.ext_storage import storage`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 09:37:31 +00:00			`from libs import helper`
[Chore/Refactor] Use centralized naive_utc_now for UTC datetime operations (#24352) Signed-off-by: -LAN- <laipz8200@outlook.com> 2025-08-22 15:53:05 +00:00			`from libs.datetime_utils import naive_utc_now`
improve: introduce isort for linting Python imports (#1983) 2024-01-12 04:34:01 +00:00			`from models.dataset import Dataset, Document, DocumentSegment`
fix(dataset): CELERY_BROKER uses amqp rabbitmq. When adding document segments in batches and uploading large files, the status will always remain stuck at "In batch processing" #22709 (#23038) 2025-07-28 06:24:13 +00:00			`from models.model import UploadFile`
Feat/support parent child chunk (#12092) 2024-12-25 11:49:07 +00:00			`from services.vector_service import VectorService`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 09:37:31 +00:00
Refactor: use logger = logging.getLogger(__name__) in logging (#24515) Co-authored-by: Yongtao Huang <99629139+hyongtao-db@users.noreply.github.com> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com> 2025-08-26 10:10:31 +00:00			`logger = logging.getLogger(__name__)`

Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 09:37:31 +00:00
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 05:38:37 +00:00			`@shared_task(queue="dataset")`
			`def batch_create_segment_to_index_task(`
fix(batch_create_segment_to_index_task): count max_position in memory. (#12929) 2025-01-22 05:39:02 +00:00			`job_id: str,`
fix(dataset): CELERY_BROKER uses amqp rabbitmq. When adding document segments in batches and uploading large files, the status will always remain stuck at "In batch processing" #22709 (#23038) 2025-07-28 06:24:13 +00:00			`upload_file_id: str,`
fix(batch_create_segment_to_index_task): count max_position in memory. (#12929) 2025-01-22 05:39:02 +00:00			`dataset_id: str,`
			`document_id: str,`
			`tenant_id: str,`
			`user_id: str,`
chore(api/tasks): apply ruff reformatting (#7594) 2024-08-26 05:38:37 +00:00			`):`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 09:37:31 +00:00			`"""`
			`Async batch create segment to index`
			`:param job_id:`
fix(dataset): CELERY_BROKER uses amqp rabbitmq. When adding document segments in batches and uploading large files, the status will always remain stuck at "In batch processing" #22709 (#23038) 2025-07-28 06:24:13 +00:00			`:param upload_file_id:`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 09:37:31 +00:00			`:param dataset_id:`
			`:param document_id:`
			`:param tenant_id:`
			`:param user_id:`

fix(dataset): CELERY_BROKER uses amqp rabbitmq. When adding document segments in batches and uploading large files, the status will always remain stuck at "In batch processing" #22709 (#23038) 2025-07-28 06:24:13 +00:00			`Usage: batch_create_segment_to_index_task.delay(job_id, upload_file_id, dataset_id, document_id, tenant_id, user_id)`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 09:37:31 +00:00			`"""`
Refactor: use logger = logging.getLogger(__name__) in logging (#24515) Co-authored-by: Yongtao Huang <99629139+hyongtao-db@users.noreply.github.com> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com> 2025-08-26 10:10:31 +00:00			`logger.info(click.style(f"Start batch create segment jobId: {job_id}", fg="green"))`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 09:37:31 +00:00			`start_at = time.perf_counter()`

make logging not use f-str, change others to f-str (#22882) 2025-07-25 02:32:48 +00:00			`indexing_cache_key = f"segment_batch_import_{job_id}"`
Feature/mutil embedding model (#908) Co-authored-by: JzoNg <jzongcode@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: StyleZhang <jasonapring2015@outlook.com> 2023-08-18 09:37:31 +00:00
refactor: partition Celery task sessions into smaller, discrete execu… (#32085) Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> 2026-02-08 13:01:54 +00:00			`# Initialize variables with default values`
			`upload_file_key: str \| None = None`
			`dataset_config: dict \| None = None`
			`document_config: dict \| None = None`

refactor: use session factory instead of call db.session directly (#31198) Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> 2026-01-21 05:43:06 +00:00			`with session_factory.create_session() as session:`
			`try:`
			`dataset = session.get(Dataset, dataset_id)`
			`if not dataset:`
			`raise ValueError("Dataset not exist.")`

			`dataset_document = session.get(Document, document_id)`
			`if not dataset_document:`
			`raise ValueError("Document not exist.")`

			`if (`
			`not dataset_document.enabled`
			`or dataset_document.archived`
			`or dataset_document.indexing_status != "completed"`
			`):`
			`raise ValueError("Document is not available.")`

			`upload_file = session.get(UploadFile, upload_file_id)`
			`if not upload_file:`
			`raise ValueError("UploadFile not found.")`

refactor: partition Celery task sessions into smaller, discrete execu… (#32085) Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> 2026-02-08 13:01:54 +00:00			`dataset_config = {`
			`"id": dataset.id,`
			`"indexing_technique": dataset.indexing_technique,`
			`"tenant_id": dataset.tenant_id,`
			`"embedding_model_provider": dataset.embedding_model_provider,`
			`"embedding_model": dataset.embedding_model,`
			`}`
refactor: use session factory instead of call db.session directly (#31198) Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> 2026-01-21 05:43:06 +00:00
refactor: partition Celery task sessions into smaller, discrete execu… (#32085) Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> 2026-02-08 13:01:54 +00:00			`document_config = {`
			`"id": dataset_document.id,`
			`"doc_form": dataset_document.doc_form,`
			`"word_count": dataset_document.word_count or 0,`
			`}`

			`upload_file_key = upload_file.key`
refactor: use session factory instead of call db.session directly (#31198) Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> 2026-01-21 05:43:06 +00:00
			`except Exception:`
			`logger.exception("Segments batch created index failed")`
			`redis_client.setex(indexing_cache_key, 600, "error")`
refactor: partition Celery task sessions into smaller, discrete execu… (#32085) Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> 2026-02-08 13:01:54 +00:00			`return`

			`# Ensure required variables are set before proceeding`
			`if upload_file_key is None or dataset_config is None or document_config is None:`
			`logger.error("Required configuration not set due to session error")`
			`redis_client.setex(indexing_cache_key, 600, "error")`
			`return`

			`with tempfile.TemporaryDirectory() as temp_dir:`
			`suffix = Path(upload_file_key).suffix`
			`file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}" # type: ignore`
			`storage.download(upload_file_key, file_path)`

			`df = pd.read_csv(file_path)`
			`content = []`
			`for _, row in df.iterrows():`
refactor: use EnumText for ApiToolProvider.schema_type_str and Docume… (#33983) 2026-03-24 04:27:50 +00:00			`if document_config["doc_form"] == IndexStructureType.QA_INDEX:`
refactor: partition Celery task sessions into smaller, discrete execu… (#32085) Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> 2026-02-08 13:01:54 +00:00			`data = {"content": row.iloc[0], "answer": row.iloc[1]}`
			`else:`
			`data = {"content": row.iloc[0]}`
			`content.append(data)`
			`if len(content) == 0:`
			`raise ValueError("The CSV file is empty.")`

			`document_segments = []`
			`embedding_model = None`
			`if dataset_config["indexing_technique"] == "high_quality":`
			`model_manager = ModelManager()`
			`embedding_model = model_manager.get_model_instance(`
			`tenant_id=dataset_config["tenant_id"],`
			`provider=dataset_config["embedding_model_provider"],`
			`model_type=ModelType.TEXT_EMBEDDING,`
			`model=dataset_config["embedding_model"],`
			`)`

			`word_count_change = 0`
			`if embedding_model:`
			`tokens_list = embedding_model.get_text_embedding_num_tokens(texts=[segment["content"] for segment in content])`
			`else:`
			`tokens_list = [0] * len(content)`

			`with session_factory.create_session() as session, session.begin():`
			`for segment, tokens in zip(content, tokens_list):`
			`content = segment["content"]`
			`doc_id = str(uuid.uuid4())`
			`segment_hash = helper.generate_text_hash(content)`
			`max_position = (`
			`session.query(func.max(DocumentSegment.position))`
			`.where(DocumentSegment.document_id == document_config["id"])`
			`.scalar()`
			`)`
			`segment_document = DocumentSegment(`
			`tenant_id=tenant_id,`
			`dataset_id=dataset_id,`
			`document_id=document_id,`
			`index_node_id=doc_id,`
			`index_node_hash=segment_hash,`
			`position=max_position + 1 if max_position else 1,`
			`content=content,`
			`word_count=len(content),`
			`tokens=tokens,`
			`created_by=user_id,`
			`indexing_at=naive_utc_now(),`
			`status="completed",`
			`completed_at=naive_utc_now(),`
			`)`
refactor: use EnumText for ApiToolProvider.schema_type_str and Docume… (#33983) 2026-03-24 04:27:50 +00:00			`if document_config["doc_form"] == IndexStructureType.QA_INDEX:`
refactor: partition Celery task sessions into smaller, discrete execu… (#32085) Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> 2026-02-08 13:01:54 +00:00			`segment_document.answer = segment["answer"]`
			`segment_document.word_count += len(segment["answer"])`
			`word_count_change += segment_document.word_count`
			`session.add(segment_document)`
			`document_segments.append(segment_document)`

			`with session_factory.create_session() as session, session.begin():`
			`dataset_document = session.get(Document, document_id)`
			`if dataset_document:`
			`assert dataset_document.word_count is not None`
			`dataset_document.word_count += word_count_change`
			`session.add(dataset_document)`

			`with session_factory.create_session() as session:`
			`dataset = session.get(Dataset, dataset_id)`
			`if dataset:`
			`VectorService.create_segments_vector(None, document_segments, dataset, document_config["doc_form"])`

			`redis_client.setex(indexing_cache_key, 600, "completed")`
			`end_at = time.perf_counter()`
			`logger.info(`
			`click.style(`
			`f"Segment batch created job: {job_id} latency: {end_at - start_at}",`
			`fg="green",`
			`)`
			`)`