fix: update index to optimize message clean performance (#32238)

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
hj24 authored 2026-02-11 16:07:52 +08:00, committed by GitHub
parent 2f87ecc0ce
commit 7e0bccbbf0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 151 additions and 7 deletions

View File

@@ -715,6 +715,7 @@ ANNOTATION_IMPORT_MAX_CONCURRENT=5
 # Sandbox expired records clean configuration
 SANDBOX_EXPIRED_RECORDS_CLEAN_GRACEFUL_PERIOD=21
 SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_SIZE=1000
+SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL=200
 SANDBOX_EXPIRED_RECORDS_RETENTION_DAYS=30
 SANDBOX_EXPIRED_RECORDS_CLEAN_TASK_LOCK_TTL=90000
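The new SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL knob is the upper bound, in milliseconds, for the random pause the clean jobs now take between batches (see the service changes below); each batch sleeps a uniform 0 to 200 ms by default.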

View File

@@ -1344,6 +1344,10 @@ class SandboxExpiredRecordsCleanConfig(BaseSettings):
         description="Maximum number of records to process in each batch",
         default=1000,
     )
+    SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL: PositiveInt = Field(
+        description="Maximum interval in milliseconds between batches",
+        default=200,
+    )
     SANDBOX_EXPIRED_RECORDS_RETENTION_DAYS: PositiveInt = Field(
         description="Retention days for sandbox expired workflow_run records and message records",
         default=30,
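The new field follows the existing pattern in this config class. Assuming standard pydantic-settings behavior (the import path below is an assumption, not shown in the diff), it is populated from the environment variable of the same name and rejected at startup if not a positive integer; note the service code later in this commit reads the same variable directly via os.environ.get with the same default of 200, so the Field chiefly registers and documents the knob. A minimal sketch:

import os

from pydantic import Field, PositiveInt
from pydantic_settings import BaseSettings


class SandboxExpiredRecordsCleanConfig(BaseSettings):
    SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL: PositiveInt = Field(
        description="Maximum interval in milliseconds between batches",
        default=200,
    )


# Unset -> 200; SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL=500 -> 500;
# zero or a negative value fails PositiveInt validation.
os.environ["SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL"] = "500"
print(SandboxExpiredRecordsCleanConfig().SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL)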

View File

@@ -0,0 +1,39 @@
+"""fix index to optimize message clean job performance
+
+Revision ID: fce013ca180e
+Revises: f55813ffe2c8
+Create Date: 2026-02-11 15:49:17.603638
+
+"""
+from alembic import op
+import models as models
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+revision = 'fce013ca180e'
+down_revision = 'f55813ffe2c8'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('messages', schema=None) as batch_op:
+        batch_op.drop_index(batch_op.f('message_created_at_idx'))
+
+    with op.batch_alter_table('saved_messages', schema=None) as batch_op:
+        batch_op.create_index('saved_message_message_id_idx', ['message_id'], unique=False)
+
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('saved_messages', schema=None) as batch_op:
+        batch_op.drop_index('saved_message_message_id_idx')
+
+    with op.batch_alter_table('messages', schema=None) as batch_op:
+        batch_op.create_index(batch_op.f('message_created_at_idx'), ['created_at'], unique=False)
+
+    # ### end Alembic commands ###
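The migration only swaps indexes, no data rewrites, though building the new saved_messages index will still block writes on that table for its duration on PostgreSQL (plain CREATE INDEX, not CONCURRENTLY). After applying it (for example with `alembic upgrade head`, or whatever migration wrapper the project uses), the swap can be sanity-checked with SQLAlchemy's inspector; a hedged sketch with a placeholder DSN:

from sqlalchemy import create_engine, inspect

engine = create_engine("postgresql://user:pass@localhost/dify")  # placeholder DSN
inspector = inspect(engine)

message_indexes = {ix["name"] for ix in inspector.get_indexes("messages")}
saved_indexes = {ix["name"] for ix in inspector.get_indexes("saved_messages")}

assert "message_created_at_idx" not in message_indexes   # dropped by upgrade()
assert "message_created_at_id_idx" in message_indexes    # composite index remains
assert "saved_message_message_id_idx" in saved_indexes   # added by upgrade()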

View File

@@ -1040,7 +1040,6 @@ class Message(Base):
         Index("message_end_user_idx", "app_id", "from_source", "from_end_user_id"),
         Index("message_account_idx", "app_id", "from_source", "from_account_id"),
         Index("message_workflow_run_id_idx", "conversation_id", "workflow_run_id"),
-        Index("message_created_at_idx", "created_at"),
         Index("message_app_mode_idx", "app_mode"),
         Index("message_created_at_id_idx", "created_at", "id"),
     )
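Dropping message_created_at_idx removes a now-redundant index: the composite message_created_at_id_idx on (created_at, id) can serve plain created_at range predicates through its leading column, and it is the index the cursor query below is shaped for, so the single-column copy only added write overhead on a hot table.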

View File

@@ -16,6 +16,7 @@ class SavedMessage(TypeBase):
     __table_args__ = (
         sa.PrimaryKeyConstraint("id", name="saved_message_pkey"),
         sa.Index("saved_message_message_idx", "app_id", "message_id", "created_by_role", "created_by"),
+        sa.Index("saved_message_message_id_idx", "message_id"),
     )
 
     id: Mapped[str] = mapped_column(
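The new single-column saved_message_message_id_idx complements the existing composite index, which leads with app_id and therefore cannot serve lookups by message_id alone; presumably the clean job's relation-deletion step deletes saved_messages rows by message_id, which would otherwise fall back to a sequential scan.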

View File

@@ -1,10 +1,13 @@
 import datetime
 import logging
+import os
 import random
+import time
 
 from collections.abc import Sequence
 from typing import cast
 
-from sqlalchemy import delete, select
+import sqlalchemy as sa
+from sqlalchemy import delete, select, tuple_
 from sqlalchemy.engine import CursorResult
 from sqlalchemy.orm import Session
@@ -193,11 +196,15 @@ class MessagesCleanService:
             self._end_before,
         )
 
+        max_batch_interval_ms = int(os.environ.get("SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL", 200))
+
         while True:
             stats["batches"] += 1
+            batch_start = time.monotonic()
 
             # Step 1: Fetch a batch of messages using cursor
             with Session(db.engine, expire_on_commit=False) as session:
+                fetch_messages_start = time.monotonic()
                 msg_stmt = (
                     select(Message.id, Message.app_id, Message.created_at)
                     .where(Message.created_at < self._end_before)
@@ -209,13 +216,13 @@ class MessagesCleanService:
                     msg_stmt = msg_stmt.where(Message.created_at >= self._start_from)
 
                 # Apply cursor condition: (created_at, id) > (last_created_at, last_message_id)
-                # This translates to:
-                # created_at > last_created_at OR (created_at = last_created_at AND id > last_message_id)
                 if _cursor:
-                    # Continuing from previous batch
                     msg_stmt = msg_stmt.where(
-                        (Message.created_at > _cursor[0])
-                        | ((Message.created_at == _cursor[0]) & (Message.id > _cursor[1]))
+                        tuple_(Message.created_at, Message.id)
+                        > tuple_(
+                            sa.literal(_cursor[0], type_=sa.DateTime()),
+                            sa.literal(_cursor[1], type_=Message.id.type),
+                        )
                     )
 
                 raw_messages = list(session.execute(msg_stmt).all())
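The replacement predicate is a row-value (keyset) comparison: one tuple comparison `(created_at, id) > (:ts, :id)` instead of the OR-expansion the old comments spelled out. On PostgreSQL a row-value comparison over the same columns as the composite message_created_at_id_idx can be satisfied by a single index range scan, and the typed sa.literal binds keep the comparison types exact. A self-contained sketch of the pattern, with an illustrative table definition rather than the project's models:

import sqlalchemy as sa
from sqlalchemy import select, tuple_

metadata = sa.MetaData()
# Illustrative stand-in for the real messages model: just the cursor columns.
messages = sa.Table(
    "messages",
    metadata,
    sa.Column("id", sa.String, primary_key=True),
    sa.Column("created_at", sa.DateTime),
    sa.Index("message_created_at_id_idx", "created_at", "id"),
)


def next_batch(conn, cursor: tuple | None, batch_size: int = 1000):
    """Fetch the next page in (created_at, id) order, strictly after `cursor`."""
    stmt = (
        select(messages.c.id, messages.c.created_at)
        .order_by(messages.c.created_at, messages.c.id)
        .limit(batch_size)
    )
    if cursor is not None:
        last_created_at, last_id = cursor
        # Row-value comparison: one range scan on (created_at, id) rather than
        # `created_at > :ts OR (created_at = :ts AND id > :id)`.
        stmt = stmt.where(
            tuple_(messages.c.created_at, messages.c.id)
            > tuple_(
                sa.literal(last_created_at, type_=sa.DateTime()),
                sa.literal(last_id, type_=sa.String()),
            )
        )
    rows = conn.execute(stmt).all()
    next_cursor = (rows[-1].created_at, rows[-1].id) if rows else None
    return rows, next_cursor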
@@ -223,6 +230,12 @@ class MessagesCleanService:
                     SimpleMessage(id=msg_id, app_id=app_id, created_at=msg_created_at)
                     for msg_id, app_id, msg_created_at in raw_messages
                 ]
+                logger.info(
+                    "clean_messages (batch %s): fetched %s messages in %sms",
+                    stats["batches"],
+                    len(messages),
+                    int((time.monotonic() - fetch_messages_start) * 1000),
+                )
 
             # Track total messages fetched across all batches
             stats["total_messages"] += len(messages)
@@ -241,8 +254,16 @@ class MessagesCleanService:
                     logger.info("clean_messages (batch %s): no app_ids found, skip", stats["batches"])
                     continue
 
+                fetch_apps_start = time.monotonic()
                 app_stmt = select(App.id, App.tenant_id).where(App.id.in_(app_ids))
                 apps = list(session.execute(app_stmt).all())
+                logger.info(
+                    "clean_messages (batch %s): fetched %s apps for %s app_ids in %sms",
+                    stats["batches"],
+                    len(apps),
+                    len(app_ids),
+                    int((time.monotonic() - fetch_apps_start) * 1000),
+                )
 
                 if not apps:
                     logger.info("clean_messages (batch %s): no apps found, skip", stats["batches"])
@@ -252,7 +273,15 @@ class MessagesCleanService:
                 app_to_tenant: dict[str, str] = {app.id: app.tenant_id for app in apps}
 
             # Step 3: Delegate to policy to determine which messages to delete
+            policy_start = time.monotonic()
             message_ids_to_delete = self._policy.filter_message_ids(messages, app_to_tenant)
+            logger.info(
+                "clean_messages (batch %s): policy selected %s/%s messages in %sms",
+                stats["batches"],
+                len(message_ids_to_delete),
+                len(messages),
+                int((time.monotonic() - policy_start) * 1000),
+            )
 
             if not message_ids_to_delete:
                 logger.info("clean_messages (batch %s): no messages to delete, skip", stats["batches"])
@@ -263,14 +292,20 @@ class MessagesCleanService:
             # Step 4: Batch delete messages and their relations
             if not self._dry_run:
                 with Session(db.engine, expire_on_commit=False) as session:
+                    delete_relations_start = time.monotonic()
                     # Delete related records first
                     self._batch_delete_message_relations(session, message_ids_to_delete)
+                    delete_relations_ms = int((time.monotonic() - delete_relations_start) * 1000)
 
                     # Delete messages
+                    delete_messages_start = time.monotonic()
                     delete_stmt = delete(Message).where(Message.id.in_(message_ids_to_delete))
                     delete_result = cast(CursorResult, session.execute(delete_stmt))
                     messages_deleted = delete_result.rowcount
+                    delete_messages_ms = int((time.monotonic() - delete_messages_start) * 1000)
 
+                    commit_start = time.monotonic()
                     session.commit()
+                    commit_ms = int((time.monotonic() - commit_start) * 1000)
 
                 stats["total_deleted"] += messages_deleted
@@ -280,6 +315,19 @@ class MessagesCleanService:
                     len(messages),
                     messages_deleted,
                 )
+                logger.info(
+                    "clean_messages (batch %s): relations %sms, messages %sms, commit %sms, batch total %sms",
+                    stats["batches"],
+                    delete_relations_ms,
+                    delete_messages_ms,
+                    commit_ms,
+                    int((time.monotonic() - batch_start) * 1000),
+                )
+
+                # Random sleep between batches to avoid overwhelming the database
+                sleep_ms = random.uniform(0, max_batch_interval_ms)  # noqa: S311
+                logger.info("clean_messages (batch %s): sleeping for %.2fms", stats["batches"], sleep_ms)
+                time.sleep(sleep_ms / 1000)
             else:
                 # Log random sample of message IDs that would be deleted (up to 10)
                 sample_size = min(10, len(message_ids_to_delete))
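The messages job (and, below, the workflow-run job) now ends each non-dry-run batch with a jittered pause: a uniform random sleep between 0 and SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL milliseconds, so concurrent cleaners spread their load instead of hitting the database in lockstep. A minimal sketch of the pattern (the helper name is illustrative, not from the codebase):

import os
import random
import time


def throttle_between_batches() -> None:
    """Sleep a random 0..max interval (in ms) between cleanup batches."""
    max_interval_ms = int(os.environ.get("SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL", 200))
    sleep_ms = random.uniform(0, max_interval_ms)  # noqa: S311 - jitter, not cryptography
    time.sleep(sleep_ms / 1000)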

View File

@@ -1,5 +1,8 @@
 import datetime
 import logging
+import os
+import random
+import time
 from collections.abc import Iterable, Sequence
 
 import click
@@ -72,7 +75,12 @@ class WorkflowRunCleanup:
         batch_index = 0
         last_seen: tuple[datetime.datetime, str] | None = None
 
+        max_batch_interval_ms = int(os.environ.get("SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL", 200))
+
         while True:
+            batch_start = time.monotonic()
+
+            fetch_start = time.monotonic()
             run_rows = self.workflow_run_repo.get_runs_batch_by_time_range(
                 start_from=self.window_start,
                 end_before=self.window_end,
@@ -80,12 +88,30 @@ class WorkflowRunCleanup:
                 batch_size=self.batch_size,
             )
             if not run_rows:
+                logger.info("workflow_run_cleanup (batch #%s): no more rows to process", batch_index + 1)
                 break
 
             batch_index += 1
             last_seen = (run_rows[-1].created_at, run_rows[-1].id)
+            logger.info(
+                "workflow_run_cleanup (batch #%s): fetched %s rows in %sms",
+                batch_index,
+                len(run_rows),
+                int((time.monotonic() - fetch_start) * 1000),
+            )
 
             tenant_ids = {row.tenant_id for row in run_rows}
+            filter_start = time.monotonic()
             free_tenants = self._filter_free_tenants(tenant_ids)
+            logger.info(
+                "workflow_run_cleanup (batch #%s): filtered %s free tenants from %s tenants in %sms",
+                batch_index,
+                len(free_tenants),
+                len(tenant_ids),
+                int((time.monotonic() - filter_start) * 1000),
+            )
 
             free_runs = [row for row in run_rows if row.tenant_id in free_tenants]
             paid_or_skipped = len(run_rows) - len(free_runs)
@@ -104,11 +130,17 @@ class WorkflowRunCleanup:
             total_runs_targeted += len(free_runs)
 
             if self.dry_run:
+                count_start = time.monotonic()
                 batch_counts = self.workflow_run_repo.count_runs_with_related(
                     free_runs,
                     count_node_executions=self._count_node_executions,
                     count_trigger_logs=self._count_trigger_logs,
                 )
+                logger.info(
+                    "workflow_run_cleanup (batch #%s, dry_run): counted related records in %sms",
+                    batch_index,
+                    int((time.monotonic() - count_start) * 1000),
+                )
                 if related_totals is not None:
                     for key in related_totals:
                         related_totals[key] += batch_counts.get(key, 0)
@@ -120,14 +152,21 @@ class WorkflowRunCleanup:
                         fg="yellow",
                     )
                 )
+                logger.info(
+                    "workflow_run_cleanup (batch #%s, dry_run): batch total %sms",
+                    batch_index,
+                    int((time.monotonic() - batch_start) * 1000),
+                )
                 continue
 
             try:
+                delete_start = time.monotonic()
                 counts = self.workflow_run_repo.delete_runs_with_related(
                     free_runs,
                     delete_node_executions=self._delete_node_executions,
                     delete_trigger_logs=self._delete_trigger_logs,
                 )
+                delete_ms = int((time.monotonic() - delete_start) * 1000)
             except Exception:
                 logger.exception("Failed to delete workflow runs batch ending at %s", last_seen[0])
                 raise
@@ -143,6 +182,17 @@ class WorkflowRunCleanup:
                     fg="green",
                 )
             )
+            logger.info(
+                "workflow_run_cleanup (batch #%s): delete %sms, batch total %sms",
+                batch_index,
+                delete_ms,
+                int((time.monotonic() - batch_start) * 1000),
+            )
+
+            # Random sleep between batches to avoid overwhelming the database
+            sleep_ms = random.uniform(0, max_batch_interval_ms)  # noqa: S311
+            logger.info("workflow_run_cleanup (batch #%s): sleeping for %.2fms", batch_index, sleep_ms)
+            time.sleep(sleep_ms / 1000)
 
         if self.dry_run:
             if self.window_start:

View File

@@ -1523,6 +1523,7 @@ AMPLITUDE_API_KEY=
 # Sandbox expired records clean configuration
 SANDBOX_EXPIRED_RECORDS_CLEAN_GRACEFUL_PERIOD=21
 SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_SIZE=1000
+SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL=200
 SANDBOX_EXPIRED_RECORDS_RETENTION_DAYS=30

View File

@@ -684,6 +684,7 @@ x-shared-env: &shared-api-worker-env
   AMPLITUDE_API_KEY: ${AMPLITUDE_API_KEY:-}
   SANDBOX_EXPIRED_RECORDS_CLEAN_GRACEFUL_PERIOD: ${SANDBOX_EXPIRED_RECORDS_CLEAN_GRACEFUL_PERIOD:-21}
   SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_SIZE: ${SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_SIZE:-1000}
+  SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL: ${SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL:-200}
   SANDBOX_EXPIRED_RECORDS_RETENTION_DAYS: ${SANDBOX_EXPIRED_RECORDS_RETENTION_DAYS:-30}
   PUBSUB_REDIS_URL: ${PUBSUB_REDIS_URL:-}
   PUBSUB_REDIS_CHANNEL_TYPE: ${PUBSUB_REDIS_CHANNEL_TYPE:-pubsub}