feat: add metrics to clean message and workflow-run task (#33143)

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: hj24 <mambahj24@gmail.com>
This commit is contained in:
非法操作 2026-03-17 13:55:28 +08:00 committed by GitHub
parent 18af5fc8c7
commit 18ff5d9288
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 735 additions and 153 deletions

View File

@ -88,6 +88,8 @@ def clean_workflow_runs(
""" """
Clean workflow runs and related workflow data for free tenants. Clean workflow runs and related workflow data for free tenants.
""" """
from extensions.otel.runtime import flush_telemetry
if (start_from is None) ^ (end_before is None): if (start_from is None) ^ (end_before is None):
raise click.UsageError("--start-from and --end-before must be provided together.") raise click.UsageError("--start-from and --end-before must be provided together.")
@ -104,16 +106,27 @@ def clean_workflow_runs(
end_before = now - datetime.timedelta(days=to_days_ago) end_before = now - datetime.timedelta(days=to_days_ago)
before_days = 0 before_days = 0
if from_days_ago is not None and to_days_ago is not None:
task_label = f"{from_days_ago}to{to_days_ago}"
elif start_from is None:
task_label = f"before-{before_days}"
else:
task_label = "custom"
start_time = datetime.datetime.now(datetime.UTC) start_time = datetime.datetime.now(datetime.UTC)
click.echo(click.style(f"Starting workflow run cleanup at {start_time.isoformat()}.", fg="white")) click.echo(click.style(f"Starting workflow run cleanup at {start_time.isoformat()}.", fg="white"))
try:
WorkflowRunCleanup( WorkflowRunCleanup(
days=before_days, days=before_days,
batch_size=batch_size, batch_size=batch_size,
start_from=start_from, start_from=start_from,
end_before=end_before, end_before=end_before,
dry_run=dry_run, dry_run=dry_run,
task_label=task_label,
).run() ).run()
finally:
flush_telemetry()
end_time = datetime.datetime.now(datetime.UTC) end_time = datetime.datetime.now(datetime.UTC)
elapsed = end_time - start_time elapsed = end_time - start_time
@ -659,6 +672,8 @@ def clean_expired_messages(
""" """
Clean expired messages and related data for tenants based on clean policy. Clean expired messages and related data for tenants based on clean policy.
""" """
from extensions.otel.runtime import flush_telemetry
click.echo(click.style("clean_messages: start clean messages.", fg="green")) click.echo(click.style("clean_messages: start clean messages.", fg="green"))
start_at = time.perf_counter() start_at = time.perf_counter()
@ -698,6 +713,13 @@ def clean_expired_messages(
# NOTE: graceful_period will be ignored when billing is disabled. # NOTE: graceful_period will be ignored when billing is disabled.
policy = create_message_clean_policy(graceful_period_days=graceful_period) policy = create_message_clean_policy(graceful_period_days=graceful_period)
if from_days_ago is not None and before_days is not None:
task_label = f"{from_days_ago}to{before_days}"
elif start_from is None and before_days is not None:
task_label = f"before-{before_days}"
else:
task_label = "custom"
# Create and run the cleanup service # Create and run the cleanup service
if abs_mode: if abs_mode:
assert start_from is not None assert start_from is not None
@ -708,6 +730,7 @@ def clean_expired_messages(
end_before=end_before, end_before=end_before,
batch_size=batch_size, batch_size=batch_size,
dry_run=dry_run, dry_run=dry_run,
task_label=task_label,
) )
elif from_days_ago is None: elif from_days_ago is None:
assert before_days is not None assert before_days is not None
@ -716,6 +739,7 @@ def clean_expired_messages(
days=before_days, days=before_days,
batch_size=batch_size, batch_size=batch_size,
dry_run=dry_run, dry_run=dry_run,
task_label=task_label,
) )
else: else:
assert before_days is not None assert before_days is not None
@ -727,6 +751,7 @@ def clean_expired_messages(
end_before=now - datetime.timedelta(days=before_days), end_before=now - datetime.timedelta(days=before_days),
batch_size=batch_size, batch_size=batch_size,
dry_run=dry_run, dry_run=dry_run,
task_label=task_label,
) )
stats = service.run() stats = service.run()
@ -752,6 +777,8 @@ def clean_expired_messages(
) )
) )
raise raise
finally:
flush_telemetry()
click.echo(click.style("messages cleanup completed.", fg="green")) click.echo(click.style("messages cleanup completed.", fg="green"))

View File

@ -5,7 +5,7 @@ from typing import Union
from celery.signals import worker_init from celery.signals import worker_init
from flask_login import user_loaded_from_request, user_logged_in from flask_login import user_loaded_from_request, user_logged_in
from opentelemetry import trace from opentelemetry import metrics, trace
from opentelemetry.propagate import set_global_textmap from opentelemetry.propagate import set_global_textmap
from opentelemetry.propagators.b3 import B3MultiFormat from opentelemetry.propagators.b3 import B3MultiFormat
from opentelemetry.propagators.composite import CompositePropagator from opentelemetry.propagators.composite import CompositePropagator
@ -31,9 +31,29 @@ def setup_context_propagation() -> None:
def shutdown_tracer() -> None: def shutdown_tracer() -> None:
flush_telemetry()
def flush_telemetry() -> None:
"""
Best-effort flush for telemetry providers.
This is mainly used by short-lived command processes (e.g. Kubernetes CronJob)
so counters/histograms are exported before the process exits.
"""
provider = trace.get_tracer_provider() provider = trace.get_tracer_provider()
if hasattr(provider, "force_flush"): if hasattr(provider, "force_flush"):
try:
provider.force_flush() provider.force_flush()
except Exception:
logger.exception("otel: failed to flush trace provider")
metric_provider = metrics.get_meter_provider()
if hasattr(metric_provider, "force_flush"):
try:
metric_provider.force_flush()
except Exception:
logger.exception("otel: failed to flush metric provider")
def is_celery_worker(): def is_celery_worker():

View File

@ -1,16 +1,16 @@
import datetime import datetime
import logging import logging
import os
import random import random
import time import time
from collections.abc import Sequence from collections.abc import Sequence
from typing import cast from typing import TYPE_CHECKING, cast
import sqlalchemy as sa import sqlalchemy as sa
from sqlalchemy import delete, select, tuple_ from sqlalchemy import delete, select, tuple_
from sqlalchemy.engine import CursorResult from sqlalchemy.engine import CursorResult
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from configs import dify_config
from extensions.ext_database import db from extensions.ext_database import db
from libs.datetime_utils import naive_utc_now from libs.datetime_utils import naive_utc_now
from models.model import ( from models.model import (
@ -33,6 +33,131 @@ from services.retention.conversation.messages_clean_policy import (
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
if TYPE_CHECKING:
from opentelemetry.metrics import Counter, Histogram
class MessagesCleanupMetrics:
"""
Records low-cardinality OpenTelemetry metrics for expired message cleanup jobs.
We keep labels stable (dry_run/window_mode/task_label/status) so these metrics remain
dashboard-friendly for long-running CronJob executions.
"""
_job_runs_total: "Counter | None"
_batches_total: "Counter | None"
_messages_scanned_total: "Counter | None"
_messages_filtered_total: "Counter | None"
_messages_deleted_total: "Counter | None"
_job_duration_seconds: "Histogram | None"
_batch_duration_seconds: "Histogram | None"
_base_attributes: dict[str, str]
def __init__(self, *, dry_run: bool, has_window: bool, task_label: str) -> None:
self._job_runs_total = None
self._batches_total = None
self._messages_scanned_total = None
self._messages_filtered_total = None
self._messages_deleted_total = None
self._job_duration_seconds = None
self._batch_duration_seconds = None
self._base_attributes = {
"job_name": "messages_cleanup",
"dry_run": str(dry_run).lower(),
"window_mode": "between" if has_window else "before_cutoff",
"task_label": task_label,
}
self._init_instruments()
def _init_instruments(self) -> None:
if not dify_config.ENABLE_OTEL:
return
try:
from opentelemetry.metrics import get_meter
meter = get_meter("messages_cleanup", version=dify_config.project.version)
self._job_runs_total = meter.create_counter(
"messages_cleanup_jobs_total",
description="Total number of expired message cleanup jobs by status.",
unit="{job}",
)
self._batches_total = meter.create_counter(
"messages_cleanup_batches_total",
description="Total number of message cleanup batches processed.",
unit="{batch}",
)
self._messages_scanned_total = meter.create_counter(
"messages_cleanup_scanned_messages_total",
description="Total messages scanned by cleanup jobs.",
unit="{message}",
)
self._messages_filtered_total = meter.create_counter(
"messages_cleanup_filtered_messages_total",
description="Total messages selected by cleanup policy.",
unit="{message}",
)
self._messages_deleted_total = meter.create_counter(
"messages_cleanup_deleted_messages_total",
description="Total messages deleted by cleanup jobs.",
unit="{message}",
)
self._job_duration_seconds = meter.create_histogram(
"messages_cleanup_job_duration_seconds",
description="Duration of expired message cleanup jobs in seconds.",
unit="s",
)
self._batch_duration_seconds = meter.create_histogram(
"messages_cleanup_batch_duration_seconds",
description="Duration of expired message cleanup batch processing in seconds.",
unit="s",
)
except Exception:
logger.exception("messages_cleanup_metrics: failed to initialize instruments")
def _attrs(self, **extra: str) -> dict[str, str]:
return {**self._base_attributes, **extra}
@staticmethod
def _add(counter: "Counter | None", value: int, attributes: dict[str, str]) -> None:
if not counter or value <= 0:
return
try:
counter.add(value, attributes)
except Exception:
logger.exception("messages_cleanup_metrics: failed to add counter value")
@staticmethod
def _record(histogram: "Histogram | None", value: float, attributes: dict[str, str]) -> None:
if not histogram:
return
try:
histogram.record(value, attributes)
except Exception:
logger.exception("messages_cleanup_metrics: failed to record histogram value")
def record_batch(
self,
*,
scanned_messages: int,
filtered_messages: int,
deleted_messages: int,
batch_duration_seconds: float,
) -> None:
attributes = self._attrs()
self._add(self._batches_total, 1, attributes)
self._add(self._messages_scanned_total, scanned_messages, attributes)
self._add(self._messages_filtered_total, filtered_messages, attributes)
self._add(self._messages_deleted_total, deleted_messages, attributes)
self._record(self._batch_duration_seconds, batch_duration_seconds, attributes)
def record_completion(self, *, status: str, job_duration_seconds: float) -> None:
attributes = self._attrs(status=status)
self._add(self._job_runs_total, 1, attributes)
self._record(self._job_duration_seconds, job_duration_seconds, attributes)
class MessagesCleanService: class MessagesCleanService:
""" """
Service for cleaning expired messages based on retention policies. Service for cleaning expired messages based on retention policies.
@ -48,6 +173,7 @@ class MessagesCleanService:
start_from: datetime.datetime | None = None, start_from: datetime.datetime | None = None,
batch_size: int = 1000, batch_size: int = 1000,
dry_run: bool = False, dry_run: bool = False,
task_label: str = "custom",
) -> None: ) -> None:
""" """
Initialize the service with cleanup parameters. Initialize the service with cleanup parameters.
@ -58,12 +184,18 @@ class MessagesCleanService:
start_from: Optional start time (inclusive) of the range start_from: Optional start time (inclusive) of the range
batch_size: Number of messages to process per batch batch_size: Number of messages to process per batch
dry_run: Whether to perform a dry run (no actual deletion) dry_run: Whether to perform a dry run (no actual deletion)
task_label: Optional task label for retention metrics
""" """
self._policy = policy self._policy = policy
self._end_before = end_before self._end_before = end_before
self._start_from = start_from self._start_from = start_from
self._batch_size = batch_size self._batch_size = batch_size
self._dry_run = dry_run self._dry_run = dry_run
self._metrics = MessagesCleanupMetrics(
dry_run=dry_run,
has_window=bool(start_from),
task_label=task_label,
)
@classmethod @classmethod
def from_time_range( def from_time_range(
@ -73,6 +205,7 @@ class MessagesCleanService:
end_before: datetime.datetime, end_before: datetime.datetime,
batch_size: int = 1000, batch_size: int = 1000,
dry_run: bool = False, dry_run: bool = False,
task_label: str = "custom",
) -> "MessagesCleanService": ) -> "MessagesCleanService":
""" """
Create a service instance for cleaning messages within a specific time range. Create a service instance for cleaning messages within a specific time range.
@ -85,6 +218,7 @@ class MessagesCleanService:
end_before: End time (exclusive) of the range end_before: End time (exclusive) of the range
batch_size: Number of messages to process per batch batch_size: Number of messages to process per batch
dry_run: Whether to perform a dry run (no actual deletion) dry_run: Whether to perform a dry run (no actual deletion)
task_label: Optional task label for retention metrics
Returns: Returns:
MessagesCleanService instance MessagesCleanService instance
@ -112,6 +246,7 @@ class MessagesCleanService:
start_from=start_from, start_from=start_from,
batch_size=batch_size, batch_size=batch_size,
dry_run=dry_run, dry_run=dry_run,
task_label=task_label,
) )
@classmethod @classmethod
@ -121,6 +256,7 @@ class MessagesCleanService:
days: int = 30, days: int = 30,
batch_size: int = 1000, batch_size: int = 1000,
dry_run: bool = False, dry_run: bool = False,
task_label: str = "custom",
) -> "MessagesCleanService": ) -> "MessagesCleanService":
""" """
Create a service instance for cleaning messages older than specified days. Create a service instance for cleaning messages older than specified days.
@ -130,6 +266,7 @@ class MessagesCleanService:
days: Number of days to look back from now days: Number of days to look back from now
batch_size: Number of messages to process per batch batch_size: Number of messages to process per batch
dry_run: Whether to perform a dry run (no actual deletion) dry_run: Whether to perform a dry run (no actual deletion)
task_label: Optional task label for retention metrics
Returns: Returns:
MessagesCleanService instance MessagesCleanService instance
@ -153,7 +290,14 @@ class MessagesCleanService:
policy.__class__.__name__, policy.__class__.__name__,
) )
return cls(policy=policy, end_before=end_before, start_from=None, batch_size=batch_size, dry_run=dry_run) return cls(
policy=policy,
end_before=end_before,
start_from=None,
batch_size=batch_size,
dry_run=dry_run,
task_label=task_label,
)
def run(self) -> dict[str, int]: def run(self) -> dict[str, int]:
""" """
@ -162,7 +306,18 @@ class MessagesCleanService:
Returns: Returns:
Dict with statistics: batches, filtered_messages, total_deleted Dict with statistics: batches, filtered_messages, total_deleted
""" """
status = "success"
run_start = time.monotonic()
try:
return self._clean_messages_by_time_range() return self._clean_messages_by_time_range()
except Exception:
status = "failed"
raise
finally:
self._metrics.record_completion(
status=status,
job_duration_seconds=time.monotonic() - run_start,
)
def _clean_messages_by_time_range(self) -> dict[str, int]: def _clean_messages_by_time_range(self) -> dict[str, int]:
""" """
@ -197,11 +352,14 @@ class MessagesCleanService:
self._end_before, self._end_before,
) )
max_batch_interval_ms = int(os.environ.get("SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL", 200)) max_batch_interval_ms = dify_config.SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL
while True: while True:
stats["batches"] += 1 stats["batches"] += 1
batch_start = time.monotonic() batch_start = time.monotonic()
batch_scanned_messages = 0
batch_filtered_messages = 0
batch_deleted_messages = 0
# Step 1: Fetch a batch of messages using cursor # Step 1: Fetch a batch of messages using cursor
with Session(db.engine, expire_on_commit=False) as session: with Session(db.engine, expire_on_commit=False) as session:
@ -240,9 +398,16 @@ class MessagesCleanService:
# Track total messages fetched across all batches # Track total messages fetched across all batches
stats["total_messages"] += len(messages) stats["total_messages"] += len(messages)
batch_scanned_messages = len(messages)
if not messages: if not messages:
logger.info("clean_messages (batch %s): no more messages to process", stats["batches"]) logger.info("clean_messages (batch %s): no more messages to process", stats["batches"])
self._metrics.record_batch(
scanned_messages=batch_scanned_messages,
filtered_messages=batch_filtered_messages,
deleted_messages=batch_deleted_messages,
batch_duration_seconds=time.monotonic() - batch_start,
)
break break
# Update cursor to the last message's (created_at, id) # Update cursor to the last message's (created_at, id)
@ -268,6 +433,12 @@ class MessagesCleanService:
if not apps: if not apps:
logger.info("clean_messages (batch %s): no apps found, skip", stats["batches"]) logger.info("clean_messages (batch %s): no apps found, skip", stats["batches"])
self._metrics.record_batch(
scanned_messages=batch_scanned_messages,
filtered_messages=batch_filtered_messages,
deleted_messages=batch_deleted_messages,
batch_duration_seconds=time.monotonic() - batch_start,
)
continue continue
# Build app_id -> tenant_id mapping # Build app_id -> tenant_id mapping
@ -286,9 +457,16 @@ class MessagesCleanService:
if not message_ids_to_delete: if not message_ids_to_delete:
logger.info("clean_messages (batch %s): no messages to delete, skip", stats["batches"]) logger.info("clean_messages (batch %s): no messages to delete, skip", stats["batches"])
self._metrics.record_batch(
scanned_messages=batch_scanned_messages,
filtered_messages=batch_filtered_messages,
deleted_messages=batch_deleted_messages,
batch_duration_seconds=time.monotonic() - batch_start,
)
continue continue
stats["filtered_messages"] += len(message_ids_to_delete) stats["filtered_messages"] += len(message_ids_to_delete)
batch_filtered_messages = len(message_ids_to_delete)
# Step 4: Batch delete messages and their relations # Step 4: Batch delete messages and their relations
if not self._dry_run: if not self._dry_run:
@ -309,6 +487,7 @@ class MessagesCleanService:
commit_ms = int((time.monotonic() - commit_start) * 1000) commit_ms = int((time.monotonic() - commit_start) * 1000)
stats["total_deleted"] += messages_deleted stats["total_deleted"] += messages_deleted
batch_deleted_messages = messages_deleted
logger.info( logger.info(
"clean_messages (batch %s): processed %s messages, deleted %s messages", "clean_messages (batch %s): processed %s messages, deleted %s messages",
@ -343,6 +522,13 @@ class MessagesCleanService:
for msg_id in sampled_ids: for msg_id in sampled_ids:
logger.info("clean_messages (batch %s, dry_run) sample: message_id=%s", stats["batches"], msg_id) logger.info("clean_messages (batch %s, dry_run) sample: message_id=%s", stats["batches"], msg_id)
self._metrics.record_batch(
scanned_messages=batch_scanned_messages,
filtered_messages=batch_filtered_messages,
deleted_messages=batch_deleted_messages,
batch_duration_seconds=time.monotonic() - batch_start,
)
logger.info( logger.info(
"clean_messages completed: total batches: %s, total messages: %s, filtered messages: %s, total deleted: %s", "clean_messages completed: total batches: %s, total messages: %s, filtered messages: %s, total deleted: %s",
stats["batches"], stats["batches"],

View File

@ -1,9 +1,9 @@
import datetime import datetime
import logging import logging
import os
import random import random
import time import time
from collections.abc import Iterable, Sequence from collections.abc import Iterable, Sequence
from typing import TYPE_CHECKING
import click import click
from sqlalchemy.orm import Session, sessionmaker from sqlalchemy.orm import Session, sessionmaker
@ -20,6 +20,159 @@ from services.billing_service import BillingService, SubscriptionPlan
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
if TYPE_CHECKING:
from opentelemetry.metrics import Counter, Histogram
class WorkflowRunCleanupMetrics:
"""
Records low-cardinality OpenTelemetry metrics for workflow run cleanup jobs.
Metrics are emitted with stable labels only (dry_run/window_mode/task_label/status)
to keep dashboard and alert cardinality predictable in production clusters.
"""
_job_runs_total: "Counter | None"
_batches_total: "Counter | None"
_runs_scanned_total: "Counter | None"
_runs_targeted_total: "Counter | None"
_runs_deleted_total: "Counter | None"
_runs_skipped_total: "Counter | None"
_related_records_total: "Counter | None"
_job_duration_seconds: "Histogram | None"
_batch_duration_seconds: "Histogram | None"
_base_attributes: dict[str, str]
def __init__(self, *, dry_run: bool, has_window: bool, task_label: str) -> None:
self._job_runs_total = None
self._batches_total = None
self._runs_scanned_total = None
self._runs_targeted_total = None
self._runs_deleted_total = None
self._runs_skipped_total = None
self._related_records_total = None
self._job_duration_seconds = None
self._batch_duration_seconds = None
self._base_attributes = {
"job_name": "workflow_run_cleanup",
"dry_run": str(dry_run).lower(),
"window_mode": "between" if has_window else "before_cutoff",
"task_label": task_label,
}
self._init_instruments()
def _init_instruments(self) -> None:
if not dify_config.ENABLE_OTEL:
return
try:
from opentelemetry.metrics import get_meter
meter = get_meter("workflow_run_cleanup", version=dify_config.project.version)
self._job_runs_total = meter.create_counter(
"workflow_run_cleanup_jobs_total",
description="Total number of workflow run cleanup jobs by status.",
unit="{job}",
)
self._batches_total = meter.create_counter(
"workflow_run_cleanup_batches_total",
description="Total number of processed cleanup batches.",
unit="{batch}",
)
self._runs_scanned_total = meter.create_counter(
"workflow_run_cleanup_scanned_runs_total",
description="Total workflow runs scanned by cleanup jobs.",
unit="{run}",
)
self._runs_targeted_total = meter.create_counter(
"workflow_run_cleanup_targeted_runs_total",
description="Total workflow runs targeted by cleanup policy.",
unit="{run}",
)
self._runs_deleted_total = meter.create_counter(
"workflow_run_cleanup_deleted_runs_total",
description="Total workflow runs deleted by cleanup jobs.",
unit="{run}",
)
self._runs_skipped_total = meter.create_counter(
"workflow_run_cleanup_skipped_runs_total",
description="Total workflow runs skipped because tenant is paid/unknown.",
unit="{run}",
)
self._related_records_total = meter.create_counter(
"workflow_run_cleanup_related_records_total",
description="Total related records processed by cleanup jobs.",
unit="{record}",
)
self._job_duration_seconds = meter.create_histogram(
"workflow_run_cleanup_job_duration_seconds",
description="Duration of workflow run cleanup jobs in seconds.",
unit="s",
)
self._batch_duration_seconds = meter.create_histogram(
"workflow_run_cleanup_batch_duration_seconds",
description="Duration of workflow run cleanup batch processing in seconds.",
unit="s",
)
except Exception:
logger.exception("workflow_run_cleanup_metrics: failed to initialize instruments")
def _attrs(self, **extra: str) -> dict[str, str]:
return {**self._base_attributes, **extra}
@staticmethod
def _add(counter: "Counter | None", value: int, attributes: dict[str, str]) -> None:
if not counter or value <= 0:
return
try:
counter.add(value, attributes)
except Exception:
logger.exception("workflow_run_cleanup_metrics: failed to add counter value")
@staticmethod
def _record(histogram: "Histogram | None", value: float, attributes: dict[str, str]) -> None:
if not histogram:
return
try:
histogram.record(value, attributes)
except Exception:
logger.exception("workflow_run_cleanup_metrics: failed to record histogram value")
def record_batch(
self,
*,
batch_rows: int,
targeted_runs: int,
skipped_runs: int,
deleted_runs: int,
related_counts: dict[str, int] | None,
related_action: str | None,
batch_duration_seconds: float,
) -> None:
attributes = self._attrs()
self._add(self._batches_total, 1, attributes)
self._add(self._runs_scanned_total, batch_rows, attributes)
self._add(self._runs_targeted_total, targeted_runs, attributes)
self._add(self._runs_skipped_total, skipped_runs, attributes)
self._add(self._runs_deleted_total, deleted_runs, attributes)
self._record(self._batch_duration_seconds, batch_duration_seconds, attributes)
if not related_counts or not related_action:
return
for record_type, count in related_counts.items():
self._add(
self._related_records_total,
count,
self._attrs(action=related_action, record_type=record_type),
)
def record_completion(self, *, status: str, job_duration_seconds: float) -> None:
attributes = self._attrs(status=status)
self._add(self._job_runs_total, 1, attributes)
self._record(self._job_duration_seconds, job_duration_seconds, attributes)
class WorkflowRunCleanup: class WorkflowRunCleanup:
def __init__( def __init__(
self, self,
@ -29,6 +182,7 @@ class WorkflowRunCleanup:
end_before: datetime.datetime | None = None, end_before: datetime.datetime | None = None,
workflow_run_repo: APIWorkflowRunRepository | None = None, workflow_run_repo: APIWorkflowRunRepository | None = None,
dry_run: bool = False, dry_run: bool = False,
task_label: str = "custom",
): ):
if (start_from is None) ^ (end_before is None): if (start_from is None) ^ (end_before is None):
raise ValueError("start_from and end_before must be both set or both omitted.") raise ValueError("start_from and end_before must be both set or both omitted.")
@ -46,6 +200,11 @@ class WorkflowRunCleanup:
self.batch_size = batch_size self.batch_size = batch_size
self._cleanup_whitelist: set[str] | None = None self._cleanup_whitelist: set[str] | None = None
self.dry_run = dry_run self.dry_run = dry_run
self._metrics = WorkflowRunCleanupMetrics(
dry_run=dry_run,
has_window=bool(start_from),
task_label=task_label,
)
self.free_plan_grace_period_days = dify_config.SANDBOX_EXPIRED_RECORDS_CLEAN_GRACEFUL_PERIOD self.free_plan_grace_period_days = dify_config.SANDBOX_EXPIRED_RECORDS_CLEAN_GRACEFUL_PERIOD
self.workflow_run_repo: APIWorkflowRunRepository self.workflow_run_repo: APIWorkflowRunRepository
if workflow_run_repo: if workflow_run_repo:
@ -74,9 +233,11 @@ class WorkflowRunCleanup:
related_totals = self._empty_related_counts() if self.dry_run else None related_totals = self._empty_related_counts() if self.dry_run else None
batch_index = 0 batch_index = 0
last_seen: tuple[datetime.datetime, str] | None = None last_seen: tuple[datetime.datetime, str] | None = None
status = "success"
run_start = time.monotonic()
max_batch_interval_ms = dify_config.SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL
max_batch_interval_ms = int(os.environ.get("SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL", 200)) try:
while True: while True:
batch_start = time.monotonic() batch_start = time.monotonic()
@ -125,6 +286,15 @@ class WorkflowRunCleanup:
fg="yellow", fg="yellow",
) )
) )
self._metrics.record_batch(
batch_rows=len(run_rows),
targeted_runs=0,
skipped_runs=paid_or_skipped,
deleted_runs=0,
related_counts=None,
related_action=None,
batch_duration_seconds=time.monotonic() - batch_start,
)
continue continue
total_runs_targeted += len(free_runs) total_runs_targeted += len(free_runs)
@ -157,6 +327,15 @@ class WorkflowRunCleanup:
batch_index, batch_index,
int((time.monotonic() - batch_start) * 1000), int((time.monotonic() - batch_start) * 1000),
) )
self._metrics.record_batch(
batch_rows=len(run_rows),
targeted_runs=len(free_runs),
skipped_runs=paid_or_skipped,
deleted_runs=0,
related_counts={key: batch_counts.get(key, 0) for key in self._empty_related_counts()},
related_action="would_delete",
batch_duration_seconds=time.monotonic() - batch_start,
)
continue continue
try: try:
@ -188,6 +367,15 @@ class WorkflowRunCleanup:
delete_ms, delete_ms,
int((time.monotonic() - batch_start) * 1000), int((time.monotonic() - batch_start) * 1000),
) )
self._metrics.record_batch(
batch_rows=len(run_rows),
targeted_runs=len(free_runs),
skipped_runs=paid_or_skipped,
deleted_runs=counts["runs"],
related_counts={key: counts.get(key, 0) for key in self._empty_related_counts()},
related_action="deleted",
batch_duration_seconds=time.monotonic() - batch_start,
)
# Random sleep between batches to avoid overwhelming the database # Random sleep between batches to avoid overwhelming the database
sleep_ms = random.uniform(0, max_batch_interval_ms) # noqa: S311 sleep_ms = random.uniform(0, max_batch_interval_ms) # noqa: S311
@ -206,7 +394,9 @@ class WorkflowRunCleanup:
f"before {self.window_end.isoformat()}" f"before {self.window_end.isoformat()}"
) )
if related_totals is not None: if related_totals is not None:
summary_message = f"{summary_message}; related records: {self._format_related_counts(related_totals)}" summary_message = (
f"{summary_message}; related records: {self._format_related_counts(related_totals)}"
)
summary_color = "yellow" summary_color = "yellow"
else: else:
if self.window_start: if self.window_start:
@ -216,11 +406,20 @@ class WorkflowRunCleanup:
) )
else: else:
summary_message = ( summary_message = (
f"Cleanup complete. Deleted {total_runs_deleted} workflow runs before {self.window_end.isoformat()}" f"Cleanup complete. Deleted {total_runs_deleted} workflow runs "
f"before {self.window_end.isoformat()}"
) )
summary_color = "white" summary_color = "white"
click.echo(click.style(summary_message, fg=summary_color)) click.echo(click.style(summary_message, fg=summary_color))
except Exception:
status = "failed"
raise
finally:
self._metrics.record_completion(
status=status,
job_duration_seconds=time.monotonic() - run_start,
)
def _filter_free_tenants(self, tenant_ids: Iterable[str]) -> set[str]: def _filter_free_tenants(self, tenant_ids: Iterable[str]) -> set[str]:
tenant_id_list = list(tenant_ids) tenant_id_list = list(tenant_ids)

View File

@ -46,6 +46,7 @@ def test_absolute_mode_calls_from_time_range():
end_before=end_before, end_before=end_before,
batch_size=200, batch_size=200,
dry_run=True, dry_run=True,
task_label="custom",
) )
mock_from_days.assert_not_called() mock_from_days.assert_not_called()
@ -74,6 +75,7 @@ def test_relative_mode_before_days_only_calls_from_days():
days=30, days=30,
batch_size=500, batch_size=500,
dry_run=False, dry_run=False,
task_label="before-30",
) )
mock_from_time_range.assert_not_called() mock_from_time_range.assert_not_called()
@ -105,6 +107,7 @@ def test_relative_mode_with_from_days_ago_calls_from_time_range():
end_before=fixed_now - datetime.timedelta(days=30), end_before=fixed_now - datetime.timedelta(days=30),
batch_size=1000, batch_size=1000,
dry_run=False, dry_run=False,
task_label="60to30",
) )
mock_from_days.assert_not_called() mock_from_days.assert_not_called()

View File

@ -1,5 +1,4 @@
import datetime import datetime
import os
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
import pytest import pytest
@ -282,7 +281,6 @@ class TestMessagesCleanService:
MessagesCleanService._batch_delete_message_relations(mock_db_session, ["msg1", "msg2"]) MessagesCleanService._batch_delete_message_relations(mock_db_session, ["msg1", "msg2"])
assert mock_db_session.execute.call_count == 8 # 8 tables to clean up assert mock_db_session.execute.call_count == 8 # 8 tables to clean up
@patch.dict(os.environ, {"SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL": "500"})
def test_clean_messages_interval_from_env(self, mock_db_session, mock_policy): def test_clean_messages_interval_from_env(self, mock_db_session, mock_policy):
service = MessagesCleanService( service = MessagesCleanService(
policy=mock_policy, policy=mock_policy,
@ -301,6 +299,10 @@ class TestMessagesCleanService:
mock_db_session.execute.side_effect = mock_returns mock_db_session.execute.side_effect = mock_returns
mock_policy.filter_message_ids.return_value = ["msg1"] mock_policy.filter_message_ids.return_value = ["msg1"]
with patch(
"services.retention.conversation.messages_clean_service.dify_config.SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL",
500,
):
with patch("services.retention.conversation.messages_clean_service.time.sleep") as mock_sleep: with patch("services.retention.conversation.messages_clean_service.time.sleep") as mock_sleep:
with patch("services.retention.conversation.messages_clean_service.random.uniform") as mock_uniform: with patch("services.retention.conversation.messages_clean_service.random.uniform") as mock_uniform:
mock_uniform.return_value = 300.0 mock_uniform.return_value = 300.0

View File

@ -80,7 +80,13 @@ class TestWorkflowRunCleanupInit:
cfg.SANDBOX_EXPIRED_RECORDS_CLEAN_GRACEFUL_PERIOD = 0 cfg.SANDBOX_EXPIRED_RECORDS_CLEAN_GRACEFUL_PERIOD = 0
cfg.BILLING_ENABLED = False cfg.BILLING_ENABLED = False
with pytest.raises(ValueError): with pytest.raises(ValueError):
WorkflowRunCleanup(days=30, batch_size=10, start_from=dt, end_before=dt, workflow_run_repo=mock_repo) WorkflowRunCleanup(
days=30,
batch_size=10,
start_from=dt,
end_before=dt,
workflow_run_repo=mock_repo,
)
def test_zero_batch_size_raises(self, mock_repo): def test_zero_batch_size_raises(self, mock_repo):
with patch("services.retention.workflow_run.clear_free_plan_expired_workflow_run_logs.dify_config") as cfg: with patch("services.retention.workflow_run.clear_free_plan_expired_workflow_run_logs.dify_config") as cfg:
@ -102,10 +108,24 @@ class TestWorkflowRunCleanupInit:
cfg.BILLING_ENABLED = False cfg.BILLING_ENABLED = False
start = datetime.datetime(2024, 1, 1) start = datetime.datetime(2024, 1, 1)
end = datetime.datetime(2024, 6, 1) end = datetime.datetime(2024, 6, 1)
c = WorkflowRunCleanup(days=30, batch_size=5, start_from=start, end_before=end, workflow_run_repo=mock_repo) c = WorkflowRunCleanup(
days=30,
batch_size=5,
start_from=start,
end_before=end,
workflow_run_repo=mock_repo,
)
assert c.window_start == start assert c.window_start == start
assert c.window_end == end assert c.window_end == end
def test_default_task_label_is_custom(self, mock_repo):
with patch("services.retention.workflow_run.clear_free_plan_expired_workflow_run_logs.dify_config") as cfg:
cfg.SANDBOX_EXPIRED_RECORDS_CLEAN_GRACEFUL_PERIOD = 0
cfg.BILLING_ENABLED = False
c = WorkflowRunCleanup(days=30, batch_size=10, workflow_run_repo=mock_repo)
assert c._metrics._base_attributes["task_label"] == "custom"
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# _empty_related_counts / _format_related_counts # _empty_related_counts / _format_related_counts
@ -393,7 +413,12 @@ class TestRunDryRunMode:
with patch("services.retention.workflow_run.clear_free_plan_expired_workflow_run_logs.dify_config") as cfg: with patch("services.retention.workflow_run.clear_free_plan_expired_workflow_run_logs.dify_config") as cfg:
cfg.SANDBOX_EXPIRED_RECORDS_CLEAN_GRACEFUL_PERIOD = 0 cfg.SANDBOX_EXPIRED_RECORDS_CLEAN_GRACEFUL_PERIOD = 0
cfg.BILLING_ENABLED = False cfg.BILLING_ENABLED = False
return WorkflowRunCleanup(days=30, batch_size=10, workflow_run_repo=mock_repo, dry_run=True) return WorkflowRunCleanup(
days=30,
batch_size=10,
workflow_run_repo=mock_repo,
dry_run=True,
)
def test_dry_run_no_delete_called(self, mock_repo): def test_dry_run_no_delete_called(self, mock_repo):
run = make_run("t1") run = make_run("t1")

View File

@ -265,6 +265,61 @@ def test_run_exits_on_empty_batch(monkeypatch: pytest.MonkeyPatch) -> None:
cleanup.run() cleanup.run()
def test_run_records_metrics_on_success(monkeypatch: pytest.MonkeyPatch) -> None:
cutoff = datetime.datetime.now()
repo = FakeRepo(
batches=[[FakeRun("run-free", "t_free", cutoff)]],
delete_result={
"runs": 0,
"node_executions": 2,
"offloads": 1,
"app_logs": 3,
"trigger_logs": 4,
"pauses": 5,
"pause_reasons": 6,
},
)
cleanup = create_cleanup(monkeypatch, repo=repo, days=30, batch_size=10)
monkeypatch.setattr(cleanup_module.dify_config, "BILLING_ENABLED", False)
batch_calls: list[dict[str, object]] = []
completion_calls: list[dict[str, object]] = []
monkeypatch.setattr(cleanup._metrics, "record_batch", lambda **kwargs: batch_calls.append(kwargs))
monkeypatch.setattr(cleanup._metrics, "record_completion", lambda **kwargs: completion_calls.append(kwargs))
cleanup.run()
assert len(batch_calls) == 1
assert batch_calls[0]["batch_rows"] == 1
assert batch_calls[0]["targeted_runs"] == 1
assert batch_calls[0]["deleted_runs"] == 1
assert batch_calls[0]["related_action"] == "deleted"
assert len(completion_calls) == 1
assert completion_calls[0]["status"] == "success"
def test_run_records_failed_metrics(monkeypatch: pytest.MonkeyPatch) -> None:
class FailingRepo(FakeRepo):
def delete_runs_with_related(
self, runs: list[FakeRun], delete_node_executions=None, delete_trigger_logs=None
) -> dict[str, int]:
raise RuntimeError("delete failed")
cutoff = datetime.datetime.now()
repo = FailingRepo(batches=[[FakeRun("run-free", "t_free", cutoff)]])
cleanup = create_cleanup(monkeypatch, repo=repo, days=30, batch_size=10)
monkeypatch.setattr(cleanup_module.dify_config, "BILLING_ENABLED", False)
completion_calls: list[dict[str, object]] = []
monkeypatch.setattr(cleanup._metrics, "record_completion", lambda **kwargs: completion_calls.append(kwargs))
with pytest.raises(RuntimeError, match="delete failed"):
cleanup.run()
assert len(completion_calls) == 1
assert completion_calls[0]["status"] == "failed"
def test_run_dry_run_skips_deletions(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: def test_run_dry_run_skips_deletions(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None:
cutoff = datetime.datetime.now() cutoff = datetime.datetime.now()
repo = FakeRepo( repo = FakeRepo(

View File

@ -540,6 +540,20 @@ class TestMessagesCleanServiceFromTimeRange:
assert service._batch_size == 1000 # default assert service._batch_size == 1000 # default
assert service._dry_run is False # default assert service._dry_run is False # default
def test_explicit_task_label(self):
start_from = datetime.datetime(2024, 1, 1)
end_before = datetime.datetime(2024, 1, 2)
policy = BillingDisabledPolicy()
service = MessagesCleanService.from_time_range(
policy=policy,
start_from=start_from,
end_before=end_before,
task_label="60to30",
)
assert service._metrics._base_attributes["task_label"] == "60to30"
class TestMessagesCleanServiceFromDays: class TestMessagesCleanServiceFromDays:
"""Unit tests for MessagesCleanService.from_days factory method.""" """Unit tests for MessagesCleanService.from_days factory method."""
@ -619,3 +633,54 @@ class TestMessagesCleanServiceFromDays:
assert service._end_before == expected_end_before assert service._end_before == expected_end_before
assert service._batch_size == 1000 # default assert service._batch_size == 1000 # default
assert service._dry_run is False # default assert service._dry_run is False # default
assert service._metrics._base_attributes["task_label"] == "custom"
class TestMessagesCleanServiceRun:
"""Unit tests for MessagesCleanService.run instrumentation behavior."""
def test_run_records_completion_metrics_on_success(self):
# Arrange
service = MessagesCleanService(
policy=BillingDisabledPolicy(),
start_from=datetime.datetime(2024, 1, 1),
end_before=datetime.datetime(2024, 1, 2),
batch_size=100,
dry_run=False,
)
expected_stats = {
"batches": 1,
"total_messages": 10,
"filtered_messages": 5,
"total_deleted": 5,
}
service._clean_messages_by_time_range = MagicMock(return_value=expected_stats) # type: ignore[method-assign]
completion_calls: list[dict[str, object]] = []
service._metrics.record_completion = lambda **kwargs: completion_calls.append(kwargs) # type: ignore[method-assign]
# Act
result = service.run()
# Assert
assert result == expected_stats
assert len(completion_calls) == 1
assert completion_calls[0]["status"] == "success"
def test_run_records_completion_metrics_on_failure(self):
# Arrange
service = MessagesCleanService(
policy=BillingDisabledPolicy(),
start_from=datetime.datetime(2024, 1, 1),
end_before=datetime.datetime(2024, 1, 2),
batch_size=100,
dry_run=False,
)
service._clean_messages_by_time_range = MagicMock(side_effect=RuntimeError("clean failed")) # type: ignore[method-assign]
completion_calls: list[dict[str, object]] = []
service._metrics.record_completion = lambda **kwargs: completion_calls.append(kwargs) # type: ignore[method-assign]
# Act & Assert
with pytest.raises(RuntimeError, match="clean failed"):
service.run()
assert len(completion_calls) == 1
assert completion_calls[0]["status"] == "failed"