mirror of https://github.com/langgenius/dify.git
feat: add metrics to clean message and workflow-run task (#33143)
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: hj24 <mambahj24@gmail.com>
This commit is contained in:
parent
18af5fc8c7
commit
18ff5d9288
|
|
@ -88,6 +88,8 @@ def clean_workflow_runs(
|
||||||
"""
|
"""
|
||||||
Clean workflow runs and related workflow data for free tenants.
|
Clean workflow runs and related workflow data for free tenants.
|
||||||
"""
|
"""
|
||||||
|
from extensions.otel.runtime import flush_telemetry
|
||||||
|
|
||||||
if (start_from is None) ^ (end_before is None):
|
if (start_from is None) ^ (end_before is None):
|
||||||
raise click.UsageError("--start-from and --end-before must be provided together.")
|
raise click.UsageError("--start-from and --end-before must be provided together.")
|
||||||
|
|
||||||
|
|
@ -104,16 +106,27 @@ def clean_workflow_runs(
|
||||||
end_before = now - datetime.timedelta(days=to_days_ago)
|
end_before = now - datetime.timedelta(days=to_days_ago)
|
||||||
before_days = 0
|
before_days = 0
|
||||||
|
|
||||||
|
if from_days_ago is not None and to_days_ago is not None:
|
||||||
|
task_label = f"{from_days_ago}to{to_days_ago}"
|
||||||
|
elif start_from is None:
|
||||||
|
task_label = f"before-{before_days}"
|
||||||
|
else:
|
||||||
|
task_label = "custom"
|
||||||
|
|
||||||
start_time = datetime.datetime.now(datetime.UTC)
|
start_time = datetime.datetime.now(datetime.UTC)
|
||||||
click.echo(click.style(f"Starting workflow run cleanup at {start_time.isoformat()}.", fg="white"))
|
click.echo(click.style(f"Starting workflow run cleanup at {start_time.isoformat()}.", fg="white"))
|
||||||
|
|
||||||
|
try:
|
||||||
WorkflowRunCleanup(
|
WorkflowRunCleanup(
|
||||||
days=before_days,
|
days=before_days,
|
||||||
batch_size=batch_size,
|
batch_size=batch_size,
|
||||||
start_from=start_from,
|
start_from=start_from,
|
||||||
end_before=end_before,
|
end_before=end_before,
|
||||||
dry_run=dry_run,
|
dry_run=dry_run,
|
||||||
|
task_label=task_label,
|
||||||
).run()
|
).run()
|
||||||
|
finally:
|
||||||
|
flush_telemetry()
|
||||||
|
|
||||||
end_time = datetime.datetime.now(datetime.UTC)
|
end_time = datetime.datetime.now(datetime.UTC)
|
||||||
elapsed = end_time - start_time
|
elapsed = end_time - start_time
|
||||||
|
|
@ -659,6 +672,8 @@ def clean_expired_messages(
|
||||||
"""
|
"""
|
||||||
Clean expired messages and related data for tenants based on clean policy.
|
Clean expired messages and related data for tenants based on clean policy.
|
||||||
"""
|
"""
|
||||||
|
from extensions.otel.runtime import flush_telemetry
|
||||||
|
|
||||||
click.echo(click.style("clean_messages: start clean messages.", fg="green"))
|
click.echo(click.style("clean_messages: start clean messages.", fg="green"))
|
||||||
|
|
||||||
start_at = time.perf_counter()
|
start_at = time.perf_counter()
|
||||||
|
|
@ -698,6 +713,13 @@ def clean_expired_messages(
|
||||||
# NOTE: graceful_period will be ignored when billing is disabled.
|
# NOTE: graceful_period will be ignored when billing is disabled.
|
||||||
policy = create_message_clean_policy(graceful_period_days=graceful_period)
|
policy = create_message_clean_policy(graceful_period_days=graceful_period)
|
||||||
|
|
||||||
|
if from_days_ago is not None and before_days is not None:
|
||||||
|
task_label = f"{from_days_ago}to{before_days}"
|
||||||
|
elif start_from is None and before_days is not None:
|
||||||
|
task_label = f"before-{before_days}"
|
||||||
|
else:
|
||||||
|
task_label = "custom"
|
||||||
|
|
||||||
# Create and run the cleanup service
|
# Create and run the cleanup service
|
||||||
if abs_mode:
|
if abs_mode:
|
||||||
assert start_from is not None
|
assert start_from is not None
|
||||||
|
|
@ -708,6 +730,7 @@ def clean_expired_messages(
|
||||||
end_before=end_before,
|
end_before=end_before,
|
||||||
batch_size=batch_size,
|
batch_size=batch_size,
|
||||||
dry_run=dry_run,
|
dry_run=dry_run,
|
||||||
|
task_label=task_label,
|
||||||
)
|
)
|
||||||
elif from_days_ago is None:
|
elif from_days_ago is None:
|
||||||
assert before_days is not None
|
assert before_days is not None
|
||||||
|
|
@ -716,6 +739,7 @@ def clean_expired_messages(
|
||||||
days=before_days,
|
days=before_days,
|
||||||
batch_size=batch_size,
|
batch_size=batch_size,
|
||||||
dry_run=dry_run,
|
dry_run=dry_run,
|
||||||
|
task_label=task_label,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
assert before_days is not None
|
assert before_days is not None
|
||||||
|
|
@ -727,6 +751,7 @@ def clean_expired_messages(
|
||||||
end_before=now - datetime.timedelta(days=before_days),
|
end_before=now - datetime.timedelta(days=before_days),
|
||||||
batch_size=batch_size,
|
batch_size=batch_size,
|
||||||
dry_run=dry_run,
|
dry_run=dry_run,
|
||||||
|
task_label=task_label,
|
||||||
)
|
)
|
||||||
stats = service.run()
|
stats = service.run()
|
||||||
|
|
||||||
|
|
@ -752,6 +777,8 @@ def clean_expired_messages(
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
raise
|
raise
|
||||||
|
finally:
|
||||||
|
flush_telemetry()
|
||||||
|
|
||||||
click.echo(click.style("messages cleanup completed.", fg="green"))
|
click.echo(click.style("messages cleanup completed.", fg="green"))
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@ from typing import Union
|
||||||
|
|
||||||
from celery.signals import worker_init
|
from celery.signals import worker_init
|
||||||
from flask_login import user_loaded_from_request, user_logged_in
|
from flask_login import user_loaded_from_request, user_logged_in
|
||||||
from opentelemetry import trace
|
from opentelemetry import metrics, trace
|
||||||
from opentelemetry.propagate import set_global_textmap
|
from opentelemetry.propagate import set_global_textmap
|
||||||
from opentelemetry.propagators.b3 import B3MultiFormat
|
from opentelemetry.propagators.b3 import B3MultiFormat
|
||||||
from opentelemetry.propagators.composite import CompositePropagator
|
from opentelemetry.propagators.composite import CompositePropagator
|
||||||
|
|
@ -31,9 +31,29 @@ def setup_context_propagation() -> None:
|
||||||
|
|
||||||
|
|
||||||
def shutdown_tracer() -> None:
    """Flush pending telemetry by delegating to ``flush_telemetry``."""
    flush_telemetry()


def flush_telemetry() -> None:
    """
    Best-effort flush for telemetry providers.

    This is mainly used by short-lived command processes (e.g. Kubernetes CronJob)
    so counters/histograms are exported before the process exits.

    The function never raises: it is invoked from ``finally`` blocks of the
    cleanup commands, so a failure here must not replace an exception that is
    already propagating. Any error is logged and swallowed per provider.
    """
    # (label used in the log message, accessor returning the global provider)
    providers = (
        ("trace", trace.get_tracer_provider),
        ("metric", metrics.get_meter_provider),
    )
    for kind, get_provider in providers:
        try:
            # API-only (no-op/proxy) providers may not expose force_flush at all.
            force_flush = getattr(get_provider(), "force_flush", None)
            if callable(force_flush):
                force_flush()
        except Exception:
            logger.exception("otel: failed to flush %s provider", kind)
|
||||||
|
|
||||||
|
|
||||||
def is_celery_worker():
|
def is_celery_worker():
|
||||||
|
|
|
||||||
|
|
@ -1,16 +1,16 @@
|
||||||
import datetime
|
import datetime
|
||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
from collections.abc import Sequence
|
from collections.abc import Sequence
|
||||||
from typing import cast
|
from typing import TYPE_CHECKING, cast
|
||||||
|
|
||||||
import sqlalchemy as sa
|
import sqlalchemy as sa
|
||||||
from sqlalchemy import delete, select, tuple_
|
from sqlalchemy import delete, select, tuple_
|
||||||
from sqlalchemy.engine import CursorResult
|
from sqlalchemy.engine import CursorResult
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from configs import dify_config
|
||||||
from extensions.ext_database import db
|
from extensions.ext_database import db
|
||||||
from libs.datetime_utils import naive_utc_now
|
from libs.datetime_utils import naive_utc_now
|
||||||
from models.model import (
|
from models.model import (
|
||||||
|
|
@ -33,6 +33,131 @@ from services.retention.conversation.messages_clean_policy import (
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from opentelemetry.metrics import Counter, Histogram
|
||||||
|
|
||||||
|
|
||||||
|
class MessagesCleanupMetrics:
    """
    Records low-cardinality OpenTelemetry metrics for expired message cleanup jobs.

    We keep labels stable (dry_run/window_mode/task_label/status) so these metrics remain
    dashboard-friendly for long-running CronJob executions.

    Every instrument stays ``None`` when OTEL is disabled or when instrument
    creation fails; all recording helpers then become no-ops, so metrics can
    never break the cleanup job itself.
    """

    _job_runs_total: "Counter | None"
    _batches_total: "Counter | None"
    _messages_scanned_total: "Counter | None"
    _messages_filtered_total: "Counter | None"
    _messages_deleted_total: "Counter | None"
    _job_duration_seconds: "Histogram | None"
    _batch_duration_seconds: "Histogram | None"
    _base_attributes: dict[str, str]

    def __init__(self, *, dry_run: bool, has_window: bool, task_label: str) -> None:
        """
        Build the base attribute set and try to create the instruments.

        Args:
            dry_run: Whether the job runs without deleting; exported as "true"/"false".
            has_window: True selects window_mode "between", False "before_cutoff".
            task_label: Caller-supplied label attached to every data point.
        """
        self._job_runs_total = None
        self._batches_total = None
        self._messages_scanned_total = None
        self._messages_filtered_total = None
        self._messages_deleted_total = None
        self._job_duration_seconds = None
        self._batch_duration_seconds = None
        self._base_attributes = {
            "job_name": "messages_cleanup",
            "dry_run": str(dry_run).lower(),
            "window_mode": "between" if has_window else "before_cutoff",
            "task_label": task_label,
        }
        self._init_instruments()

    def _init_instruments(self) -> None:
        """Create counters/histograms when OTEL is enabled; log and continue on failure."""
        if not dify_config.ENABLE_OTEL:
            return

        try:
            # Imported lazily so the module still loads when OTEL is disabled.
            from opentelemetry.metrics import get_meter

            meter = get_meter("messages_cleanup", version=dify_config.project.version)
            self._job_runs_total = meter.create_counter(
                "messages_cleanup_jobs_total",
                description="Total number of expired message cleanup jobs by status.",
                unit="{job}",
            )
            self._batches_total = meter.create_counter(
                "messages_cleanup_batches_total",
                description="Total number of message cleanup batches processed.",
                unit="{batch}",
            )
            self._messages_scanned_total = meter.create_counter(
                "messages_cleanup_scanned_messages_total",
                description="Total messages scanned by cleanup jobs.",
                unit="{message}",
            )
            self._messages_filtered_total = meter.create_counter(
                "messages_cleanup_filtered_messages_total",
                description="Total messages selected by cleanup policy.",
                unit="{message}",
            )
            self._messages_deleted_total = meter.create_counter(
                "messages_cleanup_deleted_messages_total",
                description="Total messages deleted by cleanup jobs.",
                unit="{message}",
            )
            self._job_duration_seconds = meter.create_histogram(
                "messages_cleanup_job_duration_seconds",
                description="Duration of expired message cleanup jobs in seconds.",
                unit="s",
            )
            self._batch_duration_seconds = meter.create_histogram(
                "messages_cleanup_batch_duration_seconds",
                description="Duration of expired message cleanup batch processing in seconds.",
                unit="s",
            )
        except Exception:
            logger.exception("messages_cleanup_metrics: failed to initialize instruments")

    def _attrs(self, **extra: str) -> dict[str, str]:
        """Return a fresh attribute dict: the base attributes overlaid with ``extra``."""
        return {**self._base_attributes, **extra}

    @staticmethod
    def _add(counter: "Counter | None", value: int, attributes: dict[str, str]) -> None:
        """Add ``value`` to ``counter``; skip missing instruments and non-positive values."""
        # Compare against None explicitly: an instrument object must not be
        # skipped just because it happens to evaluate as falsy.
        if counter is None or value <= 0:
            return
        try:
            counter.add(value, attributes)
        except Exception:
            logger.exception("messages_cleanup_metrics: failed to add counter value")

    @staticmethod
    def _record(histogram: "Histogram | None", value: float, attributes: dict[str, str]) -> None:
        """Record ``value`` on ``histogram`` when the instrument exists."""
        if histogram is None:
            return
        try:
            histogram.record(value, attributes)
        except Exception:
            logger.exception("messages_cleanup_metrics: failed to record histogram value")

    def record_batch(
        self,
        *,
        scanned_messages: int,
        filtered_messages: int,
        deleted_messages: int,
        batch_duration_seconds: float,
    ) -> None:
        """Emit one batch data point: batch count, per-stage message counts and duration."""
        attributes = self._attrs()
        self._add(self._batches_total, 1, attributes)
        self._add(self._messages_scanned_total, scanned_messages, attributes)
        self._add(self._messages_filtered_total, filtered_messages, attributes)
        self._add(self._messages_deleted_total, deleted_messages, attributes)
        self._record(self._batch_duration_seconds, batch_duration_seconds, attributes)

    def record_completion(self, *, status: str, job_duration_seconds: float) -> None:
        """Emit the job-level data point, tagged with the final ``status``."""
        attributes = self._attrs(status=status)
        self._add(self._job_runs_total, 1, attributes)
        self._record(self._job_duration_seconds, job_duration_seconds, attributes)
|
||||||
|
|
||||||
|
|
||||||
class MessagesCleanService:
|
class MessagesCleanService:
|
||||||
"""
|
"""
|
||||||
Service for cleaning expired messages based on retention policies.
|
Service for cleaning expired messages based on retention policies.
|
||||||
|
|
@ -48,6 +173,7 @@ class MessagesCleanService:
|
||||||
start_from: datetime.datetime | None = None,
|
start_from: datetime.datetime | None = None,
|
||||||
batch_size: int = 1000,
|
batch_size: int = 1000,
|
||||||
dry_run: bool = False,
|
dry_run: bool = False,
|
||||||
|
task_label: str = "custom",
|
||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Initialize the service with cleanup parameters.
|
Initialize the service with cleanup parameters.
|
||||||
|
|
@ -58,12 +184,18 @@ class MessagesCleanService:
|
||||||
start_from: Optional start time (inclusive) of the range
|
start_from: Optional start time (inclusive) of the range
|
||||||
batch_size: Number of messages to process per batch
|
batch_size: Number of messages to process per batch
|
||||||
dry_run: Whether to perform a dry run (no actual deletion)
|
dry_run: Whether to perform a dry run (no actual deletion)
|
||||||
|
task_label: Optional task label for retention metrics
|
||||||
"""
|
"""
|
||||||
self._policy = policy
|
self._policy = policy
|
||||||
self._end_before = end_before
|
self._end_before = end_before
|
||||||
self._start_from = start_from
|
self._start_from = start_from
|
||||||
self._batch_size = batch_size
|
self._batch_size = batch_size
|
||||||
self._dry_run = dry_run
|
self._dry_run = dry_run
|
||||||
|
self._metrics = MessagesCleanupMetrics(
|
||||||
|
dry_run=dry_run,
|
||||||
|
has_window=bool(start_from),
|
||||||
|
task_label=task_label,
|
||||||
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_time_range(
|
def from_time_range(
|
||||||
|
|
@ -73,6 +205,7 @@ class MessagesCleanService:
|
||||||
end_before: datetime.datetime,
|
end_before: datetime.datetime,
|
||||||
batch_size: int = 1000,
|
batch_size: int = 1000,
|
||||||
dry_run: bool = False,
|
dry_run: bool = False,
|
||||||
|
task_label: str = "custom",
|
||||||
) -> "MessagesCleanService":
|
) -> "MessagesCleanService":
|
||||||
"""
|
"""
|
||||||
Create a service instance for cleaning messages within a specific time range.
|
Create a service instance for cleaning messages within a specific time range.
|
||||||
|
|
@ -85,6 +218,7 @@ class MessagesCleanService:
|
||||||
end_before: End time (exclusive) of the range
|
end_before: End time (exclusive) of the range
|
||||||
batch_size: Number of messages to process per batch
|
batch_size: Number of messages to process per batch
|
||||||
dry_run: Whether to perform a dry run (no actual deletion)
|
dry_run: Whether to perform a dry run (no actual deletion)
|
||||||
|
task_label: Optional task label for retention metrics
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
MessagesCleanService instance
|
MessagesCleanService instance
|
||||||
|
|
@ -112,6 +246,7 @@ class MessagesCleanService:
|
||||||
start_from=start_from,
|
start_from=start_from,
|
||||||
batch_size=batch_size,
|
batch_size=batch_size,
|
||||||
dry_run=dry_run,
|
dry_run=dry_run,
|
||||||
|
task_label=task_label,
|
||||||
)
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|
@ -121,6 +256,7 @@ class MessagesCleanService:
|
||||||
days: int = 30,
|
days: int = 30,
|
||||||
batch_size: int = 1000,
|
batch_size: int = 1000,
|
||||||
dry_run: bool = False,
|
dry_run: bool = False,
|
||||||
|
task_label: str = "custom",
|
||||||
) -> "MessagesCleanService":
|
) -> "MessagesCleanService":
|
||||||
"""
|
"""
|
||||||
Create a service instance for cleaning messages older than specified days.
|
Create a service instance for cleaning messages older than specified days.
|
||||||
|
|
@ -130,6 +266,7 @@ class MessagesCleanService:
|
||||||
days: Number of days to look back from now
|
days: Number of days to look back from now
|
||||||
batch_size: Number of messages to process per batch
|
batch_size: Number of messages to process per batch
|
||||||
dry_run: Whether to perform a dry run (no actual deletion)
|
dry_run: Whether to perform a dry run (no actual deletion)
|
||||||
|
task_label: Optional task label for retention metrics
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
MessagesCleanService instance
|
MessagesCleanService instance
|
||||||
|
|
@ -153,7 +290,14 @@ class MessagesCleanService:
|
||||||
policy.__class__.__name__,
|
policy.__class__.__name__,
|
||||||
)
|
)
|
||||||
|
|
||||||
return cls(policy=policy, end_before=end_before, start_from=None, batch_size=batch_size, dry_run=dry_run)
|
return cls(
|
||||||
|
policy=policy,
|
||||||
|
end_before=end_before,
|
||||||
|
start_from=None,
|
||||||
|
batch_size=batch_size,
|
||||||
|
dry_run=dry_run,
|
||||||
|
task_label=task_label,
|
||||||
|
)
|
||||||
|
|
||||||
def run(self) -> dict[str, int]:
|
def run(self) -> dict[str, int]:
|
||||||
"""
|
"""
|
||||||
|
|
@ -162,7 +306,18 @@ class MessagesCleanService:
|
||||||
Returns:
|
Returns:
|
||||||
Dict with statistics: batches, filtered_messages, total_deleted
|
Dict with statistics: batches, filtered_messages, total_deleted
|
||||||
"""
|
"""
|
||||||
|
status = "success"
|
||||||
|
run_start = time.monotonic()
|
||||||
|
try:
|
||||||
return self._clean_messages_by_time_range()
|
return self._clean_messages_by_time_range()
|
||||||
|
except Exception:
|
||||||
|
status = "failed"
|
||||||
|
raise
|
||||||
|
finally:
|
||||||
|
self._metrics.record_completion(
|
||||||
|
status=status,
|
||||||
|
job_duration_seconds=time.monotonic() - run_start,
|
||||||
|
)
|
||||||
|
|
||||||
def _clean_messages_by_time_range(self) -> dict[str, int]:
|
def _clean_messages_by_time_range(self) -> dict[str, int]:
|
||||||
"""
|
"""
|
||||||
|
|
@ -197,11 +352,14 @@ class MessagesCleanService:
|
||||||
self._end_before,
|
self._end_before,
|
||||||
)
|
)
|
||||||
|
|
||||||
max_batch_interval_ms = int(os.environ.get("SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL", 200))
|
max_batch_interval_ms = dify_config.SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
stats["batches"] += 1
|
stats["batches"] += 1
|
||||||
batch_start = time.monotonic()
|
batch_start = time.monotonic()
|
||||||
|
batch_scanned_messages = 0
|
||||||
|
batch_filtered_messages = 0
|
||||||
|
batch_deleted_messages = 0
|
||||||
|
|
||||||
# Step 1: Fetch a batch of messages using cursor
|
# Step 1: Fetch a batch of messages using cursor
|
||||||
with Session(db.engine, expire_on_commit=False) as session:
|
with Session(db.engine, expire_on_commit=False) as session:
|
||||||
|
|
@ -240,9 +398,16 @@ class MessagesCleanService:
|
||||||
|
|
||||||
# Track total messages fetched across all batches
|
# Track total messages fetched across all batches
|
||||||
stats["total_messages"] += len(messages)
|
stats["total_messages"] += len(messages)
|
||||||
|
batch_scanned_messages = len(messages)
|
||||||
|
|
||||||
if not messages:
|
if not messages:
|
||||||
logger.info("clean_messages (batch %s): no more messages to process", stats["batches"])
|
logger.info("clean_messages (batch %s): no more messages to process", stats["batches"])
|
||||||
|
self._metrics.record_batch(
|
||||||
|
scanned_messages=batch_scanned_messages,
|
||||||
|
filtered_messages=batch_filtered_messages,
|
||||||
|
deleted_messages=batch_deleted_messages,
|
||||||
|
batch_duration_seconds=time.monotonic() - batch_start,
|
||||||
|
)
|
||||||
break
|
break
|
||||||
|
|
||||||
# Update cursor to the last message's (created_at, id)
|
# Update cursor to the last message's (created_at, id)
|
||||||
|
|
@ -268,6 +433,12 @@ class MessagesCleanService:
|
||||||
|
|
||||||
if not apps:
|
if not apps:
|
||||||
logger.info("clean_messages (batch %s): no apps found, skip", stats["batches"])
|
logger.info("clean_messages (batch %s): no apps found, skip", stats["batches"])
|
||||||
|
self._metrics.record_batch(
|
||||||
|
scanned_messages=batch_scanned_messages,
|
||||||
|
filtered_messages=batch_filtered_messages,
|
||||||
|
deleted_messages=batch_deleted_messages,
|
||||||
|
batch_duration_seconds=time.monotonic() - batch_start,
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Build app_id -> tenant_id mapping
|
# Build app_id -> tenant_id mapping
|
||||||
|
|
@ -286,9 +457,16 @@ class MessagesCleanService:
|
||||||
|
|
||||||
if not message_ids_to_delete:
|
if not message_ids_to_delete:
|
||||||
logger.info("clean_messages (batch %s): no messages to delete, skip", stats["batches"])
|
logger.info("clean_messages (batch %s): no messages to delete, skip", stats["batches"])
|
||||||
|
self._metrics.record_batch(
|
||||||
|
scanned_messages=batch_scanned_messages,
|
||||||
|
filtered_messages=batch_filtered_messages,
|
||||||
|
deleted_messages=batch_deleted_messages,
|
||||||
|
batch_duration_seconds=time.monotonic() - batch_start,
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
stats["filtered_messages"] += len(message_ids_to_delete)
|
stats["filtered_messages"] += len(message_ids_to_delete)
|
||||||
|
batch_filtered_messages = len(message_ids_to_delete)
|
||||||
|
|
||||||
# Step 4: Batch delete messages and their relations
|
# Step 4: Batch delete messages and their relations
|
||||||
if not self._dry_run:
|
if not self._dry_run:
|
||||||
|
|
@ -309,6 +487,7 @@ class MessagesCleanService:
|
||||||
commit_ms = int((time.monotonic() - commit_start) * 1000)
|
commit_ms = int((time.monotonic() - commit_start) * 1000)
|
||||||
|
|
||||||
stats["total_deleted"] += messages_deleted
|
stats["total_deleted"] += messages_deleted
|
||||||
|
batch_deleted_messages = messages_deleted
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"clean_messages (batch %s): processed %s messages, deleted %s messages",
|
"clean_messages (batch %s): processed %s messages, deleted %s messages",
|
||||||
|
|
@ -343,6 +522,13 @@ class MessagesCleanService:
|
||||||
for msg_id in sampled_ids:
|
for msg_id in sampled_ids:
|
||||||
logger.info("clean_messages (batch %s, dry_run) sample: message_id=%s", stats["batches"], msg_id)
|
logger.info("clean_messages (batch %s, dry_run) sample: message_id=%s", stats["batches"], msg_id)
|
||||||
|
|
||||||
|
self._metrics.record_batch(
|
||||||
|
scanned_messages=batch_scanned_messages,
|
||||||
|
filtered_messages=batch_filtered_messages,
|
||||||
|
deleted_messages=batch_deleted_messages,
|
||||||
|
batch_duration_seconds=time.monotonic() - batch_start,
|
||||||
|
)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"clean_messages completed: total batches: %s, total messages: %s, filtered messages: %s, total deleted: %s",
|
"clean_messages completed: total batches: %s, total messages: %s, filtered messages: %s, total deleted: %s",
|
||||||
stats["batches"],
|
stats["batches"],
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,9 @@
|
||||||
import datetime
|
import datetime
|
||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
from collections.abc import Iterable, Sequence
|
from collections.abc import Iterable, Sequence
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
import click
|
import click
|
||||||
from sqlalchemy.orm import Session, sessionmaker
|
from sqlalchemy.orm import Session, sessionmaker
|
||||||
|
|
@ -20,6 +20,159 @@ from services.billing_service import BillingService, SubscriptionPlan
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from opentelemetry.metrics import Counter, Histogram
|
||||||
|
|
||||||
|
|
||||||
|
class WorkflowRunCleanupMetrics:
|
||||||
|
"""
|
||||||
|
Records low-cardinality OpenTelemetry metrics for workflow run cleanup jobs.
|
||||||
|
|
||||||
|
Metrics are emitted with stable labels only (dry_run/window_mode/task_label/status)
|
||||||
|
to keep dashboard and alert cardinality predictable in production clusters.
|
||||||
|
"""
|
||||||
|
|
||||||
|
_job_runs_total: "Counter | None"
|
||||||
|
_batches_total: "Counter | None"
|
||||||
|
_runs_scanned_total: "Counter | None"
|
||||||
|
_runs_targeted_total: "Counter | None"
|
||||||
|
_runs_deleted_total: "Counter | None"
|
||||||
|
_runs_skipped_total: "Counter | None"
|
||||||
|
_related_records_total: "Counter | None"
|
||||||
|
_job_duration_seconds: "Histogram | None"
|
||||||
|
_batch_duration_seconds: "Histogram | None"
|
||||||
|
_base_attributes: dict[str, str]
|
||||||
|
|
||||||
|
def __init__(self, *, dry_run: bool, has_window: bool, task_label: str) -> None:
|
||||||
|
self._job_runs_total = None
|
||||||
|
self._batches_total = None
|
||||||
|
self._runs_scanned_total = None
|
||||||
|
self._runs_targeted_total = None
|
||||||
|
self._runs_deleted_total = None
|
||||||
|
self._runs_skipped_total = None
|
||||||
|
self._related_records_total = None
|
||||||
|
self._job_duration_seconds = None
|
||||||
|
self._batch_duration_seconds = None
|
||||||
|
self._base_attributes = {
|
||||||
|
"job_name": "workflow_run_cleanup",
|
||||||
|
"dry_run": str(dry_run).lower(),
|
||||||
|
"window_mode": "between" if has_window else "before_cutoff",
|
||||||
|
"task_label": task_label,
|
||||||
|
}
|
||||||
|
self._init_instruments()
|
||||||
|
|
||||||
|
def _init_instruments(self) -> None:
|
||||||
|
if not dify_config.ENABLE_OTEL:
|
||||||
|
return
|
||||||
|
|
||||||
|
try:
|
||||||
|
from opentelemetry.metrics import get_meter
|
||||||
|
|
||||||
|
meter = get_meter("workflow_run_cleanup", version=dify_config.project.version)
|
||||||
|
self._job_runs_total = meter.create_counter(
|
||||||
|
"workflow_run_cleanup_jobs_total",
|
||||||
|
description="Total number of workflow run cleanup jobs by status.",
|
||||||
|
unit="{job}",
|
||||||
|
)
|
||||||
|
self._batches_total = meter.create_counter(
|
||||||
|
"workflow_run_cleanup_batches_total",
|
||||||
|
description="Total number of processed cleanup batches.",
|
||||||
|
unit="{batch}",
|
||||||
|
)
|
||||||
|
self._runs_scanned_total = meter.create_counter(
|
||||||
|
"workflow_run_cleanup_scanned_runs_total",
|
||||||
|
description="Total workflow runs scanned by cleanup jobs.",
|
||||||
|
unit="{run}",
|
||||||
|
)
|
||||||
|
self._runs_targeted_total = meter.create_counter(
|
||||||
|
"workflow_run_cleanup_targeted_runs_total",
|
||||||
|
description="Total workflow runs targeted by cleanup policy.",
|
||||||
|
unit="{run}",
|
||||||
|
)
|
||||||
|
self._runs_deleted_total = meter.create_counter(
|
||||||
|
"workflow_run_cleanup_deleted_runs_total",
|
||||||
|
description="Total workflow runs deleted by cleanup jobs.",
|
||||||
|
unit="{run}",
|
||||||
|
)
|
||||||
|
self._runs_skipped_total = meter.create_counter(
|
||||||
|
"workflow_run_cleanup_skipped_runs_total",
|
||||||
|
description="Total workflow runs skipped because tenant is paid/unknown.",
|
||||||
|
unit="{run}",
|
||||||
|
)
|
||||||
|
self._related_records_total = meter.create_counter(
|
||||||
|
"workflow_run_cleanup_related_records_total",
|
||||||
|
description="Total related records processed by cleanup jobs.",
|
||||||
|
unit="{record}",
|
||||||
|
)
|
||||||
|
self._job_duration_seconds = meter.create_histogram(
|
||||||
|
"workflow_run_cleanup_job_duration_seconds",
|
||||||
|
description="Duration of workflow run cleanup jobs in seconds.",
|
||||||
|
unit="s",
|
||||||
|
)
|
||||||
|
self._batch_duration_seconds = meter.create_histogram(
|
||||||
|
"workflow_run_cleanup_batch_duration_seconds",
|
||||||
|
description="Duration of workflow run cleanup batch processing in seconds.",
|
||||||
|
unit="s",
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
logger.exception("workflow_run_cleanup_metrics: failed to initialize instruments")
|
||||||
|
|
||||||
|
def _attrs(self, **extra: str) -> dict[str, str]:
|
||||||
|
return {**self._base_attributes, **extra}
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _add(counter: "Counter | None", value: int, attributes: dict[str, str]) -> None:
|
||||||
|
if not counter or value <= 0:
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
counter.add(value, attributes)
|
||||||
|
except Exception:
|
||||||
|
logger.exception("workflow_run_cleanup_metrics: failed to add counter value")
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _record(histogram: "Histogram | None", value: float, attributes: dict[str, str]) -> None:
|
||||||
|
if not histogram:
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
histogram.record(value, attributes)
|
||||||
|
except Exception:
|
||||||
|
logger.exception("workflow_run_cleanup_metrics: failed to record histogram value")
|
||||||
|
|
||||||
|
def record_batch(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
batch_rows: int,
|
||||||
|
targeted_runs: int,
|
||||||
|
skipped_runs: int,
|
||||||
|
deleted_runs: int,
|
||||||
|
related_counts: dict[str, int] | None,
|
||||||
|
related_action: str | None,
|
||||||
|
batch_duration_seconds: float,
|
||||||
|
) -> None:
|
||||||
|
attributes = self._attrs()
|
||||||
|
self._add(self._batches_total, 1, attributes)
|
||||||
|
self._add(self._runs_scanned_total, batch_rows, attributes)
|
||||||
|
self._add(self._runs_targeted_total, targeted_runs, attributes)
|
||||||
|
self._add(self._runs_skipped_total, skipped_runs, attributes)
|
||||||
|
self._add(self._runs_deleted_total, deleted_runs, attributes)
|
||||||
|
self._record(self._batch_duration_seconds, batch_duration_seconds, attributes)
|
||||||
|
|
||||||
|
if not related_counts or not related_action:
|
||||||
|
return
|
||||||
|
|
||||||
|
for record_type, count in related_counts.items():
|
||||||
|
self._add(
|
||||||
|
self._related_records_total,
|
||||||
|
count,
|
||||||
|
self._attrs(action=related_action, record_type=record_type),
|
||||||
|
)
|
||||||
|
|
||||||
|
def record_completion(self, *, status: str, job_duration_seconds: float) -> None:
    """Emit the end-of-job metrics: one job run and its total duration.

    Both measurements carry the base attributes plus ``status``.
    """
    tagged = self._attrs(status=status)
    self._add(self._job_runs_total, 1, tagged)
    self._record(self._job_duration_seconds, job_duration_seconds, tagged)
|
||||||
|
|
||||||
|
|
||||||
class WorkflowRunCleanup:
|
class WorkflowRunCleanup:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|
@ -29,6 +182,7 @@ class WorkflowRunCleanup:
|
||||||
end_before: datetime.datetime | None = None,
|
end_before: datetime.datetime | None = None,
|
||||||
workflow_run_repo: APIWorkflowRunRepository | None = None,
|
workflow_run_repo: APIWorkflowRunRepository | None = None,
|
||||||
dry_run: bool = False,
|
dry_run: bool = False,
|
||||||
|
task_label: str = "custom",
|
||||||
):
|
):
|
||||||
if (start_from is None) ^ (end_before is None):
|
if (start_from is None) ^ (end_before is None):
|
||||||
raise ValueError("start_from and end_before must be both set or both omitted.")
|
raise ValueError("start_from and end_before must be both set or both omitted.")
|
||||||
|
|
@ -46,6 +200,11 @@ class WorkflowRunCleanup:
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
self._cleanup_whitelist: set[str] | None = None
|
self._cleanup_whitelist: set[str] | None = None
|
||||||
self.dry_run = dry_run
|
self.dry_run = dry_run
|
||||||
|
self._metrics = WorkflowRunCleanupMetrics(
|
||||||
|
dry_run=dry_run,
|
||||||
|
has_window=bool(start_from),
|
||||||
|
task_label=task_label,
|
||||||
|
)
|
||||||
self.free_plan_grace_period_days = dify_config.SANDBOX_EXPIRED_RECORDS_CLEAN_GRACEFUL_PERIOD
|
self.free_plan_grace_period_days = dify_config.SANDBOX_EXPIRED_RECORDS_CLEAN_GRACEFUL_PERIOD
|
||||||
self.workflow_run_repo: APIWorkflowRunRepository
|
self.workflow_run_repo: APIWorkflowRunRepository
|
||||||
if workflow_run_repo:
|
if workflow_run_repo:
|
||||||
|
|
@ -74,9 +233,11 @@ class WorkflowRunCleanup:
|
||||||
related_totals = self._empty_related_counts() if self.dry_run else None
|
related_totals = self._empty_related_counts() if self.dry_run else None
|
||||||
batch_index = 0
|
batch_index = 0
|
||||||
last_seen: tuple[datetime.datetime, str] | None = None
|
last_seen: tuple[datetime.datetime, str] | None = None
|
||||||
|
status = "success"
|
||||||
|
run_start = time.monotonic()
|
||||||
|
max_batch_interval_ms = dify_config.SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL
|
||||||
|
|
||||||
max_batch_interval_ms = int(os.environ.get("SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL", 200))
|
try:
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
batch_start = time.monotonic()
|
batch_start = time.monotonic()
|
||||||
|
|
||||||
|
|
@ -125,6 +286,15 @@ class WorkflowRunCleanup:
|
||||||
fg="yellow",
|
fg="yellow",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
self._metrics.record_batch(
|
||||||
|
batch_rows=len(run_rows),
|
||||||
|
targeted_runs=0,
|
||||||
|
skipped_runs=paid_or_skipped,
|
||||||
|
deleted_runs=0,
|
||||||
|
related_counts=None,
|
||||||
|
related_action=None,
|
||||||
|
batch_duration_seconds=time.monotonic() - batch_start,
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
total_runs_targeted += len(free_runs)
|
total_runs_targeted += len(free_runs)
|
||||||
|
|
@ -157,6 +327,15 @@ class WorkflowRunCleanup:
|
||||||
batch_index,
|
batch_index,
|
||||||
int((time.monotonic() - batch_start) * 1000),
|
int((time.monotonic() - batch_start) * 1000),
|
||||||
)
|
)
|
||||||
|
self._metrics.record_batch(
|
||||||
|
batch_rows=len(run_rows),
|
||||||
|
targeted_runs=len(free_runs),
|
||||||
|
skipped_runs=paid_or_skipped,
|
||||||
|
deleted_runs=0,
|
||||||
|
related_counts={key: batch_counts.get(key, 0) for key in self._empty_related_counts()},
|
||||||
|
related_action="would_delete",
|
||||||
|
batch_duration_seconds=time.monotonic() - batch_start,
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
@ -188,6 +367,15 @@ class WorkflowRunCleanup:
|
||||||
delete_ms,
|
delete_ms,
|
||||||
int((time.monotonic() - batch_start) * 1000),
|
int((time.monotonic() - batch_start) * 1000),
|
||||||
)
|
)
|
||||||
|
self._metrics.record_batch(
|
||||||
|
batch_rows=len(run_rows),
|
||||||
|
targeted_runs=len(free_runs),
|
||||||
|
skipped_runs=paid_or_skipped,
|
||||||
|
deleted_runs=counts["runs"],
|
||||||
|
related_counts={key: counts.get(key, 0) for key in self._empty_related_counts()},
|
||||||
|
related_action="deleted",
|
||||||
|
batch_duration_seconds=time.monotonic() - batch_start,
|
||||||
|
)
|
||||||
|
|
||||||
# Random sleep between batches to avoid overwhelming the database
|
# Random sleep between batches to avoid overwhelming the database
|
||||||
sleep_ms = random.uniform(0, max_batch_interval_ms) # noqa: S311
|
sleep_ms = random.uniform(0, max_batch_interval_ms) # noqa: S311
|
||||||
|
|
@ -206,7 +394,9 @@ class WorkflowRunCleanup:
|
||||||
f"before {self.window_end.isoformat()}"
|
f"before {self.window_end.isoformat()}"
|
||||||
)
|
)
|
||||||
if related_totals is not None:
|
if related_totals is not None:
|
||||||
summary_message = f"{summary_message}; related records: {self._format_related_counts(related_totals)}"
|
summary_message = (
|
||||||
|
f"{summary_message}; related records: {self._format_related_counts(related_totals)}"
|
||||||
|
)
|
||||||
summary_color = "yellow"
|
summary_color = "yellow"
|
||||||
else:
|
else:
|
||||||
if self.window_start:
|
if self.window_start:
|
||||||
|
|
@ -216,11 +406,20 @@ class WorkflowRunCleanup:
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
summary_message = (
|
summary_message = (
|
||||||
f"Cleanup complete. Deleted {total_runs_deleted} workflow runs before {self.window_end.isoformat()}"
|
f"Cleanup complete. Deleted {total_runs_deleted} workflow runs "
|
||||||
|
f"before {self.window_end.isoformat()}"
|
||||||
)
|
)
|
||||||
summary_color = "white"
|
summary_color = "white"
|
||||||
|
|
||||||
click.echo(click.style(summary_message, fg=summary_color))
|
click.echo(click.style(summary_message, fg=summary_color))
|
||||||
|
except Exception:
|
||||||
|
status = "failed"
|
||||||
|
raise
|
||||||
|
finally:
|
||||||
|
self._metrics.record_completion(
|
||||||
|
status=status,
|
||||||
|
job_duration_seconds=time.monotonic() - run_start,
|
||||||
|
)
|
||||||
|
|
||||||
def _filter_free_tenants(self, tenant_ids: Iterable[str]) -> set[str]:
|
def _filter_free_tenants(self, tenant_ids: Iterable[str]) -> set[str]:
|
||||||
tenant_id_list = list(tenant_ids)
|
tenant_id_list = list(tenant_ids)
|
||||||
|
|
|
||||||
|
|
@ -46,6 +46,7 @@ def test_absolute_mode_calls_from_time_range():
|
||||||
end_before=end_before,
|
end_before=end_before,
|
||||||
batch_size=200,
|
batch_size=200,
|
||||||
dry_run=True,
|
dry_run=True,
|
||||||
|
task_label="custom",
|
||||||
)
|
)
|
||||||
mock_from_days.assert_not_called()
|
mock_from_days.assert_not_called()
|
||||||
|
|
||||||
|
|
@ -74,6 +75,7 @@ def test_relative_mode_before_days_only_calls_from_days():
|
||||||
days=30,
|
days=30,
|
||||||
batch_size=500,
|
batch_size=500,
|
||||||
dry_run=False,
|
dry_run=False,
|
||||||
|
task_label="before-30",
|
||||||
)
|
)
|
||||||
mock_from_time_range.assert_not_called()
|
mock_from_time_range.assert_not_called()
|
||||||
|
|
||||||
|
|
@ -105,6 +107,7 @@ def test_relative_mode_with_from_days_ago_calls_from_time_range():
|
||||||
end_before=fixed_now - datetime.timedelta(days=30),
|
end_before=fixed_now - datetime.timedelta(days=30),
|
||||||
batch_size=1000,
|
batch_size=1000,
|
||||||
dry_run=False,
|
dry_run=False,
|
||||||
|
task_label="60to30",
|
||||||
)
|
)
|
||||||
mock_from_days.assert_not_called()
|
mock_from_days.assert_not_called()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,4 @@
|
||||||
import datetime
|
import datetime
|
||||||
import os
|
|
||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
@ -282,7 +281,6 @@ class TestMessagesCleanService:
|
||||||
MessagesCleanService._batch_delete_message_relations(mock_db_session, ["msg1", "msg2"])
|
MessagesCleanService._batch_delete_message_relations(mock_db_session, ["msg1", "msg2"])
|
||||||
assert mock_db_session.execute.call_count == 8 # 8 tables to clean up
|
assert mock_db_session.execute.call_count == 8 # 8 tables to clean up
|
||||||
|
|
||||||
@patch.dict(os.environ, {"SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL": "500"})
|
|
||||||
def test_clean_messages_interval_from_env(self, mock_db_session, mock_policy):
|
def test_clean_messages_interval_from_env(self, mock_db_session, mock_policy):
|
||||||
service = MessagesCleanService(
|
service = MessagesCleanService(
|
||||||
policy=mock_policy,
|
policy=mock_policy,
|
||||||
|
|
@ -301,6 +299,10 @@ class TestMessagesCleanService:
|
||||||
mock_db_session.execute.side_effect = mock_returns
|
mock_db_session.execute.side_effect = mock_returns
|
||||||
mock_policy.filter_message_ids.return_value = ["msg1"]
|
mock_policy.filter_message_ids.return_value = ["msg1"]
|
||||||
|
|
||||||
|
with patch(
|
||||||
|
"services.retention.conversation.messages_clean_service.dify_config.SANDBOX_EXPIRED_RECORDS_CLEAN_BATCH_MAX_INTERVAL",
|
||||||
|
500,
|
||||||
|
):
|
||||||
with patch("services.retention.conversation.messages_clean_service.time.sleep") as mock_sleep:
|
with patch("services.retention.conversation.messages_clean_service.time.sleep") as mock_sleep:
|
||||||
with patch("services.retention.conversation.messages_clean_service.random.uniform") as mock_uniform:
|
with patch("services.retention.conversation.messages_clean_service.random.uniform") as mock_uniform:
|
||||||
mock_uniform.return_value = 300.0
|
mock_uniform.return_value = 300.0
|
||||||
|
|
|
||||||
|
|
@ -80,7 +80,13 @@ class TestWorkflowRunCleanupInit:
|
||||||
cfg.SANDBOX_EXPIRED_RECORDS_CLEAN_GRACEFUL_PERIOD = 0
|
cfg.SANDBOX_EXPIRED_RECORDS_CLEAN_GRACEFUL_PERIOD = 0
|
||||||
cfg.BILLING_ENABLED = False
|
cfg.BILLING_ENABLED = False
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
WorkflowRunCleanup(days=30, batch_size=10, start_from=dt, end_before=dt, workflow_run_repo=mock_repo)
|
WorkflowRunCleanup(
|
||||||
|
days=30,
|
||||||
|
batch_size=10,
|
||||||
|
start_from=dt,
|
||||||
|
end_before=dt,
|
||||||
|
workflow_run_repo=mock_repo,
|
||||||
|
)
|
||||||
|
|
||||||
def test_zero_batch_size_raises(self, mock_repo):
|
def test_zero_batch_size_raises(self, mock_repo):
|
||||||
with patch("services.retention.workflow_run.clear_free_plan_expired_workflow_run_logs.dify_config") as cfg:
|
with patch("services.retention.workflow_run.clear_free_plan_expired_workflow_run_logs.dify_config") as cfg:
|
||||||
|
|
@ -102,10 +108,24 @@ class TestWorkflowRunCleanupInit:
|
||||||
cfg.BILLING_ENABLED = False
|
cfg.BILLING_ENABLED = False
|
||||||
start = datetime.datetime(2024, 1, 1)
|
start = datetime.datetime(2024, 1, 1)
|
||||||
end = datetime.datetime(2024, 6, 1)
|
end = datetime.datetime(2024, 6, 1)
|
||||||
c = WorkflowRunCleanup(days=30, batch_size=5, start_from=start, end_before=end, workflow_run_repo=mock_repo)
|
c = WorkflowRunCleanup(
|
||||||
|
days=30,
|
||||||
|
batch_size=5,
|
||||||
|
start_from=start,
|
||||||
|
end_before=end,
|
||||||
|
workflow_run_repo=mock_repo,
|
||||||
|
)
|
||||||
assert c.window_start == start
|
assert c.window_start == start
|
||||||
assert c.window_end == end
|
assert c.window_end == end
|
||||||
|
|
||||||
|
def test_default_task_label_is_custom(self, mock_repo):
    """Omitting ``task_label`` must tag the cleanup metrics with ``"custom"``."""
    config_target = "services.retention.workflow_run.clear_free_plan_expired_workflow_run_logs.dify_config"
    with patch(config_target) as mocked_config:
        mocked_config.SANDBOX_EXPIRED_RECORDS_CLEAN_GRACEFUL_PERIOD = 0
        mocked_config.BILLING_ENABLED = False
        cleanup = WorkflowRunCleanup(days=30, batch_size=10, workflow_run_repo=mock_repo)

    base_attributes = cleanup._metrics._base_attributes
    assert base_attributes["task_label"] == "custom"
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# _empty_related_counts / _format_related_counts
|
# _empty_related_counts / _format_related_counts
|
||||||
|
|
@ -393,7 +413,12 @@ class TestRunDryRunMode:
|
||||||
with patch("services.retention.workflow_run.clear_free_plan_expired_workflow_run_logs.dify_config") as cfg:
|
with patch("services.retention.workflow_run.clear_free_plan_expired_workflow_run_logs.dify_config") as cfg:
|
||||||
cfg.SANDBOX_EXPIRED_RECORDS_CLEAN_GRACEFUL_PERIOD = 0
|
cfg.SANDBOX_EXPIRED_RECORDS_CLEAN_GRACEFUL_PERIOD = 0
|
||||||
cfg.BILLING_ENABLED = False
|
cfg.BILLING_ENABLED = False
|
||||||
return WorkflowRunCleanup(days=30, batch_size=10, workflow_run_repo=mock_repo, dry_run=True)
|
return WorkflowRunCleanup(
|
||||||
|
days=30,
|
||||||
|
batch_size=10,
|
||||||
|
workflow_run_repo=mock_repo,
|
||||||
|
dry_run=True,
|
||||||
|
)
|
||||||
|
|
||||||
def test_dry_run_no_delete_called(self, mock_repo):
|
def test_dry_run_no_delete_called(self, mock_repo):
|
||||||
run = make_run("t1")
|
run = make_run("t1")
|
||||||
|
|
|
||||||
|
|
@ -265,6 +265,61 @@ def test_run_exits_on_empty_batch(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||||
cleanup.run()
|
cleanup.run()
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_records_metrics_on_success(monkeypatch: pytest.MonkeyPatch) -> None:
    """A successful run reports one batch and one ``success`` completion."""
    now = datetime.datetime.now()
    related_delete_result = {
        "runs": 0,
        "node_executions": 2,
        "offloads": 1,
        "app_logs": 3,
        "trigger_logs": 4,
        "pauses": 5,
        "pause_reasons": 6,
    }
    repo = FakeRepo(
        batches=[[FakeRun("run-free", "t_free", now)]],
        delete_result=related_delete_result,
    )
    cleanup = create_cleanup(monkeypatch, repo=repo, days=30, batch_size=10)
    monkeypatch.setattr(cleanup_module.dify_config, "BILLING_ENABLED", False)

    # Capture the keyword arguments of every metrics call instead of emitting real metrics.
    recorded_batches: list[dict[str, object]] = []
    recorded_completions: list[dict[str, object]] = []
    monkeypatch.setattr(cleanup._metrics, "record_batch", lambda **kwargs: recorded_batches.append(kwargs))
    monkeypatch.setattr(cleanup._metrics, "record_completion", lambda **kwargs: recorded_completions.append(kwargs))

    cleanup.run()

    assert len(recorded_batches) == 1
    first_batch = recorded_batches[0]
    assert first_batch["batch_rows"] == 1
    assert first_batch["targeted_runs"] == 1
    assert first_batch["deleted_runs"] == 1
    assert first_batch["related_action"] == "deleted"
    assert len(recorded_completions) == 1
    assert recorded_completions[0]["status"] == "success"
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_records_failed_metrics(monkeypatch: pytest.MonkeyPatch) -> None:
    """When deletion raises, the error propagates and one ``failed`` completion is reported."""

    class FailingRepo(FakeRepo):
        # Simulates a repository whose delete step always blows up.
        def delete_runs_with_related(
            self, runs: list[FakeRun], delete_node_executions=None, delete_trigger_logs=None
        ) -> dict[str, int]:
            raise RuntimeError("delete failed")

    now = datetime.datetime.now()
    failing_repo = FailingRepo(batches=[[FakeRun("run-free", "t_free", now)]])
    cleanup = create_cleanup(monkeypatch, repo=failing_repo, days=30, batch_size=10)
    monkeypatch.setattr(cleanup_module.dify_config, "BILLING_ENABLED", False)

    recorded_completions: list[dict[str, object]] = []
    monkeypatch.setattr(cleanup._metrics, "record_completion", lambda **kwargs: recorded_completions.append(kwargs))

    with pytest.raises(RuntimeError, match="delete failed"):
        cleanup.run()

    assert len(recorded_completions) == 1
    assert recorded_completions[0]["status"] == "failed"
|
||||||
|
|
||||||
|
|
||||||
def test_run_dry_run_skips_deletions(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None:
|
def test_run_dry_run_skips_deletions(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None:
|
||||||
cutoff = datetime.datetime.now()
|
cutoff = datetime.datetime.now()
|
||||||
repo = FakeRepo(
|
repo = FakeRepo(
|
||||||
|
|
|
||||||
|
|
@ -540,6 +540,20 @@ class TestMessagesCleanServiceFromTimeRange:
|
||||||
assert service._batch_size == 1000 # default
|
assert service._batch_size == 1000 # default
|
||||||
assert service._dry_run is False # default
|
assert service._dry_run is False # default
|
||||||
|
|
||||||
|
def test_explicit_task_label(self):
    """An explicit ``task_label`` given to ``from_time_range`` ends up in the metric attributes."""
    window_start = datetime.datetime(2024, 1, 1)
    window_end = datetime.datetime(2024, 1, 2)

    service = MessagesCleanService.from_time_range(
        policy=BillingDisabledPolicy(),
        start_from=window_start,
        end_before=window_end,
        task_label="60to30",
    )

    base_attributes = service._metrics._base_attributes
    assert base_attributes["task_label"] == "60to30"
|
||||||
|
|
||||||
|
|
||||||
class TestMessagesCleanServiceFromDays:
|
class TestMessagesCleanServiceFromDays:
|
||||||
"""Unit tests for MessagesCleanService.from_days factory method."""
|
"""Unit tests for MessagesCleanService.from_days factory method."""
|
||||||
|
|
@ -619,3 +633,54 @@ class TestMessagesCleanServiceFromDays:
|
||||||
assert service._end_before == expected_end_before
|
assert service._end_before == expected_end_before
|
||||||
assert service._batch_size == 1000 # default
|
assert service._batch_size == 1000 # default
|
||||||
assert service._dry_run is False # default
|
assert service._dry_run is False # default
|
||||||
|
assert service._metrics._base_attributes["task_label"] == "custom"
|
||||||
|
|
||||||
|
|
||||||
|
class TestMessagesCleanServiceRun:
    """Unit tests for MessagesCleanService.run instrumentation behavior."""

    @staticmethod
    def _make_service() -> "MessagesCleanService":
        """Build a non-dry-run service over a fixed one-day window."""
        return MessagesCleanService(
            policy=BillingDisabledPolicy(),
            start_from=datetime.datetime(2024, 1, 1),
            end_before=datetime.datetime(2024, 1, 2),
            batch_size=100,
            dry_run=False,
        )

    def test_run_records_completion_metrics_on_success(self):
        # Arrange
        service = self._make_service()
        expected_stats = {
            "batches": 1,
            "total_messages": 10,
            "filtered_messages": 5,
            "total_deleted": 5,
        }
        service._clean_messages_by_time_range = MagicMock(return_value=expected_stats)  # type: ignore[method-assign]
        recorded: list[dict[str, object]] = []
        service._metrics.record_completion = lambda **kwargs: recorded.append(kwargs)  # type: ignore[method-assign]

        # Act
        stats = service.run()

        # Assert
        assert stats == expected_stats
        assert len(recorded) == 1
        assert recorded[0]["status"] == "success"

    def test_run_records_completion_metrics_on_failure(self):
        # Arrange
        service = self._make_service()
        service._clean_messages_by_time_range = MagicMock(side_effect=RuntimeError("clean failed"))  # type: ignore[method-assign]
        recorded: list[dict[str, object]] = []
        service._metrics.record_completion = lambda **kwargs: recorded.append(kwargs)  # type: ignore[method-assign]

        # Act & Assert
        with pytest.raises(RuntimeError, match="clean failed"):
            service.run()
        assert len(recorded) == 1
        assert recorded[0]["status"] == "failed"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue