mirror of https://github.com/langgenius/dify.git
feat: add export app messages (#32990)
This commit is contained in:
parent
c016793efb
commit
05ab107e73
|
|
@ -2668,3 +2668,77 @@ def clean_expired_messages(
|
||||||
raise
|
raise
|
||||||
|
|
||||||
click.echo(click.style("messages cleanup completed.", fg="green"))
|
click.echo(click.style("messages cleanup completed.", fg="green"))
|
||||||
|
|
||||||
|
|
||||||
|
@click.command("export-app-messages", help="Export messages for an app to JSONL.GZ.")
@click.option("--app-id", required=True, help="Application ID to export messages for.")
@click.option(
    "--start-from",
    type=click.DateTime(formats=["%Y-%m-%d", "%Y-%m-%dT%H:%M:%S"]),
    default=None,
    help="Optional lower bound (inclusive) for created_at.",
)
@click.option(
    "--end-before",
    type=click.DateTime(formats=["%Y-%m-%d", "%Y-%m-%dT%H:%M:%S"]),
    required=True,
    help="Upper bound (exclusive) for created_at.",
)
@click.option(
    "--filename",
    required=True,
    help="Base filename (relative path). Do not include suffix like .jsonl.gz.",
)
@click.option("--use-cloud-storage", is_flag=True, default=False, help="Upload to cloud storage instead of local file.")
@click.option("--batch-size", default=1000, show_default=True, help="Batch size for cursor pagination.")
@click.option("--dry-run", is_flag=True, default=False, help="Scan only, print stats without writing any file.")
def export_app_messages(
    app_id: str,
    start_from: datetime.datetime | None,
    end_before: datetime.datetime,
    filename: str,
    use_cloud_storage: bool,
    batch_size: int,
    dry_run: bool,
):
    """Export an app's messages (with user feedbacks) to a gzipped JSONL file.

    Validates CLI arguments, delegates the actual export to
    ``AppMessageExportService``, and prints colored progress/summary output.
    Re-raises any export failure after logging it so the process exits non-zero.
    """
    # Fail fast on an empty/inverted time window before touching the DB.
    if start_from and start_from >= end_before:
        raise click.UsageError("--start-from must be before --end-before.")

    # Local import: the service module is only needed when this command runs.
    from services.retention.conversation.message_export_service import AppMessageExportService

    # Surface filename-validation problems as a CLI parameter error rather
    # than a raw ValueError traceback.
    try:
        validated_filename = AppMessageExportService.validate_export_filename(filename)
    except ValueError as e:
        raise click.BadParameter(str(e), param_hint="--filename") from e

    click.echo(click.style(f"export_app_messages: starting export for app {app_id}.", fg="green"))
    start_at = time.perf_counter()

    try:
        service = AppMessageExportService(
            app_id=app_id,
            end_before=end_before,
            filename=validated_filename,
            start_from=start_from,
            batch_size=batch_size,
            use_cloud_storage=use_cloud_storage,
            dry_run=dry_run,
        )
        stats = service.run()

        elapsed = time.perf_counter() - start_at
        click.echo(
            click.style(
                f"export_app_messages: completed in {elapsed:.2f}s\n"
                f" - Batches: {stats.batches}\n"
                f" - Total messages: {stats.total_messages}\n"
                f" - Messages with feedback: {stats.messages_with_feedback}\n"
                f" - Total feedbacks: {stats.total_feedbacks}",
                fg="green",
            )
        )
    except Exception as e:
        # Log with traceback, tell the operator how long we ran, then
        # re-raise so the CLI exits with a failure status.
        elapsed = time.perf_counter() - start_at
        logger.exception("export_app_messages failed")
        click.echo(click.style(f"export_app_messages: failed after {elapsed:.2f}s - {e}", fg="red"))
        raise
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,7 @@ def init_app(app: DifyApp):
|
||||||
convert_to_agent_apps,
|
convert_to_agent_apps,
|
||||||
create_tenant,
|
create_tenant,
|
||||||
delete_archived_workflow_runs,
|
delete_archived_workflow_runs,
|
||||||
|
export_app_messages,
|
||||||
extract_plugins,
|
extract_plugins,
|
||||||
extract_unique_plugins,
|
extract_unique_plugins,
|
||||||
file_usage,
|
file_usage,
|
||||||
|
|
@ -66,6 +67,7 @@ def init_app(app: DifyApp):
|
||||||
restore_workflow_runs,
|
restore_workflow_runs,
|
||||||
clean_workflow_runs,
|
clean_workflow_runs,
|
||||||
clean_expired_messages,
|
clean_expired_messages,
|
||||||
|
export_app_messages,
|
||||||
]
|
]
|
||||||
for cmd in cmds_to_register:
|
for cmd in cmds_to_register:
|
||||||
app.cli.add_command(cmd)
|
app.cli.add_command(cmd)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,304 @@
|
||||||
|
"""
|
||||||
|
Export app messages to JSONL.GZ format.
|
||||||
|
|
||||||
|
Outputs: conversation_id, message_id, query, answer, inputs (raw JSON),
|
||||||
|
retriever_resources (from message_metadata), feedback (user feedbacks array).
|
||||||
|
|
||||||
|
Uses (created_at, id) cursor pagination and batch-loads feedbacks to avoid N+1.
|
||||||
|
Does NOT touch Message.inputs / Message.user_feedback properties.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import datetime
|
||||||
|
import gzip
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import tempfile
|
||||||
|
from collections import defaultdict
|
||||||
|
from collections.abc import Generator, Iterable
|
||||||
|
from pathlib import Path, PurePosixPath
|
||||||
|
from typing import Any, BinaryIO, cast
|
||||||
|
|
||||||
|
import orjson
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from pydantic import BaseModel, ConfigDict, Field
|
||||||
|
from sqlalchemy import select, tuple_
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from extensions.ext_database import db
|
||||||
|
from extensions.ext_storage import storage
|
||||||
|
from models.model import Message, MessageFeedback
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
MAX_FILENAME_BASE_LENGTH = 1024
|
||||||
|
FORBIDDEN_FILENAME_SUFFIXES = (".jsonl.gz", ".jsonl", ".gz")
|
||||||
|
|
||||||
|
|
||||||
|
class AppMessageExportFeedback(BaseModel):
    """One user feedback entry attached to an exported message.

    Built from ``MessageFeedback.to_dict()`` output; ``extra="forbid"`` makes
    any unexpected key from the ORM dict fail loudly instead of being dropped.
    Timestamps are carried as strings (as produced by ``to_dict()``).
    """

    id: str
    app_id: str
    conversation_id: str
    message_id: str
    rating: str  # e.g. "like" / "dislike" as stored on MessageFeedback
    content: str | None = None
    from_source: str  # export only selects "user"-sourced feedbacks
    from_end_user_id: str | None = None
    from_account_id: str | None = None
    created_at: str
    updated_at: str

    model_config = ConfigDict(extra="forbid")
|
||||||
|
|
||||||
|
|
||||||
|
class AppMessageExportRecord(BaseModel):
    """One JSONL line of the export: a message plus its user feedbacks.

    ``inputs`` is the raw inputs JSON as stored on the row (no property
    post-processing); ``retriever_resources`` comes from the message's
    metadata JSON when present.
    """

    conversation_id: str
    message_id: str
    query: str
    answer: str
    inputs: dict[str, Any]
    retriever_resources: list[Any] = Field(default_factory=list)
    feedback: list[AppMessageExportFeedback] = Field(default_factory=list)

    model_config = ConfigDict(extra="forbid")
|
||||||
|
|
||||||
|
|
||||||
|
class AppMessageExportStats(BaseModel):
    """Mutable counters accumulated during an export run.

    ``batches`` is filled in at the end of a run (derived from
    ``total_messages`` and the batch size), not counted per fetch.
    """

    batches: int = 0
    total_messages: int = 0
    messages_with_feedback: int = 0
    total_feedbacks: int = 0

    model_config = ConfigDict(extra="forbid")
|
||||||
|
|
||||||
|
|
||||||
|
class AppMessageExportService:
    """Exports an app's messages (and their user feedbacks) to JSONL.GZ.

    Streams messages with (created_at, id) keyset pagination, batch-loads
    feedbacks per page to avoid N+1 queries, and writes either to a local
    file or to cloud storage. A dry run only iterates and counts.
    """

    @staticmethod
    def validate_export_filename(filename: str) -> str:
        """Validate and return the base export filename.

        Accepts only a non-empty relative POSIX-style path without the
        ``.jsonl``/``.gz`` suffixes (they are appended later), without
        backslashes, empty segments, control characters, or ``.``/``..``
        traversal segments, and no longer than MAX_FILENAME_BASE_LENGTH.
        Raises ValueError with a CLI-flavored message on any violation.
        """
        normalized = filename.strip()
        if not normalized:
            raise ValueError("--filename must not be empty.")

        normalized_lower = normalized.lower()
        # Suffixes are added by output_gz_name/output_jsonl_name; reject
        # inputs that already carry them (case-insensitively).
        if normalized_lower.endswith(FORBIDDEN_FILENAME_SUFFIXES):
            raise ValueError("--filename must not include .jsonl.gz/.jsonl/.gz suffix; pass base filename only.")

        if normalized.startswith("/"):
            raise ValueError("--filename must be a relative path; absolute paths are not allowed.")

        if "\\" in normalized:
            raise ValueError("--filename must use '/' as path separator; '\\' is not allowed.")

        if "//" in normalized:
            raise ValueError("--filename must not contain empty path segments ('//').")

        if len(normalized) > MAX_FILENAME_BASE_LENGTH:
            raise ValueError(f"--filename is too long; max length is {MAX_FILENAME_BASE_LENGTH}.")

        # Reject NUL, ASCII control characters (< 32) and DEL (127).
        for ch in normalized:
            if ch == "\x00" or ord(ch) < 32 or ord(ch) == 127:
                raise ValueError("--filename must not contain control characters or NUL.")

        parts = PurePosixPath(normalized).parts
        if not parts:
            raise ValueError("--filename must include a file name.")

        if any(part in (".", "..") for part in parts):
            raise ValueError("--filename must not contain '.' or '..' path segments.")

        return normalized

    @property
    def output_gz_name(self) -> str:
        """Final gzip output name/key: the validated base plus ``.jsonl.gz``."""
        return f"{self._filename_base}.jsonl.gz"

    @property
    def output_jsonl_name(self) -> str:
        """Uncompressed counterpart name: the validated base plus ``.jsonl``."""
        return f"{self._filename_base}.jsonl"

    def __init__(
        self,
        app_id: str,
        end_before: datetime.datetime,
        filename: str,
        *,
        start_from: datetime.datetime | None = None,
        batch_size: int = 1000,
        use_cloud_storage: bool = False,
        dry_run: bool = False,
    ) -> None:
        """Configure an export; validates the time window and the filename.

        Raises ValueError when start_from >= end_before or when the
        filename fails validate_export_filename.
        """
        if start_from and start_from >= end_before:
            raise ValueError(f"start_from ({start_from}) must be before end_before ({end_before})")

        self._app_id = app_id
        self._end_before = end_before
        self._start_from = start_from
        # Validation happens here too, so programmatic callers (not just the
        # CLI) cannot construct the service with an unsafe filename.
        self._filename_base = self.validate_export_filename(filename)
        self._batch_size = batch_size
        self._use_cloud_storage = use_cloud_storage
        self._dry_run = dry_run

    def run(self) -> AppMessageExportStats:
        """Execute the export (or dry-run scan) and return the stats."""
        stats = AppMessageExportStats()

        logger.info(
            "export_app_messages: app_id=%s, start_from=%s, end_before=%s, dry_run=%s, cloud=%s, output_gz=%s",
            self._app_id,
            self._start_from,
            self._end_before,
            self._dry_run,
            self._use_cloud_storage,
            self.output_gz_name,
        )

        if self._dry_run:
            # Drain the generator purely for its stats side effects; no file
            # is written in dry-run mode.
            for _ in self._iter_records_with_stats(stats):
                pass
            self._finalize_stats(stats)
            return stats

        if self._use_cloud_storage:
            self._export_to_cloud(stats)
        else:
            self._export_to_local(stats)

        self._finalize_stats(stats)
        return stats

    def iter_records(self) -> Generator[AppMessageExportRecord, None, None]:
        """Yield export records one by one, flattening the paginated batches."""
        for batch in self._iter_record_batches():
            yield from batch

    @staticmethod
    def write_jsonl_gz(records: Iterable[AppMessageExportRecord], fileobj: BinaryIO) -> None:
        """Serialize records as gzip-compressed JSONL into *fileobj*.

        One orjson-encoded object per line; the GzipFile wrapper is closed
        but the underlying *fileobj* is left open for the caller.
        """
        with gzip.GzipFile(fileobj=fileobj, mode="wb") as gz:
            for record in records:
                gz.write(orjson.dumps(record.model_dump(mode="json")) + b"\n")

    def _export_to_local(self, stats: AppMessageExportStats) -> None:
        """Write the gzip output under the current working directory."""
        output_path = Path.cwd() / self.output_gz_name
        # The base filename may contain subdirectories; create them as needed.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with output_path.open("wb") as output_file:
            self.write_jsonl_gz(self._iter_records_with_stats(stats), output_file)

    def _export_to_cloud(self, stats: AppMessageExportStats) -> None:
        """Build the gzip payload in a spooled temp file, then upload it.

        NOTE(review): the finished payload is read fully into memory before
        storage.save — presumably the storage API requires bytes; confirm
        for very large exports.
        """
        with tempfile.SpooledTemporaryFile(max_size=64 * 1024 * 1024) as tmp:
            self.write_jsonl_gz(self._iter_records_with_stats(stats), cast(BinaryIO, tmp))
            tmp.seek(0)
            data = tmp.read()

        storage.save(self.output_gz_name, data)
        logger.info("export_app_messages: uploaded %d bytes to cloud key=%s", len(data), self.output_gz_name)

    def _iter_records_with_stats(self, stats: AppMessageExportStats) -> Generator[AppMessageExportRecord, None, None]:
        """Pass records through while updating *stats* in place."""
        for record in self.iter_records():
            self._update_stats(stats, record)
            yield record

    @staticmethod
    def _update_stats(stats: AppMessageExportStats, record: AppMessageExportRecord) -> None:
        """Count one record (and its feedbacks, if any) into *stats*."""
        stats.total_messages += 1
        if record.feedback:
            stats.messages_with_feedback += 1
            stats.total_feedbacks += len(record.feedback)

    def _finalize_stats(self, stats: AppMessageExportStats) -> None:
        """Derive the batch count from the totals (ceiling division).

        Batches are computed arithmetically here rather than counted during
        iteration, so the figure reflects full pages of ``_batch_size``.
        """
        if stats.total_messages == 0:
            stats.batches = 0
            return
        stats.batches = (stats.total_messages + self._batch_size - 1) // self._batch_size

    def _iter_record_batches(self) -> Generator[list[AppMessageExportRecord], None, None]:
        """Yield lists of records, one per DB page, until a fetch is empty."""
        cursor: tuple[datetime.datetime, str] | None = None
        while True:
            rows, cursor = self._fetch_batch(cursor)
            if not rows:
                break

            # Batch-load feedbacks for the whole page to avoid N+1 queries.
            message_ids = [str(row.id) for row in rows]
            feedbacks_map = self._fetch_feedbacks(message_ids)
            yield [self._build_record(row, feedbacks_map) for row in rows]

    def _fetch_batch(
        self, cursor: tuple[datetime.datetime, str] | None
    ) -> tuple[list[Any], tuple[datetime.datetime, str] | None]:
        """Fetch one page of message rows after *cursor*.

        Returns ``(rows, next_cursor)``; rows is empty when the window is
        exhausted. Only the columns needed for the export are selected, and
        the raw ``_inputs`` column is read directly so the ``Message.inputs``
        property is never triggered.
        """
        with Session(db.engine, expire_on_commit=False) as session:
            stmt = (
                select(
                    Message.id,
                    Message.conversation_id,
                    Message.query,
                    Message.answer,
                    Message._inputs,  # pyright: ignore[reportPrivateUsage]
                    Message.message_metadata,
                    Message.created_at,
                )
                .where(
                    Message.app_id == self._app_id,
                    Message.created_at < self._end_before,
                )
                .order_by(Message.created_at, Message.id)
                .limit(self._batch_size)
            )

            if self._start_from:
                stmt = stmt.where(Message.created_at >= self._start_from)

            if cursor:
                # Keyset pagination: strictly after the last (created_at, id)
                # seen, using a row-value comparison. Explicit literal types
                # keep the tuple comparison well-typed on the SQL side.
                stmt = stmt.where(
                    tuple_(Message.created_at, Message.id)
                    > tuple_(
                        sa.literal(cursor[0], type_=sa.DateTime()),
                        sa.literal(cursor[1], type_=Message.id.type),
                    )
                )

            rows = list(session.execute(stmt).all())

            if not rows:
                return [], cursor

            last = rows[-1]
            return rows, (last.created_at, last.id)

    def _fetch_feedbacks(self, message_ids: list[str]) -> dict[str, list[AppMessageExportFeedback]]:
        """Load user-sourced feedbacks for *message_ids*, grouped by message.

        Admin feedbacks are excluded (``from_source == "user"``); results are
        ordered by (message_id, created_at) so each message's feedback list
        is chronological.
        """
        if not message_ids:
            return {}

        with Session(db.engine, expire_on_commit=False) as session:
            stmt = (
                select(MessageFeedback)
                .where(
                    MessageFeedback.message_id.in_(message_ids),
                    MessageFeedback.from_source == "user",
                )
                .order_by(MessageFeedback.message_id, MessageFeedback.created_at)
            )
            feedbacks = list(session.scalars(stmt).all())

        result: dict[str, list[AppMessageExportFeedback]] = defaultdict(list)
        for feedback in feedbacks:
            result[str(feedback.message_id)].append(AppMessageExportFeedback.model_validate(feedback.to_dict()))
        return result

    @staticmethod
    def _build_record(row: Any, feedbacks_map: dict[str, list[AppMessageExportFeedback]]) -> AppMessageExportRecord:
        """Assemble one export record from a row and the preloaded feedbacks.

        Malformed or non-dict message_metadata is tolerated: retriever
        resources silently default to an empty list.
        """
        retriever_resources: list[Any] = []
        if row.message_metadata:
            try:
                metadata = json.loads(row.message_metadata)
                value = metadata.get("retriever_resources", [])
                if isinstance(value, list):
                    retriever_resources = value
            except (json.JSONDecodeError, TypeError):
                pass

        message_id = str(row.id)
        return AppMessageExportRecord(
            conversation_id=str(row.conversation_id),
            message_id=message_id,
            query=row.query,
            answer=row.answer,
            # Raw column value; anything that isn't a dict becomes {}.
            inputs=row._inputs if isinstance(row._inputs, dict) else {},
            retriever_resources=retriever_resources,
            feedback=feedbacks_map.get(message_id, []),
        )
|
||||||
|
|
@ -0,0 +1,233 @@
|
||||||
|
import datetime
|
||||||
|
import json
|
||||||
|
import uuid
|
||||||
|
from decimal import Decimal
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from models.account import Account, Tenant, TenantAccountJoin, TenantAccountRole
|
||||||
|
from models.model import (
|
||||||
|
App,
|
||||||
|
AppAnnotationHitHistory,
|
||||||
|
Conversation,
|
||||||
|
DatasetRetrieverResource,
|
||||||
|
Message,
|
||||||
|
MessageAgentThought,
|
||||||
|
MessageAnnotation,
|
||||||
|
MessageChain,
|
||||||
|
MessageFeedback,
|
||||||
|
MessageFile,
|
||||||
|
)
|
||||||
|
from models.web import SavedMessage
|
||||||
|
from services.retention.conversation.message_export_service import AppMessageExportService, AppMessageExportStats
|
||||||
|
|
||||||
|
|
||||||
|
class TestAppMessageExportServiceIntegration:
    """Integration tests for AppMessageExportService against a real database.

    Uses the ``db_session_with_containers`` fixture (containerized DB) and
    builds a minimal tenant/account/app/conversation graph per test.
    """

    @pytest.fixture(autouse=True)
    def cleanup_database(self, db_session_with_containers: Session):
        """Wipe all rows created by a test, children before parents.

        Deletion order matters: message-dependent tables first, then
        messages, conversations, apps, and finally tenant/account rows.
        """
        yield
        db_session_with_containers.query(DatasetRetrieverResource).delete()
        db_session_with_containers.query(AppAnnotationHitHistory).delete()
        db_session_with_containers.query(SavedMessage).delete()
        db_session_with_containers.query(MessageFile).delete()
        db_session_with_containers.query(MessageAgentThought).delete()
        db_session_with_containers.query(MessageChain).delete()
        db_session_with_containers.query(MessageAnnotation).delete()
        db_session_with_containers.query(MessageFeedback).delete()
        db_session_with_containers.query(Message).delete()
        db_session_with_containers.query(Conversation).delete()
        db_session_with_containers.query(App).delete()
        db_session_with_containers.query(TenantAccountJoin).delete()
        db_session_with_containers.query(Tenant).delete()
        db_session_with_containers.query(Account).delete()
        db_session_with_containers.commit()

    @staticmethod
    def _create_app_context(session: Session) -> tuple[App, Conversation]:
        """Create and commit an account, tenant, app, and one conversation."""
        account = Account(
            email=f"test-{uuid.uuid4()}@example.com",
            name="tester",
            interface_language="en-US",
            status="active",
        )
        session.add(account)
        session.flush()

        tenant = Tenant(name=f"tenant-{uuid.uuid4()}", status="normal")
        session.add(tenant)
        session.flush()

        join = TenantAccountJoin(
            tenant_id=tenant.id,
            account_id=account.id,
            role=TenantAccountRole.OWNER,
            current=True,
        )
        session.add(join)
        session.flush()

        app = App(
            tenant_id=tenant.id,
            name="export-app",
            description="integration test app",
            mode="chat",
            enable_site=True,
            enable_api=True,
            api_rpm=60,
            api_rph=3600,
            is_demo=False,
            is_public=False,
            created_by=account.id,
            updated_by=account.id,
        )
        session.add(app)
        session.flush()

        conversation = Conversation(
            app_id=app.id,
            app_model_config_id=str(uuid.uuid4()),
            model_provider="openai",
            model_id="gpt-4o-mini",
            mode="chat",
            name="conv",
            inputs={"seed": 1},
            status="normal",
            from_source="api",
            from_end_user_id=str(uuid.uuid4()),
        )
        session.add(conversation)
        session.commit()
        return app, conversation

    @staticmethod
    def _create_message(
        session: Session,
        app: App,
        conversation: Conversation,
        created_at: datetime.datetime,
        *,
        query: str,
        answer: str,
        inputs: dict,
        message_metadata: str | None,
    ) -> Message:
        """Create (and flush) one Message row with fixed token/price filler."""
        message = Message(
            app_id=app.id,
            conversation_id=conversation.id,
            model_provider="openai",
            model_id="gpt-4o-mini",
            inputs=inputs,
            query=query,
            answer=answer,
            message=[{"role": "assistant", "content": answer}],
            message_tokens=10,
            message_unit_price=Decimal("0.001"),
            answer_tokens=20,
            answer_unit_price=Decimal("0.002"),
            total_price=Decimal("0.003"),
            currency="USD",
            message_metadata=message_metadata,
            from_source="api",
            from_end_user_id=conversation.from_end_user_id,
            created_at=created_at,
        )
        session.add(message)
        session.flush()
        return message

    def test_iter_records_with_stats(self, db_session_with_containers: Session):
        """End-to-end dry-run: ordering, inputs passthrough, retriever
        resources extraction, user-only feedback filtering, and stats.
        """
        app, conversation = self._create_app_context(db_session_with_containers)

        first_inputs = {
            "plain": "v1",
            "nested": {"a": 1, "b": [1, {"x": True}]},
            "list": ["x", 2, {"y": "z"}],
        }
        second_inputs = {"other": "value", "items": [1, 2, 3]}

        base_time = datetime.datetime(2026, 2, 25, 10, 0, 0)
        first_message = self._create_message(
            db_session_with_containers,
            app,
            conversation,
            created_at=base_time,
            query="q1",
            answer="a1",
            inputs=first_inputs,
            message_metadata=json.dumps({"retriever_resources": [{"dataset_id": "ds-1"}]}),
        )
        second_message = self._create_message(
            db_session_with_containers,
            app,
            conversation,
            created_at=base_time + datetime.timedelta(minutes=1),
            query="q2",
            answer="a2",
            inputs=second_inputs,
            message_metadata=None,
        )

        # Two user feedbacks on the first message plus one admin feedback
        # that the export must filter out.
        user_feedback_1 = MessageFeedback(
            app_id=app.id,
            conversation_id=conversation.id,
            message_id=first_message.id,
            rating="like",
            from_source="user",
            content="first",
            from_end_user_id=conversation.from_end_user_id,
        )
        user_feedback_2 = MessageFeedback(
            app_id=app.id,
            conversation_id=conversation.id,
            message_id=first_message.id,
            rating="dislike",
            from_source="user",
            content="second",
            from_end_user_id=conversation.from_end_user_id,
        )
        admin_feedback = MessageFeedback(
            app_id=app.id,
            conversation_id=conversation.id,
            message_id=first_message.id,
            rating="like",
            from_source="admin",
            content="should-be-filtered",
            from_account_id=str(uuid.uuid4()),
        )
        db_session_with_containers.add_all([user_feedback_1, user_feedback_2, admin_feedback])
        # created_at assigned after add_all so the chronological feedback
        # ordering asserted below is deterministic.
        user_feedback_1.created_at = base_time + datetime.timedelta(minutes=2)
        user_feedback_2.created_at = base_time + datetime.timedelta(minutes=3)
        admin_feedback.created_at = base_time + datetime.timedelta(minutes=4)
        db_session_with_containers.commit()

        # batch_size=1 forces one page per message, exercising the cursor.
        service = AppMessageExportService(
            app_id=app.id,
            start_from=base_time - datetime.timedelta(minutes=1),
            end_before=base_time + datetime.timedelta(minutes=10),
            filename="unused",
            batch_size=1,
            dry_run=True,
        )
        stats = AppMessageExportStats()
        records = list(service._iter_records_with_stats(stats))
        service._finalize_stats(stats)

        assert len(records) == 2
        assert records[0].message_id == first_message.id
        assert records[1].message_id == second_message.id

        # Raw inputs pass through unchanged (no property post-processing).
        assert records[0].inputs == first_inputs
        assert records[1].inputs == second_inputs

        assert records[0].retriever_resources == [{"dataset_id": "ds-1"}]
        assert records[1].retriever_resources == []

        # Only user feedbacks, in created_at order; admin one filtered out.
        assert [feedback.rating for feedback in records[0].feedback] == ["like", "dislike"]
        assert [feedback.content for feedback in records[0].feedback] == ["first", "second"]
        assert records[1].feedback == []

        assert stats.batches == 2
        assert stats.total_messages == 2
        assert stats.messages_with_feedback == 1
        assert stats.total_feedbacks == 2
|
||||||
|
|
@ -0,0 +1,43 @@
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from services.retention.conversation.message_export_service import AppMessageExportService
|
||||||
|
|
||||||
|
|
||||||
|
def test_validate_export_filename_accepts_relative_path():
    """A well-formed relative base path passes validation and is returned unchanged."""
    candidate = "exports/2026/test01"
    validated = AppMessageExportService.validate_export_filename(candidate)
    assert validated == candidate
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "filename",
    [
        "test01.jsonl.gz",  # forbidden suffix: full output suffix
        "test01.jsonl",  # forbidden suffix: .jsonl
        "test01.gz",  # forbidden suffix: .gz
        "/tmp/test01",  # absolute path
        "exports/../test01",  # parent-directory traversal segment
        "bad\x00name",  # NUL byte
        "bad\tname",  # ASCII control character
        "a" * 1025,  # exceeds the length limit (presumably 1024 — see service constant)
    ],
)
def test_validate_export_filename_rejects_invalid_values(filename: str):
    """Each invalid base filename must be rejected with a ValueError."""
    with pytest.raises(ValueError):
        AppMessageExportService.validate_export_filename(filename)
|
||||||
|
|
||||||
|
|
||||||
|
def test_service_derives_output_names_from_filename_base():
    """The constructor stores the validated base name and both output-name
    properties are derived from it by appending the fixed suffixes."""
    base = "exports/2026/test01"
    svc = AppMessageExportService(
        app_id="736b9b03-20f2-4697-91da-8d00f6325900",
        start_from=None,
        end_before=datetime.datetime(2026, 3, 1),
        filename=base,
        batch_size=1000,
        use_cloud_storage=True,
        dry_run=True,
    )

    assert svc._filename_base == base
    assert svc.output_gz_name == base + ".jsonl.gz"
    assert svc.output_jsonl_name == base + ".jsonl"
|
||||||
Loading…
Reference in New Issue