// Provides defined Message types for formatting data assigned to
// protobuf.Value fields in AI Platform.
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1beta1.schema;

import "google/cloud/aiplatform/v1beta1/schema/annotation_spec_color.proto";
import "google/cloud/aiplatform/v1beta1/schema/geometry.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/struct.proto";
import "google/protobuf/wrappers.proto";
import "google/api/annotations.proto";

option go_package = "google.golang.org/genproto/googleapis/cloud/aiplatform/v1beta1/schema;schema";
option java_multiple_files = true;
option java_outer_classname = "IoFormatProto";
option java_package = "com.google.cloud.aiplatform.v1beta1.schema";

// Prediction input format for Image Classification.
message ImageClassificationPredictionInstance {
  // The image bytes or GCS URI to make the prediction on.
  string content = 1;

  // The MIME type of the content of the image. Only the images in below listed
  // MIME types are supported.
  // - image/jpeg
  // - image/gif
  // - image/png
  // - image/webp
  // - image/bmp
  // - image/tiff
  // - image/vnd.microsoft.icon
  string mime_type = 2;
}

// Prediction input format for Image Object Detection.
message ImageObjectDetectionPredictionInstance {
  // The image bytes or GCS URI to make the prediction on.
  string content = 1;

  // The MIME type of the content of the image. Only the images in below listed
  // MIME types are supported.
  // - image/jpeg
  // - image/gif
  // - image/png
  // - image/webp
  // - image/bmp
  // - image/tiff
  // - image/vnd.microsoft.icon
  string mime_type = 2;
}

// Prediction input format for Image Segmentation.
message ImageSegmentationPredictionInstance {
  // The image bytes to make the predictions on.
  string content = 1;

  // The MIME type of the content of the image. Only the images in below listed
  // MIME types are supported.
  // - image/jpeg
  // - image/png
  string mime_type = 2;
}

// Prediction input format for Video Classification.
message VideoClassificationPredictionInstance {
  // The Google Cloud Storage location of the video on which to perform the
  // prediction.
  string content = 1;

  // The MIME type of the content of the video. Only the following are
  // supported: video/mp4 video/avi video/quicktime
  string mime_type = 2;

  // The beginning, inclusive, of the video's time segment on which to perform
  // the prediction. Expressed as a number of seconds as measured from the
  // start of the video, with "s" appended at the end. Fractions are allowed,
  // up to a microsecond precision.
  string time_segment_start = 3;

  // The end, exclusive, of the video's time segment on which to perform
  // the prediction. Expressed as a number of seconds as measured from the
  // start of the video, with "s" appended at the end. Fractions are allowed,
  // up to a microsecond precision, and "Infinity" is allowed, which means the
  // end of the video.
  string time_segment_end = 4;
}

// Prediction input format for Video Object Tracking.
message VideoObjectTrackingPredictionInstance {
  // The Google Cloud Storage location of the video on which to perform the
  // prediction.
  string content = 1;

  // The MIME type of the content of the video. Only the following are
  // supported: video/mp4 video/avi video/quicktime
  string mime_type = 2;

  // The beginning, inclusive, of the video's time segment on which to perform
  // the prediction. Expressed as a number of seconds as measured from the
  // start of the video, with "s" appended at the end. Fractions are allowed,
  // up to a microsecond precision.
  string time_segment_start = 3;

  // The end, exclusive, of the video's time segment on which to perform
  // the prediction. Expressed as a number of seconds as measured from the
  // start of the video, with "s" appended at the end. Fractions are allowed,
  // up to a microsecond precision, and "Infinity" is allowed, which means the
  // end of the video.
  string time_segment_end = 4;
}

// Prediction input format for Video Action Recognition.
message VideoActionRecognitionPredictionInstance {
  // The Google Cloud Storage location of the video on which to perform the
  // prediction.
  string content = 1;

  // The MIME type of the content of the video. Only the following are
  // supported: video/mp4 video/avi video/quicktime
  string mime_type = 2;

  // The beginning, inclusive, of the video's time segment on which to perform
  // the prediction. Expressed as a number of seconds as measured from the
  // start of the video, with "s" appended at the end. Fractions are allowed,
  // up to a microsecond precision.
  string time_segment_start = 3;

  // The end, exclusive, of the video's time segment on which to perform
  // the prediction. Expressed as a number of seconds as measured from the
  // start of the video, with "s" appended at the end. Fractions are allowed,
  // up to a microsecond precision, and "Infinity" is allowed, which means the
  // end of the video.
  string time_segment_end = 4;
}

// Prediction input format for Text Classification.
message TextClassificationPredictionInstance {
  // The text snippet to make the predictions on.
  string content = 1;

  // The MIME type of the text snippet. The supported MIME types are listed
  // below.
  // - text/plain
  string mime_type = 2;
}

// Prediction input format for Text Sentiment.
message TextSentimentPredictionInstance {
  // The text snippet to make the predictions on.
  string content = 1;

  // The MIME type of the text snippet. The supported MIME types are listed
  // below.
  // - text/plain
  string mime_type = 2;
}

// Prediction input format for Text Extraction.
message TextExtractionPredictionInstance {
  // The text snippet to make the predictions on.
  string content = 1;

  // The MIME type of the text snippet. The supported MIME types are listed
  // below.
  // - text/plain
  string mime_type = 2;

  // This field is only used for batch prediction. If a key is provided, the
  // batch prediction result will by mapped to this key. If omitted, then the
  // batch prediction result will contain the entire input instance. AI Platform
  // will not check if keys in the request are duplicates, so it is up to the
  // caller to ensure the keys are unique.
  string key = 3;
}

// Prediction model parameters for Image Classification.
message ImageClassificationPredictionParams {
  // The Model only returns predictions with at least this confidence score.
  // Default value is 0.0
  float confidence_threshold = 1;

  // The Model only returns up to that many top, by confidence score,
  // predictions per instance. If this number is very high, the Model may return
  // fewer predictions. Default value is 10.
  int32 max_predictions = 2;
}

// Prediction model parameters for Image Object Detection.
message ImageObjectDetectionPredictionParams {
  // The Model only returns predictions with at least this confidence score.
  // Default value is 0.0
  float confidence_threshold = 1;

  // The Model only returns up to that many top, by confidence score,
  // predictions per instance. Note that number of returned predictions is also
  // limited by metadata's predictionsLimit. Default value is 10.
  int32 max_predictions = 2;
}

// Prediction model parameters for Image Segmentation.
message ImageSegmentationPredictionParams {
  // When the model predicts category of pixels of the image, it will only
  // provide predictions for pixels that it is at least this much confident
  // about. All other pixels will be classified as background. Default value is
  // 0.5.
  float confidence_threshold = 1;
}

// Prediction model parameters for Video Classification.
message VideoClassificationPredictionParams {
  // The Model only returns predictions with at least this confidence score.
  // Default value is 0.0
  float confidence_threshold = 1;

  // The Model only returns up to that many top, by confidence score,
  // predictions per instance. If this number is very high, the Model may return
  // fewer predictions. Default value is 10,000.
  int32 max_predictions = 2;

  // Set to true to request segment-level classification. AI Platform returns
  // labels and their confidence scores for the entire time segment of the
  // video that user specified in the input instance.
  // Default value is true
  bool segment_classification = 3;

  // Set to true to request shot-level classification. AI Platform determines
  // the boundaries for each camera shot in the entire time segment of the
  // video that user specified in the input instance. AI Platform then
  // returns labels and their confidence scores for each detected shot, along
  // with the start and end time of the shot.
  // WARNING: Model evaluation is not done for this classification type,
  // the quality of it depends on the training data, but there are no metrics
  // provided to describe that quality.
  // Default value is false
  bool shot_classification = 4;

  // Set to true to request classification for a video at one-second intervals.
  // AI Platform returns labels and their confidence scores for each second of
  // the entire time segment of the video that user specified in the input
  // instance.
  // WARNING: Model evaluation is not done for this classification type, the
  // quality of it depends on the training data, but there are no metrics
  // provided to describe that quality. Default value is false
  bool one_sec_interval_classification = 5;
}

// Prediction model parameters for Video Object Tracking.
message VideoObjectTrackingPredictionParams {
  // The Model only returns predictions with at least this confidence score.
  // Default value is 0.0
  float confidence_threshold = 1;

  // The model only returns up to that many top, by confidence score,
  // predictions per frame of the video. If this number is very high, the
  // Model may return fewer predictions per frame. Default value is 50.
  int32 max_predictions = 2;

  // Only bounding boxes with shortest edge at least that long as a relative
  // value of video frame size are returned. Default value is 0.0.
  float min_bounding_box_size = 3;
}

// Prediction model parameters for Video Action Recognition.
message VideoActionRecognitionPredictionParams {
  // The Model only returns predictions with at least this confidence score.
  // Default value is 0.0
  float confidence_threshold = 1;

  // The model only returns up to that many top, by confidence score,
  // predictions per frame of the video. If this number is very high, the
  // Model may return fewer predictions per frame. Default value is 50.
  int32 max_predictions = 2;
}

// Represents a line of JSONL in the batch prediction output file.
message PredictionResult {
  // Some identifier from the input so that the prediction can be mapped back to
  // the input instance.
  oneof input {
    // User's input instance.
    // Struct is used here instead of Any so that JsonFormat does not append an
    // extra "@type" field when we convert the proto to JSON.
    google.protobuf.Struct instance = 1;

    // Optional user-provided key from the input instance.
    string key = 2;
  }

  // The prediction result.
  // Value is used here instead of Any so that JsonFormat does not append an
  // extra "@type" field when we convert the proto to JSON and so we can
  // represent array of objects.
  google.protobuf.Value prediction = 3;
}

// Represents a line of JSONL in the text sentiment batch prediction output
// file. This is a hack to allow printing of integer values.
message TextSentimentPredictionResult {
  // Prediction output format for Text Sentiment.
  message Prediction {
    // The integer sentiment labels between 0 (inclusive) and sentimentMax label
    // (inclusive), while 0 maps to the least positive sentiment and
    // sentimentMax maps to the most positive one. The higher the score is, the
    // more positive the sentiment in the text snippet is. Note: sentimentMax is
    // an integer value between 1 (inclusive) and 10 (inclusive).
    int32 sentiment = 1;
  }

  // User's input instance.
  TextSentimentPredictionInstance instance = 1;

  // The prediction result.
  Prediction prediction = 2;
}

// Prediction output format for Image Classification.
message ClassificationPredictionResult {
  // The resource IDs of the AnnotationSpecs that had been identified, ordered
  // by the confidence score descendingly.
  // NOTE: ids, display_names and confidences are parallel arrays zipped by
  // index.
  repeated int64 ids = 1;

  // The display names of the AnnotationSpecs that had been identified, order
  // matches the IDs.
  repeated string display_names = 2;

  // The Model's confidences in correctness of the predicted IDs, higher value
  // means higher confidence. Order matches the Ids.
  repeated float confidences = 3;
}

// Prediction output format for Image Object Detection.
message ImageObjectDetectionPredictionResult {
  // The resource IDs of the AnnotationSpecs that had been identified, ordered
  // by the confidence score descendingly.
  repeated int64 ids = 1;

  // The display names of the AnnotationSpecs that had been identified, order
  // matches the IDs.
  repeated string display_names = 2;

  // The Model's confidences in correctness of the predicted IDs, higher value
  // means higher confidence. Order matches the Ids.
  repeated float confidences = 3;

  // Bounding boxes, i.e. the rectangles over the image, that pinpoint
  // the found AnnotationSpecs. Given in order that matches the IDs. Each
  // bounding box is an array of 4 numbers `xMin`, `xMax`, `yMin`, and
  // `yMax`, which represent the extremal coordinates of the box. They are
  // relative to the image size, and the point 0,0 is in the top left
  // of the image.
  repeated google.protobuf.ListValue bboxes = 4;
}

// Prediction output format for Video Classification.
message VideoClassificationPredictionResult {
  // The resource ID of the AnnotationSpec that had been identified.
  string id = 1;

  // The display name of the AnnotationSpec that had been identified.
  string display_name = 2;

  // The type of the prediction. The requested types can be configured
  // via parameters. This will be one of
  // - segment-classification
  // - shot-classification
  // - one-sec-interval-classification
  string type = 3;

  // The beginning, inclusive, of the video's time segment in which the
  // AnnotationSpec has been identified. Expressed as a number of seconds as
  // measured from the start of the video, with fractions up to a microsecond
  // precision, and with "s" appended at the end. Note that for
  // 'segment-classification' prediction type, this equals the original
  // 'timeSegmentStart' from the input instance, for other types it is the
  // start of a shot or a 1 second interval respectively.
  google.protobuf.Duration time_segment_start = 4;

  // The end, exclusive, of the video's time segment in which the
  // AnnotationSpec has been identified. Expressed as a number of seconds as
  // measured from the start of the video, with fractions up to a microsecond
  // precision, and with "s" appended at the end. Note that for
  // 'segment-classification' prediction type, this equals the original
  // 'timeSegmentEnd' from the input instance, for other types it is the end
  // of a shot or a 1 second interval respectively.
  google.protobuf.Duration time_segment_end = 5;

  // The Model's confidence in correction of this prediction, higher
  // value means higher confidence.
  google.protobuf.FloatValue confidence = 6;
}

// Prediction output format for Video Object Tracking.
message VideoObjectTrackingPredictionResult {
  // The fields `xMin`, `xMax`, `yMin`, and `yMax` refer to a bounding box,
  // i.e. the rectangle over the video frame pinpointing the found
  // AnnotationSpec. The coordinates are relative to the frame size, and the
  // point 0,0 is in the top left of the frame.
  message Frame {
    // A time (frame) of a video in which the object has been detected.
    // Expressed as a number of seconds as measured from the
    // start of the video, with fractions up to a microsecond precision, and
    // with "s" appended at the end.
    google.protobuf.Duration time_offset = 1;

    // The leftmost coordinate of the bounding box.
    google.protobuf.FloatValue x_min = 2;

    // The rightmost coordinate of the bounding box.
    google.protobuf.FloatValue x_max = 3;

    // The topmost coordinate of the bounding box.
    google.protobuf.FloatValue y_min = 4;

    // The bottommost coordinate of the bounding box.
    google.protobuf.FloatValue y_max = 5;
  }

  // The resource ID of the AnnotationSpec that had been identified.
  string id = 1;

  // The display name of the AnnotationSpec that had been identified.
  string display_name = 2;

  // The beginning, inclusive, of the video's time segment in which the
  // object instance has been detected. Expressed as a number of seconds as
  // measured from the start of the video, with fractions up to a microsecond
  // precision, and with "s" appended at the end.
  google.protobuf.Duration time_segment_start = 3;

  // The end, inclusive, of the video's time segment in which the
  // object instance has been detected. Expressed as a number of seconds as
  // measured from the start of the video, with fractions up to a microsecond
  // precision, and with "s" appended at the end.
  google.protobuf.Duration time_segment_end = 4;

  // The Model's confidence in correction of this prediction, higher
  // value means higher confidence.
  google.protobuf.FloatValue confidence = 5;

  // All of the frames of the video in which a single object instance has been
  // detected. The bounding boxes in the frames identify the same object.
  repeated Frame frames = 6;
}

// Prediction output format for Text Extraction.
message TextExtractionPredictionResult {
  // The resource IDs of the AnnotationSpecs that had been identified,
  // ordered by the confidence score descendingly.
  repeated int64 ids = 1;

  // The display names of the AnnotationSpecs that had been identified,
  // order matches the IDs.
  repeated string display_names = 2;

  // The start offsets, inclusive, of the text segment in which the
  // AnnotationSpec has been identified. Expressed as a zero-based number
  // of characters as measured from the start of the text snippet.
  repeated int64 text_segment_start_offsets = 3;

  // The end offsets, inclusive, of the text segment in which the
  // AnnotationSpec has been identified. Expressed as a zero-based number
  // of characters as measured from the start of the text snippet.
  repeated int64 text_segment_end_offsets = 4;

  // The Model's confidences in correctness of the predicted IDs, higher
  // value means higher confidence. Order matches the Ids.
  repeated float confidences = 5;
}