// googleapis/google/cloud/aiplatform/v1beta1/training_pipeline.proto

// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1beta1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/aiplatform/v1beta1/io.proto";
import "google/cloud/aiplatform/v1beta1/machine_resources.proto";
import "google/cloud/aiplatform/v1beta1/manual_batch_tuning_parameters.proto";
import "google/cloud/aiplatform/v1beta1/model.proto";
import "google/cloud/aiplatform/v1beta1/pipeline_state.proto";
import "google/protobuf/struct.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";
import "google/api/annotations.proto";

option go_package = "google.golang.org/genproto/googleapis/cloud/aiplatform/v1beta1;aiplatform";
option java_multiple_files = true;
option java_outer_classname = "TrainingPipelineProto";
option java_package = "com.google.cloud.aiplatform.v1beta1";

// The TrainingPipeline orchestrates tasks associated with training a Model. It
// always executes the training task, and optionally may also
// export data from AI Platform's Dataset which becomes the training input,
// [upload][google.cloud.aiplatform.v1beta1.ModelService.UploadModel] the Model to AI Platform, and evaluate the
// Model.
message TrainingPipeline {
  option (google.api.resource) = {
    type: "aiplatform.googleapis.com/TrainingPipeline"
    pattern: "projects/{project}/locations/{location}/trainingPipelines/{training_pipeline}"
  };

  // Output only. Resource name of the TrainingPipeline.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. The user-defined name of this TrainingPipeline.
  string display_name = 2 [(google.api.field_behavior) = REQUIRED];

  // Specifies AI Platform owned input data that may be used for training the
  // Model. The TrainingPipeline's [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition] should make
  // clear whether this config is used and if there are any special requirements
  // on how it should be filled. If nothing about this config is mentioned in
  // the [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition], then it should be assumed that the
  // TrainingPipeline does not depend on this configuration.
  InputDataConfig input_data_config = 3;

  // Required. A Google Cloud Storage path to the YAML file that defines the training task
  // which is responsible for producing the model artifact, and may also include
  // additional auxiliary work.
  // The definition files that can be used here are found in
  // gs://google-cloud-aiplatform/schema/trainingjob/definition/.
  // Note: The URI given on output will be immutable and probably different,
  // including the URI scheme, than the one given on input. The output URI will
  // point to a location where the user only has read access.
  string training_task_definition = 4 [(google.api.field_behavior) = REQUIRED];

  // Required. The training task's parameter(s), as specified in the
  // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition]'s `inputs`.
  google.protobuf.Value training_task_inputs = 5 [(google.api.field_behavior) = REQUIRED];

  // Output only. The metadata information as specified in the [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition]'s
  // `metadata`. This metadata is auxiliary runtime and final information
  // about the training task. While the pipeline is running this information is
  // populated only on a best-effort basis. Only present if the
  // pipeline's [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition] contains a `metadata` object.
  google.protobuf.Value training_task_metadata = 6 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Describes the Model that may be uploaded (via
  // [ModelService.UploadModel][google.cloud.aiplatform.v1beta1.ModelService.UploadModel])
  // by this TrainingPipeline. The TrainingPipeline's
  // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition] should make clear whether this Model
  // description should be populated, and if there are any special requirements
  // regarding how it should be filled. If nothing is mentioned in the
  // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition], then it should be assumed that this field
  // should not be filled and the training task either uploads the Model without
  // a need of this information, or that training task does not support
  // uploading a Model as part of the pipeline.
  // When the Pipeline's state becomes `PIPELINE_STATE_SUCCEEDED` and
  // the trained Model has been uploaded into AI Platform, then the
  // model_to_upload's resource [name][google.cloud.aiplatform.v1beta1.Model.name] is populated. The Model
  // is always uploaded into the Project and Location in which this pipeline
  // is.
  Model model_to_upload = 7;

  // NOTE(review): field number 8 is not used by any field in this message.
  // If it belonged to a removed field, consider declaring `reserved 8;` --
  // TODO confirm against the schema history.

  // Output only. The detailed state of the pipeline.
  PipelineState state = 9 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Only populated when the pipeline's state is `PIPELINE_STATE_FAILED` or
  // `PIPELINE_STATE_CANCELLED`.
  google.rpc.Status error = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline was created.
  google.protobuf.Timestamp create_time = 11 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline first entered the
  // `PIPELINE_STATE_RUNNING` state.
  google.protobuf.Timestamp start_time = 12 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline entered any of the following states:
  // `PIPELINE_STATE_SUCCEEDED`, `PIPELINE_STATE_FAILED`,
  // `PIPELINE_STATE_CANCELLED`.
  google.protobuf.Timestamp end_time = 13 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline was most recently updated.
  google.protobuf.Timestamp update_time = 14 [(google.api.field_behavior) = OUTPUT_ONLY];

  // The labels with user-defined metadata to organize TrainingPipelines.
  //
  // Label keys and values can be no longer than 64 characters
  // (Unicode codepoints), can only contain lowercase letters, numeric
  // characters, underscores and dashes. International characters are allowed.
  //
  // See https://goo.gl/xmQnxf for more information and examples of labels.
  map<string, string> labels = 15;
}

// Specifies AI Platform owned input data to be used for training, and
// possibly evaluating, the Model.
message InputDataConfig {
  // The instructions for how the input data should be split between the
  // training, validation and test sets.
  // If no split type is provided, the [fraction_split][google.cloud.aiplatform.v1beta1.InputDataConfig.fraction_split] is used by default.
  oneof split {
    // Split based on fractions defining the size of each set.
    FractionSplit fraction_split = 2;

    // Split based on the provided filters for each set.
    FilterSplit filter_split = 3;

    // Supported only for tabular Datasets.
    //
    // Split based on a predefined key.
    PredefinedSplit predefined_split = 4;

    // Supported only for tabular Datasets.
    //
    // Split based on the timestamp of the input data pieces.
    TimestampSplit timestamp_split = 5;
  }

  // Only applicable to Custom and Hyperparameter Tuning TrainingPipelines.
  //
  // The destination to which the training data is written.
  //
  // Supported destination file formats:
  //   * For non-tabular data: "jsonl".
  //   * For tabular data: "csv" and "bigquery".
  //
  // The following AI Platform environment variables will be passed to
  // containers or python modules of the training task when this field is set:
  //
  // * AIP_DATA_FORMAT : Exported data format.
  // * AIP_TRAINING_DATA_URI : Sharded exported training data URIs.
  // * AIP_VALIDATION_DATA_URI : Sharded exported validation data URIs.
  // * AIP_TEST_DATA_URI : Sharded exported test data URIs.
  oneof destination {
    // The Google Cloud Storage location where the training data is to be
    // written to. In the given directory a new directory will be created with
    // name:
    // `dataset-<dataset-id>-<annotation-type>-<timestamp-of-training-call>`
    // where timestamp is in YYYY-MM-DDThh:mm:ss.sssZ ISO-8601 format.
    // All training input data will be written into that directory.
    //
    // The AI Platform environment variables representing Google Cloud Storage
    // data URIs will always be represented in the Google Cloud Storage wildcard
    // format to support sharded data. e.g.: "gs://.../training-*.jsonl"
    //
    // * AIP_DATA_FORMAT = "jsonl" for non-tabular data, "csv" for tabular data
    // * AIP_TRAINING_DATA_URI =
    //   "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/training-*.${AIP_DATA_FORMAT}"
    // * AIP_VALIDATION_DATA_URI =
    //   "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/validation-*.${AIP_DATA_FORMAT}"
    // * AIP_TEST_DATA_URI =
    //   "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/test-*.${AIP_DATA_FORMAT}"
    GcsDestination gcs_destination = 8;

    // The BigQuery project location where the training data is to be written
    // to. In the given project a new dataset is created with name
    // `dataset_<dataset-id>_<annotation-type>_<timestamp-of-training-call>`
    // where timestamp is in YYYY_MM_DDThh_mm_ss_sssZ format. All training
    // input data will be written into that dataset. In the dataset three
    // tables will be created, `training`, `validation` and `test`.
    //
    // * AIP_DATA_FORMAT = "bigquery".
    // * AIP_TRAINING_DATA_URI =
    //   "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.training"
    // * AIP_VALIDATION_DATA_URI =
    //   "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.validation"
    // * AIP_TEST_DATA_URI =
    //   "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.test"
    BigQueryDestination bigquery_destination = 10;
  }

  // Required. The ID of the Dataset in the same Project and Location whose data will be
  // used to train the Model. The Dataset must use a schema compatible with the
  // Model being trained, and what is compatible should be described in the
  // used TrainingPipeline's
  // [training_task_definition][google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition].
  // For tabular Datasets, all their data is exported to training, to pick
  // and choose from.
  string dataset_id = 1 [(google.api.field_behavior) = REQUIRED];

  // Only applicable to Datasets that have DataItems and Annotations.
  //
  // A filter on Annotations of the Dataset. Only Annotations that both
  // match this filter and belong to DataItems not ignored by the split method
  // are used in respectively training, validation or test role, depending on
  // the role of the DataItem they are on (for the auto-assigned that role is
  // decided by AI Platform). A filter with the same syntax as the one used in
  // [ListAnnotations][google.cloud.aiplatform.v1beta1.DatasetService.ListAnnotations] may be used, but note
  // here it filters across all Annotations of the Dataset, and not just within
  // a single DataItem.
  string annotations_filter = 6;

  // Only applicable to custom training.
  //
  // A Google Cloud Storage URI pointing to a YAML file describing the
  // annotation schema. The schema is defined as an OpenAPI 3.0.2
  // [Schema Object](https://github.com/OAI/OpenAPI-Specification/blob/master/versions/3.0.2.md#schema-object).
  // The schema files that can be used here are found in
  // gs://google-cloud-aiplatform/schema/dataset/annotation/, note that the
  // chosen schema must be consistent with
  // [metadata][google.cloud.aiplatform.v1beta1.Dataset.metadata_schema_uri] of the Dataset specified by
  // [dataset_id][google.cloud.aiplatform.v1beta1.InputDataConfig.dataset_id].
  //
  // Only Annotations that both match this schema and belong to DataItems not
  // ignored by the split method are used in respectively training, validation
  // or test role, depending on the role of the DataItem they are on.
  //
  // When used in conjunction with [annotations_filter][google.cloud.aiplatform.v1beta1.InputDataConfig.annotations_filter], the Annotations used
  // for training are filtered by both [annotations_filter][google.cloud.aiplatform.v1beta1.InputDataConfig.annotations_filter] and
  // [annotation_schema_uri][google.cloud.aiplatform.v1beta1.InputDataConfig.annotation_schema_uri].
  string annotation_schema_uri = 9;
}

// Assigns the input data to training, validation, and test sets as per the
// given fractions. Any of `training_fraction`, `validation_fraction` and
// `test_fraction` may optionally be provided; they must sum to at most 1. If
// the provided ones sum to less than 1, the remainder is assigned to sets as
// decided by AI Platform. If none of the fractions are set, by default roughly
// 80% of data will be used for training, 10% for validation, and 10% for test.
message FractionSplit {
  // The fraction of the input data that is to be used to train the Model.
  double training_fraction = 1;

  // The fraction of the input data that is to be used to validate the Model.
  double validation_fraction = 2;

  // The fraction of the input data that is to be used to evaluate the Model.
  double test_fraction = 3;
}

// Assigns input data to training, validation, and test sets based on the given
// filters; data pieces not matched by any filter are ignored. Currently only
// supported for Datasets containing DataItems.
// If any of the filters in this message are to match nothing, then they can be
// set as '-' (the minus sign).
message FilterSplit {
  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to train the Model. A filter with the same syntax
  // as the one used in [DatasetService.ListDataItems][google.cloud.aiplatform.v1beta1.DatasetService.ListDataItems] may be used. If a
  // single DataItem is matched by more than one of the FilterSplit filters,
  // then it will be assigned to the first set that applies to it in the
  // training, validation, test order.
  string training_filter = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to validate the Model. A filter with the same syntax
  // as the one used in [DatasetService.ListDataItems][google.cloud.aiplatform.v1beta1.DatasetService.ListDataItems] may be used. If a
  // single DataItem is matched by more than one of the FilterSplit filters,
  // then it will be assigned to the first set that applies to it in the
  // training, validation, test order.
  string validation_filter = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to test the Model. A filter with the same syntax
  // as the one used in [DatasetService.ListDataItems][google.cloud.aiplatform.v1beta1.DatasetService.ListDataItems] may be used. If a
  // single DataItem is matched by more than one of the FilterSplit filters,
  // then it will be assigned to the first set that applies to it in the
  // training, validation, test order.
  string test_filter = 3 [(google.api.field_behavior) = REQUIRED];
}

// Assigns input data to training, validation, and test sets based on the
// value of a provided key.
//
// Supported only for tabular Datasets.
message PredefinedSplit {
  // Required. The key is the name of one of the Dataset's data columns.
  // The value of the key (either the label's value or value in the column)
  // must be one of {`training`, `validation`, `test`}, and it defines to which
  // set the given piece of data is assigned. If, for a piece of data, the key
  // is not present or has an invalid value, that piece is ignored by the
  // pipeline.
  string key = 1 [(google.api.field_behavior) = REQUIRED];
}

// NOTE(review): "youngest" / "oldest" below is ambiguous about chronological
// direction (earliest vs. most recent timestamps) -- TODO confirm the intended
// ordering with the service owners and reword accordingly.

// Assigns input data to training, validation, and test sets based on the
// provided timestamps. The youngest data pieces are assigned to training set,
// next to validation set, and the oldest to the test set.
//
// Supported only for tabular Datasets.
message TimestampSplit {
  // The fraction of the input data that is to be used to train the Model.
  double training_fraction = 1;

  // The fraction of the input data that is to be used to validate the Model.
  double validation_fraction = 2;

  // The fraction of the input data that is to be used to evaluate the Model.
  double test_fraction = 3;

  // Required. The key is the name of one of the Dataset's data columns.
  // The values of the key (the values in the column) must be in RFC 3339
  // `date-time` format, where `time-offset` = `"Z"`
  // (e.g. 1985-04-12T23:20:50.52Z). If, for a piece of data, the key is not
  // present or has an invalid value, that piece is ignored by the pipeline.
  string key = 4 [(google.api.field_behavior) = REQUIRED];
}