diff --git a/google/cloud/videointelligence/v1p3beta1/video_intelligence.proto b/google/cloud/videointelligence/v1p3beta1/video_intelligence.proto
index e37726e0..1203b315 100644
--- a/google/cloud/videointelligence/v1p3beta1/video_intelligence.proto
+++ b/google/cloud/videointelligence/v1p3beta1/video_intelligence.proto
@@ -1,4 +1,4 @@
-// Copyright 2018 Google LLC.
+// Copyright 2019 Google LLC.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@ syntax = "proto3";
 package google.cloud.videointelligence.v1p3beta1;
 
 import "google/api/annotations.proto";
+import "google/api/client.proto";
+import "google/api/field_behavior.proto";
 import "google/longrunning/operations.proto";
 import "google/protobuf/duration.proto";
 import "google/protobuf/timestamp.proto";
@@ -29,10 +31,13 @@ option java_multiple_files = true;
 option java_outer_classname = "VideoIntelligenceServiceProto";
 option java_package = "com.google.cloud.videointelligence.v1p3beta1";
 option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1p3beta1";
-option ruby_package = "Google::Cloud::VideoIntelligence::V1p3beta1";
 
 // Service that implements Google Cloud Video Intelligence API.
 service VideoIntelligenceService {
+  option (google.api.default_host) = "videointelligence.googleapis.com";
+  option (google.api.oauth_scopes) =
+      "https://www.googleapis.com/auth/cloud-platform";
+
   // Performs asynchronous video annotation. Progress and results can be
   // retrieved through the `google.longrunning.Operations` interface.
   // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
@@ -43,16 +48,25 @@ service VideoIntelligenceService {
       post: "/v1p3beta1/videos:annotate"
       body: "*"
     };
+    option (google.api.method_signature) = "input_uri,features";
+    option (google.longrunning.operation_info) = {
+      response_type: "AnnotateVideoResponse"
+      metadata_type: "AnnotateVideoProgress"
+    };
   }
 }
 
-// Service that implements Google Cloud Video Intelligence Streaming API.
+// Service that implements streaming Google Cloud Video Intelligence API.
 service StreamingVideoIntelligenceService {
+  option (google.api.default_host) = "videointelligence.googleapis.com";
+  option (google.api.oauth_scopes) =
+      "https://www.googleapis.com/auth/cloud-platform";
+
   // Performs video annotation with bidirectional streaming: emitting results
   // while sending video/audio bytes.
   // This method is only available via the gRPC API (not REST).
   rpc StreamingAnnotateVideo(stream StreamingAnnotateVideoRequest)
-      returns (stream StreamingAnnotateVideoResponse);
+      returns (stream StreamingAnnotateVideoResponse) {}
 }
 
 // Video annotation request.
@@ -74,24 +88,24 @@ message AnnotateVideoRequest {
   // If set, `input_uri` should be unset.
   bytes input_content = 6;
 
-  // Requested video annotation features.
-  repeated Feature features = 2;
+  // Required. Requested video annotation features.
+  repeated Feature features = 2 [(google.api.field_behavior) = REQUIRED];
 
   // Additional video context and/or feature-specific parameters.
   VideoContext video_context = 3;
 
-  // Optional location where the output (in JSON format) should be stored.
+  // Optional. Location where the output (in JSON format) should be stored.
   // Currently, only [Google Cloud Storage](https://cloud.google.com/storage/)
   // URIs are supported, which must be specified in the following format:
   // `gs://bucket-id/object-id` (other URI formats return
   // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
   // more information, see [Request URIs](/storage/docs/reference-uris).
-  string output_uri = 4;
+  string output_uri = 4 [(google.api.field_behavior) = OPTIONAL];
 
-  // Optional cloud region where annotation should take place. Supported cloud
+  // Optional. Cloud region where annotation should take place. Supported cloud
   // regions: `us-east1`, `us-west1`, `europe-west1`, `asia-east1`. If no region
   // is specified, a region will be determined based on video file location.
-  string location_id = 5;
+  string location_id = 5 [(google.api.field_behavior) = OPTIONAL];
 }
 
 // Video context and/or feature-specific parameters.
@@ -290,8 +304,7 @@ message NormalizedBoundingBox {
   float bottom = 4;
 }
 
-// For tracking related features, such as LOGO_RECOGNITION, FACE_DETECTION,
-// CELEBRITY_RECOGNITION, PERSON_DETECTION.
+// For tracking-related features.
 // An object at time_offset with attributes, and located with
 // normalized_bounding_box.
 message TimestampedObject {
@@ -303,7 +316,8 @@ message TimestampedObject {
   google.protobuf.Duration time_offset = 2;
 
   // Optional. The attributes of the object in the bounding box.
-  repeated DetectedAttribute attributes = 3;
+  repeated DetectedAttribute attributes = 3
+      [(google.api.field_behavior) = OPTIONAL];
 }
 
 // A track of an object instance.
@@ -315,10 +329,11 @@ message Track {
   repeated TimestampedObject timestamped_objects = 2;
 
   // Optional. Attributes in the track level.
-  repeated DetectedAttribute attributes = 3;
+  repeated DetectedAttribute attributes = 3
+      [(google.api.field_behavior) = OPTIONAL];
 
   // Optional. The confidence score of the tracked object.
-  float confidence = 4;
+  float confidence = 4 [(google.api.field_behavior) = OPTIONAL];
 }
 
 // A generic detected attribute represented by name in string format.
@@ -335,20 +350,80 @@ message DetectedAttribute {
   string value = 3;
 }
 
+// Celebrity definition.
+message Celebrity {
+  // The resource name of the celebrity. Has the format
+  // `video-intelligence/kg-mid`, indicating a celebrity from the preloaded
+  // gallery. kg-mid is the id in the Google Knowledge Graph, which is unique
+  // for the celebrity.
+  string name = 1;
+
+  // The celebrity name.
+  string display_name = 2;
+
+  // Textual description of additional information about the celebrity, if
+  // applicable.
+  string description = 3;
+}
+
+// The annotation result of a celebrity face track. The `celebrities` field
+// could be empty if the face track does not have any matched celebrities.
+message CelebrityTrack {
+  // The recognized celebrity with confidence score.
+  message RecognizedCelebrity {
+    // The recognized celebrity.
+    Celebrity celebrity = 1;
+
+    // Recognition confidence. Range [0, 1].
+    float confidence = 2;
+  }
+
+  // Top N matches of the celebrities for the face in this track.
+  repeated RecognizedCelebrity celebrities = 1;
+
+  // A track of a person's face.
+  Track face_track = 3;
+}
+
+// Celebrity recognition annotation per video.
+message CelebrityRecognitionAnnotation {
+  // The tracks detected from the input video, including recognized celebrities
+  // and other detected faces in the video.
+  repeated CelebrityTrack celebrity_tracks = 1;
+}
+
 // Annotation results for a single video.
 message VideoAnnotationResults {
   // Video file location in
   // [Google Cloud Storage](https://cloud.google.com/storage/).
   string input_uri = 1;
 
-  // Label annotations on video level or user specified segment level.
+  // Video segment on which the annotation is run.
+  VideoSegment segment = 10;
+
+  // Topical label annotations on video level or user specified segment level.
   // There is exactly one element for each unique label.
   repeated LabelAnnotation segment_label_annotations = 2;
 
-  // Label annotations on shot level.
+  // Presence label annotations on video level or user specified segment level.
+  // There is exactly one element for each unique label. Compared to the
+  // existing topical `segment_label_annotations`, this field presents more
+  // fine-grained, segment-level labels detected in video content and is made
+  // available only when the client sets `LabelDetectionConfig.model` to
+  // "builtin/latest" in the request.
+  repeated LabelAnnotation segment_presence_label_annotations = 23;
+
+  // Topical label annotations on shot level.
   // There is exactly one element for each unique label.
   repeated LabelAnnotation shot_label_annotations = 3;
 
+  // Presence label annotations on shot level. There is exactly one element for
+  // each unique label. Compared to the existing topical
+  // `shot_label_annotations`, this field presents more fine-grained, shot-level
+  // labels detected in video content and is made available only when the client
+  // sets `LabelDetectionConfig.model` to "builtin/latest" in the request.
+  repeated LabelAnnotation shot_presence_label_annotations = 24;
+
   // Label annotations on frame level.
   // There is exactly one element for each unique label.
   repeated LabelAnnotation frame_label_annotations = 4;
@@ -373,6 +448,9 @@ message VideoAnnotationResults {
   // Annotations for list of logos detected, tracked and recognized in video.
   repeated LogoRecognitionAnnotation logo_recognition_annotations = 19;
 
+  // Celebrity recognition annotations.
+  CelebrityRecognitionAnnotation celebrity_recognition_annotations = 21;
+
   // If set, indicates an error. Note that for a single `AnnotateVideoRequest`
   // some videos may succeed and some may fail.
   google.rpc.Status error = 9;
@@ -401,6 +479,14 @@ message VideoAnnotationProgress {
 
   // Time of the most recent update.
   google.protobuf.Timestamp update_time = 4;
+
+  // Specifies which feature is being tracked if the request contains more than
+  // one feature.
+  Feature feature = 5;
+
+  // Specifies which segment is being tracked if the request contains more than
+  // one segment.
+  VideoSegment segment = 6;
 }
 
 // Video annotation progress. Included in the `metadata`
@@ -413,72 +499,73 @@ message AnnotateVideoProgress {
 
 // Config for SPEECH_TRANSCRIPTION.
 message SpeechTranscriptionConfig {
-  // *Required* The language of the supplied audio as a
+  // Required. The language of the supplied audio as a
   // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
   // Example: "en-US".
   // See [Language Support](https://cloud.google.com/speech/docs/languages)
   // for a list of the currently supported language codes.
-  string language_code = 1;
+  string language_code = 1 [(google.api.field_behavior) = REQUIRED];
 
-  // *Optional* Maximum number of recognition hypotheses to be returned.
+  // Optional. Maximum number of recognition hypotheses to be returned.
   // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
   // within each `SpeechTranscription`. The server may return fewer than
   // `max_alternatives`. Valid values are `0`-`30`. A value of `0` or `1` will
   // return a maximum of one. If omitted, will return a maximum of one.
-  int32 max_alternatives = 2;
+  int32 max_alternatives = 2 [(google.api.field_behavior) = OPTIONAL];
 
-  // *Optional* If set to `true`, the server will attempt to filter out
+  // Optional. If set to `true`, the server will attempt to filter out
   // profanities, replacing all but the initial character in each filtered word
   // with asterisks, e.g. "f***". If set to `false` or omitted, profanities
   // won't be filtered out.
-  bool filter_profanity = 3;
+  bool filter_profanity = 3 [(google.api.field_behavior) = OPTIONAL];
 
-  // *Optional* A means to provide context to assist the speech recognition.
-  repeated SpeechContext speech_contexts = 4;
+  // Optional. A means to provide context to assist the speech recognition.
+  repeated SpeechContext speech_contexts = 4
+      [(google.api.field_behavior) = OPTIONAL];
 
-  // *Optional* If 'true', adds punctuation to recognition result hypotheses.
+  // Optional. If 'true', adds punctuation to recognition result hypotheses.
   // This feature is only available in select languages. Setting this for
   // requests in other languages has no effect at all. The default 'false' value
   // does not add punctuation to result hypotheses. NOTE: "This is currently
   // offered as an experimental service, complimentary to all users. In the
   // future this may be exclusively available as a premium feature."
-  bool enable_automatic_punctuation = 5;
+  bool enable_automatic_punctuation = 5
+      [(google.api.field_behavior) = OPTIONAL];
 
-  // *Optional* For file formats, such as MXF or MKV, supporting multiple audio
+  // Optional. For file formats, such as MXF or MKV, supporting multiple audio
   // tracks, specify up to two tracks. Default: track 0.
-  repeated int32 audio_tracks = 6;
+  repeated int32 audio_tracks = 6 [(google.api.field_behavior) = OPTIONAL];
 
-  // *Optional* If 'true', enables speaker detection for each recognized word in
+  // Optional. If 'true', enables speaker detection for each recognized word in
   // the top alternative of the recognition result using a speaker_tag provided
   // in the WordInfo.
   // Note: When this is true, we send all the words from the beginning of the
   // audio for the top alternative in every consecutive responses.
   // This is done in order to improve our speaker tags as our models learn to
   // identify the speakers in the conversation over time.
-  bool enable_speaker_diarization = 7;
+  bool enable_speaker_diarization = 7 [(google.api.field_behavior) = OPTIONAL];
 
-  // *Optional*
-  // If set, specifies the estimated number of speakers in the conversation.
-  // If not set, defaults to '2'.
-  // Ignored unless enable_speaker_diarization is set to true.
-  int32 diarization_speaker_count = 8;
+  // Optional. If set, specifies the estimated number of speakers in the
+  // conversation. If not set, defaults to '2'. Ignored unless
+  // enable_speaker_diarization is set to true.
+  int32 diarization_speaker_count = 8 [(google.api.field_behavior) = OPTIONAL];
 
-  // *Optional* If `true`, the top result includes a list of words and the
+  // Optional. If `true`, the top result includes a list of words and the
   // confidence for those words. If `false`, no word-level confidence
   // information is returned. The default is `false`.
-  bool enable_word_confidence = 9;
+  bool enable_word_confidence = 9 [(google.api.field_behavior) = OPTIONAL];
 }
 
 // Provides "hints" to the speech recognizer to favor specific words and phrases
 // in the results.
 message SpeechContext {
-  // *Optional* A list of strings containing words and phrases "hints" so that
+  // Optional. A list of strings containing words and phrases "hints" so that
   // the speech recognition is more likely to recognize them. This can be used
   // to improve the accuracy for specific words and phrases, for example, if
   // specific commands are typically spoken by the user. This can also be used
   // to add additional words to the vocabulary of the recognizer. See
   // [usage limits](https://cloud.google.com/speech/limits#content).
-  repeated string phrases = 1;
+  repeated string phrases = 1 [(google.api.field_behavior) = OPTIONAL];
 }
 
 // A speech recognition result corresponding to a portion of the audio.
@@ -489,11 +576,10 @@ message SpeechTranscription {
   // ranked by the recognizer.
   repeated SpeechRecognitionAlternative alternatives = 1;
 
-  // Output only. The
-  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
-  // language in this result. This language code was detected to have the most
-  // likelihood of being spoken in the audio.
-  string language_code = 2;
+  // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
+  // language tag of the language in this result. This language code was
+  // detected to have the most likelihood of being spoken in the audio.
+  string language_code = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
 }
 
 // Alternative hypotheses (a.k.a. n-best list).
@@ -501,16 +587,18 @@ message SpeechRecognitionAlternative {
   // Transcript text representing the words that the user spoke.
   string transcript = 1;
 
-  // The confidence estimate between 0.0 and 1.0. A higher number
+  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
   // indicates an estimated greater likelihood that the recognized words are
-  // correct. This field is typically provided only for the top hypothesis, and
-  // only for `is_final=true` results. Clients should not rely on the
-  // `confidence` field as it is not guaranteed to be accurate or consistent.
+  // correct. This field is set only for the top alternative.
+  // This field is not guaranteed to be accurate and users should not rely on it
+  // to be always provided.
   // The default of 0.0 is a sentinel value indicating `confidence` was not set.
-  float confidence = 2;
+  float confidence = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
 
-  // A list of word-specific information for each recognized word.
-  repeated WordInfo words = 3;
+  // Output only. A list of word-specific information for each recognized word.
+  // Note: When `enable_speaker_diarization` is true, you will see all the words
+  // from the beginning of the audio.
+  repeated WordInfo words = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
 }
 
 // Word-specific information for recognized words. Word information is only
@@ -538,13 +626,13 @@ message WordInfo {
   // This field is not guaranteed to be accurate and users should not rely on it
   // to be always provided.
   // The default of 0.0 is a sentinel value indicating `confidence` was not set.
-  float confidence = 4;
+  float confidence = 4 [(google.api.field_behavior) = OUTPUT_ONLY];
 
   // Output only. A distinct integer value is assigned for every speaker within
   // the audio. This field specifies which one of those speakers was detected to
   // have spoken this word. Value ranges from 1 up to diarization_speaker_count,
   // and is only set if speaker diarization is enabled.
-  int32 speaker_tag = 5;
+  int32 speaker_tag = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
 }
 
 // A vertex represents a 2D point in the image.
@@ -645,6 +733,7 @@ message ObjectTrackingAnnotation {
   // Non-streaming batch mode ONLY.
   // Each object track corresponds to one video segment where it appears.
   VideoSegment segment = 3;
+
   // Streaming mode ONLY.
   // In streaming mode, we do not know the end time of a tracked object
   // before it is completed. Hence, there is no VideoSegment info returned.
@@ -712,26 +801,24 @@ message StreamingAnnotateVideoResponse {
   string annotation_results_uri = 3;
 }
 
-// Config for AUTOML_CLASSIFICATION in streaming mode.
+// Config for STREAMING_AUTOML_CLASSIFICATION.
 message StreamingAutomlClassificationConfig {
   // Resource name of AutoML model.
   // Format: `projects/{project_id}/locations/{location_id}/models/{model_id}`
   string model_name = 1;
 }
 
-// Config for AUTOML_OBJECT_TRACKING in streaming mode.
+// Config for STREAMING_AUTOML_OBJECT_TRACKING.
 message StreamingAutomlObjectTrackingConfig {
   // Resource name of AutoML model.
   // Format: `projects/{project_id}/locations/{location_id}/models/{model_id}`
   string model_name = 1;
 }
 
-// Config for EXPLICIT_CONTENT_DETECTION in streaming mode.
-message StreamingExplicitContentDetectionConfig {
-  // No customized config support.
-}
+// Config for STREAMING_EXPLICIT_CONTENT_DETECTION.
+message StreamingExplicitContentDetectionConfig {}
 
-// Config for LABEL_DETECTION in streaming mode.
+// Config for STREAMING_LABEL_DETECTION.
 message StreamingLabelDetectionConfig {
   // Whether the video has been captured from a stationary (i.e. non-moving)
   // camera. When set to true, might improve detection accuracy for moving
@@ -740,14 +827,10 @@ message StreamingLabelDetectionConfig {
 }
 
 // Config for STREAMING_OBJECT_TRACKING.
-message StreamingObjectTrackingConfig {
-  // No customized config support.
-}
+message StreamingObjectTrackingConfig {}
 
-// Config for SHOT_CHANGE_DETECTION in streaming mode.
-message StreamingShotChangeDetectionConfig {
-  // No customized config support.
-}
+// Config for STREAMING_SHOT_CHANGE_DETECTION.
+message StreamingShotChangeDetectionConfig {}
 
 // Config for streaming storage option.
 message StreamingStorageConfig {
@@ -840,6 +923,9 @@ enum Feature {
 
   // Logo detection, tracking, and recognition.
   LOGO_RECOGNITION = 12;
+
+  // Celebrity recognition.
+  CELEBRITY_RECOGNITION = 13;
 }
 
 // Label detection mode.
@@ -882,16 +968,22 @@ enum Likelihood {
 enum StreamingFeature {
   // Unspecified.
   STREAMING_FEATURE_UNSPECIFIED = 0;
+
   // Label detection. Detect objects, such as dog or flower.
   STREAMING_LABEL_DETECTION = 1;
+
   // Shot change detection.
   STREAMING_SHOT_CHANGE_DETECTION = 2;
+
   // Explicit content detection.
   STREAMING_EXPLICIT_CONTENT_DETECTION = 3;
+
   // Object detection and tracking.
   STREAMING_OBJECT_TRACKING = 4;
+
   // Video classification based on AutoML model.
   STREAMING_AUTOML_CLASSIFICATION = 21;
+
   // Object detection and tracking based on AutoML model.
   STREAMING_AUTOML_OBJECT_TRACKING = 22;
 }
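
Usage sketches (editor's notes, not part of the patch). First, a minimal sketch of calling the annotated AnnotateVideo rpc with the new CELEBRITY_RECOGNITION feature from Python. It assumes the pre-2.0 google-cloud-videointelligence client surface (`enums`/`types` submodules) and application default credentials; the bucket/object names and the timeout are placeholders.

    # Sketch: batch annotation with the new CELEBRITY_RECOGNITION feature.
    from google.cloud import videointelligence_v1p3beta1 as vi

    client = vi.VideoIntelligenceServiceClient()

    # `input_uri` and `features` mirror the method_signature declared above.
    operation = client.annotate_video(
        input_uri="gs://YOUR_BUCKET/YOUR_VIDEO.mp4",
        features=[vi.enums.Feature.CELEBRITY_RECOGNITION],
    )

    # Per operation_info, the long-running operation resolves to an
    # AnnotateVideoResponse; progress metadata is an AnnotateVideoProgress.
    response = operation.result(timeout=600)
    for result in response.annotation_results:
        for track in result.celebrity_recognition_annotations.celebrity_tracks:
            for match in track.celebrities:  # top-N RecognizedCelebrity list
                print(match.celebrity.display_name, match.confidence)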
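
A similar sketch for SpeechTranscriptionConfig, showing the REQUIRED/OPTIONAL field behaviors above in practice (same assumptions about the Python client; the language code and URI are placeholders):

    from google.cloud import videointelligence_v1p3beta1 as vi

    client = vi.VideoIntelligenceServiceClient()

    # language_code is the only REQUIRED field; the remaining fields are
    # OPTIONAL and may be omitted entirely.
    speech_config = vi.types.SpeechTranscriptionConfig(
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
    )

    operation = client.annotate_video(
        input_uri="gs://YOUR_BUCKET/YOUR_VIDEO.mp4",
        features=[vi.enums.Feature.SPEECH_TRANSCRIPTION],
        video_context=vi.types.VideoContext(
            speech_transcription_config=speech_config),
    )

    for result in operation.result(timeout=600).annotation_results:
        for transcription in result.speech_transcriptions:
            if transcription.alternatives:
                # confidence is OUTPUT_ONLY and set only on the top alternative.
                top = transcription.alternatives[0]
                print(top.transcript, top.confidence)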
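
Finally, a rough sketch of the bidirectional StreamingAnnotateVideo call (gRPC only, per the comment above). The StreamingVideoConfig wrapper and the rule that the first request carries only the config come from elsewhere in this proto file (not shown in this diff); the chunk size, file name, and the exact call shape of the generated Python client are assumptions.

    import io
    from google.cloud import videointelligence_v1p3beta1 as vi

    client = vi.StreamingVideoIntelligenceServiceClient()

    config_request = vi.types.StreamingAnnotateVideoRequest(
        video_config=vi.types.StreamingVideoConfig(
            feature=vi.enums.StreamingFeature.STREAMING_LABEL_DETECTION,
            label_detection_config=vi.types.StreamingLabelDetectionConfig(
                stationary_camera=False),
        )
    )

    def request_stream(path, chunk_size=1024 * 1024):
        # First request: config only; all later requests: raw video bytes.
        yield config_request
        with io.open(path, "rb") as video:
            while True:
                chunk = video.read(chunk_size)
                if not chunk:
                    break
                yield vi.types.StreamingAnnotateVideoRequest(input_content=chunk)

    # Responses are emitted while bytes are still being sent.
    for response in client.streaming_annotate_video(request_stream("video.mp4")):
        for label in response.annotation_results.label_annotations:
            print(label.entity.description)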