diff --git a/google/cloud/videointelligence/v1/video_intelligence.proto b/google/cloud/videointelligence/v1/video_intelligence.proto index 67d9168f..223e866e 100644 --- a/google/cloud/videointelligence/v1/video_intelligence.proto +++ b/google/cloud/videointelligence/v1/video_intelligence.proto @@ -83,8 +83,8 @@ message AnnotateVideoRequest { // Video context and/or feature-specific parameters. message VideoContext { // Video segments to annotate. The segments may overlap and are not required - // to be contiguous or span the whole video. If unspecified, each video - // is treated as a single segment. + // to be contiguous or span the whole video. If unspecified, each video is + // treated as a single segment. repeated VideoSegment segments = 1; // Config for LABEL_DETECTION. @@ -98,6 +98,9 @@ message VideoContext { // Config for FACE_DETECTION. FaceDetectionConfig face_detection_config = 5; + + // Config for SPEECH_TRANSCRIPTION. + SpeechTranscriptionConfig speech_transcription_config = 6; } // Config for LABEL_DETECTION. @@ -299,6 +302,9 @@ message VideoAnnotationResults { // Explicit content annotation. ExplicitContentAnnotation explicit_annotation = 7; + // Speech transcription. + repeated SpeechTranscription speech_transcriptions = 11; + // If set, indicates an error. Note that for a single `AnnotateVideoRequest` // some videos may succeed and some may fail. google.rpc.Status error = 9; @@ -318,8 +324,8 @@ message VideoAnnotationProgress { // [Google Cloud Storage](https://cloud.google.com/storage/). string input_uri = 1; - // Approximate percentage processed thus far. - // Guaranteed to be 100 when fully processed. + // Approximate percentage processed thus far. Guaranteed to be + // 100 when fully processed. int32 progress_percent = 2; // Time when the request was received. @@ -337,6 +343,142 @@ message AnnotateVideoProgress { repeated VideoAnnotationProgress annotation_progress = 1; } +// Config for SPEECH_TRANSCRIPTION. 
+message SpeechTranscriptionConfig { + // *Required* The language of the supplied audio as a + // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. + // Example: "en-US". + // See [Language Support](https://cloud.google.com/speech/docs/languages) + // for a list of the currently supported language codes. + string language_code = 1; + + // *Optional* Maximum number of recognition hypotheses to be returned. + // Specifically, the maximum number of `SpeechRecognitionAlternative` messages + // within each `SpeechTranscription`. The server may return fewer than + // `max_alternatives`. Valid values are `0`-`30`. A value of `0` or `1` will + // return a maximum of one. If omitted, will return a maximum of one. + int32 max_alternatives = 2; + + // *Optional* If set to `true`, the server will attempt to filter out + // profanities, replacing all but the initial character in each filtered word + // with asterisks, e.g. "f***". If set to `false` or omitted, profanities + // won't be filtered out. + bool filter_profanity = 3; + + // *Optional* A means to provide context to assist the speech recognition. + repeated SpeechContext speech_contexts = 4; + + // *Optional* If `true`, adds punctuation to recognition result hypotheses. + // This feature is only available in select languages. Setting this for + // requests in other languages has no effect at all. The default `false` value + // does not add punctuation to result hypotheses. NOTE: "This is currently + // offered as an experimental service, complimentary to all users. In the + // future this may be exclusively available as a premium feature." + bool enable_automatic_punctuation = 5; + + // *Optional* For file formats, such as MXF or MKV, supporting multiple audio + // tracks, specify up to two tracks. Default: track 0. 
+ repeated int32 audio_tracks = 6; + + // *Optional* If `true`, enables speaker detection for each recognized word in + // the top alternative of the recognition result using a speaker_tag provided + // in the WordInfo. + // Note: When this is true, we send all the words from the beginning of the + // audio for the top alternative in every consecutive response. + // This is done in order to improve our speaker tags as our models learn to + // identify the speakers in the conversation over time. + bool enable_speaker_diarization = 7; + + // *Optional* + // If set, specifies the estimated number of speakers in the conversation. + // If not set, defaults to `2`. + // Ignored unless enable_speaker_diarization is set to true. + int32 diarization_speaker_count = 8; + + // *Optional* If `true`, the top result includes a list of words and the + // confidence for those words. If `false`, no word-level confidence + // information is returned. The default is `false`. + bool enable_word_confidence = 9; +} + +// Provides "hints" to the speech recognizer to favor specific words and phrases +// in the results. +message SpeechContext { + // *Optional* A list of strings containing words and phrases "hints" so that + // the speech recognition is more likely to recognize them. This can be used + // to improve the accuracy for specific words and phrases, for example, if + // specific commands are typically spoken by the user. This can also be used + // to add additional words to the vocabulary of the recognizer. See + // [usage limits](https://cloud.google.com/speech/limits#content). + repeated string phrases = 1; +} + +// A speech recognition result corresponding to a portion of the audio. +message SpeechTranscription { + // May contain one or more recognition hypotheses (up to the maximum specified + // in `max_alternatives`). These alternatives are ordered in terms of + // accuracy, with the top (first) alternative being the most probable, as + // ranked by the recognizer. 
+ repeated SpeechRecognitionAlternative alternatives = 1; + + // Output only. The + // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the + // language in this result. This language code was detected to have the + // highest likelihood of being spoken in the audio. + string language_code = 2; +} + +// Alternative hypotheses (a.k.a. n-best list). +message SpeechRecognitionAlternative { + // Transcript text representing the words that the user spoke. + string transcript = 1; + + // The confidence estimate between 0.0 and 1.0. A higher number + // indicates an estimated greater likelihood that the recognized words are + // correct. This field is typically provided only for the top hypothesis. + // Clients should not rely on the + // `confidence` field as it is not guaranteed to be accurate or consistent. + // The default of 0.0 is a sentinel value indicating `confidence` was not set. + float confidence = 2; + + // A list of word-specific information for each recognized word. + repeated WordInfo words = 3; +} + +// Word-specific information for recognized words. Word information is only +// included in the response when certain request parameters are set, such +// as `enable_word_time_offsets`. +message WordInfo { + // Time offset relative to the beginning of the audio, and + // corresponding to the start of the spoken word. This field is only set if + // `enable_word_time_offsets=true` and only in the top hypothesis. This is an + // experimental feature and the accuracy of the time offset can vary. + google.protobuf.Duration start_time = 1; + + // Time offset relative to the beginning of the audio, and + // corresponding to the end of the spoken word. This field is only set if + // `enable_word_time_offsets=true` and only in the top hypothesis. This is an + // experimental feature and the accuracy of the time offset can vary. 
+ google.protobuf.Duration end_time = 2; + + // The word corresponding to this set of information. + string word = 3; + + // Output only. The confidence estimate between 0.0 and 1.0. A higher number + // indicates an estimated greater likelihood that the recognized words are + // correct. This field is set only for the top alternative. + // This field is not guaranteed to be accurate and users should not rely on it + // to be always provided. + // The default of 0.0 is a sentinel value indicating `confidence` was not set. + float confidence = 4; + + // Output only. A distinct integer value is assigned for every speaker within + // the audio. This field specifies which one of those speakers was detected to + // have spoken this word. Value ranges from 1 up to diarization_speaker_count, + // and is only set if speaker diarization is enabled. + int32 speaker_tag = 5; +} + // Video annotation feature. enum Feature { // Unspecified. @@ -353,6 +495,9 @@ enum Feature { // Human face detection and tracking. FACE_DETECTION = 4; + + // Speech transcription. + SPEECH_TRANSCRIPTION = 6; } // Label detection mode.