Synchronize new proto/yaml changes.

PiperOrigin-RevId: 218785979
This commit is contained in:
Google APIs 2018-10-25 18:25:09 -07:00 committed by Copybara-Service
parent 58863d90ee
commit caa431d9dd
1 changed files with 149 additions and 4 deletions

View File

@ -83,8 +83,8 @@ message AnnotateVideoRequest {
// Video context and/or feature-specific parameters.
message VideoContext {
// Video segments to annotate. The segments may overlap and are not required
// to be contiguous or span the whole video. If unspecified, each video
// is treated as a single segment.
// to be contiguous or span the whole video. If unspecified, each video is
// treated as a single segment.
repeated VideoSegment segments = 1;
// Config for LABEL_DETECTION.
@ -98,6 +98,9 @@ message VideoContext {
// Config for FACE_DETECTION.
FaceDetectionConfig face_detection_config = 5;
// Config for SPEECH_TRANSCRIPTION.
SpeechTranscriptionConfig speech_transcription_config = 6;
}
// Config for LABEL_DETECTION.
@ -299,6 +302,9 @@ message VideoAnnotationResults {
// Explicit content annotation.
ExplicitContentAnnotation explicit_annotation = 7;
// Speech transcription.
repeated SpeechTranscription speech_transcriptions = 11;
// If set, indicates an error. Note that for a single `AnnotateVideoRequest`
// some videos may succeed and some may fail.
google.rpc.Status error = 9;
@ -318,8 +324,8 @@ message VideoAnnotationProgress {
// [Google Cloud Storage](https://cloud.google.com/storage/).
string input_uri = 1;
// Approximate percentage processed thus far.
// Guaranteed to be 100 when fully processed.
// Approximate percentage processed thus far. Guaranteed to be
// 100 when fully processed.
int32 progress_percent = 2;
// Time when the request was received.
@ -337,6 +343,142 @@ message AnnotateVideoProgress {
repeated VideoAnnotationProgress annotation_progress = 1;
}
// Configuration for the SPEECH_TRANSCRIPTION feature.
message SpeechTranscriptionConfig {
  // *Required* BCP-47 language tag identifying the language of the supplied
  // audio, for example "en-US". The tag format is described at
  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt); the currently
  // supported language codes are listed under
  // [Language Support](https://cloud.google.com/speech/docs/languages).
  string language_code = 1;

  // *Optional* Upper bound on the number of recognition hypotheses returned,
  // i.e. on the number of `SpeechRecognitionAlternative` messages within each
  // `SpeechTranscription`. The server may return fewer than
  // `max_alternatives`. Valid values are `0`-`30`. Both `0` and `1` return a
  // maximum of one, as does omitting the field.
  int32 max_alternatives = 2;

  // *Optional* When `true`, the server attempts to filter out profanities by
  // replacing all but the initial character of each filtered word with
  // asterisks, e.g. "f***". When `false` or omitted, profanities are not
  // filtered out.
  bool filter_profanity = 3;

  // *Optional* Context supplied to assist the speech recognition.
  repeated SpeechContext speech_contexts = 4;

  // *Optional* When `true`, punctuation is added to recognition result
  // hypotheses. The feature is only available in select languages; setting it
  // for requests in other languages has no effect at all. The default `false`
  // value does not add punctuation to result hypotheses. NOTE: "This is
  // currently offered as an experimental service, complimentary to all users.
  // In the future this may be exclusively available as a premium feature."
  bool enable_automatic_punctuation = 5;

  // *Optional* For file formats that support multiple audio tracks, such as
  // MXF or MKV, specify up to two tracks. Default: track 0.
  repeated int32 audio_tracks = 6;

  // *Optional* When `true`, enables speaker detection for each recognized
  // word in the top alternative of the recognition result, reported through
  // the speaker_tag provided in the WordInfo.
  // Note: when this is true, all the words from the beginning of the audio
  // are sent for the top alternative in every consecutive response. This is
  // done to improve the speaker tags as the models learn to identify the
  // speakers in the conversation over time.
  bool enable_speaker_diarization = 7;

  // *Optional* Estimated number of speakers in the conversation. Defaults to
  // `2` when not set. Ignored unless enable_speaker_diarization is set to
  // true.
  int32 diarization_speaker_count = 8;

  // *Optional* When `true`, the top result includes a list of words and the
  // confidence for those words. When `false`, no word-level confidence
  // information is returned. The default is `false`.
  bool enable_word_confidence = 9;
}
// Supplies "hints" that make the speech recognizer favor specific words and
// phrases in the results.
message SpeechContext {
  // *Optional* Words and phrases used as "hints" so that the speech
  // recognition is more likely to recognize them. Use this to improve the
  // accuracy for specific words and phrases (for example, when specific
  // commands are typically spoken by the user), or to add additional words to
  // the vocabulary of the recognizer. See
  // [usage limits](https://cloud.google.com/speech/limits#content).
  repeated string phrases = 1;
}
// Speech recognition result for one portion of the audio.
message SpeechTranscription {
  // May contain one or more recognition hypotheses, up to the maximum
  // specified in `max_alternatives`. The alternatives are ordered by
  // accuracy: the top (first) alternative is the most probable, as ranked by
  // the recognizer.
  repeated SpeechRecognitionAlternative alternatives = 1;

  // Output only. The
  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of
  // the language in this result, i.e. the language code detected as most
  // likely to have been spoken in the audio.
  string language_code = 2;
}
// Alternative hypotheses (also known as the n-best list).
message SpeechRecognitionAlternative {
  // Text of the transcript, representing the words that the user spoke.
  string transcript = 1;

  // NOTE(review): the comment below refers to `is_final=true` results, but no
  // `is_final` field is visible in the surrounding speech messages; confirm
  // whether that condition applies to this API.

  // Confidence estimate between 0.0 and 1.0. A higher number indicates an
  // estimated greater likelihood that the recognized words are correct. This
  // field is typically provided only for the top hypothesis, and only for
  // `is_final=true` results. Clients should not rely on the `confidence`
  // field, as it is not guaranteed to be accurate or consistent. The default
  // of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 2;

  // Word-specific information, one entry for each recognized word.
  repeated WordInfo words = 3;
}
// NOTE(review): the comments in this message cite `enable_word_time_offsets`,
// but SpeechTranscriptionConfig as visible here declares no field of that
// name; confirm which request setting actually governs the time offsets.

// Per-word details for recognized words. Word information is included in the
// response only when certain request parameters are set, such as
// `enable_word_time_offsets`.
message WordInfo {
  // Time offset, relative to the beginning of the audio, that corresponds to
  // the start of the spoken word. Set only if `enable_word_time_offsets=true`
  // and only in the top hypothesis. This is an experimental feature and the
  // accuracy of the time offset can vary.
  google.protobuf.Duration start_time = 1;

  // Time offset, relative to the beginning of the audio, that corresponds to
  // the end of the spoken word. Set only if `enable_word_time_offsets=true`
  // and only in the top hypothesis. This is an experimental feature and the
  // accuracy of the time offset can vary.
  google.protobuf.Duration end_time = 2;

  // The word that this set of information describes.
  string word = 3;

  // Output only. Confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. Set only for the top alternative. This field is not guaranteed
  // to be accurate, and users should not rely on it always being provided.
  // The default of 0.0 is a sentinel value indicating `confidence` was not
  // set.
  float confidence = 4;

  // Output only. Every speaker within the audio is assigned a distinct
  // integer value; this field identifies which of those speakers was detected
  // to have spoken this word. Values range from 1 up to
  // diarization_speaker_count, and the field is only set if speaker
  // diarization is enabled.
  int32 speaker_tag = 5;
}
// Video annotation feature.
enum Feature {
// Unspecified.
@ -353,6 +495,9 @@ enum Feature {
// Human face detection and tracking.
FACE_DETECTION = 4;
// Speech transcription.
SPEECH_TRANSCRIPTION = 6;
}
// Label detection mode.