diff --git a/google/cloud/speech/speech_v1.yaml b/google/cloud/speech/speech_v1.yaml
index 1ffe2072..e0dd453a 100644
--- a/google/cloud/speech/speech_v1.yaml
+++ b/google/cloud/speech/speech_v1.yaml
@@ -1,35 +1,44 @@
-# Google Cloud Speech API service configuration
-
 type: google.api.Service
 config_version: 3
 name: speech.googleapis.com
-
-title: Google Cloud Speech API
-
-documentation:
-  summary:
-    Google Cloud Speech API.
+title: Cloud Speech API
 
 apis:
 - name: google.cloud.speech.v1.Speech
 
-authentication:
+documentation:
+  summary: Converts audio to text by applying powerful neural network models.
+  overview: |-
+    # Introduction
+
+    Google Cloud Speech API provides speech recognition as a service.
+
+backend:
   rules:
-  - selector: '*'
-    oauth:
-      canonical_scopes: https://www.googleapis.com/auth/cloud-platform
+  - selector: google.longrunning.Operations.GetOperation
+    deadline: 200.0
+  - selector: google.longrunning.Operations.WaitOperation
+    deadline: 200.0
+  - selector: google.cloud.speech.v1.Speech.Recognize
+    deadline: 200.0
+  - selector: google.cloud.speech.v1.Speech.LongRunningRecognize
+    deadline: 200.0
+  - selector: google.cloud.speech.v1.Speech.StreamingRecognize
+    deadline: 200.0
 
 http:
   rules:
-  - selector: google.longrunning.Operations.ListOperations
-    get: '/v1/operations'
-
   - selector: google.longrunning.Operations.GetOperation
     get: '/v1/operations/{name=*}'
+    additional_bindings:
+    - get: '/v1beta1/operations/{name=*}'
-
-  - selector: google.longrunning.Operations.DeleteOperation
-    delete: '/v1/operations/{name=*}'
+    - get: '/v1p1beta1/operations/{name=*}'
-
-  - selector: google.longrunning.Operations.CancelOperation
-    post: '/v1/operations/{name=*}:cancel'
-    body: '*'
+
+authentication:
+  rules:
+  - selector: '*'
+    oauth:
+      canonical_scopes: |-
+        https://www.googleapis.com/auth/cloud-platform
diff --git a/google/cloud/speech/v1/cloud_speech.proto b/google/cloud/speech/v1/cloud_speech.proto
index b1bac7df..001d54b3 100644
--- a/google/cloud/speech/v1/cloud_speech.proto
+++ b/google/cloud/speech/v1/cloud_speech.proto
@@ -1,4 +1,4 @@
-// Copyright 2017 Google Inc.
+// Copyright 2018 Google LLC.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -11,6 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+//
 
 syntax = "proto3";
 
@@ -20,6 +21,7 @@ import "google/api/annotations.proto";
 import "google/longrunning/operations.proto";
 import "google/protobuf/any.proto";
 import "google/protobuf/duration.proto";
+import "google/protobuf/empty.proto";
 import "google/protobuf/timestamp.proto";
 import "google/rpc/status.proto";
 
@@ -35,7 +37,10 @@ service Speech {
   // Performs synchronous speech recognition: receive results after all audio
   // has been sent and processed.
   rpc Recognize(RecognizeRequest) returns (RecognizeResponse) {
-    option (google.api.http) = { post: "/v1/speech:recognize" body: "*" };
+    option (google.api.http) = {
+      post: "/v1/speech:recognize"
+      body: "*"
+    };
   }
 
   // Performs asynchronous speech recognition: receive results via the
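For reviewers, a usage sketch of the `Recognize` binding reshaped above. This is a minimal example assuming the `google-cloud-speech` Python client and application default credentials; the bucket/object URI is a placeholder:

```python
from google.cloud import speech_v1 as speech

client = speech.SpeechClient()

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
)
audio = speech.RecognitionAudio(uri="gs://bucket_name/object_name")

# Maps to POST /v1/speech:recognize; blocks until all audio is processed.
response = client.recognize(config=config, audio=audio)
for result in response.results:
    print(result.alternatives[0].transcript)
```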
@@ -43,12 +48,16 @@ service Speech {
   // `Operation.error` or an `Operation.response` which contains
   // a `LongRunningRecognizeResponse` message.
   rpc LongRunningRecognize(LongRunningRecognizeRequest) returns (google.longrunning.Operation) {
-    option (google.api.http) = { post: "/v1/speech:longrunningrecognize" body: "*" };
+    option (google.api.http) = {
+      post: "/v1/speech:longrunningrecognize"
+      body: "*"
+    };
   }
 
   // Performs bidirectional streaming speech recognition: receive results while
   // sending audio. This method is only available via the gRPC API (not REST).
-  rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse);
+  rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse) {
+  }
 }
 
 // The top-level message sent by the client for the `Recognize` method.
@@ -92,7 +101,7 @@ message StreamingRecognizeRequest {
     // `audio_content` data. The audio bytes must be encoded as specified in
     // `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a
     // pure binary representation (not base64). See
-    // [audio limits](https://cloud.google.com/speech/limits#content).
+    // [content limits](/speech-to-text/quotas#content).
     bytes audio_content = 2;
   }
 }
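The `LongRunningRecognize` path returns a `google.longrunning.Operation`, which the client library polls through the `GetOperation` binding configured in the YAML above. A sketch, again assuming the Python client:

```python
from google.cloud import speech_v1 as speech

client = speech.SpeechClient()
config = speech.RecognitionConfig(language_code="en-US")  # FLAC header supplies the rest
audio = speech.RecognitionAudio(uri="gs://bucket_name/audio.flac")

# Returns an Operation; result() polls until done or the timeout expires
# (200 s here, mirroring the backend deadline set in the service config).
operation = client.long_running_recognize(config=config, audio=audio)
response = operation.result(timeout=200)
for result in response.results:
    print(result.alternatives[0].transcript)
```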
@@ -127,24 +136,34 @@
 // Provides information to the recognizer that specifies how to process the
 // request.
 message RecognitionConfig {
-  // Audio encoding of the data sent in the audio message. All encodings support
-  // only 1 channel (mono) audio. Only `FLAC` and `WAV` include a header that
-  // describes the bytes of audio that follow the header. The other encodings
-  // are raw audio bytes with no header.
+  // The encoding of the audio data sent in the request.
+  //
+  // All encodings support only 1 channel (mono) audio.
   //
   // For best results, the audio source should be captured and transmitted using
-  // a lossless encoding (`FLAC` or `LINEAR16`). Recognition accuracy may be
-  // reduced if lossy codecs, which include the other codecs listed in
-  // this section, are used to capture or transmit the audio, particularly if
-  // background noise is present.
+  // a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
+  // recognition can be reduced if lossy codecs are used to capture or transmit
+  // audio, particularly if background noise is present. Lossy codecs include
+  // `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, and `SPEEX_WITH_HEADER_BYTE`.
+  //
+  // The `FLAC` and `WAV` audio file formats include a header that describes the
+  // included audio content. You can request recognition for `WAV` files that
+  // contain either `LINEAR16` or `MULAW` encoded audio.
+  // If you send `FLAC` or `WAV` audio in your
+  // request, you do not need to specify an `AudioEncoding`; the audio
+  // encoding format is determined from the file header. If you specify
+  // an `AudioEncoding` when you send `FLAC` or `WAV` audio, the
+  // encoding configuration must match the encoding described in the audio
+  // header; otherwise the request returns an
+  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error code.
   enum AudioEncoding {
-    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
+    // Not specified.
     ENCODING_UNSPECIFIED = 0;
 
     // Uncompressed 16-bit signed little-endian samples (Linear PCM).
     LINEAR16 = 1;
 
-    // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
+    // `FLAC` (Free Lossless Audio
     // Codec) is the recommended encoding because it is
     // lossless--therefore recognition is not compromised--and
     // requires only about half the bandwidth of `LINEAR16`. `FLAC` stream
@@ -163,7 +182,7 @@ message RecognitionConfig {
     // Opus encoded audio frames in Ogg container
     // ([OggOpus](https://wiki.xiph.org/OggOpus)).
-    // `sample_rate_hertz` must be 16000.
+    // `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000, or 48000.
     OGG_OPUS = 6;
 
     // Although the use of lossy encodings is not recommended, if a very low
@@ -182,20 +201,24 @@ message RecognitionConfig {
     SPEEX_WITH_HEADER_BYTE = 7;
   }
 
-  // *Required* Encoding of audio data sent in all `RecognitionAudio` messages.
+  // Encoding of audio data sent in all `RecognitionAudio` messages.
+  // This field is optional for `FLAC` and `WAV` audio files and required
+  // for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
   AudioEncoding encoding = 1;
 
-  // *Required* Sample rate in Hertz of the audio data sent in all
+  // Sample rate in Hertz of the audio data sent in all
   // `RecognitionAudio` messages. Valid values are: 8000-48000.
   // 16000 is optimal. For best results, set the sampling rate of the audio
   // source to 16000 Hz. If that's not possible, use the native sample rate of
   // the audio source (instead of re-sampling).
+  // This field is optional for `FLAC` and `WAV` audio files and required
+  // for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
   int32 sample_rate_hertz = 2;
 
   // *Required* The language of the supplied audio as a
   // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
   // Example: "en-US".
-  // See [Language Support](https://cloud.google.com/speech/docs/languages)
+  // See [Language Support](/speech-to-text/docs/languages)
   // for a list of the currently supported language codes.
   string language_code = 3;
@@ -213,7 +236,9 @@ message RecognitionConfig {
   // won't be filtered out.
   bool profanity_filter = 5;
 
-  // *Optional* A means to provide context to assist the speech recognition.
+  // *Optional* array of [SpeechContext][google.cloud.speech.v1.SpeechContext].
+  // A means to provide context to assist the speech recognition. For more
+  // information, see [Phrase Hints](/speech-to-text/docs/basics#phrase-hints).
   repeated SpeechContext speech_contexts = 6;
 
   // *Optional* If `true`, the top result includes a list of words and
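A sketch of how the `speech_contexts` field surfaces in the assumed Python client (the phrases are illustrative). Note that `encoding` and `sample_rate_hertz` are omitted because, per the updated comments above, a `FLAC` header already carries them:

```python
from google.cloud import speech_v1 as speech

# Bias recognition toward expected phrases via SpeechContext "hints".
config = speech.RecognitionConfig(
    language_code="en-US",
    speech_contexts=[speech.SpeechContext(phrases=["weather", "whether"])],
)
```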
@@ -221,6 +246,62 @@ message RecognitionConfig {
   // `false`, no word-level time offset information is returned. The default is
   // `false`.
   bool enable_word_time_offsets = 8;
+
+  // *Optional* If 'true', adds punctuation to recognition result hypotheses.
+  // This feature is only available in select languages. Setting this for
+  // requests in other languages has no effect at all.
+  // The default 'false' value does not add punctuation to result hypotheses.
+  // Note: This is currently offered as an experimental service, complimentary
+  // to all users. In the future this may be exclusively available as a
+  // premium feature.
+  bool enable_automatic_punctuation = 11;
+
+  // *Optional* Which model to select for the given request. Select the model
+  // best suited to your domain to get best results. If a model is not
+  // explicitly specified, then we auto-select a model based on the parameters
+  // in the RecognitionConfig.
+  // <table>
+  //   <tr>
+  //     <td><b>Model</b></td>
+  //     <td><b>Description</b></td>
+  //   </tr>
+  //   <tr>
+  //     <td><code>command_and_search</code></td>
+  //     <td>Best for short queries such as voice commands or voice search.</td>
+  //   </tr>
+  //   <tr>
+  //     <td><code>phone_call</code></td>
+  //     <td>Best for audio that originated from a phone call (typically
+  //     recorded at an 8khz sampling rate).</td>
+  //   </tr>
+  //   <tr>
+  //     <td><code>video</code></td>
+  //     <td>Best for audio that originated from video or includes multiple
+  //         speakers. Ideally the audio is recorded at a 16khz or greater
+  //         sampling rate. This is a premium model that costs more than the
+  //         standard rate.</td>
+  //   </tr>
+  //   <tr>
+  //     <td><code>default</code></td>
+  //     <td>Best for audio that is not one of the specific audio models.
+  //         For example, long-form audio. Ideally the audio is high-fidelity,
+  //         recorded at a 16khz or greater sampling rate.</td>
+  //   </tr>
+  // </table>
+  string model = 13;
+
+  // *Optional* Set to true to use an enhanced model for speech recognition.
+  // You must also set the `model` field to a valid, enhanced model. If
+  // `use_enhanced` is set to true and the `model` field is not set, then
+  // `use_enhanced` is ignored. If `use_enhanced` is true and an enhanced
+  // version of the specified model does not exist, then the speech is
+  // recognized using the standard version of the specified model.
+  //
+  // Enhanced speech models require that you opt in to data logging using
+  // instructions in the [documentation](/speech-to-text/enable-data-logging).
+  // If you set `use_enhanced` to true and you have not enabled audio logging,
+  // then you will receive an error.
+  bool use_enhanced = 14;
 }
 
 // Provides "hints" to the speech recognizer to favor specific words and phrases
@@ -231,14 +312,14 @@ message SpeechContext {
   // to improve the accuracy for specific words and phrases, for example, if
   // specific commands are typically spoken by the user. This can also be used
   // to add additional words to the vocabulary of the recognizer. See
-  // [usage limits](https://cloud.google.com/speech/limits#content).
+  // [usage limits](/speech-to-text/quotas#content).
   repeated string phrases = 1;
 }
 
 // Contains audio data in the encoding specified in the `RecognitionConfig`.
 // Either `content` or `uri` must be supplied. Supplying both or neither
 // returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See
-// [audio limits](https://cloud.google.com/speech/limits#content).
+// [content limits](/speech-to-text/quotas#content).
 message RecognitionAudio {
   // The audio source, which is either inline content or a Google Cloud
   // Storage uri.
@@ -249,7 +330,8 @@ message RecognitionAudio {
     bytes content = 1;
 
     // URI that points to a file that contains audio data bytes as specified in
-    // `RecognitionConfig`. Currently, only Google Cloud Storage URIs are
+    // `RecognitionConfig`. The file must not be compressed (for example, gzip).
+    // Currently, only Google Cloud Storage URIs are
     // supported, which must be specified in the following format:
     // `gs://bucket_name/object_name` (other URI formats return
     // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
@@ -262,7 +344,7 @@ message RecognitionAudio {
 // contains the result as zero or more sequential `SpeechRecognitionResult`
 // messages.
 message RecognizeResponse {
-  // *Output-only* Sequential list of transcription results corresponding to
+  // Output only. Sequential list of transcription results corresponding to
   // sequential portions of audio.
   repeated SpeechRecognitionResult results = 2;
 }
@@ -273,7 +355,7 @@ message RecognizeResponse {
 // returned by the `GetOperation` call of the `google::longrunning::Operations`
 // service.
 message LongRunningRecognizeResponse {
-  // *Output-only* Sequential list of transcription results corresponding to
+  // Output only. Sequential list of transcription results corresponding to
   // sequential portions of audio.
   repeated SpeechRecognitionResult results = 2;
 }
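A sketch of the two new `RecognitionConfig` fields added above (`model` and `use_enhanced`), again via the assumed Python client:

```python
from google.cloud import speech_v1 as speech

# Request the premium "phone_call" model. Per the comments above,
# use_enhanced is ignored unless `model` is also set, and enhanced models
# require opting in to data logging.
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.MULAW,
    sample_rate_hertz=8000,
    language_code="en-US",
    model="phone_call",
    use_enhanced=True,
)
```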
@@ -358,35 +440,37 @@ message StreamingRecognizeResponse {
     END_OF_SINGLE_UTTERANCE = 1;
   }
 
-  // *Output-only* If set, returns a [google.rpc.Status][google.rpc.Status] message that
+  // Output only. If set, returns a [google.rpc.Status][google.rpc.Status] message that
   // specifies the error for the operation.
   google.rpc.Status error = 1;
 
-  // *Output-only* This repeated list contains zero or more results that
+  // Output only. This repeated list contains zero or more results that
   // correspond to consecutive portions of the audio currently being processed.
-  // It contains zero or more `is_final=false` results followed by zero or one
-  // `is_final=true` result (the newly settled portion).
+  // It contains zero or one `is_final=true` result (the newly settled portion),
+  // followed by zero or more `is_final=false` results (the interim results).
   repeated StreamingRecognitionResult results = 2;
 
-  // *Output-only* Indicates the type of speech event.
+  // Output only. Indicates the type of speech event.
   SpeechEventType speech_event_type = 4;
 }
 
 // A streaming speech recognition result corresponding to a portion of the audio
 // that is currently being processed.
 message StreamingRecognitionResult {
-  // *Output-only* May contain one or more recognition hypotheses (up to the
+  // Output only. May contain one or more recognition hypotheses (up to the
   // maximum specified in `max_alternatives`).
+  // These alternatives are ordered in terms of accuracy, with the top (first)
+  // alternative being the most probable, as ranked by the recognizer.
   repeated SpeechRecognitionAlternative alternatives = 1;
 
-  // *Output-only* If `false`, this `StreamingRecognitionResult` represents an
+  // Output only. If `false`, this `StreamingRecognitionResult` represents an
   // interim result that may change. If `true`, this is the final time the
   // speech service will return this particular `StreamingRecognitionResult`,
   // the recognizer will not return any further hypotheses for this portion of
   // the transcript and corresponding audio.
   bool is_final = 2;
 
-  // *Output-only* An estimate of the likelihood that the recognizer will not
+  // Output only. An estimate of the likelihood that the recognizer will not
   // change its guess about this interim result. Values range from 0.0
   // (completely unstable) to 1.0 (completely stable).
   // This field is only provided for interim results (`is_final=false`).
@@ -396,7 +480,7 @@ message StreamingRecognitionResult {
 
 // A speech recognition result corresponding to a portion of the audio.
 message SpeechRecognitionResult {
-  // *Output-only* May contain one or more recognition hypotheses (up to the
+  // Output only. May contain one or more recognition hypotheses (up to the
   // maximum specified in `max_alternatives`).
   // These alternatives are ordered in terms of accuracy, with the top (first)
   // alternative being the most probable, as ranked by the recognizer.
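The interim/final semantics documented above are easiest to see in client code. A streaming sketch (gRPC only), where `audio_chunks` is a placeholder for any iterable of raw audio byte strings:

```python
from google.cloud import speech_v1 as speech

client = speech.SpeechClient()
streaming_config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    ),
    interim_results=True,
)

def requests(chunks):
    # The first request carries only the config; later ones carry audio bytes.
    yield speech.StreamingRecognizeRequest(streaming_config=streaming_config)
    for chunk in chunks:
        yield speech.StreamingRecognizeRequest(audio_content=chunk)

# audio_chunks is assumed to be defined elsewhere (e.g. microphone buffers).
for response in client.streaming_recognize(requests=requests(audio_chunks)):
    for result in response.results:
        tag = "final" if result.is_final else f"interim (stability={result.stability:.2f})"
        print(tag, result.alternatives[0].transcript)
```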
@@ -405,26 +489,25 @@ message SpeechRecognitionResult {
 
 // Alternative hypotheses (a.k.a. n-best list).
 message SpeechRecognitionAlternative {
-  // *Output-only* Transcript text representing the words that the user spoke.
+  // Output only. Transcript text representing the words that the user spoke.
   string transcript = 1;
 
-  // *Output-only* The confidence estimate between 0.0 and 1.0. A higher number
+  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
   // indicates an estimated greater likelihood that the recognized words are
-  // correct. This field is typically provided only for the top hypothesis, and
-  // only for `is_final=true` results. Clients should not rely on the
-  // `confidence` field as it is not guaranteed to be accurate or consistent.
+  // correct. This field is set only for the top alternative of a non-streaming
+  // result or of a streaming result where `is_final=true`.
+  // This field is not guaranteed to be accurate and users should not rely on it
+  // to be always provided.
   // The default of 0.0 is a sentinel value indicating `confidence` was not set.
   float confidence = 2;
 
-  // *Output-only* A list of word-specific information for each recognized word.
+  // Output only. A list of word-specific information for each recognized word.
   repeated WordInfo words = 3;
 }
 
-// Word-specific information for recognized words. Word information is only
-// included in the response when certain request parameters are set, such
-// as `enable_word_time_offsets`.
+// Word-specific information for recognized words.
 message WordInfo {
-  // *Output-only* Time offset relative to the beginning of the audio,
+  // Output only. Time offset relative to the beginning of the audio,
   // and corresponding to the start of the spoken word.
   // This field is only set if `enable_word_time_offsets=true` and only
   // in the top hypothesis.
@@ -432,7 +515,7 @@ message WordInfo {
   // vary.
   google.protobuf.Duration start_time = 1;
 
-  // *Output-only* Time offset relative to the beginning of the audio,
+  // Output only. Time offset relative to the beginning of the audio,
   // and corresponding to the end of the spoken word.
   // This field is only set if `enable_word_time_offsets=true` and only
   // in the top hypothesis.
@@ -440,6 +523,6 @@ message WordInfo {
   // vary.
   google.protobuf.Duration end_time = 2;
 
-  // *Output-only* The word corresponding to this set of information.
+  // Output only. The word corresponding to this set of information.
   string word = 3;
 }
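Finally, a sketch showing how `enable_word_time_offsets` populates `WordInfo` in the top hypothesis; the printed values are `Duration`s relative to the start of the audio:

```python
from google.cloud import speech_v1 as speech

client = speech.SpeechClient()
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
    enable_word_time_offsets=True,
)
audio = speech.RecognitionAudio(uri="gs://bucket_name/object_name")

response = client.recognize(config=config, audio=audio)
for result in response.results:
    for info in result.alternatives[0].words:  # WordInfo, top hypothesis only
        print(f"{info.word}: {info.start_time} -> {info.end_time}")
```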