diff --git a/google/cloud/speech/speech_v1.yaml b/google/cloud/speech/speech_v1.yaml index 1ffe2072..e0dd453a 100644 --- a/google/cloud/speech/speech_v1.yaml +++ b/google/cloud/speech/speech_v1.yaml @@ -1,35 +1,44 @@ -# Google Cloud Speech API service configuration - type: google.api.Service config_version: 3 name: speech.googleapis.com - -title: Google Cloud Speech API - -documentation: - summary: - Google Cloud Speech API. +title: Cloud Speech API apis: - name: google.cloud.speech.v1.Speech -authentication: +documentation: + summary: Converts audio to text by applying powerful neural network models. + overview: |- + # Introduction + + Google Cloud Speech API provides speech recognition as a service. + +backend: rules: - - selector: '*' - oauth: - canonical_scopes: https://www.googleapis.com/auth/cloud-platform + - selector: google.longrunning.Operations.GetOperation + deadline: 200.0 + - selector: google.longrunning.Operations.WaitOperation + deadline: 200.0 + - selector: google.cloud.speech.v1.Speech.Recognize + deadline: 200.0 + - selector: google.cloud.speech.v1.Speech.LongRunningRecognize + deadline: 200.0 + - selector: google.cloud.speech.v1.Speech.StreamingRecognize + deadline: 200.0 http: rules: - - selector: google.longrunning.Operations.ListOperations - get: '/v1/operations' - - selector: google.longrunning.Operations.GetOperation get: '/v1/operations/{name=*}' + additional_bindings: + - get: '/v1beta1/operations/{name=*}' - - selector: google.longrunning.Operations.DeleteOperation - delete: '/v1/operations/{name=*}' + - get: '/v1p1beta1/operations/{name=*}' - - selector: google.longrunning.Operations.CancelOperation - post: '/v1/operations/{name=*}:cancel' - body: '*' + +authentication: + rules: + - selector: '*' + oauth: + canonical_scopes: |- + https://www.googleapis.com/auth/cloud-platform diff --git a/google/cloud/speech/v1/cloud_speech.proto b/google/cloud/speech/v1/cloud_speech.proto index b1bac7df..001d54b3 100644 --- 
a/google/cloud/speech/v1/cloud_speech.proto +++ b/google/cloud/speech/v1/cloud_speech.proto @@ -1,4 +1,4 @@ -// Copyright 2017 Google Inc. +// Copyright 2018 Google LLC. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +// syntax = "proto3"; @@ -20,6 +21,7 @@ import "google/api/annotations.proto"; import "google/longrunning/operations.proto"; import "google/protobuf/any.proto"; import "google/protobuf/duration.proto"; +import "google/protobuf/empty.proto"; import "google/protobuf/timestamp.proto"; import "google/rpc/status.proto"; @@ -35,7 +37,10 @@ service Speech { // Performs synchronous speech recognition: receive results after all audio // has been sent and processed. rpc Recognize(RecognizeRequest) returns (RecognizeResponse) { - option (google.api.http) = { post: "/v1/speech:recognize" body: "*" }; + option (google.api.http) = { + post: "/v1/speech:recognize" + body: "*" + }; } // Performs asynchronous speech recognition: receive results via the @@ -43,12 +48,16 @@ service Speech { // `Operation.error` or an `Operation.response` which contains // a `LongRunningRecognizeResponse` message. rpc LongRunningRecognize(LongRunningRecognizeRequest) returns (google.longrunning.Operation) { - option (google.api.http) = { post: "/v1/speech:longrunningrecognize" body: "*" }; + option (google.api.http) = { + post: "/v1/speech:longrunningrecognize" + body: "*" + }; } // Performs bidirectional streaming speech recognition: receive results while // sending audio. This method is only available via the gRPC API (not REST). 
- rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse); + rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse) { + } } // The top-level message sent by the client for the `Recognize` method. @@ -92,7 +101,7 @@ message StreamingRecognizeRequest { // `audio_content` data. The audio bytes must be encoded as specified in // `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a // pure binary representation (not base64). See - // [audio limits](https://cloud.google.com/speech/limits#content). + // [content limits](/speech-to-text/quotas#content). bytes audio_content = 2; } } @@ -127,24 +136,34 @@ message StreamingRecognitionConfig { // Provides information to the recognizer that specifies how to process the // request. message RecognitionConfig { - // Audio encoding of the data sent in the audio message. All encodings support - // only 1 channel (mono) audio. Only `FLAC` and `WAV` include a header that - // describes the bytes of audio that follow the header. The other encodings - // are raw audio bytes with no header. + // The encoding of the audio data sent in the request. + // + // All encodings support only 1 channel (mono) audio. // // For best results, the audio source should be captured and transmitted using - // a lossless encoding (`FLAC` or `LINEAR16`). Recognition accuracy may be - // reduced if lossy codecs, which include the other codecs listed in - // this section, are used to capture or transmit the audio, particularly if - // background noise is present. + // a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech + // recognition can be reduced if lossy codecs are used to capture or transmit + // audio, particularly if background noise is present. Lossy codecs include + // `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, and `SPEEX_WITH_HEADER_BYTE`. 
+ // + // The `FLAC` and `WAV` audio file formats include a header that describes the + // included audio content. You can request recognition for `WAV` files that + // contain either `LINEAR16` or `MULAW` encoded audio. + // If you send `FLAC` or `WAV` audio file format in + // your request, you do not need to specify an `AudioEncoding`; the audio + // encoding format is determined from the file header. If you specify + // an `AudioEncoding` when you send `FLAC` or `WAV` audio, the + // encoding configuration must match the encoding described in the audio + // header; otherwise the request returns an + // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error code. enum AudioEncoding { - // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. + // Not specified. ENCODING_UNSPECIFIED = 0; // Uncompressed 16-bit signed little-endian samples (Linear PCM). LINEAR16 = 1; - // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio + // `FLAC` (Free Lossless Audio // Codec) is the recommended encoding because it is // lossless--therefore recognition is not compromised--and // requires only about half the bandwidth of `LINEAR16`. `FLAC` stream @@ -163,7 +182,7 @@ message RecognitionConfig { // Opus encoded audio frames in Ogg container // ([OggOpus](https://wiki.xiph.org/OggOpus)). - // `sample_rate_hertz` must be 16000. + // `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000, or 48000. OGG_OPUS = 6; // Although the use of lossy encodings is not recommended, if a very low @@ -182,20 +201,24 @@ message RecognitionConfig { SPEEX_WITH_HEADER_BYTE = 7; } - // *Required* Encoding of audio data sent in all `RecognitionAudio` messages. + // Encoding of audio data sent in all `RecognitionAudio` messages. + // This field is optional for `FLAC` and `WAV` audio files and required + // for all other audio formats. 
For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding]. AudioEncoding encoding = 1; - // *Required* Sample rate in Hertz of the audio data sent in all + // Sample rate in Hertz of the audio data sent in all // `RecognitionAudio` messages. Valid values are: 8000-48000. // 16000 is optimal. For best results, set the sampling rate of the audio // source to 16000 Hz. If that's not possible, use the native sample rate of // the audio source (instead of re-sampling). + // This field is optional for `FLAC` and `WAV` audio files and required + // for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding]. int32 sample_rate_hertz = 2; // *Required* The language of the supplied audio as a // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. // Example: "en-US". - // See [Language Support](https://cloud.google.com/speech/docs/languages) + // See [Language Support](/speech-to-text/docs/languages) // for a list of the currently supported language codes. string language_code = 3; @@ -213,7 +236,9 @@ message RecognitionConfig { // won't be filtered out. bool profanity_filter = 5; - // *Optional* A means to provide context to assist the speech recognition. + // *Optional* array of [SpeechContext][google.cloud.speech.v1.SpeechContext]. + // A means to provide context to assist the speech recognition. For more + // information, see [Phrase Hints](/speech-to-text/docs/basics#phrase-hints). repeated SpeechContext speech_contexts = 6; // *Optional* If `true`, the top result includes a list of words and @@ -221,6 +246,62 @@ message RecognitionConfig { // `false`, no word-level time offset information is returned. The default is // `false`. bool enable_word_time_offsets = 8; + + // *Optional* If 'true', adds punctuation to recognition result hypotheses. + // This feature is only available in select languages. 
Setting this for + // requests in other languages has no effect at all. + // The default 'false' value does not add punctuation to result hypotheses. + // Note: This is currently offered as an experimental service, complimentary + // to all users. In the future this may be exclusively available as a + // premium feature. + bool enable_automatic_punctuation = 11; + + // *Optional* Which model to select for the given request. Select the model + // best suited to your domain to get best results. If a model is not + // explicitly specified, then we auto-select a model based on the parameters + // in the RecognitionConfig. + //
| Model | + //Description | + //
command_and_search |
+ // Best for short queries such as voice commands or voice search. | + //
phone_call |
+ // Best for audio that originated from a phone call (typically + // recorded at an 8khz sampling rate). | + //
video |
+ // Best for audio that originated from video or includes multiple + // speakers. Ideally the audio is recorded at a 16khz or greater + // sampling rate. This is a premium model that costs more than the + // standard rate. | + //
default |
+ // Best for audio that is not one of the specific audio models. + // For example, long-form audio. Ideally the audio is high-fidelity, + // recorded at a 16khz or greater sampling rate. | + //