Synchronize new proto/yaml changes.
PiperOrigin-RevId: 211138713
This commit is contained in:
parent
04aa50dd5d
commit
098a5976de
|
|
@ -1,35 +1,44 @@
|
|||
# Google Cloud Speech API service configuration
|
||||
|
||||
type: google.api.Service
|
||||
config_version: 3
|
||||
name: speech.googleapis.com
|
||||
|
||||
title: Google Cloud Speech API
|
||||
|
||||
documentation:
|
||||
summary:
|
||||
Google Cloud Speech API.
|
||||
title: Cloud Speech API
|
||||
|
||||
apis:
|
||||
- name: google.cloud.speech.v1.Speech
|
||||
|
||||
authentication:
|
||||
documentation:
|
||||
summary: Converts audio to text by applying powerful neural network models.
|
||||
overview: |-
|
||||
# Introduction
|
||||
|
||||
Google Cloud Speech API provides speech recognition as a service.
|
||||
|
||||
backend:
|
||||
rules:
|
||||
- selector: '*'
|
||||
oauth:
|
||||
canonical_scopes: https://www.googleapis.com/auth/cloud-platform
|
||||
- selector: google.longrunning.Operations.GetOperation
|
||||
deadline: 200.0
|
||||
- selector: google.longrunning.Operations.WaitOperation
|
||||
deadline: 200.0
|
||||
- selector: google.cloud.speech.v1.Speech.Recognize
|
||||
deadline: 200.0
|
||||
- selector: google.cloud.speech.v1.Speech.LongRunningRecognize
|
||||
deadline: 200.0
|
||||
- selector: google.cloud.speech.v1.Speech.StreamingRecognize
|
||||
deadline: 200.0
|
||||
|
||||
http:
|
||||
rules:
|
||||
- selector: google.longrunning.Operations.ListOperations
|
||||
get: '/v1/operations'
|
||||
|
||||
- selector: google.longrunning.Operations.GetOperation
|
||||
get: '/v1/operations/{name=*}'
|
||||
additional_bindings:
|
||||
- get: '/v1beta1/operations/{name=*}'
|
||||
|
||||
- selector: google.longrunning.Operations.DeleteOperation
|
||||
delete: '/v1/operations/{name=*}'
|
||||
- get: '/v1p1beta1/operations/{name=*}'
|
||||
|
||||
- selector: google.longrunning.Operations.CancelOperation
|
||||
post: '/v1/operations/{name=*}:cancel'
|
||||
body: '*'
|
||||
|
||||
authentication:
|
||||
rules:
|
||||
- selector: '*'
|
||||
oauth:
|
||||
canonical_scopes: |-
|
||||
https://www.googleapis.com/auth/cloud-platform
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
// Copyright 2017 Google Inc.
|
||||
// Copyright 2018 Google LLC.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
|
|
@ -11,6 +11,7 @@
|
|||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
|
|
@ -20,6 +21,7 @@ import "google/api/annotations.proto";
|
|||
import "google/longrunning/operations.proto";
|
||||
import "google/protobuf/any.proto";
|
||||
import "google/protobuf/duration.proto";
|
||||
import "google/protobuf/empty.proto";
|
||||
import "google/protobuf/timestamp.proto";
|
||||
import "google/rpc/status.proto";
|
||||
|
||||
|
|
@ -35,7 +37,10 @@ service Speech {
|
|||
// Performs synchronous speech recognition: receive results after all audio
|
||||
// has been sent and processed.
|
||||
rpc Recognize(RecognizeRequest) returns (RecognizeResponse) {
|
||||
option (google.api.http) = { post: "/v1/speech:recognize" body: "*" };
|
||||
option (google.api.http) = {
|
||||
post: "/v1/speech:recognize"
|
||||
body: "*"
|
||||
};
|
||||
}
|
||||
|
||||
// Performs asynchronous speech recognition: receive results via the
|
||||
|
|
@ -43,12 +48,16 @@ service Speech {
|
|||
// `Operation.error` or an `Operation.response` which contains
|
||||
// a `LongRunningRecognizeResponse` message.
|
||||
rpc LongRunningRecognize(LongRunningRecognizeRequest) returns (google.longrunning.Operation) {
|
||||
option (google.api.http) = { post: "/v1/speech:longrunningrecognize" body: "*" };
|
||||
option (google.api.http) = {
|
||||
post: "/v1/speech:longrunningrecognize"
|
||||
body: "*"
|
||||
};
|
||||
}
|
||||
|
||||
// Performs bidirectional streaming speech recognition: receive results while
|
||||
// sending audio. This method is only available via the gRPC API (not REST).
|
||||
rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse);
|
||||
rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse) {
|
||||
}
|
||||
}
|
||||
|
||||
// The top-level message sent by the client for the `Recognize` method.
|
||||
|
|
@ -92,7 +101,7 @@ message StreamingRecognizeRequest {
|
|||
// `audio_content` data. The audio bytes must be encoded as specified in
|
||||
// `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a
|
||||
// pure binary representation (not base64). See
|
||||
// [audio limits](https://cloud.google.com/speech/limits#content).
|
||||
// [content limits](/speech-to-text/quotas#content).
|
||||
bytes audio_content = 2;
|
||||
}
|
||||
}
|
||||
|
|
@ -127,24 +136,34 @@ message StreamingRecognitionConfig {
|
|||
// Provides information to the recognizer that specifies how to process the
|
||||
// request.
|
||||
message RecognitionConfig {
|
||||
// Audio encoding of the data sent in the audio message. All encodings support
|
||||
// only 1 channel (mono) audio. Only `FLAC` and `WAV` include a header that
|
||||
// describes the bytes of audio that follow the header. The other encodings
|
||||
// are raw audio bytes with no header.
|
||||
// The encoding of the audio data sent in the request.
|
||||
//
|
||||
// All encodings support only 1 channel (mono) audio.
|
||||
//
|
||||
// For best results, the audio source should be captured and transmitted using
|
||||
// a lossless encoding (`FLAC` or `LINEAR16`). Recognition accuracy may be
|
||||
// reduced if lossy codecs, which include the other codecs listed in
|
||||
// this section, are used to capture or transmit the audio, particularly if
|
||||
// background noise is present.
|
||||
// a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
|
||||
// recognition can be reduced if lossy codecs are used to capture or transmit
|
||||
// audio, particularly if background noise is present. Lossy codecs include
|
||||
// `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, and `SPEEX_WITH_HEADER_BYTE`.
|
||||
//
|
||||
// The `FLAC` and `WAV` audio file formats include a header that describes the
|
||||
// included audio content. You can request recognition for `WAV` files that
|
||||
// contain either `LINEAR16` or `MULAW` encoded audio.
|
||||
// If you send `FLAC` or `WAV` audio file format in
|
||||
// your request, you do not need to specify an `AudioEncoding`; the audio
|
||||
// encoding format is determined from the file header. If you specify
|
||||
// an `AudioEncoding` when you send `FLAC` or `WAV` audio, the
|
||||
// encoding configuration must match the encoding described in the audio
|
||||
// header; otherwise the request returns an
|
||||
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error code.
|
||||
enum AudioEncoding {
|
||||
// Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
|
||||
// Not specified.
|
||||
ENCODING_UNSPECIFIED = 0;
|
||||
|
||||
// Uncompressed 16-bit signed little-endian samples (Linear PCM).
|
||||
LINEAR16 = 1;
|
||||
|
||||
// [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
|
||||
// `FLAC` (Free Lossless Audio
|
||||
// Codec) is the recommended encoding because it is
|
||||
// lossless--therefore recognition is not compromised--and
|
||||
// requires only about half the bandwidth of `LINEAR16`. `FLAC` stream
|
||||
|
|
@ -163,7 +182,7 @@ message RecognitionConfig {
|
|||
|
||||
// Opus encoded audio frames in Ogg container
|
||||
// ([OggOpus](https://wiki.xiph.org/OggOpus)).
|
||||
// `sample_rate_hertz` must be 16000.
|
||||
// `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000, or 48000.
|
||||
OGG_OPUS = 6;
|
||||
|
||||
// Although the use of lossy encodings is not recommended, if a very low
|
||||
|
|
@ -182,20 +201,24 @@ message RecognitionConfig {
|
|||
SPEEX_WITH_HEADER_BYTE = 7;
|
||||
}
|
||||
|
||||
// *Required* Encoding of audio data sent in all `RecognitionAudio` messages.
|
||||
// Encoding of audio data sent in all `RecognitionAudio` messages.
|
||||
// This field is optional for `FLAC` and `WAV` audio files and required
|
||||
// for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
|
||||
AudioEncoding encoding = 1;
|
||||
|
||||
// *Required* Sample rate in Hertz of the audio data sent in all
|
||||
// Sample rate in Hertz of the audio data sent in all
|
||||
// `RecognitionAudio` messages. Valid values are: 8000-48000.
|
||||
// 16000 is optimal. For best results, set the sampling rate of the audio
|
||||
// source to 16000 Hz. If that's not possible, use the native sample rate of
|
||||
// the audio source (instead of re-sampling).
|
||||
// This field is optional for `FLAC` and `WAV` audio files and required
|
||||
// for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
|
||||
int32 sample_rate_hertz = 2;
|
||||
|
||||
// *Required* The language of the supplied audio as a
|
||||
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
|
||||
// Example: "en-US".
|
||||
// See [Language Support](https://cloud.google.com/speech/docs/languages)
|
||||
// See [Language Support](/speech-to-text/docs/languages)
|
||||
// for a list of the currently supported language codes.
|
||||
string language_code = 3;
|
||||
|
||||
|
|
@ -213,7 +236,9 @@ message RecognitionConfig {
|
|||
// won't be filtered out.
|
||||
bool profanity_filter = 5;
|
||||
|
||||
// *Optional* A means to provide context to assist the speech recognition.
|
||||
// *Optional* array of [SpeechContext][google.cloud.speech.v1.SpeechContext].
|
||||
// A means to provide context to assist the speech recognition. For more
|
||||
// information, see [Phrase Hints](/speech-to-text/docs/basics#phrase-hints).
|
||||
repeated SpeechContext speech_contexts = 6;
|
||||
|
||||
// *Optional* If `true`, the top result includes a list of words and
|
||||
|
|
@ -221,6 +246,62 @@ message RecognitionConfig {
|
|||
// `false`, no word-level time offset information is returned. The default is
|
||||
// `false`.
|
||||
bool enable_word_time_offsets = 8;
|
||||
|
||||
// *Optional* If 'true', adds punctuation to recognition result hypotheses.
|
||||
// This feature is only available in select languages. Setting this for
|
||||
// requests in other languages has no effect at all.
|
||||
// The default 'false' value does not add punctuation to result hypotheses.
|
||||
// Note: This is currently offered as an experimental service, complimentary
|
||||
// to all users. In the future this may be exclusively available as a
|
||||
// premium feature.
|
||||
bool enable_automatic_punctuation = 11;
|
||||
|
||||
// *Optional* Which model to select for the given request. Select the model
|
||||
// best suited to your domain to get best results. If a model is not
|
||||
// explicitly specified, then we auto-select a model based on the parameters
|
||||
// in the RecognitionConfig.
|
||||
// <table>
|
||||
// <tr>
|
||||
// <td><b>Model</b></td>
|
||||
// <td><b>Description</b></td>
|
||||
// </tr>
|
||||
// <tr>
|
||||
// <td><code>command_and_search</code></td>
|
||||
// <td>Best for short queries such as voice commands or voice search.</td>
|
||||
// </tr>
|
||||
// <tr>
|
||||
// <td><code>phone_call</code></td>
|
||||
// <td>Best for audio that originated from a phone call (typically
|
||||
// recorded at an 8khz sampling rate).</td>
|
||||
// </tr>
|
||||
// <tr>
|
||||
// <td><code>video</code></td>
|
||||
// <td>Best for audio that originated from video or includes multiple
|
||||
// speakers. Ideally the audio is recorded at a 16khz or greater
|
||||
// sampling rate. This is a premium model that costs more than the
|
||||
// standard rate.</td>
|
||||
// </tr>
|
||||
// <tr>
|
||||
// <td><code>default</code></td>
|
||||
// <td>Best for audio that is not one of the specific audio models.
|
||||
// For example, long-form audio. Ideally the audio is high-fidelity,
|
||||
// recorded at a 16khz or greater sampling rate.</td>
|
||||
// </tr>
|
||||
// </table>
|
||||
string model = 13;
|
||||
|
||||
// *Optional* Set to true to use an enhanced model for speech recognition.
|
||||
// You must also set the `model` field to a valid, enhanced model. If
|
||||
// `use_enhanced` is set to true and the `model` field is not set, then
|
||||
// `use_enhanced` is ignored. If `use_enhanced` is true and an enhanced
|
||||
// version of the specified model does not exist, then the speech is
|
||||
// recognized using the standard version of the specified model.
|
||||
//
|
||||
// Enhanced speech models require that you opt-in to data logging using
|
||||
// instructions in the [documentation](/speech-to-text/enable-data-logging).
|
||||
// If you set `use_enhanced` to true and you have not enabled audio logging,
|
||||
// then you will receive an error.
|
||||
bool use_enhanced = 14;
|
||||
}
|
||||
|
||||
// Provides "hints" to the speech recognizer to favor specific words and phrases
|
||||
|
|
@ -231,14 +312,14 @@ message SpeechContext {
|
|||
// to improve the accuracy for specific words and phrases, for example, if
|
||||
// specific commands are typically spoken by the user. This can also be used
|
||||
// to add additional words to the vocabulary of the recognizer. See
|
||||
// [usage limits](https://cloud.google.com/speech/limits#content).
|
||||
// [usage limits](/speech-to-text/quotas#content).
|
||||
repeated string phrases = 1;
|
||||
}
|
||||
|
||||
// Contains audio data in the encoding specified in the `RecognitionConfig`.
|
||||
// Either `content` or `uri` must be supplied. Supplying both or neither
|
||||
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See
|
||||
// [audio limits](https://cloud.google.com/speech/limits#content).
|
||||
// [content limits](/speech-to-text/quotas#content).
|
||||
message RecognitionAudio {
|
||||
// The audio source, which is either inline content or a Google Cloud
|
||||
// Storage uri.
|
||||
|
|
@ -249,7 +330,8 @@ message RecognitionAudio {
|
|||
bytes content = 1;
|
||||
|
||||
// URI that points to a file that contains audio data bytes as specified in
|
||||
// `RecognitionConfig`. Currently, only Google Cloud Storage URIs are
|
||||
// `RecognitionConfig`. The file must not be compressed (for example, gzip).
|
||||
// Currently, only Google Cloud Storage URIs are
|
||||
// supported, which must be specified in the following format:
|
||||
// `gs://bucket_name/object_name` (other URI formats return
|
||||
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
|
||||
|
|
@ -262,7 +344,7 @@ message RecognitionAudio {
|
|||
// contains the result as zero or more sequential `SpeechRecognitionResult`
|
||||
// messages.
|
||||
message RecognizeResponse {
|
||||
// *Output-only* Sequential list of transcription results corresponding to
|
||||
// Output only. Sequential list of transcription results corresponding to
|
||||
// sequential portions of audio.
|
||||
repeated SpeechRecognitionResult results = 2;
|
||||
}
|
||||
|
|
@ -273,7 +355,7 @@ message RecognizeResponse {
|
|||
// returned by the `GetOperation` call of the `google::longrunning::Operations`
|
||||
// service.
|
||||
message LongRunningRecognizeResponse {
|
||||
// *Output-only* Sequential list of transcription results corresponding to
|
||||
// Output only. Sequential list of transcription results corresponding to
|
||||
// sequential portions of audio.
|
||||
repeated SpeechRecognitionResult results = 2;
|
||||
}
|
||||
|
|
@ -358,35 +440,37 @@ message StreamingRecognizeResponse {
|
|||
END_OF_SINGLE_UTTERANCE = 1;
|
||||
}
|
||||
|
||||
// *Output-only* If set, returns a [google.rpc.Status][google.rpc.Status] message that
|
||||
// Output only. If set, returns a [google.rpc.Status][google.rpc.Status] message that
|
||||
// specifies the error for the operation.
|
||||
google.rpc.Status error = 1;
|
||||
|
||||
// *Output-only* This repeated list contains zero or more results that
|
||||
// Output only. This repeated list contains zero or more results that
|
||||
// correspond to consecutive portions of the audio currently being processed.
|
||||
// It contains zero or more `is_final=false` results followed by zero or one
|
||||
// `is_final=true` result (the newly settled portion).
|
||||
// It contains zero or one `is_final=true` result (the newly settled portion),
|
||||
// followed by zero or more `is_final=false` results (the interim results).
|
||||
repeated StreamingRecognitionResult results = 2;
|
||||
|
||||
// *Output-only* Indicates the type of speech event.
|
||||
// Output only. Indicates the type of speech event.
|
||||
SpeechEventType speech_event_type = 4;
|
||||
}
|
||||
|
||||
// A streaming speech recognition result corresponding to a portion of the audio
|
||||
// that is currently being processed.
|
||||
message StreamingRecognitionResult {
|
||||
// *Output-only* May contain one or more recognition hypotheses (up to the
|
||||
// Output only. May contain one or more recognition hypotheses (up to the
|
||||
// maximum specified in `max_alternatives`).
|
||||
// These alternatives are ordered in terms of accuracy, with the top (first)
|
||||
// alternative being the most probable, as ranked by the recognizer.
|
||||
repeated SpeechRecognitionAlternative alternatives = 1;
|
||||
|
||||
// *Output-only* If `false`, this `StreamingRecognitionResult` represents an
|
||||
// Output only. If `false`, this `StreamingRecognitionResult` represents an
|
||||
// interim result that may change. If `true`, this is the final time the
|
||||
// speech service will return this particular `StreamingRecognitionResult`,
|
||||
// the recognizer will not return any further hypotheses for this portion of
|
||||
// the transcript and corresponding audio.
|
||||
bool is_final = 2;
|
||||
|
||||
// *Output-only* An estimate of the likelihood that the recognizer will not
|
||||
// Output only. An estimate of the likelihood that the recognizer will not
|
||||
// change its guess about this interim result. Values range from 0.0
|
||||
// (completely unstable) to 1.0 (completely stable).
|
||||
// This field is only provided for interim results (`is_final=false`).
|
||||
|
|
@ -396,7 +480,7 @@ message StreamingRecognitionResult {
|
|||
|
||||
// A speech recognition result corresponding to a portion of the audio.
|
||||
message SpeechRecognitionResult {
|
||||
// *Output-only* May contain one or more recognition hypotheses (up to the
|
||||
// Output only. May contain one or more recognition hypotheses (up to the
|
||||
// maximum specified in `max_alternatives`).
|
||||
// These alternatives are ordered in terms of accuracy, with the top (first)
|
||||
// alternative being the most probable, as ranked by the recognizer.
|
||||
|
|
@ -405,26 +489,25 @@ message SpeechRecognitionResult {
|
|||
|
||||
// Alternative hypotheses (a.k.a. n-best list).
|
||||
message SpeechRecognitionAlternative {
|
||||
// *Output-only* Transcript text representing the words that the user spoke.
|
||||
// Output only. Transcript text representing the words that the user spoke.
|
||||
string transcript = 1;
|
||||
|
||||
// *Output-only* The confidence estimate between 0.0 and 1.0. A higher number
|
||||
// Output only. The confidence estimate between 0.0 and 1.0. A higher number
|
||||
// indicates an estimated greater likelihood that the recognized words are
|
||||
// correct. This field is typically provided only for the top hypothesis, and
|
||||
// only for `is_final=true` results. Clients should not rely on the
|
||||
// `confidence` field as it is not guaranteed to be accurate or consistent.
|
||||
// correct. This field is set only for the top alternative of a non-streaming
|
||||
// result or, of a streaming result where `is_final=true`.
|
||||
// This field is not guaranteed to be accurate and users should not rely on it
|
||||
// to be always provided.
|
||||
// The default of 0.0 is a sentinel value indicating `confidence` was not set.
|
||||
float confidence = 2;
|
||||
|
||||
// *Output-only* A list of word-specific information for each recognized word.
|
||||
// Output only. A list of word-specific information for each recognized word.
|
||||
repeated WordInfo words = 3;
|
||||
}
|
||||
|
||||
// Word-specific information for recognized words. Word information is only
|
||||
// included in the response when certain request parameters are set, such
|
||||
// as `enable_word_time_offsets`.
|
||||
// Word-specific information for recognized words.
|
||||
message WordInfo {
|
||||
// *Output-only* Time offset relative to the beginning of the audio,
|
||||
// Output only. Time offset relative to the beginning of the audio,
|
||||
// and corresponding to the start of the spoken word.
|
||||
// This field is only set if `enable_word_time_offsets=true` and only
|
||||
// in the top hypothesis.
|
||||
|
|
@ -432,7 +515,7 @@ message WordInfo {
|
|||
// vary.
|
||||
google.protobuf.Duration start_time = 1;
|
||||
|
||||
// *Output-only* Time offset relative to the beginning of the audio,
|
||||
// Output only. Time offset relative to the beginning of the audio,
|
||||
// and corresponding to the end of the spoken word.
|
||||
// This field is only set if `enable_word_time_offsets=true` and only
|
||||
// in the top hypothesis.
|
||||
|
|
@ -440,6 +523,6 @@ message WordInfo {
|
|||
// vary.
|
||||
google.protobuf.Duration end_time = 2;
|
||||
|
||||
// *Output-only* The word corresponding to this set of information.
|
||||
// Output only. The word corresponding to this set of information.
|
||||
string word = 3;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue