Synchronize new proto/yaml changes.

PiperOrigin-RevId: 211138713
This commit is contained in:
Google APIs 2018-08-31 13:51:06 -07:00 committed by Copybara-Service
parent 04aa50dd5d
commit 098a5976de
2 changed files with 158 additions and 66 deletions

View File

@ -1,35 +1,44 @@
# Google Cloud Speech API service configuration
type: google.api.Service
config_version: 3
name: speech.googleapis.com
title: Google Cloud Speech API
documentation:
summary:
Google Cloud Speech API.
title: Cloud Speech API
apis:
- name: google.cloud.speech.v1.Speech
authentication:
documentation:
summary: Converts audio to text by applying powerful neural network models.
overview: |-
# Introduction
Google Cloud Speech API provides speech recognition as a service.
backend:
rules:
- selector: '*'
oauth:
canonical_scopes: https://www.googleapis.com/auth/cloud-platform
- selector: google.longrunning.Operations.GetOperation
deadline: 200.0
- selector: google.longrunning.Operations.WaitOperation
deadline: 200.0
- selector: google.cloud.speech.v1.Speech.Recognize
deadline: 200.0
- selector: google.cloud.speech.v1.Speech.LongRunningRecognize
deadline: 200.0
- selector: google.cloud.speech.v1.Speech.StreamingRecognize
deadline: 200.0
http:
rules:
- selector: google.longrunning.Operations.ListOperations
get: '/v1/operations'
- selector: google.longrunning.Operations.GetOperation
get: '/v1/operations/{name=*}'
additional_bindings:
- get: '/v1beta1/operations/{name=*}'
- selector: google.longrunning.Operations.DeleteOperation
delete: '/v1/operations/{name=*}'
- get: '/v1p1beta1/operations/{name=*}'
- selector: google.longrunning.Operations.CancelOperation
post: '/v1/operations/{name=*}:cancel'
body: '*'
authentication:
rules:
- selector: '*'
oauth:
canonical_scopes: |-
https://www.googleapis.com/auth/cloud-platform

View File

@ -1,4 +1,4 @@
// Copyright 2017 Google Inc.
// Copyright 2018 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -11,6 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
syntax = "proto3";
@ -20,6 +21,7 @@ import "google/api/annotations.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/any.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/empty.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";
@ -35,7 +37,10 @@ service Speech {
// Performs synchronous speech recognition: receive results after all audio
// has been sent and processed.
rpc Recognize(RecognizeRequest) returns (RecognizeResponse) {
option (google.api.http) = { post: "/v1/speech:recognize" body: "*" };
option (google.api.http) = {
post: "/v1/speech:recognize"
body: "*"
};
}
// Performs asynchronous speech recognition: receive results via the
@ -43,12 +48,16 @@ service Speech {
// `Operation.error` or an `Operation.response` which contains
// a `LongRunningRecognizeResponse` message.
rpc LongRunningRecognize(LongRunningRecognizeRequest) returns (google.longrunning.Operation) {
option (google.api.http) = { post: "/v1/speech:longrunningrecognize" body: "*" };
option (google.api.http) = {
post: "/v1/speech:longrunningrecognize"
body: "*"
};
}
// Performs bidirectional streaming speech recognition: receive results while
// sending audio. This method is only available via the gRPC API (not REST).
rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse);
rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse) {
}
}
// The top-level message sent by the client for the `Recognize` method.
@ -92,7 +101,7 @@ message StreamingRecognizeRequest {
// `audio_content` data. The audio bytes must be encoded as specified in
// `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a
// pure binary representation (not base64). See
// [audio limits](https://cloud.google.com/speech/limits#content).
// [content limits](/speech-to-text/quotas#content).
bytes audio_content = 2;
}
}
@ -127,24 +136,34 @@ message StreamingRecognitionConfig {
// Provides information to the recognizer that specifies how to process the
// request.
message RecognitionConfig {
// Audio encoding of the data sent in the audio message. All encodings support
// only 1 channel (mono) audio. Only `FLAC` and `WAV` include a header that
// describes the bytes of audio that follow the header. The other encodings
// are raw audio bytes with no header.
// The encoding of the audio data sent in the request.
//
// All encodings support only 1 channel (mono) audio.
//
// For best results, the audio source should be captured and transmitted using
// a lossless encoding (`FLAC` or `LINEAR16`). Recognition accuracy may be
// reduced if lossy codecs, which include the other codecs listed in
// this section, are used to capture or transmit the audio, particularly if
// background noise is present.
// a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
// recognition can be reduced if lossy codecs are used to capture or transmit
// audio, particularly if background noise is present. Lossy codecs include
// `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, and `SPEEX_WITH_HEADER_BYTE`.
//
// The `FLAC` and `WAV` audio file formats include a header that describes the
// included audio content. You can request recognition for `WAV` files that
// contain either `LINEAR16` or `MULAW` encoded audio.
// If you send `FLAC` or `WAV` audio file format in
// your request, you do not need to specify an `AudioEncoding`; the audio
// encoding format is determined from the file header. If you specify
// an `AudioEncoding` when you send `FLAC` or `WAV` audio, the
// encoding configuration must match the encoding described in the audio
// header; otherwise the request returns an
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error code.
enum AudioEncoding {
// Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
// Not specified.
ENCODING_UNSPECIFIED = 0;
// Uncompressed 16-bit signed little-endian samples (Linear PCM).
LINEAR16 = 1;
// [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
// `FLAC` (Free Lossless Audio
// Codec) is the recommended encoding because it is
// lossless--therefore recognition is not compromised--and
// requires only about half the bandwidth of `LINEAR16`. `FLAC` stream
@ -163,7 +182,7 @@ message RecognitionConfig {
// Opus encoded audio frames in Ogg container
// ([OggOpus](https://wiki.xiph.org/OggOpus)).
// `sample_rate_hertz` must be 16000.
// `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000, or 48000.
OGG_OPUS = 6;
// Although the use of lossy encodings is not recommended, if a very low
@ -182,20 +201,24 @@ message RecognitionConfig {
SPEEX_WITH_HEADER_BYTE = 7;
}
// *Required* Encoding of audio data sent in all `RecognitionAudio` messages.
// Encoding of audio data sent in all `RecognitionAudio` messages.
// This field is optional for `FLAC` and `WAV` audio files and required
// for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
AudioEncoding encoding = 1;
// *Required* Sample rate in Hertz of the audio data sent in all
// Sample rate in Hertz of the audio data sent in all
// `RecognitionAudio` messages. Valid values are: 8000-48000.
// 16000 is optimal. For best results, set the sampling rate of the audio
// source to 16000 Hz. If that's not possible, use the native sample rate of
// the audio source (instead of re-sampling).
// This field is optional for `FLAC` and `WAV` audio files and required
// for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
int32 sample_rate_hertz = 2;
// *Required* The language of the supplied audio as a
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
// Example: "en-US".
// See [Language Support](https://cloud.google.com/speech/docs/languages)
// See [Language Support](/speech-to-text/docs/languages)
// for a list of the currently supported language codes.
string language_code = 3;
@ -213,7 +236,9 @@ message RecognitionConfig {
// won't be filtered out.
bool profanity_filter = 5;
// *Optional* A means to provide context to assist the speech recognition.
// *Optional* array of [SpeechContext][google.cloud.speech.v1.SpeechContext].
// A means to provide context to assist the speech recognition. For more
// information, see [Phrase Hints](/speech-to-text/docs/basics#phrase-hints).
repeated SpeechContext speech_contexts = 6;
// *Optional* If `true`, the top result includes a list of words and
@ -221,6 +246,62 @@ message RecognitionConfig {
// `false`, no word-level time offset information is returned. The default is
// `false`.
bool enable_word_time_offsets = 8;
// *Optional* If 'true', adds punctuation to recognition result hypotheses.
// This feature is only available in select languages. Setting this for
// requests in other languages has no effect at all.
// The default 'false' value does not add punctuation to result hypotheses.
// Note: This is currently offered as an experimental service, complimentary
// to all users. In the future this may be exclusively available as a
// premium feature.
bool enable_automatic_punctuation = 11;
// *Optional* Which model to select for the given request. Select the model
// best suited to your domain to get best results. If a model is not
// explicitly specified, then we auto-select a model based on the parameters
// in the RecognitionConfig.
// <table>
// <tr>
// <td><b>Model</b></td>
// <td><b>Description</b></td>
// </tr>
// <tr>
// <td><code>command_and_search</code></td>
// <td>Best for short queries such as voice commands or voice search.</td>
// </tr>
// <tr>
// <td><code>phone_call</code></td>
// <td>Best for audio that originated from a phone call (typically
// recorded at an 8khz sampling rate).</td>
// </tr>
// <tr>
// <td><code>video</code></td>
// <td>Best for audio that originated from video or includes multiple
// speakers. Ideally the audio is recorded at a 16khz or greater
// sampling rate. This is a premium model that costs more than the
// standard rate.</td>
// </tr>
// <tr>
// <td><code>default</code></td>
// <td>Best for audio that is not one of the specific audio models.
// For example, long-form audio. Ideally the audio is high-fidelity,
// recorded at a 16khz or greater sampling rate.</td>
// </tr>
// </table>
string model = 13;
// *Optional* Set to true to use an enhanced model for speech recognition.
// You must also set the `model` field to a valid, enhanced model. If
// `use_enhanced` is set to true and the `model` field is not set, then
// `use_enhanced` is ignored. If `use_enhanced` is true and an enhanced
// version of the specified model does not exist, then the speech is
// recognized using the standard version of the specified model.
//
// Enhanced speech models require that you opt-in to data logging using
// instructions in the [documentation](/speech-to-text/enable-data-logging).
// If you set `use_enhanced` to true and you have not enabled audio logging,
// then you will receive an error.
bool use_enhanced = 14;
}
// Provides "hints" to the speech recognizer to favor specific words and phrases
@ -231,14 +312,14 @@ message SpeechContext {
// to improve the accuracy for specific words and phrases, for example, if
// specific commands are typically spoken by the user. This can also be used
// to add additional words to the vocabulary of the recognizer. See
// [usage limits](https://cloud.google.com/speech/limits#content).
// [usage limits](/speech-to-text/quotas#content).
repeated string phrases = 1;
}
// Contains audio data in the encoding specified in the `RecognitionConfig`.
// Either `content` or `uri` must be supplied. Supplying both or neither
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See
// [audio limits](https://cloud.google.com/speech/limits#content).
// [content limits](/speech-to-text/quotas#content).
message RecognitionAudio {
// The audio source, which is either inline content or a Google Cloud
// Storage uri.
@ -249,7 +330,8 @@ message RecognitionAudio {
bytes content = 1;
// URI that points to a file that contains audio data bytes as specified in
// `RecognitionConfig`. Currently, only Google Cloud Storage URIs are
// `RecognitionConfig`. The file must not be compressed (for example, gzip).
// Currently, only Google Cloud Storage URIs are
// supported, which must be specified in the following format:
// `gs://bucket_name/object_name` (other URI formats return
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
@ -262,7 +344,7 @@ message RecognitionAudio {
// contains the result as zero or more sequential `SpeechRecognitionResult`
// messages.
message RecognizeResponse {
// *Output-only* Sequential list of transcription results corresponding to
// Output only. Sequential list of transcription results corresponding to
// sequential portions of audio.
repeated SpeechRecognitionResult results = 2;
}
@ -273,7 +355,7 @@ message RecognizeResponse {
// returned by the `GetOperation` call of the `google::longrunning::Operations`
// service.
message LongRunningRecognizeResponse {
// *Output-only* Sequential list of transcription results corresponding to
// Output only. Sequential list of transcription results corresponding to
// sequential portions of audio.
repeated SpeechRecognitionResult results = 2;
}
@ -358,35 +440,37 @@ message StreamingRecognizeResponse {
END_OF_SINGLE_UTTERANCE = 1;
}
// *Output-only* If set, returns a [google.rpc.Status][google.rpc.Status] message that
// Output only. If set, returns a [google.rpc.Status][google.rpc.Status] message that
// specifies the error for the operation.
google.rpc.Status error = 1;
// *Output-only* This repeated list contains zero or more results that
// Output only. This repeated list contains zero or more results that
// correspond to consecutive portions of the audio currently being processed.
// It contains zero or more `is_final=false` results followed by zero or one
// `is_final=true` result (the newly settled portion).
// It contains zero or one `is_final=true` result (the newly settled portion),
// followed by zero or more `is_final=false` results (the interim results).
repeated StreamingRecognitionResult results = 2;
// *Output-only* Indicates the type of speech event.
// Output only. Indicates the type of speech event.
SpeechEventType speech_event_type = 4;
}
// A streaming speech recognition result corresponding to a portion of the audio
// that is currently being processed.
message StreamingRecognitionResult {
// *Output-only* May contain one or more recognition hypotheses (up to the
// Output only. May contain one or more recognition hypotheses (up to the
// maximum specified in `max_alternatives`).
// These alternatives are ordered in terms of accuracy, with the top (first)
// alternative being the most probable, as ranked by the recognizer.
repeated SpeechRecognitionAlternative alternatives = 1;
// *Output-only* If `false`, this `StreamingRecognitionResult` represents an
// Output only. If `false`, this `StreamingRecognitionResult` represents an
// interim result that may change. If `true`, this is the final time the
// speech service will return this particular `StreamingRecognitionResult`,
// the recognizer will not return any further hypotheses for this portion of
// the transcript and corresponding audio.
bool is_final = 2;
// *Output-only* An estimate of the likelihood that the recognizer will not
// Output only. An estimate of the likelihood that the recognizer will not
// change its guess about this interim result. Values range from 0.0
// (completely unstable) to 1.0 (completely stable).
// This field is only provided for interim results (`is_final=false`).
@ -396,7 +480,7 @@ message StreamingRecognitionResult {
// A speech recognition result corresponding to a portion of the audio.
message SpeechRecognitionResult {
// *Output-only* May contain one or more recognition hypotheses (up to the
// Output only. May contain one or more recognition hypotheses (up to the
// maximum specified in `max_alternatives`).
// These alternatives are ordered in terms of accuracy, with the top (first)
// alternative being the most probable, as ranked by the recognizer.
@ -405,26 +489,25 @@ message SpeechRecognitionResult {
// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
// *Output-only* Transcript text representing the words that the user spoke.
// Output only. Transcript text representing the words that the user spoke.
string transcript = 1;
// *Output-only* The confidence estimate between 0.0 and 1.0. A higher number
// Output only. The confidence estimate between 0.0 and 1.0. A higher number
// indicates an estimated greater likelihood that the recognized words are
// correct. This field is typically provided only for the top hypothesis, and
// only for `is_final=true` results. Clients should not rely on the
// `confidence` field as it is not guaranteed to be accurate or consistent.
// correct. This field is set only for the top alternative of a non-streaming
// result, or of a streaming result where `is_final=true`.
// This field is not guaranteed to be accurate and users should not rely on it
// to be always provided.
// The default of 0.0 is a sentinel value indicating `confidence` was not set.
float confidence = 2;
// *Output-only* A list of word-specific information for each recognized word.
// Output only. A list of word-specific information for each recognized word.
repeated WordInfo words = 3;
}
// Word-specific information for recognized words. Word information is only
// included in the response when certain request parameters are set, such
// as `enable_word_time_offsets`.
// Word-specific information for recognized words.
message WordInfo {
// *Output-only* Time offset relative to the beginning of the audio,
// Output only. Time offset relative to the beginning of the audio,
// and corresponding to the start of the spoken word.
// This field is only set if `enable_word_time_offsets=true` and only
// in the top hypothesis.
@ -432,7 +515,7 @@ message WordInfo {
// vary.
google.protobuf.Duration start_time = 1;
// *Output-only* Time offset relative to the beginning of the audio,
// Output only. Time offset relative to the beginning of the audio,
// and corresponding to the end of the spoken word.
// This field is only set if `enable_word_time_offsets=true` and only
// in the top hypothesis.
@ -440,6 +523,6 @@ message WordInfo {
// vary.
google.protobuf.Duration end_time = 2;
// *Output-only* The word corresponding to this set of information.
// Output only. The word corresponding to this set of information.
string word = 3;
}