Synchronize new proto/yaml changes.
PiperOrigin-RevId: 211138713
This commit is contained in:
parent
04aa50dd5d
commit
098a5976de
|
|
@ -1,35 +1,44 @@
|
|||
# Google Cloud Speech API service configuration
|
||||
|
||||
type: google.api.Service
|
||||
config_version: 3
|
||||
name: speech.googleapis.com
|
||||
|
||||
title: Google Cloud Speech API
|
||||
|
||||
documentation:
|
||||
summary:
|
||||
Google Cloud Speech API.
|
||||
title: Cloud Speech API
|
||||
|
||||
apis:
|
||||
- name: google.cloud.speech.v1.Speech
|
||||
|
||||
authentication:
|
||||
documentation:
|
||||
summary: Converts audio to text by applying powerful neural network models.
|
||||
overview: |-
|
||||
# Introduction
|
||||
|
||||
Google Cloud Speech API provides speech recognition as a service.
|
||||
|
||||
backend:
|
||||
rules:
|
||||
- selector: '*'
|
||||
oauth:
|
||||
canonical_scopes: https://www.googleapis.com/auth/cloud-platform
|
||||
- selector: google.longrunning.Operations.GetOperation
|
||||
deadline: 200.0
|
||||
- selector: google.longrunning.Operations.WaitOperation
|
||||
deadline: 200.0
|
||||
- selector: google.cloud.speech.v1.Speech.Recognize
|
||||
deadline: 200.0
|
||||
- selector: google.cloud.speech.v1.Speech.LongRunningRecognize
|
||||
deadline: 200.0
|
||||
- selector: google.cloud.speech.v1.Speech.StreamingRecognize
|
||||
deadline: 200.0
|
||||
|
||||
http:
|
||||
rules:
|
||||
- selector: google.longrunning.Operations.ListOperations
|
||||
get: '/v1/operations'
|
||||
|
||||
- selector: google.longrunning.Operations.GetOperation
|
||||
get: '/v1/operations/{name=*}'
|
||||
additional_bindings:
|
||||
- get: '/v1beta1/operations/{name=*}'
|
||||
|
||||
- selector: google.longrunning.Operations.DeleteOperation
|
||||
delete: '/v1/operations/{name=*}'
|
||||
- get: '/v1p1beta1/operations/{name=*}'
|
||||
|
||||
- selector: google.longrunning.Operations.CancelOperation
|
||||
post: '/v1/operations/{name=*}:cancel'
|
||||
body: '*'
|
||||
|
||||
authentication:
|
||||
rules:
|
||||
- selector: '*'
|
||||
oauth:
|
||||
canonical_scopes: |-
|
||||
https://www.googleapis.com/auth/cloud-platform
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
// Copyright 2017 Google Inc.
|
||||
// Copyright 2018 Google LLC.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
|
|
@ -11,6 +11,7 @@
|
|||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
|
|
@ -20,6 +21,7 @@ import "google/api/annotations.proto";
|
|||
import "google/longrunning/operations.proto";
|
||||
import "google/protobuf/any.proto";
|
||||
import "google/protobuf/duration.proto";
|
||||
import "google/protobuf/empty.proto";
|
||||
import "google/protobuf/timestamp.proto";
|
||||
import "google/rpc/status.proto";
|
||||
|
||||
|
|
@ -35,7 +37,10 @@ service Speech {
|
|||
// Performs synchronous speech recognition: receive results after all audio
|
||||
// has been sent and processed.
|
||||
rpc Recognize(RecognizeRequest) returns (RecognizeResponse) {
|
||||
option (google.api.http) = { post: "/v1/speech:recognize" body: "*" };
|
||||
option (google.api.http) = {
|
||||
post: "/v1/speech:recognize"
|
||||
body: "*"
|
||||
};
|
||||
}
|
||||
|
||||
// Performs asynchronous speech recognition: receive results via the
|
||||
|
|
@ -43,12 +48,16 @@ service Speech {
|
|||
// `Operation.error` or an `Operation.response` which contains
|
||||
// a `LongRunningRecognizeResponse` message.
|
||||
rpc LongRunningRecognize(LongRunningRecognizeRequest) returns (google.longrunning.Operation) {
|
||||
option (google.api.http) = { post: "/v1/speech:longrunningrecognize" body: "*" };
|
||||
option (google.api.http) = {
|
||||
post: "/v1/speech:longrunningrecognize"
|
||||
body: "*"
|
||||
};
|
||||
}
|
||||
|
||||
// Performs bidirectional streaming speech recognition: receive results while
|
||||
// sending audio. This method is only available via the gRPC API (not REST).
|
||||
rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse);
|
||||
rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse) {
|
||||
}
|
||||
}
|
||||
|
||||
// The top-level message sent by the client for the `Recognize` method.
|
||||
|
|
@ -92,7 +101,7 @@ message StreamingRecognizeRequest {
|
|||
// `audio_content` data. The audio bytes must be encoded as specified in
|
||||
// `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a
|
||||
// pure binary representation (not base64). See
|
||||
// [audio limits](https://cloud.google.com/speech/limits#content).
|
||||
// [content limits](/speech-to-text/quotas#content).
|
||||
bytes audio_content = 2;
|
||||
}
|
||||
}
|
||||
|
|
@ -127,24 +136,34 @@ message StreamingRecognitionConfig {
|
|||
// Provides information to the recognizer that specifies how to process the
|
||||
// request.
|
||||
message RecognitionConfig {
|
||||
// Audio encoding of the data sent in the audio message. All encodings support
|
||||
// only 1 channel (mono) audio. Only `FLAC` and `WAV` include a header that
|
||||
// describes the bytes of audio that follow the header. The other encodings
|
||||
// are raw audio bytes with no header.
|
||||
// The encoding of the audio data sent in the request.
|
||||
//
|
||||
// All encodings support only 1 channel (mono) audio.
|
||||
//
|
||||
// For best results, the audio source should be captured and transmitted using
|
||||
// a lossless encoding (`FLAC` or `LINEAR16`). Recognition accuracy may be
|
||||
// reduced if lossy codecs, which include the other codecs listed in
|
||||
// this section, are used to capture or transmit the audio, particularly if
|
||||
// background noise is present.
|
||||
// a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
|
||||
// recognition can be reduced if lossy codecs are used to capture or transmit
|
||||
// audio, particularly if background noise is present. Lossy codecs include
|
||||
// `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, and `SPEEX_WITH_HEADER_BYTE`.
|
||||
//
|
||||
// The `FLAC` and `WAV` audio file formats include a header that describes the
|
||||
// included audio content. You can request recognition for `WAV` files that
|
||||
// contain either `LINEAR16` or `MULAW` encoded audio.
|
||||
// If you send `FLAC` or `WAV` audio file format in
|
||||
// your request, you do not need to specify an `AudioEncoding`; the audio
|
||||
// encoding format is determined from the file header. If you specify
|
||||
// an `AudioEncoding` when you send `FLAC` or `WAV` audio, the
|
||||
// encoding configuration must match the encoding described in the audio
|
||||
// header; otherwise the request returns an
|
||||
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error code.
|
||||
enum AudioEncoding {
|
||||
// Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
|
||||
// Not specified.
|
||||
ENCODING_UNSPECIFIED = 0;
|
||||
|
||||
// Uncompressed 16-bit signed little-endian samples (Linear PCM).
|
||||
LINEAR16 = 1;
|
||||
|
||||
// [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
|
||||
// `FLAC` (Free Lossless Audio
|
||||
// Codec) is the recommended encoding because it is
|
||||
// lossless--therefore recognition is not compromised--and
|
||||
// requires only about half the bandwidth of `LINEAR16`. `FLAC` stream
|
||||
|
|
@ -163,7 +182,7 @@ message RecognitionConfig {
|
|||
|
||||
// Opus encoded audio frames in Ogg container
|
||||
// ([OggOpus](https://wiki.xiph.org/OggOpus)).
|
||||
// `sample_rate_hertz` must be 16000.
|
||||
// `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000, or 48000.
|
||||
OGG_OPUS = 6;
|
||||
|
||||
// Although the use of lossy encodings is not recommended, if a very low
|
||||
|
|
@ -182,20 +201,24 @@ message RecognitionConfig {
|
|||
SPEEX_WITH_HEADER_BYTE = 7;
|
||||
}
|
||||
|
||||
// *Required* Encoding of audio data sent in all `RecognitionAudio` messages.
|
||||
// Encoding of audio data sent in all `RecognitionAudio` messages.
|
||||
// This field is optional for `FLAC` and `WAV` audio files and required
|
||||
// for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
|
||||
AudioEncoding encoding = 1;
|
||||
|
||||
// *Required* Sample rate in Hertz of the audio data sent in all
|
||||
// Sample rate in Hertz of the audio data sent in all
|
||||
// `RecognitionAudio` messages. Valid values are: 8000-48000.
|
||||
// 16000 is optimal. For best results, set the sampling rate of the audio
|
||||
// source to 16000 Hz. If that's not possible, use the native sample rate of
|
||||
// the audio source (instead of re-sampling).
|
||||
// This field is optional for `FLAC` and `WAV` audio files and required
|
||||
// for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
|
||||
int32 sample_rate_hertz = 2;
|
||||
|
||||
// *Required* The language of the supplied audio as a
|
||||
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
|
||||
// Example: "en-US".
|
||||
// See [Language Support](https://cloud.google.com/speech/docs/languages)
|
||||
// See [Language Support](/speech-to-text/docs/languages)
|
||||
// for a list of the currently supported language codes.
|
||||
string language_code = 3;
|
||||
|
||||
|
|
@ -213,7 +236,9 @@ message RecognitionConfig {
|
|||
// won't be filtered out.
|
||||
bool profanity_filter = 5;
|
||||
|
||||
// *Optional* A means to provide context to assist the speech recognition.
|
||||
// *Optional* array of [SpeechContext][google.cloud.speech.v1.SpeechContext].
|
||||
// A means to provide context to assist the speech recognition. For more
|
||||
// information, see [Phrase Hints](/speech-to-text/docs/basics#phrase-hints).
|
||||
repeated SpeechContext speech_contexts = 6;
|
||||
|
||||
// *Optional* If `true`, the top result includes a list of words and
|
||||
|
|
@ -221,6 +246,62 @@ message RecognitionConfig {
|
|||
// `false`, no word-level time offset information is returned. The default is
|
||||
// `false`.
|
||||
bool enable_word_time_offsets = 8;
|
||||
|
||||
// *Optional* If 'true', adds punctuation to recognition result hypotheses.
|
||||
// This feature is only available in select languages. Setting this for
|
||||
// requests in other languages has no effect at all.
|
||||
// The default 'false' value does not add punctuation to result hypotheses.
|
||||
// Note: This is currently offered as an experimental service, complimentary
|
||||
// to all users. In the future this may be exclusively available as a
|
||||
// premium feature.
|
||||
bool enable_automatic_punctuation = 11;
|
||||
|
||||
// *Optional* Which model to select for the given request. Select the model
|
||||
// best suited to your domain to get best results. If a model is not
|
||||
// explicitly specified, then we auto-select a model based on the parameters
|
||||
// in the RecognitionConfig.
|
||||
// <table>
|
||||
// <tr>
|
||||
// <td><b>Model</b></td>
|
||||
// <td><b>Description</b></td>
|
||||
// </tr>
|
||||
// <tr>
|
||||
// <td><code>command_and_search</code></td>
|
||||
// <td>Best for short queries such as voice commands or voice search.</td>
|
||||
// </tr>
|
||||
// <tr>
|
||||
// <td><code>phone_call</code></td>
|
||||
// <td>Best for audio that originated from a phone call (typically
|
||||
// recorded at an 8khz sampling rate).</td>
|
||||
// </tr>
|
||||
// <tr>
|
||||
// <td><code>video</code></td>
|
||||
// <td>Best for audio that originated from video or includes multiple
|
||||
// speakers. Ideally the audio is recorded at a 16khz or greater
|
||||
// sampling rate. This is a premium model that costs more than the
|
||||
// standard rate.</td>
|
||||
// </tr>
|
||||
// <tr>
|
||||
// <td><code>default</code></td>
|
||||
// <td>Best for audio that is not one of the specific audio models.
|
||||
// For example, long-form audio. Ideally the audio is high-fidelity,
|
||||
// recorded at a 16khz or greater sampling rate.</td>
|
||||
// </tr>
|
||||
// </table>
|
||||
string model = 13;
|
||||
|
||||
// *Optional* Set to true to use an enhanced model for speech recognition.
|
||||
// You must also set the `model` field to a valid, enhanced model. If
|
||||
// `use_enhanced` is set to true and the `model` field is not set, then
|
||||
// `use_enhanced` is ignored. If `use_enhanced` is true and an enhanced
|
||||
// version of the specified model does not exist, then the speech is
|
||||
// recognized using the standard version of the specified model.
|
||||
//
|
||||
// Enhanced speech models require that you opt-in to data logging using
|
||||
// instructions in the [documentation](/speech-to-text/enable-data-logging).
|
||||
// If you set `use_enhanced` to true and you have not enabled audio logging,
|
||||
// then you will receive an error.
|
||||
bool use_enhanced = 14;
|
||||
}
|
||||
|
||||
// Provides "hints" to the speech recognizer to favor specific words and phrases
|
||||
|
|
@ -231,14 +312,14 @@ message SpeechContext {
|
|||
// to improve the accuracy for specific words and phrases, for example, if
|
||||
// specific commands are typically spoken by the user. This can also be used
|
||||
// to add additional words to the vocabulary of the recognizer. See
|
||||
// [usage limits](https://cloud.google.com/speech/limits#content).
|
||||
// [usage limits](/speech-to-text/quotas#content).
|
||||
repeated string phrases = 1;
|
||||
}
|
||||
|
||||
// Contains audio data in the encoding specified in the `RecognitionConfig`.
|
||||
// Either `content` or `uri` must be supplied. Supplying both or neither
|
||||
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See
|
||||
// [audio limits](https://cloud.google.com/speech/limits#content).
|
||||
// [content limits](/speech-to-text/quotas#content).
|
||||
message RecognitionAudio {
|
||||
// The audio source, which is either inline content or a Google Cloud
|
||||
// Storage uri.
|
||||
|
|
@ -249,7 +330,8 @@ message RecognitionAudio {
|
|||
bytes content = 1;
|
||||
|
||||
// URI that points to a file that contains audio data bytes as specified in
|
||||
// `RecognitionConfig`. Currently, only Google Cloud Storage URIs are
|
||||
// `RecognitionConfig`. The file must not be compressed (for example, gzip).
|
||||
// Currently, only Google Cloud Storage URIs are
|
||||
// supported, which must be specified in the following format:
|
||||
// `gs://bucket_name/object_name` (other URI formats return
|
||||
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
|
||||
|
|
@ -262,7 +344,7 @@ message RecognitionAudio {
|
|||
// contains the result as zero or more sequential `SpeechRecognitionResult`
|
||||
// messages.
|
||||
message RecognizeResponse {
|
||||
// *Output-only* Sequential list of transcription results corresponding to
|
||||
// Output only. Sequential list of transcription results corresponding to
|
||||
// sequential portions of audio.
|
||||
repeated SpeechRecognitionResult results = 2;
|
||||
}
|
||||
|
|
@ -273,7 +355,7 @@ message RecognizeResponse {
|
|||
// returned by the `GetOperation` call of the `google::longrunning::Operations`
|
||||
// service.
|
||||
message LongRunningRecognizeResponse {
|
||||
// *Output-only* Sequential list of transcription results corresponding to
|
||||
// Output only. Sequential list of transcription results corresponding to
|
||||
// sequential portions of audio.
|
||||
repeated SpeechRecognitionResult results = 2;
|
||||
}
|
||||
|
|
@ -358,35 +440,37 @@ message StreamingRecognizeResponse {
|
|||
END_OF_SINGLE_UTTERANCE = 1;
|
||||
}
|
||||
|
||||
// *Output-only* If set, returns a [google.rpc.Status][google.rpc.Status] message that
|
||||
// Output only. If set, returns a [google.rpc.Status][google.rpc.Status] message that
|
||||
// specifies the error for the operation.
|
||||
google.rpc.Status error = 1;
|
||||
|
||||
// *Output-only* This repeated list contains zero or more results that
|
||||
// Output only. This repeated list contains zero or more results that
|
||||
// correspond to consecutive portions of the audio currently being processed.
|
||||
// It contains zero or more `is_final=false` results followed by zero or one
|
||||
// `is_final=true` result (the newly settled portion).
|
||||
// It contains zero or one `is_final=true` result (the newly settled portion),
|
||||
// followed by zero or more `is_final=false` results (the interim results).
|
||||
repeated StreamingRecognitionResult results = 2;
|
||||
|
||||
// *Output-only* Indicates the type of speech event.
|
||||
// Output only. Indicates the type of speech event.
|
||||
SpeechEventType speech_event_type = 4;
|
||||
}
|
||||
|
||||
// A streaming speech recognition result corresponding to a portion of the audio
|
||||
// that is currently being processed.
|
||||
message StreamingRecognitionResult {
|
||||
// *Output-only* May contain one or more recognition hypotheses (up to the
|
||||
// Output only. May contain one or more recognition hypotheses (up to the
|
||||
// maximum specified in `max_alternatives`).
|
||||
// These alternatives are ordered in terms of accuracy, with the top (first)
|
||||
// alternative being the most probable, as ranked by the recognizer.
|
||||
repeated SpeechRecognitionAlternative alternatives = 1;
|
||||
|
||||
// *Output-only* If `false`, this `StreamingRecognitionResult` represents an
|
||||
// Output only. If `false`, this `StreamingRecognitionResult` represents an
|
||||
// interim result that may change. If `true`, this is the final time the
|
||||
// speech service will return this particular `StreamingRecognitionResult`,
|
||||
// the recognizer will not return any further hypotheses for this portion of
|
||||
// the transcript and corresponding audio.
|
||||
bool is_final = 2;
|
||||
|
||||
// *Output-only* An estimate of the likelihood that the recognizer will not
|
||||
// Output only. An estimate of the likelihood that the recognizer will not
|
||||
// change its guess about this interim result. Values range from 0.0
|
||||
// (completely unstable) to 1.0 (completely stable).
|
||||
// This field is only provided for interim results (`is_final=false`).
|
||||
|
|
@ -396,7 +480,7 @@ message StreamingRecognitionResult {
|
|||
|
||||
// A speech recognition result corresponding to a portion of the audio.
|
||||
message SpeechRecognitionResult {
|
||||
// *Output-only* May contain one or more recognition hypotheses (up to the
|
||||
// Output only. May contain one or more recognition hypotheses (up to the
|
||||
// maximum specified in `max_alternatives`).
|
||||
// These alternatives are ordered in terms of accuracy, with the top (first)
|
||||
// alternative being the most probable, as ranked by the recognizer.
|
||||
|
|
@ -405,26 +489,25 @@ message SpeechRecognitionResult {
|
|||
|
||||
// Alternative hypotheses (a.k.a. n-best list).
|
||||
message SpeechRecognitionAlternative {
|
||||
// *Output-only* Transcript text representing the words that the user spoke.
|
||||
// Output only. Transcript text representing the words that the user spoke.
|
||||
string transcript = 1;
|
||||
|
||||
// *Output-only* The confidence estimate between 0.0 and 1.0. A higher number
|
||||
// Output only. The confidence estimate between 0.0 and 1.0. A higher number
|
||||
// indicates an estimated greater likelihood that the recognized words are
|
||||
// correct. This field is typically provided only for the top hypothesis, and
|
||||
// only for `is_final=true` results. Clients should not rely on the
|
||||
// `confidence` field as it is not guaranteed to be accurate or consistent.
|
||||
// correct. This field is set only for the top alternative of a non-streaming
|
||||
// result or, of a streaming result where `is_final=true`.
|
||||
// This field is not guaranteed to be accurate and users should not rely on it
|
||||
// to be always provided.
|
||||
// The default of 0.0 is a sentinel value indicating `confidence` was not set.
|
||||
float confidence = 2;
|
||||
|
||||
// *Output-only* A list of word-specific information for each recognized word.
|
||||
// Output only. A list of word-specific information for each recognized word.
|
||||
repeated WordInfo words = 3;
|
||||
}
|
||||
|
||||
// Word-specific information for recognized words. Word information is only
|
||||
// included in the response when certain request parameters are set, such
|
||||
// as `enable_word_time_offsets`.
|
||||
// Word-specific information for recognized words.
|
||||
message WordInfo {
|
||||
// *Output-only* Time offset relative to the beginning of the audio,
|
||||
// Output only. Time offset relative to the beginning of the audio,
|
||||
// and corresponding to the start of the spoken word.
|
||||
// This field is only set if `enable_word_time_offsets=true` and only
|
||||
// in the top hypothesis.
|
||||
|
|
@ -432,7 +515,7 @@ message WordInfo {
|
|||
// vary.
|
||||
google.protobuf.Duration start_time = 1;
|
||||
|
||||
// *Output-only* Time offset relative to the beginning of the audio,
|
||||
// Output only. Time offset relative to the beginning of the audio,
|
||||
// and corresponding to the end of the spoken word.
|
||||
// This field is only set if `enable_word_time_offsets=true` and only
|
||||
// in the top hypothesis.
|
||||
|
|
@ -440,6 +523,6 @@ message WordInfo {
|
|||
// vary.
|
||||
google.protobuf.Duration end_time = 2;
|
||||
|
||||
// *Output-only* The word corresponding to this set of information.
|
||||
// Output only. The word corresponding to this set of information.
|
||||
string word = 3;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue