Add speech adaptation, phrase sets and custom classes.
PiperOrigin-RevId: 306280989
This commit is contained in:
parent
3824f547aa
commit
4d61e1cb40
|
|
@ -13,11 +13,13 @@ proto_library(
|
|||
name = "speech_proto",
|
||||
srcs = [
|
||||
"cloud_speech.proto",
|
||||
"resource.proto",
|
||||
],
|
||||
deps = [
|
||||
"//google/api:annotations_proto",
|
||||
"//google/api:client_proto",
|
||||
"//google/api:field_behavior_proto",
|
||||
"//google/api:resource_proto",
|
||||
"//google/longrunning:operations_proto",
|
||||
"//google/rpc:status_proto",
|
||||
"@com_google_protobuf//:any_proto",
|
||||
|
|
@ -166,6 +168,7 @@ moved_proto_library(
|
|||
"//google/api:annotations_proto",
|
||||
"//google/api:client_proto",
|
||||
"//google/api:field_behavior_proto",
|
||||
"//google/api:resource_proto",
|
||||
"//google/longrunning:operations_proto",
|
||||
"//google/rpc:status_proto",
|
||||
"@com_google_protobuf//:any_proto",
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
// Copyright 2019 Google LLC.
|
||||
// Copyright 2020 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
|
|
@ -11,7 +11,6 @@
|
|||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
|
|
@ -20,6 +19,8 @@ package google.cloud.speech.v1p1beta1;
|
|||
import "google/api/annotations.proto";
|
||||
import "google/api/client.proto";
|
||||
import "google/api/field_behavior.proto";
|
||||
import "google/api/resource.proto";
|
||||
import "google/cloud/speech/v1p1beta1/resource.proto";
|
||||
import "google/longrunning/operations.proto";
|
||||
import "google/protobuf/any.proto";
|
||||
import "google/protobuf/duration.proto";
|
||||
|
|
@ -36,7 +37,8 @@ option objc_class_prefix = "GCS";
|
|||
// Service that implements Google Cloud Speech API.
|
||||
service Speech {
|
||||
option (google.api.default_host) = "speech.googleapis.com";
|
||||
option (google.api.oauth_scopes) = "https://www.googleapis.com/auth/cloud-platform";
|
||||
option (google.api.oauth_scopes) =
|
||||
"https://www.googleapis.com/auth/cloud-platform";
|
||||
|
||||
// Performs synchronous speech recognition: receive results after all audio
|
||||
// has been sent and processed.
|
||||
|
|
@ -54,7 +56,8 @@ service Speech {
|
|||
// a `LongRunningRecognizeResponse` message.
|
||||
// For more information on asynchronous speech recognition, see the
|
||||
// [how-to](https://cloud.google.com/speech-to-text/docs/async-recognize).
|
||||
rpc LongRunningRecognize(LongRunningRecognizeRequest) returns (google.longrunning.Operation) {
|
||||
rpc LongRunningRecognize(LongRunningRecognizeRequest)
|
||||
returns (google.longrunning.Operation) {
|
||||
option (google.api.http) = {
|
||||
post: "/v1p1beta1/speech:longrunningrecognize"
|
||||
body: "*"
|
||||
|
|
@ -68,8 +71,8 @@ service Speech {
|
|||
|
||||
// Performs bidirectional streaming speech recognition: receive results while
|
||||
// sending audio. This method is only available via the gRPC API (not REST).
|
||||
rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse) {
|
||||
}
|
||||
rpc StreamingRecognize(stream StreamingRecognizeRequest)
|
||||
returns (stream StreamingRecognizeResponse) {}
|
||||
}
|
||||
|
||||
// The top-level message sent by the client for the `Recognize` method.
|
||||
|
|
@ -169,7 +172,8 @@ message RecognitionConfig {
|
|||
// an `AudioEncoding` when you send `FLAC` or `WAV` audio, the
|
||||
// encoding configuration must match the encoding described in the audio
|
||||
// header; otherwise the request returns an
|
||||
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error code.
|
||||
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error
|
||||
// code.
|
||||
enum AudioEncoding {
|
||||
// Not specified.
|
||||
ENCODING_UNSPECIFIED = 0;
|
||||
|
|
@ -215,14 +219,15 @@ message RecognitionConfig {
|
|||
SPEEX_WITH_HEADER_BYTE = 7;
|
||||
|
||||
// MP3 audio. Support all standard MP3 bitrates (which range from 32-320
|
||||
// kbps). When using this encoding, `sample_rate_hertz` can be optionally
|
||||
// unset if not known.
|
||||
// kbps). When using this encoding, `sample_rate_hertz` has to match the
|
||||
// sample rate of the file being used.
|
||||
MP3 = 8;
|
||||
}
|
||||
|
||||
// Encoding of audio data sent in all `RecognitionAudio` messages.
|
||||
// This field is optional for `FLAC` and `WAV` audio files and required
|
||||
// for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding].
|
||||
// for all other audio formats. For details, see
|
||||
// [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding].
|
||||
AudioEncoding encoding = 1;
|
||||
|
||||
// Sample rate in Hertz of the audio data sent in all
|
||||
|
|
@ -231,7 +236,8 @@ message RecognitionConfig {
|
|||
// source to 16000 Hz. If that's not possible, use the native sample rate of
|
||||
// the audio source (instead of re-sampling).
|
||||
// This field is optional for FLAC and WAV audio files, but is
|
||||
// required for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding].
|
||||
// required for all other audio formats. For details, see
|
||||
// [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding].
|
||||
int32 sample_rate_hertz = 2;
|
||||
|
||||
// The number of channels in the input audio data.
|
||||
|
|
@ -289,6 +295,13 @@ message RecognitionConfig {
|
|||
// won't be filtered out.
|
||||
bool profanity_filter = 5;
|
||||
|
||||
// Speech adaptation configuration improves the accuracy of speech
|
||||
// recognition. When speech adaptation is set it supersedes the
|
||||
// `speech_contexts` field. For more information, see the [speech
|
||||
// adaptation](https://cloud.google.com/speech-to-text/docs/context-strength)
|
||||
// documentation.
|
||||
SpeechAdaptation adaptation = 20;
|
||||
|
||||
// Array of [SpeechContext][google.cloud.speech.v1p1beta1.SpeechContext].
|
||||
// A means to provide context to assist the speech recognition. For more
|
||||
// information, see
|
||||
|
|
@ -311,9 +324,6 @@ message RecognitionConfig {
|
|||
// This feature is only available in select languages. Setting this for
|
||||
// requests in other languages has no effect at all.
|
||||
// The default 'false' value does not add punctuation to result hypotheses.
|
||||
// Note: This is currently offered as an experimental service, complimentary
|
||||
// to all users. In the future this may be exclusively available as a
|
||||
// premium feature.
|
||||
bool enable_automatic_punctuation = 11;
|
||||
|
||||
// If 'true', enables speaker detection for each recognized word in
|
||||
|
|
@ -401,6 +411,10 @@ message SpeakerDiarizationConfig {
|
|||
// flexibility by allowing the system to automatically determine the correct
|
||||
// number of speakers. If not set, the default value is 6.
|
||||
int32 max_speaker_count = 3;
|
||||
|
||||
// Output only. Unused.
|
||||
int32 speaker_tag = 5
|
||||
[deprecated = true, (google.api.field_behavior) = OUTPUT_ONLY];
|
||||
}
|
||||
|
||||
// Description of audio data to be recognized.
|
||||
|
|
@ -564,8 +578,8 @@ message SpeechContext {
|
|||
|
||||
// Contains audio data in the encoding specified in the `RecognitionConfig`.
|
||||
// Either `content` or `uri` must be supplied. Supplying both or neither
|
||||
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See
|
||||
// [content limits](https://cloud.google.com/speech-to-text/quotas#content).
|
||||
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
|
||||
// See [content limits](https://cloud.google.com/speech-to-text/quotas#content).
|
||||
message RecognitionAudio {
|
||||
// The audio source, which is either inline content or a Google Cloud
|
||||
// Storage uri.
|
||||
|
|
@ -580,8 +594,9 @@ message RecognitionAudio {
|
|||
// Currently, only Google Cloud Storage URIs are
|
||||
// supported, which must be specified in the following format:
|
||||
// `gs://bucket_name/object_name` (other URI formats return
|
||||
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
|
||||
// [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
|
||||
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]).
|
||||
// For more information, see [Request
|
||||
// URIs](https://cloud.google.com/storage/docs/reference-uris).
|
||||
string uri = 2;
|
||||
}
|
||||
}
|
||||
|
|
@ -619,6 +634,10 @@ message LongRunningRecognizeMetadata {
|
|||
|
||||
// Time of the most recent processing update.
|
||||
google.protobuf.Timestamp last_update_time = 3;
|
||||
|
||||
// The URI of the audio file being transcribed. Empty if the audio was sent
|
||||
// as byte content.
|
||||
string uri = 4 [(google.api.field_behavior) = OUTPUT_ONLY];
|
||||
}
|
||||
|
||||
// `StreamingRecognizeResponse` is the only message returned to the client by
|
||||
|
|
@ -732,10 +751,10 @@ message StreamingRecognitionResult {
|
|||
// For audio_channel_count = N, its output values can range from '1' to 'N'.
|
||||
int32 channel_tag = 5;
|
||||
|
||||
// The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
|
||||
// of the language in this result. This language code was detected to have
|
||||
// the most likelihood of being spoken in the audio.
|
||||
string language_code = 6;
|
||||
// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
|
||||
// language tag of the language in this result. This language code was
|
||||
// detected to have the most likelihood of being spoken in the audio.
|
||||
string language_code = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
|
||||
}
|
||||
|
||||
// A speech recognition result corresponding to a portion of the audio.
|
||||
|
|
@ -751,10 +770,10 @@ message SpeechRecognitionResult {
|
|||
// For audio_channel_count = N, its output values can range from '1' to 'N'.
|
||||
int32 channel_tag = 2;
|
||||
|
||||
// The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
|
||||
// of the language in this result. This language code was detected to have
|
||||
// the most likelihood of being spoken in the audio.
|
||||
string language_code = 5;
|
||||
// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
|
||||
// language tag of the language in this result. This language code was
|
||||
// detected to have the most likelihood of being spoken in the audio.
|
||||
string language_code = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
|
||||
}
|
||||
|
||||
// Alternative hypotheses (a.k.a. n-best list).
|
||||
|
|
@ -807,10 +826,10 @@ message WordInfo {
|
|||
// The default of 0.0 is a sentinel value indicating `confidence` was not set.
|
||||
float confidence = 4;
|
||||
|
||||
// A distinct integer value is assigned for every speaker within
|
||||
// Output only. A distinct integer value is assigned for every speaker within
|
||||
// the audio. This field specifies which one of those speakers was detected to
|
||||
// have spoken this word. Value ranges from '1' to diarization_speaker_count.
|
||||
// speaker_tag is set if enable_speaker_diarization = 'true' and only in the
|
||||
// top alternative.
|
||||
int32 speaker_tag = 5;
|
||||
int32 speaker_tag = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,129 @@
|
|||
// Copyright 2020 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
package google.cloud.speech.v1p1beta1;
|
||||
|
||||
import "google/api/annotations.proto";
|
||||
import "google/api/resource.proto";
|
||||
|
||||
option cc_enable_arenas = true;
|
||||
option go_package = "google.golang.org/genproto/googleapis/cloud/speech/v1p1beta1;speech";
|
||||
option java_multiple_files = true;
|
||||
option java_outer_classname = "SpeechResourceProto";
|
||||
option java_package = "com.google.cloud.speech.v1p1beta1";
|
||||
option objc_class_prefix = "GCS";
|
||||
|
||||
// A set of words or phrases that represents a common concept likely to appear
|
||||
// in your audio, for example a list of passenger ship names. CustomClass items
|
||||
// can be substituted into placeholders that you set in PhraseSet phrases.
|
||||
message CustomClass {
|
||||
option (google.api.resource) = {
|
||||
type: "speech.googleapis.com/CustomClass"
|
||||
pattern: "projects/{project}/locations/{location}/customClasses/{custom_class}"
|
||||
};
|
||||
|
||||
// An item of the class: one word or phrase belonging to the concept.
|
||||
message ClassItem {
|
||||
// The class item's value.
|
||||
string value = 1;
|
||||
}
|
||||
|
||||
// The resource name of the custom class, following the pattern
// `projects/{project}/locations/{location}/customClasses/{custom_class}`.
// Leave blank when the class is defined inline in a request.
|
||||
string name = 1;
|
||||
|
||||
// If this custom class is a resource, the custom_class_id is the resource id
|
||||
// of the CustomClass. For a class defined inline in a request, set this to a
// string unique to all class resources and inline classes; phrases then refer
// to the class by wrapping this id in `${...}`, e.g. `${my-months}`.
|
||||
string custom_class_id = 2;
|
||||
|
||||
// A collection of class items.
|
||||
repeated ClassItem items = 3;
|
||||
}
|
||||
|
||||
// Provides "hints" to the speech recognizer to favor specific words and phrases
|
||||
// in the results.
|
||||
message PhraseSet {
|
||||
option (google.api.resource) = {
|
||||
type: "speech.googleapis.com/PhraseSet"
|
||||
pattern: "projects/{project}/locations/{location}/phraseSets/{phrase_set}"
|
||||
};
|
||||
|
||||
// A phrase containing words and phrase "hints" so that
|
||||
// the speech recognition is more likely to recognize them. This can be used
|
||||
// to improve the accuracy for specific words and phrases, for example, if
|
||||
// specific commands are typically spoken by the user. This can also be used
|
||||
// to add additional words to the vocabulary of the recognizer. See
|
||||
// [usage limits](https://cloud.google.com/speech-to-text/quotas#content).
|
||||
//
|
||||
// List items can also include pre-built or custom classes containing groups
|
||||
// of words that represent common concepts that occur in natural language. For
|
||||
// example, rather than providing a phrase hint for every month of the
|
||||
// year (e.g. "i was born in january", "i was born in february", ...), using the
|
||||
// pre-built `$MONTH` class improves the likelihood of correctly transcribing
|
||||
// audio that includes months (e.g. "i was born in $month").
|
||||
// To refer to pre-built classes, use the class' symbol prepended with `$`
|
||||
// e.g. `$MONTH`. To refer to custom classes that were defined inline in the
|
||||
// request, set the class's `custom_class_id` to a string unique to all class
|
||||
// resources and inline classes. Then use the class' id wrapped in `${...}`
|
||||
// e.g. `${my-months}`. To refer to custom class resources, use the class'
|
||||
// id wrapped in `${}` (e.g. `${my-months}`).
|
||||
message Phrase {
|
||||
// The phrase itself.
|
||||
string value = 1;
|
||||
|
||||
// Hint Boost. Overrides the boost set at the phrase set level.
|
||||
// Positive value will increase the probability that a specific phrase will
|
||||
// be recognized over other similar sounding phrases. The higher the boost,
|
||||
// the higher the chance of false positive recognition as well. Negative
|
||||
// boost values would correspond to anti-biasing. Anti-biasing is not
|
||||
// enabled, so negative boost will simply be ignored. Though `boost` can
|
||||
// accept a wide range of positive values, most use cases are best served
|
||||
// with values between 0 (exclusive) and 20. We recommend using a binary search approach
|
||||
// to finding the optimal value for your use case. Speech recognition
|
||||
// will skip PhraseSets with a boost value of 0.
|
||||
float boost = 2;
|
||||
}
|
||||
|
||||
// The resource name of the phrase set, following the pattern
// `projects/{project}/locations/{location}/phraseSets/{phrase_set}`.
// Leave blank when the hints are specified inline in a request.
|
||||
string name = 1;
|
||||
|
||||
// A list of words and phrases.
|
||||
repeated Phrase phrases = 2;
|
||||
|
||||
// Hint Boost. Positive value will increase the probability that a specific
|
||||
// phrase will be recognized over other similar sounding phrases. The higher
|
||||
// the boost, the higher the chance of false positive recognition as well.
|
||||
// Negative boost values would correspond to anti-biasing. Anti-biasing is not
|
||||
// enabled, so negative boost will simply be ignored. Though `boost` can
|
||||
// accept a wide range of positive values, most use cases are best served with
|
||||
// values between 0 (exclusive) and 20. We recommend using a binary search
|
||||
// approach to finding the optimal value for your use case. Speech recognition
|
||||
// will skip PhraseSets with a boost value of 0.
|
||||
float boost = 4;
|
||||
}
|
||||
|
||||
// Speech adaptation configuration: the phrase sets and custom classes supplied
// to improve the accuracy of speech recognition.
|
||||
message SpeechAdaptation {
|
||||
// A collection of phrase sets. To specify the hints inline, leave the
|
||||
// phrase set's `name` blank and fill in the rest of its fields. Any
|
||||
// phrase set can use any custom class.
|
||||
repeated PhraseSet phrase_sets = 1;
|
||||
|
||||
// A collection of custom classes. To specify the classes inline, leave the
|
||||
// class' `name` blank and fill in the rest of its fields, giving it a unique
|
||||
// `custom_class_id`. Refer to the inline-defined class in phrase hints by its
|
||||
// `custom_class_id`.
|
||||
repeated CustomClass custom_classes = 2;
|
||||
}
|
||||
Loading…
Reference in New Issue