diff --git a/google/cloud/speech/v1p1beta1/BUILD.bazel b/google/cloud/speech/v1p1beta1/BUILD.bazel index 3afe397c..14fd48bb 100644 --- a/google/cloud/speech/v1p1beta1/BUILD.bazel +++ b/google/cloud/speech/v1p1beta1/BUILD.bazel @@ -13,11 +13,13 @@ proto_library( name = "speech_proto", srcs = [ "cloud_speech.proto", + "resource.proto", ], deps = [ "//google/api:annotations_proto", "//google/api:client_proto", "//google/api:field_behavior_proto", + "//google/api:resource_proto", "//google/longrunning:operations_proto", "//google/rpc:status_proto", "@com_google_protobuf//:any_proto", @@ -166,6 +168,7 @@ moved_proto_library( "//google/api:annotations_proto", "//google/api:client_proto", "//google/api:field_behavior_proto", + "//google/api:resource_proto", "//google/longrunning:operations_proto", "//google/rpc:status_proto", "@com_google_protobuf//:any_proto", diff --git a/google/cloud/speech/v1p1beta1/cloud_speech.proto b/google/cloud/speech/v1p1beta1/cloud_speech.proto index 7718b0eb..7bb12866 100644 --- a/google/cloud/speech/v1p1beta1/cloud_speech.proto +++ b/google/cloud/speech/v1p1beta1/cloud_speech.proto @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC. +// Copyright 2020 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,7 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -// syntax = "proto3"; @@ -20,6 +19,8 @@ package google.cloud.speech.v1p1beta1; import "google/api/annotations.proto"; import "google/api/client.proto"; import "google/api/field_behavior.proto"; +import "google/api/resource.proto"; +import "google/cloud/speech/v1p1beta1/resource.proto"; import "google/longrunning/operations.proto"; import "google/protobuf/any.proto"; import "google/protobuf/duration.proto"; @@ -36,7 +37,8 @@ option objc_class_prefix = "GCS"; // Service that implements Google Cloud Speech API. service Speech { option (google.api.default_host) = "speech.googleapis.com"; - option (google.api.oauth_scopes) = "https://www.googleapis.com/auth/cloud-platform"; + option (google.api.oauth_scopes) = + "https://www.googleapis.com/auth/cloud-platform"; // Performs synchronous speech recognition: receive results after all audio // has been sent and processed. @@ -54,7 +56,8 @@ service Speech { // a `LongRunningRecognizeResponse` message. // For more information on asynchronous speech recognition, see the // [how-to](https://cloud.google.com/speech-to-text/docs/async-recognize). - rpc LongRunningRecognize(LongRunningRecognizeRequest) returns (google.longrunning.Operation) { + rpc LongRunningRecognize(LongRunningRecognizeRequest) + returns (google.longrunning.Operation) { option (google.api.http) = { post: "/v1p1beta1/speech:longrunningrecognize" body: "*" @@ -68,8 +71,8 @@ service Speech { // Performs bidirectional streaming speech recognition: receive results while // sending audio. This method is only available via the gRPC API (not REST). - rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse) { - } + rpc StreamingRecognize(stream StreamingRecognizeRequest) + returns (stream StreamingRecognizeResponse) {} } // The top-level message sent by the client for the `Recognize` method. @@ -169,7 +172,8 @@ message RecognitionConfig { // an `AudioEncoding` when you send send `FLAC` or `WAV` audio, the // encoding configuration must match the encoding described in the audio // header; otherwise the request returns an - // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error code. + // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error + // code. enum AudioEncoding { // Not specified. ENCODING_UNSPECIFIED = 0; @@ -215,14 +219,15 @@ message RecognitionConfig { SPEEX_WITH_HEADER_BYTE = 7; // MP3 audio. Support all standard MP3 bitrates (which range from 32-320 - // kbps). When using this encoding, `sample_rate_hertz` can be optionally - // unset if not known. + // kbps). When using this encoding, `sample_rate_hertz` has to match the + // sample rate of the file being used. MP3 = 8; } // Encoding of audio data sent in all `RecognitionAudio` messages. // This field is optional for `FLAC` and `WAV` audio files and required - // for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding]. + // for all other audio formats. For details, see + // [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding]. AudioEncoding encoding = 1; // Sample rate in Hertz of the audio data sent in all @@ -231,7 +236,8 @@ message RecognitionConfig { // source to 16000 Hz. If that's not possible, use the native sample rate of // the audio source (instead of re-sampling). // This field is optional for FLAC and WAV audio files, but is - // required for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding]. + // required for all other audio formats. For details, see + // [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding]. int32 sample_rate_hertz = 2; // The number of channels in the input audio data. @@ -289,6 +295,13 @@ message RecognitionConfig { // won't be filtered out. bool profanity_filter = 5; + // Speech adaptation configuration improves the accuracy of speech + // recognition. When speech adaptation is set it supersedes the + // `speech_contexts` field. For more information, see the [speech + // adaptation](https://cloud.google.com/speech-to-text/docs/context-strength) + // documentation. + SpeechAdaptation adaptation = 20; + // Array of [SpeechContext][google.cloud.speech.v1p1beta1.SpeechContext]. // A means to provide context to assist the speech recognition. For more // information, see @@ -311,9 +324,6 @@ message RecognitionConfig { // This feature is only available in select languages. Setting this for // requests in other languages has no effect at all. // The default 'false' value does not add punctuation to result hypotheses. - // Note: This is currently offered as an experimental service, complimentary - // to all users. In the future this may be exclusively available as a - // premium feature. bool enable_automatic_punctuation = 11; // If 'true', enables speaker detection for each recognized word in @@ -401,6 +411,10 @@ message SpeakerDiarizationConfig { // flexibility by allowing the system to automatically determine the correct // number of speakers. If not set, the default value is 6. int32 max_speaker_count = 3; + + // Output only. Unused. + int32 speaker_tag = 5 + [deprecated = true, (google.api.field_behavior) = OUTPUT_ONLY]; } // Description of audio data to be recognized. @@ -564,8 +578,8 @@ message SpeechContext { // Contains audio data in the encoding specified in the `RecognitionConfig`. // Either `content` or `uri` must be supplied. Supplying both or neither -// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See -// [content limits](https://cloud.google.com/speech-to-text/quotas#content). +// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. +// See [content limits](https://cloud.google.com/speech-to-text/quotas#content). message RecognitionAudio { // The audio source, which is either inline content or a Google Cloud // Storage uri. @@ -580,8 +594,9 @@ message RecognitionAudio { // Currently, only Google Cloud Storage URIs are // supported, which must be specified in the following format: // `gs://bucket_name/object_name` (other URI formats return - // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see - // [Request URIs](https://cloud.google.com/storage/docs/reference-uris). + // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). + // For more information, see [Request + // URIs](https://cloud.google.com/storage/docs/reference-uris). string uri = 2; } } @@ -619,6 +634,10 @@ message LongRunningRecognizeMetadata { // Time of the most recent processing update. google.protobuf.Timestamp last_update_time = 3; + + // The URI of the audio file being transcribed. Empty if the audio was sent + // as byte content. + string uri = 4 [(google.api.field_behavior) = OUTPUT_ONLY]; } // `StreamingRecognizeResponse` is the only message returned to the client by @@ -732,10 +751,10 @@ message StreamingRecognitionResult { // For audio_channel_count = N, its output values can range from '1' to 'N'. int32 channel_tag = 5; - // The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag - // of the language in this result. This language code was detected to have - // the most likelihood of being spoken in the audio. - string language_code = 6; + // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) + // language tag of the language in this result. This language code was + // detected to have the most likelihood of being spoken in the audio. + string language_code = 6 [(google.api.field_behavior) = OUTPUT_ONLY]; } // A speech recognition result corresponding to a portion of the audio. @@ -751,10 +770,10 @@ message SpeechRecognitionResult { // For audio_channel_count = N, its output values can range from '1' to 'N'. int32 channel_tag = 2; - // The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag - // of the language in this result. This language code was detected to have - // the most likelihood of being spoken in the audio. - string language_code = 5; + // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) + // language tag of the language in this result. This language code was + // detected to have the most likelihood of being spoken in the audio. + string language_code = 5 [(google.api.field_behavior) = OUTPUT_ONLY]; } // Alternative hypotheses (a.k.a. n-best list). @@ -807,10 +826,10 @@ message WordInfo { // The default of 0.0 is a sentinel value indicating `confidence` was not set. float confidence = 4; - // A distinct integer value is assigned for every speaker within + // Output only. A distinct integer value is assigned for every speaker within // the audio. This field specifies which one of those speakers was detected to // have spoken this word. Value ranges from '1' to diarization_speaker_count. // speaker_tag is set if enable_speaker_diarization = 'true' and only in the // top alternative. - int32 speaker_tag = 5; + int32 speaker_tag = 5 [(google.api.field_behavior) = OUTPUT_ONLY]; } diff --git a/google/cloud/speech/v1p1beta1/resource.proto b/google/cloud/speech/v1p1beta1/resource.proto new file mode 100644 index 00000000..5bb379ff --- /dev/null +++ b/google/cloud/speech/v1p1beta1/resource.proto @@ -0,0 +1,129 @@ +// Copyright 2020 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package google.cloud.speech.v1p1beta1; + +import "google/api/annotations.proto"; +import "google/api/resource.proto"; + +option cc_enable_arenas = true; +option go_package = "google.golang.org/genproto/googleapis/cloud/speech/v1p1beta1;speech"; +option java_multiple_files = true; +option java_outer_classname = "SpeechResourceProto"; +option java_package = "com.google.cloud.speech.v1p1beta1"; +option objc_class_prefix = "GCS"; + +// A set of words or phrases that represents a common concept likely to appear +// in your audio, for example a list of passenger ship names. CustomClass items +// can be substituted into placeholders that you set in PhraseSet phrases. +message CustomClass { + option (google.api.resource) = { + type: "speech.googleapis.com/CustomClass" + pattern: "projects/{project}/locations/{location}/customClasses/{custom_class}" + }; + + // An item of the class. + message ClassItem { + // The class item's value. + string value = 1; + } + + // The resource name of the custom class. + string name = 1; + + // If this custom class is a resource, the custom_class_id is the resource id + // of the CustomClass. + string custom_class_id = 2; + + // A collection of class items. + repeated ClassItem items = 3; +} + +// Provides "hints" to the speech recognizer to favor specific words and phrases +// in the results. +message PhraseSet { + option (google.api.resource) = { + type: "speech.googleapis.com/PhraseSet" + pattern: "projects/{project}/locations/{location}/phraseSets/{phrase_set}" + }; + + // A phrases containing words and phrase "hints" so that + // the speech recognition is more likely to recognize them. This can be used + // to improve the accuracy for specific words and phrases, for example, if + // specific commands are typically spoken by the user. This can also be used + // to add additional words to the vocabulary of the recognizer. See + // [usage limits](https://cloud.google.com/speech-to-text/quotas#content). + // + // List items can also include pre-built or custom classes containing groups + // of words that represent common concepts that occur in natural language. For + // example, rather than providing a phrase hint for every month of the + // year (e.g. "i was born in january", "i was born in febuary", ...), use the + // pre-built `$MONTH` class improves the likelihood of correctly transcribing + // audio that includes months (e.g. "i was born in $month"). + // To refer to pre-built classes, use the class' symbol prepended with `$` + // e.g. `$MONTH`. To refer to custom classes that were defined inline in the + // request, set the class's `custom_class_id` to a string unique to all class + // resources and inline classes. Then use the class' id wrapped in $`{...}` + // e.g. "${my-months}". To refer to custom classes resources, use the class' + // id wrapped in `${}` (e.g. `${my-months}`). + message Phrase { + // The phrase itself. + string value = 1; + + // Hint Boost. Overrides the boost set at the phrase set level. + // Positive value will increase the probability that a specific phrase will + // be recognized over other similar sounding phrases. The higher the boost, + // the higher the chance of false positive recognition as well. Negative + // boost values would correspond to anti-biasing. Anti-biasing is not + // enabled, so negative boost will simply be ignored. Though `boost` can + // accept a wide range of positive values, most use cases are best served + // with values between 0 and 20. We recommend using a binary search approach + // to finding the optimal value for your use case. Speech recognition + // will skip PhraseSets with a boost value of 0. + float boost = 2; + } + + // The resource name of the phrase set. + string name = 1; + + // A list of word and phrases. + repeated Phrase phrases = 2; + + // Hint Boost. Positive value will increase the probability that a specific + // phrase will be recognized over other similar sounding phrases. The higher + // the boost, the higher the chance of false positive recognition as well. + // Negative boost values would correspond to anti-biasing. Anti-biasing is not + // enabled, so negative boost will simply be ignored. Though `boost` can + // accept a wide range of positive values, most use cases are best served with + // values between 0 (exclusive) and 20. We recommend using a binary search + // approach to finding the optimal value for your use case. Speech recognition + // will skip PhraseSets with a boost value of 0. + float boost = 4; +} + +// Speech adaptation configuration. +message SpeechAdaptation { + // A collection of phrase sets. To specify the hints inline, leave the + // phrase set's `name` blank and fill in the rest of its fields. Any + // phrase set can use any custom class. + repeated PhraseSet phrase_sets = 1; + + // A collection of custom classes. To specify the classes inline, leave the + // class' `name` blank and fill in the rest of its fields, giving it a unique + // `custom_class_id`. Refer to the inline defined class in phrase hints by its + // `custom_class_id`. + repeated CustomClass custom_classes = 2; +}