Synchronize new proto/yaml changes.
PiperOrigin-RevId: 248006867
This commit is contained in:
parent
aa25423894
commit
bb79813309
|
|
@ -19,9 +19,7 @@ package google.cloud.speech.v1;
|
|||
|
||||
import "google/api/annotations.proto";
|
||||
import "google/longrunning/operations.proto";
|
||||
import "google/protobuf/any.proto";
|
||||
import "google/protobuf/duration.proto";
|
||||
import "google/protobuf/empty.proto";
|
||||
import "google/protobuf/timestamp.proto";
|
||||
import "google/rpc/status.proto";
|
||||
|
||||
|
|
@ -278,6 +276,9 @@ message RecognitionConfig {
|
|||
// premium feature.
|
||||
bool enable_automatic_punctuation = 11;
|
||||
|
||||
// *Optional* Metadata regarding this request.
|
||||
RecognitionMetadata metadata = 9;
|
||||
|
||||
// *Optional* Which model to select for the given request. Select the model
|
||||
// best suited to your domain to get best results. If a model is not
|
||||
// explicitly specified, then we auto-select a model based on the parameters
|
||||
|
|
@ -330,6 +331,133 @@ message RecognitionConfig {
|
|||
bool use_enhanced = 14;
|
||||
}
|
||||
|
||||
// Description of audio data to be recognized.
|
||||
message RecognitionMetadata {
|
||||
// Use case categories that the audio recognition request can be described
|
||||
// by.
|
||||
enum InteractionType {
|
||||
// Use case is either unknown or is something other than one of the other
|
||||
// values below.
|
||||
INTERACTION_TYPE_UNSPECIFIED = 0;
|
||||
|
||||
// Multiple people in a conversation or discussion. For example in a
|
||||
// meeting with two or more people actively participating. Typically
|
||||
// all the primary people speaking would be in the same room (if not,
|
||||
// see PHONE_CALL).
|
||||
DISCUSSION = 1;
|
||||
|
||||
// One or more persons lecturing or presenting to others, mostly
|
||||
// uninterrupted.
|
||||
PRESENTATION = 2;
|
||||
|
||||
// A phone-call or video-conference in which two or more people, who are
|
||||
// not in the same room, are actively participating.
|
||||
PHONE_CALL = 3;
|
||||
|
||||
// A recorded message intended for another person to listen to.
|
||||
VOICEMAIL = 4;
|
||||
|
||||
// Professionally produced audio (e.g. TV Show, Podcast).
|
||||
PROFESSIONALLY_PRODUCED = 5;
|
||||
|
||||
// Transcribe spoken questions and queries into text.
|
||||
VOICE_SEARCH = 6;
|
||||
|
||||
// Transcribe voice commands, such as for controlling a device.
|
||||
VOICE_COMMAND = 7;
|
||||
|
||||
// Transcribe speech to text to create a written document, such as a
|
||||
// text-message, email or report.
|
||||
DICTATION = 8;
|
||||
}
|
||||
|
||||
// The use case most closely describing the audio content to be recognized.
|
||||
InteractionType interaction_type = 1;
|
||||
|
||||
// The industry vertical to which this speech recognition request most
|
||||
// closely applies. This is most indicative of the topics contained
|
||||
// in the audio. Use the 6-digit NAICS code to identify the industry
|
||||
// vertical - see https://www.naics.com/search/.
|
||||
uint32 industry_naics_code_of_audio = 3;
|
||||
|
||||
// Enumerates the types of capture settings describing an audio file.
|
||||
enum MicrophoneDistance {
|
||||
// Audio type is not known.
|
||||
MICROPHONE_DISTANCE_UNSPECIFIED = 0;
|
||||
|
||||
// The audio was captured from a closely placed microphone. Eg. phone,
|
||||
// dictaphone, or handheld microphone. Generally, the speaker is within
|
||||
// 1 meter of the microphone.
|
||||
NEARFIELD = 1;
|
||||
|
||||
// The speaker is within 3 meters of the microphone.
|
||||
MIDFIELD = 2;
|
||||
|
||||
// The speaker is more than 3 meters away from the microphone.
|
||||
FARFIELD = 3;
|
||||
}
|
||||
|
||||
// The audio type that most closely describes the audio being recognized.
|
||||
MicrophoneDistance microphone_distance = 4;
|
||||
|
||||
// The original media the speech was recorded on.
|
||||
enum OriginalMediaType {
|
||||
// Unknown original media type.
|
||||
ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0;
|
||||
|
||||
// The speech data is an audio recording.
|
||||
AUDIO = 1;
|
||||
|
||||
// The speech data was originally recorded on a video.
|
||||
VIDEO = 2;
|
||||
}
|
||||
|
||||
// The original media the speech was recorded on.
|
||||
OriginalMediaType original_media_type = 5;
|
||||
|
||||
// The type of device the speech was recorded with.
|
||||
enum RecordingDeviceType {
|
||||
// The recording device is unknown.
|
||||
RECORDING_DEVICE_TYPE_UNSPECIFIED = 0;
|
||||
|
||||
// Speech was recorded on a smartphone.
|
||||
SMARTPHONE = 1;
|
||||
|
||||
// Speech was recorded using a personal computer or tablet.
|
||||
PC = 2;
|
||||
|
||||
// Speech was recorded over a phone line.
|
||||
PHONE_LINE = 3;
|
||||
|
||||
// Speech was recorded in a vehicle.
|
||||
VEHICLE = 4;
|
||||
|
||||
// Speech was recorded outdoors.
|
||||
OTHER_OUTDOOR_DEVICE = 5;
|
||||
|
||||
// Speech was recorded indoors.
|
||||
OTHER_INDOOR_DEVICE = 6;
|
||||
}
|
||||
|
||||
// The type of device the speech was recorded with.
|
||||
RecordingDeviceType recording_device_type = 6;
|
||||
|
||||
// The device used to make the recording. Examples: 'Nexus 5X' or
|
||||
// 'Polycom SoundStation IP 6000' or 'POTS' or 'VoIP' or
|
||||
// 'Cardioid Microphone'.
|
||||
string recording_device_name = 7;
|
||||
|
||||
// Mime type of the original audio file. For example `audio/m4a`,
|
||||
// `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
|
||||
// A list of possible audio mime types is maintained at
|
||||
// http://www.iana.org/assignments/media-types/media-types.xhtml#audio
|
||||
string original_mime_type = 8;
|
||||
|
||||
// Description of the content. Eg. "Recordings of federal supreme court
|
||||
// hearings from 2012".
|
||||
string audio_topic = 10;
|
||||
}
|
||||
|
||||
// Provides "hints" to the speech recognizer to favor specific words and phrases
|
||||
// in the results.
|
||||
message SpeechContext {
|
||||
|
|
@ -504,10 +632,20 @@ message StreamingRecognitionResult {
|
|||
// The default of 0.0 is a sentinel value indicating `stability` was not set.
|
||||
float stability = 3;
|
||||
|
||||
// Output only. Time offset of the end of this result relative to the
|
||||
// beginning of the audio.
|
||||
google.protobuf.Duration result_end_time = 4;
|
||||
|
||||
// For multi-channel audio, this is the channel number corresponding to the
|
||||
// recognized result for the audio from that channel.
|
||||
// For audio_channel_count = N, its output values can range from '1' to 'N'.
|
||||
int32 channel_tag = 5;
|
||||
|
||||
// Output only. The
|
||||
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
|
||||
// language in this result. This language code was detected to have the most
|
||||
// likelihood of being spoken in the audio.
|
||||
string language_code = 6;
|
||||
}
|
||||
|
||||
// A speech recognition result corresponding to a portion of the audio.
|
||||
|
|
|
|||
Loading…
Reference in New Issue