diff --git a/google/cloud/language/v1beta1/language_service.proto b/google/cloud/language/v1beta1/language_service.proto new file mode 100644 index 00000000..71754452 --- /dev/null +++ b/google/cloud/language/v1beta1/language_service.proto @@ -0,0 +1,610 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package google.cloud.language.v1beta1; + +import "google/api/annotations.proto"; + +option java_multiple_files = true; +option java_outer_classname = "LanguageServiceProto"; +option java_package = "com.google.cloud.language.v1beta1"; + + +// Provides text analysis operations such as sentiment analysis and entity +// recognition. +service LanguageService { + // Analyzes the sentiment of the provided text. + rpc AnalyzeSentiment(AnalyzeSentimentRequest) returns (AnalyzeSentimentResponse) { + option (google.api.http) = { post: "/v1beta1/documents:analyzeSentiment" body: "*" }; + } + + // Finds named entities (currently finds proper names) in the text, + // entity types, salience, mentions for each entity, and other properties. + rpc AnalyzeEntities(AnalyzeEntitiesRequest) returns (AnalyzeEntitiesResponse) { + option (google.api.http) = { post: "/v1beta1/documents:analyzeEntities" body: "*" }; + } + + // Advanced API that analyzes the document and provides a full set of text + // annotations, including semantic, syntactic, and sentiment information. This + // API is intended for users who are familiar with machine learning and need + // in-depth text features to build upon. + rpc AnnotateText(AnnotateTextRequest) returns (AnnotateTextResponse) { + option (google.api.http) = { post: "/v1beta1/documents:annotateText" body: "*" }; + } +} + +// ################################################################ # +// +// Represents the input to API methods. +message Document { + // The document types enum. + enum Type { + // The content type is not specified. + TYPE_UNSPECIFIED = 0; + + // Plain text + PLAIN_TEXT = 1; + + // HTML + HTML = 2; + } + + // Required. If the type is not set or is `TYPE_UNSPECIFIED`, + // returns an `INVALID_ARGUMENT` error. + Type type = 1; + + // The source of the document: a string containing the content or a + // Google Cloud Storage URI. + oneof source { + // The content of the input in string format. + string content = 2; + + // The Google Cloud Storage URI where the file content is located. + string gcs_content_uri = 3; + } + + // The language of the document (if not specified, the language is + // automatically detected). Both ISO and BCP-47 language codes are + // accepted.
+ // **Current Language Restrictions:** + // + // * Only English, Spanish, and Japanese textual content + // are supported, with the following additional restriction: + // * `analyzeSentiment` only supports English text. + // If the language (either specified by the caller or automatically detected) + // is not supported by the called API method, an `INVALID_ARGUMENT` error + // is returned. + string language = 4; +} + +// Represents a sentence in the input document. +message Sentence { + // The sentence text. + TextSpan text = 1; +} + +// Represents a phrase in the text that is a known entity, such as +// a person, an organization, or location. The API associates information, such +// as salience and mentions, with entities. +message Entity { + // The type of the entity. + enum Type { + // Unknown + UNKNOWN = 0; + + // Person + PERSON = 1; + + // Location + LOCATION = 2; + + // Organization + ORGANIZATION = 3; + + // Event + EVENT = 4; + + // Work of art + WORK_OF_ART = 5; + + // Consumer goods + CONSUMER_GOOD = 6; + + // Other types + OTHER = 7; + } + + // The representative name for the entity. + string name = 1; + + // The entity type. + Type type = 2; + + // Metadata associated with the entity. + // + // Currently, only Wikipedia URLs are provided, if available. + // The associated key is "wikipedia_url". + map metadata = 3; + + // The salience score associated with the entity in the [0, 1.0] range. + // + // The salience score for an entity provides information about the + // importance or centrality of that entity to the entire document text. + // Scores closer to 0 are less salient, while scores closer to 1.0 are highly + // salient. + float salience = 4; + + // The mentions of this entity in the input document. The API currently + // supports proper noun mentions. + repeated EntityMention mentions = 5; +} + +// Represents the smallest syntactic building block of the text. +message Token { + // The token text. + TextSpan text = 1; + + // Parts of speech tag for this token. + PartOfSpeech part_of_speech = 2; + + // Dependency tree parse for this token. + DependencyEdge dependency_edge = 3; + + // [Lemma](https://en.wikipedia.org/wiki/Lemma_(morphology)) + // of the token. + string lemma = 4; +} + +// Represents the feeling associated with the entire text or entities in +// the text. +message Sentiment { + // Polarity of the sentiment in the [-1.0, 1.0] range. Larger numbers + // represent more positive sentiments. + float polarity = 1; + + // A non-negative number in the [0, +inf) range, which represents + // the absolute magnitude of sentiment regardless of polarity (positive or + // negative). + float magnitude = 2; +} + +// Represents part of speech information for a token. +message PartOfSpeech { + // The part of speech tags enum. + enum Tag { + // Unknown + UNKNOWN = 0; + + // Adjective + ADJ = 1; + + // Adposition (preposition and postposition) + ADP = 2; + + // Adverb + ADV = 3; + + // Conjunction + CONJ = 4; + + // Determiner + DET = 5; + + // Noun (common and proper) + NOUN = 6; + + // Cardinal number + NUM = 7; + + // Pronoun + PRON = 8; + + // Particle or other function word + PRT = 9; + + // Punctuation + PUNCT = 10; + + // Verb (all tenses and modes) + VERB = 11; + + // Other: foreign words, typos, abbreviations + X = 12; + + // Affix + AFFIX = 13; + } + + // The part of speech tag. + Tag tag = 1; +} + +// Represents dependency parse tree information for a token. +message DependencyEdge { + // The parse label enum for the token. + enum Label { + // Unknown + UNKNOWN = 0; + + // Abbreviation modifier + ABBREV = 1; + + // Adjectival complement + ACOMP = 2; + + // Adverbial clause modifier + ADVCL = 3; + + // Adverbial modifier + ADVMOD = 4; + + // Adjectival modifier of an NP + AMOD = 5; + + // Appositional modifier of an NP + APPOS = 6; + + // Attribute dependent of a copular verb + ATTR = 7; + + // Auxiliary (non-main) verb + AUX = 8; + + // Passive auxiliary + AUXPASS = 9; + + // Coordinating conjunction + CC = 10; + + // Clausal complement of a verb or adjective + CCOMP = 11; + + // Conjunct + CONJ = 12; + + // Clausal subject + CSUBJ = 13; + + // Clausal passive subject + CSUBJPASS = 14; + + // Dependency (unable to determine) + DEP = 15; + + // Determiner + DET = 16; + + // Discourse + DISCOURSE = 17; + + // Direct object + DOBJ = 18; + + // Expletive + EXPL = 19; + + // Goes with (part of a word in a text not well edited) + GOESWITH = 20; + + // Indirect object + IOBJ = 21; + + // Marker (word introducing a subordinate clause) + MARK = 22; + + // Multi-word expression + MWE = 23; + + // Multi-word verbal expression + MWV = 24; + + // Negation modifier + NEG = 25; + + // Noun compound modifier + NN = 26; + + // Noun phrase used as an adverbial modifier + NPADVMOD = 27; + + // Nominal subject + NSUBJ = 28; + + // Passive nominal subject + NSUBJPASS = 29; + + // Numeric modifier of a noun + NUM = 30; + + // Element of compound number + NUMBER = 31; + + // Punctuation mark + P = 32; + + // Parataxis relation + PARATAXIS = 33; + + // Participial modifier + PARTMOD = 34; + + // The complement of a preposition is a clause + PCOMP = 35; + + // Object of a preposition + POBJ = 36; + + // Possession modifier + POSS = 37; + + // Postverbal negative particle + POSTNEG = 38; + + // Predicate complement + PRECOMP = 39; + + // Preconjunt + PRECONJ = 40; + + // Predeterminer + PREDET = 41; + + // Prefix + PREF = 42; + + // Prepositional modifier + PREP = 43; + + // The relationship between a verb and verbal morpheme + PRONL = 44; + + // Particle + PRT = 45; + + // Associative or possessive marker + PS = 46; + + // Quantifier phrase modifier + QUANTMOD = 47; + + // Relative clause modifier + RCMOD = 48; + + // Complementizer in relative clause + RCMODREL = 49; + + // Ellipsis without a preceding predicate + RDROP = 50; + + // Referent + REF = 51; + + // Remnant + REMNANT = 52; + + // Reparandum + REPARANDUM = 53; + + // Root + ROOT = 54; + + // Suffix specifying a unit of number + SNUM = 55; + + // Suffix + SUFF = 56; + + // Temporal modifier + TMOD = 57; + + // Topic marker + TOPIC = 58; + + // Clause headed by an infinite form of the verb that modifies a noun + VMOD = 59; + + // Vocative + VOCATIVE = 60; + + // Open clausal complement + XCOMP = 61; + + // Name suffix + SUFFIX = 62; + + // Name title + TITLE = 63; + + // Adverbial phrase modifier + ADVPHMOD = 64; + + // Causative auxiliary + AUXCAUS = 65; + + // Helper auxiliary + AUXVV = 66; + + // Rentaishi (Prenominal modifier) + DTMOD = 67; + + // Foreign words + FOREIGN = 68; + + // Keyword + KW = 69; + + // List for chains of comparable items + LIST = 70; + + // Nominalized clause + NOMC = 71; + + // Nominalized clausal subject + NOMCSUBJ = 72; + + // Nominalized clausal passive + NOMCSUBJPASS = 73; + + // Compound of numeric modifier + NUMC = 74; + + // Copula + COP = 75; + + // Dislocated relation (for fronted/topicalized elements) + DISLOCATED = 76; + } + + // Represents the head of this token in the dependency tree. + // This is the index of the token which has an arc going to this token. + // The index is the position of the token in the array of tokens returned + // by the API method. If this token is a root token, then the + // `head_token_index` is its own index. + int32 head_token_index = 1; + + // The parse label for the token. + Label label = 2; +} + +// Represents a mention for an entity in the text. Currently, proper noun +// mentions are supported. +message EntityMention { + // The mention text. + TextSpan text = 1; +} + +// Represents an output piece of text. +message TextSpan { + // The content of the output text. + string content = 1; + + // The API calculates the beginning offset of the content in the original + // document according to the [EncodingType][google.cloud.language.v1beta1.EncodingType] specified in the API request. + int32 begin_offset = 2; +} + +// The sentiment analysis request message. +message AnalyzeSentimentRequest { + // Input document. Currently, `analyzeSentiment` only supports English text + // ([Document.language][google.cloud.language.v1beta1.Document.language]="EN"). + Document document = 1; +} + +// The sentiment analysis response message. +message AnalyzeSentimentResponse { + // The overall sentiment of the input document. + Sentiment document_sentiment = 1; + + // The language of the text, which will be the same as the language specified + // in the request or, if not specified, the automatically-detected language. + string language = 2; +} + +// The entity analysis request message. +message AnalyzeEntitiesRequest { + // Input document. + Document document = 1; + + // The encoding type used by the API to calculate offsets. + EncodingType encoding_type = 2; +} + +// The entity analysis response message. +message AnalyzeEntitiesResponse { + // The recognized entities in the input document. + repeated Entity entities = 1; + + // The language of the text, which will be the same as the language specified + // in the request or, if not specified, the automatically-detected language. + string language = 2; +} + +// The request message for the advanced text annotation API, which performs all +// the above plus syntactic analysis. +message AnnotateTextRequest { + // All available features for sentiment, syntax, and semantic analysis. + // Setting each one to true will enable that specific analysis for the input. + message Features { + // Extract syntax information. + bool extract_syntax = 1; + + // Extract entities. + bool extract_entities = 2; + + // Extract document-level sentiment. + bool extract_document_sentiment = 3; + } + + // Input document. + Document document = 1; + + // The enabled features. + Features features = 2; + + // The encoding type used by the API to calculate offsets. + EncodingType encoding_type = 3; +} + +// The text annotations response message. +message AnnotateTextResponse { + // Sentences in the input document. Populated if the user enables + // [AnnotateTextRequest.Features.extract_syntax][google.cloud.language.v1beta1.AnnotateTextRequest.Features.extract_syntax]. + repeated Sentence sentences = 1; + + // Tokens, along with their syntactic information, in the input document. + // Populated if the user enables + // [AnnotateTextRequest.Features.extract_syntax][google.cloud.language.v1beta1.AnnotateTextRequest.Features.extract_syntax]. + repeated Token tokens = 2; + + // Entities, along with their semantic information, in the input document. + // Populated if the user enables + // [AnnotateTextRequest.Features.extract_entities][google.cloud.language.v1beta1.AnnotateTextRequest.Features.extract_entities]. + repeated Entity entities = 3; + + // The overall sentiment for the document. Populated if the user enables + // [AnnotateTextRequest.Features.extract_document_sentiment][google.cloud.language.v1beta1.AnnotateTextRequest.Features.extract_document_sentiment]. + Sentiment document_sentiment = 4; + + // The language of the text, which will be the same as the language specified + // in the request or, if not specified, the automatically-detected language. + string language = 5; +} + +// Represents the text encoding that the caller uses to process the output. +// Providing an `EncodingType` is recommended because the API provides the +// beginning offsets for various outputs, such as tokens and mentions, and +// languages that natively use different text encodings may access offsets +// differently. +enum EncodingType { + // If `EncodingType` is not specified, encoding-dependent information (such as + // `begin_offset`) will be set at `-1`. + NONE = 0; + + // Encoding-dependent information (such as `begin_offset`) is calculated based + // on the UTF-8 encoding of the input. C++ and Go are examples of languages + // that use this encoding natively. + UTF8 = 1; + + // Encoding-dependent information (such as `begin_offset`) is calculated based + // on the UTF-16 encoding of the input. Java and Javascript are examples of + // languages that use this encoding natively. + UTF16 = 2; + + // Encoding-dependent information (such as `begin_offset`) is calculated based + // on the UTF-32 encoding of the input. Python is an example of a language + // that uses this encoding natively. + UTF32 = 3; +}