ccp.docs/source/developermanual/ccp_methods/Gate Cloud Twitie Named Ent...

363 lines
12 KiB
JSON

{
"title": "Gate Cloud Twitie Named Entity Recognizer For Tweets",
"description": "Named entity recognition service for Twitter data. Identifies person, location, organization etc. and also performs normalization of abbreviations and common shorthands (such as brb, gr8, 2day, etc.) - for full details see https://cloud.gate.ac.uk/shopfront/displayItem/twitie-named-entity-recognizer-for-tweets",
"version": "1.0.0",
"jobControlOptions": "async-execute",
"metadata": [
{
"title": "Marco Lettere",
"role": "author",
"href": "https://accounts.dev.d4science.org/auth/admin/realms/d4science/users/88c76e47-5881-4716-a2bf-02d3b4073574"
},
{
"role": "category",
"title": "Text_Analytics"
}
],
"inputs": {
"ccpimage": {
"id": "ccpimage",
"title": "Runtime",
"description": "The image of the runtime to use for method execution. This depends on the infrastructure specific protocol for interacting with registries.",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"format": "url",
"contentMediaType": "text/plain",
"default": "hub.dev.d4science.org/ccp-runtimes/gatecloud-base:latest",
"readOnly": true
}
},
"inputFile": {
"id": "inputFile",
"title": "inputFile",
"description": "Input CSV file",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"format": "remotefile",
"default": "",
"contentMediaType": "text/csv"
}
},
"columnSeparator": {
"id": "columnSeparator",
"title": "columnSeparator",
"description": "Separator between columns - comma for a normal CSV file but tab is also supported",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"enum": [
"comma",
"tab"
],
"default": "comma"
}
},
"hasHeaders": {
"id": "hasHeaders",
"title": "hasHeaders",
"description": "Does the CSV have a header row?",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"format": "boolean",
"default": "false"
}
},
"textColumn": {
"id": "textColumn",
"title": "textColumn",
"description": "Column name (if hasHeaders) or index (1-based) that contains the text to process",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"format": "none",
"default": "1"
}
},
"copyColumns": {
"id": "copyColumns",
"title": "copyColumns",
"description": "Comma-separated list of column names (if hasHeaders is true) or indexes (1-based) that should be copied from the input to output, for example a column representing the document identifier. 0 means do not copy any columns to the output.",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"format": "none",
"default": "0"
}
},
"output1": {
"id": "output1",
"title": "output1",
"description": "Output column definition (11 pre-configured options available, select as many as you wish, or if you prefer you can also add custom output column definitions below)",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"enum": [
"none",
"Person",
"Location",
"Organization",
"Date",
"URL",
"Hashtag tokenized",
"UserID",
"Tweet lang",
"Token string (category)",
"UserID user",
"Emoticon normalized"
],
"default": "none"
}
},
"output2": {
"id": "output2",
"title": "output2",
"description": "Output column definition",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"enum": [
"none",
"Person",
"Location",
"Organization",
"Date",
"URL",
"Hashtag tokenized",
"UserID",
"Tweet lang",
"Token string (category)",
"UserID user",
"Emoticon normalized"
],
"default": "none"
}
},
"output3": {
"id": "output3",
"title": "output3",
"description": "Output column definition",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"enum": [
"none",
"Person",
"Location",
"Organization",
"Date",
"URL",
"Hashtag tokenized",
"UserID",
"Tweet lang",
"Token string (category)",
"UserID user",
"Emoticon normalized"
],
"default": "none"
}
},
"output4": {
"id": "output4",
"title": "output4",
"description": "Output column definition",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"enum": [
"none",
"Person",
"Location",
"Organization",
"Date",
"URL",
"Hashtag tokenized",
"UserID",
"Tweet lang",
"Token string (category)",
"UserID user",
"Emoticon normalized"
],
"default": "none"
}
},
"output5": {
"id": "output5",
"title": "output5",
"description": "Output column definition",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"enum": [
"none",
"Person",
"Location",
"Organization",
"Date",
"URL",
"Hashtag tokenized",
"UserID",
"Tweet lang",
"Token string (category)",
"UserID user",
"Emoticon normalized"
],
"default": "none"
}
},
"output6": {
"id": "output6",
"title": "output6",
"description": "Output column definition",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"enum": [
"none",
"Person",
"Location",
"Organization",
"Date",
"URL",
"Hashtag tokenized",
"UserID",
"Tweet lang",
"Token string (category)",
"UserID user",
"Emoticon normalized"
],
"default": "none"
}
},
"output7": {
"id": "output7",
"title": "output7",
"description": "Output column definition",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"enum": [
"none",
"Person",
"Location",
"Organization",
"Date",
"URL",
"Hashtag tokenized",
"UserID",
"Tweet lang",
"Token string (category)",
"UserID user",
"Emoticon normalized"
],
"default": "none"
}
},
"output8": {
"id": "output8",
"title": "output8",
"description": "Output column definition",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"enum": [
"none",
"Person",
"Location",
"Organization",
"Date",
"URL",
"Hashtag tokenized",
"UserID",
"Tweet lang",
"Token string (category)",
"UserID user",
"Emoticon normalized"
],
"default": "none"
}
},
"moreOutput": {
"id": "moreOutput",
"title": "moreOutput",
"description": "More output column definitions, using the output specification language. Enter none if you do not require any extra outputs [a sequence of values separated by #](https://cloud.gate.ac.uk/info/help/sobigdata/#output-spec)",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"format": "none",
"default": "none"
}
}
},
"outputs": {
"result": {
"id": "result",
"title": "result",
"description": "Result CSV file",
"minOccurs": 1,
"maxOccurs": 1,
"metadata": [
{
"title": "result.csv",
"role": "file",
"href": "/ccp_data/output/result.csv"
}
],
"schema": {
"type": "string",
"contentEncoding": "binary",
"contentMediaType": "text/csv"
}
}
},
"additionalParameters": {
"parameters": [
{
"name": "deploy-script",
"value": [
"./download.sh {{inputFile}}"
]
},
{
"name": "execute-script",
"value": [
"python build/twitie-named-entity-recognizer-for-tweets/standard_service.py /ccp_data/inputFile.csv {{columnSeparator}} {{hasHeaders}} '{{textColumn}}' '{{copyColumns}}' '{{output1}}' '{{output2}}' '{{output3}}' '{{output4}}' '{{output5}}' '{{output6}}' '{{output7}}' '{{output8}}' '{{moreOutput}}'",
"cp -f result.csv /ccp_data/"
]
},
{
"name": "undeploy-script",
"value": []
}
]
},
"links": [
{
"rel": "compatibleWith",
"title": "D4Science development Infrastructure",
"href": "infrastructures/d4science-dev-swarm"
}
],
"keywords": [
"gatecloud"
],
"id": "003b05cb-d546-4348-9345-09eeb06b440d"
}