{
  "title": "Gate Cloud Multilingual News Similarity Service",
  "description": "Service that takes two news article texts and attempts to determine whether they refer to the same underlying news story. The articles may be in the same or in different languages, and may optionally have been pre-tagged with named entities; if named entity annotations are present these are used as additional evidence when calculating the similarity score - for full details see https://cloud.gate.ac.uk/shopfront/displayItem/news-similarity",
  "version": "1.0.0",
  "jobControlOptions": "async-execute",
  "metadata": [
    {
      "title": "Marco Lettere",
      "role": "author",
      "href": "https://accounts.dev.d4science.org/auth/admin/realms/d4science/users/88c76e47-5881-4716-a2bf-02d3b4073574"
    },
    {
      "role": "category",
      "title": "Text_Analytics"
    }
  ],
  "inputs": {
    "ccpimage": {
      "id": "ccpimage",
      "title": "Runtime",
      "description": "The image of the runtime to use for method execution. This depends on the infrastructure specific protocol for interacting with registries.",
      "minOccurs": 1,
      "maxOccurs": 1,
      "schema": {
        "type": "string",
        "format": "url",
        "contentMediaType": "text/plain",
        "default": "hub.dev.d4science.org/ccp-runtimes/gatecloud-base:latest",
        "readOnly": true
      }
    },
    "inputFile": {
      "id": "inputFile",
      "title": "inputFile",
      "description": "Input CSV file",
      "minOccurs": 1,
      "maxOccurs": 1,
      "schema": {
        "type": "string",
        "format": "remotefile",
        "default": "",
        "contentMediaType": "text/csv"
      }
    },
    "columnSeparator": {
      "id": "columnSeparator",
      "title": "columnSeparator",
      "description": "Separator between columns - comma for a normal CSV file but tab is also supported",
      "minOccurs": 1,
      "maxOccurs": 1,
      "schema": {
        "type": "string",
        "enum": [
          "comma",
          "tab"
        ],
        "default": "comma"
      }
    },
    "hasHeaders": {
      "id": "hasHeaders",
      "title": "hasHeaders",
      "description": "Does the CSV have a header row?",
      "minOccurs": 1,
      "maxOccurs": 1,
      "schema": {
        "type": "string",
        "format": "boolean",
        "default": "true"
      }
    },
    "text1Column": {
      "id": "text1Column",
      "title": "text1Column",
      "description": "Column name (if hasHeaders) or index (1-based) that contains the first text",
      "minOccurs": 1,
      "maxOccurs": 1,
      "schema": {
        "type": "string",
        "format": "none",
        "default": "text1"
      }
    },
    "text2Column": {
      "id": "text2Column",
      "title": "text2Column",
      "description": "Column name (if hasHeaders) or index (1-based) that contains the second text",
      "minOccurs": 1,
      "maxOccurs": 1,
      "schema": {
        "type": "string",
        "format": "none",
        "default": "text2"
      }
    },
    "idColumn": {
      "id": "idColumn",
      "title": "idColumn",
      "description": "Column name (if hasHeaders) or index (1-based) that contains the row ID that will identify this pair in the output CSV file. Set to 0 to use the row number (1-based, excluding the header row if any)",
      "minOccurs": 1,
      "maxOccurs": 1,
      "schema": {
        "type": "string",
        "format": "none",
        "default": "id"
      }
    },
    "hasEntities": {
      "id": "hasEntities",
      "title": "hasEntities",
      "description": "Have the texts been annotated with named entities? See the documentation for details. The remaining parameters are ignored if hasEntities is false. (https://cloud.gate.ac.uk/info/help/sobigdata/similarity.html)",
      "minOccurs": 1,
      "maxOccurs": 1,
      "schema": {
        "type": "string",
        "format": "boolean",
        "default": "false"
      }
    },
    "text1LocationsColumn": {
      "id": "text1LocationsColumn",
      "title": "text1LocationsColumn",
      "description": "Column name (if hasHeaders) or index (1-based) that contains any locations such as countries, cities, rivers, etc. found in text 1.",
      "minOccurs": 1,
      "maxOccurs": 1,
      "schema": {
        "type": "string",
        "format": "none",
        "default": "text1_loc"
      }
    },
    "text2LocationsColumn": {
      "id": "text2LocationsColumn",
      "title": "text2LocationsColumn",
      "description": "Column name (if hasHeaders) or index (1-based) that contains any locations such as countries, cities, rivers, etc. found in text 2.",
      "minOccurs": 1,
      "maxOccurs": 1,
      "schema": {
        "type": "string",
        "format": "none",
        "default": "text2_loc"
      }
    },
    "text1DatesColumn": {
      "id": "text1DatesColumn",
      "title": "text1DatesColumn",
      "description": "Column name (if hasHeaders) or index (1-based) that contains any dates and times found in text 1.",
      "minOccurs": 1,
      "maxOccurs": 1,
      "schema": {
        "type": "string",
        "format": "none",
        "default": "text1_date"
      }
    },
    "text2DatesColumn": {
      "id": "text2DatesColumn",
      "title": "text2DatesColumn",
      "description": "Column name (if hasHeaders) or index (1-based) that contains any dates and times found in text 2.",
      "minOccurs": 1,
      "maxOccurs": 1,
      "schema": {
        "type": "string",
        "format": "none",
        "default": "text2_date"
      }
    },
    "text1NumbersColumn": {
      "id": "text1NumbersColumn",
      "title": "text1NumbersColumn",
      "description": "Column name (if hasHeaders) or index (1-based) that contains any numbers and measurement expressions found in text 1.",
      "minOccurs": 1,
      "maxOccurs": 1,
      "schema": {
        "type": "string",
        "format": "none",
        "default": "text1_num"
      }
    },
    "text2NumbersColumn": {
      "id": "text2NumbersColumn",
      "title": "text2NumbersColumn",
      "description": "Column name (if hasHeaders) or index (1-based) that contains any numbers and measurement expressions found in text 2.",
      "minOccurs": 1,
      "maxOccurs": 1,
      "schema": {
        "type": "string",
        "format": "none",
        "default": "text2_num"
      }
    },
    "text1OtherEntitiesColumn": {
      "id": "text1OtherEntitiesColumn",
      "title": "text1OtherEntitiesColumn",
      "description": "Column name (if hasHeaders) or index (1-based) that contains any other named entities, e.g. people, organizations or events found in text 1.",
      "minOccurs": 1,
      "maxOccurs": 1,
      "schema": {
        "type": "string",
        "format": "none",
        "default": "text1_oth"
      }
    },
    "text2OtherEntitiesColumn": {
      "id": "text2OtherEntitiesColumn",
      "title": "text2OtherEntitiesColumn",
      "description": "Column name (if hasHeaders) or index (1-based) that contains any other named entities, e.g. people, organizations or events found in text 2.",
      "minOccurs": 1,
      "maxOccurs": 1,
      "schema": {
        "type": "string",
        "format": "none",
        "default": "text2_oth"
      }
    }
  },
  "outputs": {
    "outputCsv": {
      "id": "outputCsv",
      "title": "outputCsv",
      "description": "Output file with one row per input pair giving the similarity score",
      "minOccurs": 1,
      "maxOccurs": 1,
      "metadata": [
        {
          "title": "result.csv",
          "role": "file",
          "href": "/ccp_data/output/result.csv"
        }
      ],
      "schema": {
        "type": "string",
        "contentEncoding": "binary",
        "contentMediaType": "text/csv"
      }
    }
  },
  "additionalParameters": {
    "parameters": [
      {
        "name": "deploy-script",
        "value": [
          "./download.sh '{{inputFile}}'"
        ]
      },
      {
        "name": "execute-script",
        "value": [
          "python build/news-similarity/news_similarity_service.py /ccp_data/inputFile.csv '{{columnSeparator}}' '{{hasHeaders}}' '{{text1Column}}' '{{text2Column}}' '{{idColumn}}' '{{hasEntities}}' '{{text1LocationsColumn}}' '{{text2LocationsColumn}}' '{{text1DatesColumn}}' '{{text2DatesColumn}}' '{{text1NumbersColumn}}' '{{text2NumbersColumn}}' '{{text1OtherEntitiesColumn}}' '{{text2OtherEntitiesColumn}}'",
          "cp -f result.csv /ccp_data/"
        ]
      },
      {
        "name": "undeploy-script",
        "value": []
      }
    ]
  },
  "links": [
    {
      "rel": "compatibleWith",
      "title": "D4Science development Infrastructure",
      "href": "infrastructures/d4science-dev-swarm"
    }
  ],
  "keywords": [
    "gatecloud"
  ],
  "id": "c3b4a5bb-34f0-4f84-bb57-573571d1dbf3"
}