ccp.docs/source/methods_ccp/Gate Cloud Url Domain Analy...

156 lines
5.8 KiB
JSON
Raw Normal View History

{
"title": "Gate Cloud Url Domain Analysis",
"description": "Service that takes a list of URLs and, for each one, reports what multiple organisations that analyse the credibility of online content have said about the domain (or sometimes the domain path) in the URL. The input is a CSV file with a column of URLs, and the output is another CSV with one row for each item of credibility information available from the various sources, so a single URL may produce several rows if it has been rated by different organisations. By default the output contains at least one row for every input URL, even those with no credibility information, because the resolved domain (i.e. the eventual target if the URL comes from a shortener service such as bit.ly) may be useful on its own; such URLs can be omitted using the includeNoData parameter. For full details see https://cloud.gate.ac.uk/shopfront/displayItem/credibility-full-urls",
"version": "1.0.0",
"jobControlOptions": "async-execute",
"metadata": [
{
"title": "Marco Lettere",
"role": "author",
"href": "https://accounts.d4science.org/auth/admin/realms/d4science/users/09138708-9a19-4724-93d1-8c721d591da2"
},
{
"role": "category",
"title": "Misinformation_Detection"
}
],
"inputs": {
"ccpimage": {
"id": "ccpimage",
"title": "Runtime",
"description": "The image of the runtime to use for method execution. This depends on the infrastructure specific protocol for interacting with registries.",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"format": "url",
"contentMediaType": "text/plain",
"default": "hub.dev.d4science.org/ccp-runtimes/gatecloud-base:latest",
"readOnly": true
}
},
"inputFile": {
"id": "inputFile",
"title": "inputFile",
"description": "Input CSV file",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"format": "remotefile",
"default": "",
"contentMediaType": "text/csv"
}
},
"columnSeparator": {
"id": "columnSeparator",
"title": "columnSeparator",
"description": "Separator between columns - comma for a normal CSV file, but tab is also supported",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"enum": [
"comma",
"tab"
],
"default": "comma"
}
},
"hasHeaders": {
"id": "hasHeaders",
"title": "hasHeaders",
"description": "Does the CSV have a header row?",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"format": "boolean",
"default": "false"
}
},
"urlColumn": {
"id": "urlColumn",
"title": "urlColumn",
"description": "Column name (if hasHeaders) or index (1-based) that contains the URL to analyse",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"format": "none",
"default": "1"
}
},
"includeNoData": {
"id": "includeNoData",
"title": "includeNoData",
"description": "In addition to credibility information, should the output include rows for those URLs where no credibility data is available?",
"minOccurs": 1,
"maxOccurs": 1,
"schema": {
"type": "string",
"format": "boolean",
"default": "true"
}
}
},
"outputs": {
"outputCsv": {
"id": "outputCsv",
"title": "outputCsv",
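"description": "Output CSV file with one row for each item of credibility information found for the input URLs (plus, when includeNoData is true, a row for each URL with no credibility data)",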
"minOccurs": 1,
"maxOccurs": 1,
"metadata": [
{
"title": "result.csv",
"role": "file",
"href": "/ccp_data/output/result.csv"
}
],
"schema": {
"type": "string",
"contentEncoding": "binary",
"contentMediaType": "text/csv"
}
}
},
"additionalParameters": {
"parameters": [
{
"name": "deploy-script",
"value": [
"./download.sh {{inputFile}}"
]
},
{
"name": "execute-script",
"value": [
"python build/credibility-full-urls/url_domain_analysis_service.py /ccp_data/inputFile.csv {{columnSeparator}} {{hasHeaders}} '{{urlColumn}}' '{{includeNoData}}'",
"cp -f result.csv /ccp_data/"
]
},
{
"name": "undeploy-script",
"value": []
}
]
},
"links": [
{
"rel": "compatibleWith",
"title": "D4Science development Infrastructure",
"href": "infrastructures/d4science-dev-swarm"
},
{
"rel": "compatibleWith",
"title": "D4Science production Infrastructure",
"href": "infrastructures/d4science-prod-swarm"
}
],
"keywords": [
"gatecloud"
],
"id": "0cddac35-69d7-47a9-8909-43a746ec16c3"
}