Eliminare 'ScholexplorerPropagation.json'

Removed because it is not rendered.
This commit is contained in:
Miriam Baglioni 2020-06-29 18:09:20 +02:00
parent afec5e4cf9
commit 6bc22d7e39
1 changed file with 0 additions and 113 deletions

View File

@ -1,113 +0,0 @@
{
"paragraphs": [
{
"text": "%pyspark\nimport json\nimport sys\nimport re\nfrom pyspark.sql.types import *\nfrom pyspark.sql import SQLContext\nfrom pyspark.sql.functions import *\nimport copy\n\nsqlContext = SQLContext(sc)\n\npaper_dataset_propagation = {\n \"documents\": {\n \"prob\": 1.0,\n \"path\":set()\n },\n \"isderivedfrom\": {\n \"prob\": 0.9,\n \"path\":set()\n },\n \"issourceof\": {\n \"prob\": 0.7,\n \"path\":set()\n },\n \"reviews\": {\n \"prob\": 0.8,\n \"path\":set()\n },\n \"references\": {\n \"prob\": 1.0,\n \"path\":set()\n },\n \"issupplementedby\": {\n \"prob\": 0.8,\n \"path\":set()\n },\n \"cites\": {\n \"prob\": 0.8,\n \"path\":set()\n }\n}\n\ndataset_dataset_propagation= {\n \"issupplementedby\": {\n \"prob\": 1.0\n },\n \"documents\": {\n \"prob\": 0.9\n },\n \"iscitedby\": {\n \"prob\": 0.9\n },\n \"haspart\": {\n \"prob\": 0.7 },\n \"isdocumentedby\": {\n \"prob\": 0.7 },\n \"continues\": {\n \"prob\": 0.8 },\n \"cites\": {\n \"prob\": 1.0 },\n \"issupplementto\": {\n \"prob\": 0.8 },\n \"isnewversionof\": {\n \"prob\": 0.9 },\n \"ispartof\": {\n \"prob\": 0.8 },\n \"references\": {\n \"prob\": 1.0 },\n \"isreferencedby\": {\n \"prob\": 0.9 },\n \"iscontinuedby\": {\n \"prob\": 0.7 },\n \"isvariantformof\": {\n \"prob\": 0.9 }\n }\n\n\n \ndef propagateDataset(x):\n propagation = copy.deepcopy(x[1][0]) #dictionary {\"publicationId\":{propagation_probabilities and path}}\n dsprob = x[1][1] #dictionary {\"datasetId\":{dataset_probabilities}}\n source = dsprob.keys().pop()\n todelpid = set()\n for pid in propagation:\n entry = propagation[pid]\n if source in propagation[pid]['path']:\n todelpid.add(pid)\n continue\n for use in entry:\n if use == 'path':\n continue\n new_p = entry[use] * dsprob[source][\"prob\"]\n if new_p > 0.3:\n entry[use] = new_p\n propagation[pid]['path'].add(x[0])\n else:\n todelpid.add(pid)\n for pid in todelpid:\n del propagation[pid]\n return (source, propagation)\n\ndef reduceRelation(a, b):\n if a is None:\n return b\n if b is None:\n 
return a \n for pid in b:\n if not pid in a:\n a[pid] = copy.deepcopy(b[pid])\n else:\n probabilities = b[pid]\n for prob in probabilities:\n if prob =='path':\n for e in probabilities['path']:\n a[pid]['path'].add(e)\n continue\n if prob in a[pid]:\n if a[pid][prob] < probabilities[prob]:\n a[pid][prob] = probabilities[prob]\n else:\n a[pid][prob] = probabilities[prob]\n return a \n \ndef hasDescription(x):\n if 'description' in x and not x['description'] is None:\n for dic in x['description']:\n if dic['value'] is not None and dic['value'].strip() != \"\":\n return True\n return False\n ",
"user": "miriam.baglioni",
"dateUpdated": "2020-06-29T14:55:43+0000",
"config": {
"editorSetting": {
"language": "python",
"editOnDblClick": false,
"completionKey": "TAB",
"completionSupport": true
},
"colWidth": 12,
"editorMode": "ace/mode/python",
"fontSize": 9,
"results": {},
"enabled": true
},
"settings": {
"params": {},
"forms": {}
},
"results": {
"code": "SUCCESS",
"msg": []
},
"apps": [],
"jobName": "paragraph_1593089330199_-1573015420",
"id": "20200521-082800_526102814",
"dateCreated": "2020-06-25T12:48:50+0000",
"status": "READY",
"errorMessage": "",
"progressUpdateIntervalMs": 500,
"focus": true,
"$$hashKey": "object:1124"
},
{
"text": "%pyspark\n\nload_datasets = sc.textFile('/user/sandro.labruzzo/scholix/graph/dataset').map(json.loads).filter(lambda x: x['dataInfo'] is None or not x['dataInfo']['deletedbyinference'])\nload_publications = sc.textFile('/user/sandro.labruzzo/scholix/graph/publication').map(json.loads).filter(lambda x: x['dataInfo'] is None or not x['dataInfo']['deletedbyinference'])\nrelations_rdd = spark.read.parquet('/user/sandro.labruzzo/scholix/graph/relation').rdd.filter(lambda x: x['dataInfo'] is None or not x['dataInfo']['deletedbyinference'])\n\n#relations from publication to dataset in the graph subset \npubs_relation = relations_rdd.filter(lambda x: x['source'][:2] == '50' and x['target'][:2] == '60' and x['relType'].lower() in paper_dataset_propagation)\n\n#relation from dataset to dataset (no self loops) in the graph subset\ndats_relation = relations_rdd.filter(lambda x: x['source'][:2] == '60' and x['target'][:2] == '60' and x['source'] != x['target'] and x['relType'].lower() in dataset_dataset_propagation)\n\n#distinct publication subset appearing in a relation to at least one dataset\npubs_subgraph = pubs_relation.map(lambda x: (x['source'],1)).reduceByKey(lambda a,b : a+b).join(load_publications.map(lambda x:(x['id'],x))).map(lambda x: x[1][1])\n\n#publications with abstract\npubs_with_abst = pubs_subgraph.filter(hasDescription).map(lambda x:(x['id'],x))\n\n#relations from publication with abstract to dataset\nrel_pubs_dats_abst = pubs_relation.map(lambda x: (x['source'],x)).join(pubs_with_abst).map(lambda x: x[1][0]).map(lambda x: (x['target'], x)).join(load_datasets.map(lambda x: (x['id'], 1))).map(lambda x: x[1][0])\n\n\npublication_dataset = rel_pubs_dats_abst.map(lambda x: (x['target'], {x['source']:copy.deepcopy(paper_dataset_propagation[x['relType'].lower()])}))\ndataset_dataset = dats_relation.map(lambda x: (x['source'], {x['target']:copy.deepcopy(dataset_dataset_propagation[x['relType'].lower()])}))\n\n\npl1 = 
publication_dataset.reduceByKey(reduceRelation)\n\npreviuos_propagation = pl1\npl1.count()\ncount = 2\nhops = 3\nwhile (True):\n if count > hops:\n break\n pl_step1 = previuos_propagation.join(dataset_dataset)\n pl_step2 = pl_step1.map(propagateDataset).filter(lambda x: len(x[1]) > 0)\n if pl_step2.count() == 0:\n break\n pl_step3 = pl_step2.reduceByKey(reduceRelation)\n current_propagation = pl_step3.union(previuos_propagation).reduceByKey(reduceRelation)\n current_propagation.count()\n count += 1\n previuos_propagation = current_propagation\n\n",
"user": "miriam.baglioni",
"dateUpdated": "2020-06-29T14:52:36+0000",
"config": {
"editorSetting": {
"language": "python",
"editOnDblClick": false,
"completionKey": "TAB",
"completionSupport": true
},
"colWidth": 12,
"editorMode": "ace/mode/python",
"fontSize": 9,
"results": {},
"enabled": true
},
"settings": {
"params": {},
"forms": {}
},
"apps": [],
"jobName": "paragraph_1593089330225_1214619039",
"id": "20200521-084556_457403103",
"dateCreated": "2020-06-25T12:48:50+0000",
"status": "READY",
"errorMessage": "",
"progressUpdateIntervalMs": 500,
"$$hashKey": "object:1125"
},
{
"text": "%pyspark\n",
"user": "miriam.baglioni",
"dateUpdated": "2020-06-29T14:53:46+0000",
"config": {
"colWidth": 12,
"fontSize": 9,
"enabled": true,
"results": {},
"editorSetting": {
"language": "scala",
"editOnDblClick": false,
"completionKey": "TAB",
"completionSupport": true
},
"editorMode": "ace/mode/scala"
},
"settings": {
"params": {},
"forms": {}
},
"apps": [],
"jobName": "paragraph_1593442426323_1460687479",
"id": "20200629-145346_169818547",
"dateCreated": "2020-06-29T14:53:46+0000",
"status": "READY",
"progressUpdateIntervalMs": 500,
"focus": true,
"$$hashKey": "object:1868"
}
],
"name": "ScholexplorerPropagation",
"id": "2FB9ZGBK4",
"noteParams": {},
"noteForms": {},
"angularObjects": {
"md:shared_process": [],
"spark:miriam.baglioni:": []
},
"config": {
"isZeppelinNotebookCronEnable": false,
"looknfeel": "default",
"personalizedMode": "false"
},
"info": {}
}