diff --git a/src/deduplication/ds_to_json.py b/src/deduplication/ds_to_json.py index 5c925b4..b50820b 100644 --- a/src/deduplication/ds_to_json.py +++ b/src/deduplication/ds_to_json.py @@ -27,24 +27,19 @@ def testValidJSON(x): except: return False -conf = SparkConf().setAppName('fairshare').setMaster('local[*]') +conf = SparkConf().setAppName('Create JSON Dump').setMaster('local[*]') sc = SparkContext(conf=conf) spark = SparkSession.builder.config(conf=conf).getOrCreate() -deleteDir("datasources") +# RAW DUMPS PATH +fairsharing_path = "../../data/raw/fairsharing_dump_api_02_2022.json" +opendoar_path = "../../data/raw/openDoar.tsv" +re3data_path = "../../data/raw/re3data.tsv" +roar_path = "../../data/raw/export_roar_CSV.csv" -fairsharing_path = "../data/raw/fairsharing_dump_api_02_2022.json" -opendoar_path = "../data/raw/openDoar.tsv" -re3data_path = "../data/raw/re3data.tsv" -roar_path = "../data/raw/export_roar_CSV.csv" +output_path = "../../data/interim/datasources" -# #FAIRSHARING V1.0 -# #FILE STRUCTURE: Full name of record | Short name | FAIRsharing URL | Homepage URL of the resource | Country Field | Subjects -# fairsharing = sc.textFile("/Users/miconis/Desktop/Fairsharing dedup/registries_analysis/data/raw/FAIRsharingDBrec_summary20210304.csv") -# fairsharing = fairsharing.filter(lambda x: " FAIRsharing URL " not in x.split("|")).map(lambda x: dict(fullname=x.split("|")[0], shortname=x.split("|")[1], fairshare_url=x.split("|")[2], homepage_url=x.split("|")[3], country=x.split("|")[4], subjects=x.split("|")[5])) -# fairsharing = fairsharing.map(lambda x: dict(id=x['fairshare_url'], originalId=x['fairshare_url'], websiteurl=x['homepage_url'], name=x['fullname'], alternativeNames=[x['shortname']], collectedfrom="FAIRsharing")) -# -# print "FAIRSHARING:" + str(fairsharing.count()) +deleteDir(output_path) #FAIRSHARING (file structure: ) fairsharing = sc.textFile(fairsharing_path).filter(testValidJSON).map(json.loads) @@ -73,4 +68,4 @@ print "ROAR:" + str(roar.count()) all_ds = fairsharing.union(opendoar).union(re3data).union(roar) -all_ds.map(json.dumps).saveAsTextFile("datasources") \ No newline at end of file +all_ds.map(json.dumps).saveAsTextFile(output_path) diff --git a/src/deduplication/mergerels_to_csv.py b/src/deduplication/mergerels_to_csv.py index 427d2f5..b581931 100644 --- a/src/deduplication/mergerels_to_csv.py +++ b/src/deduplication/mergerels_to_csv.py @@ -21,11 +21,13 @@ def relToRe3data(x): rel.append((re3data, id)) return rel -conf = SparkConf().setAppName('FAIRsharing').setMaster('local[*]') +conf = SparkConf().setAppName('Create Dedup CSV').setMaster('local[*]') sc = SparkContext(conf=conf) spark = SparkSession.builder.config(conf=conf).getOrCreate() -datasources = sc.textFile("datasources").map(json.loads) +input_path = "../../data/interim/datasources" + +datasources = sc.textFile(input_path).map(json.loads) print("TOTAL DS", datasources.count()) print("ROAR", datasources.filter(lambda x: "roar" in x['collectedfrom']).count()) @@ -44,9 +46,9 @@ joinRes = joinRes.map(lambda x: dict(dedup_id=x[1][0], duplicate_id=x[0], origin #CREATE CSV FILE (file structure: dedup_id; duplicate_id; original_id; name; collectedfrom csv_name = "ds_dedup_" + str(datetime.now()).split(".")[0].replace(" ", "_").replace(":", ".") + ".csv" -f = open(csv_name, "w") +f = open("../../data/processed/" + csv_name, "w") f.write("dedup_id;duplicate_id;original_id;name;collectedfrom\n") for row in joinRes.collect(): line = row['dedup_id'] + ";" + row['duplicate_id'] + ";" + row['original_id'] + ";\"" + row['name'] + "\";" + row['collectedfrom'] + "\n" f.write(line.encode("utf-8")) -f.close() \ No newline at end of file +f.close()