#!/usr/bin/python
# coding=utf-8
"""Build a unified JSON-lines dump of data-source records from four registries.

Reads raw dumps from FAIRsharing (JSON records), OpenDOAR (TSV), re3data (TSV)
and ROAR (CSV), normalizes every record to a common schema
(id, originalId, websiteurl, name, alternativeNames, collectedfrom),
de-duplicates ROAR by id, and writes the union to ``output_path``.
"""
import ast
import json
import os
import shutil
import sys

from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession


def deleteDir(x):
    """Remove directory tree *x*; print (do not raise) on failure.

    Best-effort on purpose: a missing output directory on the first run is
    expected and must not abort the job.
    """
    try:
        shutil.rmtree(x)
    except OSError as e:
        print("Error: %s - %s." % (e.filename, e.strerror))


def stringToArray(x):
    """Parse a Python-literal list from string *x*; return [] on parse failure.

    Uses ast.literal_eval rather than eval: the input comes from an external
    dump, so evaluating it as arbitrary code would be unsafe. literal_eval
    raises ValueError/SyntaxError for anything that is not a plain literal,
    which we translate to the same [] fallback the pipeline expects.
    """
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        return []


def testValidJSON(x):
    """Return True iff *x* parses as a JSON document."""
    try:
        json.loads(x)
        return True
    except ValueError:  # json.JSONDecodeError subclasses ValueError
        return False


conf = SparkConf().setAppName('Create JSON Dump').setMaster('local[*]')
sc = SparkContext(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# RAW DUMPS PATH
fairsharing_path = "../../data/raw/fairsharing_dump_api_02_2022.json"
opendoar_path = "../../data/raw/openDoar.tsv"
re3data_path = "../../data/raw/re3data.tsv"
roar_path = "../../data/raw/export_roar_CSV.csv"
output_path = "../../data/interim/datasources"

# Clear the previous run's output so saveAsTextFile does not fail.
deleteDir(output_path)


# FAIRSHARING (one JSON record per line)
def _normalize_fairsharing(x):
    """Map a parsed FAIRsharing record to the common schema."""
    attrs = x['attributes']
    meta = attrs['metadata']
    return dict(id=attrs['url'],
                originalId=x['id'],
                websiteurl=meta['homepage'],
                name=meta['name'],
                alternativeNames=[attrs['abbreviation']],
                collectedfrom="FAIRsharing")


fairsharing = sc.textFile(fairsharing_path).filter(testValidJSON).map(json.loads)
fairsharing = fairsharing.map(_normalize_fairsharing)
print("FAIRSHARING DS", fairsharing.count())


# OPENDOAR (file structure: OpenAIREID orgIdentifier repositoryName
# alternativeNames repositoryURL description type updateDate startDate subject
# contentType institution metadataPolicy dataPolicy submissionPolicy
# contentPolicy software api)
def _parse_opendoar(line):
    """Split one OpenDOAR TSV line once and pick the fields we keep."""
    f = line.split("\t")
    return dict(fullname=f[1],
                shortname=f[1],
                alternativeNames=f[2],
                opendoar_id=f[0],
                homepage_url=f[3],
                subjects=f[9],
                openaire_id="opendoar::" + f[0])


def _normalize_opendoar(x):
    """Map a parsed OpenDOAR row to the common schema.

    The repositoryName column holds a JSON object; the display name is its
    'name' key.
    """
    return dict(id=x['openaire_id'],
                originalId=x['opendoar_id'],
                websiteurl=x['homepage_url'],
                name=json.loads(x['fullname'])['name'],
                alternativeNames=x['alternativeNames'],
                collectedfrom="OpenDOAR")


opendoar = sc.textFile(opendoar_path)
# Drop the header row (it contains the literal column name "system_metadata.id").
opendoar = opendoar.filter(lambda x: "system_metadata.id" not in x.split("\t"))
opendoar = opendoar.map(_parse_opendoar).map(_normalize_opendoar)
print("OPENDOAR DS", opendoar.count())


# RE3DATA (file structure: openaire_id re3data_id repository_name
# additional_name repository_url repository_id description type size
# update_date start_date end_date subject mission_statement content_type
# provider_type keyword institution policy database_access database_license
# data_access data_license data_upload data_upload_license software versioning
# api pid_system citation_guideline_url aid_system enhanced_publication
# quality_management certificate metadata_standard syndication remarks
# entry_date last_update)
def _parse_re3data(line):
    """Split one re3data TSV line once and pick the fields we keep."""
    f = line.split("\t")
    return dict(fullname=f[1],
                shortname=f[3],
                re3data_id=f[0],
                homepage_url=f[4],
                subjects=f[14],
                openaire_id="re3data::" + f[0])


def _normalize_re3data(x):
    """Map a parsed re3data row to the common schema.

    additional_name is serialized as a Python-literal list, hence
    stringToArray.
    """
    return dict(id=x['openaire_id'],
                originalId=x['re3data_id'],
                websiteurl=x['homepage_url'],
                name=x['fullname'],
                alternativeNames=stringToArray(x['shortname']),
                collectedfrom="re3data")


re3data = sc.textFile(re3data_path)
# Drop the header row (it contains the literal column name
# "repositoryName.language").
re3data = re3data.filter(lambda x: "repositoryName.language" not in x.split("\t"))
re3data = re3data.map(_parse_re3data).map(_normalize_re3data)
print("RE3DATA", re3data.count())


# ROAR (CSV; columns include "eprintid" at index 0, "home_page" at index 30
# and "title" at index 31)
def _parse_roar(line):
    """Split one ROAR CSV line once and pick the fields we keep.

    NOTE(review): a plain split(",") mis-parses quoted fields that contain
    commas; rows shifted that way are discarded by the emptiness filters
    below. Consider the csv module if lossless parsing is ever needed.
    """
    f = line.split(",")
    return dict(id=f[0], homepage_url=f[30], title=f[31])


def _normalize_roar(x):
    """Map a parsed ROAR row to the common schema, stripping CSV quoting."""
    return dict(id="roar::" + x['id'].replace("\"", ""),
                originalId=x['id'].replace("\"", ""),
                websiteurl=x['homepage_url'].replace("\"", ""),
                name=x['title'].replace("\"", ""),
                alternativeNames=[],
                collectedfrom="roar")


roar = sc.textFile(roar_path)
# Drop the header row and any row too short to hold the title column.
roar = roar.filter(lambda x: "\"eprintid\"" not in x.split(","))
roar = roar.filter(lambda x: len(x.split(",")) > 31)
roar = roar.map(_parse_roar)
roar = roar.filter(lambda x: x['title'] != "" and x['homepage_url'] != "")
roar = roar.map(_normalize_roar)
roar = roar.filter(lambda x: x['id'] != "" and x['name'] != "")
# De-duplicate by id, keeping the first record seen for each key.
roar = roar.map(lambda x: (x['id'], x)).reduceByKey(lambda x, y: x).map(lambda x: x[1])
print("ROAR:" + str(roar.count()))

all_ds = fairsharing.union(opendoar).union(re3data).union(roar)
all_ds.map(json.dumps).saveAsTextFile(output_path)