Oalex #13

strings.py | 55
@@ -12,7 +12,6 @@ spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()
 
 folder_path = sys.argv[1]
 hdfs_output_path = sys.argv[2]
-working_dir = sys.argv[3]
 
 matchings_schema = ArrayType(
     StructType([
@@ -24,20 +23,6 @@ matchings_schema = ArrayType(
     ])
 )
-
-result_schema = StructType([
-    StructField("affiliation", StringType(), nullable=True),
-    StructField("match", ArrayType(
-        StructType([
-            StructField("Provenance", StringType(), nullable=True),
-            StructField("PID", StringType(), nullable=True),
-            StructField("Value", StringType(), nullable=True),
-            StructField("Confidence", DoubleType(), nullable=True),
-            StructField("Status", StringType(), nullable=True)
-        ])
-    ))
-])
-
 
 def oalex_affro_2(aff_string):
     try:
         matchings = affro(aff_string)
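Reviewer note (not part of the diff): the hunk above drops result_schema because the new code declares the UDF's return type directly. Below is a minimal, self-contained sketch of that pattern; the nested field names are copied from the removed result_schema and are only assumed to mirror matchings_schema, and fake_affro() is a stand-in for the project's affro() matcher.

# Hypothetical sketch of the UDF-with-declared-schema pattern; names are assumptions.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, DoubleType, StringType, StructField, StructType

match_schema = ArrayType(StructType([
    StructField("Provenance", StringType(), nullable=True),
    StructField("PID", StringType(), nullable=True),
    StructField("Value", StringType(), nullable=True),
    StructField("Confidence", DoubleType(), nullable=True),
    StructField("Status", StringType(), nullable=True),
]))

def fake_affro(aff_string):
    # Stand-in for affro(): pretend every affiliation yields one match.
    return [{"Provenance": "AffRo", "PID": "ROR", "Value": "example-ror-id",
             "Confidence": 0.9, "Status": "active"}]

def match_affiliation(aff_string):
    # Mirrors the try in oalex_affro_2: one bad string must not fail the whole task.
    try:
        return fake_affro(aff_string)
    except Exception:
        return []

match_udf = udf(match_affiliation, match_schema)

if __name__ == "__main__":
    spark = SparkSession.builder.appName("udf-sketch").getOrCreate()
    df = spark.createDataFrame([("University of Example",)], ["affiliation"])
    df.withColumn("Matchings", match_udf(col("affiliation"))).show(truncate=False)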
@@ -77,34 +62,28 @@ def oalex_affro(aff_string):
 
+oalex_affro_udf = udf(oalex_affro_2, matchings_schema)
 
-explode = spark.read.json(folder_path) \
+exploded = spark.read.json(folder_path) \
     .filter(col("doi").isNotNull()) \
     .select(
         col("doi").alias("DOI"),
         col("rors").alias("OAlex"),
         explode(col("raw_aff_string")).alias("affiliation")  # one row per raw_aff_string entry, so the matching parallelizes better
     )
 
-rdd = explode \
+affs = exploded \
     .select("affiliation") \
     .distinct() \
-    .rdd \
-    .flatMap(lambda row: [{"affiliation": row['affiliation'], "match": m} for m in oalex_affro(row['affiliation'])])
-
-rdd.map(json.dumps).saveAsTextFile(working_dir + "/ tmp")
-affs = spark.read.json(working_dir + "/ tmp")
+    .withColumn("Matchings", oalex_affro_udf(col("affiliation")))
 
-affs.join(explode, on="affiliation") \
+affs.join(exploded, on="affiliation") \
     .select(col("DOI"),
             col("OAlex"),
-            col("match")
+            explode(col("Matchings")).alias("match")  # one row per match, so matches can be regrouped per DOI
            ) \
     .groupBy("DOI") \
     .agg(first("OAlex").alias("OAlex"),  # OAlex is identical for every row of a DOI, so the first value is enough
          collect_list("match").alias("Matchings")  # each exploded match is collected back into one list per DOI
         ) \
     .write \
     .mode("overwrite") \
     .option("compression", "gzip") \
     .json(hdfs_output_path)
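Reviewer note (not part of the diff): to sanity-check the new aggregation path, here is a toy, self-contained run of the same explode / match / join / groupBy round trip. The DOIs, ROR values and the one-element "matcher" are made up; only the DataFrame operations mirror the code above.

# Hypothetical toy data; the stand-in matcher just wraps the affiliation string.
from pyspark.sql import SparkSession
from pyspark.sql.functions import array, col, collect_list, explode, first

spark = SparkSession.builder.appName("aggregation-sketch").getOrCreate()

raw = spark.createDataFrame(
    [("10.1234/a", ["ror-1"], ["Univ A", "Univ B"]),
     ("10.1234/b", ["ror-2"], ["Univ A"])],
    ["doi", "rors", "raw_aff_string"],
)

exploded = raw.filter(col("doi").isNotNull()).select(
    col("doi").alias("DOI"),
    col("rors").alias("OAlex"),
    explode(col("raw_aff_string")).alias("affiliation"),  # one row per affiliation string
)

# Pretend matcher: wrap each distinct affiliation into a one-element "Matchings" array.
affs = exploded.select("affiliation").distinct() \
    .withColumn("Matchings", array(col("affiliation")))

result = (affs.join(exploded, on="affiliation")
          .select(col("DOI"), col("OAlex"),
                  explode(col("Matchings")).alias("match"))  # one row per individual match
          .groupBy("DOI")
          .agg(first("OAlex").alias("OAlex"),                # OAlex repeats within a DOI, keep one copy
               collect_list("match").alias("Matchings")))    # regroup the exploded matches per DOI

result.show(truncate=False)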