[oalex] avoid redefinition of explode function
parent e2f8007433
commit a59d0ce9fc

strings.py | 55 lines changed
@@ -12,7 +12,6 @@ spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()
 
 folder_path = sys.argv[1]
 hdfs_output_path = sys.argv[2]
-working_dir = sys.argv[3]
 
 matchings_schema = ArrayType(
     StructType([
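Note: with working_dir gone, the job reads only two CLI arguments. A minimal sketch of the resulting invocation contract (the usage message is illustrative, not part of the commit):

    import sys

    # strings.py now expects exactly two arguments (sketch; message text is assumed)
    if len(sys.argv) != 3:
        sys.exit("usage: strings.py <input_folder> <hdfs_output_path>")

    folder_path = sys.argv[1]       # folder of input JSON files
    hdfs_output_path = sys.argv[2]  # destination for the gzipped JSON output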
@@ -24,20 +23,6 @@ matchings_schema = ArrayType(
     ])
 )
 
-result_schema = StructType([
-    StructField("affiliation", StringType(), nullable=True),
-    StructField("match", ArrayType(
-        StructType([
-            StructField("Provenance", StringType(), nullable=True),
-            StructField("PID", StringType(), nullable=True),
-            StructField("Value", StringType(), nullable=True),
-            StructField("Confidence", DoubleType(), nullable=True),
-            StructField("Status", StringType(), nullable=True)
-        ])
-    ))
-])
-
-
 def oalex_affro_2(aff_string):
     try:
         matchings = affro(aff_string)
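Note: result_schema presumably typed the intermediate JSON that the old code dumped to the working directory and re-read; once the UDF is registered with matchings_schema, the Matchings column comes back already typed and the extra schema is dead code. A self-contained sketch of the pattern (field names taken from the deleted schema; the matcher body is a placeholder for affro):

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import udf, col
    from pyspark.sql.types import (ArrayType, StructType, StructField,
                                   StringType, DoubleType)

    matchings_schema = ArrayType(StructType([
        StructField("Provenance", StringType(), nullable=True),
        StructField("PID", StringType(), nullable=True),
        StructField("Value", StringType(), nullable=True),
        StructField("Confidence", DoubleType(), nullable=True),
        StructField("Status", StringType(), nullable=True)
    ]))

    @udf(returnType=matchings_schema)
    def match_affiliation(aff_string):
        # stand-in for affro(aff_string): returns a list of dicts shaped like the schema
        return [{"Provenance": "demo", "PID": None, "Value": aff_string,
                 "Confidence": 1.0, "Status": "active"}]

    spark = SparkSession.builder.appName("udf-schema-demo").getOrCreate()
    df = spark.createDataFrame([("CNRS, Paris",)], ["affiliation"])
    df.withColumn("Matchings", match_affiliation(col("affiliation"))).printSchema()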
@@ -77,34 +62,28 @@ def oalex_affro(aff_string):
 
 oalex_affro_udf = udf(oalex_affro_2, matchings_schema)
 
-explode = spark.read.json(folder_path) \
+exploded = spark.read.json(folder_path) \
     .filter(col("doi").isNotNull()) \
     .select(
         col("doi").alias("DOI"),
         col("rors").alias("OAlex"),
         explode(col("raw_aff_string")).alias("affiliation")  # one row per raw_aff_string, so the matching parallelizes better
     )
 
-rdd = explode \
+affs = exploded \
     .select("affiliation") \
     .distinct() \
-    .rdd \
-    .flatMap(lambda row: [{"affiliation": row['affiliation'], "match": m} for m in oalex_affro(row['affiliation'])])
-
-rdd.map(json.dumps).saveAsTextFile(working_dir + "/ tmp")
-affs = spark.read.json(working_dir + "/ tmp")
+    .withColumn("Matchings", oalex_affro_udf(col("affiliation")))
 
-affs.join(explode, on="affiliation") \
+affs.join(exploded, on="affiliation") \
     .select(col("DOI"),
             col("OAlex"),
-            col("match")
+            explode(col("Matchings")).alias("match")
     ) \
     .groupBy("DOI") \
     .agg(first("OAlex").alias("OAlex"),  # OAlex is identical across the rows of a DOI, so take the first
          collect_list("match").alias("Matchings")  # each exploded match is collected again
     ) \
     .write \
     .mode("overwrite") \
     .option("compression","gzip") \
     .json(hdfs_output_path)
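The rename is the point of the commit: the old code rebound the name explode to a DataFrame, shadowing pyspark.sql.functions.explode. That worked only as long as nothing downstream called the function again, and the new withColumn/collect_list flow does need explode(col("Matchings")). A minimal standalone reproduction of the failure the rename avoids:

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import explode, col

    spark = SparkSession.builder.appName("shadowing-demo").getOrCreate()
    df = spark.createDataFrame([("d1", ["aff one", "aff two"])],
                               ["doi", "raw_aff_string"])

    # Rebinding the name, as the old code did, replaces the function
    # with a DataFrame for the rest of the module:
    explode = df.select(explode(col("raw_aff_string")).alias("affiliation"))

    # The call on the right-hand side above still works because it is
    # evaluated before the rebinding; any later use fails:
    explode(col("Matchings"))  # TypeError: 'DataFrame' object is not callable

Renaming the DataFrame to exploded keeps the function importable under its usual name for the second explode over the collected matchings.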