Oalex #13

Merged
mkallipo merged 15 commits from openaire-workflow-ready_2 into openaire-workflow-ready 2024-12-09 18:51:24 +01:00
1 changed file with 17 additions and 3 deletions
Showing only changes of commit f8479083f2

@@ -23,6 +23,20 @@ matchings_schema = ArrayType(
     ])
 )
+result_schema = StructType([
+    StructField("affiliation", StringType(), nullable=True),
+    StructField("match", ArrayType(
+        StructType([
+            StructField("Provenance", StringType(), nullable=True),
+            StructField("PID", StringType(), nullable=True),
+            StructField("Value", StringType(), nullable=True),
+            StructField("Confidence", DoubleType(), nullable=True),
+            StructField("Status", StringType(), nullable=True)
+        ])
+    ))
+])
 def oalex_affro_2(aff_string):
     try:
         matchings = affro(aff_string)
@@ -70,14 +84,14 @@ explode = spark.read.json(folder_path) \
         explode(col("raw_aff_string")).alias("affiliation") #this allows to split all the raw_aff_string and to parallelize better
     )
-affs = explode \
+rdd = explode \
     .select("affiliation") \
     .distinct() \
     .rdd \
-    .flatMap(lambda row: [{"affiliation":row['affiliation'], "match": m} for m in oalex_affro(row['affiliation'])]).toDF()
+    .flatMap(lambda row: [{"affiliation":row['affiliation'], "match": m} for m in oalex_affro(row['affiliation'])])
 #affs.map(json.dumps).saveAsTextFile("./out/rdd")
+affs = spark.createDataFrame(rdd, schema=result_schema)
 affs.join(explode, on="affiliation") \
     .select(col("DOI"),
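
For context, a minimal, self-contained sketch (not part of this PR) of why the commit replaces .toDF() with spark.createDataFrame(rdd, schema=result_schema): with the explicit result_schema, rows whose match list is empty still get a correctly typed nested column, whereas schema inference on an RDD of dicts via .toDF() can fail or mistype empty arrays. fake_oalex_affro, the sample strings, and all field values below are made-up stand-ins for the repository's affro/oalex_affro logic.

# sketch only: explicit schema vs. inferred schema for the affiliation matches
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructType, StructField, StringType,
                               DoubleType, ArrayType)

spark = SparkSession.builder.appName("result-schema-sketch").getOrCreate()

result_schema = StructType([
    StructField("affiliation", StringType(), nullable=True),
    StructField("match", ArrayType(
        StructType([
            StructField("Provenance", StringType(), nullable=True),
            StructField("PID", StringType(), nullable=True),
            StructField("Value", StringType(), nullable=True),
            StructField("Confidence", DoubleType(), nullable=True),
            StructField("Status", StringType(), nullable=True)
        ])
    ))
])

def fake_oalex_affro(aff_string):
    # stand-in for the real matcher: yields zero or more "match" values,
    # each a (possibly empty) list of match records (made-up example values)
    if "CNR" in aff_string:
        return [[{"Provenance": "AffRo", "PID": "ROR",
                  "Value": "https://ror.org/example", "Confidence": 0.9,
                  "Status": "active"}]]
    return [[]]  # no match found: still conforms to the schema

rows = ["CNR, Pisa, Italy", "Unknown institute"]
rdd = (spark.sparkContext.parallelize(rows)
       .flatMap(lambda aff: [{"affiliation": aff, "match": m}
                             for m in fake_oalex_affro(aff)]))

# explicit schema keeps the nested "match" column typed even for empty lists
affs = spark.createDataFrame(rdd, schema=result_schema)
affs.printSchema()
affs.show(truncate=False)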