[oalex] passing the schema to avoid changes in the confidence type

2024-12-05 16:44:17 +01:00 · 2024-12-05 16:44:17 +01:00 · f8479083f2
parent 9440f863c9
commit f8479083f2
1 changed files with 17 additions and 3 deletions
--- a/strings.py
+++ b/strings.py
@ -23,6 +23,20 @@ matchings_schema = ArrayType(
    ])
 )

+result_schema = StructType([
+    StructField("affiliation", StringType(),nullable=True),
+    StructField("match", ArrayType(
+        StructType([
+            StructField("Provenance", StringType(), nullable=True),
+            StructField("PID", StringType(), nullable=True),
+            StructField("Value", StringType(), nullable=True),
+            StructField("Confidence", DoubleType(), nullable=True),
+            StructField("Status", StringType(), nullable=True)
+        ])
+    ))
+])
+
+
 def oalex_affro_2(aff_string):
    try:
        matchings = affro(aff_string)
@ -70,14 +84,14 @@ explode = spark.read.json(folder_path) \
        explode(col("raw_aff_string")).alias("affiliation") #this allows to split all the raw_aff_string and to parallelize better
    )

-affs = explode \
+rdd = explode \
    .select("affiliation") \
    .distinct() \
    .rdd \
-    .flatMap(lambda row: [{"affiliation":row['affiliation'], "match": m} for m in oalex_affro(row['affiliation'])]).toDF()
+    .flatMap(lambda row: [{"affiliation":row['affiliation'], "match": m} for m in oalex_affro(row['affiliation'])])

 #affs.map(json.dumps).saveAsTextFile("./out/rdd")
-
+affs = spark.createDataFrame(rdd, schema=result_schema)

 affs.join(explode, on="affiliation") \
    .select(col("DOI"),