Oalex #13

strings.py
@@ -23,6 +23,20 @@ matchings_schema = ArrayType(
     ])
 )
 
+result_schema = StructType([
+    StructField("affiliation", StringType(), nullable=True),
+    StructField("match", ArrayType(
+        StructType([
+            StructField("Provenance", StringType(), nullable=True),
+            StructField("PID", StringType(), nullable=True),
+            StructField("Value", StringType(), nullable=True),
+            StructField("Confidence", DoubleType(), nullable=True),
+            StructField("Status", StringType(), nullable=True)
+        ])
+    ))
+])
+
+
 def oalex_affro_2(aff_string):
     try:
         matchings = affro(aff_string)
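The first hunk introduces result_schema, the explicit shape of the rows built downstream: one nullable affiliation string plus a match array whose elements carry the Provenance/PID/Value/Confidence/Status fields. As an illustration only (every value below is invented, not taken from the PR), one row conforming to this schema looks like:

    # Illustrative row under result_schema; all values are made up.
    # "match" is an array of structs, so even a single match sits in a list.
    row = {
        "affiliation": "Consiglio Nazionale delle Ricerche",
        "match": [
            {
                "Provenance": "AffRo",             # assumed provenance label
                "PID": "ROR",                      # assumed identifier type
                "Value": "https://ror.org/xxxx",   # placeholder identifier
                "Confidence": 0.9,
                "Status": "active",
            }
        ],
    }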
@@ -70,14 +84,14 @@ explode = spark.read.json(folder_path) \
     explode(col("raw_aff_string")).alias("affiliation")  # explode raw_aff_string so the affiliations can be processed in parallel
 )
 
-affs = explode \
+rdd = explode \
     .select("affiliation") \
     .distinct() \
     .rdd \
-    .flatMap(lambda row: [{"affiliation": row['affiliation'], "match": m} for m in oalex_affro(row['affiliation'])]).toDF()
+    .flatMap(lambda row: [{"affiliation": row['affiliation'], "match": m} for m in oalex_affro(row['affiliation'])])
 
 #affs.map(json.dumps).saveAsTextFile("./out/rdd")
+affs = spark.createDataFrame(rdd, schema=result_schema)
 
 affs.join(explode, on="affiliation") \
     .select(col("DOI"),
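Net effect of the second hunk: the flatMap now stops at a plain RDD of dicts, and the DataFrame is built with spark.createDataFrame(rdd, schema=result_schema) instead of toDF(). Passing the schema explicitly keeps the DataFrame well-typed even when the RDD is empty or a field is always None, exactly the cases where toDF()'s sampling-based inference fails. Below is a minimal self-contained sketch of that pattern; the fake_matcher stand-in, app name, and sample strings are invented for illustration, and only the schema mirrors strings.py:

    from pyspark.sql import SparkSession
    from pyspark.sql.types import (StructType, StructField, ArrayType,
                                   StringType, DoubleType)

    spark = SparkSession.builder.appName("schema-sketch").getOrCreate()

    result_schema = StructType([
        StructField("affiliation", StringType(), nullable=True),
        StructField("match", ArrayType(
            StructType([
                StructField("Provenance", StringType(), nullable=True),
                StructField("PID", StringType(), nullable=True),
                StructField("Value", StringType(), nullable=True),
                StructField("Confidence", DoubleType(), nullable=True),
                StructField("Status", StringType(), nullable=True)
            ])
        ))
    ])

    def fake_matcher(aff):
        # Hypothetical stand-in for oalex_affro(): zero or more match lists
        # per affiliation string. Returning [] is the case where toDF()
        # would have no rows from which to infer the "match" type.
        if "unknown" in aff:
            return []
        return [[{"Provenance": "AffRo", "PID": "ROR",
                  "Value": "https://ror.org/placeholder",
                  "Confidence": 0.9, "Status": "active"}]]

    rdd = spark.sparkContext.parallelize(["CNR, Pisa", "unknown org"]) \
        .flatMap(lambda a: [{"affiliation": a, "match": m} for m in fake_matcher(a)])

    # Explicit schema: no sampling, no inference failure on empty or null input.
    affs = spark.createDataFrame(rdd, schema=result_schema)
    affs.printSchema()

With the column types pinned down this way, the later join with explode on "affiliation" should see a stable schema no matter how many matches oalex_affro returns for a given string.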