Oalex #13

strings.py (40 changed lines)
--- a/strings.py
+++ b/strings.py
@@ -1,11 +1,11 @@
-import json
+import time
 
 from pyspark.sql.types import StringType, ArrayType, StructType, StructField, DoubleType
+from threading import Thread
 from affro_cluster import *
 
 from pyspark.sql import SparkSession
-from pyspark.sql.functions import col, explode, first, collect_set, udf
+from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set
 import sys
 
 spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()
@@ -23,10 +23,14 @@ matchings_schema = ArrayType(
     ])
 )
 
-def oalex_affro_2(aff_string):
+operation_counter = spark.sparkContext.accumulator(0)
 
+#Version of affro application on a single raw_aff_string and returns just the Matchins set
+def oalex_affro(aff_string):
+    global operation_counter
     try:
         matchings = affro(aff_string)
+        operation_counter += 1
         # Ensure matchings is a list, even if affro returns a single dict
         if not isinstance(matchings, list):
             matchings = [matchings]
@@ -49,18 +53,15 @@ def oalex_affro_2(aff_string):
         print(f"Error processing affiliation string {aff_string}: {str(e)}")
         return ()
 
-#Version of affro application on a single raw_aff_string and returns just the Matchins set
-def oalex_affro(aff_string):
-    try:
-        matchings = affro(aff_string)
-        if not isinstance(matchings, list):
-            matchings = [matchings]
-        return matchings
-    except Exception as e:
-        print(f"Error processing affiliation string {aff_string}: {str(e)}")
-        return []
+oalex_affro_udf = udf(oalex_affro, matchings_schema)
+monitor_done = False
 
-oalex_affro_udf = udf(oalex_affro_2, matchings_schema)
+def monitor_counter(interval):
+    while True:
+        print(f"Number of calls to AffRo: {operation_counter.value}")
+        time.sleep(interval)
+        if monitor_done:
+            break
 
 exploded = spark.read.json(folder_path) \
     .filter(col("doi").isNotNull()) \
@@ -69,6 +70,10 @@ exploded = spark.read.json(folder_path) \
         col("rors").alias("OAlex"),
         explode(col("raw_aff_string")).alias("affiliation") #this allows to split all the raw_aff_string and to parallelize better
     )
 
+monitor_thread = Thread(target=monitor_counter, args=(600,), daemon=True)
+monitor_thread.start()
+
 affs = exploded \
     .select("affiliation") \
     .distinct() \
@@ -87,3 +92,6 @@ affs.join(exploded, on="affiliation") \
     .mode("overwrite") \
     .option("compression","gzip") \
     .json(hdfs_output_path)
+
+monitor_done = True
+monitor_thread.join()
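The core change is instrumentation: the PR renames oalex_affro_2 to oalex_affro, drops the duplicate definition, and adds a Spark accumulator that counts UDF invocations on the executors, polled every 600 seconds by a daemon thread on the driver until the write finishes. Below is a minimal self-contained sketch of the same pattern; the app name, toy UDF, sample data, and short interval are placeholders, not part of the PR. One caveat worth flagging: accumulator updates made inside transformations can be applied more than once if Spark retries a task, so the printed figure is an approximate progress signal, not an exact call count.

import time
from threading import Thread

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.appName("AccumulatorMonitorDemo").getOrCreate()

# Driver-defined accumulator: executors can only add to it,
# and only the driver can read .value.
call_counter = spark.sparkContext.accumulator(0)
monitor_done = False

def count_len(s):
    global call_counter
    call_counter += 1  # incremented on the executor, merged back on the driver
    return len(s)

count_len_udf = udf(count_len, IntegerType())

def monitor_counter(interval):
    # Runs on the driver; mirrors the polling loop added in this PR.
    while True:
        print(f"UDF calls so far: {call_counter.value}")
        time.sleep(interval)
        if monitor_done:
            break

monitor_thread = Thread(target=monitor_counter, args=(2,), daemon=True)
monitor_thread.start()

df = spark.createDataFrame([("a",), ("bb",), ("ccc",)], ["s"])
df.select(count_len_udf(col("s")).alias("n")).collect()  # action triggers the UDF

monitor_done = True
monitor_thread.join()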
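For reviewers, the surrounding pipeline (mostly unchanged context in these hunks) explodes raw_aff_string so each affiliation string becomes its own row, deduplicates with .distinct(), applies the UDF once per unique string, and joins the matchings back onto the exploded rows. The middle step falls between the hunks, so the withColumn call in this sketch is an assumption about what sits between .distinct() and the join, not a quote of the file; only the .select/.distinct() lines and the join on "affiliation" appear in the diff.

# Sketch of the dedupe-then-apply-then-join shape, using names from the diff.
# "Matchings" is an assumed output column name.
affs = exploded \
    .select("affiliation") \
    .distinct() \
    .withColumn("Matchings", oalex_affro_udf(col("affiliation")))

result = affs.join(exploded, on="affiliation")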
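The import line also gains collect_list alongside collect_set. How it is used downstream is not visible in these hunks, but the practical difference is that collect_set deduplicates values during aggregation while collect_list keeps every occurrence. A hypothetical usage, assuming aggregation by DOI as the rest of the script suggests:

from pyspark.sql.functions import collect_list, collect_set

# collect_set drops duplicate affiliations per DOI; collect_list keeps all of them.
by_doi = exploded.groupBy("doi").agg(
    collect_set("affiliation").alias("unique_affs"),
    collect_list("affiliation").alias("all_affs"),
)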