Add affRo algorithm as an external library
parent 37c04cbad7
commit 50401a872f
@@ -6,7 +6,18 @@
        <artifactId>dhp-workflows</artifactId>
        <version>1.2.5-SNAPSHOT</version>
    </parent>

    <artifactId>dhp-aggregation</artifactId>

    <properties>
        <affro.release.version>1.0.0</affro.release.version>
    </properties>

    <scm>
        <url>https://code-repo.d4science.org/mkallipo/affRo</url>
        <connection>scm:git:https://code-repo.d4science.org/mkallipo/affRo.git</connection>
    </scm>

    <build>
        <plugins>
            <plugin>
@@ -43,6 +54,28 @@
                    <scalaVersion>${scala.version}</scalaVersion>
                </configuration>
            </plugin>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-scm-plugin</artifactId>
                <version>1.8.1</version>
                <configuration>
                    <connectionType>connection</connectionType>
                    <scmVersionType>tag</scmVersionType><!-- 'branch' can also be provided here -->
                    <scmVersion>${affro.release.version}</scmVersion><!-- in case of scmVersionType == 'branch', this field points to the branch name -->
                    <checkoutDirectory>${project.build.directory}/${oozie.package.file.name}/${oozieAppDir}/affRo</checkoutDirectory>
                </configuration>
                <executions>
                    <execution>
                        <id>checkout-affro</id>
                        <phase>prepare-package</phase>
                        <goals>
                            <goal>checkout</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

        </plugins>

    </build>

@@ -1,31 +0,0 @@
import sys
##import functions
from functions_cluster import *
from matching_cluster import *
from create_input_cluster import *
import json

dix_org = load_json('dix_acad.json')
dix_mult = load_json('dix_mult.json')
dix_city = load_json('dix_city.json')
dix_country = load_json('dix_country.json')

print('READY')

def affro(raw_aff_string):
    result = Aff_Ids(create_df_algorithm(raw_aff_string), dix_org, dix_mult, dix_city, dix_country, 0.5, 0.5 )
    return {'raw_affiliation_string':raw_aff_string, 'Matchings': [{'RORid':x[2], 'Confidence':x[1]} for x in result]}

#raw_aff = 'university of california, los angeles, university of athens, university of california, san diego, university of athens, greece'


# if __name__ == "__main__":
#     if len(sys.argv) != 2:
#         print("Usage: python affro_spark.py <string> <float1> <float2>")
#         sys.exit(1)
#
#     string_arg = sys.argv[1]
#     # float_arg1 = float(sys.argv[2])
#     # float_arg2 = float(sys.argv[3])
#
#     print(affro(string_arg))
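For orientation, a minimal usage sketch of the affro() entry point defined above (the affiliation string is purely illustrative; the JSON dictionaries are assumed to sit in the working directory, as in the file itself):

# illustrative only: call affro() on one raw affiliation string
matchings = affro('Department of Physics, University of Athens, Greece')
# the result carries the original string plus candidate ROR ids with confidence scores
for m in matchings['Matchings']:
    print(m['RORid'], m['Confidence'])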
@@ -1,35 +0,0 @@
#!/usr/bin/python3

from pyspark.sql import SQLContext
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql import SparkSession
import sys

from affro_cluster import *

# Initialize SparkSession
spark = SparkSession.builder.appName("CustomFunctionExample").getOrCreate()

output_folder = sys.argv[1]
print("Writing to folder: ", output_folder)

# Register the function as a UDF
affro_udf = udf(affro, StringType())

# Input list of strings
input_data = ["university of athens", "university of vienna", "UCLA"]

# # Convert the list to a Spark DataFrame
df = spark.createDataFrame(input_data, "string").toDF("raw_affiliation_string")

# # Apply your custom UDF to the DataFrame
df_with_custom_value = df.withColumn("affro_value", affro_udf(df["raw_affiliation_string"]))


# df_with_custom_value.show(truncate=False)
df_with_custom_value.write.mode("overwrite").option("delimiter", "\t").csv(output_folder)


# Stop the SparkSession
spark.stop()
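A hedged sketch of inspecting what the job above writes (the path is hypothetical and stands for the output_folder argument; the same SparkSession is assumed to still be open):

# illustrative only: read back the tab-separated output and show it
df_back = spark.read.option("delimiter", "\t").csv("/tmp/affro_output")  # hypothetical path
df_back.show(truncate=False)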
@@ -1,66 +0,0 @@
from functions_cluster import *

def create_df_algorithm(raw_aff_string):


    aff_no_symbols_d = substrings_dict(clean_string(remove_outer_parentheses(remove_leading_numbers(raw_aff_string))))

    dict_ = list(aff_no_symbols_d.values())

    i = 0


    while i < len(dict_) - 1:
        if is_contained('progr', dict_[i]) and is_contained('dep', dict_[i+1]):
            dict_.pop(i)


        elif (is_contained('assistant', dict_[i]) or is_contained('researcher', dict_[i]) or is_contained('phd', dict_[i]) or is_contained('student', dict_[i]) or is_contained('section', dict_[i]) or is_contained('prof', dict_[i]) or is_contained('director', dict_[i])) and (not is_contained('school', dict_[i+1]) or is_contained('univ', dict_[i+1]) or is_contained('inst', dict_[i+1]) or is_contained('lab', dict_[i+1]) or is_contained('fac', dict_[i+1])):
            dict_.pop(i)

        elif (is_contained('engineer', dict_[i]) or is_contained('progr', dict_[i]) or is_contained('unit', dict_[i]) or is_contained('lab', dict_[i]) or is_contained('dep', dict_[i]) or is_contained('school', dict_[i]) or is_contained('inst', dict_[i]) #or is_contained('hosp', dict_[i])
              or is_contained('fac', dict_[i])) and is_contained('univ', dict_[i+1]):
            if not is_contained('univ', dict_[i]):
                dict_.pop(i)

        elif is_contained('lab', dict_[i]) and (is_contained('colege', dict_[i+1]) or is_contained('inst', dict_[i+1]) or is_contained('dep', dict_[i+1]) or is_contained('school', dict_[i+1])):
            if not is_contained('univ', dict_[i]):
                dict_.pop(i)

        elif is_contained('dep', dict_[i]) and (is_contained('tech', dict_[i+1]) or is_contained('colege', dict_[i+1]) or is_contained('inst', dict_[i+1]) or is_contained('hosp', dict_[i+1]) or is_contained('school', dict_[i+1]) or is_contained('fac', dict_[i+1])):
            if not is_contained('univ', dict_[i]):
                dict_.pop(i)

        elif is_contained('inst',dict_[i]) and (is_contained('school', dict_[i+1]) or is_contained('dep', dict_[i+1]) or is_contained('acad', dict_[i+1]) or is_contained('hosp', dict_[i+1]) or is_contained('clin', dict_[i+1]) or is_contained('klin', dict_[i+1]) or is_contained('fak', dict_[i+1]) or is_contained('fac', dict_[i+1]) or is_contained('cent', dict_[i+1]) or is_contained('div', dict_[i+1])):
            if not is_contained('univ', dict_[i]):
                dict_.pop(i)

        elif is_contained('school',dict_[i]) and is_contained('colege', dict_[i+1]):
            if not is_contained('univ', dict_[i]):
                dict_.pop(i)
        else:
            i += 1

    light_aff = (', '.join((dict_)))

    for x in dict_:
        if x in city_names+remove_list:
            dict_.remove(x)


    dict_ = [shorten_keywords_spark([x])[0] for x in dict_]

    keywords= []
    def valueToCategory(value):
        flag = 0

        for k in categ_dicts:
            if k in value:
                flag = 1
        return flag

    aff_list = [{"index": i, "keywords": dict_[i], "category": valueToCategory(dict_[i])} for i in range(len(dict_))]

    filtered_list = [entry for entry in aff_list if entry.get("category") == 1]

    return [light_aff, filtered_list]
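To make the return shape above concrete, a hedged sketch of how the two-element result is consumed (mirroring the call in affro_cluster.py; the affiliation string is illustrative):

# illustrative only: create_df_algorithm() returns [light_aff, filtered_list]
light_aff, keyword_entries = create_df_algorithm('school of medicine, university of vienna, austria')
for entry in keyword_entries:
    # each entry is a dict of the form {"index": ..., "keywords": ..., "category": 1}
    print(entry['index'], entry['keywords'])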
File diff suppressed because one or more lines are too long
@@ -1 +0,0 @@
{"research": "Univ/Inst", "uniwersytet": "Univ/Inst", "investigacions": "Univ/Inst", "institu": "Univ/Inst", "istitut": "Univ/Inst", "univ": "Univ/Inst", "col": "Univ/Inst", "center": "Univ/Inst", "polytechnic": "Univ/Inst", "politecnico": "Univ/Inst", "centre": "Univ/Inst", "cnrs": "Univ/Inst", "faculty": "Univ/Inst", "school": "Univ/Inst", "academ": "Univ/Inst", "akadem": "Univ/Inst", "hochschule": "Univ/Inst", "ecole": "Univ/Inst", "tech": "Univ/Inst", "observ": "Univ/Inst", "escuela": "Univ/Inst", "escola": "Univ/Inst", "discovery programe": "Univ/Inst", "ku leuven": "Univ/Inst", "ucla": "Univ/Inst", "eth zurich": "Univ/Inst", "athena": "Univ/Inst", "openaire": "Univ/Inst", "erasmus": "Univ/Inst", "ist austria": "Univ/Inst", "lmu munich": "Univ/Inst", "cancer trials ireland": "Univ/Inst", "food safety authority": "Univ/Inst", "ucd": "Univ/Inst", "tcd": "Univ/Inst", "apc microbiome": "Univ/Inst", "nasa": "Univ/Inst", "ucl": "Univ/Inst", "zentrum": "Univ/Inst", "ncsr demokritos": "Univ/Inst", "panepistemio": "Univ/Inst", "forth": "Univ/Inst", "nui galway": "Univ/Inst", "nui maynooth": "Univ/Inst", "tu wien": "Univ/Inst", "tu dublin": "Univ/Inst", "lab": "Laboratory", "science": "Laboratory", "cientific": "Laboratory", "hospital": "Hospital", "clinic": "Hospital", "hopital": "Hospital", "klinik": "Hospital", "oncol": "Hospital", "medical": "Hospital", "health": "Hospital", "medicin": "Hospital", "gmbh": "Company", "company": "Company", "industr": "Company", "etaireia": "Company", "corporation": "Company", "inc": "Company", "museum": "Museum", "library": "Museum", "foundation": "Foundation", "asociation": "Foundation", "organization": "Foundation", "society": "Foundation", "group": "Foundation", "royal": "Foundation", "ofice": "Foundation", "trust": "Foundation", "district": "Government", "federation": "Government", "government": "Government", "municipal": "Government", "county": "Government", "council": "Government", "agency": "Government", "unknown": "Unknown", "google": "Specific", "yahoo": "Specific", "ebay": "Specific", "microsoft": "Specific", "teagasc": "Specific", "ibm research": "Specific", "alergan": "Specific", "analog devices": "Specific", "medtronic": "Specific", "xilinx": "Specific", "pfizer": "Specific", "glaxosmithkline": "Specific", "astrazeneca": "Specific"}
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,615 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
import html
|
||||
from unidecode import unidecode
|
||||
import json
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
#import pandas as pd
|
||||
|
||||
def load_txt(file_path):
|
||||
with open(file_path, 'r', encoding='utf-8') as file:
|
||||
list_ = [line.strip() for line in file]
|
||||
return list_
|
||||
|
||||
def load_pickled_dict(file_path):
|
||||
with open(file_path, 'rb') as file:
|
||||
pickled_dict = pickle.load(file)
|
||||
return pickled_dict
|
||||
|
||||
|
||||
def load_json(file_path):
|
||||
with open(file_path, 'r') as json_file:
|
||||
json_dict = json.load(json_file)
|
||||
return json_dict
|
||||
|
||||
categ_string = 'Laboratory|Univ/Inst|Hospital|Foundation|Specific|Museum'
|
||||
|
||||
def replace_double_consonants(text):
|
||||
# This regex pattern matches any double consonant
|
||||
pattern = r'([bcdfghjklmnpqrstvwxyz])\1'
|
||||
# The replacement is the first captured group (the single consonant)
|
||||
result = re.sub(pattern, r'\1', text, flags=re.IGNORECASE)
|
||||
return result
|
||||
|
||||
remove_list = [replace_double_consonants(x) for x in load_txt('remove_list.txt')]
|
||||
stop_words = load_txt('stop_words.txt')
|
||||
university_terms = [replace_double_consonants(x) for x in load_txt('university_terms.txt')]
|
||||
city_names = [replace_double_consonants(x) for x in load_txt('city_names.txt')]
|
||||
|
||||
categ_dicts = load_json('dix_categ.json')
|
||||
|
||||
|
||||
def is_contained(s, w):
|
||||
words = s.split() # Split the string 's' into a list of words
|
||||
for word in words:
|
||||
if word not in w: # If a word from 's' is not found in 'w'
|
||||
return False # Return False immediately
|
||||
return True # If all words from 's' are found in 'w', return True
|
||||
|
||||
def starts_with_any(string, prefixes):
|
||||
for prefix in prefixes:
|
||||
if string.startswith(prefix):
|
||||
return [True, prefix]
|
||||
return False
|
||||
|
||||
def remove_leading_numbers(s):
|
||||
return re.sub(r'^\d+', '', s)
|
||||
|
||||
def remove_outer_parentheses(string):
|
||||
"""Remove outer parentheses from the string if they enclose the entire string."""
|
||||
if string.startswith('(') and string.endswith(')'):
|
||||
return string[1:-1].strip()
|
||||
return string
|
||||
|
||||
def index_multiple_matchings(pairs):
|
||||
result_dict = {}
|
||||
|
||||
r_list = [pair[3] for pair in pairs]
|
||||
modified_list = [item for sublist in r_list for item in sublist]
|
||||
r = len(list(set(modified_list)))
|
||||
|
||||
for t in [pair[0] for pair in pairs]:
|
||||
key = t
|
||||
if key in result_dict and r>1:
|
||||
result_dict[key] += 1
|
||||
|
||||
else:
|
||||
result_dict[key] = 1
|
||||
|
||||
return result_dict
|
||||
|
||||
def avg_string(df, col):
|
||||
avg = []
|
||||
for i in range(len(df)):
|
||||
avg.append(sum(len(s) for s in df[col].iloc[i])/len(df[col].iloc[i]))
|
||||
return sum(avg)/len(avg)
|
||||
|
||||
def remove_stop_words(text):
|
||||
words = text.split()
|
||||
filtered_words = [word for word in words if word not in stop_words]
|
||||
return ' '.join(filtered_words)
|
||||
|
||||
|
||||
def remove_parentheses(text):
|
||||
return re.sub(r'\([^()]*\)', '', text)
|
||||
|
||||
|
||||
def replace_umlauts(text):
|
||||
normalized_text = unicodedata.normalize('NFKD', text)
|
||||
replaced_text = ''.join(c for c in normalized_text if not unicodedata.combining(c))
|
||||
return replaced_text
|
||||
|
||||
def protect_phrases(input_string, phrases):
|
||||
# Replace phrases with placeholders
|
||||
placeholder_map = {}
|
||||
for i, phrase in enumerate(phrases):
|
||||
placeholder = "__PLACEHOLDER_" + str(i) + "__"
|
||||
placeholder_map[placeholder] = phrase
|
||||
input_string = input_string.replace(phrase, placeholder)
|
||||
return input_string, placeholder_map
|
||||
|
||||
def restore_phrases(split_strings, placeholder_map):
|
||||
# Restore placeholders with original phrases
|
||||
restored_strings = []
|
||||
for s in split_strings:
|
||||
for placeholder, phrase in placeholder_map.items():
|
||||
s = s.replace(placeholder, phrase)
|
||||
restored_strings.append(s)
|
||||
return restored_strings
|
||||
|
||||
def replace_comma_spaces(text):
|
||||
return text.replace(' ', ' ').replace(' , ', ', ')
|
||||
|
||||
def split_string_with_protection(input_string, protected_phrases):
|
||||
# Step 1: Protect specific phrases
|
||||
input_string, placeholder_map = protect_phrases(input_string, protected_phrases)
|
||||
|
||||
# Step 2: Split the string on specified delimiters
|
||||
split_strings = [s.strip() for s in re.split(r'[,;/]| – ', input_string) if s.strip()]
|
||||
|
||||
# Step 3: Restore protected phrases
|
||||
split_strings = restore_phrases(split_strings, placeholder_map)
|
||||
|
||||
return split_strings
|
||||
|
||||
protected_phrases1 = [
|
||||
phrase.format(x=x)
|
||||
for x in city_names
|
||||
for phrase in [
|
||||
'university california, {x}',
|
||||
# 'university california , {x}',
|
||||
|
||||
'university colege hospital, {x}',
|
||||
# 'university colege hospital , {x}',
|
||||
|
||||
'national univ ireland, {x}',
|
||||
# 'national univ ireland , {x}',
|
||||
|
||||
'national university ireland, {x}',
|
||||
# 'national university ireland , {x}',
|
||||
|
||||
'university colege, {x}',
|
||||
# 'university colege , {x}',
|
||||
|
||||
'university hospital, {x}',
|
||||
# 'university hospital , {x}',
|
||||
|
||||
'imperial colege, {x}',
|
||||
# 'imperial colege , {x}'
|
||||
|
||||
'city university, {x}',
|
||||
# 'city university , {x}'
|
||||
|
||||
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
|
||||
replacements = {'uni versity':'university',
|
||||
'univ ':'university ',
|
||||
'univercity':'university',
|
||||
'universtiy':'university',
|
||||
'univeristy':'university',
|
||||
'universirty':'university',
|
||||
'universiti':'university',
|
||||
'universitiy':'university',
|
||||
'universty' :'university',
|
||||
'univ col': 'university colege',
|
||||
'belfield, dublin': 'dublin',
|
||||
'balsbridge, dublin': 'dublin', #ballsbridge
|
||||
'earlsfort terrace, dublin': 'dublin',
|
||||
'bon secours hospital, cork' : 'bon secours hospital cork',
|
||||
'bon secours hospital, dublin' : 'bon secours hospital dublin',
|
||||
'bon secours hospital, galway' : 'bon secours hospital galway',
|
||||
'bon secours hospital, tralee' : 'bon secours hospital tralee',
|
||||
'bon secours health system' : 'bon secours hospital dublin',
|
||||
'bon secours hospital, glasnevin' : 'bon secours hospital dublin',
|
||||
'imperial colege science, technology medicine' : 'imperial colege science technology medicine',
|
||||
'ucl queen square institute neurology' : 'ucl, london',
|
||||
'ucl institute neurology' : 'ucl, london',
|
||||
'royal holoway, university london' : 'royal holoway universi london', #holloway
|
||||
'city, university london' : 'city universi london',
|
||||
'city university, london' : 'city universi london',
|
||||
'aeginition':'eginition',
|
||||
'national technical university, athens' : 'national technical university athens'
|
||||
# 'harvard medical school' : 'harvard university'
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
def substrings_dict(string):
|
||||
# Split the input string and clean each substring
|
||||
# split_strings = split_string_with_protection(string.replace('univ coll', 'university college').replace('belfield, dublin', 'dublin').replace('ballsbridge, dublin', 'dublin').replace('earlsfort Terrace, dublin', 'dublin'), protected_phrases1)
|
||||
|
||||
for old, new in replacements.items():
|
||||
string = string.replace(old, new)
|
||||
split_strings = split_string_with_protection(string, protected_phrases1)
|
||||
|
||||
# Define a set of university-related terms for later use
|
||||
|
||||
|
||||
dict_string = {}
|
||||
index = 0
|
||||
for value in split_strings:
|
||||
|
||||
# Check if the substring contains any university-related terms
|
||||
if not any(term in value.lower() for term in university_terms):
|
||||
# Apply regex substitutions for common patterns
|
||||
|
||||
modified_value = re.sub(r'universi\w*', 'universi', value, flags=re.IGNORECASE)
|
||||
modified_value = re.sub(r'institu\w*', 'institu', modified_value, flags=re.IGNORECASE)
|
||||
modified_value = re.sub(r'centre*', 'center', modified_value, flags=re.IGNORECASE)
|
||||
modified_value = re.sub(r'\bsaint\b', 'st', modified_value, flags=re.IGNORECASE)
|
||||
modified_value = re.sub(r'\btrinity col\b', 'trinity colege', modified_value, flags=re.IGNORECASE)
|
||||
modified_value = re.sub(r'\btechnische\b', 'technological', modified_value, flags=re.IGNORECASE)
|
||||
|
||||
|
||||
|
||||
# Add the modified substring to the dictionary
|
||||
|
||||
dict_string[index] = modified_value.lower().strip()
|
||||
index += 1
|
||||
# elif 'universitetskaya' in value.lower():
|
||||
# index += 1
|
||||
|
||||
|
||||
# Add the original substring to the dictionary
|
||||
else:
|
||||
dict_string[index] = value.lower().strip()
|
||||
index += 1
|
||||
|
||||
return dict_string
|
||||
|
||||
|
||||
|
||||
def clean_string(input_string):
|
||||
# Temporarily replace " - " with a unique placeholder
|
||||
placeholder = "placeholder"
|
||||
# input_string = input_string.replace(" - ", placeholder)
|
||||
input_string = input_string.replace(" – ", placeholder)
|
||||
|
||||
# Unescape HTML entities and convert to lowercase
|
||||
input_string = replace_comma_spaces(remove_stop_words(replace_double_consonants(replace_umlauts(unidecode(remove_parentheses(html.unescape(input_string.lower())))))).strip())
|
||||
|
||||
# Normalize unicode characters (optional, e.g., replace umlauts)
|
||||
input_string = unidecode(input_string)
|
||||
|
||||
# Replace `/` and `–` with space (do not replace hyphen `-`)
|
||||
result = re.sub(r'[/\-]', ' ', input_string)
|
||||
|
||||
# Replace "saint" with "st"
|
||||
result = re.sub(r'\bsaint\b', 'st', result)
|
||||
result = re.sub(r'\baghia\b', 'agia', result)
|
||||
|
||||
|
||||
# Remove characters that are not from the Latin alphabet, or allowed punctuation
|
||||
result = replace_comma_spaces(re.sub(r'[^a-zA-Z\s,;/]', '', result).strip())
|
||||
|
||||
# Restore the " - " sequence from the placeholder
|
||||
result = result.replace(placeholder, " – ")
|
||||
|
||||
# Replace consecutive whitespace with a single space
|
||||
result = re.sub(r'\s+', ' ', result)
|
||||
#result = result.replace('ss', 's')
|
||||
|
||||
return result.strip() # Strip leading/trailing spaces
|
||||
|
||||
|
||||
def clean_string_facts(input_string):
|
||||
# Replace specified characters with space
|
||||
input_string = remove_stop_words(replace_umlauts(unidecode(remove_parentheses(html.unescape(input_string.lower())))))
|
||||
result = re.sub(r'[/\-,]', ' ', input_string)
|
||||
result = re.sub(r'\bsaint\b', 'st', result)
|
||||
|
||||
# Remove characters that are not from the Latin alphabet or numbers
|
||||
result = re.sub(r'[^a-zA-Z0-9\s;/-]', '', result)
|
||||
|
||||
# Replace consecutive whitespace with a single space
|
||||
result = re.sub(r'\s+', ' ', result)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def str_radius_u(string):
|
||||
string = string.lower()
|
||||
radius = 3
|
||||
|
||||
str_list = string.split()
|
||||
indices = []
|
||||
result = []
|
||||
|
||||
for i, x in enumerate(str_list):
|
||||
if is_contained('univers',x):
|
||||
indices.append(i)
|
||||
# elif is_contained('coll',x):
|
||||
# indices.append(i)
|
||||
|
||||
for r0 in indices:
|
||||
lmin =max(0,r0-radius)
|
||||
lmax =min(r0+radius, len(str_list))
|
||||
s = str_list[lmin:lmax+1]
|
||||
|
||||
result.append(' '.join(s))
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def str_radius_coll(string):
|
||||
string = string.lower()
|
||||
radius = 1
|
||||
|
||||
str_list = string.split()
|
||||
indices = []
|
||||
result = []
|
||||
|
||||
for i, x in enumerate(str_list):
|
||||
if is_contained('col',x):
|
||||
indices.append(i)
|
||||
|
||||
for r0 in indices:
|
||||
lmin =max(0,r0-radius)
|
||||
lmax =min(r0+radius, len(str_list))
|
||||
s = str_list[lmin:lmax]
|
||||
|
||||
result.append(' '.join(s))
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def str_radius_h(string):
|
||||
string = string.lower()
|
||||
radius = 3
|
||||
|
||||
str_list = string.split()
|
||||
indices = []
|
||||
result = []
|
||||
|
||||
for i, x in enumerate(str_list):
|
||||
if is_contained('hospital',x):
|
||||
indices.append(i)
|
||||
|
||||
for r0 in indices:
|
||||
lmin =max(0,r0-radius-1)
|
||||
lmax =min(r0+radius, len(str_list))
|
||||
s = str_list[lmin:lmax]
|
||||
|
||||
result.append(' '.join(s))
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def str_radius_c(string):
|
||||
string = string.lower()
|
||||
radius = 2
|
||||
|
||||
str_list = string.split()
|
||||
indices = []
|
||||
result = []
|
||||
|
||||
for i, x in enumerate(str_list):
|
||||
if is_contained('clinic',x) or is_contained('klinik',x):
|
||||
indices.append(i)
|
||||
|
||||
for r0 in indices:
|
||||
lmin =max(0,r0-radius-1)
|
||||
lmax =min(r0+radius, len(str_list))
|
||||
s = str_list[lmin:lmax]
|
||||
|
||||
result.append(' '.join(s))
|
||||
|
||||
return result
|
||||
|
||||
def str_radius_r(string):
|
||||
string = string.lower()
|
||||
radius = 2
|
||||
|
||||
str_list = string.split()
|
||||
indices = []
|
||||
result = []
|
||||
|
||||
for i, x in enumerate(str_list):
|
||||
if is_contained('research',x):
|
||||
indices.append(i)
|
||||
|
||||
for r0 in indices:
|
||||
lmin =max(0,r0-radius-1)
|
||||
lmax =min(r0+radius, len(str_list))
|
||||
s = str_list[lmin:lmax]
|
||||
|
||||
result.append(' '.join(s))
|
||||
|
||||
return result
|
||||
|
||||
def str_radius_spec(string):
|
||||
spec = False
|
||||
for x in string.split():
|
||||
try:
|
||||
if categ_dicts[x] == 'Specific':
|
||||
spec = True
|
||||
return x
|
||||
except:
|
||||
pass
|
||||
if spec == False:
|
||||
return string
|
||||
|
||||
|
||||
def avg_string(df, col):
|
||||
avg = []
|
||||
for i in range(len(df)):
|
||||
avg.append(sum(len(s) for s in df[col].iloc[i])/len(df[col].iloc[i]))
|
||||
return sum(avg)/len(avg)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def shorten_keywords(affiliations_simple):
|
||||
affiliations_simple_n = []
|
||||
|
||||
for aff in affiliations_simple:
|
||||
inner = []
|
||||
for str in aff:
|
||||
if 'universi' in str:
|
||||
inner.extend(str_radius_u(str))
|
||||
elif 'col' in str and 'trinity' in str:
|
||||
inner.extend(str_radius_coll(str))
|
||||
elif 'hospital' in str or 'hopita' in str:
|
||||
inner.extend(str_radius_h(str))
|
||||
elif 'clinic' in str or 'klinik' in str:
|
||||
inner.extend(str_radius_c(str))
|
||||
elif 'research council' in str:
|
||||
inner.extend(str_radius_r(str))
|
||||
else:
|
||||
inner.append(str_radius_spec(str))
|
||||
|
||||
affiliations_simple_n.append(inner)
|
||||
|
||||
return affiliations_simple_n
|
||||
|
||||
def shorten_keywords_spark(affiliations_simple):
|
||||
affiliations_simple_n = []
|
||||
|
||||
for aff in affiliations_simple:
|
||||
|
||||
if 'universi' in aff:
|
||||
affiliations_simple_n.extend(str_radius_u(aff))
|
||||
elif 'col' in aff and 'trinity' in aff:
|
||||
affiliations_simple_n.extend(str_radius_coll(aff))
|
||||
elif 'hospital' in aff or 'hopita' in aff:
|
||||
affiliations_simple_n.extend(str_radius_h(aff))
|
||||
elif 'clinic' in aff or 'klinik' in aff:
|
||||
affiliations_simple_n.extend(str_radius_c(aff))
|
||||
elif 'research council' in aff:
|
||||
affiliations_simple_n.extend(str_radius_r(aff))
|
||||
else:
|
||||
affiliations_simple_n.append(str_radius_spec(aff))
|
||||
|
||||
|
||||
return affiliations_simple_n
|
||||
|
||||
|
||||
def refine(list_, affil):
|
||||
affil = affil.lower()
|
||||
|
||||
ids = []
|
||||
|
||||
for matched_org_list in list_:
|
||||
|
||||
id_list = []
|
||||
|
||||
for matched_org in matched_org_list:
|
||||
|
||||
if dix_mult[matched_org] == 'unique':
|
||||
id_list.append(dix_acad[matched_org])
|
||||
else:
|
||||
city_found = False
|
||||
for city in dix_city[matched_org]:
|
||||
if city[0] in affil:
|
||||
id_list.append(city[1])
|
||||
city_found = True
|
||||
break
|
||||
|
||||
if not city_found:
|
||||
country_found = False
|
||||
|
||||
for country in dix_country[matched_org]:
|
||||
if country[0] in list(country_mapping.keys()):
|
||||
print(country[0])
|
||||
if country[0] in affil or country_mapping[country[0]][0] in affil or country_mapping[country[0]][0] in affil:
|
||||
id_list.append(country[1])
|
||||
country_found = True
|
||||
break
|
||||
|
||||
|
||||
|
||||
elif country[0] in affil:
|
||||
print('country found',country[0])
|
||||
|
||||
id_list.append(country[1])
|
||||
country_found = True
|
||||
break
|
||||
|
||||
|
||||
|
||||
if not country_found:
|
||||
id_list.append(dix_acad[matched_org])
|
||||
|
||||
|
||||
|
||||
ids.append(id_list)
|
||||
return ids
|
||||
|
||||
def compute_cos(x,s):
|
||||
vectorizer = CountVectorizer()
|
||||
|
||||
s_vector = vectorizer.fit_transform([s]).toarray() #Else we compute the similarity of s with the original affiiation name
|
||||
x_vector = vectorizer.transform([x]).toarray()
|
||||
|
||||
# Compute similarity between the vectors
|
||||
return cosine_similarity(x_vector, s_vector)[0][0]
|
||||
|
||||
|
||||
# def find_ror(string, simU, simG):
|
||||
# df = pd.DataFrame()
|
||||
|
||||
# df['Unique affiliations'] = [[string.lower()]]
|
||||
# academia = create_df_algorithm(df)
|
||||
|
||||
|
||||
# result = Aff_Ids(len(academia), academia,dix_acad, dix_mult, dix_city, dix_country, simU,simG)
|
||||
# if len(result)>0:
|
||||
|
||||
# dict_aff_open = {x: y for x, y in zip(result['Original affiliations'], result['Matched organizations'])}
|
||||
# dict_aff_id = {x: y for x, y in zip(result['Original affiliations'], result['unique ROR'])}
|
||||
|
||||
# dict_aff_score = {}
|
||||
# for i in range(len(result)):
|
||||
# if type(result['Similarity score'].iloc[i]) == list:
|
||||
# dict_aff_score[result['Original affiliations'].iloc[i]] = result['Similarity score'].iloc[i]
|
||||
# else:
|
||||
# dict_aff_score[result['Original affiliations'].iloc[i]] = [result['Similarity score'].iloc[i]]
|
||||
|
||||
|
||||
# pids = []
|
||||
# for i in range(len(df)):
|
||||
# pidsi = []
|
||||
# for aff in df['Unique affiliations'].iloc[i]:
|
||||
# if aff in list(dict_aff_id.keys()):
|
||||
# pidsi = pidsi + dict_aff_id[aff]
|
||||
# # elif 'unmatched organization(s)' not in pidsi:
|
||||
# # pidsi = pidsi + ['unmatched organization(s)']
|
||||
# pids.append(pidsi)
|
||||
|
||||
|
||||
# names = []
|
||||
# for i in range(len(df)):
|
||||
# namesi = []
|
||||
# for aff in df['Unique affiliations'].iloc[i]:
|
||||
# if aff in list(dict_aff_open.keys()):
|
||||
# try:
|
||||
# namesi = namesi + dict_aff_open[aff]
|
||||
# except TypeError:
|
||||
# namesi = namesi + [dict_aff_open[aff]]
|
||||
|
||||
# names.append(namesi)
|
||||
|
||||
# scores = []
|
||||
# for i in range(len(df)):
|
||||
# scoresi = []
|
||||
# for aff in df['Unique affiliations'].iloc[i]:
|
||||
# if aff in list(dict_aff_score.keys()):
|
||||
# scoresi = scoresi + dict_aff_score[aff]
|
||||
|
||||
# scores.append(scoresi)
|
||||
|
||||
|
||||
# df['Matched organizations'] = names
|
||||
# df['ROR'] = pids
|
||||
# df['Scores'] = scores
|
||||
|
||||
|
||||
|
||||
# def update_Z(row):
|
||||
# if len(row['ROR']) == 0 or len(row['Scores']) == 0:
|
||||
# return []
|
||||
|
||||
# new_Z = []
|
||||
# for ror, score in zip(row['ROR'], row['Scores']):
|
||||
# entry = {'ROR_ID': ror, 'Confidence': score}
|
||||
# new_Z.append(entry)
|
||||
# return new_Z
|
||||
|
||||
# matching = df.apply(update_Z, axis=1)
|
||||
|
||||
# df['Matchings'] = matching
|
||||
|
||||
|
||||
# return df['Matchings'].iloc[0]
|
||||
# else:
|
||||
# return 'no result'
|
|
@@ -1,326 +0,0 @@
|
|||
from collections import defaultdict
|
||||
from collections import Counter
|
||||
|
||||
import Levenshtein
|
||||
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
from functions_cluster import *
|
||||
from create_input_cluster import *
|
||||
|
||||
def best_sim_score(light_raw, l2, l3, l4, simU, simG):
|
||||
"""
|
||||
Finds the best match between a 'key word' and several legal names from the OpenAIRE database.
|
||||
---> corrects special cases in the main map that follows
|
||||
|
||||
Args:
|
||||
l1: List of level2 affiliations.
|
||||
l2: number of candidates.
|
||||
l3: List of pairs.
|
||||
l4: mult
|
||||
|
||||
Returns:
|
||||
List: Resulting list containing OpenAIRE names and their similarity scores.
|
||||
"""
|
||||
|
||||
vectorizer = CountVectorizer()
|
||||
numUniv = light_raw.lower().count('univ')
|
||||
result = []
|
||||
best = []
|
||||
s = light_raw
|
||||
for j in range(len(l3)):
|
||||
x = l3[j][1]
|
||||
|
||||
if [x, l3[j][2]] in result:
|
||||
continue
|
||||
|
||||
if l4[l3[j][0]] == 1:
|
||||
|
||||
if is_contained('univ', x.lower()) and l3[j][2]> simU:
|
||||
result.append([x, l3[j][2]])
|
||||
elif l3[j][2] >simG:
|
||||
result.append([x, l3[j][2]])
|
||||
|
||||
|
||||
|
||||
elif l3[j][2] >=0.98:# and (is_contained("univ", x.lower()) or is_contained("college", x.lower()) or is_contained("center", x.lower()) or is_contained("schule", x.lower())): # If the similarity score of a pair (s,x) was 1, we store it to results list
|
||||
result.append([l3[j][1], 1])
|
||||
|
||||
else:
|
||||
try:
|
||||
if not is_contained("univ", x.lower()):
|
||||
continue # Skip if x does not contain "university" or "univ"
|
||||
|
||||
# if (is_contained('hosp', x.lower()) and not is_contained('hosp', s)) or (not is_contained('hosp', x.lower()) and is_contained('hosp', s)) or (is_contained('hopital', x.lower()) and not is_contained('hopital', s)) or (not is_contained('hopital', x.lower()) and is_contained('hopital', s)):
|
||||
# continue
|
||||
s_vector = vectorizer.fit_transform([s]).toarray() #Else we compute the similarity of s with the original affiiation name
|
||||
x_vector = vectorizer.transform([x]).toarray()
|
||||
|
||||
# Compute similarity between the vectors
|
||||
similarity = cosine_similarity(x_vector, s_vector)[0][0]
|
||||
if similarity> 0.1:
|
||||
similarity_l = 1 - Levenshtein.distance(x, l3[j][0]) / max(len(x), len(l3[j][0]))
|
||||
|
||||
|
||||
best.append([x, similarity,similarity_l])#(similarity+similarity2)/2])
|
||||
except:
|
||||
KeyError
|
||||
|
||||
if best:
|
||||
# max_numbers = defaultdict(float)
|
||||
|
||||
|
||||
# Assuming best is a list of three-element lists
|
||||
# Each element is (string, number1, number2)
|
||||
max_numbers = defaultdict(float)
|
||||
for item in best:
|
||||
string, number1, number2 = item # Unpack the three elements
|
||||
max_numbers[string] = max(max_numbers[string], number1)
|
||||
|
||||
reduced_best = [[string, number1, number2] for string, number1, number2 in best if number1 == max_numbers[string]]
|
||||
|
||||
# Sort by number1 decreasingly and then by number2 in descending order
|
||||
reduced_best.sort(key=lambda x: (x[1], x[2]), reverse=True)
|
||||
|
||||
result = result + reduced_best
|
||||
|
||||
univ_list = []
|
||||
other_list = []
|
||||
|
||||
for r in result:
|
||||
if is_contained('univ',r[0]):
|
||||
univ_list.append(r)
|
||||
else:
|
||||
other_list.append(r)
|
||||
|
||||
limit = min(numUniv, l2)
|
||||
|
||||
if len(univ_list)> limit:
|
||||
result = univ_list[:limit] + other_list
|
||||
|
||||
result_dict = {}
|
||||
pairs_dict = {}
|
||||
|
||||
|
||||
for l in l3:
|
||||
pairs_dict[l[1]] = l[2]
|
||||
|
||||
|
||||
for p in result:
|
||||
result_dict[p[0]]= pairs_dict[p[0]]
|
||||
|
||||
|
||||
|
||||
|
||||
result_dict_list = [[y[0],result_dict[y[0]]] for y in result]
|
||||
|
||||
|
||||
return result_dict_list
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG):
|
||||
|
||||
"""
|
||||
Matches affiliations in DataFrame 'DF' with names from dictionary 'dix_org' and their ROR_ids based on similarity scores.
|
||||
|
||||
Args:
|
||||
m (int): The number of DOIs to check.
|
||||
DF (DataFrame): The input DataFrame containing affiliation data.
|
||||
dix_org (dict): A dictionary of names of organizations and their ROR_ids.
|
||||
simU (float): Similarity threshold for universities.
|
||||
simG (float): Similarity threshold for non-universities.
|
||||
|
||||
Returns:
|
||||
DataFrame: The final DataFrame with matched affiliations and their corresponding similarity scores.
|
||||
"""
|
||||
df_list = input[1]
|
||||
light_aff = input[0]
|
||||
vectorizer = CountVectorizer()
|
||||
|
||||
lnamelist = list(dix_org.keys())
|
||||
dix = {} # will store indeces and legalnames of organizations of the DOI { i : [legalname1, legalname2,...]}
|
||||
#pairs = []
|
||||
result = {}
|
||||
pairs = []
|
||||
|
||||
|
||||
def get_keywords(filtered_list):
|
||||
# Extract the "keywords" values from the dictionaries in filtered_list
|
||||
keywords_list = [entry["keywords"] for entry in filtered_list]
|
||||
|
||||
return keywords_list
|
||||
keywords = get_keywords(df_list)
|
||||
|
||||
|
||||
for k,s in enumerate(keywords):
|
||||
similar_k = []
|
||||
pairs_k = []
|
||||
|
||||
if s in lnamelist:
|
||||
similarity = 1
|
||||
similar_k.append(similarity)
|
||||
|
||||
pairs_k.append((s,s,similarity,dix_org[s]))
|
||||
pairs.append((s,s,similarity,dix_org[s]))
|
||||
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [s]
|
||||
else:
|
||||
dix[k].append(s)
|
||||
else:
|
||||
|
||||
for x in lnamelist:
|
||||
if is_contained(s, x):
|
||||
|
||||
x_vector = vectorizer.fit_transform([x]).toarray()
|
||||
s_vector = vectorizer.transform([s]).toarray()
|
||||
|
||||
# Compute similarity between the vectors
|
||||
similarity = cosine_similarity(x_vector, s_vector)[0][0]
|
||||
if similarity > min(simU, simG):
|
||||
if (is_contained('univ', s) and is_contained('univ', x)) and similarity > simU:
|
||||
similar_k.append(similarity)
|
||||
pairs_k.append((s,x,similarity,dix_org[x]))
|
||||
pairs.append((s,x,similarity,dix_org[x]))
|
||||
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [x]
|
||||
else:
|
||||
dix[k].append(x)
|
||||
elif (not is_contained('univ', s) and not is_contained('univ', x)) and similarity > simG:
|
||||
similar_k.append(similarity)
|
||||
pairs_k.append((s,x,similarity,dix_org[x]))
|
||||
pairs.append((s,x,similarity,dix_org[x]))
|
||||
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [x]
|
||||
else:
|
||||
dix[k].append(x)
|
||||
|
||||
elif is_contained(x, s):
|
||||
if (is_contained('univ', s) and is_contained('univ', x)):
|
||||
|
||||
s_vector = vectorizer.fit_transform([s]).toarray()
|
||||
x_vector = vectorizer.transform([x]).toarray()
|
||||
|
||||
# Compute similarity between the vectors
|
||||
similarity = cosine_similarity(s_vector, x_vector)[0][0]
|
||||
if similarity > simU: #max(0.82,sim):
|
||||
similar_k.append(similarity)
|
||||
pairs_k.append((s,x,similarity,dix_org[x]))
|
||||
pairs.append((s,x,similarity,dix_org[x]))
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [x]
|
||||
else:
|
||||
dix[k].append(x)
|
||||
elif not is_contained('univ', s) and not is_contained('univ', x):
|
||||
|
||||
s_vector = vectorizer.fit_transform([s]).toarray()
|
||||
x_vector = vectorizer.transform([x]).toarray()
|
||||
|
||||
# Compute similarity between the vectors
|
||||
similarity = cosine_similarity(s_vector, x_vector)[0][0]
|
||||
if similarity > simG: #max(0.82,sim):
|
||||
similar_k.append(similarity)
|
||||
pairs_k.append((s,x,similarity,dix_org[x]))
|
||||
pairs.append((s,x,similarity,dix_org[x]))
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [x]
|
||||
else:
|
||||
dix[k].append(x)
|
||||
|
||||
result[k] = pairs_k
|
||||
|
||||
multi = index_multiple_matchings(list(set(pairs)))
|
||||
# need_check = list(set([i for i in range(len(multi)) if list(multi.values())[i]>1]))
|
||||
# print('here', multi)
|
||||
# need_check_keys = [keywords[i] for i in range(len(keywords)) if multi[keywords[i]]>1]
|
||||
need_check_keys = []
|
||||
for i in range(len(keywords)):
|
||||
try:
|
||||
if multi[keywords[i]]>1:
|
||||
need_check_keys.append(keywords[i])
|
||||
except:
|
||||
pass
|
||||
|
||||
best = best_sim_score(light_aff, len(keywords), pairs, multi, simU, simG)
|
||||
matched_org = [x[0] for x in best]
|
||||
# best_o = []
|
||||
# best_s = []
|
||||
# best_result = []
|
||||
# for x in best:
|
||||
# best_o.append([x[i][0] for i in range(len(x))])
|
||||
# best_s.append([round(x[i][1],2) for i in range(len(x))])
|
||||
# num_mathced = [len(best_s[i]) for i in range(len(need_check))]
|
||||
ids = [dix_org[x[0]] for x in best]
|
||||
for i,x in enumerate(matched_org):
|
||||
# id_list = []
|
||||
if dix_mult[x] != 'unique':
|
||||
if x in list(dix_city_ror.keys()):
|
||||
match_found0 = False
|
||||
match_found = False
|
||||
|
||||
for city in dix_city_ror[x]:
|
||||
if city[0] in light_aff:
|
||||
if city[0] not in x:
|
||||
ids[i] = city[1]
|
||||
|
||||
match_found0 = True
|
||||
match_found = True
|
||||
break
|
||||
if not match_found:
|
||||
for city in dix_city_ror[x]:
|
||||
if city[0] in light_aff and city[0] not in x:
|
||||
ids[i] = city[1]
|
||||
match_found0 = True
|
||||
print('ok')
|
||||
break
|
||||
|
||||
if not match_found:
|
||||
match_found2 = False
|
||||
match_found3 = False
|
||||
|
||||
for country in dix_country_ror[x]:
|
||||
if country[0] == 'united states' and (country[0] in light_aff or 'usa' in light_aff):
|
||||
ids[i] = country[1]
|
||||
match_found2 = True
|
||||
match_found3 = True
|
||||
break
|
||||
|
||||
if country[0] == 'united kingdom' and (country[0] in light_aff or 'uk' in light_aff):
|
||||
ids[i] = country[1]
|
||||
match_found2 = True
|
||||
match_found3 = True
|
||||
break
|
||||
|
||||
elif country[0] in light_aff:
|
||||
|
||||
if country[0] not in x:
|
||||
ids[i] = country[1]
|
||||
match_found2 = True
|
||||
match_found3 = True
|
||||
break
|
||||
|
||||
if not match_found3:
|
||||
for country in dix_country_ror[x]:
|
||||
if country[0] in light_aff and country[0] in x:
|
||||
ids[i] = country[1]
|
||||
match_found2 = True
|
||||
break
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
results = [[x[0],x[1], ids[i]] for i,x in enumerate(best)]
|
||||
|
||||
return results #[[result[to_check[i]] for i in ready] + [to_check[2]], best[0]]
|
|
@@ -1,584 +0,0 @@
|
|||
galway
|
||||
maynooth
|
||||
duluth
|
||||
port arthur
|
||||
new orleans
|
||||
paterson
|
||||
santa barbara
|
||||
thornton
|
||||
westminster
|
||||
north las vegas
|
||||
stockton
|
||||
marysville
|
||||
fitchburg
|
||||
tallinn
|
||||
fargo
|
||||
seaside
|
||||
manaus
|
||||
porto
|
||||
quebec city
|
||||
hialeah
|
||||
normal
|
||||
kansas city
|
||||
delhi
|
||||
fort worth
|
||||
palermo
|
||||
olathe
|
||||
madison
|
||||
santa maria
|
||||
youngstown
|
||||
allentown
|
||||
santa clara
|
||||
charlotte
|
||||
agra
|
||||
palmdale
|
||||
kraków
|
||||
bendigo
|
||||
high point
|
||||
washington
|
||||
dallas
|
||||
grand prairie
|
||||
plano
|
||||
leipzig
|
||||
bratislava
|
||||
seville
|
||||
puebla
|
||||
lucknow
|
||||
toowoomba
|
||||
santa rosa
|
||||
sioux falls
|
||||
flint
|
||||
kissimmee
|
||||
lacey
|
||||
brownsville
|
||||
palm springs
|
||||
tyler
|
||||
minsk
|
||||
san diego
|
||||
los angeles
|
||||
edmonton
|
||||
college station
|
||||
toulouse
|
||||
garland
|
||||
florence
|
||||
saskatoon
|
||||
albury-wodonga
|
||||
newburgh
|
||||
danbury
|
||||
deltona
|
||||
south bend
|
||||
nagpur
|
||||
pomona
|
||||
memphis
|
||||
london
|
||||
lincoln
|
||||
chandler
|
||||
adelaide
|
||||
salt lake city
|
||||
edinburgh
|
||||
suzhou
|
||||
grayslake
|
||||
new york city
|
||||
kanpur
|
||||
brussels
|
||||
okayama
|
||||
tuscaloosa
|
||||
clarksville
|
||||
jackson
|
||||
boise city
|
||||
canton
|
||||
louisville
|
||||
varanasi
|
||||
columbus
|
||||
lorain
|
||||
vadodara
|
||||
orem
|
||||
chennai
|
||||
townsville
|
||||
eindhoventoronto
|
||||
wuhan
|
||||
norman
|
||||
winter haven
|
||||
eugene
|
||||
riga
|
||||
hamamatsu
|
||||
fresno
|
||||
lake charles
|
||||
budapest
|
||||
mobile
|
||||
lowell
|
||||
vienna
|
||||
tallahassee
|
||||
nanjing
|
||||
new haven
|
||||
sacramento
|
||||
leeds
|
||||
harlingen
|
||||
springdale
|
||||
perth
|
||||
sendai
|
||||
utica
|
||||
orange
|
||||
baltimore
|
||||
rochester
|
||||
rancho cucamonga
|
||||
bellevue
|
||||
fort wayne
|
||||
modesto
|
||||
pristina
|
||||
nuremberg
|
||||
stuttgart
|
||||
indore
|
||||
murfreesboro
|
||||
nottingham
|
||||
scranton
|
||||
lancaster
|
||||
abilene
|
||||
monterey
|
||||
sioux city
|
||||
bari
|
||||
chula vista
|
||||
ahmedabad
|
||||
north port
|
||||
helsinki
|
||||
leominster
|
||||
ocala
|
||||
sarajevo
|
||||
hangzhou
|
||||
roanoke
|
||||
new york
|
||||
bethlehem
|
||||
dublin
|
||||
sunshine coast
|
||||
pune
|
||||
billings
|
||||
changchunsydney
|
||||
garden grove
|
||||
port orange
|
||||
pittsburgh
|
||||
new bedford
|
||||
hiroshima
|
||||
san francisco
|
||||
sheffield
|
||||
chongqing
|
||||
layton
|
||||
pueblo
|
||||
chengdu
|
||||
cincinnati
|
||||
erie
|
||||
lansing
|
||||
ljubljana
|
||||
st louis
|
||||
rio de janeiro
|
||||
philadelphia
|
||||
tacoma
|
||||
bel air
|
||||
chesapeake
|
||||
davenport
|
||||
las vegas
|
||||
nagasaki
|
||||
kitchener
|
||||
boulder
|
||||
roseville
|
||||
evansville
|
||||
victoria
|
||||
burbank
|
||||
sofia
|
||||
santa clarita
|
||||
san buenaventura
|
||||
savannah
|
||||
apple valley
|
||||
brighton
|
||||
coral springs
|
||||
huntsville
|
||||
fort lauderdale
|
||||
warsaw
|
||||
antioch
|
||||
medford
|
||||
visalia
|
||||
frankfurt
|
||||
joliet
|
||||
curitiba
|
||||
mcallen
|
||||
seattle
|
||||
alexandria
|
||||
bryan
|
||||
moreno valley
|
||||
berlin
|
||||
olympia
|
||||
caracas
|
||||
tianjin
|
||||
cleveland
|
||||
des moines
|
||||
prague
|
||||
fukuoka
|
||||
burlington
|
||||
bhopal
|
||||
nara
|
||||
hampton
|
||||
jefferson
|
||||
chicago
|
||||
temecula
|
||||
paris
|
||||
gilbert
|
||||
bradenton
|
||||
champaign
|
||||
munich
|
||||
amsterdam
|
||||
raleigh
|
||||
atlanta
|
||||
lakeland
|
||||
denver
|
||||
round lake beach
|
||||
richmond
|
||||
buffalo
|
||||
phoenix
|
||||
antwerp
|
||||
greenbay
|
||||
milwaukee
|
||||
south lyon
|
||||
concord
|
||||
vero beach
|
||||
newcastle
|
||||
podgorica
|
||||
monterrey
|
||||
shantou
|
||||
costa mesa
|
||||
copenhagen
|
||||
vilnius
|
||||
dalian
|
||||
bristol
|
||||
salinas
|
||||
belgrade
|
||||
waterloo
|
||||
henderson
|
||||
hayward
|
||||
hickory
|
||||
el monte
|
||||
lima
|
||||
redding
|
||||
mexico city
|
||||
cary
|
||||
kennewick
|
||||
guayaquil
|
||||
tirana
|
||||
kawasaki
|
||||
greensboro
|
||||
west covina
|
||||
amarillo
|
||||
saitama
|
||||
new london
|
||||
recife
|
||||
manchester
|
||||
rockford
|
||||
kelowna
|
||||
hagerstown
|
||||
bordeaux
|
||||
york
|
||||
kaneohe
|
||||
tucson
|
||||
gainesville
|
||||
kalamazoo
|
||||
bogotá
|
||||
reading
|
||||
virginia beach
|
||||
guadalajara
|
||||
albany
|
||||
durham
|
||||
green bay
|
||||
oceanside
|
||||
montreal
|
||||
turin
|
||||
malaga
|
||||
oshawa
|
||||
mesa
|
||||
pensacola
|
||||
boise
|
||||
bonita springs
|
||||
fort walton beach
|
||||
port saint lucie
|
||||
reykjavik
|
||||
north charleston
|
||||
newark
|
||||
reno
|
||||
knoxville
|
||||
bakersfield
|
||||
oslo
|
||||
omaha
|
||||
milan
|
||||
cambridge
|
||||
norwich
|
||||
shanghai
|
||||
naples
|
||||
victorville
|
||||
zagreb
|
||||
norwalk
|
||||
huntington beach
|
||||
clarke county
|
||||
lubbock
|
||||
yakima
|
||||
warren
|
||||
bucharest
|
||||
simi valley
|
||||
greenville
|
||||
racine
|
||||
salvador
|
||||
elk grove
|
||||
orlando
|
||||
windsor
|
||||
santa cruz
|
||||
saginaw
|
||||
ballarat
|
||||
muskegon
|
||||
shreveport
|
||||
clearwater
|
||||
merced
|
||||
boston
|
||||
basel
|
||||
elizabeth
|
||||
panama city
|
||||
okinawa
|
||||
sarasota
|
||||
zurich
|
||||
glendale
|
||||
wilmington
|
||||
pompano beach
|
||||
guangzhou
|
||||
fairfield
|
||||
hyderabad
|
||||
santiago
|
||||
nashville
|
||||
mchenry
|
||||
ann arbor
|
||||
carrollton
|
||||
hollywood
|
||||
laredo
|
||||
rome
|
||||
san bernardino
|
||||
bergen
|
||||
springfield
|
||||
winnipeg
|
||||
corona
|
||||
surat
|
||||
long beach
|
||||
nagoya
|
||||
toledo
|
||||
geelong
|
||||
kenosha
|
||||
sterling heights
|
||||
lisbon
|
||||
myrtle beach
|
||||
nashua
|
||||
riverside
|
||||
tampa
|
||||
bangalore
|
||||
richland
|
||||
rotterdam
|
||||
lyon
|
||||
scottsdale
|
||||
berkeley
|
||||
bologna
|
||||
cedar rapids
|
||||
syracuse
|
||||
tulsa
|
||||
ludhiana
|
||||
hemet
|
||||
portland
|
||||
mission viejo
|
||||
salem
|
||||
overland park
|
||||
detroit
|
||||
jinan
|
||||
osaka
|
||||
grand rapids
|
||||
jersey city
|
||||
kailua
|
||||
venice
|
||||
darwin
|
||||
miramar
|
||||
gulfport-biloxi
|
||||
huntington
|
||||
portsmouth
|
||||
worcester
|
||||
sunnyvale
|
||||
escondido
|
||||
college park
|
||||
thousand oaks
|
||||
harbin
|
||||
belfast
|
||||
yonkers
|
||||
alicante
|
||||
barnstable
|
||||
kitakyushu
|
||||
sapporo
|
||||
ogden
|
||||
aurora
|
||||
palm bay
|
||||
düsseldorf
|
||||
hobart
|
||||
irvine
|
||||
st johns
|
||||
hamburg
|
||||
provo
|
||||
melbourne
|
||||
madrid
|
||||
zhengzhou
|
||||
asheville
|
||||
patna
|
||||
inglewood
|
||||
houston
|
||||
newport news
|
||||
west valley city
|
||||
oklahoma city
|
||||
brisbane
|
||||
valencia
|
||||
pasadena
|
||||
aberdeen
|
||||
st petersburg
|
||||
lakewood
|
||||
irving
|
||||
naperville
|
||||
miami
|
||||
topeka
|
||||
downey
|
||||
genoa
|
||||
lewisville
|
||||
birmingham
|
||||
xian
|
||||
saint paul
|
||||
bremerton
|
||||
corpus christi
|
||||
daytona beach
|
||||
st paul
|
||||
oxnard
|
||||
murrieta
|
||||
lafayette
|
||||
montgomery
|
||||
baton rouge
|
||||
skopje
|
||||
cathedral city
|
||||
spartanburg
|
||||
canberra
|
||||
arvada
|
||||
hesperia
|
||||
port st lucie
|
||||
saint louis
|
||||
bridgeport
|
||||
tempe
|
||||
quito
|
||||
chattanooga
|
||||
bremen
|
||||
gold coast
|
||||
cairns
|
||||
beaumont
|
||||
elkhart
|
||||
peoria
|
||||
calgary
|
||||
honolulu
|
||||
havre de grace
|
||||
hamilton
|
||||
fullerton
|
||||
daly city
|
||||
dresden
|
||||
belem
|
||||
ottawa
|
||||
regina
|
||||
chiba
|
||||
fort collins
|
||||
indianapolis
|
||||
mumbai
|
||||
killeen
|
||||
sao paulo
|
||||
jaipur
|
||||
fremont
|
||||
zaragoza
|
||||
charleston
|
||||
waco
|
||||
kobe
|
||||
odessa
|
||||
monroe
|
||||
vallejo
|
||||
marseille
|
||||
qingdao
|
||||
frederick
|
||||
marina
|
||||
sebastian
|
||||
oakland
|
||||
pembroke pines
|
||||
san antonio
|
||||
kyoto
|
||||
colorado springs
|
||||
el paso
|
||||
shenyang
|
||||
punta gorda
|
||||
fort smith
|
||||
richmond county
|
||||
waterbury
|
||||
shenzhen
|
||||
albuquerque
|
||||
jacksonville
|
||||
minneapolis
|
||||
fortaleza
|
||||
denton
|
||||
gastonia
|
||||
fayetteville
|
||||
bloomington
|
||||
houma
|
||||
santa ana
|
||||
kolkata
|
||||
las cruces
|
||||
barcelona
|
||||
arlington
|
||||
niigata
|
||||
norfolk
|
||||
fontana
|
||||
providence
|
||||
santo domingo
|
||||
vancouver
|
||||
appleton
|
||||
san jose
|
||||
hartford
|
||||
winston
|
||||
barrie
|
||||
glasgow
|
||||
davidson county
|
||||
yokohama
|
||||
independence
|
||||
athens
|
||||
harrisburg
|
||||
macon
|
||||
torrance
|
||||
launceston
|
||||
cape coral
|
||||
austin
|
||||
little rock
|
||||
cologne
|
||||
mesquite
|
||||
catania
|
||||
stockholm
|
||||
nice
|
||||
stamford
|
||||
buenos aires
|
||||
columbia
|
||||
anchorage
|
||||
dayton
|
||||
wollongong
|
||||
halifax
|
||||
verona
|
||||
anaheim
|
||||
kiev
|
||||
augusta
|
||||
tokyo
|
||||
akron
|
||||
lexington
|
||||
wichita
|
||||
saint petersburg
|
||||
beijing
|
||||
johnson city
|
||||
spokane
|
||||
liverpool
|
||||
howell
|
||||
poughkeepsie
|
||||
ontario
|
||||
atlantic city
|
||||
trenton
|
|
@@ -1,28 +0,0 @@
universi
research institu
laboratory
gmbh
inc
universi of
research center
foundation
faculty
national institu
school medicine
universi school
graduate school
graduate school engineering
institu tropical medicine
institu virology
faculty medicine
laboratory
universi park
institu science
polytechnic universi
universi 1
ciudad universi
universi campus
universi hospitals
colege
universi road
universitetska str
@@ -1,16 +0,0 @@
from
the
of
at
de
for
et
für
des
in
as
a
and
fur
for
und
@@ -1,8 +0,0 @@
universitetskaya
universitatsklinikum
universitatskinderklinik
universitatskliniken
universitetshospital
universitatsmedizin
universitatsbibliothek
universitatspital
@@ -1,30 +1,5 @@
<workflow-app name="AffroAffiliations" xmlns="uri:oozie:workflow:0.5">
    <parameters>

        <!-- <property>-->
        <!-- <name>crossrefInputPath</name>-->
        <!-- <description>the path where to find the inferred affiliation relations from Crossref</description>-->
        <!-- </property>-->
        <!-- <property>-->
        <!-- <name>pubmedInputPath</name>-->
        <!-- <description>the path where to find the inferred affiliation relations from Pubmed</description>-->
        <!-- </property>-->
        <!-- <property>-->
        <!-- <name>openapcInputPath</name>-->
        <!-- <description>the path where to find the inferred affiliation relations from OpenAPC</description>-->
        <!-- </property>-->
        <!-- <property>-->
        <!-- <name>dataciteInputPath</name>-->
        <!-- <description>the path where to find the inferred affiliation relations from Datacite</description>-->
        <!-- </property>-->
        <!-- <property>-->
        <!-- <name>webCrawlInputPath</name>-->
        <!-- <description>the path where to find the inferred affiliation relations from webCrawl</description>-->
        <!-- </property>-->
        <!-- <property>-->
        <!-- <name>outputPath</name>-->
        <!-- <description>the path where to store the actionset</description>-->
        <!-- </property>-->
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
@@ -93,7 +68,7 @@
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Affiliations inference (Affro)</name>
            <jar>affro_spark.py</jar>
            <jar>update_records.py</jar>

            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
@@ -107,13 +82,13 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=python3
                --conf spark.executorEnv.PYSPARK_PYTHON=python3
                --py-files ${wfAppPath}/affro_cluster.py,${wfAppPath}/create_input_cluster.py,${wfAppPath}/functions_cluster.py,${wfAppPath}/matching_cluster.py
                --files ${wfAppPath}/dictionaries/dix_acad.json,${wfAppPath}/dictionaries/dix_categ.json,${wfAppPath}/dictionaries/dix_city.json,${wfAppPath}/dictionaries/dix_country.json,${wfAppPath}/dictionaries/dix_mult.json,${wfAppPath}/txt_files/city_names.txt,${wfAppPath}/txt_files/remove_list.txt,${wfAppPath}/txt_files/stop_words.txt,${wfAppPath}/txt_files/university_terms.txt
                --py-files ${wfAppPath}/affRo/affro_cluster.py,${wfAppPath}/affRo/affro_test_example.py,${wfAppPath}/affRo/create_input_cluster.py,${wfAppPath}/affRo/functions_cluster.py,${wfAppPath}/affRo/matching_cluster.py
                --files ${wfAppPath}/affRo/dictionaries/dix_acad.json,${wfAppPath}/affRo/dictionaries/dix_categ.json,${wfAppPath}/affRo/dictionaries/dix_city.json,${wfAppPath}/affRo/dictionaries/dix_country.json,${wfAppPath}/affRo/dictionaries/dix_mult.json,${wfAppPath}/affRo/txt_files/city_names.txt,${wfAppPath}/affRo/txt_files/remove_list.txt,${wfAppPath}/affRo/txt_files/stop_words.txt,${wfAppPath}/affRo/txt_files/university_terms.txt
            </spark-opts>

            <arg>${resultFolder}</arg>

            <file>${wfAppPath}/affro_spark.py#affro_spark.py</file>
            <file>${wfAppPath}/affRo/update_records.py#update_records.py</file>
        </spark>

        <ok to="End" />