Add affRo algorithm as an external library
parent 37c04cbad7
commit 50401a872f
@@ -6,7 +6,18 @@
        <artifactId>dhp-workflows</artifactId>
        <version>1.2.5-SNAPSHOT</version>
    </parent>

    <artifactId>dhp-aggregation</artifactId>

    <properties>
        <affro.release.version>1.0.0</affro.release.version>
    </properties>

    <scm>
        <url>https://code-repo.d4science.org/mkallipo/affRo</url>
        <connection>scm:git:https://code-repo.d4science.org/mkallipo/affRo.git</connection>
    </scm>

    <build>
        <plugins>
            <plugin>
@@ -43,6 +54,28 @@
                    <scalaVersion>${scala.version}</scalaVersion>
                </configuration>
            </plugin>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-scm-plugin</artifactId>
                <version>1.8.1</version>
                <configuration>
                    <connectionType>connection</connectionType>
                    <scmVersionType>tag</scmVersionType><!-- 'branch' can also be provided here -->
                    <scmVersion>${affro.release.version}</scmVersion><!-- in case of scmVersionType == 'branch', this field points to the branch name -->
                    <checkoutDirectory>${project.build.directory}/${oozie.package.file.name}/${oozieAppDir}/affRo</checkoutDirectory>
                </configuration>
                <executions>
                    <execution>
                        <id>checkout-affro</id>
                        <phase>prepare-package</phase>
                        <goals>
                            <goal>checkout</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

        </plugins>

    </build>

@@ -1,31 +0,0 @@
import sys
##import functions
from functions_cluster import *
from matching_cluster import *
from create_input_cluster import *
import json

dix_org = load_json('dix_acad.json')
dix_mult = load_json('dix_mult.json')
dix_city = load_json('dix_city.json')
dix_country = load_json('dix_country.json')

print('READY')

def affro(raw_aff_string):
    result = Aff_Ids(create_df_algorithm(raw_aff_string), dix_org, dix_mult, dix_city, dix_country, 0.5, 0.5 )
    return {'raw_affiliation_string':raw_aff_string, 'Matchings': [{'RORid':x[2], 'Confidence':x[1]} for x in result]}

#raw_aff = 'university of california, los angeles, university of athens, university of california, san diego, university of athens, greece'


# if __name__ == "__main__":
#     if len(sys.argv) != 2:
#         print("Usage: python affro_spark.py <string> <float1> <float2>")
#         sys.exit(1)
#
#     string_arg = sys.argv[1]
#     # float_arg1 = float(sys.argv[2])
#     # float_arg2 = float(sys.argv[3])
#
#     print(affro(string_arg))
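For orientation, a minimal usage sketch of the affro() entry point defined above (the affiliation string is purely illustrative; the JSON dictionaries are assumed to sit in the working directory, as in the file itself):

# illustrative only: call affro() on one raw affiliation string
matchings = affro('Department of Physics, University of Athens, Greece')
# the result carries the original string plus candidate ROR ids with confidence scores
for m in matchings['Matchings']:
    print(m['RORid'], m['Confidence'])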
@@ -1,35 +0,0 @@
#!/usr/bin/python3

from pyspark.sql import SQLContext
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql import SparkSession
import sys

from affro_cluster import *

# Initialize SparkSession
spark = SparkSession.builder.appName("CustomFunctionExample").getOrCreate()

output_folder = sys.argv[1]
print("Writing to folder: ", output_folder)

# Register the function as a UDF
affro_udf = udf(affro, StringType())

# Input list of strings
input_data = ["university of athens", "university of vienna", "UCLA"]

# # Convert the list to a Spark DataFrame
df = spark.createDataFrame(input_data, "string").toDF("raw_affiliation_string")

# # Apply your custom UDF to the DataFrame
df_with_custom_value = df.withColumn("affro_value", affro_udf(df["raw_affiliation_string"]))


# df_with_custom_value.show(truncate=False)
df_with_custom_value.write.mode("overwrite").option("delimiter", "\t").csv(output_folder)


# Stop the SparkSession
spark.stop()
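A hedged sketch of inspecting what the job above writes (the path is hypothetical and stands for the output_folder argument; the same SparkSession is assumed to still be open):

# illustrative only: read back the tab-separated output and show it
df_back = spark.read.option("delimiter", "\t").csv("/tmp/affro_output")  # hypothetical path
df_back.show(truncate=False)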
@@ -1,66 +0,0 @@
from functions_cluster import *

def create_df_algorithm(raw_aff_string):


    aff_no_symbols_d = substrings_dict(clean_string(remove_outer_parentheses(remove_leading_numbers(raw_aff_string))))

    dict_ = list(aff_no_symbols_d.values())

    i = 0


    while i < len(dict_) - 1:
        if is_contained('progr', dict_[i]) and is_contained('dep', dict_[i+1]):
            dict_.pop(i)


        elif (is_contained('assistant', dict_[i]) or is_contained('researcher', dict_[i]) or is_contained('phd', dict_[i]) or is_contained('student', dict_[i]) or is_contained('section', dict_[i]) or is_contained('prof', dict_[i]) or is_contained('director', dict_[i])) and (not is_contained('school', dict_[i+1]) or is_contained('univ', dict_[i+1]) or is_contained('inst', dict_[i+1]) or is_contained('lab', dict_[i+1]) or is_contained('fac', dict_[i+1])):
            dict_.pop(i)

        elif (is_contained('engineer', dict_[i]) or is_contained('progr', dict_[i]) or is_contained('unit', dict_[i]) or is_contained('lab', dict_[i]) or is_contained('dep', dict_[i]) or is_contained('school', dict_[i]) or is_contained('inst', dict_[i]) #or is_contained('hosp', dict_[i])
              or is_contained('fac', dict_[i])) and is_contained('univ', dict_[i+1]):
            if not is_contained('univ', dict_[i]):
                dict_.pop(i)

        elif is_contained('lab', dict_[i]) and (is_contained('colege', dict_[i+1]) or is_contained('inst', dict_[i+1]) or is_contained('dep', dict_[i+1]) or is_contained('school', dict_[i+1])):
            if not is_contained('univ', dict_[i]):
                dict_.pop(i)

        elif is_contained('dep', dict_[i]) and (is_contained('tech', dict_[i+1]) or is_contained('colege', dict_[i+1]) or is_contained('inst', dict_[i+1]) or is_contained('hosp', dict_[i+1]) or is_contained('school', dict_[i+1]) or is_contained('fac', dict_[i+1])):
            if not is_contained('univ', dict_[i]):
                dict_.pop(i)

        elif is_contained('inst',dict_[i]) and (is_contained('school', dict_[i+1]) or is_contained('dep', dict_[i+1]) or is_contained('acad', dict_[i+1]) or is_contained('hosp', dict_[i+1]) or is_contained('clin', dict_[i+1]) or is_contained('klin', dict_[i+1]) or is_contained('fak', dict_[i+1]) or is_contained('fac', dict_[i+1]) or is_contained('cent', dict_[i+1]) or is_contained('div', dict_[i+1])):
            if not is_contained('univ', dict_[i]):
                dict_.pop(i)

        elif is_contained('school',dict_[i]) and is_contained('colege', dict_[i+1]):
            if not is_contained('univ', dict_[i]):
                dict_.pop(i)
        else:
            i += 1

    light_aff = (', '.join((dict_)))

    for x in dict_:
        if x in city_names+remove_list:
            dict_.remove(x)


    dict_ = [shorten_keywords_spark([x])[0] for x in dict_]

    keywords= []
    def valueToCategory(value):
        flag = 0

        for k in categ_dicts:
            if k in value:
                flag = 1
        return flag

    aff_list = [{"index": i, "keywords": dict_[i], "category": valueToCategory(dict_[i])} for i in range(len(dict_))]

    filtered_list = [entry for entry in aff_list if entry.get("category") == 1]

    return [light_aff, filtered_list]
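To make the return shape above concrete, a hedged sketch of how the two-element result is consumed (mirroring the call in affro_cluster.py; the affiliation string is illustrative):

# illustrative only: create_df_algorithm() returns [light_aff, filtered_list]
light_aff, keyword_entries = create_df_algorithm('school of medicine, university of vienna, austria')
for entry in keyword_entries:
    # each entry is a dict of the form {"index": ..., "keywords": ..., "category": 1}
    print(entry['index'], entry['keywords'])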
File diff suppressed because one or more lines are too long
@@ -1 +0,0 @@
{"research": "Univ/Inst", "uniwersytet": "Univ/Inst", "investigacions": "Univ/Inst", "institu": "Univ/Inst", "istitut": "Univ/Inst", "univ": "Univ/Inst", "col": "Univ/Inst", "center": "Univ/Inst", "polytechnic": "Univ/Inst", "politecnico": "Univ/Inst", "centre": "Univ/Inst", "cnrs": "Univ/Inst", "faculty": "Univ/Inst", "school": "Univ/Inst", "academ": "Univ/Inst", "akadem": "Univ/Inst", "hochschule": "Univ/Inst", "ecole": "Univ/Inst", "tech": "Univ/Inst", "observ": "Univ/Inst", "escuela": "Univ/Inst", "escola": "Univ/Inst", "discovery programe": "Univ/Inst", "ku leuven": "Univ/Inst", "ucla": "Univ/Inst", "eth zurich": "Univ/Inst", "athena": "Univ/Inst", "openaire": "Univ/Inst", "erasmus": "Univ/Inst", "ist austria": "Univ/Inst", "lmu munich": "Univ/Inst", "cancer trials ireland": "Univ/Inst", "food safety authority": "Univ/Inst", "ucd": "Univ/Inst", "tcd": "Univ/Inst", "apc microbiome": "Univ/Inst", "nasa": "Univ/Inst", "ucl": "Univ/Inst", "zentrum": "Univ/Inst", "ncsr demokritos": "Univ/Inst", "panepistemio": "Univ/Inst", "forth": "Univ/Inst", "nui galway": "Univ/Inst", "nui maynooth": "Univ/Inst", "tu wien": "Univ/Inst", "tu dublin": "Univ/Inst", "lab": "Laboratory", "science": "Laboratory", "cientific": "Laboratory", "hospital": "Hospital", "clinic": "Hospital", "hopital": "Hospital", "klinik": "Hospital", "oncol": "Hospital", "medical": "Hospital", "health": "Hospital", "medicin": "Hospital", "gmbh": "Company", "company": "Company", "industr": "Company", "etaireia": "Company", "corporation": "Company", "inc": "Company", "museum": "Museum", "library": "Museum", "foundation": "Foundation", "asociation": "Foundation", "organization": "Foundation", "society": "Foundation", "group": "Foundation", "royal": "Foundation", "ofice": "Foundation", "trust": "Foundation", "district": "Government", "federation": "Government", "government": "Government", "municipal": "Government", "county": "Government", "council": "Government", "agency": "Government", "unknown": "Unknown", "google": "Specific", "yahoo": "Specific", "ebay": "Specific", "microsoft": "Specific", "teagasc": "Specific", "ibm research": "Specific", "alergan": "Specific", "analog devices": "Specific", "medtronic": "Specific", "xilinx": "Specific", "pfizer": "Specific", "glaxosmithkline": "Specific", "astrazeneca": "Specific"}
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,615 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
import html
|
||||
from unidecode import unidecode
|
||||
import json
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
#import pandas as pd
|
||||
|
||||
def load_txt(file_path):
|
||||
with open(file_path, 'r', encoding='utf-8') as file:
|
||||
list_ = [line.strip() for line in file]
|
||||
return list_
|
||||
|
||||
def load_pickled_dict(file_path):
|
||||
with open(file_path, 'rb') as file:
|
||||
pickled_dict = pickle.load(file)
|
||||
return pickled_dict
|
||||
|
||||
|
||||
def load_json(file_path):
|
||||
with open(file_path, 'r') as json_file:
|
||||
json_dict = json.load(json_file)
|
||||
return json_dict
|
||||
|
||||
categ_string = 'Laboratory|Univ/Inst|Hospital|Foundation|Specific|Museum'
|
||||
|
||||
def replace_double_consonants(text):
|
||||
# This regex pattern matches any double consonant
|
||||
pattern = r'([bcdfghjklmnpqrstvwxyz])\1'
|
||||
# The replacement is the first captured group (the single consonant)
|
||||
result = re.sub(pattern, r'\1', text, flags=re.IGNORECASE)
|
||||
return result
|
||||
|
||||
remove_list = [replace_double_consonants(x) for x in load_txt('remove_list.txt')]
|
||||
stop_words = load_txt('stop_words.txt')
|
||||
university_terms = [replace_double_consonants(x) for x in load_txt('university_terms.txt')]
|
||||
city_names = [replace_double_consonants(x) for x in load_txt('city_names.txt')]
|
||||
|
||||
categ_dicts = load_json('dix_categ.json')
|
||||
|
||||
|
||||
def is_contained(s, w):
|
||||
words = s.split() # Split the string 's' into a list of words
|
||||
for word in words:
|
||||
if word not in w: # If a word from 's' is not found in 'w'
|
||||
return False # Return False immediately
|
||||
return True # If all words from 's' are found in 'w', return True
|
||||
|
||||
def starts_with_any(string, prefixes):
|
||||
for prefix in prefixes:
|
||||
if string.startswith(prefix):
|
||||
return [True, prefix]
|
||||
return False
|
||||
|
||||
def remove_leading_numbers(s):
|
||||
return re.sub(r'^\d+', '', s)
|
||||
|
||||
def remove_outer_parentheses(string):
|
||||
"""Remove outer parentheses from the string if they enclose the entire string."""
|
||||
if string.startswith('(') and string.endswith(')'):
|
||||
return string[1:-1].strip()
|
||||
return string
|
||||
|
||||
def index_multiple_matchings(pairs):
|
||||
result_dict = {}
|
||||
|
||||
r_list = [pair[3] for pair in pairs]
|
||||
modified_list = [item for sublist in r_list for item in sublist]
|
||||
r = len(list(set(modified_list)))
|
||||
|
||||
for t in [pair[0] for pair in pairs]:
|
||||
key = t
|
||||
if key in result_dict and r>1:
|
||||
result_dict[key] += 1
|
||||
|
||||
else:
|
||||
result_dict[key] = 1
|
||||
|
||||
return result_dict
|
||||
|
||||
def avg_string(df, col):
|
||||
avg = []
|
||||
for i in range(len(df)):
|
||||
avg.append(sum(len(s) for s in df[col].iloc[i])/len(df[col].iloc[i]))
|
||||
return sum(avg)/len(avg)
|
||||
|
||||
def remove_stop_words(text):
|
||||
words = text.split()
|
||||
filtered_words = [word for word in words if word not in stop_words]
|
||||
return ' '.join(filtered_words)
|
||||
|
||||
|
||||
def remove_parentheses(text):
|
||||
return re.sub(r'\([^()]*\)', '', text)
|
||||
|
||||
|
||||
def replace_umlauts(text):
|
||||
normalized_text = unicodedata.normalize('NFKD', text)
|
||||
replaced_text = ''.join(c for c in normalized_text if not unicodedata.combining(c))
|
||||
return replaced_text
|
||||
|
||||
def protect_phrases(input_string, phrases):
|
||||
# Replace phrases with placeholders
|
||||
placeholder_map = {}
|
||||
for i, phrase in enumerate(phrases):
|
||||
placeholder = "__PLACEHOLDER_" + str(i) + "__"
|
||||
placeholder_map[placeholder] = phrase
|
||||
input_string = input_string.replace(phrase, placeholder)
|
||||
return input_string, placeholder_map
|
||||
|
||||
def restore_phrases(split_strings, placeholder_map):
|
||||
# Restore placeholders with original phrases
|
||||
restored_strings = []
|
||||
for s in split_strings:
|
||||
for placeholder, phrase in placeholder_map.items():
|
||||
s = s.replace(placeholder, phrase)
|
||||
restored_strings.append(s)
|
||||
return restored_strings
|
||||
|
||||
def replace_comma_spaces(text):
|
||||
return text.replace(' ', ' ').replace(' , ', ', ')
|
||||
|
||||
def split_string_with_protection(input_string, protected_phrases):
|
||||
# Step 1: Protect specific phrases
|
||||
input_string, placeholder_map = protect_phrases(input_string, protected_phrases)
|
||||
|
||||
# Step 2: Split the string on specified delimiters
|
||||
split_strings = [s.strip() for s in re.split(r'[,;/]| – ', input_string) if s.strip()]
|
||||
|
||||
# Step 3: Restore protected phrases
|
||||
split_strings = restore_phrases(split_strings, placeholder_map)
|
||||
|
||||
return split_strings
|
||||
|
||||
protected_phrases1 = [
|
||||
phrase.format(x=x)
|
||||
for x in city_names
|
||||
for phrase in [
|
||||
'university california, {x}',
|
||||
# 'university california , {x}',
|
||||
|
||||
'university colege hospital, {x}',
|
||||
# 'university colege hospital , {x}',
|
||||
|
||||
'national univ ireland, {x}',
|
||||
# 'national univ ireland , {x}',
|
||||
|
||||
'national university ireland, {x}',
|
||||
# 'national university ireland , {x}',
|
||||
|
||||
'university colege, {x}',
|
||||
# 'university colege , {x}',
|
||||
|
||||
'university hospital, {x}',
|
||||
# 'university hospital , {x}',
|
||||
|
||||
'imperial colege, {x}',
|
||||
# 'imperial colege , {x}'
|
||||
|
||||
'city university, {x}',
|
||||
# 'city university , {x}'
|
||||
|
||||
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
|
||||
replacements = {'uni versity':'university',
|
||||
'univ ':'university ',
|
||||
'univercity':'university',
|
||||
'universtiy':'university',
|
||||
'univeristy':'university',
|
||||
'universirty':'university',
|
||||
'universiti':'university',
|
||||
'universitiy':'university',
|
||||
'universty' :'university',
|
||||
'univ col': 'university colege',
|
||||
'belfield, dublin': 'dublin',
|
||||
'balsbridge, dublin': 'dublin', #ballsbridge
|
||||
'earlsfort terrace, dublin': 'dublin',
|
||||
'bon secours hospital, cork' : 'bon secours hospital cork',
|
||||
'bon secours hospital, dublin' : 'bon secours hospital dublin',
|
||||
'bon secours hospital, galway' : 'bon secours hospital galway',
|
||||
'bon secours hospital, tralee' : 'bon secours hospital tralee',
|
||||
'bon secours health system' : 'bon secours hospital dublin',
|
||||
'bon secours hospital, glasnevin' : 'bon secours hospital dublin',
|
||||
'imperial colege science, technology medicine' : 'imperial colege science technology medicine',
|
||||
'ucl queen square institute neurology' : 'ucl, london',
|
||||
'ucl institute neurology' : 'ucl, london',
|
||||
'royal holoway, university london' : 'royal holoway universi london', #holloway
|
||||
'city, university london' : 'city universi london',
|
||||
'city university, london' : 'city universi london',
|
||||
'aeginition':'eginition',
|
||||
'national technical university, athens' : 'national technical university athens'
|
||||
# 'harvard medical school' : 'harvard university'
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
def substrings_dict(string):
|
||||
# Split the input string and clean each substring
|
||||
# split_strings = split_string_with_protection(string.replace('univ coll', 'university college').replace('belfield, dublin', 'dublin').replace('ballsbridge, dublin', 'dublin').replace('earlsfort Terrace, dublin', 'dublin'), protected_phrases1)
|
||||
|
||||
for old, new in replacements.items():
|
||||
string = string.replace(old, new)
|
||||
split_strings = split_string_with_protection(string, protected_phrases1)
|
||||
|
||||
# Define a set of university-related terms for later use
|
||||
|
||||
|
||||
dict_string = {}
|
||||
index = 0
|
||||
for value in split_strings:
|
||||
|
||||
# Check if the substring contains any university-related terms
|
||||
if not any(term in value.lower() for term in university_terms):
|
||||
# Apply regex substitutions for common patterns
|
||||
|
||||
modified_value = re.sub(r'universi\w*', 'universi', value, flags=re.IGNORECASE)
|
||||
modified_value = re.sub(r'institu\w*', 'institu', modified_value, flags=re.IGNORECASE)
|
||||
modified_value = re.sub(r'centre*', 'center', modified_value, flags=re.IGNORECASE)
|
||||
modified_value = re.sub(r'\bsaint\b', 'st', modified_value, flags=re.IGNORECASE)
|
||||
modified_value = re.sub(r'\btrinity col\b', 'trinity colege', modified_value, flags=re.IGNORECASE)
|
||||
modified_value = re.sub(r'\btechnische\b', 'technological', modified_value, flags=re.IGNORECASE)
|
||||
|
||||
|
||||
|
||||
# Add the modified substring to the dictionary
|
||||
|
||||
dict_string[index] = modified_value.lower().strip()
|
||||
index += 1
|
||||
# elif 'universitetskaya' in value.lower():
|
||||
# index += 1
|
||||
|
||||
|
||||
# Add the original substring to the dictionary
|
||||
else:
|
||||
dict_string[index] = value.lower().strip()
|
||||
index += 1
|
||||
|
||||
return dict_string
|
||||
|
||||
|
||||
|
||||
def clean_string(input_string):
|
||||
# Temporarily replace " - " with a unique placeholder
|
||||
placeholder = "placeholder"
|
||||
# input_string = input_string.replace(" - ", placeholder)
|
||||
input_string = input_string.replace(" – ", placeholder)
|
||||
|
||||
# Unescape HTML entities and convert to lowercase
|
||||
input_string = replace_comma_spaces(remove_stop_words(replace_double_consonants(replace_umlauts(unidecode(remove_parentheses(html.unescape(input_string.lower())))))).strip())
|
||||
|
||||
# Normalize unicode characters (optional, e.g., replace umlauts)
|
||||
input_string = unidecode(input_string)
|
||||
|
||||
# Replace `/` and `–` with space (do not replace hyphen `-`)
|
||||
result = re.sub(r'[/\-]', ' ', input_string)
|
||||
|
||||
# Replace "saint" with "st"
|
||||
result = re.sub(r'\bsaint\b', 'st', result)
|
||||
result = re.sub(r'\baghia\b', 'agia', result)
|
||||
|
||||
|
||||
# Remove characters that are not from the Latin alphabet, or allowed punctuation
|
||||
result = replace_comma_spaces(re.sub(r'[^a-zA-Z\s,;/]', '', result).strip())
|
||||
|
||||
# Restore the " - " sequence from the placeholder
|
||||
result = result.replace(placeholder, " – ")
|
||||
|
||||
# Replace consecutive whitespace with a single space
|
||||
result = re.sub(r'\s+', ' ', result)
|
||||
#result = result.replace('ss', 's')
|
||||
|
||||
return result.strip() # Strip leading/trailing spaces
|
||||
|
||||
|
||||
def clean_string_facts(input_string):
|
||||
# Replace specified characters with space
|
||||
input_string = remove_stop_words(replace_umlauts(unidecode(remove_parentheses(html.unescape(input_string.lower())))))
|
||||
result = re.sub(r'[/\-,]', ' ', input_string)
|
||||
result = re.sub(r'\bsaint\b', 'st', result)
|
||||
|
||||
# Remove characters that are not from the Latin alphabet or numbers
|
||||
result = re.sub(r'[^a-zA-Z0-9\s;/-]', '', result)
|
||||
|
||||
# Replace consecutive whitespace with a single space
|
||||
result = re.sub(r'\s+', ' ', result)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def str_radius_u(string):
|
||||
string = string.lower()
|
||||
radius = 3
|
||||
|
||||
str_list = string.split()
|
||||
indices = []
|
||||
result = []
|
||||
|
||||
for i, x in enumerate(str_list):
|
||||
if is_contained('univers',x):
|
||||
indices.append(i)
|
||||
# elif is_contained('coll',x):
|
||||
# indices.append(i)
|
||||
|
||||
for r0 in indices:
|
||||
lmin =max(0,r0-radius)
|
||||
lmax =min(r0+radius, len(str_list))
|
||||
s = str_list[lmin:lmax+1]
|
||||
|
||||
result.append(' '.join(s))
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def str_radius_coll(string):
|
||||
string = string.lower()
|
||||
radius = 1
|
||||
|
||||
str_list = string.split()
|
||||
indices = []
|
||||
result = []
|
||||
|
||||
for i, x in enumerate(str_list):
|
||||
if is_contained('col',x):
|
||||
indices.append(i)
|
||||
|
||||
for r0 in indices:
|
||||
lmin =max(0,r0-radius)
|
||||
lmax =min(r0+radius, len(str_list))
|
||||
s = str_list[lmin:lmax]
|
||||
|
||||
result.append(' '.join(s))
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def str_radius_h(string):
|
||||
string = string.lower()
|
||||
radius = 3
|
||||
|
||||
str_list = string.split()
|
||||
indices = []
|
||||
result = []
|
||||
|
||||
for i, x in enumerate(str_list):
|
||||
if is_contained('hospital',x):
|
||||
indices.append(i)
|
||||
|
||||
for r0 in indices:
|
||||
lmin =max(0,r0-radius-1)
|
||||
lmax =min(r0+radius, len(str_list))
|
||||
s = str_list[lmin:lmax]
|
||||
|
||||
result.append(' '.join(s))
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def str_radius_c(string):
|
||||
string = string.lower()
|
||||
radius = 2
|
||||
|
||||
str_list = string.split()
|
||||
indices = []
|
||||
result = []
|
||||
|
||||
for i, x in enumerate(str_list):
|
||||
if is_contained('clinic',x) or is_contained('klinik',x):
|
||||
indices.append(i)
|
||||
|
||||
for r0 in indices:
|
||||
lmin =max(0,r0-radius-1)
|
||||
lmax =min(r0+radius, len(str_list))
|
||||
s = str_list[lmin:lmax]
|
||||
|
||||
result.append(' '.join(s))
|
||||
|
||||
return result
|
||||
|
||||
def str_radius_r(string):
|
||||
string = string.lower()
|
||||
radius = 2
|
||||
|
||||
str_list = string.split()
|
||||
indices = []
|
||||
result = []
|
||||
|
||||
for i, x in enumerate(str_list):
|
||||
if is_contained('research',x):
|
||||
indices.append(i)
|
||||
|
||||
for r0 in indices:
|
||||
lmin =max(0,r0-radius-1)
|
||||
lmax =min(r0+radius, len(str_list))
|
||||
s = str_list[lmin:lmax]
|
||||
|
||||
result.append(' '.join(s))
|
||||
|
||||
return result
|
||||
|
||||
def str_radius_spec(string):
|
||||
spec = False
|
||||
for x in string.split():
|
||||
try:
|
||||
if categ_dicts[x] == 'Specific':
|
||||
spec = True
|
||||
return x
|
||||
except:
|
||||
pass
|
||||
if spec == False:
|
||||
return string
|
||||
|
||||
|
||||
def avg_string(df, col):
|
||||
avg = []
|
||||
for i in range(len(df)):
|
||||
avg.append(sum(len(s) for s in df[col].iloc[i])/len(df[col].iloc[i]))
|
||||
return sum(avg)/len(avg)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def shorten_keywords(affiliations_simple):
|
||||
affiliations_simple_n = []
|
||||
|
||||
for aff in affiliations_simple:
|
||||
inner = []
|
||||
for str in aff:
|
||||
if 'universi' in str:
|
||||
inner.extend(str_radius_u(str))
|
||||
elif 'col' in str and 'trinity' in str:
|
||||
inner.extend(str_radius_coll(str))
|
||||
elif 'hospital' in str or 'hopita' in str:
|
||||
inner.extend(str_radius_h(str))
|
||||
elif 'clinic' in str or 'klinik' in str:
|
||||
inner.extend(str_radius_c(str))
|
||||
elif 'research council' in str:
|
||||
inner.extend(str_radius_r(str))
|
||||
else:
|
||||
inner.append(str_radius_spec(str))
|
||||
|
||||
affiliations_simple_n.append(inner)
|
||||
|
||||
return affiliations_simple_n
|
||||
|
||||
def shorten_keywords_spark(affiliations_simple):
|
||||
affiliations_simple_n = []
|
||||
|
||||
for aff in affiliations_simple:
|
||||
|
||||
if 'universi' in aff:
|
||||
affiliations_simple_n.extend(str_radius_u(aff))
|
||||
elif 'col' in aff and 'trinity' in aff:
|
||||
affiliations_simple_n.extend(str_radius_coll(aff))
|
||||
elif 'hospital' in aff or 'hopita' in aff:
|
||||
affiliations_simple_n.extend(str_radius_h(aff))
|
||||
elif 'clinic' in aff or 'klinik' in aff:
|
||||
affiliations_simple_n.extend(str_radius_c(aff))
|
||||
elif 'research council' in aff:
|
||||
affiliations_simple_n.extend(str_radius_r(aff))
|
||||
else:
|
||||
affiliations_simple_n.append(str_radius_spec(aff))
|
||||
|
||||
|
||||
return affiliations_simple_n
|
||||
|
||||
|
||||
def refine(list_, affil):
|
||||
affil = affil.lower()
|
||||
|
||||
ids = []
|
||||
|
||||
for matched_org_list in list_:
|
||||
|
||||
id_list = []
|
||||
|
||||
for matched_org in matched_org_list:
|
||||
|
||||
if dix_mult[matched_org] == 'unique':
|
||||
id_list.append(dix_acad[matched_org])
|
||||
else:
|
||||
city_found = False
|
||||
for city in dix_city[matched_org]:
|
||||
if city[0] in affil:
|
||||
id_list.append(city[1])
|
||||
city_found = True
|
||||
break
|
||||
|
||||
if not city_found:
|
||||
country_found = False
|
||||
|
||||
for country in dix_country[matched_org]:
|
||||
if country[0] in list(country_mapping.keys()):
|
||||
print(country[0])
|
||||
if country[0] in affil or country_mapping[country[0]][0] in affil or country_mapping[country[0]][0] in affil:
|
||||
id_list.append(country[1])
|
||||
country_found = True
|
||||
break
|
||||
|
||||
|
||||
|
||||
elif country[0] in affil:
|
||||
print('country found',country[0])
|
||||
|
||||
id_list.append(country[1])
|
||||
country_found = True
|
||||
break
|
||||
|
||||
|
||||
|
||||
if not country_found:
|
||||
id_list.append(dix_acad[matched_org])
|
||||
|
||||
|
||||
|
||||
ids.append(id_list)
|
||||
return ids
|
||||
|
||||
def compute_cos(x,s):
|
||||
vectorizer = CountVectorizer()
|
||||
|
||||
s_vector = vectorizer.fit_transform([s]).toarray() #Else we compute the similarity of s with the original affiiation name
|
||||
x_vector = vectorizer.transform([x]).toarray()
|
||||
|
||||
# Compute similarity between the vectors
|
||||
return cosine_similarity(x_vector, s_vector)[0][0]
|
||||
|
||||
|
||||
# def find_ror(string, simU, simG):
|
||||
# df = pd.DataFrame()
|
||||
|
||||
# df['Unique affiliations'] = [[string.lower()]]
|
||||
# academia = create_df_algorithm(df)
|
||||
|
||||
|
||||
# result = Aff_Ids(len(academia), academia,dix_acad, dix_mult, dix_city, dix_country, simU,simG)
|
||||
# if len(result)>0:
|
||||
|
||||
# dict_aff_open = {x: y for x, y in zip(result['Original affiliations'], result['Matched organizations'])}
|
||||
# dict_aff_id = {x: y for x, y in zip(result['Original affiliations'], result['unique ROR'])}
|
||||
|
||||
# dict_aff_score = {}
|
||||
# for i in range(len(result)):
|
||||
# if type(result['Similarity score'].iloc[i]) == list:
|
||||
# dict_aff_score[result['Original affiliations'].iloc[i]] = result['Similarity score'].iloc[i]
|
||||
# else:
|
||||
# dict_aff_score[result['Original affiliations'].iloc[i]] = [result['Similarity score'].iloc[i]]
|
||||
|
||||
|
||||
# pids = []
|
||||
# for i in range(len(df)):
|
||||
# pidsi = []
|
||||
# for aff in df['Unique affiliations'].iloc[i]:
|
||||
# if aff in list(dict_aff_id.keys()):
|
||||
# pidsi = pidsi + dict_aff_id[aff]
|
||||
# # elif 'unmatched organization(s)' not in pidsi:
|
||||
# # pidsi = pidsi + ['unmatched organization(s)']
|
||||
# pids.append(pidsi)
|
||||
|
||||
|
||||
# names = []
|
||||
# for i in range(len(df)):
|
||||
# namesi = []
|
||||
# for aff in df['Unique affiliations'].iloc[i]:
|
||||
# if aff in list(dict_aff_open.keys()):
|
||||
# try:
|
||||
# namesi = namesi + dict_aff_open[aff]
|
||||
# except TypeError:
|
||||
# namesi = namesi + [dict_aff_open[aff]]
|
||||
|
||||
# names.append(namesi)
|
||||
|
||||
# scores = []
|
||||
# for i in range(len(df)):
|
||||
# scoresi = []
|
||||
# for aff in df['Unique affiliations'].iloc[i]:
|
||||
# if aff in list(dict_aff_score.keys()):
|
||||
# scoresi = scoresi + dict_aff_score[aff]
|
||||
|
||||
# scores.append(scoresi)
|
||||
|
||||
|
||||
# df['Matched organizations'] = names
|
||||
# df['ROR'] = pids
|
||||
# df['Scores'] = scores
|
||||
|
||||
|
||||
|
||||
# def update_Z(row):
|
||||
# if len(row['ROR']) == 0 or len(row['Scores']) == 0:
|
||||
# return []
|
||||
|
||||
# new_Z = []
|
||||
# for ror, score in zip(row['ROR'], row['Scores']):
|
||||
# entry = {'ROR_ID': ror, 'Confidence': score}
|
||||
# new_Z.append(entry)
|
||||
# return new_Z
|
||||
|
||||
# matching = df.apply(update_Z, axis=1)
|
||||
|
||||
# df['Matchings'] = matching
|
||||
|
||||
|
||||
# return df['Matchings'].iloc[0]
|
||||
# else:
|
||||
# return 'no result'
|
|
@@ -1,326 +0,0 @@
|
|||
from collections import defaultdict
|
||||
from collections import Counter
|
||||
|
||||
import Levenshtein
|
||||
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
from functions_cluster import *
|
||||
from create_input_cluster import *
|
||||
|
||||
def best_sim_score(light_raw, l2, l3, l4, simU, simG):
|
||||
"""
|
||||
Finds the best match between a 'key word' and several legal names from the OpenAIRE database.
|
||||
---> corrects special cases in the main map that follows
|
||||
|
||||
Args:
|
||||
l1: List of level2 affiliations.
|
||||
l2: number of candidates.
|
||||
l3: List of pairs.
|
||||
l4: mult
|
||||
|
||||
Returns:
|
||||
List: Resulting list containing OpenAIRE names and their similarity scores.
|
||||
"""
|
||||
|
||||
vectorizer = CountVectorizer()
|
||||
numUniv = light_raw.lower().count('univ')
|
||||
result = []
|
||||
best = []
|
||||
s = light_raw
|
||||
for j in range(len(l3)):
|
||||
x = l3[j][1]
|
||||
|
||||
if [x, l3[j][2]] in result:
|
||||
continue
|
||||
|
||||
if l4[l3[j][0]] == 1:
|
||||
|
||||
if is_contained('univ', x.lower()) and l3[j][2]> simU:
|
||||
result.append([x, l3[j][2]])
|
||||
elif l3[j][2] >simG:
|
||||
result.append([x, l3[j][2]])
|
||||
|
||||
|
||||
|
||||
elif l3[j][2] >=0.98:# and (is_contained("univ", x.lower()) or is_contained("college", x.lower()) or is_contained("center", x.lower()) or is_contained("schule", x.lower())): # If the similarity score of a pair (s,x) was 1, we store it to results list
|
||||
result.append([l3[j][1], 1])
|
||||
|
||||
else:
|
||||
try:
|
||||
if not is_contained("univ", x.lower()):
|
||||
continue # Skip if x does not contain "university" or "univ"
|
||||
|
||||
# if (is_contained('hosp', x.lower()) and not is_contained('hosp', s)) or (not is_contained('hosp', x.lower()) and is_contained('hosp', s)) or (is_contained('hopital', x.lower()) and not is_contained('hopital', s)) or (not is_contained('hopital', x.lower()) and is_contained('hopital', s)):
|
||||
# continue
|
||||
s_vector = vectorizer.fit_transform([s]).toarray() #Else we compute the similarity of s with the original affiiation name
|
||||
x_vector = vectorizer.transform([x]).toarray()
|
||||
|
||||
# Compute similarity between the vectors
|
||||
similarity = cosine_similarity(x_vector, s_vector)[0][0]
|
||||
if similarity> 0.1:
|
||||
similarity_l = 1 - Levenshtein.distance(x, l3[j][0]) / max(len(x), len(l3[j][0]))
|
||||
|
||||
|
||||
best.append([x, similarity,similarity_l])#(similarity+similarity2)/2])
|
||||
except:
|
||||
KeyError
|
||||
|
||||
if best:
|
||||
# max_numbers = defaultdict(float)
|
||||
|
||||
|
||||
# Assuming best is a list of three-element lists
|
||||
# Each element is (string, number1, number2)
|
||||
max_numbers = defaultdict(float)
|
||||
for item in best:
|
||||
string, number1, number2 = item # Unpack the three elements
|
||||
max_numbers[string] = max(max_numbers[string], number1)
|
||||
|
||||
reduced_best = [[string, number1, number2] for string, number1, number2 in best if number1 == max_numbers[string]]
|
||||
|
||||
# Sort by number1 decreasingly and then by number2 in descending order
|
||||
reduced_best.sort(key=lambda x: (x[1], x[2]), reverse=True)
|
||||
|
||||
result = result + reduced_best
|
||||
|
||||
univ_list = []
|
||||
other_list = []
|
||||
|
||||
for r in result:
|
||||
if is_contained('univ',r[0]):
|
||||
univ_list.append(r)
|
||||
else:
|
||||
other_list.append(r)
|
||||
|
||||
limit = min(numUniv, l2)
|
||||
|
||||
if len(univ_list)> limit:
|
||||
result = univ_list[:limit] + other_list
|
||||
|
||||
result_dict = {}
|
||||
pairs_dict = {}
|
||||
|
||||
|
||||
for l in l3:
|
||||
pairs_dict[l[1]] = l[2]
|
||||
|
||||
|
||||
for p in result:
|
||||
result_dict[p[0]]= pairs_dict[p[0]]
|
||||
|
||||
|
||||
|
||||
|
||||
result_dict_list = [[y[0],result_dict[y[0]]] for y in result]
|
||||
|
||||
|
||||
return result_dict_list
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG):
|
||||
|
||||
"""
|
||||
Matches affiliations in DataFrame 'DF' with names from dictionary 'dix_org' and their ROR_ids based on similarity scores.
|
||||
|
||||
Args:
|
||||
m (int): The number of DOIs to check.
|
||||
DF (DataFrame): The input DataFrame containing affiliation data.
|
||||
dix_org (dict): A dictionary of names of organizations and their ROR_ids.
|
||||
simU (float): Similarity threshold for universities.
|
||||
simG (float): Similarity threshold for non-universities.
|
||||
|
||||
Returns:
|
||||
DataFrame: The final DataFrame with matched affiliations and their corresponding similarity scores.
|
||||
"""
|
||||
df_list = input[1]
|
||||
light_aff = input[0]
|
||||
vectorizer = CountVectorizer()
|
||||
|
||||
lnamelist = list(dix_org.keys())
|
||||
dix = {} # will store indeces and legalnames of organizations of the DOI { i : [legalname1, legalname2,...]}
|
||||
#pairs = []
|
||||
result = {}
|
||||
pairs = []
|
||||
|
||||
|
||||
def get_keywords(filtered_list):
|
||||
# Extract the "keywords" values from the dictionaries in filtered_list
|
||||
keywords_list = [entry["keywords"] for entry in filtered_list]
|
||||
|
||||
return keywords_list
|
||||
keywords = get_keywords(df_list)
|
||||
|
||||
|
||||
for k,s in enumerate(keywords):
|
||||
similar_k = []
|
||||
pairs_k = []
|
||||
|
||||
if s in lnamelist:
|
||||
similarity = 1
|
||||
similar_k.append(similarity)
|
||||
|
||||
pairs_k.append((s,s,similarity,dix_org[s]))
|
||||
pairs.append((s,s,similarity,dix_org[s]))
|
||||
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [s]
|
||||
else:
|
||||
dix[k].append(s)
|
||||
else:
|
||||
|
||||
for x in lnamelist:
|
||||
if is_contained(s, x):
|
||||
|
||||
x_vector = vectorizer.fit_transform([x]).toarray()
|
||||
s_vector = vectorizer.transform([s]).toarray()
|
||||
|
||||
# Compute similarity between the vectors
|
||||
similarity = cosine_similarity(x_vector, s_vector)[0][0]
|
||||
if similarity > min(simU, simG):
|
||||
if (is_contained('univ', s) and is_contained('univ', x)) and similarity > simU:
|
||||
similar_k.append(similarity)
|
||||
pairs_k.append((s,x,similarity,dix_org[x]))
|
||||
pairs.append((s,x,similarity,dix_org[x]))
|
||||
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [x]
|
||||
else:
|
||||
dix[k].append(x)
|
||||
elif (not is_contained('univ', s) and not is_contained('univ', x)) and similarity > simG:
|
||||
similar_k.append(similarity)
|
||||
pairs_k.append((s,x,similarity,dix_org[x]))
|
||||
pairs.append((s,x,similarity,dix_org[x]))
|
||||
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [x]
|
||||
else:
|
||||
dix[k].append(x)
|
||||
|
||||
elif is_contained(x, s):
|
||||
if (is_contained('univ', s) and is_contained('univ', x)):
|
||||
|
||||
s_vector = vectorizer.fit_transform([s]).toarray()
|
||||
x_vector = vectorizer.transform([x]).toarray()
|
||||
|
||||
# Compute similarity between the vectors
|
||||
similarity = cosine_similarity(s_vector, x_vector)[0][0]
|
||||
if similarity > simU: #max(0.82,sim):
|
||||
similar_k.append(similarity)
|
||||
pairs_k.append((s,x,similarity,dix_org[x]))
|
||||
pairs.append((s,x,similarity,dix_org[x]))
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [x]
|
||||
else:
|
||||
dix[k].append(x)
|
||||
elif not is_contained('univ', s) and not is_contained('univ', x):
|
||||
|
||||
s_vector = vectorizer.fit_transform([s]).toarray()
|
||||
x_vector = vectorizer.transform([x]).toarray()
|
||||
|
||||
# Compute similarity between the vectors
|
||||
similarity = cosine_similarity(s_vector, x_vector)[0][0]
|
||||
if similarity > simG: #max(0.82,sim):
|
||||
similar_k.append(similarity)
|
||||
pairs_k.append((s,x,similarity,dix_org[x]))
|
||||
pairs.append((s,x,similarity,dix_org[x]))
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [x]
|
||||
else:
|
||||
dix[k].append(x)
|
||||
|
||||
result[k] = pairs_k
|
||||
|
||||
multi = index_multiple_matchings(list(set(pairs)))
|
||||
# need_check = list(set([i for i in range(len(multi)) if list(multi.values())[i]>1]))
|
||||
# print('here', multi)
|
||||
# need_check_keys = [keywords[i] for i in range(len(keywords)) if multi[keywords[i]]>1]
|
||||
need_check_keys = []
|
||||
for i in range(len(keywords)):
|
||||
try:
|
||||
if multi[keywords[i]]>1:
|
||||
need_check_keys.append(keywords[i])
|
||||
except:
|
||||
pass
|
||||
|
||||
best = best_sim_score(light_aff, len(keywords), pairs, multi, simU, simG)
|
||||
matched_org = [x[0] for x in best]
|
||||
# best_o = []
|
||||
# best_s = []
|
||||
# best_result = []
|
||||
# for x in best:
|
||||
# best_o.append([x[i][0] for i in range(len(x))])
|
||||
# best_s.append([round(x[i][1],2) for i in range(len(x))])
|
||||
# num_mathced = [len(best_s[i]) for i in range(len(need_check))]
|
||||
ids = [dix_org[x[0]] for x in best]
|
||||
for i,x in enumerate(matched_org):
|
||||
# id_list = []
|
||||
if dix_mult[x] != 'unique':
|
||||
if x in list(dix_city_ror.keys()):
|
||||
match_found0 = False
|
||||
match_found = False
|
||||
|
||||
for city in dix_city_ror[x]:
|
||||
if city[0] in light_aff:
|
||||
if city[0] not in x:
|
||||
ids[i] = city[1]
|
||||
|
||||
match_found0 = True
|
||||
match_found = True
|
||||
break
|
||||
if not match_found:
|
||||
for city in dix_city_ror[x]:
|
||||
if city[0] in light_aff and city[0] not in x:
|
||||
ids[i] = city[1]
|
||||
match_found0 = True
|
||||
print('ok')
|
||||
break
|
||||
|
||||
if not match_found:
|
||||
match_found2 = False
|
||||
match_found3 = False
|
||||
|
||||
for country in dix_country_ror[x]:
|
||||
if country[0] == 'united states' and (country[0] in light_aff or 'usa' in light_aff):
|
||||
ids[i] = country[1]
|
||||
match_found2 = True
|
||||
match_found3 = True
|
||||
break
|
||||
|
||||
if country[0] == 'united kingdom' and (country[0] in light_aff or 'uk' in light_aff):
|
||||
ids[i] = country[1]
|
||||
match_found2 = True
|
||||
match_found3 = True
|
||||
break
|
||||
|
||||
elif country[0] in light_aff:
|
||||
|
||||
if country[0] not in x:
|
||||
ids[i] = country[1]
|
||||
match_found2 = True
|
||||
match_found3 = True
|
||||
break
|
||||
|
||||
if not match_found3:
|
||||
for country in dix_country_ror[x]:
|
||||
if country[0] in light_aff and country[0] in x:
|
||||
ids[i] = country[1]
|
||||
match_found2 = True
|
||||
break
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
results = [[x[0],x[1], ids[i]] for i,x in enumerate(best)]
|
||||
|
||||
return results #[[result[to_check[i]] for i in ready] + [to_check[2]], best[0]]
|
|
@@ -1,584 +0,0 @@
|
|||
galway
|
||||
maynooth
|
||||
duluth
|
||||
port arthur
|
||||
new orleans
|
||||
paterson
|
||||
santa barbara
|
||||
thornton
|
||||
westminster
|
||||
north las vegas
|
||||
stockton
|
||||
marysville
|
||||
fitchburg
|
||||
tallinn
|
||||
fargo
|
||||
seaside
|
||||
manaus
|
||||
porto
|
||||
quebec city
|
||||
hialeah
|
||||
normal
|
||||
kansas city
|
||||
delhi
|
||||
fort worth
|
||||
palermo
|
||||
olathe
|
||||
madison
|
||||
santa maria
|
||||
youngstown
|
||||
allentown
|
||||
santa clara
|
||||
charlotte
|
||||
agra
|
||||
palmdale
|
||||
kraków
|
||||
bendigo
|
||||
high point
|
||||
washington
|
||||
dallas
|
||||
grand prairie
|
||||
plano
|
||||
leipzig
|
||||
bratislava
|
||||
seville
|
||||
puebla
|
||||
lucknow
|
||||
toowoomba
|
||||
santa rosa
|
||||
sioux falls
|
||||
flint
|
||||
kissimmee
|
||||
lacey
|
||||
brownsville
|
||||
palm springs
|
||||
tyler
|
||||
minsk
|
||||
san diego
|
||||
los angeles
|
||||
edmonton
|
||||
college station
|
||||
toulouse
|
||||
garland
|
||||
florence
|
||||
saskatoon
|
||||
albury-wodonga
|
||||
newburgh
|
||||
danbury
|
||||
deltona
|
||||
south bend
|
||||
nagpur
|
||||
pomona
|
||||
memphis
|
||||
london
|
||||
lincoln
|
||||
chandler
|
||||
adelaide
|
||||
salt lake city
|
||||
edinburgh
|
||||
suzhou
|
||||
grayslake
|
||||
new york city
|
||||
kanpur
|
||||
brussels
|
||||
okayama
|
||||
tuscaloosa
|
||||
clarksville
|
||||
jackson
|
||||
boise city
|
||||
canton
|
||||
louisville
|
||||
varanasi
|
||||
columbus
|
||||
lorain
|
||||
vadodara
|
||||
orem
|
||||
chennai
|
||||
townsville
|
||||
eindhoventoronto
|
||||
wuhan
|
||||
norman
|
||||
winter haven
|
||||
eugene
|
||||
riga
|
||||
hamamatsu
|
||||
fresno
|
||||
lake charles
|
||||
budapest
|
||||
mobile
|
||||
lowell
|
||||
vienna
|
||||
tallahassee
|
||||
nanjing
|
||||
new haven
|
||||
sacramento
|
||||
leeds
|
||||
harlingen
|
||||
springdale
|
||||
perth
|
||||
sendai
|
||||
utica
|
||||
orange
|
||||
baltimore
|
||||
rochester
|
||||
rancho cucamonga
|
||||
bellevue
|
||||
fort wayne
|
||||
modesto
|
||||
pristina
|
||||
nuremberg
|
||||
stuttgart
|
||||
indore
|
||||
murfreesboro
|
||||
nottingham
|
||||
scranton
|
||||
lancaster
|
||||
abilene
|
||||
monterey
|
||||
sioux city
|
||||
bari
|
||||
chula vista
|
||||
ahmedabad
|
||||
north port
|
||||
helsinki
|
||||
leominster
|
||||
ocala
|
||||
sarajevo
|
||||
hangzhou
|
||||
roanoke
|
||||
new york
|
||||
bethlehem
|
||||
dublin
|
||||
sunshine coast
|
||||
pune
|
||||
billings
|
||||
changchunsydney
|
||||
garden grove
|
||||
port orange
|
||||
pittsburgh
|
||||
new bedford
|
||||
hiroshima
|
||||
san francisco
|
||||
sheffield
|
||||
chongqing
|
||||
layton
|
||||
pueblo
|
||||
chengdu
|
||||
cincinnati
|
||||
erie
|
||||
lansing
|
||||
ljubljana
|
||||
st louis
|
||||
rio de janeiro
|
||||
philadelphia
|
||||
tacoma
|
||||
bel air
|
||||
chesapeake
|
||||
davenport
|
||||
las vegas
|
||||
nagasaki
|
||||
kitchener
|
||||
boulder
|
||||
roseville
|
||||
evansville
|
||||
victoria
|
||||
burbank
|
||||
sofia
|
||||
santa clarita
|
||||
san buenaventura
|
||||
savannah
|
||||
apple valley
|
||||
brighton
|
||||
coral springs
|
||||
huntsville
|
||||
fort lauderdale
|
||||
warsaw
|
||||
antioch
|
||||
medford
|
||||
visalia
|
||||
frankfurt
|
||||
joliet
|
||||
curitiba
|
||||
mcallen
|
||||
seattle
|
||||
alexandria
|
||||
bryan
|
||||
moreno valley
|
||||
berlin
|
||||
olympia
|
||||
caracas
|
||||
tianjin
|
||||
cleveland
|
||||
des moines
|
||||
prague
|
||||
fukuoka
|
||||
burlington
|
||||
bhopal
|
||||
nara
|
||||
hampton
|
||||
jefferson
|
||||
chicago
|
||||
temecula
|
||||
paris
|
||||
gilbert
|
||||
bradenton
|
||||
champaign
|
||||
munich
|
||||
amsterdam
|
||||
raleigh
|
||||
atlanta
|
||||
lakeland
|
||||
denver
|
||||
round lake beach
|
||||
richmond
|
||||
buffalo
|
||||
phoenix
|
||||
antwerp
|
||||
greenbay
|
||||
milwaukee
|
||||
south lyon
|
||||
concord
|
||||
vero beach
|
||||
newcastle
|
||||
podgorica
|
||||
monterrey
|
||||
shantou
|
||||
costa mesa
|
||||
copenhagen
|
||||
vilnius
|
||||
dalian
|
||||
bristol
|
||||
salinas
|
||||
belgrade
|
||||
waterloo
|
||||
henderson
|
||||
hayward
|
||||
hickory
|
||||
el monte
|
||||
lima
|
||||
redding
|
||||
mexico city
|
||||
cary
|
||||
kennewick
|
||||
guayaquil
|
||||
tirana
|
||||
kawasaki
|
||||
greensboro
|
||||
west covina
|
||||
amarillo
|
||||
saitama
|
||||
new london
|
||||
recife
|
||||
manchester
|
||||
rockford
|
||||
kelowna
|
||||
hagerstown
|
||||
bordeaux
|
||||
york
|
||||
kaneohe
|
||||
tucson
|
||||
gainesville
|
||||
kalamazoo
|
||||
bogotá
|
||||
reading
|
||||
virginia beach
|
||||
guadalajara
|
||||
albany
|
||||
durham
|
||||
green bay
|
||||
oceanside
|
||||
montreal
|
||||
turin
|
||||
malaga
|
||||
oshawa
|
||||
mesa
|
||||
pensacola
|
||||
boise
|
||||
bonita springs
|
||||
fort walton beach
|
||||
port saint lucie
|
||||
reykjavik
|
||||
north charleston
|
||||
newark
|
||||
reno
|
||||
knoxville
|
||||
bakersfield
|
||||
oslo
|
||||
omaha
|
||||
milan
|
||||
cambridge
|
||||
norwich
|
||||
shanghai
|
||||
naples
|
||||
victorville
|
||||
zagreb
|
||||
norwalk
|
||||
huntington beach
|
||||
clarke county
|
||||
lubbock
|
||||
yakima
|
||||
warren
|
||||
bucharest
|
||||
simi valley
|
||||
greenville
|
||||
racine
|
||||
salvador
|
||||
elk grove
|
||||
orlando
|
||||
windsor
|
||||
santa cruz
|
||||
saginaw
|
||||
ballarat
|
||||
muskegon
|
||||
shreveport
|
||||
clearwater
|
||||
merced
|
||||
boston
|
||||
basel
|
||||
elizabeth
|
||||
panama city
|
||||
okinawa
|
||||
sarasota
|
||||
zurich
|
||||
glendale
|
||||
wilmington
|
||||
pompano beach
|
||||
guangzhou
|
||||
fairfield
|
||||
hyderabad
|
||||
santiago
|
||||
nashville
|
||||
mchenry
|
||||
ann arbor
|
||||
carrollton
|
||||
hollywood
|
||||
laredo
|
||||
rome
|
||||
san bernardino
|
||||
bergen
|
||||
springfield
|
||||
winnipeg
|
||||
corona
|
||||
surat
|
||||
long beach
|
||||
nagoya
|
||||
toledo
|
||||
geelong
|
||||
kenosha
|
||||
sterling heights
|
||||
lisbon
|
||||
myrtle beach
|
||||
nashua
|
||||
riverside
|
||||
tampa
|
||||
bangalore
|
||||
richland
|
||||
rotterdam
|
||||
lyon
|
||||
scottsdale
|
||||
berkeley
|
||||
bologna
|
||||
cedar rapids
|
||||
syracuse
|
||||
tulsa
|
||||
ludhiana
|
||||
hemet
|
||||
portland
|
||||
mission viejo
|
||||
salem
|
||||
overland park
|
||||
detroit
|
||||
jinan
|
||||
osaka
|
||||
grand rapids
|
||||
jersey city
|
||||
kailua
|
||||
venice
|
||||
darwin
|
||||
miramar
|
||||
gulfport-biloxi
|
||||
huntington
|
||||
portsmouth
|
||||
worcester
|
||||
sunnyvale
|
||||
escondido
|
||||
college park
|
||||
thousand oaks
|
||||
harbin
|
||||
belfast
|
||||
yonkers
|
||||
alicante
|
||||
barnstable
|
||||
kitakyushu
|
||||
sapporo
|
||||
ogden
|
||||
aurora
|
||||
palm bay
|
||||
düsseldorf
|
||||
hobart
|
||||
irvine
|
||||
st johns
|
||||
hamburg
|
||||
provo
|
||||
melbourne
|
||||
madrid
|
||||
zhengzhou
|
||||
asheville
|
||||
patna
|
||||
inglewood
|
||||
houston
|
||||
newport news
|
||||
west valley city
|
||||
oklahoma city
|
||||
brisbane
|
||||
valencia
|
||||
pasadena
|
||||
aberdeen
|
||||
st petersburg
|
||||
lakewood
|
||||
irving
|
||||
naperville
|
||||
miami
|
||||
topeka
|
||||
downey
|
||||
genoa
|
||||
lewisville
|
||||
birmingham
|
||||
xian
|
||||
saint paul
|
||||
bremerton
|
||||
corpus christi
|
||||
daytona beach
|
||||
st paul
|
||||
oxnard
|
||||
murrieta
|
||||
lafayette
|
||||
montgomery
|
||||
baton rouge
|
||||
skopje
|
||||
cathedral city
|
||||
spartanburg
|
||||
canberra
|
||||
arvada
|
||||
hesperia
|
||||
port st lucie
|
||||
saint louis
|
||||
bridgeport
|
||||
tempe
|
||||
quito
|
||||
chattanooga
|
||||
bremen
|
||||
gold coast
|
||||
cairns
|
||||
beaumont
|
||||
elkhart
|
||||
peoria
|
||||
calgary
|
||||
honolulu
|
||||
havre de grace
|
||||
hamilton
|
||||
fullerton
|
||||
daly city
|
||||
dresden
|
||||
belem
|
||||
ottawa
|
||||
regina
|
||||
chiba
|
||||
fort collins
|
||||
indianapolis
|
||||
mumbai
|
||||
killeen
|
||||
sao paulo
|
||||
jaipur
|
||||
fremont
|
||||
zaragoza
|
||||
charleston
|
||||
waco
|
||||
kobe
|
||||
odessa
|
||||
monroe
|
||||
vallejo
|
||||
marseille
|
||||
qingdao
|
||||
frederick
|
||||
marina
|
||||
sebastian
|
||||
oakland
|
||||
pembroke pines
|
||||
san antonio
|
||||
kyoto
|
||||
colorado springs
|
||||
el paso
|
||||
shenyang
|
||||
punta gorda
|
||||
fort smith
|
||||
richmond county
|
||||
waterbury
|
||||
shenzhen
|
||||
albuquerque
|
||||
jacksonville
|
||||
minneapolis
|
||||
fortaleza
|
||||
denton
|
||||
gastonia
|
||||
fayetteville
|
||||
bloomington
|
||||
houma
|
||||
santa ana
|
||||
kolkata
|
||||
las cruces
|
||||
barcelona
|
||||
arlington
|
||||
niigata
|
||||
norfolk
|
||||
fontana
|
||||
providence
|
||||
santo domingo
|
||||
vancouver
|
||||
appleton
|
||||
san jose
|
||||
hartford
|
||||
winston
|
||||
barrie
|
||||
glasgow
|
||||
davidson county
|
||||
yokohama
|
||||
independence
|
||||
athens
|
||||
harrisburg
|
||||
macon
|
||||
torrance
|
||||
launceston
|
||||
cape coral
|
||||
austin
|
||||
little rock
|
||||
cologne
|
||||
mesquite
|
||||
catania
|
||||
stockholm
|
||||
nice
|
||||
stamford
|
||||
buenos aires
|
||||
columbia
|
||||
anchorage
|
||||
dayton
|
||||
wollongong
|
||||
halifax
|
||||
verona
|
||||
anaheim
|
||||
kiev
|
||||
augusta
|
||||
tokyo
|
||||
akron
|
||||
lexington
|
||||
wichita
|
||||
saint petersburg
|
||||
beijing
|
||||
johnson city
|
||||
spokane
|
||||
liverpool
|
||||
howell
|
||||
poughkeepsie
|
||||
ontario
|
||||
atlantic city
|
||||
trenton
|
|
@@ -1,28 +0,0 @@
universi
research institu
laboratory
gmbh
inc
universi of
research center
foundation
faculty
national institu
school medicine
universi school
graduate school
graduate school engineering
institu tropical medicine
institu virology
faculty medicine
laboratory
universi park
institu science
polytechnic universi
universi 1
ciudad universi
universi campus
universi hospitals
colege
universi road
universitetska str
@@ -1,16 +0,0 @@
from
the
of
at
de
for
et
für
des
in
as
a
and
fur
for
und
@@ -1,8 +0,0 @@
universitetskaya
universitatsklinikum
universitatskinderklinik
universitatskliniken
universitetshospital
universitatsmedizin
universitatsbibliothek
universitatspital
@@ -1,30 +1,5 @@
<workflow-app name="AffroAffiliations" xmlns="uri:oozie:workflow:0.5">
    <parameters>

        <!-- <property>-->
        <!-- <name>crossrefInputPath</name>-->
        <!-- <description>the path where to find the inferred affiliation relations from Crossref</description>-->
        <!-- </property>-->
        <!-- <property>-->
        <!-- <name>pubmedInputPath</name>-->
        <!-- <description>the path where to find the inferred affiliation relations from Pubmed</description>-->
        <!-- </property>-->
        <!-- <property>-->
        <!-- <name>openapcInputPath</name>-->
        <!-- <description>the path where to find the inferred affiliation relations from OpenAPC</description>-->
        <!-- </property>-->
        <!-- <property>-->
        <!-- <name>dataciteInputPath</name>-->
        <!-- <description>the path where to find the inferred affiliation relations from Datacite</description>-->
        <!-- </property>-->
        <!-- <property>-->
        <!-- <name>webCrawlInputPath</name>-->
        <!-- <description>the path where to find the inferred affiliation relations from webCrawl</description>-->
        <!-- </property>-->
        <!-- <property>-->
        <!-- <name>outputPath</name>-->
        <!-- <description>the path where to store the actionset</description>-->
        <!-- </property>-->
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
@@ -93,7 +68,7 @@
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Affiliations inference (Affro)</name>
            <jar>affro_spark.py</jar>
            <jar>update_records.py</jar>

            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
@@ -107,13 +82,13 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=python3
                --conf spark.executorEnv.PYSPARK_PYTHON=python3
                --py-files ${wfAppPath}/affro_cluster.py,${wfAppPath}/create_input_cluster.py,${wfAppPath}/functions_cluster.py,${wfAppPath}/matching_cluster.py
                --files ${wfAppPath}/dictionaries/dix_acad.json,${wfAppPath}/dictionaries/dix_categ.json,${wfAppPath}/dictionaries/dix_city.json,${wfAppPath}/dictionaries/dix_country.json,${wfAppPath}/dictionaries/dix_mult.json,${wfAppPath}/txt_files/city_names.txt,${wfAppPath}/txt_files/remove_list.txt,${wfAppPath}/txt_files/stop_words.txt,${wfAppPath}/txt_files/university_terms.txt
                --py-files ${wfAppPath}/affRo/affro_cluster.py,${wfAppPath}/affRo/affro_test_example.py,${wfAppPath}/affRo/create_input_cluster.py,${wfAppPath}/affRo/functions_cluster.py,${wfAppPath}/affRo/matching_cluster.py
                --files ${wfAppPath}/affRo/dictionaries/dix_acad.json,${wfAppPath}/affRo/dictionaries/dix_categ.json,${wfAppPath}/affRo/dictionaries/dix_city.json,${wfAppPath}/affRo/dictionaries/dix_country.json,${wfAppPath}/affRo/dictionaries/dix_mult.json,${wfAppPath}/affRo/txt_files/city_names.txt,${wfAppPath}/affRo/txt_files/remove_list.txt,${wfAppPath}/affRo/txt_files/stop_words.txt,${wfAppPath}/affRo/txt_files/university_terms.txt
            </spark-opts>

            <arg>${resultFolder}</arg>

            <file>${wfAppPath}/affro_spark.py#affro_spark.py</file>
            <file>${wfAppPath}/affRo/update_records.py#update_records.py</file>
        </spark>

        <ok to="End" />