Updates German terms and `/` handling

2024-09-17 12:06:29 +02:00 · 2024-09-17 12:06:29 +02:00 · a7b703b67d
parent b38be012a0
commit a7b703b67d
12 changed files with 22 additions and 17 deletions
--- a/pycache/affro_cluster.cpython-311.pyc
+++ b/pycache/affro_cluster.cpython-311.pyc
--- a/pycache/create_input_cluster.cpython-311.pyc
+++ b/pycache/create_input_cluster.cpython-311.pyc
--- a/pycache/functions_cluster.cpython-311.pyc
+++ b/pycache/functions_cluster.cpython-311.pyc
--- a/pycache/matching_cluster.cpython-311.pyc
+++ b/pycache/matching_cluster.cpython-311.pyc
--- a/dictionaries/.DS_Store
+++ b/dictionaries/.DS_Store
--- a/dictionaries/dix_acad.json
+++ b/dictionaries/dix_acad.json
--- a/dictionaries/dix_categ.json
+++ b/dictionaries/dix_categ.json
@ -1 +1 @@
-{"research": "Univ/Inst", "uniwersytet": "Univ/Inst", "investigacions": "Univ/Inst", "institu": "Univ/Inst", "istitut": "Univ/Inst", "univ": "Univ/Inst", "col": "Univ/Inst", "center": "Univ/Inst", "polytechnic": "Univ/Inst", "politecnico": "Univ/Inst", "centre": "Univ/Inst", "cnrs": "Univ/Inst", "faculty": "Univ/Inst", "school": "Univ/Inst", "academ": "Univ/Inst", "akadem": "Univ/Inst", "hochschule": "Univ/Inst", "ecole": "Univ/Inst", "tech": "Univ/Inst", "observ": "Univ/Inst", "escuela": "Univ/Inst", "escola": "Univ/Inst", "discovery programe": "Univ/Inst", "ku leuven": "Univ/Inst", "ucla": "Univ/Inst", "eth zurich": "Univ/Inst", "athena": "Univ/Inst", "openaire": "Univ/Inst", "erasmus": "Univ/Inst", "ist austria": "Univ/Inst", "lmu munich": "Univ/Inst", "cancer trials ireland": "Univ/Inst", "food safety authority": "Univ/Inst", "ucd": "Univ/Inst", "tcd": "Univ/Inst", "apc microbiome": "Univ/Inst", "nasa": "Univ/Inst", "ucl": "Univ/Inst", "zentrum": "Univ/Inst", "ncsr demokritos": "Univ/Inst", "panepistemio": "Univ/Inst", "forth": "Univ/Inst", "nui galway": "Univ/Inst", "nui maynooth": "Univ/Inst", "tu wien": "Univ/Inst", "tu dublin": "Univ/Inst", "lab": "Laboratory", "science": "Laboratory", "cientific": "Laboratory", "hospital": "Hospital", "clinic": "Hospital", "hopital": "Hospital", "klinik": "Hospital", "oncol": "Hospital", "medical": "Hospital", "health": "Hospital", "medicin": "Hospital", "gmbh": "Company", "company": "Company", "industr": "Company", "etaireia": "Company", "corporation": "Company", "inc": "Company", "museum": "Museum", "library": "Museum", "foundation": "Foundation", "asociation": "Foundation", "organization": "Foundation", "society": "Foundation", "group": "Foundation", "royal": "Foundation", "ofice": "Foundation", "trust": "Foundation", "district": "Government", "federation": "Government", "government": "Government", "municipal": "Government", "county": "Government", "council": "Government", "agency": "Government", "unknown": "Unknown", 
"google": "Specific", "yahoo": "Specific", "ebay": "Specific", "microsoft": "Specific", "teagasc": "Specific", "ibm research": "Specific", "alergan": "Specific", "analog devices": "Specific", "medtronic": "Specific", "xilinx": "Specific", "pfizer": "Specific", "glaxosmithkline": "Specific", "astrazeneca": "Specific"}
+{"research": "Univ/Inst", "uniwersytet": "Univ/Inst", "investigacions": "Univ/Inst", "institu": "Univ/Inst", "istitut": "Univ/Inst", "univ": "Univ/Inst", "col": "Univ/Inst", "center": "Univ/Inst", "polytechnic": "Univ/Inst", "politecnico": "Univ/Inst", "polutekhneio": "Univ/Inst", "centre": "Univ/Inst", "kentro": "Univ/Inst", "cnrs": "Univ/Inst", "faculty": "Univ/Inst", "school": "Univ/Inst", "academ": "Univ/Inst", "akadem": "Univ/Inst", "hochschule": "Univ/Inst", "ecole": "Univ/Inst", "tech": "Univ/Inst", "observ": "Univ/Inst", "escuela": "Univ/Inst", "escola": "Univ/Inst", "discovery programe": "Univ/Inst", "ku leuven": "Univ/Inst", "ucla": "Univ/Inst", "eth zurich": "Univ/Inst", "athena": "Univ/Inst", "openaire": "Univ/Inst", "erasmus": "Univ/Inst", "ist austria": "Univ/Inst", "lmu munich": "Univ/Inst", "cancer trials ireland": "Univ/Inst", "food safety authority": "Univ/Inst", "ucd": "Univ/Inst", "tcd": "Univ/Inst", "apc microbiome": "Univ/Inst", "nasa": "Univ/Inst", "ucl": "Univ/Inst", "zentrum": "Univ/Inst", "ncsr demokritos": "Univ/Inst", "panepist": "Univ/Inst", "forth": "Univ/Inst", "nui galway": "Univ/Inst", "nui maynooth": "Univ/Inst", "tu wien": "Univ/Inst", "tu dublin": "Univ/Inst", "lab": "Laboratory", "science": "Laboratory", "cientific": "Laboratory", "hospital": "Hospital", "clinic": "Hospital", "hopital": "Hospital", "klinik": "Hospital", "oncol": "Hospital", "medical": "Hospital", "health": "Hospital", "medicin": "Hospital", "nosokomei": "Hospital", "gmbh": "Company", "company": "Company", "industr": "Company", "etaireia": "Company", "corporation": "Company", "inc": "Company", "museum": "Museum", "library": "Museum", "foundation": "Foundation", "asociation": "Foundation", "organization": "Foundation", "society": "Foundation", "group": "Foundation", "royal": "Foundation", "ofice": "Foundation", "trust": "Foundation", "district": "Government", "federation": "Government", "government": "Government", "municipal": "Government", "county": 
"Government", "council": "Government", "agency": "Government", "unknown": "Unknown", "google": "Specific", "yahoo": "Specific", "ebay": "Specific", "microsoft": "Specific", "teagasc": "Specific", "ibm research": "Specific", "alergan": "Specific", "analog devices": "Specific", "medtronic": "Specific", "xilinx": "Specific", "pfizer": "Specific", "glaxosmithkline": "Specific", "astrazeneca": "Specific"}
--- a/dictionaries/dix_city.json
+++ b/dictionaries/dix_city.json
--- a/dictionaries/dix_country.json
+++ b/dictionaries/dix_country.json
--- a/dictionaries/dix_mult.json
+++ b/dictionaries/dix_mult.json
--- a/functions_cluster.py
+++ b/functions_cluster.py
@ -12,11 +12,6 @@ def load_txt(file_path):
        list_ = [line.strip() for line in file]
        return list_
    
-def load_pickled_dict(file_path): 
-    with open(file_path, 'rb') as file: 
-        pickled_dict = pickle.load(file) 
-        return pickled_dict
-    

 def load_json(file_path): 
    with open(file_path, 'r') as json_file:
@ -186,7 +181,14 @@ protected_phrases1 =  [



-replacements = {'nat.':'national',
+replacements = {'universitatsklinikum' : 'universi hospital',
+                'universitetshospital' : 'universi hospital',
+                'universitatskinderklinik' : 'universi childrens hospital',
+                'universitatskliniken': 'universi hospital',
+                'Universitätsklinik': 'universi hospital',
+                'universitatsmedizin': 'universi medicine',
+                'universitatsbibliothek' : 'universi library',
+                'nat.':'national',
                'uni versity':'university',
                'inst ':'institute ',
                'adv ':'advanced ',
@ -196,17 +198,18 @@ replacements = {'nat.':'national',
                'adv.':'advanced',
                'univ.':'university',
                'stud.': 'studies',
-                'uni versity':'university',
-                'univ ':'university ',
                'univercity':'university', 
-                'universtiy':'university',
                'univerisity':'university', 
+                'universtiy':'university', 
                'univeristy':'university',
                'universirty':'university', 
                'universiti':'university', 
                'universitiy':'university',
                'universty' :'university',
                'univ col': 'university colege',
+                'univ. col.': 'university colege',
+                'univ. coll.': 'university colege',
+                'col.':'colege',
                'belfield, dublin': 'dublin',
                'balsbridge, dublin': 'dublin', #ballsbridge
                'earlsfort terrace, dublin': 'dublin',
@ -237,6 +240,7 @@ def substrings_dict(string):
    
    for old, new in replacements.items():
        string = string.replace(old, new)
+        string = string.replace('hospitalum','hospital').replace('hospitalen','hospital')
    split_strings = split_string_with_protection(string, protected_phrases1)
    
    # Define a set of university-related terms for later use
@ -245,7 +249,7 @@ def substrings_dict(string):
    dict_string = {}
    index = 0    
    for value in split_strings:
-        vaule = value.replace('.', ' ')        
+        value = value.replace('.', ' ')        
        # Check if the substring contains any university-related terms
        if not any(term in value.lower() for term in university_terms):
            # Apply regex substitutions for common patterns
@ -288,8 +292,8 @@ def clean_string(input_string):
    # Normalize unicode characters (optional, e.g., replace umlauts)
    input_string = unidecode(input_string)
    
-    # Replace `/` and `–` with space (do not replace hyphen `-`)
-    result = re.sub(r'[/\-]', ' ', input_string)
+    # Replace hyphen `-` with space (`/` is no longer replaced)
+    result = re.sub(r'[\-]', ' ', input_string)
    
    # Replace "saint" with "st"
    result = re.sub(r'\bSaint\b', 'St', result)
--- a/txt_files/remove_list.txt
+++ b/txt_files/remove_list.txt
@ -1,3 +1,4 @@
+colege street
 universi
 research institu
 laboratory