updates

2024-09-19 21:37:28 +02:00 · 2024-09-19 21:37:28 +02:00 · bace694d21
parent a7b703b67d
commit bace694d21
9 changed files with 13 additions and 18 deletions
--- a/pycache/functions_cluster.cpython-311.pyc
+++ b/pycache/functions_cluster.cpython-311.pyc
--- a/pycache/matching_cluster.cpython-311.pyc
+++ b/pycache/matching_cluster.cpython-311.pyc
--- a/dictionaries/dix_acad.json
+++ b/dictionaries/dix_acad.json
--- a/dictionaries/dix_city.json
+++ b/dictionaries/dix_city.json
--- a/dictionaries/dix_country.json
+++ b/dictionaries/dix_country.json
--- a/dictionaries/dix_mult.json
+++ b/dictionaries/dix_mult.json
--- a/functions_cluster.py
+++ b/functions_cluster.py
@ -109,10 +109,10 @@ def remove_parentheses(text):
   return re.sub(r'\([^()]*\)', '', text)


-def replace_umlauts(text):
-    normalized_text = unicodedata.normalize('NFKD', text)
-    replaced_text = ''.join(c for c in normalized_text if not unicodedata.combining(c))
-    return replaced_text
+# def replace_umlauts(text):
+#     normalized_text = unicodedata.normalize('NFKD', text)
+#     replaced_text = ''.join(c for c in normalized_text if not unicodedata.combining(c))
+#     return replaced_text

 def protect_phrases(input_string, phrases):
    # Replace phrases with placeholders
@ -287,10 +287,8 @@ def clean_string(input_string):
    input_string = input_string.replace(" – ", placeholder)

    # Unescape HTML entities and convert to lowercase
-    input_string = replace_comma_spaces(replace_double_consonants(replace_umlauts(unidecode(remove_parentheses(html.unescape(input_string.replace("'", "")))))).strip())
+    input_string = replace_comma_spaces(replace_double_consonants(unidecode(remove_parentheses(html.unescape(input_string.replace("'", ""))))).strip())
    
-    # Normalize unicode characters (optional, e.g., replace umlauts)
-    input_string = unidecode(input_string)
    
    # Replace `–` with space (do not replace hyphen `-`)
    result = re.sub(r'[\-]', ' ', input_string)
@ -318,7 +316,7 @@ def clean_string(input_string):

 def clean_string_facts(input_string):
    # Replace specified characters with space
-    input_string = remove_stop_words(replace_umlauts(unidecode(remove_parentheses(html.unescape(input_string.lower())))))
+    input_string = remove_stop_words(unidecode(remove_parentheses(html.unescape(input_string.lower()))))
    result = re.sub(r'[/\-,]', ' ', input_string)
    result = re.sub(r'\bsaint\b', 'st', result) 

--- a/matching_cluster.py
+++ b/matching_cluster.py
@ -137,7 +137,6 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
    light_aff = input[0]
    vectorizer = CountVectorizer()

-    lnamelist = list(dix_org.keys())
    dix = {}    # will store indices and legalnames of organizations of the DOI { i : [legalname1, legalname2,...]}
    #pairs = [] 
    result = {}
@ -156,7 +155,7 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
        similar_k = []
        pairs_k = []

-        if s in lnamelist:
+        if s in dix_org:
            similarity = 1
            similar_k.append(similarity)
            
@ -170,7 +169,7 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
                dix[k].append(s)
        else:

-            for x in lnamelist:
+            for x in dix_org:
                if  is_contained(s, x):

                    x_vector = vectorizer.fit_transform([x]).toarray()
@ -261,7 +260,7 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
    for i,x in enumerate(matched_org):
       # id_list = []
        if dix_mult[x] != 'unique':
-            if x in list(dix_city_ror.keys()):
+            if x in dix_city_ror:
                match_found = False

                for city in dix_city_ror[x]:
--- a/txt_files/stop_words.txt
+++ b/txt_files/stop_words.txt
@ -8,8 +8,6 @@ et
 für
 des
 in
-as
-a
 and
 fur
 for