This commit is contained in:
mkallipo 2024-09-19 21:37:28 +02:00
parent a7b703b67d
commit bace694d21
9 changed files with 13 additions and 18 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -109,10 +109,10 @@ def remove_parentheses(text):
return re.sub(r'\([^()]*\)', '', text) return re.sub(r'\([^()]*\)', '', text)
def replace_umlauts(text): # def replace_umlauts(text):
normalized_text = unicodedata.normalize('NFKD', text) # normalized_text = unicodedata.normalize('NFKD', text)
replaced_text = ''.join(c for c in normalized_text if not unicodedata.combining(c)) # replaced_text = ''.join(c for c in normalized_text if not unicodedata.combining(c))
return replaced_text # return replaced_text
def protect_phrases(input_string, phrases): def protect_phrases(input_string, phrases):
# Replace phrases with placeholders # Replace phrases with placeholders
@ -287,10 +287,8 @@ def clean_string(input_string):
input_string = input_string.replace(" ", placeholder) input_string = input_string.replace(" ", placeholder)
# Unescape HTML entities and convert to lowercase # Unescape HTML entities and convert to lowercase
input_string = replace_comma_spaces(replace_double_consonants(replace_umlauts(unidecode(remove_parentheses(html.unescape(input_string.replace("'", "")))))).strip()) input_string = replace_comma_spaces(replace_double_consonants(unidecode(remove_parentheses(html.unescape(input_string.replace("'", ""))))).strip())
# Normalize unicode characters (optional, e.g., replace umlauts)
input_string = unidecode(input_string)
# Replace `` with space (do not replace hyphen `-`) # Replace `` with space (do not replace hyphen `-`)
result = re.sub(r'[\-]', ' ', input_string) result = re.sub(r'[\-]', ' ', input_string)
@ -318,7 +316,7 @@ def clean_string(input_string):
def clean_string_facts(input_string): def clean_string_facts(input_string):
# Replace specified characters with space # Replace specified characters with space
input_string = remove_stop_words(replace_umlauts(unidecode(remove_parentheses(html.unescape(input_string.lower()))))) input_string = remove_stop_words(unidecode(remove_parentheses(html.unescape(input_string.lower()))))
result = re.sub(r'[/\-,]', ' ', input_string) result = re.sub(r'[/\-,]', ' ', input_string)
result = re.sub(r'\bsaint\b', 'st', result) result = re.sub(r'\bsaint\b', 'st', result)

View File

@ -137,7 +137,6 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
light_aff = input[0] light_aff = input[0]
vectorizer = CountVectorizer() vectorizer = CountVectorizer()
lnamelist = list(dix_org.keys())
dix = {} # will store indeces and legalnames of organizations of the DOI { i : [legalname1, legalname2,...]} dix = {} # will store indeces and legalnames of organizations of the DOI { i : [legalname1, legalname2,...]}
#pairs = [] #pairs = []
result = {} result = {}
@ -156,7 +155,7 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
similar_k = [] similar_k = []
pairs_k = [] pairs_k = []
if s in lnamelist: if s in dix_org:
similarity = 1 similarity = 1
similar_k.append(similarity) similar_k.append(similarity)
@ -170,7 +169,7 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
dix[k].append(s) dix[k].append(s)
else: else:
for x in lnamelist: for x in dix_org:
if is_contained(s, x): if is_contained(s, x):
x_vector = vectorizer.fit_transform([x]).toarray() x_vector = vectorizer.fit_transform([x]).toarray()
@ -261,7 +260,7 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
for i,x in enumerate(matched_org): for i,x in enumerate(matched_org):
# id_list = [] # id_list = []
if dix_mult[x] != 'unique': if dix_mult[x] != 'unique':
if x in list(dix_city_ror.keys()): if x in dix_city_ror:
match_found = False match_found = False
for city in dix_city_ror[x]: for city in dix_city_ror[x]:

View File

@ -8,8 +8,6 @@ et
für für
des des
in in
as
a
and and
fur fur
for for