This commit is contained in:
mkallipo 2024-09-19 21:37:28 +02:00
parent a7b703b67d
commit bace694d21
9 changed files with 13 additions and 18 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -109,10 +109,10 @@ def remove_parentheses(text):
return re.sub(r'\([^()]*\)', '', text)
def replace_umlauts(text):
    """Return ``text`` with diacritical marks removed.

    The string is decomposed with Unicode NFKD normalization, which splits
    accented characters into a base character followed by combining marks
    (e.g. ``ü`` becomes ``u`` + U+0308). Every character whose canonical
    combining class is non-zero is then dropped, so only the base
    characters remain.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    base_chars = []
    for ch in decomposed:
        # combining() returns 0 for base characters, non-zero for marks.
        if unicodedata.combining(ch) == 0:
            base_chars.append(ch)
    return ''.join(base_chars)
# def replace_umlauts(text):
# normalized_text = unicodedata.normalize('NFKD', text)
# replaced_text = ''.join(c for c in normalized_text if not unicodedata.combining(c))
# return replaced_text
def protect_phrases(input_string, phrases):
# Replace phrases with placeholders
@ -287,10 +287,8 @@ def clean_string(input_string):
input_string = input_string.replace(" ", placeholder)
# Unescape HTML entities and convert to lowercase
input_string = replace_comma_spaces(replace_double_consonants(replace_umlauts(unidecode(remove_parentheses(html.unescape(input_string.replace("'", "")))))).strip())
input_string = replace_comma_spaces(replace_double_consonants(unidecode(remove_parentheses(html.unescape(input_string.replace("'", ""))))).strip())
# Normalize unicode characters (optional, e.g., replace umlauts)
input_string = unidecode(input_string)
# Replace each hyphen (`-`) with a space
result = re.sub(r'[\-]', ' ', input_string)
@ -318,7 +316,7 @@ def clean_string(input_string):
def clean_string_facts(input_string):
# Replace specified characters with space
input_string = remove_stop_words(replace_umlauts(unidecode(remove_parentheses(html.unescape(input_string.lower())))))
input_string = remove_stop_words(unidecode(remove_parentheses(html.unescape(input_string.lower()))))
result = re.sub(r'[/\-,]', ' ', input_string)
result = re.sub(r'\bsaint\b', 'st', result)

View File

@ -137,7 +137,6 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
light_aff = input[0]
vectorizer = CountVectorizer()
lnamelist = list(dix_org.keys())
dix = {} # will store indices and legalnames of organizations of the DOI { i : [legalname1, legalname2,...]}
#pairs = []
result = {}
@ -156,7 +155,7 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
similar_k = []
pairs_k = []
if s in lnamelist:
if s in dix_org:
similarity = 1
similar_k.append(similarity)
@ -170,7 +169,7 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
dix[k].append(s)
else:
for x in lnamelist:
for x in dix_org:
if is_contained(s, x):
x_vector = vectorizer.fit_transform([x]).toarray()
@ -261,7 +260,7 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
for i,x in enumerate(matched_org):
# id_list = []
if dix_mult[x] != 'unique':
if x in list(dix_city_ror.keys()):
if x in dix_city_ror:
match_found = False
for city in dix_city_ror[x]:

View File

@ -8,8 +8,6 @@ et
für
des
in
as
a
and
fur
for