updates
This commit is contained in:
parent
a7b703b67d
commit
bace694d21
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -109,10 +109,10 @@ def remove_parentheses(text):
|
||||||
return re.sub(r'\([^()]*\)', '', text)
|
return re.sub(r'\([^()]*\)', '', text)
|
||||||
|
|
||||||
|
|
||||||
def replace_umlauts(text):
|
# def replace_umlauts(text):
|
||||||
normalized_text = unicodedata.normalize('NFKD', text)
|
# normalized_text = unicodedata.normalize('NFKD', text)
|
||||||
replaced_text = ''.join(c for c in normalized_text if not unicodedata.combining(c))
|
# replaced_text = ''.join(c for c in normalized_text if not unicodedata.combining(c))
|
||||||
return replaced_text
|
# return replaced_text
|
||||||
|
|
||||||
def protect_phrases(input_string, phrases):
|
def protect_phrases(input_string, phrases):
|
||||||
# Replace phrases with placeholders
|
# Replace phrases with placeholders
|
||||||
|
@ -287,10 +287,8 @@ def clean_string(input_string):
|
||||||
input_string = input_string.replace(" – ", placeholder)
|
input_string = input_string.replace(" – ", placeholder)
|
||||||
|
|
||||||
# Unescape HTML entities and convert to lowercase
|
# Unescape HTML entities and convert to lowercase
|
||||||
input_string = replace_comma_spaces(replace_double_consonants(replace_umlauts(unidecode(remove_parentheses(html.unescape(input_string.replace("'", "")))))).strip())
|
input_string = replace_comma_spaces(replace_double_consonants(unidecode(remove_parentheses(html.unescape(input_string.replace("'", ""))))).strip())
|
||||||
|
|
||||||
# Normalize unicode characters (optional, e.g., replace umlauts)
|
|
||||||
input_string = unidecode(input_string)
|
|
||||||
|
|
||||||
# Replace `–` with space (do not replace hyphen `-`)
|
# Replace `–` with space (do not replace hyphen `-`)
|
||||||
result = re.sub(r'[\-]', ' ', input_string)
|
result = re.sub(r'[\-]', ' ', input_string)
|
||||||
|
@ -318,7 +316,7 @@ def clean_string(input_string):
|
||||||
|
|
||||||
def clean_string_facts(input_string):
|
def clean_string_facts(input_string):
|
||||||
# Replace specified characters with space
|
# Replace specified characters with space
|
||||||
input_string = remove_stop_words(replace_umlauts(unidecode(remove_parentheses(html.unescape(input_string.lower())))))
|
input_string = remove_stop_words(unidecode(remove_parentheses(html.unescape(input_string.lower()))))
|
||||||
result = re.sub(r'[/\-,]', ' ', input_string)
|
result = re.sub(r'[/\-,]', ' ', input_string)
|
||||||
result = re.sub(r'\bsaint\b', 'st', result)
|
result = re.sub(r'\bsaint\b', 'st', result)
|
||||||
|
|
||||||
|
|
|
@ -137,7 +137,6 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
|
||||||
light_aff = input[0]
|
light_aff = input[0]
|
||||||
vectorizer = CountVectorizer()
|
vectorizer = CountVectorizer()
|
||||||
|
|
||||||
lnamelist = list(dix_org.keys())
|
|
||||||
dix = {} # will store indeces and legalnames of organizations of the DOI { i : [legalname1, legalname2,...]}
|
dix = {} # will store indeces and legalnames of organizations of the DOI { i : [legalname1, legalname2,...]}
|
||||||
#pairs = []
|
#pairs = []
|
||||||
result = {}
|
result = {}
|
||||||
|
@ -156,7 +155,7 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
|
||||||
similar_k = []
|
similar_k = []
|
||||||
pairs_k = []
|
pairs_k = []
|
||||||
|
|
||||||
if s in lnamelist:
|
if s in dix_org:
|
||||||
similarity = 1
|
similarity = 1
|
||||||
similar_k.append(similarity)
|
similar_k.append(similarity)
|
||||||
|
|
||||||
|
@ -170,7 +169,7 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
|
||||||
dix[k].append(s)
|
dix[k].append(s)
|
||||||
else:
|
else:
|
||||||
|
|
||||||
for x in lnamelist:
|
for x in dix_org:
|
||||||
if is_contained(s, x):
|
if is_contained(s, x):
|
||||||
|
|
||||||
x_vector = vectorizer.fit_transform([x]).toarray()
|
x_vector = vectorizer.fit_transform([x]).toarray()
|
||||||
|
@ -261,7 +260,7 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
|
||||||
for i,x in enumerate(matched_org):
|
for i,x in enumerate(matched_org):
|
||||||
# id_list = []
|
# id_list = []
|
||||||
if dix_mult[x] != 'unique':
|
if dix_mult[x] != 'unique':
|
||||||
if x in list(dix_city_ror.keys()):
|
if x in dix_city_ror:
|
||||||
match_found = False
|
match_found = False
|
||||||
|
|
||||||
for city in dix_city_ror[x]:
|
for city in dix_city_ror[x]:
|
||||||
|
|
|
@ -8,8 +8,6 @@ et
|
||||||
für
|
für
|
||||||
des
|
des
|
||||||
in
|
in
|
||||||
as
|
|
||||||
a
|
|
||||||
and
|
and
|
||||||
fur
|
fur
|
||||||
for
|
for
|
||||||
|
|
Loading…
Reference in New Issue