updates abbr

This commit is contained in:
mkallipo 2024-09-16 12:20:37 +02:00
parent fbf55b3d5d
commit b38be012a0
1 changed file with 16 additions and 5 deletions

View File

@ -186,10 +186,21 @@ protected_phrases1 = [
replacements = {'uni versity':'university',
replacements = {'nat.':'national',
'uni versity':'university',
'inst ':'institute ',
'adv ':'advanced ',
'univ ':'university ',
'stud ': 'studies ',
'inst.':'institute',
'adv.':'advanced',
'univ.':'university',
'stud.': 'studies',
'uni versity':'university',
'univ ':'university ',
'univercity':'university',
'universtiy':'university',
'univerisity':'university',
'univeristy':'university',
'universirty':'university',
'universiti':'university',
@ -234,7 +245,7 @@ def substrings_dict(string):
dict_string = {}
index = 0
for value in split_strings:
# NOTE(review): 'vaule' looks like a typo for 'value'. As written, the
# dot-to-space result is bound to a new name and the checks below still see
# the original 'value' -- confirm against the rest of the loop.
vaule = value.replace('.', ' ')
# Check if the substring contains any university-related terms
if not any(term in value.lower() for term in university_terms):
# Apply regex substitutions for common patterns
@ -286,7 +297,7 @@ def clean_string(input_string):
# Remove characters that are not from the Latin alphabet, or allowed punctuation
result = replace_comma_spaces(re.sub(r'[^a-zA-Z\s,;/]', '', result).strip())
result = replace_comma_spaces(re.sub(r'[^a-zA-Z\s,;/.]', '', result).strip())
# Restore the " - " sequence from the placeholder
result = result.replace(placeholder, " ")
@ -308,7 +319,7 @@ def clean_string_facts(input_string):
result = re.sub(r'\bsaint\b', 'st', result)
# Remove characters that are not from the Latin alphabet or numbers
result = re.sub(r'[^a-zA-Z0-9\s;/-]', '', result)
# Keep letters, digits, whitespace and the punctuation ; / . -
# The hyphen must be last in the class to be a literal: written as '/-.' it
# is parsed as a reversed range ('/' sorts after '.') and re raises
# "bad character range".
result = re.sub(r'[^a-zA-Z0-9\s;/.-]', '', result)
# Replace consecutive whitespace with a single space
result = re.sub(r'\s+', ' ', result)