Updates German terms and the handling of `/`

This commit is contained in:
mkallipo 2024-09-17 12:06:29 +02:00
parent b38be012a0
commit a7b703b67d
12 changed files with 22 additions and 17 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
dictionaries/.DS_Store vendored Normal file

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@ -1 +1 @@
{"research": "Univ/Inst", "uniwersytet": "Univ/Inst", "investigacions": "Univ/Inst", "institu": "Univ/Inst", "istitut": "Univ/Inst", "univ": "Univ/Inst", "col": "Univ/Inst", "center": "Univ/Inst", "polytechnic": "Univ/Inst", "politecnico": "Univ/Inst", "centre": "Univ/Inst", "cnrs": "Univ/Inst", "faculty": "Univ/Inst", "school": "Univ/Inst", "academ": "Univ/Inst", "akadem": "Univ/Inst", "hochschule": "Univ/Inst", "ecole": "Univ/Inst", "tech": "Univ/Inst", "observ": "Univ/Inst", "escuela": "Univ/Inst", "escola": "Univ/Inst", "discovery programe": "Univ/Inst", "ku leuven": "Univ/Inst", "ucla": "Univ/Inst", "eth zurich": "Univ/Inst", "athena": "Univ/Inst", "openaire": "Univ/Inst", "erasmus": "Univ/Inst", "ist austria": "Univ/Inst", "lmu munich": "Univ/Inst", "cancer trials ireland": "Univ/Inst", "food safety authority": "Univ/Inst", "ucd": "Univ/Inst", "tcd": "Univ/Inst", "apc microbiome": "Univ/Inst", "nasa": "Univ/Inst", "ucl": "Univ/Inst", "zentrum": "Univ/Inst", "ncsr demokritos": "Univ/Inst", "panepistemio": "Univ/Inst", "forth": "Univ/Inst", "nui galway": "Univ/Inst", "nui maynooth": "Univ/Inst", "tu wien": "Univ/Inst", "tu dublin": "Univ/Inst", "lab": "Laboratory", "science": "Laboratory", "cientific": "Laboratory", "hospital": "Hospital", "clinic": "Hospital", "hopital": "Hospital", "klinik": "Hospital", "oncol": "Hospital", "medical": "Hospital", "health": "Hospital", "medicin": "Hospital", "gmbh": "Company", "company": "Company", "industr": "Company", "etaireia": "Company", "corporation": "Company", "inc": "Company", "museum": "Museum", "library": "Museum", "foundation": "Foundation", "asociation": "Foundation", "organization": "Foundation", "society": "Foundation", "group": "Foundation", "royal": "Foundation", "ofice": "Foundation", "trust": "Foundation", "district": "Government", "federation": "Government", "government": "Government", "municipal": "Government", "county": "Government", "council": "Government", "agency": "Government", "unknown": "Unknown", 
"google": "Specific", "yahoo": "Specific", "ebay": "Specific", "microsoft": "Specific", "teagasc": "Specific", "ibm research": "Specific", "alergan": "Specific", "analog devices": "Specific", "medtronic": "Specific", "xilinx": "Specific", "pfizer": "Specific", "glaxosmithkline": "Specific", "astrazeneca": "Specific"}
{"research": "Univ/Inst", "uniwersytet": "Univ/Inst", "investigacions": "Univ/Inst", "institu": "Univ/Inst", "istitut": "Univ/Inst", "univ": "Univ/Inst", "col": "Univ/Inst", "center": "Univ/Inst", "polytechnic": "Univ/Inst", "politecnico": "Univ/Inst", "polutekhneio": "Univ/Inst", "centre": "Univ/Inst", "kentro": "Univ/Inst", "cnrs": "Univ/Inst", "faculty": "Univ/Inst", "school": "Univ/Inst", "academ": "Univ/Inst", "akadem": "Univ/Inst", "hochschule": "Univ/Inst", "ecole": "Univ/Inst", "tech": "Univ/Inst", "observ": "Univ/Inst", "escuela": "Univ/Inst", "escola": "Univ/Inst", "discovery programe": "Univ/Inst", "ku leuven": "Univ/Inst", "ucla": "Univ/Inst", "eth zurich": "Univ/Inst", "athena": "Univ/Inst", "openaire": "Univ/Inst", "erasmus": "Univ/Inst", "ist austria": "Univ/Inst", "lmu munich": "Univ/Inst", "cancer trials ireland": "Univ/Inst", "food safety authority": "Univ/Inst", "ucd": "Univ/Inst", "tcd": "Univ/Inst", "apc microbiome": "Univ/Inst", "nasa": "Univ/Inst", "ucl": "Univ/Inst", "zentrum": "Univ/Inst", "ncsr demokritos": "Univ/Inst", "panepist": "Univ/Inst", "forth": "Univ/Inst", "nui galway": "Univ/Inst", "nui maynooth": "Univ/Inst", "tu wien": "Univ/Inst", "tu dublin": "Univ/Inst", "lab": "Laboratory", "science": "Laboratory", "cientific": "Laboratory", "hospital": "Hospital", "clinic": "Hospital", "hopital": "Hospital", "klinik": "Hospital", "oncol": "Hospital", "medical": "Hospital", "health": "Hospital", "medicin": "Hospital", "nosokomei": "Hospital", "gmbh": "Company", "company": "Company", "industr": "Company", "etaireia": "Company", "corporation": "Company", "inc": "Company", "museum": "Museum", "library": "Museum", "foundation": "Foundation", "asociation": "Foundation", "organization": "Foundation", "society": "Foundation", "group": "Foundation", "royal": "Foundation", "ofice": "Foundation", "trust": "Foundation", "district": "Government", "federation": "Government", "government": "Government", "municipal": "Government", "county": "Government", 
"council": "Government", "agency": "Government", "unknown": "Unknown", "google": "Specific", "yahoo": "Specific", "ebay": "Specific", "microsoft": "Specific", "teagasc": "Specific", "ibm research": "Specific", "alergan": "Specific", "analog devices": "Specific", "medtronic": "Specific", "xilinx": "Specific", "pfizer": "Specific", "glaxosmithkline": "Specific", "astrazeneca": "Specific"}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -12,11 +12,6 @@ def load_txt(file_path):
list_ = [line.strip() for line in file]
return list_
# Load a pickled object from `file_path` and return it.
# The file is opened in binary mode ('rb') and read with pickle.load.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only use this on files from a trusted source.
# NOTE(review): the name suggests the pickled object is a dict, but nothing
# here checks the type of what is returned -- confirm against callers.
def load_pickled_dict(file_path):
with open(file_path, 'rb') as file:
pickled_dict = pickle.load(file)
return pickled_dict
def load_json(file_path):
with open(file_path, 'r') as json_file:
@ -186,7 +181,14 @@ protected_phrases1 = [
replacements = {'nat.':'national',
replacements = {'universitatsklinikum' : 'universi hospital',
'universitetshospital' : 'universi hospital',
'universitatskinderklinik' : 'universi childrens hospital',
'universitatskliniken': 'universi hospital',
'Universitätsklinik': 'universi hospital',
'universitatsmedizin': 'universi medicine',
'universitatsbibliothek' : 'universi library',
'nat.':'national',
'uni versity':'university',
'inst ':'institute ',
'adv ':'advanced ',
@ -196,17 +198,18 @@ replacements = {'nat.':'national',
'adv.':'advanced',
'univ.':'university',
'stud.': 'studies',
'uni versity':'university',
'univ ':'university ',
'univercity':'university',
'universtiy':'university',
'univerisity':'university',
'universtiy':'university',
'univeristy':'university',
'universirty':'university',
'universiti':'university',
'universitiy':'university',
'universty' :'university',
'univ col': 'university colege',
'univ. col.': 'university colege',
'univ. coll.': 'university colege',
'col.':'colege',
'belfield, dublin': 'dublin',
'balsbridge, dublin': 'dublin', #ballsbridge
'earlsfort terrace, dublin': 'dublin',
@ -237,6 +240,7 @@ def substrings_dict(string):
for old, new in replacements.items():
string = string.replace(old, new)
string = string.replace('hospitalum','hospital').replace('hospitalen','hospital')
split_strings = split_string_with_protection(string, protected_phrases1)
# Define a set of university-related terms for later use
@ -245,7 +249,7 @@ def substrings_dict(string):
dict_string = {}
index = 0
for value in split_strings:
vaule = value.replace('.', ' ')
value = value.replace('.', ' ')
# Check if the substring contains any university-related terms
if not any(term in value.lower() for term in university_terms):
# Apply regex substitutions for common patterns
@ -288,8 +292,8 @@ def clean_string(input_string):
# Normalize unicode characters (optional, e.g., replace umlauts)
input_string = unidecode(input_string)
# Replace `/` and `` with space (do not replace hyphen `-`)
result = re.sub(r'[/\-]', ' ', input_string)
# Replace hyphen `-` with space (`/` is left unchanged)
result = re.sub(r'[\-]', ' ', input_string)
# Replace "saint" with "st"
result = re.sub(r'\bSaint\b', 'St', result)

View File

@ -1,3 +1,4 @@
colege street
universi
research institu
laboratory