updates german terms, /
This commit is contained in:
parent
b38be012a0
commit
a7b703b67d
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
|
@ -1 +1 @@
|
|||
{"research": "Univ/Inst", "uniwersytet": "Univ/Inst", "investigacions": "Univ/Inst", "institu": "Univ/Inst", "istitut": "Univ/Inst", "univ": "Univ/Inst", "col": "Univ/Inst", "center": "Univ/Inst", "polytechnic": "Univ/Inst", "politecnico": "Univ/Inst", "centre": "Univ/Inst", "cnrs": "Univ/Inst", "faculty": "Univ/Inst", "school": "Univ/Inst", "academ": "Univ/Inst", "akadem": "Univ/Inst", "hochschule": "Univ/Inst", "ecole": "Univ/Inst", "tech": "Univ/Inst", "observ": "Univ/Inst", "escuela": "Univ/Inst", "escola": "Univ/Inst", "discovery programe": "Univ/Inst", "ku leuven": "Univ/Inst", "ucla": "Univ/Inst", "eth zurich": "Univ/Inst", "athena": "Univ/Inst", "openaire": "Univ/Inst", "erasmus": "Univ/Inst", "ist austria": "Univ/Inst", "lmu munich": "Univ/Inst", "cancer trials ireland": "Univ/Inst", "food safety authority": "Univ/Inst", "ucd": "Univ/Inst", "tcd": "Univ/Inst", "apc microbiome": "Univ/Inst", "nasa": "Univ/Inst", "ucl": "Univ/Inst", "zentrum": "Univ/Inst", "ncsr demokritos": "Univ/Inst", "panepistemio": "Univ/Inst", "forth": "Univ/Inst", "nui galway": "Univ/Inst", "nui maynooth": "Univ/Inst", "tu wien": "Univ/Inst", "tu dublin": "Univ/Inst", "lab": "Laboratory", "science": "Laboratory", "cientific": "Laboratory", "hospital": "Hospital", "clinic": "Hospital", "hopital": "Hospital", "klinik": "Hospital", "oncol": "Hospital", "medical": "Hospital", "health": "Hospital", "medicin": "Hospital", "gmbh": "Company", "company": "Company", "industr": "Company", "etaireia": "Company", "corporation": "Company", "inc": "Company", "museum": "Museum", "library": "Museum", "foundation": "Foundation", "asociation": "Foundation", "organization": "Foundation", "society": "Foundation", "group": "Foundation", "royal": "Foundation", "ofice": "Foundation", "trust": "Foundation", "district": "Government", "federation": "Government", "government": "Government", "municipal": "Government", "county": "Government", "council": "Government", "agency": "Government", "unknown": "Unknown", 
"google": "Specific", "yahoo": "Specific", "ebay": "Specific", "microsoft": "Specific", "teagasc": "Specific", "ibm research": "Specific", "alergan": "Specific", "analog devices": "Specific", "medtronic": "Specific", "xilinx": "Specific", "pfizer": "Specific", "glaxosmithkline": "Specific", "astrazeneca": "Specific"}
|
||||
{"research": "Univ/Inst", "uniwersytet": "Univ/Inst", "investigacions": "Univ/Inst", "institu": "Univ/Inst", "istitut": "Univ/Inst", "univ": "Univ/Inst", "col": "Univ/Inst", "center": "Univ/Inst", "polytechnic": "Univ/Inst", "politecnico": "Univ/Inst", "polutekhneio": "Univ/Inst", "centre": "Univ/Inst", "kentro": "Univ/Inst", "cnrs": "Univ/Inst", "faculty": "Univ/Inst", "school": "Univ/Inst", "academ": "Univ/Inst", "akadem": "Univ/Inst", "hochschule": "Univ/Inst", "ecole": "Univ/Inst", "tech": "Univ/Inst", "observ": "Univ/Inst", "escuela": "Univ/Inst", "escola": "Univ/Inst", "discovery programe": "Univ/Inst", "ku leuven": "Univ/Inst", "ucla": "Univ/Inst", "eth zurich": "Univ/Inst", "athena": "Univ/Inst", "openaire": "Univ/Inst", "erasmus": "Univ/Inst", "ist austria": "Univ/Inst", "lmu munich": "Univ/Inst", "cancer trials ireland": "Univ/Inst", "food safety authority": "Univ/Inst", "ucd": "Univ/Inst", "tcd": "Univ/Inst", "apc microbiome": "Univ/Inst", "nasa": "Univ/Inst", "ucl": "Univ/Inst", "zentrum": "Univ/Inst", "ncsr demokritos": "Univ/Inst", "panepist": "Univ/Inst", "forth": "Univ/Inst", "nui galway": "Univ/Inst", "nui maynooth": "Univ/Inst", "tu wien": "Univ/Inst", "tu dublin": "Univ/Inst", "lab": "Laboratory", "science": "Laboratory", "cientific": "Laboratory", "hospital": "Hospital", "clinic": "Hospital", "hopital": "Hospital", "klinik": "Hospital", "oncol": "Hospital", "medical": "Hospital", "health": "Hospital", "medicin": "Hospital", "nosokomei": "Hospital", "gmbh": "Company", "company": "Company", "industr": "Company", "etaireia": "Company", "corporation": "Company", "inc": "Company", "museum": "Museum", "library": "Museum", "foundation": "Foundation", "asociation": "Foundation", "organization": "Foundation", "society": "Foundation", "group": "Foundation", "royal": "Foundation", "ofice": "Foundation", "trust": "Foundation", "district": "Government", "federation": "Government", "government": "Government", "municipal": "Government", "county": "Government", 
"council": "Government", "agency": "Government", "unknown": "Unknown", "google": "Specific", "yahoo": "Specific", "ebay": "Specific", "microsoft": "Specific", "teagasc": "Specific", "ibm research": "Specific", "alergan": "Specific", "analog devices": "Specific", "medtronic": "Specific", "xilinx": "Specific", "pfizer": "Specific", "glaxosmithkline": "Specific", "astrazeneca": "Specific"}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -12,11 +12,6 @@ def load_txt(file_path):
|
|||
list_ = [line.strip() for line in file]
|
||||
return list_
|
||||
|
||||
def load_pickled_dict(file_path):
    """Open ``file_path`` in binary mode and return the object unpickled from it.

    NOTE: unpickling can run arbitrary code, so only pass files from a
    trusted source.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)
|
||||
|
||||
|
||||
def load_json(file_path):
|
||||
with open(file_path, 'r') as json_file:
|
||||
|
@ -186,7 +181,14 @@ protected_phrases1 = [
|
|||
|
||||
|
||||
|
||||
replacements = {'nat.':'national',
|
||||
replacements = {'universitatsklinikum' : 'universi hospital',
|
||||
'universitetshospital' : 'universi hospital',
|
||||
'universitatskinderklinik' : 'universi childrens hospital',
|
||||
'universitatskliniken': 'universi hospital',
|
||||
'Universitätsklinik': 'universi hospital',
|
||||
'universitatsmedizin': 'universi medicine',
|
||||
'universitatsbibliothek' : 'universi library',
|
||||
'nat.':'national',
|
||||
'uni versity':'university',
|
||||
'inst ':'institute ',
|
||||
'adv ':'advanced ',
|
||||
|
@ -196,17 +198,18 @@ replacements = {'nat.':'national',
|
|||
'adv.':'advanced',
|
||||
'univ.':'university',
|
||||
'stud.': 'studies',
|
||||
'uni versity':'university',
|
||||
'univ ':'university ',
|
||||
'univercity':'university',
|
||||
'universtiy':'university',
|
||||
'univerisity':'university',
|
||||
'universtiy':'university',
|
||||
'univeristy':'university',
|
||||
'universirty':'university',
|
||||
'universiti':'university',
|
||||
'universitiy':'university',
|
||||
'universty' :'university',
|
||||
'univ col': 'university colege',
|
||||
'univ. col.': 'university colege',
|
||||
'univ. coll.': 'university colege',
|
||||
'col.':'colege',
|
||||
'belfield, dublin': 'dublin',
|
||||
'balsbridge, dublin': 'dublin', #ballsbridge
|
||||
'earlsfort terrace, dublin': 'dublin',
|
||||
|
@ -237,6 +240,7 @@ def substrings_dict(string):
|
|||
|
||||
for old, new in replacements.items():
|
||||
string = string.replace(old, new)
|
||||
string = string.replace('hospitalum','hospital').replace('hospitalen','hospital')
|
||||
split_strings = split_string_with_protection(string, protected_phrases1)
|
||||
|
||||
# Define a set of university-related terms for later use
|
||||
|
@ -245,7 +249,7 @@ def substrings_dict(string):
|
|||
dict_string = {}
|
||||
index = 0
|
||||
for value in split_strings:
|
||||
vaule = value.replace('.', ' ')
|
||||
value = value.replace('.', ' ')
|
||||
# Check if the substring contains any university-related terms
|
||||
if not any(term in value.lower() for term in university_terms):
|
||||
# Apply regex substitutions for common patterns
|
||||
|
@ -288,8 +292,8 @@ def clean_string(input_string):
|
|||
# Normalize unicode characters (optional, e.g., replace umlauts)
|
||||
input_string = unidecode(input_string)
|
||||
|
||||
# Replace `/` and `–` with space (do not replace hyphen `-`)
|
||||
result = re.sub(r'[/\-]', ' ', input_string)
|
||||
# Replace hyphens `-` with a space (unidecode above has already turned `–` into `-`)
|
||||
result = re.sub(r'[\-]', ' ', input_string)
|
||||
|
||||
# Replace "saint" with "st"
|
||||
result = re.sub(r'\bSaint\b', 'St', result)
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
colege street
|
||||
universi
|
||||
research institu
|
||||
laboratory
|
||||
|
|
Loading…
Reference in New Issue