updates german terms, /
This commit is contained in:
parent
b38be012a0
commit
a7b703b67d
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
|
@ -1 +1 @@
|
||||||
{"research": "Univ/Inst", "uniwersytet": "Univ/Inst", "investigacions": "Univ/Inst", "institu": "Univ/Inst", "istitut": "Univ/Inst", "univ": "Univ/Inst", "col": "Univ/Inst", "center": "Univ/Inst", "polytechnic": "Univ/Inst", "politecnico": "Univ/Inst", "centre": "Univ/Inst", "cnrs": "Univ/Inst", "faculty": "Univ/Inst", "school": "Univ/Inst", "academ": "Univ/Inst", "akadem": "Univ/Inst", "hochschule": "Univ/Inst", "ecole": "Univ/Inst", "tech": "Univ/Inst", "observ": "Univ/Inst", "escuela": "Univ/Inst", "escola": "Univ/Inst", "discovery programe": "Univ/Inst", "ku leuven": "Univ/Inst", "ucla": "Univ/Inst", "eth zurich": "Univ/Inst", "athena": "Univ/Inst", "openaire": "Univ/Inst", "erasmus": "Univ/Inst", "ist austria": "Univ/Inst", "lmu munich": "Univ/Inst", "cancer trials ireland": "Univ/Inst", "food safety authority": "Univ/Inst", "ucd": "Univ/Inst", "tcd": "Univ/Inst", "apc microbiome": "Univ/Inst", "nasa": "Univ/Inst", "ucl": "Univ/Inst", "zentrum": "Univ/Inst", "ncsr demokritos": "Univ/Inst", "panepistemio": "Univ/Inst", "forth": "Univ/Inst", "nui galway": "Univ/Inst", "nui maynooth": "Univ/Inst", "tu wien": "Univ/Inst", "tu dublin": "Univ/Inst", "lab": "Laboratory", "science": "Laboratory", "cientific": "Laboratory", "hospital": "Hospital", "clinic": "Hospital", "hopital": "Hospital", "klinik": "Hospital", "oncol": "Hospital", "medical": "Hospital", "health": "Hospital", "medicin": "Hospital", "gmbh": "Company", "company": "Company", "industr": "Company", "etaireia": "Company", "corporation": "Company", "inc": "Company", "museum": "Museum", "library": "Museum", "foundation": "Foundation", "asociation": "Foundation", "organization": "Foundation", "society": "Foundation", "group": "Foundation", "royal": "Foundation", "ofice": "Foundation", "trust": "Foundation", "district": "Government", "federation": "Government", "government": "Government", "municipal": "Government", "county": "Government", "council": "Government", "agency": "Government", "unknown": "Unknown", "google": "Specific", "yahoo": "Specific", "ebay": "Specific", "microsoft": "Specific", "teagasc": "Specific", "ibm research": "Specific", "alergan": "Specific", "analog devices": "Specific", "medtronic": "Specific", "xilinx": "Specific", "pfizer": "Specific", "glaxosmithkline": "Specific", "astrazeneca": "Specific"}
|
{"research": "Univ/Inst", "uniwersytet": "Univ/Inst", "investigacions": "Univ/Inst", "institu": "Univ/Inst", "istitut": "Univ/Inst", "univ": "Univ/Inst", "col": "Univ/Inst", "center": "Univ/Inst", "polytechnic": "Univ/Inst", "politecnico": "Univ/Inst", "polutekhneio": "Univ/Inst", "centre": "Univ/Inst", "kentro": "Univ/Inst", "cnrs": "Univ/Inst", "faculty": "Univ/Inst", "school": "Univ/Inst", "academ": "Univ/Inst", "akadem": "Univ/Inst", "hochschule": "Univ/Inst", "ecole": "Univ/Inst", "tech": "Univ/Inst", "observ": "Univ/Inst", "escuela": "Univ/Inst", "escola": "Univ/Inst", "discovery programe": "Univ/Inst", "ku leuven": "Univ/Inst", "ucla": "Univ/Inst", "eth zurich": "Univ/Inst", "athena": "Univ/Inst", "openaire": "Univ/Inst", "erasmus": "Univ/Inst", "ist austria": "Univ/Inst", "lmu munich": "Univ/Inst", "cancer trials ireland": "Univ/Inst", "food safety authority": "Univ/Inst", "ucd": "Univ/Inst", "tcd": "Univ/Inst", "apc microbiome": "Univ/Inst", "nasa": "Univ/Inst", "ucl": "Univ/Inst", "zentrum": "Univ/Inst", "ncsr demokritos": "Univ/Inst", "panepist": "Univ/Inst", "forth": "Univ/Inst", "nui galway": "Univ/Inst", "nui maynooth": "Univ/Inst", "tu wien": "Univ/Inst", "tu dublin": "Univ/Inst", "lab": "Laboratory", "science": "Laboratory", "cientific": "Laboratory", "hospital": "Hospital", "clinic": "Hospital", "hopital": "Hospital", "klinik": "Hospital", "oncol": "Hospital", "medical": "Hospital", "health": "Hospital", "medicin": "Hospital", "nosokomei": "Hospital", "gmbh": "Company", "company": "Company", "industr": "Company", "etaireia": "Company", "corporation": "Company", "inc": "Company", "museum": "Museum", "library": "Museum", "foundation": "Foundation", "asociation": "Foundation", "organization": "Foundation", "society": "Foundation", "group": "Foundation", "royal": "Foundation", "ofice": "Foundation", "trust": "Foundation", "district": "Government", "federation": "Government", "government": "Government", "municipal": "Government", "county": "Government", "council": "Government", "agency": "Government", "unknown": "Unknown", "google": "Specific", "yahoo": "Specific", "ebay": "Specific", "microsoft": "Specific", "teagasc": "Specific", "ibm research": "Specific", "alergan": "Specific", "analog devices": "Specific", "medtronic": "Specific", "xilinx": "Specific", "pfizer": "Specific", "glaxosmithkline": "Specific", "astrazeneca": "Specific"}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -12,11 +12,6 @@ def load_txt(file_path):
|
||||||
list_ = [line.strip() for line in file]
|
list_ = [line.strip() for line in file]
|
||||||
return list_
|
return list_
|
||||||
|
|
||||||
def load_pickled_dict(file_path):
|
|
||||||
with open(file_path, 'rb') as file:
|
|
||||||
pickled_dict = pickle.load(file)
|
|
||||||
return pickled_dict
|
|
||||||
|
|
||||||
|
|
||||||
def load_json(file_path):
|
def load_json(file_path):
|
||||||
with open(file_path, 'r') as json_file:
|
with open(file_path, 'r') as json_file:
|
||||||
|
@ -186,7 +181,14 @@ protected_phrases1 = [
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
replacements = {'nat.':'national',
|
replacements = {'universitatsklinikum' : 'universi hospital',
|
||||||
|
'universitetshospital' : 'universi hospital',
|
||||||
|
'universitatskinderklinik' : 'universi childrens hospital',
|
||||||
|
'universitatskliniken': 'universi hospital',
|
||||||
|
'Universitätsklinik': 'universi hospital',
|
||||||
|
'universitatsmedizin': 'universi medicine',
|
||||||
|
'universitatsbibliothek' : 'universi library',
|
||||||
|
'nat.':'national',
|
||||||
'uni versity':'university',
|
'uni versity':'university',
|
||||||
'inst ':'institute ',
|
'inst ':'institute ',
|
||||||
'adv ':'advanced ',
|
'adv ':'advanced ',
|
||||||
|
@ -196,17 +198,18 @@ replacements = {'nat.':'national',
|
||||||
'adv.':'advanced',
|
'adv.':'advanced',
|
||||||
'univ.':'university',
|
'univ.':'university',
|
||||||
'stud.': 'studies',
|
'stud.': 'studies',
|
||||||
'uni versity':'university',
|
|
||||||
'univ ':'university ',
|
|
||||||
'univercity':'university',
|
'univercity':'university',
|
||||||
'universtiy':'university',
|
|
||||||
'univerisity':'university',
|
'univerisity':'university',
|
||||||
|
'universtiy':'university',
|
||||||
'univeristy':'university',
|
'univeristy':'university',
|
||||||
'universirty':'university',
|
'universirty':'university',
|
||||||
'universiti':'university',
|
'universiti':'university',
|
||||||
'universitiy':'university',
|
'universitiy':'university',
|
||||||
'universty' :'university',
|
'universty' :'university',
|
||||||
'univ col': 'university colege',
|
'univ col': 'university colege',
|
||||||
|
'univ. col.': 'university colege',
|
||||||
|
'univ. coll.': 'university colege',
|
||||||
|
'col.':'colege',
|
||||||
'belfield, dublin': 'dublin',
|
'belfield, dublin': 'dublin',
|
||||||
'balsbridge, dublin': 'dublin', #ballsbridge
|
'balsbridge, dublin': 'dublin', #ballsbridge
|
||||||
'earlsfort terrace, dublin': 'dublin',
|
'earlsfort terrace, dublin': 'dublin',
|
||||||
|
@ -237,6 +240,7 @@ def substrings_dict(string):
|
||||||
|
|
||||||
for old, new in replacements.items():
|
for old, new in replacements.items():
|
||||||
string = string.replace(old, new)
|
string = string.replace(old, new)
|
||||||
|
string = string.replace('hospitalum','hospital').replace('hospitalen','hospital')
|
||||||
split_strings = split_string_with_protection(string, protected_phrases1)
|
split_strings = split_string_with_protection(string, protected_phrases1)
|
||||||
|
|
||||||
# Define a set of university-related terms for later use
|
# Define a set of university-related terms for later use
|
||||||
|
@ -245,7 +249,7 @@ def substrings_dict(string):
|
||||||
dict_string = {}
|
dict_string = {}
|
||||||
index = 0
|
index = 0
|
||||||
for value in split_strings:
|
for value in split_strings:
|
||||||
vaule = value.replace('.', ' ')
|
value = value.replace('.', ' ')
|
||||||
# Check if the substring contains any university-related terms
|
# Check if the substring contains any university-related terms
|
||||||
if not any(term in value.lower() for term in university_terms):
|
if not any(term in value.lower() for term in university_terms):
|
||||||
# Apply regex substitutions for common patterns
|
# Apply regex substitutions for common patterns
|
||||||
|
@ -288,8 +292,8 @@ def clean_string(input_string):
|
||||||
# Normalize unicode characters (optional, e.g., replace umlauts)
|
# Normalize unicode characters (optional, e.g., replace umlauts)
|
||||||
input_string = unidecode(input_string)
|
input_string = unidecode(input_string)
|
||||||
|
|
||||||
# Replace `/` and `–` with space (do not replace hyphen `-`)
|
# Replace `–` with space (do not replace hyphen `-`)
|
||||||
result = re.sub(r'[/\-]', ' ', input_string)
|
result = re.sub(r'[\-]', ' ', input_string)
|
||||||
|
|
||||||
# Replace "saint" with "st"
|
# Replace "saint" with "st"
|
||||||
result = re.sub(r'\bSaint\b', 'St', result)
|
result = re.sub(r'\bSaint\b', 'St', result)
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
colege street
|
||||||
universi
|
universi
|
||||||
research institu
|
research institu
|
||||||
laboratory
|
laboratory
|
||||||
|
|
Loading…
Reference in New Issue