affRo/create_input_cluster.py

77 lines
4.1 KiB
Python
Raw Normal View History

2024-09-05 12:23:32 +02:00
from functions_cluster import *
# Keywords tested on the current substring by the "role" rule below.
# NOTE(review): these look like personal roles/positions rather than
# organisations -- confirm against the data.
_ROLE_KEYWORDS = (
    'assistant', 'researcher', 'phd', 'student', 'section', 'prof', 'director',
)

# Each rule is (keywords tested on the current substring,
#               keywords tested on the substring that follows it).
# When a rule matches, the current substring is dropped unless it contains
# 'univ' itself.  Rules are tried in this order; the first match wins.
# 'hosp' is commented out in the original first rule and stays disabled here.
# NOTE(review): 'colege' is spelled exactly as in the original rules;
# presumably it matches text already normalised by clean_string -- confirm.
_SUBUNIT_RULES = (
    (('engineer', 'progr', 'unit', 'lab', 'dep', 'school', 'inst', 'fac'),
     ('univ',)),
    (('lab',), ('colege', 'inst', 'dep', 'school')),
    (('dep',), ('tech', 'colege', 'inst', 'hosp', 'school', 'fac')),
    (('inst',), ('school', 'dep', 'acad', 'hosp', 'clin', 'klin', 'fak',
                 'fac', 'cent', 'div')),
    (('school',), ('colege',)),
)


def _contains_any(keywords, text):
    """Return True if is_contained(keyword, text) holds for any keyword."""
    return any(is_contained(keyword, text) for keyword in keywords)


def _prune_substrings(parts):
    """Drop, in place, substrings made redundant by the one that follows them.

    Each substring is compared with its successor.  After a removal the index
    is NOT advanced, so the element that slid into position ``i`` is compared
    with its own successor on the next pass.  The last substring is never
    removed because it has no successor.
    """
    i = 0
    while i < len(parts) - 1:
        current, following = parts[i], parts[i + 1]

        if is_contained('progr', current) and is_contained('dep', following):
            drop = True
        elif _contains_any(_ROLE_KEYWORDS, current) and (
            # NOTE(review): `not` binds only to the 'school' test, exactly as
            # in the original expression.  If the intent was
            # `not (school or univ or inst or lab or fac)`, this condition
            # needs changing; behaviour is deliberately left untouched here.
            not is_contained('school', following)
            or _contains_any(('univ', 'inst', 'lab', 'fac'), following)
        ):
            drop = True
        elif any(
            _contains_any(current_keys, current)
            and _contains_any(following_keys, following)
            for current_keys, following_keys in _SUBUNIT_RULES
        ):
            # A matching sub-unit is kept only when it names a university.
            drop = not is_contained('univ', current)
        else:
            drop = False

        if drop:
            parts.pop(i)
        else:
            i += 1


def create_df_algorithm(raw_aff_string):
    """Split a raw affiliation string into a light string and keyword records.

    The input is passed through ``remove_leading_numbers``,
    ``remove_outer_parentheses``, ``clean_string`` and ``substrings_dict``;
    the values of the resulting dict are the candidate substrings.  Redundant
    substrings are pruned by comparing each one with its successor (see
    ``_prune_substrings``).

    Args:
        raw_aff_string: the raw affiliation text, handed unchanged to
            ``remove_leading_numbers``.

    Returns:
        A two-element list ``[light_aff, filtered_list]``:

        * ``light_aff`` -- the pruned substrings joined with ``', '``
          (built *before* the city/stop-list filtering below).
        * ``filtered_list`` -- one dict per remaining substring that contains
          at least one key of ``categ_dicts``, shaped as
          ``{"index": position, "keywords": shortened_text, "category": 1}``.
          ``index`` is the position in the list after city/stop-list
          filtering and shortening.
    """
    cleaned = clean_string(
        remove_outer_parentheses(remove_leading_numbers(raw_aff_string))
    )
    substring_list = list(substrings_dict(cleaned).values())
    _prune_substrings(substring_list)

    light_aff = ', '.join(substring_list)

    # Build the exclusion collection once, and filter into a new list.  The
    # previous code called list.remove() while iterating the same list, which
    # skips the element after every removal and so let the second of two
    # adjacent city/stop-list entries through.
    excluded = city_names + remove_list
    substring_list = [part for part in substring_list if part not in excluded]

    # shorten_keywords_spark takes and returns a list; it is called once per
    # substring, as before.
    substring_list = [shorten_keywords_spark([part])[0] for part in substring_list]

    # Keep only substrings that contain at least one categ_dicts key.
    filtered_list = [
        {"index": position, "keywords": keywords, "category": 1}
        for position, keywords in enumerate(substring_list)
        if any(key in keywords for key in categ_dicts)
    ]
    return [light_aff, filtered_list]