from functions_cluster import *


# Markers tested against the *current* substring by the "role" rule.
_ROLE_MARKERS = (
    'assistant', 'researcher', 'phd', 'student', 'section', 'prof', 'director',
)

# Markers tested against the *following* substring by the "role" rule.
_ROLE_FOLLOWERS = ('univ', 'inst', 'lab', 'fac')

# (markers for the current substring, markers for the following substring).
# The rules are tried in order; when one matches, the current substring is
# dropped unless it contains 'univ' itself.
# 'hosp' used to be listed among the current-substring markers of the first
# rule but is currently disabled.
# NOTE(review): 'colege' is kept exactly as in the original; presumably it
# matches a normalised spelling produced by clean_string -- confirm.
_SUBUNIT_RULES = (
    (('engineer', 'progr', 'unit', 'lab', 'dep', 'school', 'inst', 'fac'),
     ('univ',)),
    (('lab',),
     ('colege', 'inst', 'dep', 'school')),
    (('dep',),
     ('tech', 'colege', 'inst', 'hosp', 'school', 'fac')),
    (('inst',),
     ('school', 'dep', 'acad', 'hosp', 'clin', 'klin', 'fak', 'fac', 'cent',
      'div')),
    (('school',),
     ('colege',)),
)


def _contains_any(markers, text):
    """Return True if is_contained(marker, text) is truthy for any marker."""
    return any(is_contained(marker, text) for marker in markers)


def _prune_subordinate_parts(parts):
    """Drop, in place, substrings made redundant by the one that follows them.

    Walks adjacent pairs (parts[i], parts[i + 1]). After a removal the index
    is not advanced, so the element that slid into position i is compared
    with its new neighbour. The last element is never removed.
    """
    i = 0
    while i < len(parts) - 1:
        current, following = parts[i], parts[i + 1]

        if is_contained('progr', current) and is_contained('dep', following):
            parts.pop(i)
        elif _contains_any(_ROLE_MARKERS, current) and (
            # NOTE(review): `not` applies to the 'school' test only, exactly
            # as in the original expression. If the intent was
            # "not (school or univ or ...)" this is a precedence bug --
            # confirm before changing.
            not is_contained('school', following)
            or _contains_any(_ROLE_FOLLOWERS, following)
        ):
            parts.pop(i)
        elif any(
            _contains_any(current_markers, current)
            and _contains_any(following_markers, following)
            for current_markers, following_markers in _SUBUNIT_RULES
        ):
            if is_contained('univ', current):
                # The current part names a university itself: keep it.
                i += 1
            else:
                parts.pop(i)
        else:
            i += 1
    return parts


def create_df_algorithm(raw_aff_string) -> list:
    """Split a raw affiliation string into pruned, categorised keyword entries.

    The input is passed through remove_leading_numbers,
    remove_outer_parentheses, clean_string and substrings_dict; the values of
    the resulting dict are the candidate substrings. Candidates are then
    pruned pairwise (see _prune_subordinate_parts), filtered against
    city_names and remove_list, and shortened with shorten_keywords_spark.

    Args:
        raw_aff_string: The raw affiliation text, handed unchanged to
            remove_leading_numbers.

    Returns:
        A two-element list ``[light_aff, filtered_list]``:

        * ``light_aff`` -- the pruned substrings joined with ``', '``. It is
          built *before* the city/remove-list filter and before shortening.
        * ``filtered_list`` -- dicts with keys ``"index"`` (position in the
          filtered, shortened list), ``"keywords"`` (the shortened substring)
          and ``"category"`` (always 1 here), one for every shortened
          substring that contains at least one key of categ_dicts.
    """
    cleaned = clean_string(
        remove_outer_parentheses(remove_leading_numbers(raw_aff_string))
    )
    substring_list = _prune_subordinate_parts(
        list(substrings_dict(cleaned).values())
    )

    light_aff = ', '.join(substring_list)

    # Build the exclusion sequence once, and filter into a new list: removing
    # items from the list being iterated skipped the element right after each
    # removal, so adjacent city/stop entries used to survive.
    excluded = city_names + remove_list
    substring_list = [part for part in substring_list if part not in excluded]

    substring_list = [shorten_keywords_spark([part])[0] for part in substring_list]

    aff_list = [
        {
            "index": position,
            "keywords": keywords,
            # 1 when any key of categ_dicts occurs in the keywords, else 0.
            "category": int(any(key in keywords for key in categ_dicts)),
        }
        for position, keywords in enumerate(substring_list)
    ]
    filtered_list = [entry for entry in aff_list if entry["category"] == 1]

    return [light_aff, filtered_list]