"""Compare dedup duplicate groups with registry duplicate groups.

Inputs
    ./data/in/ds_dedup_2022-02-16_13.03.17.csv  ';'-separated; the first line
        is read and discarded.
    ./data/out/registryGroups.txt               one JSON array per line.

Outputs (all under ./data/out/)
    onlyRegistry.txt, completeOverlap.txt, intersection.txt, onlyDedup.txt,
    newGroups.txt, allDuplicateSets.txt and report.txt.
"""
import json


def _write_sets(groups, *handles):
    """Write every group as one JSON-array line to each of *handles*."""
    for group in groups:
        text = "%s\n" % json.dumps(list(group))
        for handle in handles:
            handle.write(text)


def _write_intersections(handle, keys, groups_by_key):
    """Write, for each key, its [dedup, registry, intersection] triples as one JSON line."""
    for key in keys:
        triples = [[list(part) for part in triple] for triple in groups_by_key[key]]
        handle.write("%s\n" % json.dumps(triples))


# --- Load dedup groups ------------------------------------------------------
# dic maps the first CSV column to a set of "<id>::<third column>" strings,
# where <id> is the last column right-padded with "_" to 12 characters and
# then lower-cased.
dic = {}
with open('./data/in/ds_dedup_2022-02-16_13.03.17.csv') as fin:
    fin.readline()  # the first line is skipped (presumably a header row)
    for line in fin:
        fields = line.strip().split(';')
        padded_id = fields[-1].ljust(12, "_")
        dic.setdefault(fields[0], set()).add(padded_id.lower() + "::" + fields[2])

# --- Match every registry group against every dedup group -------------------
intersectionGroups = {}  # dedup key -> list of [dedup set, registry set, intersection]
onlyRegistries = []      # registry groups sharing nothing with any dedup group
completeOverlap = []     # groups that are identical on both sides
cp = set()               # dedup keys that had a complete overlap
with open('./data/out/registryGroups.txt') as fin:
    for line in fin:
        rg = set(json.loads(line))
        found = False
        for dg, dedup_group in dic.items():
            intersection = rg & dedup_group
            if not intersection:
                continue
            found = True
            # An intersection as large as both operands means the sets are equal.
            if rg == dedup_group:
                completeOverlap.append(intersection)
                cp.add(dg)
            else:
                intersectionGroups.setdefault(dg, []).append(
                    [dedup_group, rg, intersection]
                )
        if not found:
            onlyRegistries.append(rg)

# Dedup groups that never intersected any registry group.
onlyDedup = [
    group
    for dg, group in dic.items()
    if dg not in intersectionGroups and dg not in cp
]

# --- Classify partial overlaps and build the merged groups ------------------
dedupExtention = 0
registryExtention = 0
mutualExtention = 0
groupMerging = 0
newGroup = []
newGroupCount = 0
recount = 0
extentionKeys = {'mutual': [], 'dedup': [], 'registry': [], 'merging': []}
seen_groups = set()  # frozenset view of newGroup for O(1) duplicate detection

for dg, matches in intersectionGroups.items():
    is_registry_extension = False
    if len(matches) > 1:
        # The dedup group touches several registry groups: merge them all.
        groupMerging += 1
        extentionKeys['merging'].append(dg)
        ng = set()
        for dedup, registry, _ in matches:
            ng |= dedup
            ng |= registry
    else:
        dedup, registry, inte = matches[0]
        # inte is a subset of both sets and equality with both was handled as
        # a complete overlap, so at most one of these branches can apply.
        if len(inte) < len(dedup) and len(inte) < len(registry):
            mutualExtention += 1
            extentionKeys['mutual'].append(dg)
        elif len(inte) < len(dedup) and len(inte) == len(registry):
            dedupExtention += 1
            extentionKeys['dedup'].append(dg)
        elif len(inte) == len(dedup) and len(inte) < len(registry):
            registryExtention += 1
            is_registry_extension = True
            extentionKeys['registry'].append(dg)
        ng = dedup.union(registry)

    frozen = frozenset(ng)
    if frozen not in seen_groups:
        seen_groups.add(frozen)
        newGroup.append(ng)
        if is_registry_extension:
            recount += 1
        else:
            newGroupCount += 1

# Merged groups holding at least one entry with each of the four 4-character
# prefixes (presumably one prefix per registry -- confirm against the data).
across4 = 0
temp = []
for group in newGroup:
    prefixes = {entry[0:4] for entry in group}
    if 'fair' in prefixes and 'roar' in prefixes and 're3d' in prefixes and 'open' in prefixes:
        across4 += 1
        temp.append(list(group))

# --- Write the per-category files and the combined report -------------------
# allDuplicateSets.txt receives, in this order: registry-only groups, complete
# overlaps, dedup-only groups and the merged groups.
with open('./data/out/allDuplicateSets.txt', 'w') as freport:
    with open('./data/out/onlyRegistry.txt', 'w') as fout:
        _write_sets(onlyRegistries, fout, freport)

    with open('./data/out/completeOverlap.txt', 'w') as fout:
        _write_sets(completeOverlap, fout, freport)

    with open('./data/out/intersection.txt', 'w') as fout:
        _write_intersections(fout, intersectionGroups, intersectionGroups)

    with open('./data/out/onlyDedup.txt', 'w') as fout:
        _write_sets(onlyDedup, fout, freport)

    with open('./data/out/newGroups.txt', 'w') as fout:
        _write_sets(newGroup, fout, freport)

# --- Write the summary report -----------------------------------------------
with open('./data/out/report.txt', 'w') as fout:
    fout.write("Merged Groups = %s \n" % str(groupMerging))
    fout.write("Mutual Extention = %s \n" % str(mutualExtention))
    fout.write("Extention to registry group via dedup = %s \n" % str(dedupExtention))
    fout.write("Extention to dedup group via registry = %s \n" % str(registryExtention))
    fout.write("Number of new groups without duplicates = %s \n" % str(newGroupCount))
    fout.write("Number of registry extentions without duplicates = %s \n" % str(recount))
    fout.write("Number Groups with at least one entry for each registry = %s \n" % str(across4))

    fout.write("\n\n\n********** ENTRIES FROM ALL REGISTRY ************** \n\n\n")
    for e in temp:
        fout.write("%s\n" % json.dumps(e))

    fout.write("\n\n\n********** MUTUAL EXTENTION GROUPS ************** \n\n\n")
    _write_intersections(fout, extentionKeys['mutual'], intersectionGroups)

    fout.write("\n\n\n********** MERGING EXTENTION GROUPS ************** \n\n\n")
    _write_intersections(fout, extentionKeys['merging'], intersectionGroups)

    fout.write("\n\n\n********** DEDUP EXTENTION GROUPS ************** \n\n\n")
    _write_intersections(fout, extentionKeys['dedup'], intersectionGroups)

    fout.write("\n\n\n********** REGISTRY EXTENTION GROUPS ************** \n\n\n")
    _write_intersections(fout, extentionKeys['registry'], intersectionGroups)