# Registries/code/crossreferencededup.py
#
# 203 lines
# 5.8 KiB
# Python

import json
# Load the dedup export into `dic`: first CSV column -> set of member
# identifiers. Each identifier is built as
#     "<last column, right-padded with '_' to 12 chars, lower-cased>::<third column>"
# NOTE(review): the padding presumably aligns these ids with the ones stored
# in registryGroups.txt -- confirm against the script that writes that file.
dic = {}
with open('./data/in/ds_dedup_2022-02-16_13.03.17.csv') as fin:
    fin.readline()  # discard the header row
    for raw_line in fin:
        raw_line = raw_line.strip()
        if not raw_line:
            # A blank (e.g. trailing) line has no third column; indexing it
            # below would raise IndexError, so skip it.
            continue
        fields = raw_line.split(';')
        # ljust only pads: values already 12 characters or longer are kept
        # unchanged, exactly like the manual "append '_' until 12" loop.
        padded_id = fields[-1].ljust(12, '_')
        dic.setdefault(fields[0], set()).add(
            padded_id.lower() + "::" + fields[2]
        )
# Cross-reference every registry group (one JSON array per line of
# registryGroups.txt) against every dedup group held in `dic`.
#
# intersectionGroups: dedup key -> list of [dedup set, registry set, shared ids]
#                     for every partial overlap with that dedup group
# onlyRegistries:     registry groups sharing no id with any dedup group
# onlyDedup:          left empty here; it is populated further down the script
# completeOverlap:    shared ids of registry/dedup pairs that are identical
# cp:                 keys of the dedup groups that had such an identical match
intersectionGroups = {}
onlyRegistries = []
onlyDedup = []
completeOverlap = []
cp = set()
with open('./data/out/registryGroups.txt') as fin:
    for line in fin:
        rg = set(json.loads(line))
        found = False
        for dg, dedup_ids in dic.items():
            intersection = rg & dedup_ids
            if not intersection:
                continue
            found = True
            if len(intersection) == len(dedup_ids) == len(rg):
                # Same size as both sides: the two groups are the same set.
                completeOverlap.append(intersection)
                cp.add(dg)
            else:
                # Store the dedup set itself (not a copy), as before.
                intersectionGroups.setdefault(dg, []).append(
                    [dedup_ids, rg, intersection]
                )
        if not found:
            onlyRegistries.append(rg)
# Combined listing of every duplicate set. It is deliberately left open here:
# the script keeps appending to it further down and closes it there.
freport = open('./data/out/allDuplicateSets.txt','w')

# Dedup groups that neither partially nor completely matched a registry group.
for dg in dic:
    if dg not in intersectionGroups and dg not in cp:
        onlyDedup.append(dic[dg])

# Registry groups with no counterpart among the dedup groups.
with open('./data/out/onlyRegistry.txt','w') as fout:
    for e in onlyRegistries:
        row = "%s\n" % json.dumps(list(e))
        fout.write(row)
        freport.write(row)

# Registry/dedup pairs that contain exactly the same ids.
with open('./data/out/completeOverlap.txt','w') as fout:
    for e in completeOverlap:
        row = "%s\n" % json.dumps(list(e))
        fout.write(row)
        freport.write(row)

# Partial overlaps: one line per dedup key, holding a list of
# [dedup ids, registry ids, shared ids] triples (sets converted to lists so
# that they are JSON-serialisable). Not copied into the combined listing.
with open('./data/out/intersection.txt','w') as fout:
    for did in intersectionGroups:
        ret = [[list(part) for part in g] for g in intersectionGroups[did]]
        fout.write("%s\n" % json.dumps(ret))

# Dedup groups with no counterpart among the registry groups.
with open('./data/out/onlyDedup.txt','w') as fout:
    for e in onlyDedup:
        row = "%s\n" % json.dumps(list(e))
        fout.write(row)
        freport.write(row)
# Classify every dedup group that partially overlaps registry groups and build
# the merged ("new") groups.
#
# Counters / keys per category:
#   merging  - the dedup group overlaps more than one registry group
#   mutual   - one overlap; each side has ids the other lacks
#   dedup    - one overlap; the registry group is fully contained in the dedup group
#   registry - one overlap; the dedup group is fully contained in the registry group
dedupExtention = 0
registryExtention = 0
mutualExtention = 0
groupMerging = 0
newGroup = []          # merged groups, duplicates removed, in discovery order
newGroupCount = 0      # unique merged groups that are not 'registry' extensions
recount = 0            # unique merged groups that are 'registry' extensions
extentionKeys = {'mutual':[], 'dedup':[],'registry':[],'merging':[]}

# Frozen copies of the groups already in newGroup: gives an O(1) duplicate
# test instead of comparing against every stored group.
_seen_groups = set()

for dg, overlaps in intersectionGroups.items():
    is_registry_extension = False
    if len(overlaps) > 1:
        groupMerging += 1
        extentionKeys['merging'].append(dg)
        # Union of the dedup group with every registry group it touches.
        ng = set()
        for dedup, registry, _shared in overlaps:
            ng |= dedup
            ng |= registry
    else:
        dedup, registry, inte = overlaps[0]
        shared = len(inte)
        # The three cases are mutually exclusive, hence the elif chain.
        if shared < len(dedup) and shared < len(registry):
            mutualExtention += 1
            extentionKeys['mutual'].append(dg)
        elif shared < len(dedup) and shared == len(registry):
            dedupExtention += 1
            extentionKeys['dedup'].append(dg)
        elif shared == len(dedup) and shared < len(registry):
            registryExtention += 1
            is_registry_extension = True
            extentionKeys['registry'].append(dg)
        ng = dedup.union(registry)

    # Only the first occurrence of a given merged group is kept and counted.
    group_key = frozenset(ng)
    if group_key in _seen_groups:
        continue
    _seen_groups.add(group_key)
    newGroup.append(ng)
    if is_registry_extension:
        recount += 1
    else:
        newGroupCount += 1
# Write the merged groups to their own file and to the combined listing, then
# close the combined listing. The try/finally guarantees that `freport`
# (opened earlier in the script) is closed even if a write fails.
try:
    with open('./data/out/newGroups.txt','w') as fout:
        for e in newGroup:
            row = "%s\n" % json.dumps(list(e))
            fout.write(row)
            freport.write(row)
finally:
    freport.close()
# Banner that introduces each section of report.txt.
_SECTION_BANNER = "\n\n\n********** %s ************** \n\n\n"


def _write_overlap_section(out, title, keys, groups):
    """Write a banner, then one JSON line per dedup key in ``keys``.

    Each line is the list of [dedup ids, registry ids, shared ids] triples
    stored under that key in ``groups``, with the sets converted to lists so
    that they are JSON-serialisable.
    """
    out.write(_SECTION_BANNER % title)
    for key in keys:
        triples = [[list(part) for part in g] for g in groups[key]]
        out.write("%s\n" % json.dumps(triples))


# Merged groups whose members' 4-character id prefixes include all of these.
_ALL_REGISTRY_PREFIXES = {'fair', 'roar', 're3d', 'open'}

temp = []
for i in newGroup:
    prefixes = {e[0:4] for e in i}
    if _ALL_REGISTRY_PREFIXES <= prefixes:
        temp.append(list(i))
across4 = len(temp)

with open('./data/out/report.txt','w') as fout:
    # Summary counters.
    fout.write("Merged Groups = %s \n" % groupMerging)
    fout.write("Mutual Extention = %s \n" % mutualExtention)
    fout.write("Extention to registry group via dedup = %s \n" % dedupExtention)
    fout.write("Extention to dedup group via registry = %s \n" % registryExtention)
    fout.write("Number of new groups without duplicates = %s \n" % newGroupCount)
    fout.write("Number of registry extentions without duplicates = %s \n" % recount)
    fout.write("Number Groups with at least one entry for each registry = %s \n" % across4)

    # Full listing of the groups counted by the last summary line.
    fout.write(_SECTION_BANNER % "ENTRIES FROM ALL REGISTRY")
    for e in temp:
        fout.write("%s\n" % json.dumps(e))

    # Detailed listings, one section per classification.
    _write_overlap_section(fout, "MUTUAL EXTENTION GROUPS",
                           extentionKeys['mutual'], intersectionGroups)
    _write_overlap_section(fout, "MERGING EXTENTION GROUPS",
                           extentionKeys['merging'], intersectionGroups)
    _write_overlap_section(fout, "DEDUP EXTENTION GROUPS",
                           extentionKeys['dedup'], intersectionGroups)
    _write_overlap_section(fout, "REGISTRY EXTENTION GROUPS",
                           extentionKeys['registry'], intersectionGroups)