203 lines
5.8 KiB
Python
203 lines
5.8 KiB
Python
import json
|
|
|
|
# Build `dic`: first field of each ';'-separated row -> set of member strings.
# A member string is the row's last field, right-padded with "_" to 12
# characters and lower-cased, followed by "::" and the row's third field.
# NOTE(review): the last field looks like a source/registry name and the third
# field like a record identifier -- confirm against the CSV schema.
dic = {}
with open('./data/in/ds_dedup_2022-02-16_13.03.17.csv') as fin:
    fin.readline()  # discard the first line (presumably the CSV header)
    for line in fin:
        stripped = line.strip()
        if not stripped:
            # A blank line carries no fields; indexing it would raise IndexError.
            continue
        fields = stripped.split(';')
        # Pad first, then lower-case, so every member prefix is 12+ chars wide.
        source = fields[-1].ljust(12, "_").lower()
        dic.setdefault(fields[0], set()).add(source + "::" + fields[2])
|
|
|
|
# Compare every registry group (one JSON array per line) with every dedup
# group in `dic` and sort the outcome into the containers below.
intersectionGroups = {}  # dedup key -> list of [dedup set, registry set, shared members]
onlyRegistries = []      # registry groups sharing nothing with any dedup group
onlyDedup = []           # filled later: dedup groups never matched by a registry group
completeOverlap = []     # groups that are identical on both sides
cp = set()               # dedup keys that had an identical registry group

with open('./data/out/registryGroups.txt') as fin:
    for line in fin:
        rg = set(json.loads(line))
        found = False
        for dg, members in dic.items():
            intersection = rg & members
            if not intersection:
                continue
            found = True
            if rg == members:
                # Same members on both sides: nothing to extend or merge.
                completeOverlap.append(intersection)
                cp.add(dg)
            else:
                # Partial overlap: keep all three sets for later classification.
                intersectionGroups.setdefault(dg, []).append(
                    [members, rg, intersection]
                )
        if not found:
            onlyRegistries.append(rg)
|
|
|
|
# Combined output of every duplicate set; later sections append to it and it
# is closed only after the new groups have been written.
freport = open('./data/out/allDuplicateSets.txt', 'w')

# A dedup group is "dedup only" when no registry group overlapped it at all,
# neither partially (intersectionGroups) nor completely (cp).
onlyDedup.extend(
    members
    for key, members in dic.items()
    if key not in intersectionGroups and key not in cp
)
|
|
|
|
# Write the registry groups that overlap no dedup group, one JSON array per
# line, both to their own file and to the combined report.
with open('./data/out/onlyRegistry.txt', 'w') as fout:
    for group in onlyRegistries:
        row = "%s\n" % json.dumps(list(group))
        fout.write(row)
        freport.write(row)
|
|
|
|
# Write the groups that are identical in the dedup and registry data, one
# JSON array per line, both to their own file and to the combined report.
with open('./data/out/completeOverlap.txt', 'w') as fout:
    for group in completeOverlap:
        row = "%s\n" % json.dumps(list(group))
        fout.write(row)
        freport.write(row)
|
|
|
|
# Dump the partial overlaps: one line per dedup key, holding a JSON list of
# [dedup members, registry members, shared members] triples.
with open('./data/out/intersection.txt', 'w') as fout:
    for overlaps in intersectionGroups.values():
        # Sets are not JSON-serialisable, so each one is converted to a list.
        rows = [[list(part) for part in overlap] for overlap in overlaps]
        fout.write("%s\n" % json.dumps(rows))
|
|
|
|
# Write the dedup groups that no registry group overlapped, one JSON array
# per line, both to their own file and to the combined report.
with open('./data/out/onlyDedup.txt', 'w') as fout:
    for group in onlyDedup:
        row = "%s\n" % json.dumps(list(group))
        fout.write(row)
        freport.write(row)
|
|
|
|
# Classify every partially overlapping dedup group and build the list of
# distinct merged groups (`newGroup`).
dedupExtention = 0     # registry group fully inside a larger dedup group
registryExtention = 0  # dedup group fully inside a larger registry group
mutualExtention = 0    # each side has members the other lacks
groupMerging = 0       # dedup group overlapped by more than one registry group
newGroup = []          # distinct merged member sets, in discovery order
newGroupCount = 0      # distinct merged sets that are not registry extensions
recount = 0            # distinct merged sets that are registry extensions
extentionKeys = {'mutual': [], 'dedup': [], 'registry': [], 'merging': []}

# Hashable snapshots of the sets already stored in newGroup, so the
# duplicate check is a constant-time lookup instead of a scan of the list.
seenGroups = set()

for dg, overlaps in intersectionGroups.items():
    isRegistryExtention = False
    if len(overlaps) > 1:
        # Several registry groups touch this dedup group: merge them all.
        groupMerging += 1
        extentionKeys['merging'].append(dg)
        ng = set()
        for dedup, registry, _ in overlaps:
            ng |= dedup
            ng |= registry
    else:
        dedup, registry, inte = overlaps[0]
        # `inte` is a subset of both sides, so "not smaller" means "equal".
        dedupHasExtra = len(inte) < len(dedup)
        registryHasExtra = len(inte) < len(registry)
        if dedupHasExtra and registryHasExtra:
            mutualExtention += 1
            extentionKeys['mutual'].append(dg)
        elif dedupHasExtra:
            dedupExtention += 1
            extentionKeys['dedup'].append(dg)
        elif registryHasExtra:
            registryExtention += 1
            isRegistryExtention = True
            extentionKeys['registry'].append(dg)
        ng = dedup.union(registry)

    fingerprint = frozenset(ng)
    if fingerprint in seenGroups:
        # An identical merged set was already recorded; do not count it twice.
        continue
    seenGroups.add(fingerprint)
    newGroup.append(ng)
    if isRegistryExtention:
        recount += 1
    else:
        newGroupCount += 1
|
|
|
|
|
|
# Write the distinct merged groups, one JSON array per line, both to their
# own file and to the combined report.
with open('./data/out/newGroups.txt', 'w') as fout:
    for group in newGroup:
        row = "%s\n" % json.dumps(list(group))
        fout.write(row)
        freport.write(row)

# This was the last section feeding the combined report.
freport.close()
|
|
def _write_extension_section(out, title, keys):
    """Write *title*, then one JSON line per dedup key in *keys*.

    Each line is the list of [dedup members, registry members, shared
    members] triples stored for that key in ``intersectionGroups``, with
    every set converted to a list so it can be serialised.
    """
    out.write(title)
    for key in keys:
        rows = [[list(part) for part in overlap]
                for overlap in intersectionGroups[key]]
        out.write("%s\n" % json.dumps(rows))


# Merged groups containing at least one member whose first four characters
# match each of these prefixes.
# NOTE(review): the prefixes presumably stand for the four source registries
# -- confirm against the input data.
requiredPrefixes = {'fair', 'roar', 're3d', 'open'}
temp = [
    list(group)
    for group in newGroup
    if requiredPrefixes <= {member[0:4] for member in group}
]
across4 = len(temp)

with open('./data/out/report.txt', 'w') as fout:
    # Summary counters.
    fout.write("Merged Groups = %s \n" % groupMerging)
    fout.write("Mutual Extention = %s \n" % mutualExtention)
    fout.write("Extention to registry group via dedup = %s \n" % dedupExtention)
    fout.write("Extention to dedup group via registry = %s \n" % registryExtention)
    fout.write("Number of new groups without duplicates = %s \n" % newGroupCount)
    fout.write("Number of registry extentions without duplicates = %s \n" % recount)
    fout.write("Number Groups with at least one entry for each registry = %s \n" % across4)

    fout.write("\n\n\n********** ENTRIES FROM ALL REGISTRY ************** \n\n\n")
    for group in temp:
        fout.write("%s\n" % json.dumps(group))

    # One detail section per classification, in the original report order.
    for label, kind in (('MUTUAL', 'mutual'), ('MERGING', 'merging'),
                        ('DEDUP', 'dedup'), ('REGISTRY', 'registry')):
        _write_extension_section(
            fout,
            "\n\n\n********** %s EXTENTION GROUPS ************** \n\n\n" % label,
            extentionKeys[kind],
        )
|