636 lines
18 KiB
Python
Executable File
636 lines
18 KiB
Python
Executable File
__docformat__ = 'reStructuredText en'
|
|
|
|
import setpath
|
|
import json
|
|
from hashlib import md5
|
|
from binascii import b2a_base64
|
|
import math
|
|
import collections
|
|
import functions
|
|
|
|
class graphpowerhash:
|
|
"""
|
|
.. function:: graphpowerhash(steps, [undirected_edge], node1, node2, [node1_details, edge_details, node2_details]) -> jpack of graph node hashes
|
|
|
|
Graph power hashing is based on a `power iteration algorithm <http://en.wikipedia.org/wiki/Power_iteration>`_
|
|
that calculates hashes on every processing step. The produced output, contains for every node in the input graph
|
|
a hash that "describes" its "surroundings".
|
|
|
|
Parameters:
|
|
|
|
:steps:
|
|
The *steps* option controls the number of steps that the power hashing will be executed for. Another
|
|
way to conceptualize the *steps* parameter is to think of it as the radius of the graph around
|
|
a particular node that the node's hash covers.
|
|
|
|
Steps parameter's possible value are:
|
|
|
|
- null (default). When steps=null, then steps is automatically set to number_of_nodes/2
|
|
- Positive integer value.
|
|
- -1 . Steps is set to number_of_nodes
|
|
- Negative integers, steps is set to number_of_nodes / absolute_value(steps)
|
|
|
|
:undirected_edge':
|
|
|
|
This option can only have the *null* value.
|
|
|
|
- Parameter absent. The graph is assumed to be directed.
|
|
- Parameter present and having a *null* value. The graph is assumed to be undirected
|
|
|
|
:node1,node2:
|
|
|
|
Node1 connects to Node2. If node1 doesn't connect to any node, then *node2*'s value should be null.
|
|
|
|
:node and edge details:
|
|
|
|
Optional details, that are processed with the graph's structure. In essence these
|
|
parameters define "tags" on the nodes and edges of the graph.
|
|
|
|
.. note::
|
|
The graph power hash algorithm is an experimental algorithm created by me, Lefteris Stamatogiannakis. I haven't
|
|
proved its correctness, so please use it with care. Due to its hash usage, there is a (very low probability)
|
|
that two different graphs could hash to the same power hash.
|
|
|
|
I would be very very thankfull to anyone knowledgable in graph theory, who could prove it to be wrong (or correct).
|
|
If the reader knows of a paper that describes another algorithm similar to this algorithm, i would be glad to be pointed towards it.
|
|
|
|
.. note::
|
|
The computational complexity of the powerhash algorithm is O(n * steps * average_node_degree). The optimal value for
|
|
the hash to fully cover the graph, is to set the steps parameter to *graph_diameter* / 2.
|
|
|
|
Right now for steps=null, we take the worse upper bound of n / 2, so the computational complexity becomes
|
|
O(n * ~(n/2) * average_node_degree).
|
|
|
|
Examples:
|
|
|
|
Directed graph:
|
|
|
|
>>> table1('''
|
|
... 1 2
|
|
... 2 3
|
|
... 3 4
|
|
... 4 5
|
|
... 5 3
|
|
... ''')
|
|
|
|
>>> sql("select graphpowerhash(null, a,b) from table1")
|
|
graphpowerhash(null, a,b)
|
|
------------------------------------------------------------------------------------------------------------------------------
|
|
["OaNj+OtIZPqcwjc3QVvKpg","Um7OU79ApcRNA2TKrdcBcA","ZyQT/AoKyjIkwWMNvceK2A","3vaHWSLU/H32HvHTVBkpUQ","+3uZjYUMSXwyZs7HFHKNVg"]
|
|
|
|
|
|
Above graph having its nodes renumbered (its powerhash is the same as above):
|
|
|
|
>>> table2('''
|
|
... 2 5
|
|
... 5 4
|
|
... 4 1
|
|
... 1 3
|
|
... 3 4
|
|
... ''')
|
|
|
|
>>> sql("select graphpowerhash(null, a,b) from table2")
|
|
graphpowerhash(null, a,b)
|
|
------------------------------------------------------------------------------------------------------------------------------
|
|
["OaNj+OtIZPqcwjc3QVvKpg","Um7OU79ApcRNA2TKrdcBcA","ZyQT/AoKyjIkwWMNvceK2A","3vaHWSLU/H32HvHTVBkpUQ","+3uZjYUMSXwyZs7HFHKNVg"]
|
|
|
|
|
|
Above graph with a small change (its hash differs from above graphs):
|
|
|
|
>>> table3('''
|
|
... 2 5
|
|
... 5 4
|
|
... 4 1
|
|
... 1 3
|
|
... 3 5
|
|
... ''')
|
|
|
|
>>> sql("select graphpowerhash(null, a,b) from table3")
|
|
graphpowerhash(null, a,b)
|
|
------------------------------------------------------------------------------------------------------------------------------
|
|
["APq1eISun1GpYjgUhiMrLA","NPPh9FLzC5cUedxldXV77Q","VVZ93zo6gePuMeRf6f00Zg","df/4yDABlitCTfOGut0NvA","lqo+lY4fcjqujlgsYr+3Yw"]
|
|
|
|
|
|
Actual testing of equality or inequality of above graphs:
|
|
|
|
>>> sql("select hashmd5( (select graphpowerhash(null, a,b) from table1) )=hashmd5( (select graphpowerhash(null, a,b) from table2) ) as grapheq")
|
|
grapheq
|
|
-------
|
|
1
|
|
|
|
>>> sql("select hashmd5( (select graphpowerhash(null, a,b) from table1) )=hashmd5( (select graphpowerhash(null, a,b) from table3) ) as grapheq")
|
|
grapheq
|
|
-------
|
|
0
|
|
|
|
|
|
Graph with only one node:
|
|
|
|
>>> sql("select graphpowerhash(null, a, null) from (select * from table1 limit 1)")
|
|
graphpowerhash(null, a, null)
|
|
-----------------------------
|
|
["TOiuilAk4RLkg01tIwyvcg"]
|
|
|
|
|
|
Undirected version of table1's graph:
|
|
|
|
>>> sql("select graphpowerhash(null, null, a,b) from table1")
|
|
graphpowerhash(null, null, a,b)
|
|
------------------------------------------------------------------------------------------------------------------------------
|
|
["JudlYSkYV7rFHjk94abY/A","W88IN4kgDSeVX9kaY36SJg","W88IN4kgDSeVX9kaY36SJg","6ez9ee0N2ogdvKJVQ8VKWA","7gz+LT/LtsyFc+GxMUlL8g"]
|
|
|
|
|
|
Same graph as above, but some of the edges have been reversed (the undirected powerhash matches the powerhash above):
|
|
|
|
>>> table4('''
|
|
... 2 1
|
|
... 2 3
|
|
... 3 4
|
|
... 4 5
|
|
... 3 5
|
|
... ''')
|
|
|
|
>>> sql("select graphpowerhash(null, null, a,b) from table4")
|
|
graphpowerhash(null, null, a,b)
|
|
------------------------------------------------------------------------------------------------------------------------------
|
|
["JudlYSkYV7rFHjk94abY/A","W88IN4kgDSeVX9kaY36SJg","W88IN4kgDSeVX9kaY36SJg","6ez9ee0N2ogdvKJVQ8VKWA","7gz+LT/LtsyFc+GxMUlL8g"]
|
|
|
|
|
|
Graph similarity, using the step parameter (value of step defines the radius of the similar subgraphs that can be found):
|
|
|
|
>>> sql("select jaccard( (select graphpowerhash(3, a, b) from table1), (select graphpowerhash(3, a, b) from table3) ) as jacsim")
|
|
jacsim
|
|
------
|
|
0.0
|
|
|
|
>>> sql("select jaccard( (select graphpowerhash(1, a, b) from table1), (select graphpowerhash(1, a, b) from table3) ) as jacsim")
|
|
jacsim
|
|
------
|
|
0.25
|
|
|
|
|
|
Powerhash of graph having details (using a chemical composition):
|
|
|
|
>>> table5('''
|
|
... 1 2 O = C
|
|
... 2 3 C = O
|
|
... ''')
|
|
|
|
First without details:
|
|
|
|
>>> sql("select graphpowerhash(null, null, a, b) from table5")
|
|
graphpowerhash(null, null, a, b)
|
|
----------------------------------------------------------------------------
|
|
["Rw3sDN24TI7YARBNOOmYSg","9m5wcZf9iUxDwgzQkzu6Ag","9m5wcZf9iUxDwgzQkzu6Ag"]
|
|
|
|
Second with all details:
|
|
|
|
>>> sql("select graphpowerhash(null, null, a, b, c, d, e) from table5")
|
|
graphpowerhash(null, null, a, b, c, d, e)
|
|
----------------------------------------------------------------------------
|
|
["CPebw+eZYzw5bWgx47/tkg","CPebw+eZYzw5bWgx47/tkg","WNn4aDDBKcoMMi+nrz5JEA"]
|
|
|
|
"""
|
|
|
|
registered=True
|
|
|
|
def __init__(self):
|
|
self.nodes={}
|
|
self.steps=None
|
|
|
|
def step(self, *args):
|
|
directed=True
|
|
argslen=len(args)
|
|
largs=args
|
|
|
|
if largs[0]!=None:
|
|
self.steps=largs[0]
|
|
|
|
if largs[1]==None:
|
|
directed=False
|
|
largs=list(largs)
|
|
del(largs[1])
|
|
argslen-=1
|
|
|
|
if directed:
|
|
if argslen>4:
|
|
edgedetailslr='1'+chr(30)+str(largs[4])
|
|
edgedetailsrl='0'+chr(30)+str(largs[4])
|
|
else:
|
|
edgedetailslr='1'
|
|
edgedetailsrl='0'
|
|
else:
|
|
if argslen>4:
|
|
edgedetailslr='1'+ chr(30)+str(largs[4])
|
|
edgedetailsrl=edgedetailslr
|
|
else:
|
|
edgedetailslr='1'
|
|
edgedetailsrl=edgedetailslr
|
|
|
|
if largs[1] not in self.nodes:
|
|
if argslen>3:
|
|
self.nodes[largs[1]]=[ [( largs[2],edgedetailslr )] , str(largs[3])]
|
|
else:
|
|
self.nodes[largs[1]]=[ [( largs[2],edgedetailslr )] , '']
|
|
else:
|
|
self.nodes[largs[1]][0].append( ( largs[2],edgedetailslr ) )
|
|
|
|
|
|
if largs[2]!=None:
|
|
if largs[2] not in self.nodes:
|
|
if argslen>5:
|
|
self.nodes[largs[2]]=[ [(largs[1],edgedetailsrl )], str(largs[5])]
|
|
else:
|
|
self.nodes[largs[2]]=[ [(largs[1],edgedetailsrl )] , '']
|
|
else:
|
|
self.nodes[largs[2]][0].append( ( largs[1],edgedetailsrl ) )
|
|
|
|
def final(self):
|
|
ncount=len(self.nodes)
|
|
|
|
for n,v in self.nodes.iteritems():
|
|
v[1]=str(len(v[0]))+chr(31)+v[1]
|
|
|
|
if ncount==1:
|
|
self.steps=1
|
|
|
|
if self.steps==None:
|
|
# Calculate approximate worse case diameter
|
|
degreeseq=set()
|
|
mindegree=ncount
|
|
maxdegree=0
|
|
invdegree=0.0
|
|
|
|
for n,v in self.nodes.iteritems():
|
|
ndegree=len(v[0])
|
|
mindegree=min(mindegree, ndegree)
|
|
maxdegree=max(maxdegree, ndegree)
|
|
degreeseq.add(ndegree)
|
|
invdegree+=1.0/ndegree
|
|
|
|
self.steps=int(min(
|
|
# Obvious upper bounds
|
|
ncount-max(2, maxdegree) + 2,
|
|
# P. Dankelmann "Diameter and inverse degree"
|
|
(3*invdegree+3)*math.log(ncount)/math.log(math.log(ncount)) if ncount>16 else ncount,
|
|
# Simon Mukwembi "A note on diameter and the degree sequence of a graph"
|
|
1+3*(ncount - len(degreeseq)+1)/float((mindegree+1)), ncount - len(degreeseq)+2))/2
|
|
|
|
if self.steps<0:
|
|
self.steps=ncount/abs(self.steps)
|
|
|
|
nhashes={}
|
|
|
|
for n,v in self.nodes.iteritems():
|
|
nhashes[n]=md5(str(v[1]+chr(30))).digest()
|
|
|
|
if ncount>1:
|
|
for s in xrange(self.steps):
|
|
nhashes1={}
|
|
nhashcount={}
|
|
for n, v in self.nodes.iteritems():
|
|
nhash=md5(v[1]+chr(30)+chr(30).join(sorted([nhashes[x]+chr(29)+y for x,y in v[0]]))).digest()
|
|
nhashes1[n]=nhash
|
|
if nhash in nhashcount:
|
|
nhashcount[nhash]+=1
|
|
else:
|
|
nhashcount[nhash]=1
|
|
nhashes=nhashes1
|
|
|
|
# TODO Find new upper bound of diameter via calculating Spanning Tree starting from distincthash
|
|
# if len(nhashcount)>0:
|
|
# distincthash=min([x for x,y in nhashcount.iteritems() if y==1])
|
|
|
|
|
|
return json.dumps([b2a_base64(x)[0:-3] for x in sorted(nhashes.values())], separators=(',',':'), ensure_ascii=False)
|
|
|
|
class graphtodot:
|
|
"""
|
|
.. function:: graphtodot(graphname, [undirected_edge], node1, node2, [node1_details, edge_details, node2_details]) -> graphviz dot graph
|
|
|
|
Returns the *Graphviz* DOT representation of the input graph.
|
|
|
|
Examples:
|
|
|
|
Directed graph:
|
|
|
|
>>> table1('''
|
|
... 1 2
|
|
... 2 3
|
|
... 3 4
|
|
... 4 5
|
|
... 5 3
|
|
... ''')
|
|
|
|
>>> sql("select graphtodot(null, a,b) from table1")
|
|
graphtodot(null, a,b)
|
|
------------------------------------------------------------------------
|
|
digraph {
|
|
"1" -> "2";
|
|
"2" -> "3";
|
|
"3" -> "4";
|
|
"4" -> "5";
|
|
"5" -> "3";
|
|
}
|
|
|
|
Undirected graph:
|
|
|
|
>>> table2('''
|
|
... 2 5
|
|
... 5 4
|
|
... 4 1
|
|
... 1 3
|
|
... 3 4
|
|
... ''')
|
|
|
|
>>> sql("select graphtodot(null, null, a,b) from table2")
|
|
graphtodot(null, null, a,b)
|
|
----------------------------------------------------------------------
|
|
graph {
|
|
"1" -- "3";
|
|
"2" -- "5";
|
|
"3" -- "4";
|
|
"4" -- "1";
|
|
"5" -- "4";
|
|
}
|
|
|
|
Graph with details:
|
|
|
|
>>> table5('''
|
|
... 1 2 O = C
|
|
... 2 3 C = O
|
|
... ''')
|
|
|
|
>>> sql("select graphtodot('chem_comp_1', null, a, b, c, d, e) from table5")
|
|
graphtodot('chem_comp_1', null, a, b, c, d, e)
|
|
------------------------------------------------------------------------------------------------------------------------
|
|
graph chem_comp_1 {
|
|
"1" [label="O"];
|
|
"1" -- "2" [label="="];
|
|
"2" [label="C"];
|
|
"2" -- "3" [label="="];
|
|
"3" [label="O"];
|
|
}
|
|
|
|
"""
|
|
|
|
registered=True
|
|
|
|
def __init__(self):
|
|
self.nodes={}
|
|
self.steps=None
|
|
self.graphname=None
|
|
self.directed=True
|
|
|
|
def step(self, *args):
|
|
directed=True
|
|
argslen=len(args)
|
|
largs=args
|
|
|
|
if largs[0]!=None:
|
|
self.graphname=largs[0]
|
|
|
|
if largs[1]==None:
|
|
self.directed=False
|
|
largs=list(largs)
|
|
del(largs[1])
|
|
argslen-=1
|
|
|
|
if argslen>4:
|
|
edgedetailslr=unicode(largs[4])
|
|
else:
|
|
edgedetailslr=None
|
|
|
|
if largs[1] not in self.nodes:
|
|
if argslen>3:
|
|
self.nodes[largs[1]]=[ [( largs[2],edgedetailslr )] , largs[3]]
|
|
else:
|
|
self.nodes[largs[1]]=[ [( largs[2],edgedetailslr )] , None]
|
|
else:
|
|
self.nodes[largs[1]][0].append( ( largs[2],edgedetailslr ) )
|
|
|
|
|
|
if largs[2]!=None:
|
|
if largs[2] not in self.nodes:
|
|
if argslen>5:
|
|
self.nodes[largs[2]]=[ [], largs[5]]
|
|
else:
|
|
self.nodes[largs[2]]=[ [] , None]
|
|
|
|
def final(self):
|
|
if self.graphname==None:
|
|
self.graphname=''
|
|
|
|
if type(self.graphname) in (int, float):
|
|
self.graphname=u'g'+unicode(self.graphname)
|
|
else:
|
|
self.graphname=unicode(self.graphname)
|
|
|
|
dot=u''
|
|
if self.directed:
|
|
dot=u'di'
|
|
|
|
if self.graphname==None:
|
|
self.graphname='""'
|
|
|
|
dot+=u'graph '+self.graphname+u' {\n'
|
|
|
|
digraph=False
|
|
|
|
for n,v in self.nodes.iteritems():
|
|
if v[1]!=None:
|
|
dot+=json.dumps(unicode(n))+' [label="'+unicode(v[1]).replace('"',"'")+'"];\n'
|
|
for e in v[0]:
|
|
dot+=json.dumps(unicode(n)) + ' '
|
|
if self.directed:
|
|
dot+='-> '
|
|
else:
|
|
dot+='-- '
|
|
dot += json.dumps(unicode(e[0]))
|
|
if e[1]!=None:
|
|
dot+=u' [label="'+unicode(e[1]).replace('"',"'")+'"]'
|
|
dot+=u';\n'
|
|
|
|
dot+='}'
|
|
|
|
return dot
|
|
|
|
class graphtotgf:
|
|
"""
|
|
.. function:: graphtotgf(node1, node2, [node1_details, edge_details, node2_details]) -> TGF graph
|
|
|
|
Returns the TGF representation of the input graph.
|
|
|
|
Examples:
|
|
|
|
>>> table1('''
|
|
... 1 2
|
|
... 2 3
|
|
... 3 4
|
|
... 4 5
|
|
... 5 3
|
|
... ''')
|
|
|
|
>>> sql("select graphtotgf(a,b) from table1") # doctest: +NORMALIZE_WHITESPACE
|
|
graphtotgf(a,b)
|
|
------------------------------------------
|
|
1
|
|
2
|
|
3
|
|
4
|
|
5
|
|
#
|
|
1 2
|
|
2 3
|
|
3 4
|
|
4 5
|
|
5 3
|
|
|
|
Graph with details:
|
|
|
|
>>> table5('''
|
|
... 1 2 O = C
|
|
... 2 3 C = O
|
|
... ''')
|
|
|
|
>>> sql("select graphtotgf(a, b, c, d, e) from table5")
|
|
graphtotgf(a, b, c, d, e)
|
|
--------------------------
|
|
1 O
|
|
2 C
|
|
3 O
|
|
#
|
|
1 2 =
|
|
2 3 =
|
|
|
|
"""
|
|
|
|
registered=True
|
|
|
|
def __init__(self):
|
|
self.nodes={}
|
|
self.steps=None
|
|
self.directed=True
|
|
|
|
def step(self, *args):
|
|
argslen=len(args)
|
|
largs=args
|
|
|
|
if argslen>3:
|
|
edgedetailslr=unicode(largs[3])
|
|
else:
|
|
edgedetailslr=None
|
|
|
|
if largs[0] not in self.nodes:
|
|
if argslen>2:
|
|
self.nodes[largs[0]]=[ [( largs[1],edgedetailslr )] , largs[2]]
|
|
else:
|
|
self.nodes[largs[0]]=[ [( largs[1],edgedetailslr )] , None]
|
|
else:
|
|
self.nodes[largs[0]][0].append( ( largs[1],edgedetailslr ) )
|
|
|
|
|
|
if largs[1]!=None:
|
|
if largs[1] not in self.nodes:
|
|
if argslen>4:
|
|
self.nodes[largs[1]]=[ [], largs[4]]
|
|
else:
|
|
self.nodes[largs[1]]=[ [] , None]
|
|
|
|
def final(self):
|
|
tgf=''
|
|
|
|
def clearname(n):
|
|
return unicode(n).replace(' ','_').replace('"',"'")
|
|
|
|
for n,v in self.nodes.iteritems():
|
|
tgf+=clearname(n) + ' ' + (clearname(v[1]) if v[1]!=None else '') +'\n'
|
|
|
|
tgf+='#\n'
|
|
|
|
for n,v in self.nodes.iteritems():
|
|
for e in v[0]:
|
|
tgf+=clearname(n)+ ' ' + clearname(e[0])+ ' '+ (clearname(e[1]) if e[1]!=None else '') + '\n'
|
|
|
|
return tgf
|
|
|
|
class graphcliques:
|
|
"""
|
|
.. function:: graphcliques(node1, node2) -> graph cliques
|
|
|
|
Finds and returns the cliques in the graph defined by the node1<->node2 links.
|
|
|
|
Examples:
|
|
|
|
>>> table1('''
|
|
... n1 n2
|
|
... n2 n3
|
|
... n1 n3
|
|
... n3 n4
|
|
... n4 n5
|
|
... n5 n3
|
|
... n1 n6
|
|
... ''')
|
|
|
|
>>> sql("select graphcliques(a,b) from table1") # doctest: +NORMALIZE_WHITESPACE
|
|
cliqueid | nodeid
|
|
-----------------
|
|
0 | n1
|
|
0 | n2
|
|
0 | n3
|
|
1 | n3
|
|
1 | n4
|
|
1 | n5
|
|
|
|
"""
|
|
|
|
registered=True
|
|
|
|
def __init__(self):
|
|
self.G = collections.defaultdict(set)
|
|
|
|
def step(self, *args):
|
|
if len(args)!=2:
|
|
raise functions.OperatorError('graphcliques', 'Two parameters should be provided')
|
|
|
|
self.G[args[0]].add(args[1])
|
|
self.G[args[1]].add(args[0])
|
|
|
|
def _bors_kerbosch(self, R, P, X):
|
|
if not P and not X: # if both are empty
|
|
if len(R) > 2:
|
|
yield sorted(R)
|
|
return
|
|
|
|
pivot = max(((len(self.G[v]), v) for v in P.union(X)))[1]
|
|
|
|
for v in P.difference(self.G[pivot]):
|
|
for c in self._bors_kerbosch(R.union((v,)), P.intersection(self.G[v]), X.intersection(self.G[v])):
|
|
yield c
|
|
P.remove(v)
|
|
X.add(v)
|
|
|
|
def final(self):
|
|
cid = 0
|
|
|
|
yield ('cliqueid', 'nodeid')
|
|
for c in self._bors_kerbosch(set([]), set(self.G.keys()), set([])):
|
|
for n in c:
|
|
yield cid, n
|
|
cid += 1
|
|
|
|
if not ('.' in __name__):
|
|
"""
|
|
This is needed to be able to test the function, put it at the end of every
|
|
new function you create
|
|
"""
|
|
import sys
|
|
import setpath
|
|
from functions import *
|
|
testfunction()
|
|
if __name__ == "__main__":
|
|
reload(sys)
|
|
sys.setdefaultencoding('utf-8')
|
|
import doctest
|
|
doctest.testmod()
|