interactive-mining/interactive-mining-3rdparty.../madis/src/functions/aggregate/mining.py

416 lines
13 KiB
Python
Executable File

import re
import itertools
import setpath
import functions
import lib.jopts as jopts
from operator import itemgetter
import random
__docformat__ = 'reStructuredText en'
re_params=re.compile('(\w*):(.*)')
def consumer(func):
"""A decorator, advances func to its first yield point when called.
"""
from functools import wraps
@wraps(func)
def wrapper(*args,**kw):
gen = func(*args, **kw)
gen.next()
return gen
return wrapper
class freqitemsets:
"""
.. function:: freqitemsets(datacol, [threshold, noautothres, stats, maxlen]) -> [itemset_id:int, itemset_length:int, itemset_frequency:int, item:text]
Calculates frequent itemsets on a given column (datacol). The algorithm is tuned for the
case when we have many different items (in the order of millions), many input itemsets, but
small itemset length (10-20).
Returned table schema:
:itemset_id: Automatic itemset id
:itemset_length: Length of itemset
:itemset_frequency: How many times an itemset has been found
:item: Itemset's item value
Parameters:
:datacol:
Column on which to calculate frequent itemsets
:threshold: Default is 2
How many times an freq. itemset must appear for it to appear in the results
:noautothres: 1/0 (Default is 0)
Do not calculate the threshold automatically
:stats: 1/0 (Default is 0)
Return frequent itemset statistics
:maxlen: NUMBER (Default is no limit at all)
Maximum itemset length to search
Examples:
>>> table1('''
... 'car wood bike' 'first group'
... 'car car wood' 'first group'
... 'car wood' 'first group'
... 'car wood ice' 'first group'
... 'ice' 'second group'
... 'car ice' 'second group'
... 'car cream toy' 'second group'
... 'icecream ice car toy' 'second group'
... ''')
>>> sql("select b,freqitemsets(a, 'threshold:2', 'noautothres:1', 'maxlen:2') from table1 group by b")
b | itemset_id | itemset_length | itemset_frequency | item
---------------------------------------------------------------------
first group | 1 | 1 | 4 | wood
first group | 2 | 1 | 4 | car
first group | 3 | 2 | 4 | car
first group | 3 | 2 | 4 | wood
second group | 1 | 1 | 3 | ice
second group | 2 | 1 | 3 | car
second group | 3 | 1 | 2 | toy
second group | 4 | 2 | 2 | car
second group | 4 | 2 | 2 | ice
second group | 5 | 2 | 2 | car
second group | 5 | 2 | 2 | toy
>>> sql("select b,freqitemsets(a, 'stats:1') from table1 group by b")
b | MaxTransactionLength | CombinationCount | PassedTransactions | ValidKeywords
-------------------------------------------------------------------------------------------
first group | 3 | 2 | 3 | 2
first group | 3 | 1 | 1 | 2
first group | 3 | 0 | 0 | 0
second group | 4 | 3 | 3 | 3
second group | 4 | 0 | 3 | 0
"""
registered=True
multiset=True
def __init__(self):
self.threshold=2
self.startingthreshold=2
self.autothres=1
self.compress=0
self.initstatic=False
self.input={}
self.maxlength=0
self.kwcode={}
self.codekw={}
self.maxkwcode=0
self.overthres={}
self.belowthres={}
self.passedkw={}
self.init=True
self.itemset_id=0
self.maxlen=None
self.stats=False
def initargs(self, args):
self.init=False
for i in xrange(1, len(args)):
v=re_params.match(args[i])
if v is not None and v.groups()[0]!='' and v.groups()[1]!='' and i>0:
v=v.groups()
if v[0]=='threshold':
try:
self.threshold=int(v[1])
self.startingthreshold=self.threshold
except KeyboardInterrupt:
raise
except:
raise functions.OperatorError("FreqItemsets",'No integer value given for threshold')
if v[0]=='noautothres':
self.autothres=0
if v[0]=='compress':
self.compress=1
if v[0]=='maxlen':
self.maxlen=int(v[1])
if v[0]=='stats':
self.stats=True
def demultiplex(self, data):
iterable=None
iterpos=-1
for i in xrange(len(data)):
if hasattr(data[i],'__iter__')==True:
iterable=data[i]
iterpos=i
break
if iterpos==-1:
yield list(data)
else:
pre=list(data[0:iterpos])
post=list(data[iterpos+1:])
for i in iterable:
if hasattr(i,'__iter__')==False:
yield pre+[i]+post
else:
yield pre+list(i)+post
def insertcombfreq(self, comb, freq):
if comb in self.overthres:
self.overthres[comb]+=freq
else:
if comb in self.belowthres:
self.belowthres[comb]+=freq
else:
self.belowthres[comb]=freq
if self.belowthres[comb]>=self.threshold:
self.overthres[comb]=self.belowthres[comb]
del(self.belowthres[comb])
for k in comb:
if self.compress==0:
self.passedkw[k]=True
elif not k in self.passedkw:
self.passedkw[k]=self.overthres[comb]
else:
self.passedkw[k]+=self.overthres[comb]
def insertitemset(self, itemset):
if itemset not in self.input:
self.input[itemset]=1
else:
self.input[itemset]+=1
def cleanitemsets(self, minlength):
newitemsets={}
for k,v in self.input.iteritems():
itemset=tuple(i for i in k if i in self.passedkw)
if self.compress==1:
esoteric_itemset=tuple(i for i in itemset if self.passedkw[i]==v)
if len(esoteric_itemset)>0:
if len(itemset)>=minlength:
self.overthres[itemset]=v
itemset=tuple(i for i in itemset if self.passedkw[i]!=v)
if len(itemset)>=minlength:
if itemset not in newitemsets:
newitemsets[itemset]=v
else:
newitemsets[itemset]+=v
self.input=newitemsets
def step(self, *args):
if self.init==True:
self.initargs(args)
if len(args[0])==0:
return
itms=sorted(set(args[0].split(' ')))
itms=[x for x in itms if x!='']
li=len(itms)
if li>0:
if li>self.maxlength:
self.maxlength=li
inputkws=[]
for kw in itms:
if len(kw)==0:
print itms, args[0], len(args[0]), li
if kw not in self.kwcode:
self.kwcode[kw]=self.maxkwcode
self.codekw[self.maxkwcode]=kw
inputkws.append(self.maxkwcode)
self.insertcombfreq( (self.maxkwcode,),1 )
self.maxkwcode+=1
else:
itm=self.kwcode[kw]
self.insertcombfreq( (itm,),1 )
inputkws.append(itm)
if len(inputkws)>1:
self.insertitemset(tuple(inputkws))
def final(self):
if not self.stats:
yield ('itemset_id', 'itemset_length', 'itemset_frequency', 'item')
else:
yield ('MaxTransactionLength', 'CombinationCount', 'PassedTransactions', 'ValidKeywords')
splist=[{},{}]
del(self.kwcode)
splist[1]=self.overthres
if self.stats:
yield [self.maxlength, len(splist[1]), len(self.input), len(self.passedkw)]
if not self.stats:
for its,v in sorted(splist[1].items(), key=itemgetter(1),reverse=True):
self.itemset_id+=1
for i in self.demultiplex( (self.itemset_id, len([self.codekw[i] for i in its]), v, [self.codekw[i] for i in its]) ):
yield i
if self.maxlen==None:
self.maxlen=self.maxlength
for l in xrange(2, min(self.maxlength+1, self.maxlen+1)):
splist.append({})
self.belowthres={}
self.overthres={}
prevl=l-1
# Autothresholding
if self.autothres==1:
if len(self.input)==0 or len(self.passedkw)==0:
break
else:
self.threshold=self.startingthreshold + int(len(self.passedkw)/len(self.input))
self.cleanitemsets(l)
self.passedkw={}
prevsplist = splist[prevl]
icombs = itertools.combinations
insertcomb = self.insertcombfreq
for k,v in self.input.iteritems():
for k in icombs(k,l):
insertit=True
for i1 in icombs(k, prevl):
if i1 not in prevsplist:
insertit=False
break
if insertit:
insertcomb( k,v )
splist[l-1]={}
splist[l]=self.overthres
if self.stats:
yield [self.maxlength, len(splist[l]), len(self.input), len(self.passedkw)]
if not self.stats:
for its,v in sorted(splist[l].items(), key=itemgetter(1),reverse=True):
self.itemset_id+=1
for i in self.demultiplex( (self.itemset_id, len([self.codekw[i] for i in its]), v, [self.codekw[i] for i in its]) ):
yield i
del(self.overthres)
del(self.belowthres)
del(self.passedkw)
del(self.input)
del(self.codekw)
del(splist)
class sampledistvals:
"""
.. function:: sampledistvals(sample_size, C1, C2, C3) -> [C1, C2, C3]
Sampledistvals returns sample_size distinct values for each of the input C1..Cn columns.
>>> table1('''
... test1 2 3
... test1 2 3
... test2 4 2
... test4 2 t
... ''')
>>> sql("select sampledistvals(3, a, b, c) from table1")
C1 | C2 | C3
---------------------------------------------
["test1","test2","test4"] | [2,4] | [2,3,"t"]
"""
registered=True
def __init__(self):
self.vals=None
self.lenargs = -1
self.init=True
def step(self, *args):
if self.init:
self.lenargs = len(args)
self.vals = a=[set() for i in xrange(self.lenargs-1)]
self.init = False
for i in xrange(1, self.lenargs):
if len(self.vals[i-1])<args[0] and args[i] not in self.vals[i-1]:
self.vals[i-1].add(args[i])
def final(self):
yield tuple(['C'+str(i) for i in xrange(1, self.lenargs)] )
yield [jopts.toj(list(i)) for i in self.vals]
class samplegroup:
"""
.. function:: samplegroup(sample_size, C1, C2, C3)
Returns a random sample_size set of rows.
>>> table1('''
... test1 2 3
... test1 2 3
... test2 4 2
... test4 2 t
... ''')
>>> sql("select samplegroup(2, a, b, c) from table1") # doctest: +ELLIPSIS
C1 | C2 | C3
---------------
...
>>> sql("select samplegroup(2) from (select 5 where 5=6)") # doctest: +ELLIPSIS
"""
registered=True
def __init__(self):
self.samplelist = []
self.index = 0
self.random = random.randint
def step(self, *args):
# Generate the reservoir
if self.index < args[0]:
self.samplelist.append(args)
else:
r = self.random(0, self.index)
if r < args[0]:
self.samplelist[r] = args
self.index += 1
def final(self):
if self.samplelist == []:
yield tuple(['C1'])
else:
yield tuple(['C'+str(i) for i in xrange(1, len(self.samplelist[0]))] )
for r in self.samplelist:
yield list(r[1:])
if not ('.' in __name__):
"""
This is needed to be able to test the function, put it at the end of every
new function you create
"""
import sys
import setpath
from functions import *
testfunction()
if __name__ == "__main__":
reload(sys)
sys.setdefaultencoding('utf-8')
import doctest
doctest.testmod()