# NOTE: setpath is imported first, ahead of the project-local imports below;
# keep this order.
import setpath
import Queue
import functions
from lib import iso8601
from operator import itemgetter
from lib import jopts
from lib.unicodeops import unistr

__docformat__ = 'reStructuredText en'


def timedelta2millisec(tdelta):
    """Return the length of the timedelta *tdelta* in whole milliseconds.

    The microsecond component is truncated to milliseconds, so that all three
    terms of the sum are expressed in the same unit.
    """
    return (tdelta.days * 24 * 60 * 60 * 1000
            + tdelta.seconds * 1000
            + tdelta.microseconds // 1000)


class groupsum:
    """
    .. function:: groupsum(n,col1,col2,col3,....)

    It groups by the first n columns of the input, and sums/jsets the rest.

    :Returned schema:
        Columns are automatically named as c1, c2 ...

    Examples:

    >>> table1('''
    ... aa t1 43
    ... ac t2 34
    ... aa t3 12
    ... ab t4 21
    ... ac t5 14
    ... as t6 23
    ... ''')
    >>> sql("select groupsum(1,a,b,c) from table1")
    c1 | c2          | c3
    ---------------------
    ac | ["t2","t5"] | 48
    aa | ["t1","t3"] | 55
    ab | t4          | 21
    as | t6          | 23
    """

    registered = True

    def __init__(self):
        self.notchecked = True   # True until the first row has been validated
        self.groupsdict = {}     # group key tuple -> one value list per column
        self.grouplen = 0        # number of leading columns forming the key
        self.numofargs = 0       # argument count of the first row

    def step(self, *args):
        """Accumulate one input row: args[0] is n, then the n key columns,
        then the columns to aggregate."""
        if self.notchecked:
            if len(args) < 2:
                raise functions.OperatorError("groupsum", "Wrong number of arguments")
            self.grouplen = args[0]
            self.numofargs = len(args)
            self.notchecked = False

        groupkey = args[1:self.grouplen + 1]
        values = args[self.grouplen + 1:]
        group = self.groupsdict.get(groupkey)
        if group is None:
            # First row of this group: start one list per aggregated column.
            self.groupsdict[groupkey] = [[x] for x in values]
        else:
            for column, value in zip(group, values):
                column.append(value)

    def final(self):
        """Yield the header (c1, c2, ...) and then one row per group."""
        yield tuple('c' + str(i) for i in xrange(1, self.numofargs))

        for groupkey, sumcols in self.groupsdict.iteritems():
            cols = list(groupkey)
            for col in sumcols:
                try:
                    cols.append(sum(col))
                except TypeError:
                    # Values that cannot be summed are merged into a sorted,
                    # de-duplicated jset instead.
                    cols.append(jopts.toj(sorted(set(jopts.fromj(*col)))))
            yield cols


class groupmax:
    """
    .. function:: groupmax(n,col1,col2,col3,....)

    It groups by the first n columns of the input, and returns the maximum
    value of the rest.

    :Returned schema:
        Columns are automatically named as c1, c2 ...

    Examples:

    >>> table1('''
    ... aa t1 43
    ... ac t2 34
    ... aa t3 12
    ... ab t4 21
    ... ac t5 14
    ... as t6 23
    ... ''')
    >>> sql("select groupmax(1,a,b,c) from table1")
    c1 | c2 | c3
    ------------
    ac | t5 | 34
    aa | t3 | 43
    ab | t4 | 21
    as | t6 | 23
    """

    registered = True

    def __init__(self):
        self.notchecked = True   # True until the first row has been validated
        self.groupsdict = {}     # group key tuple -> one value list per column
        self.grouplen = 0        # number of leading columns forming the key
        self.numofargs = 0       # argument count of the first row

    def step(self, *args):
        """Accumulate one input row: args[0] is n, then the n key columns,
        then the columns to aggregate."""
        if self.notchecked:
            if len(args) < 2:
                raise functions.OperatorError("groupmax", "Wrong number of arguments")
            self.grouplen = args[0]
            self.numofargs = len(args)
            self.notchecked = False

        groupkey = args[1:self.grouplen + 1]
        values = args[self.grouplen + 1:]
        group = self.groupsdict.get(groupkey)
        if group is None:
            # First row of this group: start one list per aggregated column.
            self.groupsdict[groupkey] = [[x] for x in values]
        else:
            for column, value in zip(group, values):
                column.append(value)

    def final(self):
        """Yield the header (c1, c2, ...) and then one row per group holding
        the group key followed by the maximum of every remaining column."""
        yield tuple('c' + str(i) for i in xrange(1, self.numofargs))

        for groupkey, sumcols in self.groupsdict.iteritems():
            cols = list(groupkey)
            for col in sumcols:
                cols.append(max(col))
            yield cols


class condbreak:
    """
    .. function:: condbreak(groupid, C1, C2 ,...., condition, orderby) -> [bgroupid,C1,C2....]

    Returns an expanded *groupid* and the *value1...valueN*, performing new
    groupings when condition is true. Rows grouped together are the ones that
    order by *orderby* column have no intermediate true values for *condition*.

    :Returned multiset schema:
        - *bgroupid*
            *groupid* appended with an integer value indicating the subgroup
            of the row.
        - *C1, C2 ..*
            The input values of the row.

    .. seealso::

       * :ref:`tutmultiset` functions

    >>> table1('''
    ... 1 user1 open
    ... 2 user1 read
    ... 3 user1 close
    ... 4 user1 open
    ... 5 user1 write
    ... 6 user1 close
    ... 7 user2 open
    ... 8 user2 write
    ... ''')
    >>> sql("select condbreak(b,c,c='open',a) from table1 group by b")
    bgroupid | C1
    ----------------
    user11   | open
    user11   | read
    user11   | close
    user12   | open
    user12   | write
    user12   | close
    user21   | open
    user21   | write
    >>> sql("select condbreak(b,c,c='open',a) from (select 4 as a, 6 as b, 9 as c where c!=9)")
    """

    registered = True
    multiset = True

    def __init__(self):
        self.vals = []   # buffered input rows, each stored as a list

    def step(self, *args):
        """Validate the argument count and buffer the row."""
        if not args:
            raise functions.OperatorError("condbreak", "No arguments")
        if len(args) < 4:
            raise functions.OperatorError("condbreak", "Wrong number of arguments")
        self.vals.append(list(args))

    def final(self):
        """Sort the rows by the orderby column (last argument) and yield the
        header followed by [bgroupid, C1, ...] for every row."""
        self.vals.sort(key=itemgetter(-1))

        if self.vals == []:
            size = 0
        else:
            size = len(self.vals[0]) - 2
        if size <= 0:
            yield ("bgroupid", "C1")
        else:
            yield tuple(["bgroupid"] + ["C" + str(i + 1) for i in xrange(size - 1)])

        counter = 0
        for el in self.vals:
            # Equality (not identity) on purpose: an integer 1 also compares
            # equal to True.
            if el[-2] == True:
                counter += 1
            bid = unistr(el[0]) + str(counter)
            yield [bid] + el[1:-2]


class datediffbreak:
    """
    .. function:: datediffbreak(groupid, C1, C2 ,...., date, maxdiff[,'order',orderbycol1,orderbycol2,...]) -> [bgroupid,C1,C2....]

    Returns an expanded *groupid* and the *value1...valueN*, performing new
    groupings when subsequent rows *date* values differ more than *maxdiff*
    milliseconds. Rows grouped together are the ones that order by *orderby*
    column or if omitted by the given order have less *date* distance than
    *maxdiff*. Input dates should be in ISO 8601 format.

    :Returned multiset schema:
        - *bgroupid*
            *groupid* appended with an integer value indicating the subgroup
            of the row.
        - *C1, C2 ..*
            The input values of the row.

    .. seealso::

       * :ref:`tutmultiset` functions

    >>> table1('''
    ... 1 session1 '2007-01-01 00:03:13'
    ... 2 session1 '2007-01-01 00:03:27'
    ... 3 session1 '2007-01-01 00:03:36'
    ... 4 session2 '2007-01-01 00:04:39'
    ... 5 session2 '2007-01-01 00:04:40'
    ... 6 session3 '2007-01-01 00:04:49'
    ... 7 session3 '2007-01-01 00:04:59'
    ... ''')
    >>> sql("select datediffbreak(b,a,c,10*1000,'order',c,a) from table1 group by b")
    bgroupid  | C1
    --------------
    session10 | 1
    session11 | 2
    session11 | 3
    session20 | 4
    session20 | 5
    session30 | 6
    session30 | 7

    .. doctest::
        :hide:

        >>> sql("select datediffbreak(b,c,c='open',a) from (select 4 as a, 6 as b, 9 as c where c!=9)")
        bgroupid | C1
        ---------------
        None     | None
        >>> sql("select datediffbreak(b,a,c,10*1000,a,c) from table1 group by b") #doctest:+ELLIPSIS +NORMALIZE_WHITESPACE
        Traceback (most recent call last):
        ...
        OperatorError: Madis SQLError:
        Operator DATEDIFFBREAK: Wrong date format: 1
    """

    registered = True

    def __init__(self):
        self.vals = []         # buffered rows (maxdiff and 'order' removed)
        self.init = True       # True until the first row has been inspected
        self.position = None   # index of the 'order' marker, if present
        self.comparesize = 0   # number of trailing order-by columns
        self.fullsize = 0

    def initargs(self, args):
        """Validate the first row and locate maxdiff and the optional
        'order' marker.

        Without a marker maxdiff is the last argument; with one it is the
        argument right before the first argument equal to 'order'.
        """
        self.init = False
        if not args:
            raise functions.OperatorError("datediffbreak", "No arguments")
        if len(args) < 4:
            raise functions.OperatorError("datediffbreak", "Wrong number of arguments")

        self.maxdiff = args[-1]
        for i, arg in enumerate(args):
            if arg == 'order':
                self.position = i
                self.maxdiff = args[i - 1]
                self.comparesize = len(args) - (i + 1)
                if len(args) < 5:
                    raise functions.OperatorError("datediffbreak", "Wrong number of arguments")
                break

    def step(self, *args):
        """Buffer the row without its maxdiff (and 'order') arguments."""
        if self.init:
            self.initargs(args)

        if not self.position:
            self.vals.append(list(args[:-1]))
        else:
            self.vals.append(list(args[:self.position - 1] + args[self.position + 1:]))

    def final(self):
        """Yield the header and then [bgroupid, C1, ...] per row, starting a
        new subgroup whenever two consecutive dates differ by more than
        maxdiff milliseconds."""
        if self.position:
            self.vals.sort(key=lambda x: tuple(x[-self.comparesize:]))

        if self.vals == []:
            size = 0
        else:
            size = len(self.vals[0]) - self.comparesize - 1

        if size <= 0:
            yield ("bgroupid", "C1")
            yield [None, None]
            return

        yield tuple(["bgroupid"] + ["C" + str(i) for i in xrange(1, size)])

        counter = 0
        dt = None
        # The date sits right before the trailing order-by columns.
        dtpos = self.comparesize + 1
        for el in self.vals:
            try:
                dtnew = iso8601.parse_date(el[-dtpos])
            except Exception:
                raise functions.OperatorError("datediffbreak", "Wrong date format: %s" % (el[-dtpos]))
            if dt is not None and timedelta2millisec(dtnew - dt) > self.maxdiff:
                counter += 1
            dt = dtnew
            bid = unistr(el[0]) + str(counter)
            yield [bid] + el[1:-dtpos]


class datediffnewsesid:
    """
    .. function:: datediffnewsesid(maxdiff, date, groupid, C1, C2 ,....) -> [bgroupid, C1, C2, ...]

    Returns only the C1, C2, ... that should be updated with bgroupid, so as
    for the input groups to be effectively broken on maxdiff times (maxdiff
    is compared against the date difference in seconds).
    Input dates should be in ISO 8601 format.

    :Returned multiset schema:
        - *bgroupid*
            *groupid* appended with an integer value indicating the subgroup
            of the row.
        - *C1, C2 ..*
            The input values of the row.

    >>> table1('''
    ... 1 session1 '2007-01-01 00:03:13'
    ... 2 session1 '2007-01-01 00:03:27'
    ... 3 session1 '2007-01-01 00:03:36'
    ... 4 session2 '2007-01-01 00:04:39'
    ... 5 session2 '2007-01-01 00:05:40'
    ... 6 session3 '2007-01-01 00:04:49'
    ... 7 session3 '2007-01-01 00:06:59'
    ... 8 session3 '2007-01-01 00:06:59'
    ... 9 session4 '2007-01-01 00:04:59'
    ... ''')
    >>> sql("select datediffnewsesid(10, c, b, a) from table1 group by b")
    bgroupid  | C1
    --------------
    session11 | 2
    session11 | 3
    session21 | 5
    session31 | 7
    session31 | 8

    .. doctest::
        :hide:

        >>> sql("select datediffnewsesid(10, c, b, a) from (select 4 as a, 6 as b, 9 as c where c!=9)")
        bgroupid | C1
        ---------------
        None     | None
        >>> table2('''
        ... 1 session1 '2007-05-01T21:10:51Z'
        ... 9 session1 '2012-01-08T18:24:32Z'
        ... ''')
        >>> sql("select datediffnewsesid(1800, c, b, a) from table2 group by b")
        bgroupid  | C1
        --------------
        session11 | 9
    """

    registered = True

    def __init__(self):
        self.vals = []      # buffered rows: [date, groupid, C1, C2, ...]
        self.init = True    # True until the first row has been inspected
        self.maxdiff = 0    # break threshold, compared against seconds

    def initargs(self, args):
        """Validate the first row and read maxdiff from its first argument."""
        self.init = False
        if not args:
            raise functions.OperatorError("datediffnewsesid", "No arguments")
        if len(args) < 4:
            raise functions.OperatorError("datediffnewsesid", "Wrong number of arguments")
        self.maxdiff = args[0]

    def step(self, *args):
        """Buffer the row without its leading maxdiff argument."""
        if self.init:
            self.initargs(args)
        self.vals.append(list(args[1:]))

    def final(self):
        """Yield the header and then [bgroupid, C1, ...] only for the rows
        that come after the first break of their group."""
        lenofvals = len(self.vals)
        if lenofvals <= 0:
            yield ("bgroupid", "C1")
            # Placeholder row sized to match the two-column header.
            yield [None, None]
            return

        yield tuple(["bgroupid"] + ["C" + str(i) for i in xrange(1, len(self.vals[0]) - 1)])

        counter = 0
        if lenofvals != 1:
            for el in self.vals:
                try:
                    # Prepend the parsed date; each row becomes
                    # [parsed, date, groupid, C1, ...].
                    el.insert(0, iso8601.parse_date(el[0]))
                except Exception:
                    raise functions.OperatorError("datediffnewsesid", "Wrong date format: %s" % (el[0]))
            self.vals.sort(key=itemgetter(0))

            dt = self.vals[0][0]
            for el in self.vals[1:]:
                dtnew = el[0]
                diff = dtnew - dt
                dt = dtnew
                if (diff.days * 86400 + diff.seconds) > self.maxdiff:
                    counter += 1
                # Rows before the first break keep their id and are skipped.
                if counter != 0:
                    bid = unistr(el[2]) + str(counter)
                    yield [bid] + el[3:]


class datedifffilter:
    """
    .. function:: datedifffilter(maxdiff, date, C1, C2 ....) -> [date,C1,C2....]

    Returns only a subset of the provided entries, performing a sort of entry
    clustering based on the entries date difference. Each cluster is
    represented by the latest entry.
    The first argument defines the time difference threshold that is employed
    for entry clustering, and it is provided in seconds.
    The second argument is assumed to contain the date column. Entries are
    assumed to be provided in an ascending order by the date column.
    Input dates should be in ISO 8601 format.
    All subsequent columns remain unchanged.

    :Returned multiset schema:
        - *date, C1, C2 ..*
            The selected input values of the row.

    .. seealso::

       * :ref:`tutmultiset` functions

    >>> table1('''
    ... 2010-01-01T01:32:03Z value1
    ... 2010-01-01T01:32:04Z value2
    ... 2010-01-01T01:32:06Z value3
    ... 2010-01-01T01:32:08Z value4
    ... 2010-01-01T01:32:29Z value5
    ... 2010-01-01T02:35:03Z value6
    ... 2010-01-01T02:35:04Z value7
    ... 2010-01-01T03:55:04Z value8
    ... ''')
    >>> sql("select datedifffilter(20, a,b) from table1")
    date                 | C1
    -----------------------------
    2010-01-01T01:32:08Z | value4
    2010-01-01T01:32:29Z | value5
    2010-01-01T02:35:04Z | value7
    2010-01-01T03:55:04Z | value8

    >>> table1('''
    ... 2010-01-01T01:32:03Z value1
    ... ''')
    >>> sql("select datedifffilter(20, a,b) from table1")
    date                 | C1
    -----------------------------
    2010-01-01T01:32:03Z | value1

    >>> table1('''
    ... '2010-01-01 01:32:03' value1
    ... '2010-01-01 01:32:04' value2
    ... '2010-01-01 01:32:06' value3
    ... '2010-01-01 01:32:08' value4
    ... '2010-01-01 01:32:29' value5
    ... '2010-01-01 02:35:03' value6
    ... '2010-01-01 02:35:04' value7
    ... '2010-01-01 03:55:04' value8
    ... ''')
    >>> sql("select datedifffilter(30, a,b) from table1")
    date                | C1
    ----------------------------
    2010-01-01 01:32:29 | value5
    2010-01-01 02:35:04 | value7
    2010-01-01 03:55:04 | value8
    """

    registered = True

    def __init__(self):
        self.init = True    # True until the first row has been inspected
        self.vals = []      # buffered rows: [date, C1, C2, ...]
        self.maxdiff = 0    # clustering threshold in seconds

    def initargs(self, args):
        """Validate the first row and read maxdiff from its first argument."""
        self.init = False
        if not args:
            raise functions.OperatorError("datedifffilter", "No arguments")
        if len(args) < 2:
            raise functions.OperatorError("datedifffilter", "Wrong number of arguments")
        self.maxdiff = args[0]

    def step(self, *args):
        """Buffer the row without its leading maxdiff argument."""
        if self.init == True:
            self.initargs(args)
        self.vals.append(list(args[1:]))

    def final(self):
        """Yield the header and then the latest entry of every cluster.

        NOTE(review): the pairwise comparison below was reconstructed from
        the docstring examples because the original statements were damaged
        in this copy of the file; confirm against the upstream source.
        """
        lenofvals = len(self.vals)
        if lenofvals == 0:
            yield ("date", "C1")
            yield [None, None]
            return

        yield tuple(["date"] + ["C" + str(i) for i in xrange(1, len(self.vals[0]))])

        if lenofvals == 1:
            # A single entry is its own cluster; its date is not parsed.
            yield self.vals[0]
            return

        for el in self.vals:
            try:
                # Keep the parsed date as a temporary last element.
                el.append(iso8601.parse_date(el[0]))
            except Exception:
                raise functions.OperatorError("datedifffilter", "Wrong date format: %s" % (el[0]))
        self.vals.sort(key=itemgetter(-1))

        for el, nextel in zip(self.vals, self.vals[1:]):
            diff = nextel[-1] - el[-1]
            # An entry closes its cluster when the next one is too far away.
            if (diff.days * 86400 + diff.seconds) > self.maxdiff:
                yield el[0:-1]
        # The last entry always closes the final cluster.
        yield self.vals[-1][0:-1]


class datediffgroup:
    """
    .. function:: datediffgroup(maxdiff, date, C1, C2 ....) -> [groupid, date,C1,C2....]

    Performing a sort of entry clustering based on the entries date
    difference. The cluster id that is assigned to each entry is returned in
    the first column, and it is followed by the entry's original contents.

    The first argument defines the time difference threshold that is employed
    for entry clustering, and it is provided in seconds.
    The second argument is assumed to contain the date column. Entries are
    assumed to be provided in an ascending order by the date column.
    Input dates should be in ISO 8601 format.
    All subsequent columns remain unchanged.

    :Returned multiset schema:
        - *groupid*
            The cluster id assigned to the row.
        - *date, C1, C2 ..*
            The input values of the row.

    .. seealso::

       * :ref:`tutmultiset` functions

    >>> table1('''
    ... 2010-01-01T01:32:03Z value1
    ... 2010-01-01T01:32:04Z value2
    ... 2010-01-01T01:32:06Z value3
    ... 2010-01-01T01:32:08Z value4
    ... 2010-01-01T01:32:29Z value5
    ... 2010-01-01T02:35:03Z value6
    ... 2010-01-01T02:35:04Z value7
    ... 2010-01-01T03:55:04Z value8
    ... ''')
    >>> sql("select datediffgroup(20,a,b) from table1")
    groupid | date                 | C1
    ---------------------------------------
    1       | 2010-01-01T01:32:03Z | value1
    1       | 2010-01-01T01:32:04Z | value2
    1       | 2010-01-01T01:32:06Z | value3
    1       | 2010-01-01T01:32:08Z | value4
    2       | 2010-01-01T01:32:29Z | value5
    3       | 2010-01-01T02:35:03Z | value6
    3       | 2010-01-01T02:35:04Z | value7
    4       | 2010-01-01T03:55:04Z | value8
    """

    registered = True
    multiset = True

    def __init__(self):
        self.init = True          # True until the first row has been inspected
        self.vals = []            # buffered rows: [date, C1, C2, ...]
        self.maxdiff = 0          # clustering threshold in seconds
        self.counter = 0          # number of buffered rows
        self.tablesize = 0        # columns per buffered row
        self.groupIdCounter = 1   # id of the cluster currently being emitted

    def initargs(self, args):
        """Validate the first row, read maxdiff and record the row width."""
        self.init = False
        if not args:
            raise functions.OperatorError("datediffgroup", "No arguments")
        if len(args) < 2:
            raise functions.OperatorError("datediffgroup", "Wrong number of arguments")
        self.tablesize = len(args) - 1
        self.maxdiff = args[0]

    def step(self, *args):
        """Buffer the row without its leading maxdiff argument."""
        if self.init == True:
            self.initargs(args)
        self.vals.append(list(args[1:]))
        self.counter += 1

    def final(self):
        """Yield the header and then [groupid, date, C1, ...] for every row,
        in input order.

        NOTE(review): the pairwise comparison below was reconstructed from
        the docstring example because the original statements were damaged
        in this copy of the file; confirm against the upstream source.
        """
        if self.tablesize <= 0:
            yield ("groupid", "date", "C1")
        else:
            yield tuple(["groupid"] + ["date"] + ["C" + str(i + 1) for i in xrange(self.tablesize - 1)])

        dt = None
        for el in self.vals:
            try:
                dtnew = iso8601.parse_date(el[0])
            except Exception:
                raise functions.OperatorError("datediffgroup", "Wrong date format: %s" % (el[0]))
            if dt is not None:
                diff = dtnew - dt
                # A gap above the threshold starts the next cluster.
                if (diff.days * 86400 + diff.seconds) > self.maxdiff:
                    self.groupIdCounter += 1
            dt = dtnew
            yield [str(self.groupIdCounter)] + el


if not ('.' in __name__):
    # This is needed to be able to test the function, put it at the end of
    # every new function you create
    import sys
    import setpath
    from functions import *
    testfunction()
    if __name__ == "__main__":
        reload(sys)
        sys.setdefaultencoding('utf-8')
        import doctest
        doctest.testmod()