# Source: interactive-mining/interactive-mining-3rdparty.../madis/src/functions/aggregate/statistics.py
# (repository listing metadata: 1024 lines, 26 KiB, Python)

import setpath  # kept first: presumably prepares sys.path for the project imports below

import datetime
import json
import math
import numbers
import re
from fractions import Fraction

import functions
from lib import iso8601
__docformat__ = 'reStructuredText en'
class modeop:
    """
    .. function:: modeop(X) -> [ModeOpElements int/str, ModeOpValue int]

    Returns the mode (i.e. the value that occurs the most frequently in a data set),
    along with the modevalue (i.e. the maximum frequency of occurrence).
    When more than one mode is found in a data set (i.e. when more than one value
    appears with the maximum frequency), all of them are returned, in ascending order.

    For a sample from a continuous distribution, such as
    [0.935..., 1.211..., 2.430..., 3.668..., 3.874...], the concept of mode is unusable
    in its raw form, since each value will occur precisely once. Following the usual
    practice, data is discretized by rounding to the nearest int value.
    For a textual sample, values are first converted to lowercase.
    NULL values are ignored.

    When the data set is empty, or when no value occurs more than once, a single
    row holding the text ``None`` in both columns is returned.

    :Returned multiset schema:
        Columns are automatically named as *ModeOpElements, ModeOpValue*

    .. seealso::

       * :ref:`tutmultiset` functions

    Examples:

    >>> table1('''
    ... 1
    ... 3
    ... 6
    ... 6
    ... 6
    ... 6
    ... 7
    ... 7
    ... 7
    ... 7
    ... 12
    ... 12
    ... 17
    ... ''')
    >>> sql("select modeop(a) from table1")
    ModeOpElements | ModeOpValue
    ----------------------------
    6              | 4
    7              | 4

    >>> table2('''
    ... 1.1235
    ... 1
    ... 5.1
    ... 5.2
    ... 5.3
    ... 5.5
    ... 5.6
    ... 5.7
    ... ''')
    >>> sql("select modeop(a) from table2")
    ModeOpElements | ModeOpValue
    ----------------------------
    5              | 3
    6              | 3

    >>> table3('''
    ... leuteris
    ... maria
    ... marialena
    ... Meili
    ... meili
    ... ''')
    >>> sql("select modeop(a) from table3")
    ModeOpElements | ModeOpValue
    ----------------------------
    meili          | 2

    .. doctest::
        :hide:

        >>> sql("delete from table3")
        >>> sql("select modeop(a) from table3")
        ModeOpElements | ModeOpValue
        ----------------------------
        None           | None
    """
    registered = True  # Value to define db operator
    multiset = True

    def __init__(self):
        self.init = True
        self.sample = []
        self.modevalue = 0

    def initargs(self, args):
        """Validate the argument count; runs once, on the first row."""
        self.init = False
        if not args:
            raise functions.OperatorError("modeop", "No arguments")
        if len(args) > 1:
            raise functions.OperatorError("modeop", "Wrong number of arguments")

    def step(self, *args):
        """Normalise one incoming value and add it to the sample."""
        if self.init:
            self.initargs(args)
        value = args[0]
        if value is None:
            # NULL carries no information about the mode; previously it
            # aborted the whole query inside round(None).
            return
        if isinstance(value, numbers.Real):
            # Arithmetic data is discretized by rounding to the nearest int
            self.element = int(round(value))
        else:
            # Textual data is compared case-insensitively
            self.element = value.lower()
        self.sample.append(self.element)

    def final(self):
        """Yield the header row followed by one (mode, frequency) row per mode."""
        # Single pass frequency count (key: value, item: occurrences)
        frequency = {}
        for item in self.sample:
            frequency[item] = frequency.get(item, 0) + 1

        highest = max(frequency.values()) if frequency else 0
        if highest > 1:
            # Sorting makes the row order independent of dict ordering
            output = sorted(key for key in frequency if frequency[key] == highest)
            modevalue = highest
        else:
            # Empty sample, or every value occurs exactly once: no mode
            output = ['None']
            modevalue = 'None'

        # CREATE MULTISET OUTPUT
        yield ("ModeOpElements", "ModeOpValue")
        for el in output:
            yield (el, modevalue)
class median:
    """
    .. function:: median(X) -> [median float]

    Returns the median, i.e. the numeric value separating the higher half of a sample,
    a population, or a probability distribution, from the lower half.
    It is computed by arranging all the observations from lowest value to highest value
    and picking the middle one. If there is an even number of observations, then there
    is no single middle value, so the mean of the two middle values is obtained.
    Incoming textual and NULL values are simply ignored; zeros are regular observations.

    Examples:

    >>> table1('''
    ... 1
    ... 3
    ... 6
    ... 6
    ... 6
    ... 6
    ... 7
    ... 7
    ... 7
    ... 7
    ... 12
    ... 12
    ... 17
    ... ''')
    >>> sql("select median(a) from table1")
    median(a)
    ---------
    7.0

    >>> table2('''
    ... 1
    ... 2
    ... 2
    ... 3
    ... 3
    ... 9
    ... ''')
    >>> sql("select median(a) from table2")
    median(a)
    ---------
    2.5

    >>> table3('''
    ... 1
    ... 2
    ... maria
    ... lala
    ... null
    ... 'None'
    ... 3
    ... 9
    ... ''')
    >>> sql("select median(a) from table3")
    median(a)
    ---------
    2.5

    .. doctest::
        :hide:

        >>> sql("delete from table3")
        >>> sql("select median(a) from table3")
        median(a)
        ---------
        None
    """
    registered = True  # Value to define db operator

    def __init__(self):
        self.init = True
        self.sample = []
        self.counter = 0

    def initargs(self, args):
        """Validate the argument count; runs once, on the first row."""
        self.init = False
        if not args:
            raise functions.OperatorError("median", "No arguments")
        if len(args) > 1:
            raise functions.OperatorError("median", "Wrong number of arguments")

    def step(self, *args):
        """Collect one numeric observation; text and NULL are skipped."""
        if self.init:
            self.initargs(args)
        value = args[0]
        # Test the type, not truthiness: 0 is a legitimate observation and
        # must not be discarded together with NULL.
        if isinstance(value, numbers.Real):
            self.counter += 1
            self.sample.append(float(value))

    def final(self):
        """Return the median of the collected values, or None when there are none."""
        if not self.sample:
            return None
        self.sample.sort()
        size = len(self.sample)
        # Floor division keeps the index an int on every Python version
        midpt = size // 2
        if size % 2:  # Odd number of elements: a single middle value exists.
            self.median = self.sample[midpt]
        else:  # Even number of elements: average the two middle values.
            self.median = (self.sample[midpt - 1] + self.sample[midpt]) / 2.0
        return self.median
class variance:
    """
    .. function:: variance(X,[type]) -> [variance float]

    Determine the measure of the spread of the data set about the mean.
    Sample variance is determined by default; population variance can be
    determined by setting the (optional) second argument to values 'true' or 'population'.
    When values 'false' or 'sample' are entered for type, the default sample variance
    computation is performed. The second argument is matched case-insensitively.

    Values that cannot be converted to an exact fraction (free text, NULL) are ignored.
    With a single observation the result is 0.0.

    Examples:

    >>> table1('''
    ... 1
    ... 2
    ... 3
    ... 4
    ... 5
    ... 6
    ... 'text is ignored'
    ... 'none'
    ... ''')
    >>> sql("select variance(a) from table1")
    variance(a)
    -----------
    3.5
    >>> sql("select variance(a,'false') from table1")
    variance(a,'false')
    -------------------
    3.5
    >>> sql("select variance(a,'sample') from table1")
    variance(a,'sample')
    --------------------
    3.5
    >>> sql("select variance(a,'True') from table1")
    variance(a,'True')
    ------------------
    2.91666666667
    >>> sql("select variance(a,'Population') from table1")
    variance(a,'Population')
    ------------------------
    2.91666666667

    .. doctest::
        :hide:

        >>> sql("delete from table1")
        >>> sql("select variance(a) from table1")
        variance(a)
        -----------
        None
    """
    registered = True  # Value to define db operator

    def __init__(self):
        self.init = True
        self.population = False
        self.n = 0
        # Exact rational accumulators: no floating point drift while streaming
        self.mean = Fraction(0.0)
        self.M2 = Fraction(0.0)

    def initargs(self, args):
        """Validate the arguments and read the optional sample/population switch."""
        self.init = False
        if not args:
            raise functions.OperatorError("variance", "No arguments")
        if len(args) > 2:
            raise functions.OperatorError("variance", "Wrong number of arguments")
        if len(args) == 2:
            try:
                kind = args[1].lower()
            except AttributeError:
                # Non-textual switch (e.g. a number): reported as a wrong value below
                kind = None
            if kind in ('false', 'sample'):
                self.population = False
            elif kind in ('true', 'population'):
                self.population = True
            else:
                raise functions.OperatorError(
                    "variance",
                    "Wrong value in second argument\n"
                    "Accepted Values:\n"
                    "----False, false, FALSE, sample---- for Sample Variance\n"
                    "----True, true, TRUE, population---- for Population Variance\n")

    def step(self, *args):
        """Fold one observation into the running mean and sum of squared deviations."""
        if self.init:
            self.initargs(args)
        try:
            x = Fraction(args[0])
        except (TypeError, ValueError, ArithmeticError):
            # NULL, free text, NaN/inf: not a usable observation
            return
        # Welford's online update
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        if self.n > 1:
            self.M2 += delta * (x - self.mean)

    def final(self):
        """Return the variance as a float, or None when no value was accepted."""
        if self.n == 0:
            return None
        if not self.population and self.n > 1:
            # Divide sum of squares by N-1 (sample variance).
            divisor = self.n - 1
        else:
            # Divide sum of squares by N (population variance, or a single value).
            divisor = self.n
        return float(self.M2 / divisor)
class stdev:
    """
    .. function:: stdev(X,[type]) -> [stdev float]

    Computes standard deviation of a dataset X, i.e. the square root of its variance.
    Sample standard deviation is determined by default; population standard deviation can be
    determined by setting the (optional) second argument to values 'true' or 'population'.
    When values 'false' or 'sample' are entered for type, the default sample standard deviation
    computation is performed. The second argument is matched case-insensitively.

    Values that cannot be converted to an exact fraction (free text, NULL) are ignored.
    With a single observation the result is 0.0.

    Examples:

    >>> table1('''
    ... 3
    ... 7
    ... 7
    ... 19
    ... 'text is ignored'
    ... 'none'
    ... ''')
    >>> sql("select stdev(a) from table1")
    stdev(a)
    -------------
    6.92820323028
    >>> sql("select stdev(a,'population') from table1")
    stdev(a,'population')
    ---------------------
    6.0
    >>> sql("select stdev(a,'true') from table1")
    stdev(a,'true')
    ---------------
    6.0

    .. doctest::
        :hide:

        >>> sql("delete from table1")
        >>> sql("select stdev(a) from table1")
        stdev(a)
        --------
        None
    """
    registered = True  # Value to define db operator

    def __init__(self):
        self.init = True
        self.population = False
        self.n = 0
        # Exact rational accumulators: no floating point drift while streaming
        self.mean = Fraction(0.0)
        self.M2 = Fraction(0.0)

    def initargs(self, args):
        """Validate the arguments and read the optional sample/population switch."""
        self.init = False
        if not args:
            raise functions.OperatorError("stdev", "No arguments")
        if len(args) > 2:
            raise functions.OperatorError("stdev", "Wrong number of arguments")
        if len(args) == 2:
            try:
                kind = args[1].lower()
            except AttributeError:
                # Non-textual switch (e.g. a number): reported as a wrong value below
                kind = None
            if kind in ('false', 'sample'):
                self.population = False
            elif kind in ('true', 'population'):
                self.population = True
            else:
                raise functions.OperatorError(
                    "stdev",
                    "Wrong value in second argument\n"
                    "Accepted Values:\n"
                    "----False, false, FALSE, sample---- for Sample Standard Deviation\n"
                    "----True, true, TRUE, population---- for Population Standard Deviation\n")

    def step(self, *args):
        """Fold one observation into the running mean and sum of squared deviations."""
        if self.init:
            self.initargs(args)
        try:
            x = Fraction(args[0])
        except (TypeError, ValueError, ArithmeticError):
            # NULL, free text, NaN/inf: not a usable observation
            return
        # Welford's online update
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        if self.n > 1:
            self.M2 += delta * (x - self.mean)

    def final(self):
        """Return the standard deviation as a float, or None when no value was accepted."""
        if self.n == 0:
            return None
        if not self.population and self.n > 1:
            # Divide sum of squares by N-1 (sample variance).
            divisor = self.n - 1
        else:
            # Divide sum of squares by N (population variance, or a single value).
            divisor = self.n
        return math.sqrt(self.M2 / divisor)
class rangef:
    """
    .. function:: rangef(X) -> [rangef float]

    Computes the numerical range for a dataset X, subtracting the minimum value
    from the maximum value.
    Textual and NULL data entries are simply ignored; zeros are regular observations.

    Examples:

    >>> table1('''
    ... 1
    ... 3
    ... 6
    ... 6
    ... 7
    ... 12
    ... 12
    ... 17
    ... 'text is ignored'
    ... 'None'
    ... ''')
    >>> sql("select rangef(a) from table1")
    rangef(a)
    ---------
    16.0

    .. doctest::
        :hide:

        >>> sql("delete from table1")
        >>> sql("select rangef(a) from table1")
        rangef(a)
        ---------
        None
    """
    registered = True  # Value to define db operator

    def __init__(self):
        self.init = True
        self.sample = []

    def initargs(self, args):
        """Validate the argument count; runs once, on the first row."""
        self.init = False
        if len(args) != 1:
            raise functions.OperatorError("rangef", "Wrong number of arguments")

    def step(self, *args):
        """Collect one numeric observation; text and NULL are skipped."""
        if self.init:
            # Previously never invoked, which left the argument check dead
            self.initargs(args)
        value = args[0]
        # Test the type, not truthiness: 0 may well be the minimum or maximum
        if isinstance(value, numbers.Real):
            self.sample.append(float(value))

    def final(self):
        """Return max - min of the collected values, or None when there are none."""
        if not self.sample:
            return None
        self.range = max(self.sample) - min(self.sample)
        return self.range
class amean:
    """
    .. function:: amean(X) -> [amean float]

    Computes the arithmetic mean, i.e. the average, thus providing an alternative choice
    to traditional *avg* offered by sqlite.
    Textual and NULL values are ignored; zeros are regular observations.

    Examples:

    >>> table1('''
    ... 1
    ... 2
    ... 2
    ... 3
    ... 'text is ignored, as well as null values'
    ... 'none'
    ... ''')
    >>> sql("select amean(a) from table1")
    amean(a)
    --------
    2.0

    .. doctest::
        :hide:

        >>> sql("delete from table1")
        >>> sql("select amean(a) from table1")
        amean(a)
        --------
        None
    """
    registered = True  # Value to define db function

    def __init__(self):
        self.init = True
        self.counter = 0
        self.sum = 0.0

    def initargs(self, args):
        """Validate the argument count; runs once, on the first row."""
        self.init = False
        if not args:
            raise functions.OperatorError("amean", "No arguments")
        elif len(args) > 1:
            raise functions.OperatorError("amean", "Wrong number of arguments")

    def step(self, *args):
        """Add one numeric observation to the running sum; text and NULL are skipped."""
        if self.init:
            self.initargs(args)
        value = args[0]
        # Test the type, not truthiness: dropping zeros would bias the mean upwards
        if isinstance(value, numbers.Real):
            self.sum += float(value)
            self.counter += 1

    def final(self):
        """Return the mean of the accepted values, or None when there are none."""
        if self.counter == 0:
            return None
        return self.sum / self.counter
class wamean:
    r"""
    .. function:: wamean(W,X) -> [wamean float]

    Computes the weighted arithmetic mean, i.e. the weighted average.
    First column contains the weights and second column contains the actual data values.
    Rows where either the weight or the value is textual or NULL are ignored;
    a value of zero is a regular observation and its weight is counted.

    .. math::

        wamean_{\mathrm} = \sum_{i=1}^{N} w_i x_i / \sum_{i=1}^{N} w_i

    Examples:

    >>> table1('''
    ... 2 1
    ... 2 2
    ... 1 2
    ... 'text is ignored, as well as null values' 3
    ... 'none' 2
    ... 1 'text is ignored, as well as null values'
    ... 2 'none'
    ... 2 3
    ... ''')
    >>> sql("select wamean(a,b) from table1")
    wamean(a,b)
    -----------
    2.0

    .. doctest::
        :hide:

        >>> sql("delete from table1")
        >>> sql("select wamean(a) from table1")
        wamean(a)
        ---------
        None
    """
    registered = True  # Value to define db operator

    def __init__(self):
        self.init = True
        self.counter = 0  # running sum of weights
        self.sum = 0.0    # running sum of weight * value

    def initargs(self, args):
        """Validate the argument count; runs once, on the first row."""
        self.init = False
        if len(args) != 2:
            raise functions.OperatorError("wamean", "Wrong number of arguments")

    def step(self, *args):
        """Accumulate one (weight, value) pair; non-numeric pairs are skipped."""
        if self.init:
            self.initargs(args)
        weight, value = args[0], args[1]
        # Test the types, not truthiness: a zero value must still contribute
        # its weight to the denominator.
        if isinstance(weight, numbers.Real) and isinstance(value, numbers.Real):
            self.sum += weight * value
            self.counter += weight

    def final(self):
        """Return sum(w*x)/sum(w), or None when the total weight is zero."""
        if self.counter == 0:
            return None
        return self.sum / self.counter
class gmean:
    r"""
    .. function:: gmean(X,[m]) -> [gmean float]

    Computes the generalized mean (also known as the power mean or Holder mean),
    which is an abstraction of the *Pythagorean means* including *arithmetic*, *geometric*,
    and *harmonic* means.

    It is defined for a set of *n* positive real numbers as follows:

    .. math::

        gmean_{\mathrm} = \Big ( {1 \over N} \sum_{i=1}^{N} x_i ^p \Big ) ^{1/p}

    The (optional) second argument stands for the *p* parameter, thus determining the
    exact mean type:

    - p=2 : *Quadratic mean*     (computed for both negative and positive values)
    - p=1 : *Arithmetic mean*
    - p=0 : *Geometric mean*     (only for positive real numbers)
    - p=-1: *Harmonic mean*      (only for positive real numbers)

    By default, i.e. in absence of second argument, p is set to 0, computing
    the geometric mean.

    Textual and NULL values are ignored. For p<1 a negative value raises an error and
    zeros are skipped; for p>=1 zeros are regular observations.

    Examples:

    >>> table1('''
    ... 6
    ... 50
    ... 9
    ... 1200
    ... 'text is ignored, as well as None values'
    ... 'None'
    ... ''')
    >>> sql("select gmean(a) from table1")
    gmean(a)
    -------------
    42.4264068712

    >>> table2('''
    ... 34
    ... 27
    ... 45
    ... 55
    ... 22
    ... 34
    ... ''')
    >>> sql("select gmean(a,1) from table2")
    gmean(a,1)
    -------------
    36.1666666667
    >>> sql("select gmean(a,0) from table2")
    gmean(a,0)
    -------------
    34.5451100372
    >>> sql("select gmean(a) from table2")
    gmean(a)
    -------------
    34.5451100372
    >>> sql("select gmean(a,-1) from table2")
    gmean(a,-1)
    -------------
    33.0179836512
    >>> sql("select gmean(a,2) from table2")
    gmean(a,2)
    -------------
    37.8043207407
    """
    registered = True  # Value to define db operator

    def __init__(self):
        self.init = True
        self.counter = 0
        self.sum = 0.0
        self.p = 0.0

    def initargs(self, args):
        """Validate the arguments and read the optional exponent p (-1 <= p <= 2)."""
        self.init = False
        if not args:
            raise functions.OperatorError("gmean", "No arguments")
        elif len(args) > 2:
            raise functions.OperatorError("gmean", "Wrong number of arguments")
        elif len(args) == 2:
            self.p = args[1]
            if self.p > 2 or self.p < -1:
                raise functions.OperatorError("\n gmean", "Second argument takes values from -1 to 2\n" +
                                              "p=2 :quadratic mean (for both negative and positive values)\n" +
                                              "p=1 :artihmetic mean\n" +
                                              "p=0 :geometric mean (for positive real numbers)\n" +
                                              "p=-1:harmonian mean (for positive real numbers)\n")

    def step(self, *args):
        """Accumulate one observation as log10(x) when p==0, else as x**p."""
        if self.init:
            self.initargs(args)
        value = args[0]
        if not isinstance(value, numbers.Real):
            # Text and NULL are not observations
            return
        if self.p < 1:
            # The guard must test the sign: values in (0, 1) are positive
            # and perfectly valid for geometric/harmonic means.
            if value < 0:
                raise functions.OperatorError("gmean", "The specified type of mean applies only to positive numbers")
            if value == 0:
                # log10(0) and 0**-1 are undefined; zeros keep being skipped
                # for these mean types, as they always were.
                return
        # The easiest way to think of the geometric mean is that
        # it is the average of the logarithmic values, converted back to a base 10 number.
        if self.p == 0:
            self.sum += math.log10(value)
        else:
            self.sum += value ** self.p
        self.counter += 1

    def final(self):
        """Return the requested mean, or None when no value was accepted."""
        if self.counter == 0:
            return None
        if self.p == 0:
            return 10 ** (self.sum / self.counter)
        return (self.sum / self.counter) ** (1.0 / self.p)
re_now = re.compile('now:(?P<now>.*)')


class frecency:
    """
    .. function:: frecency(actiondate[,points[,now:date]])

    Returns a float weighted sum assigning to each action *points* or less, depending on
    the *actiondate* distance to the current date (or *now:date*).
    In detail the action points decrease 30% at distance 10-30 days, 50% at 1-3 months,
    70% at 3-6 months and 90% at greater distance. Date parameters should be in ISO8601 format.
    *points* defaults to 100 and may be given either as a number or as numeric text.

    .. _iso8601:

     **ISO 8601 format** :

    Year:
      YYYY (eg 1997)
    Year and month:
      YYYY-MM (eg 1997-07)
    Complete date:
      YYYY-MM-DD (eg 1997-07-16)
    Complete date plus hours and minutes:
      YYYY-MM-DD hh:mmTZD (eg 1997-07-16 19:20+01:00)
    Complete date plus hours, minutes and seconds:
      YYYY-MM-DD hh:mm:ssTZD (eg 1997-07-16 19:20:30+01:00)
    Complete date plus hours and minutes:
      YYYY-MM-DDThh:mmTZD (eg 1997-07-16T19:20+01:00)
    Complete date plus hours, minutes and seconds:
      YYYY-MM-DDThh:mm:ssTZD (eg 1997-07-16T19:20:30+01:00)

    Examples:

    >>> table1('''
    ... 2009-06-01 1
    ... 2009-08-28 2
    ... 2009-09-17 3
    ... ''')
    >>> sql("select frecency(a,'now:2009-09-26 04:38:30') from table1")
    frecency(a,'now:2009-09-26 04:38:30')
    -------------------------------------
    200.0
    """
    registered = True  # Value to define db operator

    def __init__(self):
        self.frecency = 0
        self.initstatic = False
        self.points = None
        self.now = None

    def __decrease(self, offsettimedelta):
        """Return the fraction of the points that an action of this age keeps."""
        if offsettimedelta <= datetime.timedelta(days=10):
            return 1.0
        if offsettimedelta <= datetime.timedelta(days=30):
            return 0.7
        if offsettimedelta <= datetime.timedelta(days=(30 * 3)):
            return 0.5
        if offsettimedelta <= datetime.timedelta(days=(30 * 6)):
            return 0.3
        return 0.1

    def _parse_static_args(self, extra):
        """Set self.points / self.now from the optional trailing arguments."""
        self.points = 100.0
        # NOTE(review): now() is a naive datetime; confirm that
        # iso8601.parse_date also returns naive values, otherwise the
        # subtraction in step() cannot mix the two.
        self.now = datetime.datetime.now()
        for arg in extra:
            try:
                nowmatch = re_now.match(arg)
            except TypeError:
                # A numeric points argument (e.g. frecency(a, 50)) is not
                # text; it used to abort the query inside re.match.
                nowmatch = None
            if nowmatch:
                self.now = iso8601.parse_date(nowmatch.group('now'))
            else:
                self.points = int(arg)

    def step(self, *args):
        """Add the age-weighted points of one action date to the running total."""
        if not args:
            raise functions.OperatorError("frecency", "No arguments")
        # last 2 arguments are static, so they are parsed only the first time
        if not self.initstatic:
            self.initstatic = True
            self._parse_static_args(args[1:])
        actiondate = iso8601.parse_date(args[0])
        self.frecency += self.__decrease(self.now - actiondate) * self.points

    def final(self):
        """Return the accumulated frecency score."""
        return self.frecency
class pearson:
    """
    .. function:: pearson(X,Y) -> float

    Computes the pearson coefficient of X and Y datasets.
    Rows where X or Y cannot be converted to an exact fraction (free text, NULL)
    are ignored. When either dataset has no spread the result is 0.

    Examples:

    >>> sql("select pearson(c1,1/c1) from range(1,91)")
    pearson(c1,1/c1)
    ----------------
    -0.181568259801

    >>> sql("select pearson(c1,17*c1+5) from range(1,91)")
    pearson(c1,17*c1+5)
    -------------------
    1.0

    >>> sql("select pearson(c1,pyfun('math.pow',2,c1)) from range(1,41)")
    pearson(c1,pyfun('math.pow',2,c1))
    ----------------------------------
    0.456349821382

    >>> sql("select pearson(a,b) from (select 1 as a, 2 as b)")
    pearson(a,b)
    ------------
    0
    """
    registered = True  # Value to define db operator
    # Not referenced by any method of this class; kept as they were.
    sum_x = 0
    sum_y = 0

    def __init__(self):
        # Exact running sums: x, x^2, y, y^2 and x*y
        self.sX = Fraction(0)
        self.sX2 = Fraction(0)
        self.sY = Fraction(0)
        self.sY2 = Fraction(0)
        self.sXY = Fraction(0)
        self.n = 0

    def step(self, *args):
        """Fold one (x, y) pair into the running sums; unusable pairs are skipped."""
        try:
            x, y = [Fraction(i) for i in args[:2]]
        except (TypeError, ValueError, ArithmeticError):
            # NULL, free text, NaN/inf, or fewer than two arguments
            return
        self.n += 1
        self.sX += x
        self.sY += y
        self.sX2 += x * x
        self.sY2 += y * y
        self.sXY += x * y

    def final(self):
        """Return the correlation coefficient, 0 when undefined, None without data."""
        if self.n == 0:
            return None
        d = (math.sqrt(self.n * self.sX2 - self.sX * self.sX) *
             math.sqrt(self.n * self.sY2 - self.sY * self.sY))
        if d == 0:
            # A constant X or Y has no defined correlation
            return 0
        return float((self.n * self.sXY - self.sX * self.sY) / d)
class fsum:
    """
    .. function:: fsum(X) -> json

    Computes the sum using fractional computation. It returns the result in json format,
    as a ``[numerator, denominator]`` pair.
    Each input is either a number or a json ``[numerator, denominator]`` pair (such as a
    previous *fsum* result); anything else is ignored.

    Examples:

    >>> table1('''
    ... 1
    ... 2
    ... 2
    ... 10
    ... ''')

    >>> sql("select fsum(a) from table1")
    fsum(a)
    -------
    [15, 1]

    >>> table1('''
    ... 0.99999999
    ... 3.99999999
    ... 0.78978989
    ... 1.99999999
    ... ''')

    >>> sql("select fsum(a) from table1")
    fsum(a)
    -------------------------------------
    [70164189421580937, 9007199254740992]
    """
    registered = True

    def __init__(self):
        self.init = True
        self.x = Fraction(0.0)

    def step(self, *args):
        """Add one number or json [numerator, denominator] pair to the exact sum."""
        if self.init:
            self.init = False
            if not args:
                raise functions.OperatorError("fsum", "No arguments")

        try:
            value = args[0]
            if isinstance(value, numbers.Real) and not isinstance(value, bool):
                term = Fraction(value)
            else:
                pair = json.loads(value)
                term = Fraction(pair[0], pair[1])
        except (TypeError, ValueError, LookupError, ArithmeticError):
            # NULL, malformed json, wrong json shape, zero denominator, NaN/inf
            return

        self.x += term

    def final(self):
        """Return the sum as the json text of [numerator, denominator]."""
        return json.dumps([self.x.numerator, self.x.denominator])
if '.' not in __name__:
    # This is needed to be able to test the function, put it at the end of every
    # new function you create
    import sys
    import setpath
    from functions import *
    testfunction()
    if __name__ == "__main__":
        # Python 2 idiom: reload(sys) makes setdefaultencoding reachable again
        reload(sys)
        sys.setdefaultencoding('utf-8')
        import doctest
        doctest.testmod()