implemented Cohen's Kappa Statistics

git-svn-id: https://svn.d4science.research-infrastructures.eu/gcube/trunk/data-analysis/EcologicalEngine@76846 82a268e6-3cf1-43bd-a215-b396298e98cf
This commit is contained in:
Gianpaolo Coro 2013-06-07 10:12:39 +00:00
parent 1634249c07
commit 029a5dba24
2 changed files with 119 additions and 26 deletions

View File

@ -18,7 +18,11 @@ public class MathFunctions {
System.out.print(a[i]+" ");
}
*/
System.out.println(" "+roundDecimal(300.23454,2));
// System.out.println(" "+roundDecimal(300.23454,2));
// System.out.println(cohensKappaForDichotomy(20, 5, 10, 15));
// System.out.println(cohensKappaForDichotomy(45, 15, 25, 15));
System.out.println(cohensKappaForDichotomy(25,35,5,35));
}
//rounds to the xth decimal position
@ -250,4 +254,47 @@ public class MathFunctions {
return linearpoints;
}
/**
 * Computes Cohen's Kappa for two raters (A and B) that classify the same items
 * into two categories (1 and 0), starting from the 2x2 contingency counts.
 *
 * Kappa = (Pra - Pre) / (1 - Pre), where Pra is the observed agreement and Pre
 * is the agreement expected by chance from the marginal totals.
 * Example: counts (20, 5, 10, 15) give Pra = 0.7, Pre = 0.5 and Kappa = 0.4.
 *
 * @param NumOf_A1_B1 number of items classified as 1 by both A and B
 * @param NumOf_A1_B0 number of items classified as 1 by A and 0 by B
 * @param NumOf_A0_B1 number of items classified as 0 by A and 1 by B
 * @param NumOf_A0_B0 number of items classified as 0 by both A and B
 * @return Kappa rounded to the third decimal position, or Double.NaN when the
 *         statistic is undefined (no observations at all, or chance agreement
 *         equal to 1 because every observation lies in a single agreement cell).
 *         The kappaClassification* methods report NaN as "Not Applicable".
 */
public static double cohensKappaForDichotomy(long NumOf_A1_B1, long NumOf_A1_B0, long NumOf_A0_B1, long NumOf_A0_B0){
	long T = NumOf_A1_B1+NumOf_A1_B0+NumOf_A0_B1+NumOf_A0_B0;
	// without observations every ratio below would be 0/0
	if (T == 0)
		return Double.NaN;
	// square in floating point: T*T computed on long values overflows for large totals
	double total = (double) T;
	double totalSquared = total * total;
	// observed agreement
	double Pra = (double) (NumOf_A1_B1+NumOf_A0_B0) / total;
	// chance agreement on category 1 and on category 0
	double Pre1 = (double) (NumOf_A1_B1+NumOf_A1_B0) * (double) (NumOf_A1_B1+NumOf_A0_B1) / totalSquared;
	double Pre2 = (double) (NumOf_A0_B0+NumOf_A0_B1) * (double) (NumOf_A0_B0+NumOf_A1_B0) / totalSquared;
	double Pre = Pre1+Pre2;
	double denominator = 1d-Pre;
	// chance agreement of exactly 1: Kappa is 0/0, report it explicitly as undefined
	if (denominator == 0d)
		return Double.NaN;
	double Kappa = (Pra-Pre)/denominator;
	return roundDecimal(Kappa,3);
}
/**
 * Maps a Cohen's Kappa value to the agreement label of the Landis and Koch scale.
 *
 * The bands are contiguous, so values lying between the nominal two-decimal
 * limits of the scale (e.g. 0.205, which is between 0.20 and 0.21) are assigned
 * to the upper band instead of being left unclassified.
 *
 * @param kappa the Kappa statistic
 * @return "Poor" for kappa below 0, "Slight" up to 0.20, "Fair" up to 0.40,
 *         "Moderate" up to 0.60, "Substantial" up to 0.80, "Almost Perfect"
 *         above 0.80, and "Not Applicable" when kappa is NaN
 */
public static String kappaClassificationLandisKoch(double kappa){
	// NaN fails every comparison, so it has to be handled before the bands
	if (Double.isNaN(kappa))
		return "Not Applicable";
	if (kappa<0)
		return "Poor";
	else if (kappa<=0.20)
		return "Slight";
	else if (kappa<=0.40)
		return "Fair";
	else if (kappa<=0.60)
		return "Moderate";
	else if (kappa<=0.80)
		return "Substantial";
	else
		return "Almost Perfect";
}
/**
 * Maps a Cohen's Kappa value to an agreement label using the thresholds
 * 0, 0.40 and 0.75.
 *
 * @param kappa the Kappa statistic
 * @return "Poor" for kappa below 0, "Marginal" from 0 to 0.40 inclusive,
 *         "Good" above 0.40 and up to 0.75 inclusive, "Excellent" above 0.75,
 *         and "Not Applicable" when kappa is NaN
 */
public static String kappaClassificationFleiss(double kappa){
	// NaN is the only value that matches none of the bands
	if (Double.isNaN(kappa)) {
		return "Not Applicable";
	}
	if (kappa < 0) {
		return "Poor";
	}
	if (kappa <= 0.40) {
		return "Marginal";
	}
	if (kappa <= 0.75) {
		return "Good";
	}
	return "Excellent";
}
}

View File

@ -1,7 +1,6 @@
package org.gcube.dataanalysis.ecoengine.evaluation;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
@ -16,17 +15,25 @@ import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.PrimitiveTypes;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.TableTemplates;
import org.gcube.dataanalysis.ecoengine.interfaces.DataAnalysis;
import org.gcube.dataanalysis.ecoengine.utils.DatabaseFactory;
import org.gcube.dataanalysis.ecoengine.utils.Operations;
public class DiscrepancyAnalysis extends DataAnalysis {
// static String discrepancyQuery = "select distinct a.%1$s as csquareone,b.%2$s as csquaretwo,a.%3$s as firstprob,b.%4$s as secondprob from %5$s as a inner join %6$s as b on a.%1$s=b.%2$s and (a.%3$s<>b.%4$s)";
// static String discrepancyQuery = "select distinct a.%1$s as csquareone,b.%2$s as csquaretwo,a.%3$s as firstprob,b.%4$s as secondprob from (select * from %5$s order by %1$s limit %7$s) as a inner join (select * from %6$s order by %2$s limit %7$s) as b on a.%1$s=b.%2$s and (a.%3$s<>b.%4$s)";
//version 3
/*
static String discrepancyQuery = "select * from (select distinct a.%1$s as csquareone,b.%2$s as csquaretwo,a.%3$s as firstprob,b.%4$s as secondprob from " +
"(select %1$s , avg(%3$s) as %3$s from (select distinct * from %5$s order by %1$s limit %7$s) as aa group by %1$s) as a " +
"left join " +
"(select %2$s , avg(%4$s) as %4$s from (select distinct * from %6$s order by %2$s limit %7$s) as aa group by %2$s) as b " +
"on a.%1$s=b.%2$s) as sel where firstprob<>secondprob";
*/
static String discrepancyQuery = "select * from (select distinct a.%1$s as csquareone,b.%2$s as csquaretwo,a.%3$s as firstprob,b.%4$s as secondprob from " +
"(select %1$s , avg(%3$s) as %3$s from (select distinct * from %5$s order by %1$s limit %7$s) as aa group by %1$s) as a " +
"left join " +
"(select %2$s , avg(%4$s) as %4$s from (select distinct * from %6$s order by %2$s limit %7$s) as aa group by %2$s) as b " +
"on a.%1$s=b.%2$s) as sel";
static String getNumberOfElementsQuery = "select count(*) from %1$s";
private static int minElements = 100;
@ -38,10 +45,19 @@ public class DiscrepancyAnalysis extends DataAnalysis {
List<Float> errors;
double mean;
double variance;
int numberoferrors;
int numberofvectors;
double kthreshold;
long numberoferrors;
long numberofvectors;
long numberofcomparisons;
float maxerror;
String maxdiscrepancyPoint;
long numHigher = 0;
long numLower = 0;
long agreementA1B1=0;
long agreementA0B0=0;
long agreementA1B0=0;
long agreementA0B1=0;
private LinkedHashMap<String, String> output;
@Override
@ -60,6 +76,8 @@ public class DiscrepancyAnalysis extends DataAnalysis {
PrimitiveType p6 = new PrimitiveType(Float.class.getName(), null, PrimitiveTypes.NUMBER, "ComparisonThreshold","the comparison threshold","0.1");
PrimitiveType p7 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "MaxSamples","the comparison threshold","10000");
PrimitiveType p8 = new PrimitiveType(Float.class.getName(), null, PrimitiveTypes.NUMBER, "KThreshold", "Threshold for K-Statistic: over this threshold values will be considered 1 for agreement calculation. Default is 0.5","0.5");
parameters.add(p1);
parameters.add(p2);
parameters.add(p3);
@ -68,6 +86,7 @@ public class DiscrepancyAnalysis extends DataAnalysis {
parameters.add(p13);
parameters.add(p6);
parameters.add(p7);
parameters.add(p8);
DatabaseType.addDefaultDBPars(parameters);
return parameters;
@ -84,6 +103,15 @@ public class DiscrepancyAnalysis extends DataAnalysis {
String FirstTable = config.getParam("FirstTable");
String SecondTable = config.getParam("SecondTable");
String maxSamples = config.getParam("MaxSamples");
String kthresholdString = config.getParam("KThreshold");
kthreshold = 0.5;
try{
kthreshold = Double.parseDouble(kthresholdString);
}catch(Exception e){}
AnalysisLogger.getLogger().trace("Using Cohen's Kappa Threshold: "+kthreshold);
int maxCompElements = maxElements;
if (maxSamples!=null && maxSamples.length()>0){
int maxx = Integer.parseInt(maxSamples);
@ -104,20 +132,18 @@ public class DiscrepancyAnalysis extends DataAnalysis {
output.put("NUMBER_OF_ERRORS", "0");
output.put("NUMBER_OF_COMPARISONS", "" + numberofvectors);
output.put("ACCURACY", "100.0");
output.put("MAXIMUM_ERROR", "-");
output.put("MAXIMUM_ERROR", "0");
output.put("MAXIMUM_ERROR_POINT", "-");
output.put("COHENS_KAPPA", "1");
output.put("COHENS_KAPPA_CLASSIFICATION_LANDIS_KOCH", MathFunctions.kappaClassificationLandisKoch(1));
output.put("COHENS_KAPPA_CLASSIFICATION_FLEISS", MathFunctions.kappaClassificationFleiss(1));
output.put("TREND", "STATIONARY");
return output;
}
// String query = String.format(discrepancyQuery, FirstTableCsquareColumn, SecondTableCsquareColumn, FirstTableProbabilityColumn, SecondTableProbabilityColumn, FirstTable, SecondTable);
// List<Object> nelementsQ = DatabaseFactory.executeSQLQuery(DatabaseUtils.countElementsStatement(FirstTable),connection);
// int nelements = Integer.parseInt(""+nelementsQ.get(0));
// int nelements = Math.min(Operations.calcNumOfRepresentativeElements(nPoints, minElements),maxCompElements);
int nelements = nPoints;
AnalysisLogger.getLogger().trace("Number Of Elements to take: "+nelements);
String query = String.format(discrepancyQuery, FirstTableCsquareColumn, SecondTableCsquareColumn, FirstTableProbabilityColumn, SecondTableProbabilityColumn, FirstTable, SecondTable,""+nelements);
AnalysisLogger.getLogger().trace("Number Of Elements to take: "+numberofvectors);
String query = String.format(discrepancyQuery, FirstTableCsquareColumn, SecondTableCsquareColumn, FirstTableProbabilityColumn, SecondTableProbabilityColumn, FirstTable, SecondTable,""+numberofvectors);
AnalysisLogger.getLogger().debug("Discrepancy Calculation - Query to perform :" + query);
List<Object> takePoints = DatabaseFactory.executeSQLQuery(query, connection);
@ -129,21 +155,29 @@ public class DiscrepancyAnalysis extends DataAnalysis {
threshold = Float.parseFloat(config.getParam("ComparisonThreshold"));
analyzeCompareList(takePoints);
calcDiscrepancy();
float accuracy = 100;
if (processedRecords>0)
accuracy = (1 - (float) numberoferrors / (float) numberofcomparisons) * 100;
if (maxdiscrepancyPoint==null)
maxdiscrepancyPoint="-";
double kappa = MathFunctions.cohensKappaForDichotomy(agreementA1B1, agreementA1B0, agreementA0B1, agreementA0B0);
AnalysisLogger.getLogger().debug("Discrepancy Calculation - Calculated Cohen's Kappa:" + kappa);
output = new LinkedHashMap<String, String>();
output.put("MEAN", "" + MathFunctions.roundDecimal(mean,2));
output.put("VARIANCE", "" + MathFunctions.roundDecimal(variance,2));
output.put("NUMBER_OF_ERRORS", "" + numberoferrors);
output.put("NUMBER_OF_COMPARISONS", "" + nelements);
float accuracy = 100;
if (processedRecords>0)
accuracy = (1 - (float) numberoferrors / (float) nelements) * 100;
output.put("NUMBER_OF_COMPARISONS", "" + numberofcomparisons);
output.put("ACCURACY", "" + MathFunctions.roundDecimal(accuracy,2));
output.put("MAXIMUM_ERROR", "" + MathFunctions.roundDecimal(maxerror,2));
output.put("MAXIMUM_ERROR_POINT", "" + maxdiscrepancyPoint);
output.put("MAXIMUM_ERROR_POINT", maxdiscrepancyPoint);
output.put("COHENS_KAPPA", "" + kappa);
output.put("COHENS_KAPPA_CLASSIFICATION_LANDIS_KOCH", MathFunctions.kappaClassificationLandisKoch(kappa));
output.put("COHENS_KAPPA_CLASSIFICATION_FLEISS", MathFunctions.kappaClassificationFleiss(kappa));
if (numLower>numHigher)
output.put("TREND", "CONTRACTION");
else if (numLower<numHigher)
@ -173,14 +207,16 @@ public class DiscrepancyAnalysis extends DataAnalysis {
}
long numHigher = 0;
long numLower = 0;
public void analyzeCompareList(List<Object> points) {
errors = new ArrayList<Float>();
if (points != null) {
maxerror = 0;
for (Object vector : points) {
//number of comparison equals to the aggregation
numberofcomparisons++;
Object[] elements = (Object[]) vector;
String csquare = (String) elements[0];
float probabilityPoint1 = 0;
@ -203,6 +239,16 @@ public class DiscrepancyAnalysis extends DataAnalysis {
else if (probabilityPoint2<probabilityPoint1)
numLower++;
}
//calculations for Cohen's Kappa
if ((probabilityPoint1>=kthreshold) && (probabilityPoint2>=kthreshold))
agreementA1B1++;
else if ((probabilityPoint1<kthreshold) && (probabilityPoint2<kthreshold))
agreementA0B0++;
if ((probabilityPoint1>=kthreshold) && (probabilityPoint2<kthreshold))
agreementA1B0++;
if ((probabilityPoint1<kthreshold) && (probabilityPoint2>=kthreshold))
agreementA0B1++;
}
}