From 029a5dba242e027acd9fa4fd29e55d407231ce78 Mon Sep 17 00:00:00 2001 From: Gianpaolo Coro Date: Fri, 7 Jun 2013 10:12:39 +0000 Subject: [PATCH] implemented Cohen's Kappa Statistics git-svn-id: https://svn.d4science.research-infrastructures.eu/gcube/trunk/data-analysis/EcologicalEngine@76846 82a268e6-3cf1-43bd-a215-b396298e98cf --- .../graphtools/utils/MathFunctions.java | 49 +++++++++- .../evaluation/DiscrepancyAnalysis.java | 96 ++++++++++++++----- 2 files changed, 119 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/gcube/contentmanagement/graphtools/utils/MathFunctions.java b/src/main/java/org/gcube/contentmanagement/graphtools/utils/MathFunctions.java index 27b23c6..22fcd0f 100644 --- a/src/main/java/org/gcube/contentmanagement/graphtools/utils/MathFunctions.java +++ b/src/main/java/org/gcube/contentmanagement/graphtools/utils/MathFunctions.java @@ -18,7 +18,11 @@ public class MathFunctions { System.out.print(a[i]+" "); } */ - System.out.println(" "+roundDecimal(300.23454,2)); +// System.out.println(" "+roundDecimal(300.23454,2)); + +// System.out.println(cohensKappaForDichotomy(20, 5, 10, 15)); +// System.out.println(cohensKappaForDichotomy(45, 15, 25, 15)); + System.out.println(cohensKappaForDichotomy(25,35,5,35)); } //rounds to the xth decimal position @@ -250,4 +254,47 @@ public class MathFunctions { return linearpoints; } + + + public static double cohensKappaForDichotomy(long NumOf_A1_B1, long NumOf_A1_B0, long NumOf_A0_B1, long NumOf_A0_B0){ + long T = NumOf_A1_B1+NumOf_A1_B0+NumOf_A0_B1+NumOf_A0_B0; + + double Pra = (double)(NumOf_A1_B1+NumOf_A0_B0)/(double) T ; + double Pre1 = (double) (NumOf_A1_B1+NumOf_A1_B0) * (double) (NumOf_A1_B1+NumOf_A0_B1)/(double) (T*T); + double Pre2 = (double) (NumOf_A0_B0+NumOf_A0_B1) * (double) (NumOf_A0_B0+NumOf_A1_B0)/(double) (T*T); + double Pre = Pre1+Pre2; + double Kappa = (Pra-Pre)/(1d-Pre); + return roundDecimal(Kappa,3); + } + + public static String kappaClassificationLandisKoch(double kappa){ + if (kappa<0) + return "Poor"; + else if ((kappa>=0)&&(kappa<=0.20)) + return "Slight"; + else if ((kappa>=0.21)&&(kappa<=0.40)) + return "Fair"; + else if ((kappa>=0.41)&&(kappa<=0.60)) + return "Moderate"; + else if ((kappa>=0.61)&&(kappa<=0.80)) + return "Substantial"; + else if (kappa>=0.81) + return "Almost Perfect"; + else + return "Not Applicable"; + } + + public static String kappaClassificationFleiss(double kappa){ + if (kappa<0) + return "Poor"; + else if ((kappa>=0)&&(kappa<=0.40)) + return "Marginal"; + else if ((kappa>0.4)&&(kappa<=0.75)) + return "Good"; + else if (kappa>0.75) + return "Excellent"; + else + return "Not Applicable"; + } + } diff --git a/src/main/java/org/gcube/dataanalysis/ecoengine/evaluation/DiscrepancyAnalysis.java b/src/main/java/org/gcube/dataanalysis/ecoengine/evaluation/DiscrepancyAnalysis.java index d2201dc..988d5b7 100644 --- a/src/main/java/org/gcube/dataanalysis/ecoengine/evaluation/DiscrepancyAnalysis.java +++ b/src/main/java/org/gcube/dataanalysis/ecoengine/evaluation/DiscrepancyAnalysis.java @@ -1,7 +1,6 @@ package org.gcube.dataanalysis.ecoengine.evaluation; import java.util.ArrayList; -import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; @@ -16,17 +15,25 @@ import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.PrimitiveTypes; import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.TableTemplates; import org.gcube.dataanalysis.ecoengine.interfaces.DataAnalysis; import org.gcube.dataanalysis.ecoengine.utils.DatabaseFactory; -import org.gcube.dataanalysis.ecoengine.utils.Operations; public class DiscrepancyAnalysis extends DataAnalysis { // static String discrepancyQuery = "select distinct a.%1$s as csquareone,b.%2$s as csquaretwo,a.%3$s as firstprob,b.%4$s as secondprob from %5$s as a inner join %6$s as b on a.%1$s=b.%2$s and (a.%3$s<>b.%4$s)"; // static String discrepancyQuery = "select distinct a.%1$s as csquareone,b.%2$s as csquaretwo,a.%3$s as firstprob,b.%4$s as secondprob from (select * from %5$s order by %1$s limit %7$s) as a inner join (select * from %6$s order by %2$s limit %7$s) as b on a.%1$s=b.%2$s and (a.%3$s<>b.%4$s)"; + //version 3 + /* static String discrepancyQuery = "select * from (select distinct a.%1$s as csquareone,b.%2$s as csquaretwo,a.%3$s as firstprob,b.%4$s as secondprob from " + "(select %1$s , avg(%3$s) as %3$s from (select distinct * from %5$s order by %1$s limit %7$s) as aa group by %1$s) as a " + "left join " + "(select %2$s , avg(%4$s) as %4$s from (select distinct * from %6$s order by %2$s limit %7$s) as aa group by %2$s) as b " + "on a.%1$s=b.%2$s) as sel where firstprob<>secondprob"; + */ + + static String discrepancyQuery = "select * from (select distinct a.%1$s as csquareone,b.%2$s as csquaretwo,a.%3$s as firstprob,b.%4$s as secondprob from " + + "(select %1$s , avg(%3$s) as %3$s from (select distinct * from %5$s order by %1$s limit %7$s) as aa group by %1$s) as a " + + "left join " + + "(select %2$s , avg(%4$s) as %4$s from (select distinct * from %6$s order by %2$s limit %7$s) as aa group by %2$s) as b " + + "on a.%1$s=b.%2$s) as sel"; static String getNumberOfElementsQuery = "select count(*) from %1$s"; private static int minElements = 100; @@ -38,10 +45,19 @@ public class DiscrepancyAnalysis extends DataAnalysis { List errors; double mean; double variance; - int numberoferrors; - int numberofvectors; + double kthreshold; + long numberoferrors; + long numberofvectors; + long numberofcomparisons; float maxerror; String maxdiscrepancyPoint; + long numHigher = 0; + long numLower = 0; + long agreementA1B1=0; + long agreementA0B0=0; + long agreementA1B0=0; + long agreementA0B1=0; + private LinkedHashMap output; @Override @@ -60,6 +76,8 @@ public class DiscrepancyAnalysis extends DataAnalysis { PrimitiveType p6 = new PrimitiveType(Float.class.getName(), null, PrimitiveTypes.NUMBER, "ComparisonThreshold","the comparison threshold","0.1"); PrimitiveType p7 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "MaxSamples","the comparison threshold","10000"); + PrimitiveType p8 = new PrimitiveType(Float.class.getName(), null, PrimitiveTypes.NUMBER, "KThreshold", "Threshold for K-Statistic: over this threshold values will be considered 1 for agreement calculation. Default is 0.5","0.5"); + parameters.add(p1); parameters.add(p2); parameters.add(p3); @@ -68,6 +86,7 @@ public class DiscrepancyAnalysis extends DataAnalysis { parameters.add(p13); parameters.add(p6); parameters.add(p7); + parameters.add(p8); DatabaseType.addDefaultDBPars(parameters); return parameters; @@ -84,6 +103,15 @@ public class DiscrepancyAnalysis extends DataAnalysis { String FirstTable = config.getParam("FirstTable"); String SecondTable = config.getParam("SecondTable"); String maxSamples = config.getParam("MaxSamples"); + + String kthresholdString = config.getParam("KThreshold"); + kthreshold = 0.5; + try{ + kthreshold = Double.parseDouble(kthresholdString); + }catch(Exception e){} + + AnalysisLogger.getLogger().trace("Using Cohen's Kappa Threshold: "+kthreshold); + int maxCompElements = maxElements; if (maxSamples!=null && maxSamples.length()>0){ int maxx = Integer.parseInt(maxSamples); @@ -104,20 +132,18 @@ public class DiscrepancyAnalysis extends DataAnalysis { output.put("NUMBER_OF_ERRORS", "0"); output.put("NUMBER_OF_COMPARISONS", "" + numberofvectors); output.put("ACCURACY", "100.0"); - output.put("MAXIMUM_ERROR", "-"); + output.put("MAXIMUM_ERROR", "0"); output.put("MAXIMUM_ERROR_POINT", "-"); + output.put("COHENS_KAPPA", "1"); + output.put("COHENS_KAPPA_CLASSIFICATION_LANDIS_KOCH", MathFunctions.kappaClassificationLandisKoch(1)); + output.put("COHENS_KAPPA_CLASSIFICATION_FLEISS", MathFunctions.kappaClassificationFleiss(1)); + output.put("TREND", "STATIONARY"); + return output; } -// String query = String.format(discrepancyQuery, FirstTableCsquareColumn, SecondTableCsquareColumn, FirstTableProbabilityColumn, SecondTableProbabilityColumn, FirstTable, SecondTable); -// List nelementsQ = DatabaseFactory.executeSQLQuery(DatabaseUtils.countElementsStatement(FirstTable),connection); -// int nelements = Integer.parseInt(""+nelementsQ.get(0)); - -// int nelements = Math.min(Operations.calcNumOfRepresentativeElements(nPoints, minElements),maxCompElements); - int nelements = nPoints; - - AnalysisLogger.getLogger().trace("Number Of Elements to take: "+nelements); - String query = String.format(discrepancyQuery, FirstTableCsquareColumn, SecondTableCsquareColumn, FirstTableProbabilityColumn, SecondTableProbabilityColumn, FirstTable, SecondTable,""+nelements); + AnalysisLogger.getLogger().trace("Number Of Elements to take: "+numberofvectors); + String query = String.format(discrepancyQuery, FirstTableCsquareColumn, SecondTableCsquareColumn, FirstTableProbabilityColumn, SecondTableProbabilityColumn, FirstTable, SecondTable,""+numberofvectors); AnalysisLogger.getLogger().debug("Discrepancy Calculation - Query to perform :" + query); List takePoints = DatabaseFactory.executeSQLQuery(query, connection); @@ -129,21 +155,29 @@ public class DiscrepancyAnalysis extends DataAnalysis { threshold = Float.parseFloat(config.getParam("ComparisonThreshold")); analyzeCompareList(takePoints); calcDiscrepancy(); - + + float accuracy = 100; + if (processedRecords>0) + accuracy = (1 - (float) numberoferrors / (float) numberofcomparisons) * 100; + + if (maxdiscrepancyPoint==null) + maxdiscrepancyPoint="-"; + + double kappa = MathFunctions.cohensKappaForDichotomy(agreementA1B1, agreementA1B0, agreementA0B1, agreementA0B0); + AnalysisLogger.getLogger().debug("Discrepancy Calculation - Calculated Cohen's Kappa:" + kappa); + output = new LinkedHashMap(); output.put("MEAN", "" + MathFunctions.roundDecimal(mean,2)); output.put("VARIANCE", "" + MathFunctions.roundDecimal(variance,2)); output.put("NUMBER_OF_ERRORS", "" + numberoferrors); - output.put("NUMBER_OF_COMPARISONS", "" + nelements); - - float accuracy = 100; - if (processedRecords>0) - accuracy = (1 - (float) numberoferrors / (float) nelements) * 100; - - + output.put("NUMBER_OF_COMPARISONS", "" + numberofcomparisons); output.put("ACCURACY", "" + MathFunctions.roundDecimal(accuracy,2)); output.put("MAXIMUM_ERROR", "" + MathFunctions.roundDecimal(maxerror,2)); - output.put("MAXIMUM_ERROR_POINT", "" + maxdiscrepancyPoint); + output.put("MAXIMUM_ERROR_POINT", maxdiscrepancyPoint); + output.put("COHENS_KAPPA", "" + kappa); + output.put("COHENS_KAPPA_CLASSIFICATION_LANDIS_KOCH", MathFunctions.kappaClassificationLandisKoch(kappa)); + output.put("COHENS_KAPPA_CLASSIFICATION_FLEISS", MathFunctions.kappaClassificationFleiss(kappa)); + if (numLower>numHigher) output.put("TREND", "CONTRACTION"); else if (numLower points) { errors = new ArrayList(); if (points != null) { maxerror = 0; for (Object vector : points) { + //number of comparison equals to the aggregation + numberofcomparisons++; + Object[] elements = (Object[]) vector; String csquare = (String) elements[0]; float probabilityPoint1 = 0; @@ -203,6 +239,16 @@ public class DiscrepancyAnalysis extends DataAnalysis { else if (probabilityPoint2=kthreshold) && (probabilityPoint2>=kthreshold)) + agreementA1B1++; + else if ((probabilityPoint1=kthreshold) && (probabilityPoint2=kthreshold)) + agreementA0B1++; } }