ecological-engine/src/main/java/org/gcube/dataanalysis/ecoengine/evaluation/DiscrepancyAnalysis.java

268 lines
11 KiB
Java

package org.gcube.dataanalysis.ecoengine.evaluation;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import org.gcube.contentmanagement.graphtools.utils.MathFunctions;
import org.gcube.dataanalysis.ecoengine.datatypes.ColumnType;
import org.gcube.dataanalysis.ecoengine.datatypes.DatabaseType;
import org.gcube.dataanalysis.ecoengine.datatypes.InputTable;
import org.gcube.dataanalysis.ecoengine.datatypes.PrimitiveType;
import org.gcube.dataanalysis.ecoengine.datatypes.StatisticalType;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.PrimitiveTypes;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.TableTemplates;
import org.gcube.dataanalysis.ecoengine.interfaces.DataAnalysis;
import org.gcube.dataanalysis.ecoengine.utils.DatabaseFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Evaluator that compares two tables of probability values (e.g. HSPEC distributions) joined on a
 * csquare (geographical cell) column. For each matched cell it measures the absolute discrepancy
 * between the two probabilities and reports: mean/variance of the discrepancies above a threshold,
 * accuracy, maximum error and its location, Cohen's Kappa agreement (with Landis-Koch and Fleiss
 * classifications) and an overall trend (EXPANSION / CONTRACTION / STATIONARY).
 */
public class DiscrepancyAnalysis extends DataAnalysis {

	private static Logger logger = LoggerFactory.getLogger(DiscrepancyAnalysis.class);

	// Joins the two tables on their csquare columns. Each side is first de-duplicated, capped at
	// %7$s rows and aggregated by csquare (avg of the probability column); a LEFT join keeps every
	// csquare of the first table even when the second table has no matching row.
	static String discrepancyQuery = "select * from (select distinct a.%1$s as csquareone,b.%2$s as csquaretwo,a.%3$s as firstprob,b.%4$s as secondprob from " +
			"(select %1$s , avg(%3$s) as %3$s from (select distinct * from %5$s order by %1$s limit %7$s) as aa group by %1$s) as a " +
			"left join " +
			"(select %2$s , avg(%4$s) as %4$s from (select distinct * from %6$s order by %2$s limit %7$s) as aa group by %2$s) as b " +
			"on a.%1$s=b.%2$s) as sel";

	static String getNumberOfElementsQuery = "select count(*) from %1$s";

	// NOTE(review): minElements is currently unused; kept for compatibility with external readers.
	private static int minElements = 100;
	// Default cap on the number of rows taken from each table when MaxSamples is not supplied.
	private static int maxElements = 30000;

	float threshold = 0.1f;          // discrepancies below this value are not counted as errors
	String configPath = "./cfg/";
	List<Float> errors;              // discrepancies above threshold, filled by analyzeCompareList
	double mean;                     // mean of the collected discrepancies
	double variance;                 // variance of the collected discrepancies
	double kthreshold;               // probability cutoff for the dichotomous Kappa classification
	long numberoferrors;             // comparisons whose discrepancy exceeded the threshold
	long numberofvectors;            // max(row count of first table, row count of second table)
	long numberofcomparisons;        // rows returned by the join, i.e. cells actually compared
	float maxerror;                  // largest discrepancy observed
	String maxdiscrepancyPoint;      // csquare code where the maximum discrepancy was found
	long numHigher = 0;              // cells where the second probability exceeds the first
	long numLower = 0;               // cells where the second probability is below the first
	long agreementA1B1 = 0;          // both tables classify presence (prob >= kthreshold)
	long agreementA0B0 = 0;          // both tables classify absence  (prob <  kthreshold)
	long agreementA1B0 = 0;          // first presence, second absence
	long agreementA0B1 = 0;          // first absence, second presence

	private LinkedHashMap<String, String> output;

	/**
	 * Declares the algorithm's input parameters: the two tables, their csquare and probability
	 * column names, the comparison threshold, the sampling cap and the Kappa threshold, plus the
	 * default database connection parameters.
	 *
	 * @return the ordered list of input parameter descriptors
	 */
	@Override
	public List<StatisticalType> getInputParameters() {
		List<StatisticalType> parameters = new ArrayList<StatisticalType>();
		List<TableTemplates> templates = new ArrayList<TableTemplates>();
		templates.add(TableTemplates.HSPEC);
		templates.add(TableTemplates.TRAININGSET);
		templates.add(TableTemplates.TESTSET);
		InputTable p1 = new InputTable(templates, "FirstTable", "First Table");
		InputTable p2 = new InputTable(templates, "SecondTable", "Second Table");
		ColumnType p3 = new ColumnType("FirstTable", "FirstTableCsquareColumn", "the csquares column name in the first table", "csquarecode", false);
		ColumnType p4 = new ColumnType("SecondTable", "SecondTableCsquareColumn", "the csquares column name in the second table", "csquarecode", false);
		ColumnType p5 = new ColumnType("FirstTable", "FirstTableProbabilityColumn", "the probability column in the first table", "probability", false);
		ColumnType p13 = new ColumnType("SecondTable", "SecondTableProbabilityColumn", "the probability column in the second table", "probability", false);
		PrimitiveType p6 = new PrimitiveType(Float.class.getName(), null, PrimitiveTypes.NUMBER, "ComparisonThreshold", "the comparison threshold", "0.1");
		// BUGFIX: description previously duplicated the ComparisonThreshold text.
		PrimitiveType p7 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "MaxSamples", "the maximum number of samples to take from each table (0 means no limit)", "10000");
		PrimitiveType p8 = new PrimitiveType(Float.class.getName(), null, PrimitiveTypes.NUMBER, "KThreshold", "Threshold for K-Statistic: over this threshold values will be considered 1 for agreement calculation. Default is 0.5", "0.5");
		parameters.add(p1);
		parameters.add(p2);
		parameters.add(p3);
		parameters.add(p4);
		parameters.add(p5);
		parameters.add(p13);
		parameters.add(p6);
		parameters.add(p7);
		parameters.add(p8);
		DatabaseType.addDefaultDBPars(parameters);
		return parameters;
	}

	/**
	 * Runs the discrepancy analysis: counts the rows of both tables, joins them on the csquare
	 * columns, collects per-cell discrepancies and derives the aggregate indicators.
	 *
	 * @return an ordered map of indicator name to value (MEAN, VARIANCE, ACCURACY, COHENS_KAPPA, TREND, ...)
	 * @throws Exception if the database queries fail
	 */
	@Override
	public LinkedHashMap<String, String> analyze() throws Exception {
		String FirstTableCsquareColumn = config.getParam("FirstTableCsquareColumn");
		String SecondTableCsquareColumn = config.getParam("SecondTableCsquareColumn");
		String FirstTableProbabilityColumn = config.getParam("FirstTableProbabilityColumn");
		String SecondTableProbabilityColumn = config.getParam("SecondTableProbabilityColumn");
		String FirstTable = config.getParam("FirstTable");
		String SecondTable = config.getParam("SecondTable");
		String maxSamples = config.getParam("MaxSamples");
		String kthresholdString = config.getParam("KThreshold");
		kthreshold = 0.5;
		try {
			kthreshold = Double.parseDouble(kthresholdString);
		} catch (Exception e) {
			// BUGFIX: previously swallowed silently; keep the default but say why.
			logger.warn("KThreshold '{}' is not a valid number, using default {}", kthresholdString, kthreshold);
		}
		logger.trace("Using Cohen's Kappa Threshold: " + kthreshold);
		// MaxSamples == 0 means "no explicit cap"; absent/empty falls back to maxElements.
		int maxCompElements = maxElements;
		if (maxSamples != null && maxSamples.length() > 0) {
			int maxx = Integer.parseInt(maxSamples);
			maxCompElements = maxx != 0 ? maxx : Integer.MAX_VALUE;
		}
		List<Object> takeNPoints = DatabaseFactory.executeSQLQuery(String.format(getNumberOfElementsQuery, FirstTable), connection);
		List<Object> takeMPoints = DatabaseFactory.executeSQLQuery(String.format(getNumberOfElementsQuery, SecondTable), connection);
		int nPoints = Integer.parseInt("" + takeNPoints.get(0));
		int mPoints = Integer.parseInt("" + takeMPoints.get(0));
		numberofvectors = Math.max(nPoints, mPoints);
		// Identical tables trivially agree everywhere: report a perfect, stationary comparison.
		if (FirstTable.equals(SecondTable)) {
			output = new LinkedHashMap<String, String>();
			output.put("MEAN", "0.0");
			output.put("VARIANCE", "0.0");
			output.put("NUMBER_OF_ERRORS", "0");
			output.put("NUMBER_OF_COMPARISONS", "" + numberofvectors);
			output.put("ACCURACY", "100.0");
			output.put("RELATIVE ERROR", "-");
			output.put("MAXIMUM_ERROR", "0");
			output.put("MAXIMUM_ERROR_POINT", "-");
			output.put("COHENS_KAPPA", "1");
			output.put("COHENS_KAPPA_CLASSIFICATION_LANDIS_KOCH", MathFunctions.kappaClassificationLandisKoch(1));
			output.put("COHENS_KAPPA_CLASSIFICATION_FLEISS", MathFunctions.kappaClassificationFleiss(1));
			output.put("TREND", "STATIONARY");
			return output;
		}
		// BUGFIX: maxCompElements was computed but never used — the MaxSamples parameter was
		// silently ignored. Honour it by capping the per-table row limit when it is supplied.
		long limit = numberofvectors;
		if (maxSamples != null && maxSamples.length() > 0)
			limit = Math.min(limit, maxCompElements);
		logger.trace("Number Of Elements to take: " + limit);
		String query = String.format(discrepancyQuery, FirstTableCsquareColumn, SecondTableCsquareColumn, FirstTableProbabilityColumn, SecondTableProbabilityColumn, FirstTable, SecondTable, "" + limit);
		logger.debug("Discrepancy Calculation - Query to perform :" + query);
		List<Object> takePoints = DatabaseFactory.executeSQLQuery(query, connection);
		super.processedRecords = 0;
		if (takePoints != null)
			super.processedRecords = takePoints.size();
		String thresholdString = config.getParam("ComparisonThreshold");
		try {
			threshold = Float.parseFloat(thresholdString);
		} catch (Exception e) {
			// BUGFIX: previously an absent/invalid parameter crashed the analysis with an
			// unhandled NPE/NumberFormatException; fall back to the field default instead.
			logger.warn("ComparisonThreshold '{}' is not a valid number, using default {}", thresholdString, threshold);
		}
		analyzeCompareList(takePoints);
		calcDiscrepancy();
		float accuracy = 100;
		if (processedRecords > 0)
			accuracy = (1 - (float) numberoferrors / (float) numberofcomparisons) * 100;
		if (maxdiscrepancyPoint == null)
			maxdiscrepancyPoint = "-";
		logger.debug("Discrepancy Calculation - Kappa values: " + "agreementA1B1 " + agreementA1B1 + " agreementA1B0 " + agreementA1B0 + " agreementA0B1 " + agreementA0B1 + " agreementA0B0 " + agreementA0B0);
		double kappa = MathFunctions.cohensKappaForDichotomy(agreementA1B1, agreementA1B0, agreementA0B1, agreementA0B0);
		logger.debug("Discrepancy Calculation - Calculated Cohen's Kappa:" + kappa);
		output = new LinkedHashMap<String, String>();
		output.put("MEAN", "" + MathFunctions.roundDecimal(mean, 2));
		output.put("VARIANCE", "" + MathFunctions.roundDecimal(variance, 2));
		output.put("NUMBER_OF_ERRORS", "" + numberoferrors);
		output.put("NUMBER_OF_COMPARISONS", "" + numberofcomparisons);
		output.put("ACCURACY", "" + MathFunctions.roundDecimal(accuracy, 2));
		// BUGFIX: mean/maxerror produced NaN when no discrepancy exceeded the threshold
		// (maxerror == 0); report "-" as the identical-tables branch already does.
		if (maxerror > 0)
			output.put("RELATIVE ERROR", "" + MathFunctions.roundDecimal(mean / maxerror, 2));
		else
			output.put("RELATIVE ERROR", "-");
		output.put("MAXIMUM_ERROR", "" + MathFunctions.roundDecimal(maxerror, 2));
		output.put("MAXIMUM_ERROR_POINT", maxdiscrepancyPoint);
		output.put("COHENS_KAPPA", "" + kappa);
		output.put("COHENS_KAPPA_CLASSIFICATION_LANDIS_KOCH", MathFunctions.kappaClassificationLandisKoch(kappa));
		output.put("COHENS_KAPPA_CLASSIFICATION_FLEISS", MathFunctions.kappaClassificationFleiss(kappa));
		// Trend: does the second table raise or lower probabilities overall?
		if (numLower > numHigher)
			output.put("TREND", "CONTRACTION");
		else if (numLower < numHigher)
			output.put("TREND", "EXPANSION");
		else
			output.put("TREND", "STATIONARY");
		return output;
	}

	/**
	 * Computes mean and variance of the discrepancies collected by {@link #analyzeCompareList};
	 * both are left at 0 when no discrepancy exceeded the threshold.
	 */
	void calcDiscrepancy() {
		double[] err = new double[errors.size()];
		int i = 0;
		for (Float e : errors) {
			err[i] = e;
			i++;
		}
		mean = 0;
		variance = 0;
		if (err.length > 0) {
			mean = MathFunctions.mean(err);
			variance = com.rapidminer.tools.math.MathFunctions.variance(err, Double.NEGATIVE_INFINITY);
		}
	}

	/**
	 * Scans the join result rows (csquare, csquare, firstprob, secondprob), collecting the
	 * discrepancies above {@link #threshold}, the maximum error and its location, the trend
	 * counters and the four Cohen's Kappa agreement cells.
	 *
	 * @param points rows produced by {@code discrepancyQuery}; may be {@code null}
	 */
	public void analyzeCompareList(List<Object> points) {
		errors = new ArrayList<Float>();
		// BUGFIX: reset all accumulators so repeated invocations on the same instance do not
		// add onto stale counts (previously only the errors list was re-initialized).
		numberofcomparisons = 0;
		numberoferrors = 0;
		maxerror = 0;
		maxdiscrepancyPoint = null;
		numHigher = 0;
		numLower = 0;
		agreementA1B1 = 0;
		agreementA0B0 = 0;
		agreementA1B0 = 0;
		agreementA0B1 = 0;
		if (points != null) {
			for (Object vector : points) {
				// number of comparisons equals the number of aggregated join rows
				numberofcomparisons++;
				Object[] elements = (Object[]) vector;
				String csquare = (String) elements[0];
				// null probabilities (unmatched left-join rows, missing values) count as 0
				float probabilityPoint1 = 0;
				if (elements[2] != null)
					probabilityPoint1 = Float.parseFloat("" + elements[2]);
				float probabilityPoint2 = 0;
				if (elements[3] != null)
					probabilityPoint2 = Float.parseFloat("" + elements[3]);
				float discrepancy = Math.abs(probabilityPoint2 - probabilityPoint1);
				if (discrepancy > threshold) {
					errors.add(discrepancy);
					numberoferrors++;
					if (discrepancy > maxerror) {
						maxerror = discrepancy;
						maxdiscrepancyPoint = csquare;
					}
					if (probabilityPoint2 > probabilityPoint1)
						numHigher++;
					else if (probabilityPoint2 < probabilityPoint1)
						numLower++;
				}
				// Cohen's Kappa 2x2 contingency cells: the four conditions are mutually
				// exclusive and exhaustive, so a single else-if chain covers them all.
				if ((probabilityPoint1 >= kthreshold) && (probabilityPoint2 >= kthreshold))
					agreementA1B1++;
				else if ((probabilityPoint1 < kthreshold) && (probabilityPoint2 < kthreshold))
					agreementA0B0++;
				else if ((probabilityPoint1 >= kthreshold) && (probabilityPoint2 < kthreshold))
					agreementA1B0++;
				else
					agreementA0B1++;
			}
		}
	}

	/**
	 * @return a human-readable description of the algorithm for the service catalogue
	 */
	@Override
	public String getDescription() {
		return "An evaluator algorithm that compares two tables containing real valued vectors. It drives the comparison by relying on a geographical distance threshold and a threshold for K-Statistic.";
	}
}