ecological-engine/src/main/java/org/gcube/dataanalysis/ecoengine/evaluation/HabitatRepresentativeness.java

package org.gcube.dataanalysis.ecoengine.evaluation;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.gcube.contentmanagement.lexicalmatcher.utils.MathFunctions;
import org.gcube.dataanalysis.ecoengine.configuration.AlgorithmConfiguration;
import org.gcube.dataanalysis.ecoengine.datatypes.ColumnTypesList;
import org.gcube.dataanalysis.ecoengine.datatypes.DatabaseType;
import org.gcube.dataanalysis.ecoengine.datatypes.InputTable;
import org.gcube.dataanalysis.ecoengine.datatypes.PrimitiveType;
import org.gcube.dataanalysis.ecoengine.datatypes.StatisticalType;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.PrimitiveTypes;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.TableTemplates;
import org.gcube.dataanalysis.ecoengine.interfaces.DataAnalysis;
import org.gcube.dataanalysis.ecoengine.models.cores.pca.PrincipalComponentAnalysis;
import org.gcube.dataanalysis.ecoengine.utils.DatabaseFactory;
import org.gcube.dataanalysis.ecoengine.utils.Operations;
import org.gcube.dataanalysis.ecoengine.utils.Transformations;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class HabitatRepresentativeness extends DataAnalysis {

    private static Logger logger = LoggerFactory.getLogger(HabitatRepresentativeness.class);
    static String getNumberOfElementsQuery = "select count(*) from %1$s #OPTIONAL#";
    static String getRandomVectors = "select %1$s from %2$s #OPTIONAL# order by RANDOM() limit %3$s";
    // static String getRandomVectors = "select %1$s from %2$s #OPTIONAL# limit %3$s";
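    // For example, with features "depthmean,sstanmean", table "hcaf_d" (names are
    // illustrative), the optional condition "where oceanarea>0" and a limit of 10000,
    // the random-sampling template above resolves to:
    //   select depthmean,sstanmean from hcaf_d where oceanarea>0 order by RANDOM() limit 10000
    // Note that "order by RANDOM()" makes PostgreSQL sort the whole filtered table before
    // applying the limit; the commented-out template was presumably a cheaper, non-random
    // alternative.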
    String configPath = "./cfg/";
    private LinkedHashMap<String, String> output;
    private static int minimumNumberToTake = 10000;
    private float status;
    private int currentIterationStep;
    private float innerstatus;
    private int maxTests = 2;
    public List<StatisticalType> getInputParameters() {
        List<StatisticalType> parameters = new ArrayList<StatisticalType>();
        List<TableTemplates> templates = new ArrayList<TableTemplates>();
        templates.add(TableTemplates.HCAF);
        templates.add(TableTemplates.TRAININGSET);
        templates.add(TableTemplates.TESTSET);
        List<TableTemplates> templatesOccurrences = new ArrayList<TableTemplates>();
        templatesOccurrences.add(TableTemplates.OCCURRENCE_AQUAMAPS);
        templatesOccurrences.add(TableTemplates.TRAININGSET);
        templatesOccurrences.add(TableTemplates.TESTSET);
        InputTable p1 = new InputTable(templates, "ProjectingAreaTable", "A table containing projecting area information");
        PrimitiveType p2 = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, "OptionalCondition", "optional filter for taking area rows", "where oceanarea>0", true);
        InputTable p3 = new InputTable(templates, "PositiveCasesTable", "A table containing positive cases");
        InputTable p4 = new InputTable(templates, "NegativeCasesTable", "A table containing negative cases");
        // PrimitiveType p5 = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, "FeaturesColumns", "features columns names separated by comma", "depthmean,depthmax,depthmin, sstanmean,sbtanmean,salinitymean,salinitybmean, primprodmean,iceconann,landdist,oceanarea");
        ColumnTypesList p5 = new ColumnTypesList("PositiveCasesTable", "FeaturesColumns", "Features columns", false);
        parameters.add(p1);
        parameters.add(p2);
        parameters.add(p3);
        parameters.add(p4);
        parameters.add(p5);
        DatabaseType.addDefaultDBPars(parameters);
        return parameters;
    }
    public List<String> getOutputParameters() {
        List<String> outputs = new ArrayList<String>();
        outputs.add("HRS_VECTOR");
        outputs.add("HRS");
        return outputs;
    }
    // counts the rows available in the given table, honoring the optional filter clause
    private int calculateNumberOfPoints(String table, String option) {
        String numberOfPositiveCasesQuery = String.format(getNumberOfElementsQuery, table);
        numberOfPositiveCasesQuery = numberOfPositiveCasesQuery.replace("#OPTIONAL#", (option != null) ? option : "");
        List<Object> totalPoints = DatabaseFactory.executeSQLQuery(numberOfPositiveCasesQuery, connection);
        int points = Integer.parseInt("" + totalPoints.get(0));
        return points;
    }
    // samples up to numberOfElemsToTake random rows from the table and returns them as a
    // points-by-features matrix; null feature values are mapped to 0
    private double[][] getPoints(String table, String option, String features, int numberOfElemsToTake) {
        String query = String.format(getRandomVectors, features, table, "" + numberOfElemsToTake);
        query = query.replace("#OPTIONAL#", (option != null) ? option : "");
        logger.trace("Compare - Query to perform for points: " + query);
        List<Object> caughtpoints = DatabaseFactory.executeSQLQuery(query, connection);
        int size = 0;
        if (caughtpoints != null)
            size = caughtpoints.size();
        double[][] points = null;
        if (size > 0) {
            points = new double[size][((Object[]) caughtpoints.get(0)).length];
            for (int i = 0; i < size; i++) {
                if (caughtpoints.get(i) != null) {
                    Object[] arrayFeatures = (Object[]) caughtpoints.get(i);
                    for (int j = 0; j < arrayFeatures.length; j++) {
                        double delement = arrayFeatures[j] == null ? 0d : Double.parseDouble("" + arrayFeatures[j]);
                        points[i][j] = delement;
                    }
                }
            }
        }
        return points;
    }
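    // Example with illustrative values: for features "depthmean,sstanmean" and 3 sampled
    // rows, getPoints returns a 3x2 matrix such as
    //   { {-120.5, 14.2}, {-3400.0, 2.1}, {0.0, 27.9} }
    // where the 0.0 stands for a null depthmean in the corresponding source row.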
    private void calcHRS(String projectingAreaTable, String projectingAreaFeaturesOptionalCondition, String FeaturesColumns, String positiveCasesTable, String negativeCasesTable, int numberOfElements) throws Exception {
        innerstatus = 0f;
        int numberOfElementsToTake = Operations.calcNumOfRepresentativeElements(numberOfElements, minimumNumberToTake);
        logger.trace("HRS: TAKING " + numberOfElementsToTake + " POINTS ON " + numberOfElements + " FROM THE AREA UNDER ANALYSIS");
        // 1 - take the right number of points
        double[][] areaPoints = getPoints(projectingAreaTable, projectingAreaFeaturesOptionalCondition, FeaturesColumns, numberOfElementsToTake);
        logger.trace("HRS: AREA POINTS MATRIX GENERATED");
        innerstatus = 10f;
        Operations operations = new Operations();
        // 2 - standardize the matrix
        areaPoints = operations.standardize(areaPoints);
        logger.trace("HRS: MATRIX HAS BEEN STANDARDIZED");
        innerstatus = 20f;
        // 3 - calculate the PCA
        PrincipalComponentAnalysis pca = new PrincipalComponentAnalysis();
        pca.calcPCA(areaPoints);
        logger.trace("HRS: PCA HAS BEEN TRAINED");
        innerstatus = 30f;
        // 4 - get the PCA components for all the vectors
        double[][] pcaComponents = pca.getComponentsMatrix(areaPoints);
        logger.trace("HRS: PCA COMPONENTS CALCULATED");
        innerstatus = 40f;
        // 5 - calculate the frequency distributions for the PCA components: each row will be a frequency distribution for one component over a uniform division of its range
        calcFrequenciesDistributionsForComponents(pcaComponents);
        logger.trace("HRS: FREQUENCIES FOR COMPONENTS CALCULATED");
        innerstatus = 50f;
        // 6 - take positive points and negative points, then merge them
        double[][] positivePoints = null;
        if ((positiveCasesTable != null) && (positiveCasesTable.length() > 0))
            positivePoints = getPoints(positiveCasesTable, "", FeaturesColumns, numberOfElementsToTake);
        double[][] negativePoints = null;
        if ((negativeCasesTable != null) && (negativeCasesTable.length() > 0))
            negativePoints = getPoints(negativeCasesTable, "", FeaturesColumns, numberOfElementsToTake);
        double[][] habitatPoints = Transformations.mergeMatrixes(positivePoints, negativePoints);
        logger.trace("HRS: HABITAT POINTS BUILT FROM POSITIVE AND NEGATIVE POINTS");
        innerstatus = 60f;
        // 7 - standardize the points with respect to the previous means and variances
        habitatPoints = operations.standardize(habitatPoints, operations.means, operations.variances);
        logger.trace("HRS: HABITAT POINTS HAVE BEEN STANDARDIZED WITH RESPECT TO PREVIOUS MEANS AND VARIANCES");
        // 8 - calculate the PCA components for the habitat points
        double[][] habitatPcaComponents = pca.getComponentsMatrix(habitatPoints);
        logger.trace("HRS: HABITAT POINTS HAVE BEEN TRANSFORMED BY PCA");
        innerstatus = 70f;
        // 9 - calculate the frequency distributions for each component, with respect to the previous intervals
        int components = habitatPcaComponents[0].length;
        // 10 - calculate absolute differences and sum them -> obtain an HRS for each PCA component, i.e. for each feature
        currentHRSVector = new double[components];
        double[][] habitatPcaPointsMatrix = Transformations.traspose(habitatPcaComponents);
        for (int i = 0; i < components; i++) {
            double[] habitatPcaPoints = habitatPcaPointsMatrix[i];
            // calculate the frequency distribution with respect to the previous intervals
            double[] habitatPcafrequencies = Operations.calcFrequencies(intervals.get(i), habitatPcaPoints);
            habitatPcafrequencies = Operations.normalizeFrequencies(habitatPcafrequencies, habitatPcaPoints.length);
            double[] absdifference = Operations.vectorialAbsoluteDifference(habitatPcafrequencies, frequencyDistrib.get(i));
            currentHRSVector[i] = Operations.sumVector(absdifference);
        }
        logger.trace("HRS: HRS VECTOR HAS BEEN CALCULATED");
        innerstatus = 90f;
        // 11 - obtain the HRS score as a weighted sum of the per-component HRS with respect to the inverse eigenvalues - too variable, so it was replaced with the plain sum of the scores
        // currentHRSScore = Operations.scalarProduct(currentHRSVector, pca.getInverseNormalizedEigenvalues());
        currentHRSScore = Operations.sumVector(currentHRSVector);
        logger.trace("HRS: HRS SCORE HAS BEEN CALCULATED");
        innerstatus = 100f;
    }
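    // In formulas, the loop above computes, for each PCA component i with the bin set B(i)
    // fixed on the area sample:
    //   HRS_i = sum over b in B(i) of | F_habitat,i(b) - F_area,i(b) |
    // where F_area,i and F_habitat,i are the normalized bin frequencies of the area and
    // habitat points, and the overall score is HRS = sum over i of HRS_i. A component on
    // which the habitat sample mirrors the area distribution therefore contributes ~0.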
    private double meanHRS;
    private double[] meanHRSVector;
    private double currentHRSScore;
    private double[] currentHRSVector;
    public LinkedHashMap<String, String> analyze() throws Exception {
        try {
            status = 0;
            String projectingAreaTable = config.getParam("ProjectingAreaTable");
            String projectingAreaFeaturesOptionalCondition = config.getParam("OptionalCondition");
            String FeaturesColumns = config.getParam("FeaturesColumns").replace(AlgorithmConfiguration.getListSeparator(), ",");
            String positiveCasesTable = config.getParam("PositiveCasesTable");
            String negativeCasesTable = config.getParam("NegativeCasesTable");
            connection = AlgorithmConfiguration.getConnectionFromConfig(config);
            meanHRS = 0;
            int numberOfElements = calculateNumberOfPoints(projectingAreaTable, projectingAreaFeaturesOptionalCondition);
            for (int i = 0; i < maxTests; i++) {
                currentIterationStep = i;
                logger.trace("ITERATION NUMBER " + (i + 1));
                calcHRS(projectingAreaTable, projectingAreaFeaturesOptionalCondition, FeaturesColumns, positiveCasesTable, negativeCasesTable, numberOfElements);
                meanHRS = MathFunctions.incrementAvg(meanHRS, currentHRSScore, i);
                if (meanHRSVector == null)
                    meanHRSVector = new double[currentHRSVector.length];
                for (int j = 0; j < currentHRSVector.length; j++) {
                    meanHRSVector[j] = org.gcube.contentmanagement.graphtools.utils.MathFunctions.roundDecimal(MathFunctions.incrementAvg(meanHRSVector[j], currentHRSVector[j], i), 2);
                }
                logger.trace("ITERATION FINISHED " + meanHRS);
                status = Math.min(status + 100f / maxTests, 99f);
            }
            output = new LinkedHashMap<String, String>();
            output.put("HRS_VECTOR", "" + Transformations.vector2String(meanHRSVector));
            output.put("HRS", "" + org.gcube.contentmanagement.graphtools.utils.MathFunctions.roundDecimal(meanHRS, 2));
            return output;
        } catch (Exception e) {
            logger.error("ALERT: AN ERROR OCCURRED DURING HRS CALCULATION: " + e.getLocalizedMessage(), e);
            throw e;
        } finally {
            status = 100;
            logger.trace("COMPUTATION FINISHED");
        }
    }
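    // The loop above maintains running means over the maxTests repetitions; assuming
    // MathFunctions.incrementAvg implements the standard incremental mean, after the
    // 0-based iteration i it holds:
    //   avg_{i+1} = (avg_i * i + x_i) / (i + 1)
    // so repeating the random sampling smooths out the variance of any single draw.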
    List<double[]> frequencyDistrib;
    List<double[]> intervals; // uniform subdivisions of the numeric ranges

    // calculate a frequency distribution for each PCA component, storing both the bin
    // boundaries (intervals) and the normalized frequencies (frequencyDistrib)
    public void calcFrequenciesDistributionsForComponents(double[][] pcaComponents) {
        frequencyDistrib = null;
        if (pcaComponents.length > 0) {
            int sizeDistrib = pcaComponents[0].length;
            frequencyDistrib = new ArrayList<double[]>();
            intervals = new ArrayList<double[]>();
            double[][] pcaColumns = Transformations.traspose(pcaComponents);
            for (int i = 0; i < sizeDistrib; i++) {
                double[] pcaPoints = pcaColumns[i];
                double[] interval = Operations.uniformDivide(Operations.getMax(pcaPoints), Operations.getMin(pcaPoints), pcaPoints);
                double[] frequencies = Operations.calcFrequencies(interval, pcaPoints);
                frequencies = Operations.normalizeFrequencies(frequencies, pcaPoints.length);
                intervals.add(interval);
                frequencyDistrib.add(frequencies);
            }
        }
    }
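    // Illustrative example (values are made up): if component i ranges over [-2.0, 2.0]
    // and uniformDivide yields the bin boundaries {-1.0, 0.0, 1.0, 2.0}, then 40 of 100
    // points falling in the first bin produce a normalized frequency of 0.4 for that bin.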
    public static void visualizeResults(HashMap<String, Object> results) {
        for (String key : results.keySet()) {
            System.out.println(key + ":" + results.get(key));
        }
    }
    @Override
    public StatisticalType getOutput() {
        PrimitiveType p = new PrimitiveType(Map.class.getName(), PrimitiveType.stringMap2StatisticalMap(output), PrimitiveTypes.MAP, "AnalysisResult", "Habitat Representativeness Score");
        return p;
    }
    @Override
    public float getStatus() {
        // overall progress combines the per-iteration progress (innerstatus) with the
        // number of completed iterations, and is capped at 99% until analyze() finishes
        return status == 100f ? status : Math.min(status + (float) (currentIterationStep + 1) * innerstatus / (float) maxTests, 99f);
    }
    @Override
    public String getDescription() {
        return "An evaluator algorithm that calculates the Habitat Representativeness Score, i.e. an indicator assessing whether a survey coverage, or another dataset of environmental features, contains data that are representative of all the habitat variable combinations available in an area.";
    }
}
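
// A minimal usage sketch (hypothetical: it assumes AlgorithmConfiguration exposes a
// setParam counterpart to the getParam calls used above, and a setter for the inherited
// config field; the table and column names are illustrative):
//
//   AlgorithmConfiguration config = new AlgorithmConfiguration();
//   config.setParam("ProjectingAreaTable", "hcaf_d");
//   config.setParam("OptionalCondition", "where oceanarea>0");
//   config.setParam("PositiveCasesTable", "presence_points");
//   config.setParam("NegativeCasesTable", "absence_points");
//   config.setParam("FeaturesColumns", "depthmean" + AlgorithmConfiguration.getListSeparator() + "sstanmean");
//
//   HabitatRepresentativeness hrs = new HabitatRepresentativeness();
//   hrs.setConfiguration(config); // hypothetical setter
//   LinkedHashMap<String, String> out = hrs.analyze();
//   System.out.println(out.get("HRS"));        // mean score over the iterations
//   System.out.println(out.get("HRS_VECTOR")); // mean per-component scores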