ecological-engine/src/main/java/org/gcube/dataanalysis/ecoengine/evaluation/HabitatRepresentativeness.java

package org.gcube.dataanalysis.ecoengine.evaluation;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.gcube.contentmanagement.lexicalmatcher.utils.MathFunctions;
import org.gcube.dataanalysis.ecoengine.configuration.AlgorithmConfiguration;
import org.gcube.dataanalysis.ecoengine.datatypes.ColumnTypesList;
import org.gcube.dataanalysis.ecoengine.datatypes.DatabaseType;
import org.gcube.dataanalysis.ecoengine.datatypes.InputTable;
import org.gcube.dataanalysis.ecoengine.datatypes.PrimitiveType;
import org.gcube.dataanalysis.ecoengine.datatypes.StatisticalType;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.PrimitiveTypes;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.TableTemplates;
import org.gcube.dataanalysis.ecoengine.interfaces.DataAnalysis;
import org.gcube.dataanalysis.ecoengine.models.cores.pca.PrincipalComponentAnalysis;
import org.gcube.dataanalysis.ecoengine.utils.DatabaseFactory;
import org.gcube.dataanalysis.ecoengine.utils.Operations;
import org.gcube.dataanalysis.ecoengine.utils.Transformations;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class HabitatRepresentativeness extends DataAnalysis {

    private static Logger logger = LoggerFactory.getLogger(HabitatRepresentativeness.class);
    static String getNumberOfElementsQuery = "select count(*) from %1$s #OPTIONAL#";
    static String getRandomVectors = "select %1$s from %2$s #OPTIONAL# order by RANDOM() limit %3$s";
    // static String getRandomVectors = "select %1$s from %2$s #OPTIONAL# limit %3$s";
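    // For example, with features "depthmean,sstanmean", table "hcaf_d" (names are
    // illustrative), the optional condition "where oceanarea>0" and a limit of 10000,
    // the random-sampling template above resolves to:
    //   select depthmean,sstanmean from hcaf_d where oceanarea>0 order by RANDOM() limit 10000
    // Note that "order by RANDOM()" makes PostgreSQL sort the whole filtered table before
    // applying the limit; the commented-out template was presumably a cheaper, non-random
    // alternative.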
    String configPath = "./cfg/";
    private LinkedHashMap<String, String> output;
    private static int minimumNumberToTake = 10000;
    private float status;
    private int currentIterationStep;
    private float innerstatus;
    private int maxTests = 2;
    public List<StatisticalType> getInputParameters() {
        List<StatisticalType> parameters = new ArrayList<StatisticalType>();
        List<TableTemplates> templates = new ArrayList<TableTemplates>();
        templates.add(TableTemplates.HCAF);
        templates.add(TableTemplates.TRAININGSET);
        templates.add(TableTemplates.TESTSET);
        List<TableTemplates> templatesOccurrences = new ArrayList<TableTemplates>();
        templatesOccurrences.add(TableTemplates.OCCURRENCE_AQUAMAPS);
        templatesOccurrences.add(TableTemplates.TRAININGSET);
        templatesOccurrences.add(TableTemplates.TESTSET);
        InputTable p1 = new InputTable(templates, "ProjectingAreaTable", "A table containing projecting area information");
        PrimitiveType p2 = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, "OptionalCondition", "optional filter for taking area rows", "where oceanarea>0", true);
        InputTable p3 = new InputTable(templates, "PositiveCasesTable", "A table containing positive cases");
        InputTable p4 = new InputTable(templates, "NegativeCasesTable", "A table containing negative cases");
        // PrimitiveType p5 = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, "FeaturesColumns", "features columns names separated by comma", "depthmean,depthmax,depthmin, sstanmean,sbtanmean,salinitymean,salinitybmean, primprodmean,iceconann,landdist,oceanarea");
        ColumnTypesList p5 = new ColumnTypesList("PositiveCasesTable", "FeaturesColumns", "Features columns", false);
        parameters.add(p1);
        parameters.add(p2);
        parameters.add(p3);
        parameters.add(p4);
        parameters.add(p5);
        DatabaseType.addDefaultDBPars(parameters);
        return parameters;
    }
    public List<String> getOutputParameters() {
        List<String> outputs = new ArrayList<String>();
        outputs.add("HRS_VECTOR");
        outputs.add("HRS");
        return outputs;
    }
    // counts the rows available in the given table, honoring the optional filter clause
    private int calculateNumberOfPoints(String table, String option) {
        String numberOfPositiveCasesQuery = String.format(getNumberOfElementsQuery, table);
        numberOfPositiveCasesQuery = numberOfPositiveCasesQuery.replace("#OPTIONAL#", (option != null) ? option : "");
        List<Object> totalPoints = DatabaseFactory.executeSQLQuery(numberOfPositiveCasesQuery, connection);
        int points = Integer.parseInt("" + totalPoints.get(0));
        return points;
    }
    // samples up to numberOfElemsToTake random rows from the table and returns them as a
    // points-by-features matrix; null feature values are mapped to 0
    private double[][] getPoints(String table, String option, String features, int numberOfElemsToTake) {
        String query = String.format(getRandomVectors, features, table, "" + numberOfElemsToTake);
        query = query.replace("#OPTIONAL#", (option != null) ? option : "");
        logger.trace("Compare - Query to perform for points: " + query);
        List<Object> caughtpoints = DatabaseFactory.executeSQLQuery(query, connection);
        int size = 0;
        if (caughtpoints != null)
            size = caughtpoints.size();
        double[][] points = null;
        if (size > 0) {
            points = new double[size][((Object[]) caughtpoints.get(0)).length];
            for (int i = 0; i < size; i++) {
                if (caughtpoints.get(i) != null) {
                    Object[] arrayFeatures = (Object[]) caughtpoints.get(i);
                    for (int j = 0; j < arrayFeatures.length; j++) {
                        double delement = arrayFeatures[j] == null ? 0d : Double.parseDouble("" + arrayFeatures[j]);
                        points[i][j] = delement;
                    }
                }
            }
        }
        return points;
    }
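    // Example with illustrative values: for features "depthmean,sstanmean" and 3 sampled
    // rows, getPoints returns a 3x2 matrix such as
    //   { {-120.5, 14.2}, {-3400.0, 2.1}, {0.0, 27.9} }
    // where the 0.0 stands for a null depthmean in the corresponding source row.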
    private void calcHRS(String projectingAreaTable, String projectingAreaFeaturesOptionalCondition, String FeaturesColumns, String positiveCasesTable, String negativeCasesTable, int numberOfElements) throws Exception {
        innerstatus = 0f;
        int numberOfElementsToTake = Operations.calcNumOfRepresentativeElements(numberOfElements, minimumNumberToTake);
        logger.trace("HRS: TAKING " + numberOfElementsToTake + " POINTS ON " + numberOfElements + " FROM THE AREA UNDER ANALYSIS");
        // 1 - take the right number of points
        double[][] areaPoints = getPoints(projectingAreaTable, projectingAreaFeaturesOptionalCondition, FeaturesColumns, numberOfElementsToTake);
        logger.trace("HRS: AREA POINTS MATRIX GENERATED");
        innerstatus = 10f;
        Operations operations = new Operations();
        // 2 - standardize the matrix
        areaPoints = operations.standardize(areaPoints);
        logger.trace("HRS: MATRIX HAS BEEN STANDARDIZED");
        innerstatus = 20f;
        // 3 - calculate the PCA
        PrincipalComponentAnalysis pca = new PrincipalComponentAnalysis();
        pca.calcPCA(areaPoints);
        logger.trace("HRS: PCA HAS BEEN TRAINED");
        innerstatus = 30f;
        // 4 - get the PCA components for all the vectors
        double[][] pcaComponents = pca.getComponentsMatrix(areaPoints);
        logger.trace("HRS: PCA COMPONENTS CALCULATED");
        innerstatus = 40f;
        // 5 - calculate the frequency distributions for the PCA components: each row will be a frequency distribution for one component over a uniform division of its range
        calcFrequenciesDistributionsForComponents(pcaComponents);
        logger.trace("HRS: FREQUENCIES FOR COMPONENTS CALCULATED");
        innerstatus = 50f;
        // 6 - take positive points and negative points, then merge them
        double[][] positivePoints = null;
        if ((positiveCasesTable != null) && (positiveCasesTable.length() > 0))
            positivePoints = getPoints(positiveCasesTable, "", FeaturesColumns, numberOfElementsToTake);
        double[][] negativePoints = null;
        if ((negativeCasesTable != null) && (negativeCasesTable.length() > 0))
            negativePoints = getPoints(negativeCasesTable, "", FeaturesColumns, numberOfElementsToTake);
        double[][] habitatPoints = Transformations.mergeMatrixes(positivePoints, negativePoints);
        logger.trace("HRS: HABITAT POINTS BUILT FROM POSITIVE AND NEGATIVE POINTS");
        innerstatus = 60f;
        // 7 - standardize the points with respect to the previous means and variances
        habitatPoints = operations.standardize(habitatPoints, operations.means, operations.variances);
        logger.trace("HRS: HABITAT POINTS HAVE BEEN STANDARDIZED WITH RESPECT TO PREVIOUS MEANS AND VARIANCES");
        // 8 - calculate the PCA components for the habitat points
        double[][] habitatPcaComponents = pca.getComponentsMatrix(habitatPoints);
        logger.trace("HRS: HABITAT POINTS HAVE BEEN TRANSFORMED BY PCA");
        innerstatus = 70f;
        // 9 - calculate the frequency distributions for each component, with respect to the previous intervals
        int components = habitatPcaComponents[0].length;
        // 10 - calculate absolute differences and sum them -> obtain an HRS for each PCA component, i.e. for each feature
        currentHRSVector = new double[components];
        double[][] habitatPcaPointsMatrix = Transformations.traspose(habitatPcaComponents);
        for (int i = 0; i < components; i++) {
            double[] habitatPcaPoints = habitatPcaPointsMatrix[i];
            // calculate the frequency distribution with respect to the previous intervals
            double[] habitatPcafrequencies = Operations.calcFrequencies(intervals.get(i), habitatPcaPoints);
            habitatPcafrequencies = Operations.normalizeFrequencies(habitatPcafrequencies, habitatPcaPoints.length);
            double[] absdifference = Operations.vectorialAbsoluteDifference(habitatPcafrequencies, frequencyDistrib.get(i));
            currentHRSVector[i] = Operations.sumVector(absdifference);
        }
        logger.trace("HRS: HRS VECTOR HAS BEEN CALCULATED");
        innerstatus = 90f;
        // 11 - obtain the HRS score as a weighted sum of the per-component HRS with respect to the inverse eigenvalues - too variable, so it was replaced with the plain sum of the scores
        // currentHRSScore = Operations.scalarProduct(currentHRSVector, pca.getInverseNormalizedEigenvalues());
        currentHRSScore = Operations.sumVector(currentHRSVector);
        logger.trace("HRS: HRS SCORE HAS BEEN CALCULATED");
        innerstatus = 100f;
    }
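    // In formulas, the loop above computes, for each PCA component i with the bin set B(i)
    // fixed on the area sample:
    //   HRS_i = sum over b in B(i) of | F_habitat,i(b) - F_area,i(b) |
    // where F_area,i and F_habitat,i are the normalized bin frequencies of the area and
    // habitat points, and the overall score is HRS = sum over i of HRS_i. A component on
    // which the habitat sample mirrors the area distribution therefore contributes ~0.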
    private double meanHRS;
    private double[] meanHRSVector;
    private double currentHRSScore;
    private double[] currentHRSVector;
    public LinkedHashMap<String, String> analyze() throws Exception {
        try {
            status = 0;
            String projectingAreaTable = config.getParam("ProjectingAreaTable");
            String projectingAreaFeaturesOptionalCondition = config.getParam("OptionalCondition");
            String FeaturesColumns = config.getParam("FeaturesColumns").replace(AlgorithmConfiguration.getListSeparator(), ",");
            String positiveCasesTable = config.getParam("PositiveCasesTable");
            String negativeCasesTable = config.getParam("NegativeCasesTable");
            connection = AlgorithmConfiguration.getConnectionFromConfig(config);
            meanHRS = 0;
            int numberOfElements = calculateNumberOfPoints(projectingAreaTable, projectingAreaFeaturesOptionalCondition);
            for (int i = 0; i < maxTests; i++) {
                currentIterationStep = i;
                logger.trace("ITERATION NUMBER " + (i + 1));
                calcHRS(projectingAreaTable, projectingAreaFeaturesOptionalCondition, FeaturesColumns, positiveCasesTable, negativeCasesTable, numberOfElements);
                meanHRS = MathFunctions.incrementAvg(meanHRS, currentHRSScore, i);
                if (meanHRSVector == null)
                    meanHRSVector = new double[currentHRSVector.length];
                for (int j = 0; j < currentHRSVector.length; j++) {
                    meanHRSVector[j] = org.gcube.contentmanagement.graphtools.utils.MathFunctions.roundDecimal(MathFunctions.incrementAvg(meanHRSVector[j], currentHRSVector[j], i), 2);
                }
                logger.trace("ITERATION FINISHED " + meanHRS);
                status = Math.min(status + 100f / maxTests, 99f);
            }
            output = new LinkedHashMap<String, String>();
            output.put("HRS_VECTOR", "" + Transformations.vector2String(meanHRSVector));
            output.put("HRS", "" + org.gcube.contentmanagement.graphtools.utils.MathFunctions.roundDecimal(meanHRS, 2));
            return output;
        } catch (Exception e) {
            logger.error("ALERT: AN ERROR OCCURRED DURING HRS CALCULATION: " + e.getLocalizedMessage(), e);
            throw e;
        } finally {
            status = 100;
            logger.trace("COMPUTATION FINISHED");
        }
    }
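    // The loop above maintains running means over the maxTests repetitions; assuming
    // MathFunctions.incrementAvg implements the standard incremental mean, after the
    // 0-based iteration i it holds:
    //   avg_{i+1} = (avg_i * i + x_i) / (i + 1)
    // so repeating the random sampling smooths out the variance of any single draw.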
    List<double[]> frequencyDistrib;
    List<double[]> intervals; // uniform subdivisions of the numeric ranges

    // calculate a frequency distribution for each PCA component, storing both the bin
    // boundaries (intervals) and the normalized frequencies (frequencyDistrib)
    public void calcFrequenciesDistributionsForComponents(double[][] pcaComponents) {
        frequencyDistrib = null;
        if (pcaComponents.length > 0) {
            int sizeDistrib = pcaComponents[0].length;
            frequencyDistrib = new ArrayList<double[]>();
            intervals = new ArrayList<double[]>();
            double[][] pcaColumns = Transformations.traspose(pcaComponents);
            for (int i = 0; i < sizeDistrib; i++) {
                double[] pcaPoints = pcaColumns[i];
                double[] interval = Operations.uniformDivide(Operations.getMax(pcaPoints), Operations.getMin(pcaPoints), pcaPoints);
                double[] frequencies = Operations.calcFrequencies(interval, pcaPoints);
                frequencies = Operations.normalizeFrequencies(frequencies, pcaPoints.length);
                intervals.add(interval);
                frequencyDistrib.add(frequencies);
            }
        }
    }
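    // Illustrative example (values are made up): if component i ranges over [-2.0, 2.0]
    // and uniformDivide yields the bin boundaries {-1.0, 0.0, 1.0, 2.0}, then 40 of 100
    // points falling in the first bin produce a normalized frequency of 0.4 for that bin.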
    public static void visualizeResults(HashMap<String, Object> results) {
        for (String key : results.keySet()) {
            System.out.println(key + ":" + results.get(key));
        }
    }
    @Override
    public StatisticalType getOutput() {
        PrimitiveType p = new PrimitiveType(Map.class.getName(), PrimitiveType.stringMap2StatisticalMap(output), PrimitiveTypes.MAP, "AnalysisResult", "Habitat Representativeness Score");
        return p;
    }
    @Override
    public float getStatus() {
        // overall progress combines the per-iteration progress (innerstatus) with the
        // number of completed iterations, and is capped at 99% until analyze() finishes
        return status == 100f ? status : Math.min(status + (float) (currentIterationStep + 1) * innerstatus / (float) maxTests, 99f);
    }
    @Override
    public String getDescription() {
        return "An evaluator algorithm that calculates the Habitat Representativeness Score, i.e. an indicator assessing whether a survey coverage, or another dataset of environmental features, contains data that are representative of all the habitat variable combinations available in an area.";
    }
}
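
// A minimal usage sketch (hypothetical: it assumes AlgorithmConfiguration exposes a
// setParam counterpart to the getParam calls used above, and a setter for the inherited
// config field; the table and column names are illustrative):
//
//   AlgorithmConfiguration config = new AlgorithmConfiguration();
//   config.setParam("ProjectingAreaTable", "hcaf_d");
//   config.setParam("OptionalCondition", "where oceanarea>0");
//   config.setParam("PositiveCasesTable", "presence_points");
//   config.setParam("NegativeCasesTable", "absence_points");
//   config.setParam("FeaturesColumns", "depthmean" + AlgorithmConfiguration.getListSeparator() + "sstanmean");
//
//   HabitatRepresentativeness hrs = new HabitatRepresentativeness();
//   hrs.setConfiguration(config); // hypothetical setter
//   LinkedHashMap<String, String> out = hrs.analyze();
//   System.out.println(out.get("HRS"));        // mean score over the iterations
//   System.out.println(out.get("HRS_VECTOR")); // mean per-component scores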