package org.gcube.dataanalysis.ecoengine.clustering;
|
|
|
|
import java.lang.reflect.Field;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
import org.gcube.dataanalysis.ecoengine.configuration.AlgorithmConfiguration;
|
|
import org.gcube.dataanalysis.ecoengine.datatypes.ColumnTypesList;
|
|
import org.gcube.dataanalysis.ecoengine.datatypes.DatabaseType;
|
|
import org.gcube.dataanalysis.ecoengine.datatypes.InputTable;
|
|
import org.gcube.dataanalysis.ecoengine.datatypes.PrimitiveType;
|
|
import org.gcube.dataanalysis.ecoengine.datatypes.ServiceType;
|
|
import org.gcube.dataanalysis.ecoengine.datatypes.StatisticalType;
|
|
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.PrimitiveTypes;
|
|
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.ServiceParameters;
|
|
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.TableTemplates;
|
|
import org.gcube.dataanalysis.ecoengine.utils.DatabaseFactory;
|
|
import org.gcube.dataanalysis.ecoengine.utils.DatabaseUtils;
|
|
import org.gcube.dataanalysis.ecoengine.utils.DynamicEnum;
|
|
import org.gcube.dataanalysis.ecoengine.utils.ResourceFactory;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import com.rapidminer.example.Attribute;
|
|
import com.rapidminer.example.set.SimpleExampleSet;
|
|
import com.rapidminer.example.table.DataRow;
|
|
import com.rapidminer.example.table.MemoryExampleTable;
|
|
import com.rapidminer.operator.IOContainer;
|
|
import com.rapidminer.operator.IOObject;
|
|
import com.rapidminer.tools.OperatorService;
|
|
|
|
public class LOF extends DBScan {
|
|
|
|
private static Logger logger = LoggerFactory.getLogger(LOF.class);
|
|
|
|
String minimal_points_lower_bound = "1";
|
|
String minimal_points_upper_bound = "10";
|
|
String lof_threshold = "2";
|
|
String distance_function = "euclidian distance";
|
|
static String lofcolumn = "lof";
|
|
static String lofcolumntype = "real";
|
|
|
|
LOFenum enuFunctions = new LOFenum();
|
|
|
|
enum LOFenumType {
|
|
}
|
|
|
|
class LOFenum extends DynamicEnum {
|
|
public Field[] getFields() {
|
|
Field[] fields = LOFenumType.class.getDeclaredFields();
|
|
return fields;
|
|
}
|
|
}
|
|
|
|
@Override
|
|
public void init() throws Exception {
|
|
status = 0;
|
|
if ((config!=null) && (initrapidminer))
|
|
config.initRapidMiner();
|
|
logger.debug("Initialized Rapid Miner ");
|
|
logger.debug("Initializing Database Connection");
|
|
dbHibConnection=DatabaseUtils.initDBSession(config);
|
|
//create the final table
|
|
try{
|
|
logger.debug("dropping table "+OccurrencePointsClusterTable);
|
|
String dropStatement = DatabaseUtils.dropTableStatement(OccurrencePointsClusterTable);
|
|
logger.debug("dropping table "+dropStatement);
|
|
DatabaseFactory.executeSQLUpdate(dropStatement, dbHibConnection);
|
|
}catch(Exception e){
|
|
logger.debug("Could not drop table "+OccurrencePointsClusterTable);
|
|
}
|
|
//create Table
|
|
logger.debug("Creating table "+OccurrencePointsClusterTable);
|
|
String [] features = FeaturesColumnNames.split(AlgorithmConfiguration.getListSeparator());
|
|
String columns = "";
|
|
|
|
for (int i=0;i<features.length;i++){
|
|
columns +=features[i]+" real";
|
|
if (i<features.length-1)
|
|
columns+=",";
|
|
}
|
|
|
|
String createStatement = "create table "+OccurrencePointsClusterTable+" ( "+columns+")";
|
|
// String createStatement = new DatabaseUtils(dbHibConnection).buildCreateStatement(OccurrencePointsTable,OccurrencePointsClusterTable);
|
|
logger.debug("Statement: "+createStatement);
|
|
DatabaseFactory.executeSQLUpdate(createStatement, dbHibConnection);
|
|
//add two columns one for cluster and another for outliers
|
|
logger.debug("Adding Columns");
|
|
DatabaseFactory.executeSQLUpdate(DatabaseUtils.addColumnStatement(OccurrencePointsClusterTable, lofcolumn, lofcolumntype), dbHibConnection);
|
|
DatabaseFactory.executeSQLUpdate(DatabaseUtils.addColumnStatement(OccurrencePointsClusterTable, outliersColumn, outliersColumnType), dbHibConnection);
|
|
logger.debug("Getting Samples");
|
|
//build samples
|
|
getSamples();
|
|
status = 10f;
|
|
}
|
|
|
|
@Override
|
|
public void setConfiguration(AlgorithmConfiguration config) {
|
|
if (config != null) {
|
|
minimal_points_lower_bound = config.getParam("minimal_points_lower_bound");
|
|
minimal_points_upper_bound = config.getParam("minimal_points_upper_bound");
|
|
distance_function = config.getParam("distance_function");
|
|
lof_threshold = config.getParam("lof_threshold");
|
|
|
|
OccurrencePointsTable = config.getParam("PointsTable").toLowerCase();
|
|
OccurrencePointsClusterLabel = config.getParam("PointsClusterLabel");
|
|
OccurrencePointsClusterTable = config.getParam("PointsClusterTable").toLowerCase();
|
|
FeaturesColumnNames = config.getParam("FeaturesColumnNames");
|
|
|
|
this.config = config;
|
|
}
|
|
|
|
}
|
|
|
|
@Override
|
|
public void compute() throws Exception {
|
|
try {
|
|
|
|
if ((config == null) || minimal_points_lower_bound == null || minimal_points_upper_bound == null || distance_function == null) {
|
|
throw new Exception("LOF: Error incomplete parameters");
|
|
}
|
|
status = 10f;
|
|
logger.debug("LOF: Settin up the cluster");
|
|
// take elements and produce example set
|
|
com.rapidminer.operator.preprocessing.outlier.LOFOutlierOperator clusterer = (com.rapidminer.operator.preprocessing.outlier.LOFOutlierOperator) OperatorService.createOperator("LOFOutlierDetection");
|
|
clusterer.setParameter("minimal_points_lower_bound", minimal_points_lower_bound);
|
|
clusterer.setParameter("minimal_points_upper_bound", minimal_points_upper_bound);
|
|
clusterer.setParameter("distance_function", distance_function);
|
|
|
|
IOContainer innerInput = new IOContainer(points);
|
|
|
|
logger.debug("LOF: Clustering...");
|
|
long ti = System.currentTimeMillis();
|
|
IOContainer output = clusterer.apply(innerInput);
|
|
logger.debug("LOF: ...ELAPSED CLUSTERING TIME: " + (System.currentTimeMillis() - ti));
|
|
logger.debug("LOF: ...Clustering Finished");
|
|
status = 70f;
|
|
|
|
IOObject[] outputvector = output.getIOObjects();
|
|
|
|
BuildClusterTable(outputvector);
|
|
} catch (Exception e) {
|
|
throw e;
|
|
} finally {
|
|
shutdown();
|
|
status = 100f;
|
|
}
|
|
}
|
|
|
|
@Override
|
|
protected void BuildClusterTable(IOObject[] outputvector) throws Exception {
|
|
|
|
StringBuffer bufferRows = new StringBuffer();
|
|
SimpleExampleSet output = (SimpleExampleSet) outputvector[0];
|
|
MemoryExampleTable met = (MemoryExampleTable) output.getExampleTable();
|
|
int numofcolumns = met.getAttributeCount();
|
|
int numofrows = met.size();
|
|
double lofthr = 2;
|
|
if (lof_threshold!=null)
|
|
try{lofthr = Double.parseDouble(lof_threshold);}catch(Exception e){};
|
|
logger.debug("LOF: using lof threshold :"+lofthr);
|
|
|
|
for (int i = 0; i < numofrows; i++) {
|
|
|
|
DataRow dr = met.getDataRow(i);
|
|
Attribute outlierAtt = met.getAttribute(numofcolumns - 1);
|
|
bufferRows.append("(");
|
|
|
|
for (int j=0;j<numofcolumns-2;j++){
|
|
Attribute att = met.getAttribute(j);
|
|
bufferRows.append(dr.get(att)+",");
|
|
}
|
|
double lofscore = dr.get(outlierAtt);
|
|
if (lofscore>Double.MAX_VALUE)
|
|
lofscore = Float.MAX_VALUE;
|
|
boolean outlier = (lofscore>=lofthr);
|
|
|
|
bufferRows.append(lofscore+","+outlier+")");
|
|
|
|
if (i<numofrows-1)
|
|
bufferRows.append(",");
|
|
}
|
|
|
|
logger.debug("LOF: Finished in retrieving and building output to write");
|
|
|
|
|
|
String columnsNames = FeaturesColumnNames + ","+lofcolumn+","+outliersColumn;
|
|
// System.out.println(DatabaseUtils.insertFromBuffer(OccurrencePointsClusterTable, columnsNames, bufferRows));
|
|
|
|
if (bufferRows.length() > 0) {
|
|
|
|
logger.debug("Writing into DB");
|
|
// logger.debug("Query to execute: "+DatabaseUtils.insertFromBuffer(OccurrencePointsClusterTable, columnsNames, bufferRows));
|
|
DatabaseFactory.executeSQLUpdate(DatabaseUtils.insertFromBuffer(OccurrencePointsClusterTable, columnsNames, bufferRows), dbHibConnection);
|
|
logger.debug("Finished with writing into DB");
|
|
} else
|
|
logger.debug("Nothing to write in the buffer");
|
|
|
|
status = 100;
|
|
logger.debug("Status: " + status);
|
|
|
|
}
|
|
|
|
@Override
|
|
public List<StatisticalType> getInputParameters() {
|
|
List<StatisticalType> parameters = new ArrayList<StatisticalType>();
|
|
List<TableTemplates> templateOccs = new ArrayList<TableTemplates>();
|
|
templateOccs.add(TableTemplates.GENERIC);
|
|
|
|
InputTable p1 = new InputTable(templateOccs, "PointsTable", "Table containing points or observations. Max 4000 points", "pointstable");
|
|
ColumnTypesList p2 = new ColumnTypesList("PointsTable", "FeaturesColumnNames", "column Names for the features", false);
|
|
|
|
PrimitiveType p0 = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, "PointsClusterLabel", "table name of the resulting distribution", "Cluster_");
|
|
ServiceType p3 = new ServiceType(ServiceParameters.RANDOMSTRING, "PointsClusterTable", "table name of the distribution", "occcluster_");
|
|
|
|
PrimitiveType p4 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "minimal_points_lower_bound", "locality (usually called k): minimal number of nearest neighbors", "2");
|
|
PrimitiveType p5 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "minimal_points_upper_bound", "maximum number of nearest neighbors to take into account for outliers evaluation", "10");
|
|
|
|
if (LOFenumType.values().length<2) {
|
|
enuFunctions.addEnum(LOFenumType.class, "euclidian distance");
|
|
enuFunctions.addEnum(LOFenumType.class, "squared distance");
|
|
enuFunctions.addEnum(LOFenumType.class, "cosine distance");
|
|
enuFunctions.addEnum(LOFenumType.class, "inverted cosine distance");
|
|
enuFunctions.addEnum(LOFenumType.class, "angle");
|
|
}
|
|
|
|
PrimitiveType p6 = new PrimitiveType(Enum.class.getName(), LOFenumType.values(), PrimitiveTypes.ENUMERATED, "distance_function", "the distance function to use in the calculation", "euclidian distance");
|
|
PrimitiveType p7 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "lof_threshold", "the LOF score threshold over which the point is an outlier (usually 2)", "2");
|
|
|
|
parameters.add(p1);
|
|
parameters.add(p2);
|
|
parameters.add(p0);
|
|
parameters.add(p3);
|
|
parameters.add(p4);
|
|
parameters.add(p5);
|
|
parameters.add(p6);
|
|
parameters.add(p7);
|
|
|
|
DatabaseType.addDefaultDBPars(parameters);
|
|
return parameters;
|
|
}
|
|
|
|
@Override
|
|
public String getDescription() {
|
|
return "Local Outlier Factor (LOF). A clustering algorithm for real valued vectors that relies on Local Outlier Factor algorithm, i.e. an algorithm for finding anomalous data points by measuring the local deviation of a given data point with respect to its neighbours. A Maximum of 4000 points is allowed.";
|
|
}
|
|
|
|
ResourceFactory resourceManager;
|
|
|
|
public String getResourceLoad() {
|
|
if (resourceManager == null)
|
|
resourceManager = new ResourceFactory();
|
|
return resourceManager.getResourceLoad(1);
|
|
}
|
|
|
|
@Override
|
|
public String getResources() {
|
|
return ResourceFactory.getResources(100f);
|
|
}
|
|
|
|
|
|
public static void main(String[] args) throws Exception {
|
|
long t0 = System.currentTimeMillis();
|
|
|
|
AlgorithmConfiguration config = new AlgorithmConfiguration();
|
|
config.setConfigPath("./cfg/");
|
|
config.setPersistencePath("./");
|
|
// config.setParam("PointsTable", "presence_basking_cluster");
|
|
// config.setParam("FeaturesColumnNames", "centerlat" + AlgorithmConfiguration.getListSeparator() + "centerlong");
|
|
config.setParam("PointsTable", "hcaf_d");
|
|
// config.setParam("FeaturesColumnNames", "depthmin" + AlgorithmConfiguration.getListSeparator() + "depthmax");
|
|
config.setParam("FeaturesColumnNames", "depthmin" + AlgorithmConfiguration.getListSeparator() + "depthmax"+ AlgorithmConfiguration.getListSeparator()+"depthmean"+
|
|
AlgorithmConfiguration.getListSeparator()+"sstanmean"+
|
|
AlgorithmConfiguration.getListSeparator()+"sstmnmax"+
|
|
AlgorithmConfiguration.getListSeparator()+"sstmnmin"+
|
|
AlgorithmConfiguration.getListSeparator()+"sbtanmean"+
|
|
AlgorithmConfiguration.getListSeparator()+"salinitymean"+
|
|
AlgorithmConfiguration.getListSeparator()+"salinitymax");
|
|
// config.setParam("FeaturesColumnNames", "depthmin");
|
|
config.setParam("PointsClusterTable", "occCluster_lof");
|
|
|
|
config.setParam("minimal_points_lower_bound", "1");
|
|
config.setParam("minimal_points_upper_bound", "100");
|
|
config.setParam("distance_function", "euclidean distance");
|
|
|
|
config.setParam("DatabaseUserName", "gcube");
|
|
config.setParam("DatabasePassword", "d4science2");
|
|
config.setParam("DatabaseURL", "jdbc:postgresql://146.48.87.169/testdb");
|
|
config.setParam("DatabaseDriver", "org.postgresql.Driver");
|
|
|
|
LOF cluster = new LOF();
|
|
cluster.setConfiguration(config);
|
|
cluster.init();
|
|
cluster.compute();
|
|
|
|
System.out.println("ELAPSED " + (System.currentTimeMillis() - t0));
|
|
|
|
}
|
|
|
|
}
|