Utils for CSV parsing and point-count limits for Clusterers

git-svn-id: https://svn.d4science.research-infrastructures.eu/gcube/trunk/data-analysis/EcologicalEngine@82321 82a268e6-3cf1-43bd-a215-b396298e98cf
Gianpaolo Coro 2013-10-02 17:35:52 +00:00
parent 2438abbd29
commit f1dbb48522
7 changed files with 377 additions and 12 deletions

View File

@@ -224,14 +224,34 @@ public class DBScan implements Clusterer{
t00=System.currentTimeMillis();
// System.out.println("->"+DatabaseUtils.getColumnsElementsStatement(OccurrencePointsTable, FeaturesColumnNames, ""));
FeaturesColumnNames=FeaturesColumnNames.replace(AlgorithmConfiguration.listSeparator, ",");
List<Object> samples = DatabaseFactory.executeSQLQuery(DatabaseUtils.getColumnsElementsStatement(OccurrencePointsTable, FeaturesColumnNames, ""), dbHibConnection);
String [] elements = FeaturesColumnNames.split(",");
// int limit = (int)Math.pow(5000,1d/(double)elements.length);
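// Cap the number of points fetched for clustering. The commented-out
// formulas here and below are earlier heuristics that scaled the cap with
// the number of feature dimensions; the flat cap N is what is actually used.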
int N=4000;
double k = elements.length;
double t=82327;
double logG = Math.log(t)-N;
int limit = N;
// if (k>1)
// limit = (int)Math.round(( Math.log(t)-k*logG )/k );
// limit = (int)Math.round((double)N/k);
// limit = (int)(11d*Math.pow(N,2d/(k+1)));
// limit =(int) ((double)N/(1.3d));
AnalysisLogger.getLogger().debug("Clustering limit: "+limit);
List<Object> samples = DatabaseFactory.executeSQLQuery(DatabaseUtils.getColumnsElementsStatement(OccurrencePointsTable, FeaturesColumnNames, "limit "+limit), dbHibConnection);
int dimensions = elements.length;
int nSamples = samples.size();
samplesVector = new double[nSamples][dimensions];
int ir=0;
for (Object row:samples){
Object[] rowArr = (Object[]) row;
Object[] rowArr = new Object[1];
try{rowArr = (Object[]) row;}
catch(ClassCastException e){
rowArr[0] = ""+row;
}
int ic=0;
for (Object elem:rowArr){
Double feature = null;
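The try/catch above guards against single-column projections: Hibernate typically returns a bare scalar per row for one-column queries and an Object[] otherwise. A minimal standalone sketch of the same normalization (hypothetical helper, not part of this commit):

static Object[] toRow(Object row) {
    if (row instanceof Object[])
        return (Object[]) row;         // multi-column result
    return new Object[] { "" + row };  // single column: wrap the scalar as text
}

An instanceof check like this also avoids throwing and catching a ClassCastException on every single-column row.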
@@ -398,7 +418,7 @@ public class DBScan implements Clusterer{
List<StatisticalType> parameters = new ArrayList<StatisticalType>();
List<TableTemplates> templateOccs = new ArrayList<TableTemplates>();
templateOccs.add(TableTemplates.GENERIC);
InputTable p1 = new InputTable(templateOccs,"OccurrencePointsTable","Occurrence Points Table","occurrences");
InputTable p1 = new InputTable(templateOccs,"OccurrencePointsTable","Occurrence Points Table. Max 4000 points","occurrences");
ColumnTypesList p2 = new ColumnTypesList ("OccurrencePointsTable","FeaturesColumnNames", "column names for the features", false);
PrimitiveType p0 = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, "OccurrencePointsClusterLabel","table name of the resulting distribution","OccCluster_");
ServiceType p3 = new ServiceType(ServiceParameters.RANDOMSTRING, "OccurrencePointsClusterTable","table name of the distribution","occCluster_");
@@ -419,7 +439,7 @@ public class DBScan implements Clusterer{
@Override
public String getDescription() {
return "Clustering with DBScan";
return "Clustering with DBScan. A maximum of 4000 points is allowed.";
}

View File

@@ -118,7 +118,7 @@ public class KMeans extends DBScan{
List<StatisticalType> parameters = new ArrayList<StatisticalType>();
List<TableTemplates> templateOccs = new ArrayList<TableTemplates>();
templateOccs.add(TableTemplates.GENERIC);
InputTable p1 = new InputTable(templateOccs,"OccurrencePointsTable","Occurrence Points Table","occurrences");
InputTable p1 = new InputTable(templateOccs,"OccurrencePointsTable","Occurrence Points Table. Max 4000 points","occurrences");
ColumnTypesList p2 = new ColumnTypesList ("OccurrencePointsTable","FeaturesColumnNames", "column names for the features", false);
PrimitiveType p0 = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, "OccurrencePointsClusterLabel","table name of the resulting distribution","OccCluster_");
ServiceType p3 = new ServiceType(ServiceParameters.RANDOMSTRING, "OccurrencePointsClusterTable","table name of the distribution","occCluster_");
@@ -142,7 +142,7 @@ public class KMeans extends DBScan{
@Override
public String getDescription() {
return "Clustering with KMeans";
return "Clustering with KMeans. A Maximum of 4000 points is allowed.";
}

View File

@@ -0,0 +1,307 @@
package org.gcube.dataanalysis.ecoengine.clustering;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.List;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
import org.gcube.dataanalysis.ecoengine.configuration.AlgorithmConfiguration;
import org.gcube.dataanalysis.ecoengine.datatypes.ColumnTypesList;
import org.gcube.dataanalysis.ecoengine.datatypes.DatabaseType;
import org.gcube.dataanalysis.ecoengine.datatypes.InputTable;
import org.gcube.dataanalysis.ecoengine.datatypes.PrimitiveType;
import org.gcube.dataanalysis.ecoengine.datatypes.ServiceType;
import org.gcube.dataanalysis.ecoengine.datatypes.StatisticalType;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.PrimitiveTypes;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.ServiceParameters;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.TableTemplates;
import org.gcube.dataanalysis.ecoengine.utils.DatabaseFactory;
import org.gcube.dataanalysis.ecoengine.utils.DatabaseUtils;
import org.gcube.dataanalysis.ecoengine.utils.DynamicEnum;
import org.gcube.dataanalysis.ecoengine.utils.ResourceFactory;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.set.SimpleExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DataRow;
import com.rapidminer.example.table.DoubleArrayDataRow;
import com.rapidminer.example.table.ExampleTable;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.operator.IOContainer;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.clustering.Cluster;
import com.rapidminer.operator.clustering.ClusterModel;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.OperatorService;
public class LOF extends DBScan {
String minimal_points_lower_bound = "1";
String minimal_points_upper_bound = "10";
String lof_threshold = "2";
String distance_function = "euclidian distance";
static String lofcolumn = "lof";
static String lofcolumntype = "real";
LOFenum enuFunctions = new LOFenum();
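// deliberately empty: the distance-function constants are injected at
// runtime through DynamicEnum (see getInputParameters below)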
enum LOFenumType {
}
class LOFenum extends DynamicEnum {
public Field[] getFields() {
Field[] fields = LOFenumType.class.getDeclaredFields();
return fields;
}
}
@Override
public void init() throws Exception {
status = 0;
if ((config!=null) && (initrapidminer))
config.initRapidMiner();
AnalysisLogger.getLogger().debug("Initialized Rapid Miner ");
AnalysisLogger.getLogger().debug("Initializing Database Connection");
dbHibConnection=DatabaseUtils.initDBSession(config);
//create the final table
try{
AnalysisLogger.getLogger().debug("dropping table "+OccurrencePointsClusterTable);
String dropStatement = DatabaseUtils.dropTableStatement(OccurrencePointsClusterTable);
AnalysisLogger.getLogger().debug("dropping table "+dropStatement);
DatabaseFactory.executeSQLUpdate(dropStatement, dbHibConnection);
}catch(Exception e){
AnalysisLogger.getLogger().debug("Could not drop table "+OccurrencePointsClusterTable);
}
//create Table
AnalysisLogger.getLogger().debug("Creating table "+OccurrencePointsClusterTable);
String [] features = FeaturesColumnNames.split(AlgorithmConfiguration.getListSeparator());
String columns = "";
for (int i=0;i<features.length;i++){
columns +=features[i]+" real";
if (i<features.length-1)
columns+=",";
}
String createStatement = "create table "+OccurrencePointsClusterTable+" ( "+columns+")";
// String createStatement = new DatabaseUtils(dbHibConnection).buildCreateStatement(OccurrencePointsTable,OccurrencePointsClusterTable);
AnalysisLogger.getLogger().debug("Statement: "+createStatement);
DatabaseFactory.executeSQLUpdate(createStatement, dbHibConnection);
//add two columns: one for the lof score and one for the outlier flag
AnalysisLogger.getLogger().debug("Adding Columns");
DatabaseFactory.executeSQLUpdate(DatabaseUtils.addColumnStatement(OccurrencePointsClusterTable, lofcolumn, lofcolumntype), dbHibConnection);
DatabaseFactory.executeSQLUpdate(DatabaseUtils.addColumnStatement(OccurrencePointsClusterTable, outliersColumn, outliersColumnType), dbHibConnection);
AnalysisLogger.getLogger().debug("Getting Samples");
//build samples
getSamples();
status = 10f;
}
@Override
public void setConfiguration(AlgorithmConfiguration config) {
if (config != null) {
minimal_points_lower_bound = config.getParam("minimal_points_lower_bound");
minimal_points_upper_bound = config.getParam("minimal_points_upper_bound");
distance_function = config.getParam("distance_function");
lof_threshold = config.getParam("lof_threshold");
OccurrencePointsTable = config.getParam("PointsTable").toLowerCase();
OccurrencePointsClusterLabel = config.getParam("PointsClusterLabel");
OccurrencePointsClusterTable = config.getParam("PointsClusterTable").toLowerCase();
FeaturesColumnNames = config.getParam("FeaturesColumnNames");
this.config = config;
}
}
@Override
public void compute() throws Exception {
try {
if ((config == null) || minimal_points_lower_bound == null || minimal_points_upper_bound == null || distance_function == null) {
throw new Exception("LOF: Error: incomplete parameters");
}
status = 10f;
AnalysisLogger.getLogger().debug("LOF: Settin up the cluster");
// take elements and produce example set
com.rapidminer.operator.preprocessing.outlier.LOFOutlierOperator clusterer = (com.rapidminer.operator.preprocessing.outlier.LOFOutlierOperator) OperatorService.createOperator("LOFOutlierDetection");
clusterer.setParameter("minimal_points_lower_bound", minimal_points_lower_bound);
clusterer.setParameter("minimal_points_upper_bound", minimal_points_upper_bound);
clusterer.setParameter("distance_function", distance_function);
IOContainer innerInput = new IOContainer(points);
AnalysisLogger.getLogger().debug("LOF: Clustering...");
long ti = System.currentTimeMillis();
IOContainer output = clusterer.apply(innerInput);
AnalysisLogger.getLogger().debug("LOF: ...ELAPSED CLUSTERING TIME: " + (System.currentTimeMillis() - ti));
AnalysisLogger.getLogger().debug("LOF: ...Clustering Finished");
status = 70f;
IOObject[] outputvector = output.getIOObjects();
BuildClusterTable(outputvector);
} catch (Exception e) {
throw e;
} finally {
shutdown();
status = 100f;
}
}
@Override
protected void BuildClusterTable(IOObject[] outputvector) throws Exception {
StringBuffer bufferRows = new StringBuffer();
SimpleExampleSet output = (SimpleExampleSet) outputvector[0];
MemoryExampleTable met = (MemoryExampleTable) output.getExampleTable();
int numofcolumns = met.getAttributeCount();
int numofrows = met.size();
double lofthr = 2;
if (lof_threshold != null)
try { lofthr = Double.parseDouble(lof_threshold); } catch (Exception e) {}
AnalysisLogger.getLogger().debug("LOF: using LOF threshold: " + lofthr);
for (int i = 0; i < numofrows; i++) {
DataRow dr = met.getDataRow(i);
Attribute outlierAtt = met.getAttribute(numofcolumns - 1);
bufferRows.append("(");
for (int j=0;j<numofcolumns-2;j++){
Attribute att = met.getAttribute(j);
bufferRows.append(dr.get(att)+",");
}
double lofscore = dr.get(outlierAtt);
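// LOF can come back infinite (e.g. when duplicated points make the
// k-distance zero); clamp it so the value fits the table's real column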
if (lofscore>Double.MAX_VALUE)
lofscore = Float.MAX_VALUE;
boolean outlier = (lofscore>=lofthr);
bufferRows.append(lofscore+","+outlier+")");
if (i<numofrows-1)
bufferRows.append(",");
}
AnalysisLogger.getLogger().debug("LOF: Finished in retrieving and building output to write");
String columnsNames = FeaturesColumnNames + ","+lofcolumn+","+outliersColumn;
// System.out.println(DatabaseUtils.insertFromBuffer(OccurrencePointsClusterTable, columnsNames, bufferRows));
if (bufferRows.length() > 0) {
AnalysisLogger.getLogger().debug("Writing into DB");
// AnalysisLogger.getLogger().debug("Query to execute: "+DatabaseUtils.insertFromBuffer(OccurrencePointsClusterTable, columnsNames, bufferRows));
DatabaseFactory.executeSQLUpdate(DatabaseUtils.insertFromBuffer(OccurrencePointsClusterTable, columnsNames, bufferRows), dbHibConnection);
AnalysisLogger.getLogger().debug("Finished with writing into DB");
} else
AnalysisLogger.getLogger().debug("Nothing to write in the buffer");
status = 100;
AnalysisLogger.getLogger().debug("Status: " + status);
}
@Override
public List<StatisticalType> getInputParameters() {
List<StatisticalType> parameters = new ArrayList<StatisticalType>();
List<TableTemplates> templateOccs = new ArrayList<TableTemplates>();
templateOccs.add(TableTemplates.GENERIC);
InputTable p1 = new InputTable(templateOccs, "PointsTable", "Table containing points or observations. Max 4000 points", "pointstable");
ColumnTypesList p2 = new ColumnTypesList("PointsTable", "FeaturesColumnNames", "column Names for the features", false);
PrimitiveType p0 = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, "PointsClusterLabel", "table name of the resulting distribution", "Cluster_");
ServiceType p3 = new ServiceType(ServiceParameters.RANDOMSTRING, "PointsClusterTable", "table name of the distribution", "occcluster_");
PrimitiveType p4 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "minimal_points_lower_bound", "locality (usually called k): minimal number of nearest neighbors", "2");
PrimitiveType p5 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "minimal_points_upper_bound", "maximum number of nearest neighbors to take into account for outliers evaluation", "10");
if (LOFenumType.values().length<2) {
enuFunctions.addEnum(LOFenumType.class, "euclidian distance");
enuFunctions.addEnum(LOFenumType.class, "squared distance");
enuFunctions.addEnum(LOFenumType.class, "cosine distance");
enuFunctions.addEnum(LOFenumType.class, "inverted cosine distance");
enuFunctions.addEnum(LOFenumType.class, "angle");
}
PrimitiveType p6 = new PrimitiveType(Enum.class.getName(), LOFenumType.values(), PrimitiveTypes.ENUMERATED, "distance_function", "the distance function to use in the calculation", "euclidian distance");
PrimitiveType p7 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "lof_threshold", "the LOF score threshold above which a point is considered an outlier (usually 2)", "2");
parameters.add(p1);
parameters.add(p2);
parameters.add(p0);
parameters.add(p3);
parameters.add(p4);
parameters.add(p5);
parameters.add(p6);
parameters.add(p7);
DatabaseType.addDefaultDBPars(parameters);
return parameters;
}
@Override
public String getDescription() {
return "Local Outlier Factor (LOF). A Maximum of 4000 points is allowed.";
}
ResourceFactory resourceManager;
public String getResourceLoad() {
if (resourceManager == null)
resourceManager = new ResourceFactory();
return resourceManager.getResourceLoad(1);
}
@Override
public String getResources() {
return ResourceFactory.getResources(100f);
}
public static void main(String[] args) throws Exception {
long t0 = System.currentTimeMillis();
AlgorithmConfiguration config = new AlgorithmConfiguration();
config.setConfigPath("./cfg/");
config.setPersistencePath("./");
// config.setParam("PointsTable", "presence_basking_cluster");
// config.setParam("FeaturesColumnNames", "centerlat" + AlgorithmConfiguration.getListSeparator() + "centerlong");
config.setParam("PointsTable", "hcaf_d");
// config.setParam("FeaturesColumnNames", "depthmin" + AlgorithmConfiguration.getListSeparator() + "depthmax");
config.setParam("FeaturesColumnNames", "depthmin" + AlgorithmConfiguration.getListSeparator() + "depthmax"+ AlgorithmConfiguration.getListSeparator()+"depthmean"+
AlgorithmConfiguration.getListSeparator()+"sstanmean"+
AlgorithmConfiguration.getListSeparator()+"sstmnmax"+
AlgorithmConfiguration.getListSeparator()+"sstmnmin"+
AlgorithmConfiguration.getListSeparator()+"sbtanmean"+
AlgorithmConfiguration.getListSeparator()+"salinitymean"+
AlgorithmConfiguration.getListSeparator()+"salinitymax");
// config.setParam("FeaturesColumnNames", "depthmin");
config.setParam("PointsClusterTable", "occCluster_lof");
config.setParam("minimal_points_lower_bound", "1");
config.setParam("minimal_points_upper_bound", "100");
config.setParam("distance_function", "euclidean distance");
config.setParam("DatabaseUserName", "gcube");
config.setParam("DatabasePassword", "d4science2");
config.setParam("DatabaseURL", "jdbc:postgresql://146.48.87.169/testdb");
config.setParam("DatabaseDriver", "org.postgresql.Driver");
LOF cluster = new LOF();
cluster.setConfiguration(config);
cluster.init();
cluster.compute();
System.out.println("ELAPSED " + (System.currentTimeMillis() - t0));
}
}

View File

@@ -119,7 +119,7 @@ public class XMeansWrapper extends DBScan {
List<TableTemplates> templateOccs = new ArrayList<TableTemplates>();
templateOccs.add(TableTemplates.GENERIC);
InputTable p1 = new InputTable(templateOccs, "OccurrencePointsTable", "Occurrence Points Table", "occurrences");
InputTable p1 = new InputTable(templateOccs, "OccurrencePointsTable", "Occurrence Points Table. Max 4000 points", "occurrences");
ColumnTypesList p2 = new ColumnTypesList ("OccurrencePointsTable","FeaturesColumnNames", "column names for the features", false);
PrimitiveType p0 = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, "OccurrencePointsClusterLabel","table name of the resulting distribution","OccCluster_");
ServiceType p3 = new ServiceType(ServiceParameters.RANDOMSTRING, "OccurrencePointsClusterTable", "table name of the distribution", "occCluster_");
@@ -144,7 +144,7 @@ public class XMeansWrapper extends DBScan {
@Override
public String getDescription() {
return "Clustering with XMeans Algorithm: X-Means is K-Means extended by an Improve-Structure part In this part of the algorithm the centers are attempted to be split in its region. ";
return "Clustering with XMeans Algorithm: X-Means is K-Means extended by an Improve-Structure part In this part of the algorithm the centers are attempted to be split in its region. A Maximum of 4000 points is allowed.";
}
@Override

View File

@@ -146,7 +146,7 @@ public class FeedForwardNNFile extends ModelAquamapsNN {
List<TableTemplates> templatesOccurrences = new ArrayList<TableTemplates>();
templatesOccurrences.add(TableTemplates.GENERIC);
InputTable p1 = new InputTable(templatesOccurrences, TrainingDataSet, "a table containing real-valued columns for training the ANN");
InputTable p1 = new InputTable(templatesOccurrences, TrainingDataSet, "a table containing real-valued columns for training the ANN (up to 100,000 points)");
ColumnTypesList p2 = new ColumnTypesList(TrainingDataSet, TrainingDataSetColumns, "column names to use as feature vectors", false);
ColumnType p3 = new ColumnType(TrainingDataSet, TrainingDataSetTargetColumn, "the column to use as target", "probability", false);
PrimitiveTypesList p4 = new PrimitiveTypesList(Integer.class.getName(), PrimitiveTypes.NUMBER, LayersNeurons, "a list of neurons number for each inner layer", true);

View File

@@ -32,7 +32,7 @@ public class OccurrencePointsDuplicatesDeleter extends OccurrencePointsMerger {
templatesOccurrence.add(TableTemplates.OCCURRENCE_SPECIES);
// occurrence points tables
PrimitiveType p0 = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, finalTableNameL, "the name of the produced table", "DeletedOcc_");
InputTable p1 = new InputTable(templatesOccurrence, tableNameF, "the table containing the occurrence points", "");
InputTable p1 = new InputTable(templatesOccurrence, tableNameF, "the table containing the occurrence points (up to 100,000 points)", "");
ColumnType p3 = new ColumnType(tableNameF, longitudeColumn, "column with longitude values", "decimallongitude", false);
ColumnType p4 = new ColumnType(tableNameF, latitudeColumn, "column with latitude values", "decimallatitude", false);
ColumnType p5 = new ColumnType(tableNameF, recordedByColumn, "column with RecordedBy values", "recordedby", false);
@@ -62,7 +62,7 @@ public class OccurrencePointsDuplicatesDeleter extends OccurrencePointsMerger {
@Override
public String getDescription() {
return "An algorithm for deleting similar occurrences in a sets of occurrence points of species coming from the Species Discovery Facility of D4Science";
return "An algorithm for deleting similar occurrences in a sets of occurrence points of species coming from the Species Discovery Facility of D4Science. Works with up to 100 000 points";
}
@Override
@@ -106,7 +106,7 @@ public class OccurrencePointsDuplicatesDeleter extends OccurrencePointsMerger {
public void takeFullRanges() {
// take the elements from sx table
AnalysisLogger.getLogger().info("Taking elements from left table: " + leftTableName);
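// the "limit 100000" clause below caps the number of distinct rows fetched,
// matching the 100,000-point bound stated in the algorithm description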
leftRows = DatabaseFactory.executeSQLQuery(DatabaseUtils.getDinstictElements(tableName, columns.toString(), ""), dbconnection);
leftRows = DatabaseFactory.executeSQLQuery(DatabaseUtils.getDinstictElements(tableName, columns.toString(), "limit 100000"), dbconnection);
}
public void takeRange(int offsetLeft, int numLeft, int offsetRight, int numRight) {

View File

@@ -4,7 +4,10 @@ import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.List;
import org.apache.regexp.RE;
import org.gcube.contentmanagement.graphtools.data.BigSamplesTable;
import org.gcube.dataanalysis.ecoengine.configuration.AlgorithmConfiguration;
@@ -195,4 +198,39 @@ public class Transformations {
System.out.println("sha1: "+Sha1.SHA1(s2));
}
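/**
 * Splits a CSV row into its fields. A double-quoted field may contain the
 * delimiter, and a backslash-escaped quote inside it does not terminate the
 * field; surrounding quotes are stripped from the returned values.
 */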
public static List<String> parseCVSString(String row, String delimiter) throws Exception{
List<String> elements = new ArrayList<String>();
String phrase=row;
int idxdelim = -1;
boolean quot = false;
phrase = phrase.trim();
while ( (idxdelim = phrase.indexOf(delimiter))>=0) {
quot=phrase.startsWith("\"");
if (quot){
phrase = phrase.substring(1);
RE regexp = new RE("[^\\\\]\"");
boolean matching = regexp.match(phrase);
String quoted = "";
if (matching){
int i0 = regexp.getParenStart(0);
quoted= phrase.substring(0,i0+1).trim();
phrase = phrase.substring(i0+2).trim();
}
if (phrase.startsWith(delimiter))
phrase=phrase.substring(1);
elements.add(quoted);
}
else{
elements.add(phrase.substring(0,idxdelim));
phrase= phrase.substring(idxdelim+1).trim();
}
}
elements.add(phrase);
return elements;
}
}
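A quick usage sketch of the parser above (values are illustrative; the method declares throws Exception, so wrap or declare it). A double-quoted field keeps its embedded delimiter as a single element:

List<String> fields = Transformations.parseCVSString("a,\"b,c\",d", ",");
System.out.println(fields); // prints [a, b,c, d]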