This commit is contained in:
Gianpaolo Coro 2012-09-03 09:26:37 +00:00
parent b7b23ad5f9
commit 41ff336881
2 changed files with 210 additions and 28 deletions

View File

@ -36,16 +36,16 @@ import com.rapidminer.tools.OperatorService;
public class DBScan implements Clusterer{ public class DBScan implements Clusterer{
AlgorithmConfiguration config; protected AlgorithmConfiguration config;
String epsilon; protected String epsilon;
String minPoints; protected String minPoints;
ExampleSet points; protected ExampleSet points;
ArrayList<ArrayList<String>> rows; protected ArrayList<ArrayList<String>> rows;
String OccurrencePointsTable; protected String OccurrencePointsTable;
String OccurrencePointsClusterTable; protected String OccurrencePointsClusterTable;
String FeaturesColumnNames; protected String FeaturesColumnNames;
float status; protected float status;
private SessionFactory dbHibConnection; protected SessionFactory dbHibConnection;
public static String clusterColumn = "clusterid"; public static String clusterColumn = "clusterid";
public static String clusterColumnType = "character varying"; public static String clusterColumnType = "character varying";
@ -165,20 +165,20 @@ public class DBScan implements Clusterer{
if (config!=null) if (config!=null)
config.initRapidMiner(); config.initRapidMiner();
AnalysisLogger.getLogger().debug("DBScan: Initialized Rapid Miner "); AnalysisLogger.getLogger().debug("Initialized Rapid Miner ");
AnalysisLogger.getLogger().debug("DBScan: Initializing Database Connection"); AnalysisLogger.getLogger().debug("Initializing Database Connection");
dbHibConnection=DatabaseUtils.initDBSession(config); dbHibConnection=DatabaseUtils.initDBSession(config);
//create the final table //create the final table
try{ try{
AnalysisLogger.getLogger().debug("DBScan: dropping table "+OccurrencePointsClusterTable); AnalysisLogger.getLogger().debug("dropping table "+OccurrencePointsClusterTable);
String dropStatement = DatabaseUtils.dropTableStatement(OccurrencePointsClusterTable); String dropStatement = DatabaseUtils.dropTableStatement(OccurrencePointsClusterTable);
AnalysisLogger.getLogger().debug("DBScan: dropping table "+dropStatement); AnalysisLogger.getLogger().debug("dropping table "+dropStatement);
DatabaseFactory.executeSQLUpdate(dropStatement, dbHibConnection); DatabaseFactory.executeSQLUpdate(dropStatement, dbHibConnection);
}catch(Exception e){ }catch(Exception e){
AnalysisLogger.getLogger().debug("DBScan: Could not drop table "+OccurrencePointsClusterTable); AnalysisLogger.getLogger().debug("Could not drop table "+OccurrencePointsClusterTable);
} }
//create Table //create Table
AnalysisLogger.getLogger().debug("DBScan: Creating table "+OccurrencePointsClusterTable); AnalysisLogger.getLogger().debug("Creating table "+OccurrencePointsClusterTable);
String [] features = FeaturesColumnNames.split(AlgorithmConfiguration.getListSeparator()); String [] features = FeaturesColumnNames.split(AlgorithmConfiguration.getListSeparator());
String columns = ""; String columns = "";
@ -190,13 +190,13 @@ public class DBScan implements Clusterer{
String createStatement = "create table "+OccurrencePointsClusterTable+" ( "+columns+")"; String createStatement = "create table "+OccurrencePointsClusterTable+" ( "+columns+")";
// String createStatement = new DatabaseUtils(dbHibConnection).buildCreateStatement(OccurrencePointsTable,OccurrencePointsClusterTable); // String createStatement = new DatabaseUtils(dbHibConnection).buildCreateStatement(OccurrencePointsTable,OccurrencePointsClusterTable);
AnalysisLogger.getLogger().debug("DBScan: "+createStatement); AnalysisLogger.getLogger().debug("Statement: "+createStatement);
DatabaseFactory.executeSQLUpdate(createStatement, dbHibConnection); DatabaseFactory.executeSQLUpdate(createStatement, dbHibConnection);
//add two columns one for cluster and another for outliers //add two columns one for cluster and another for outliers
AnalysisLogger.getLogger().debug("DBScan: Adding Columns"); AnalysisLogger.getLogger().debug("Adding Columns");
DatabaseFactory.executeSQLUpdate(DatabaseUtils.addColumnStatement(OccurrencePointsClusterTable, clusterColumn, clusterColumnType), dbHibConnection); DatabaseFactory.executeSQLUpdate(DatabaseUtils.addColumnStatement(OccurrencePointsClusterTable, clusterColumn, clusterColumnType), dbHibConnection);
DatabaseFactory.executeSQLUpdate(DatabaseUtils.addColumnStatement(OccurrencePointsClusterTable, outliersColumn, outliersColumnType), dbHibConnection); DatabaseFactory.executeSQLUpdate(DatabaseUtils.addColumnStatement(OccurrencePointsClusterTable, outliersColumn, outliersColumnType), dbHibConnection);
AnalysisLogger.getLogger().debug("DBScan: Getting Samples"); AnalysisLogger.getLogger().debug("Getting Samples");
//build samples //build samples
getSamples(); getSamples();
status = 10f; status = 10f;
@ -218,7 +218,9 @@ public class DBScan implements Clusterer{
private void getSamples() throws Exception{ protected void getSamples() throws Exception{
System.out.println("->"+DatabaseUtils.getColumnsElementsStatement(OccurrencePointsTable, FeaturesColumnNames, ""));
FeaturesColumnNames=FeaturesColumnNames.replace(AlgorithmConfiguration.listSeparator, ",");
List<Object> samples = DatabaseFactory.executeSQLQuery(DatabaseUtils.getColumnsElementsStatement(OccurrencePointsTable, FeaturesColumnNames, ""), dbHibConnection); List<Object> samples = DatabaseFactory.executeSQLQuery(DatabaseUtils.getColumnsElementsStatement(OccurrencePointsTable, FeaturesColumnNames, ""), dbHibConnection);
String [] elements = FeaturesColumnNames.split(","); String [] elements = FeaturesColumnNames.split(",");
int dimensions = elements.length; int dimensions = elements.length;
@ -281,10 +283,21 @@ public class DBScan implements Clusterer{
IOObject[] outputvector = output.getIOObjects(); IOObject[] outputvector = output.getIOObjects();
BuildClusterTable(outputvector);
shutdown();
status = 100f;
}
protected void BuildClusterTable(IOObject[] outputvector) throws Exception{
ClusterModel innermodel = (ClusterModel) outputvector[0]; ClusterModel innermodel = (ClusterModel) outputvector[0];
ExampleSet es = (ExampleSet) outputvector[1]; ExampleSet es = (ExampleSet) outputvector[1];
String columnsNames =FeaturesColumnNames+","+clusterColumn+","+outliersColumn; String columnsNames =FeaturesColumnNames+","+clusterColumn+","+outliersColumn;
int minpoints = Integer.parseInt(minPoints); int minpoints = Integer.parseInt(minPoints);
AnalysisLogger.getLogger().debug("Analyzing Cluster ->"+" minpoints"+minpoints);
int nClusters = innermodel.getClusters().size(); int nClusters = innermodel.getClusters().size();
float statusstep = ((100f-status)/ (float)(nClusters+1)); float statusstep = ((100f-status)/ (float)(nClusters+1));
@ -296,7 +309,7 @@ public class DBScan implements Clusterer{
boolean outlier = false; boolean outlier = false;
//take cluster element indexes //take cluster element indexes
int npoints = c.getExampleIds().size(); int npoints = c.getExampleIds().size();
AnalysisLogger.getLogger().debug("DBScan: Analyzing Cluster ->"+id+" with "+npoints+" "+minpoints); AnalysisLogger.getLogger().debug("DBScan: Analyzing Cluster ->"+id+" with "+npoints);
if (npoints==minpoints) if (npoints==minpoints)
outlier=true; outlier=true;
@ -341,12 +354,8 @@ public class DBScan implements Clusterer{
float instatus = status + statusstep; float instatus = status + statusstep;
status = Math.min(95f, instatus); status = Math.min(95f, instatus);
AnalysisLogger.getLogger().debug("DBScan: Status: "+status); AnalysisLogger.getLogger().debug("DBScan: Status: "+status);
}
shutdown();
status = 100f;
} }
}
@ -384,8 +393,8 @@ public class DBScan implements Clusterer{
PrimitiveTypesList p2 = new PrimitiveTypesList(PrimitiveTypes.STRING, "FeaturesColumnNames","Column Names for the features",false); PrimitiveTypesList p2 = new PrimitiveTypesList(PrimitiveTypes.STRING, "FeaturesColumnNames","Column Names for the features",false);
ServiceType p3 = new ServiceType(ServiceParameters.RANDOMSTRING, "OccurrencePointsClusterTable","Table name of the distribution","occCluster_"); ServiceType p3 = new ServiceType(ServiceParameters.RANDOMSTRING, "OccurrencePointsClusterTable","Table name of the distribution","occCluster_");
PrimitiveType p4 = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, "epsilon","DBScan epsilon parameter","10"); PrimitiveType p4 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "epsilon","DBScan epsilon parameter","10");
PrimitiveType p5 = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, "minPoints","DBScan minimum points parameter (identifies outliers)","1"); PrimitiveType p5 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "minPoints","DBScan minimum points parameter (identifies outliers)","1");
DatabaseType p6 = new DatabaseType(DatabaseParameters.DATABASEUSERNAME, "DatabaseUserName", "db user name"); DatabaseType p6 = new DatabaseType(DatabaseParameters.DATABASEUSERNAME, "DatabaseUserName", "db user name");

View File

@ -0,0 +1,173 @@
package org.gcube.dataanalysis.ecoengine.clustering;
import java.util.ArrayList;
import java.util.List;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
import org.gcube.dataanalysis.ecoengine.configuration.AlgorithmConfiguration;
import org.gcube.dataanalysis.ecoengine.datatypes.DatabaseType;
import org.gcube.dataanalysis.ecoengine.datatypes.InputTable;
import org.gcube.dataanalysis.ecoengine.datatypes.PrimitiveType;
import org.gcube.dataanalysis.ecoengine.datatypes.PrimitiveTypesList;
import org.gcube.dataanalysis.ecoengine.datatypes.ServiceType;
import org.gcube.dataanalysis.ecoengine.datatypes.StatisticalType;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.DatabaseParameters;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.PrimitiveTypes;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.ServiceParameters;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.TableTemplates;
import org.gcube.dataanalysis.ecoengine.utils.ResourceFactory;
import com.rapidminer.operator.IOContainer;
import com.rapidminer.operator.IOObject;
import com.rapidminer.tools.OperatorService;
/**
 * K-Means clusterer built on top of {@link DBScan}.
 *
 * <p>The class reuses the state inherited from {@code DBScan} (configuration, table names,
 * feature column names, the {@code points} example set and the {@code status} progress value)
 * and the inherited {@code BuildClusterTable} / {@code shutdown} steps; only the clustering
 * itself is delegated to the RapidMiner {@code KMeans} operator.
 */
public class KMeans extends DBScan{

	// NOTE(review): these four fields hide same-named static fields declared in DBScan.
	// Code inherited from DBScan (e.g. BuildClusterTable) resolves DBScan's own copies, so
	// changing the values here does not change the columns written by the inherited code.
	public static String clusterColumn = "clusterid";
	public static String clusterColumnType = "character varying";
	public static String outliersColumn = "outlier";
	public static String outliersColumnType = "boolean";

	/** Raw value of the "k" parameter (expected number of clusters), passed as-is to the operator. */
	private String kk;
	/** Raw value of the "max_runs" parameter. */
	private String maxRuns;
	/** Raw value of the "max_optimization_steps" parameter. */
	private String maxOptimizations;

	/**
	 * Manual smoke run: configures the algorithm against a fixed test database, runs
	 * init + compute and prints the elapsed time in milliseconds.
	 *
	 * @param args ignored
	 * @throws Exception if initialization or clustering fails
	 */
	public static void main(String[] args) throws Exception{
		long t0 = System.currentTimeMillis();

		AlgorithmConfiguration config = new AlgorithmConfiguration();
		config.setConfigPath("./cfg/");
		config.setPersistencePath("./");
		config.setParam("OccurrencePointsTable","presence_basking_cluster");
		config.setParam("FeaturesColumnNames","centerlat"+AlgorithmConfiguration.getListSeparator()+"centerlong");
		config.setParam("OccurrencePointsClusterTable","occCluster_kmeans");
		config.setParam("k","50");
		config.setParam("max_runs","10");
		config.setParam("max_optimization_steps","10");
		config.setParam("min_points","2");

		// NOTE(review): database credentials and host are hard-coded in source; consider
		// moving them to external configuration before this class is shared or published.
		config.setParam("DatabaseUserName","gcube");
		config.setParam("DatabasePassword","d4science2");
		config.setParam("DatabaseURL","jdbc:postgresql://146.48.87.169/testdb");
		config.setParam("DatabaseDriver","org.postgresql.Driver");

		KMeans cluster = new KMeans();
		cluster.setConfiguration(config);
		cluster.init();
		cluster.compute();

		System.out.println("ELAPSED "+(System.currentTimeMillis()-t0));
	}

	/**
	 * Reads the K-Means parameters and the table/feature settings from the configuration.
	 * A {@code null} configuration is ignored and leaves the current state untouched.
	 *
	 * @param config the algorithm configuration to read parameters from; may be {@code null}
	 */
	@Override
	public void setConfiguration(AlgorithmConfiguration config) {
		if (config!=null){
			kk=config.getParam("k");
			maxRuns= config.getParam("max_runs");
			maxOptimizations = config.getParam("max_optimization_steps");
			// table names are normalized to lower case before being used in SQL statements
			OccurrencePointsTable = config.getParam("OccurrencePointsTable").toLowerCase();
			OccurrencePointsClusterTable=config.getParam("OccurrencePointsClusterTable").toLowerCase();
			FeaturesColumnNames=config.getParam("FeaturesColumnNames");
			minPoints=config.getParam("min_points");
			this.config=config;
		}
	}

	/**
	 * Runs the RapidMiner K-Means operator on the inherited {@code points} example set,
	 * hands the operator output to the inherited {@code BuildClusterTable}, then shuts down
	 * and marks the computation as complete ({@code status = 100}).
	 *
	 * @throws Exception if a required parameter (configuration, k, max_runs,
	 *         max_optimization_steps, min_points) is missing, or if clustering or table
	 *         building fails
	 */
	@Override
	public void compute() throws Exception {

		// minPoints is parsed with Integer.parseInt by the inherited BuildClusterTable; checking
		// it here fails fast instead of failing only after the whole clustering has been run.
		if ((config==null)||kk==null||maxRuns==null||maxOptimizations==null||minPoints==null){
			throw new Exception("KMeans: Error incomplete parameters");
		}

		AnalysisLogger.getLogger().debug("KMeans: Settin up the cluster");
		//take elements and produce example set
		com.rapidminer.operator.clustering.clusterer.KMeans kmeans = (com.rapidminer.operator.clustering.clusterer.KMeans) OperatorService.createOperator("KMeans");

		kmeans.setParameter("k", kk);
		kmeans.setParameter("max_runs",maxRuns);
		kmeans.setParameter("max_optimization_steps", maxOptimizations);

		kmeans.setParameter("keep_example_set", "true");
		kmeans.setParameter("add_cluster_attribute", "true");

		IOContainer innerInput = new IOContainer(points);

		AnalysisLogger.getLogger().debug("KMeans: Clustering...");
		IOContainer output = kmeans.apply(innerInput);
		AnalysisLogger.getLogger().debug("KMeans: ...Clustering Finished");
		status = 70f;

		IOObject[] outputvector = output.getIOObjects();

		BuildClusterTable(outputvector);

		shutdown();
		status = 100f;
	}

	/**
	 * Describes the input parameters accepted by this algorithm.
	 *
	 * @return the list of parameter descriptors, including the database settings and the
	 *         K-Means specific parameters (k, max_runs, max_optimization_steps, min_points)
	 */
	@Override
	public List<StatisticalType> getInputParameters() {
		List<StatisticalType> parameters = new ArrayList<StatisticalType>();
		List<TableTemplates> templateOccs = new ArrayList<TableTemplates>();
		templateOccs.add(TableTemplates.GENERIC);

		InputTable p1 = new InputTable(templateOccs,"OccurrencePointsTable","Occurrence Points Table","occurrences");
		PrimitiveTypesList p2 = new PrimitiveTypesList(PrimitiveTypes.STRING, "FeaturesColumnNames","Column Names for the features",false);
		ServiceType p3 = new ServiceType(ServiceParameters.RANDOMSTRING, "OccurrencePointsClusterTable","Table name of the distribution","occCluster_");

		DatabaseType p6 = new DatabaseType(DatabaseParameters.DATABASEUSERNAME, "DatabaseUserName", "db user name");
		DatabaseType p7 = new DatabaseType(DatabaseParameters.DATABASEPASSWORD, "DatabasePassword", "db password");
		DatabaseType p8 = new DatabaseType(DatabaseParameters.DATABASEDRIVER, "DatabaseDriver", "db driver");
		DatabaseType p9 = new DatabaseType(DatabaseParameters.DATABASEURL, "DatabaseURL", "db url");
		DatabaseType p10 = new DatabaseType(DatabaseParameters.DATABASEDIALECT, "DatabaseDialect", "db dialect");
		DatabaseType p11 = new DatabaseType(DatabaseParameters.DATABASETABLESPACE, "DatabaseTableSpace", "db dialect");

		PrimitiveType p4 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "k","Expected Number of Clusters","3");
		PrimitiveType p5 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "max_runs","Max runs of the clustering procedure","10");
		PrimitiveType p12 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "max_optimization_steps","Max number of internal optimization steps","5");
		PrimitiveType p13 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "min_points","Minimum number of points to define an outlier set","2");

		parameters.add(p1);
		parameters.add(p2);
		parameters.add(p3);
		parameters.add(p4);
		parameters.add(p5);
		parameters.add(p6);
		parameters.add(p7);
		parameters.add(p8);
		parameters.add(p9);
		parameters.add(p10);
		parameters.add(p11);
		parameters.add(p12);
		// min_points is read by setConfiguration and required by compute, so it must be
		// advertised too; appended last to keep the positions of the existing entries stable.
		parameters.add(p13);

		return parameters;
	}

	/**
	 * @return a short human-readable description of the algorithm
	 */
	@Override
	public String getDescription() {
		return "Clustering with KMeans";
	}

	/** Lazily created by {@link #getResourceLoad()}. */
	ResourceFactory resourceManager;

	/**
	 * Returns the resource load as reported by a lazily created {@link ResourceFactory}.
	 *
	 * @return the value of {@code resourceManager.getResourceLoad(1)}
	 */
	public String getResourceLoad() {
		if (resourceManager==null)
			resourceManager = new ResourceFactory();
		return resourceManager.getResourceLoad(1);
	}

	/**
	 * @return the value of {@code ResourceFactory.getResources(100f)}
	 */
	@Override
	public String getResources() {
		return ResourceFactory.getResources(100f);
	}

}