This commit is contained in:
Gianpaolo Coro 2012-09-04 13:36:22 +00:00
parent fec0f3ac4d
commit 8941ec3652
8 changed files with 2813 additions and 19 deletions

View File

@ -1,2 +1,3 @@
DBSCAN=org.gcube.dataanalysis.ecoengine.clustering.DBScan
KMEANS=org.gcube.dataanalysis.ecoengine.clustering.KMeans
XMEANS=org.gcube.dataanalysis.ecoengine.clustering.XMeansWrapper

10
input.csv Normal file
View File

@ -0,0 +1,10 @@
5.1,3.5
4.9,3.0
4.7,3.2
4.6,3.1
5.0,3.6
5.4,3.9
4.6,3.4
5.0,3.4
4.4,2.9
4.9,3.1
1 5.1 3.5
2 4.9 3.0
3 4.7 3.2
4 4.6 3.1
5 5.0 3.6
6 5.4 3.9
7 4.6 3.4
8 5.0 3.4
9 4.4 2.9
10 4.9 3.1

21
pom.xml
View File

@ -151,6 +151,27 @@
<artifactId>xstream</artifactId>
<version>1.3.1</version>
</dependency>
<dependency>
<groupId>net.sf.squirrel-sql.thirdparty-non-maven</groupId>
<artifactId>java-cup</artifactId>
<version>0.11a</version>
</dependency>
<dependency>
<groupId>nz.ac.waikato.cms.weka</groupId>
<artifactId>weka-dev</artifactId>
<version>3.7.6</version>
</dependency>
<dependency>
<groupId>nz.ac.waikato.cms.weka</groupId>
<artifactId>XMeans</artifactId>
<version>1.0.3</version>
</dependency>
<dependency>
<groupId>org.pentaho.pentaho-commons</groupId>
<artifactId>pentaho-package-manager</artifactId>
<version>1.0.0</version>
</dependency>
</dependencies>
<repositories>
<repository>

View File

@ -46,15 +46,18 @@ public class DBScan implements Clusterer{
protected String FeaturesColumnNames;
protected float status;
protected SessionFactory dbHibConnection;
protected double[][] samplesVector;
public static String clusterColumn = "clusterid";
public static String clusterColumnType = "character varying";
public static String outliersColumn = "outlier";
public static String outliersColumnType = "boolean";
protected boolean initrapidminer = true;
public static void mainCluster(String[] args) throws Exception{
String coordinates [] = {
"55.973798,-55.297853",
"57.279043,-57.055666",
@ -162,8 +165,8 @@ public class DBScan implements Clusterer{
@Override
public void init() throws Exception {
if (config!=null)
status = 0;
if ((config!=null) && (initrapidminer))
config.initRapidMiner();
AnalysisLogger.getLogger().debug("Initialized Rapid Miner ");
AnalysisLogger.getLogger().debug("Initializing Database Connection");
@ -212,20 +215,16 @@ public class DBScan implements Clusterer{
FeaturesColumnNames=config.getParam("FeaturesColumnNames");
this.config=config;
}
}
protected void getSamples() throws Exception{
System.out.println("->"+DatabaseUtils.getColumnsElementsStatement(OccurrencePointsTable, FeaturesColumnNames, ""));
// System.out.println("->"+DatabaseUtils.getColumnsElementsStatement(OccurrencePointsTable, FeaturesColumnNames, ""));
FeaturesColumnNames=FeaturesColumnNames.replace(AlgorithmConfiguration.listSeparator, ",");
List<Object> samples = DatabaseFactory.executeSQLQuery(DatabaseUtils.getColumnsElementsStatement(OccurrencePointsTable, FeaturesColumnNames, ""), dbHibConnection);
String [] elements = FeaturesColumnNames.split(",");
int dimensions = elements.length;
int nSamples = samples.size();
double[][] samplesVector = new double[nSamples][dimensions];
samplesVector = new double[nSamples][dimensions];
int ir=0;
for (Object row:samples){
Object[] rowArr = (Object[]) row;
@ -310,7 +309,7 @@ public class DBScan implements Clusterer{
//take cluster element indexes
int npoints = c.getExampleIds().size();
AnalysisLogger.getLogger().debug("DBScan: Analyzing Cluster ->"+id+" with "+npoints);
if (npoints==minpoints)
if (npoints<minpoints)
outlier=true;
int k=0;

View File

@ -23,10 +23,6 @@ import com.rapidminer.tools.OperatorService;
public class KMeans extends DBScan{
public static String clusterColumn = "clusterid";
public static String clusterColumnType = "character varying";
public static String outliersColumn = "outlier";
public static String outliersColumnType = "boolean";
private String kk;
private String maxRuns;
private String maxOptimizations;
@ -130,7 +126,7 @@ public class KMeans extends DBScan{
PrimitiveType p4 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "k","Expected Number of Clusters","3");
PrimitiveType p5 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "max_runs","Max runs of the clustering procedure","10");
PrimitiveType p12 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "max_optimization_steps","Max number of internal optimization steps","5");
PrimitiveType p13 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "min_points","Minimum number of points to define an outlier set","2");
PrimitiveType p13 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "min_points","Number of points which define an outlier set","2");
parameters.add(p1);
parameters.add(p2);
@ -144,6 +140,7 @@ public class KMeans extends DBScan{
parameters.add(p10);
parameters.add(p11);
parameters.add(p12);
parameters.add(p13);
return parameters;
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,302 @@
package org.gcube.dataanalysis.ecoengine.clustering;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
import org.gcube.dataanalysis.ecoengine.configuration.AlgorithmConfiguration;
import org.gcube.dataanalysis.ecoengine.datatypes.DatabaseType;
import org.gcube.dataanalysis.ecoengine.datatypes.InputTable;
import org.gcube.dataanalysis.ecoengine.datatypes.PrimitiveType;
import org.gcube.dataanalysis.ecoengine.datatypes.PrimitiveTypesList;
import org.gcube.dataanalysis.ecoengine.datatypes.ServiceType;
import org.gcube.dataanalysis.ecoengine.datatypes.StatisticalType;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.DatabaseParameters;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.PrimitiveTypes;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.ServiceParameters;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.TableTemplates;
import org.gcube.dataanalysis.ecoengine.utils.DatabaseFactory;
import org.gcube.dataanalysis.ecoengine.utils.DatabaseUtils;
import org.gcube.dataanalysis.ecoengine.utils.Transformations;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.IOContainer;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.clustering.Cluster;
import com.rapidminer.operator.clustering.ClusterModel;
import com.rapidminer.tools.OperatorService;
import weka.clusterers.ClusterEvaluation;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ArffSaver;
import weka.core.converters.CSVLoader;
public class XMeansWrapper extends DBScan {
private String maxIterations;
private String minClusters;
private String maxClusters;
public XMeansWrapper(){
super();
initrapidminer=false;
}
public static void main1(String[] args) throws Exception {
args = new String[2];
args[0] = "input.csv";
args[1] = "c:/tmp/output.arff";
// load CSV
CSVLoader loader = new CSVLoader();
loader.setSource(new File(args[0]));
Instances data = loader.getDataSet();
// save ARFF
ArffSaver saver = new ArffSaver();
saver.setInstances(data);
saver.setFile(new File(args[1]));
// saver.setDestination(new File(args[1]));
saver.writeBatch();
}
public class CSV2Arff {
/**
* takes 2 arguments: - CSV input file - ARFF output file
*/
}
public static void main(String[] args) throws Exception {
XMeans xmeans = new XMeans();
// xmeans.setInputCenterFile(new File("./clusterinput.arf"));
// String[] options = {"-I 10","-M 1000","-J 1000","-L 2","-H 50","-B 1.0","-use-kdtree no","-N clusterinput.arf","-O clusterout.txt","-U 3"};
// String[] options = {"-I 10","-M 1000","-J 1000","-L 2","-H 50","-B 1.0","-use-kdtree no","-t clusterinput.arf","-O clusterout.txt","-U 3"};
// String optionsS = "-t c:/tmp/output.arff -O c:/tmp/clusterout.arff";
String optionsS = "-t c:/tmp/output.arff";
String[] options = optionsS.split(" ");
String elements = "ciao,tutti\n5.1,3.5\n4.9,3.0\n4.7,3.2\n4.6,3.1\n5.0,3.6\n5.4,3.9\n4.6,3.4\n5.0,3.4\n4.4,2.9\n4.9,3.1\n";
// xmeans.setInputCenterFile(new File("./clusterinput.arf"));
CSVLoader loader = new CSVLoader();
InputStream tis = new ByteArrayInputStream(elements.getBytes("UTF-8"));
loader.setSource(tis);
Instances id = loader.getDataSet();
System.out.println("ids: "+id.numInstances());
System.exit(0);
xmeans.buildClusterer(id);
// xmeans.main(options);
// ClusterEvaluation.evaluateClusterer(xmeans, options);
/*
* String[] opts = xmeans.getOptions(); for (int i=0;i<opts.length;i++){ System.out.println("options: "+opts[i]); }
*/
System.out.println(ClusterEvaluation.evaluateClusterer(xmeans, options));
// ClusterEvaluation.evaluateClusterer(xmeans, options);
System.out.println("*************");
Instances is = xmeans.getClusterCenters();
for (Instance i : is) {
DenseInstance di = (DenseInstance) i;
System.out.println("Attributes: " + i.numAttributes());
System.out.print("->" + di.toString(0));
System.out.println(" " + di.toString(1));
// System.out.println(i);
System.out.println("-------------------------------");
}
System.out.println(xmeans.m_Bic);
// System.out.println(xmeans.clusterInstance(instance));
int[] ii = xmeans.m_ClusterAssignments;
for (int ix : ii)
System.out.print(ix + " ");
// xmeans.main(options);
}
@Override
public List<StatisticalType> getInputParameters() {
List<StatisticalType> parameters = new ArrayList<StatisticalType>();
List<TableTemplates> templateOccs = new ArrayList<TableTemplates>();
templateOccs.add(TableTemplates.GENERIC);
InputTable p1 = new InputTable(templateOccs, "OccurrencePointsTable", "Occurrence Points Table", "occurrences");
PrimitiveTypesList p2 = new PrimitiveTypesList(PrimitiveTypes.STRING, "FeaturesColumnNames", "Column Names for the features", false);
ServiceType p3 = new ServiceType(ServiceParameters.RANDOMSTRING, "OccurrencePointsClusterTable", "Table name of the distribution", "occCluster_");
PrimitiveType p4 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "maxIterations", "XMeans max number of overall iterations of the clustering learning", "10");
PrimitiveType p5 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "minClusters", "Minimum number of expected clusters", "1");
PrimitiveType p12 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "maxClusters", "Maximum number of clusters to produce", "50");
PrimitiveType p13 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "min_points", "Number of points which define an outlier set", "2");
DatabaseType p6 = new DatabaseType(DatabaseParameters.DATABASEUSERNAME, "DatabaseUserName", "db user name");
DatabaseType p7 = new DatabaseType(DatabaseParameters.DATABASEPASSWORD, "DatabasePassword", "db password");
DatabaseType p8 = new DatabaseType(DatabaseParameters.DATABASEDRIVER, "DatabaseDriver", "db driver");
DatabaseType p9 = new DatabaseType(DatabaseParameters.DATABASEURL, "DatabaseURL", "db url");
DatabaseType p10 = new DatabaseType(DatabaseParameters.DATABASEDIALECT, "DatabaseDialect", "db dialect");
DatabaseType p11 = new DatabaseType(DatabaseParameters.DATABASETABLESPACE, "DatabaseTableSpace", "db dialect");
parameters.add(p1);
parameters.add(p2);
parameters.add(p3);
parameters.add(p4);
parameters.add(p5);
parameters.add(p6);
parameters.add(p7);
parameters.add(p8);
parameters.add(p9);
parameters.add(p10);
parameters.add(p11);
parameters.add(p12);
parameters.add(p13);
return parameters;
}
@Override
public String getDescription() {
return "Clustering with XMeans Algorithm: X-Means is K-Means extended by an Improve-Structure part In this part of the algorithm the centers are attempted to be split in its region. ";
}
@Override
public void setConfiguration(AlgorithmConfiguration config) {
if (config != null) {
maxIterations = config.getParam("maxIterations");
minClusters = config.getParam("minClusters");
maxClusters = config.getParam("maxClusters");
minPoints = config.getParam("min_points");
OccurrencePointsTable = config.getParam("OccurrencePointsTable").toLowerCase();
OccurrencePointsClusterTable = config.getParam("OccurrencePointsClusterTable").toLowerCase();
FeaturesColumnNames = config.getParam("FeaturesColumnNames");
this.config = config;
}
}
@Override
public void compute() throws Exception {
if ((config == null) || maxIterations == null || minClusters == null || maxClusters == null) {
throw new Exception("XMeans: Error incomplete parameters");
}
if ((samplesVector != null) && (samplesVector.length > 0)) {
AnalysisLogger.getLogger().debug("XMeans: Setting up the cluster");
CSVLoader loader = new CSVLoader();
StringBuffer sb = new StringBuffer();
for (int i = -1; i < samplesVector.length; i++) {
for (int j = 0; j < samplesVector[0].length; j++) {
if (i==-1)
sb.append("F"+j);
else
sb.append(samplesVector[i][j]);
if (j < samplesVector[0].length - 1) {
sb.append(",");
} else
sb.append("\n");
}
}
InputStream tis = new ByteArrayInputStream(sb.toString().getBytes("UTF-8"));
loader.setSource(tis);
Instances id = loader.getDataSet();
XMeans xmeans = new XMeans();
xmeans.setMaxIterations(Integer.parseInt(maxIterations));
xmeans.setMinNumClusters(Integer.parseInt(minClusters));
xmeans.setMaxNumClusters(Integer.parseInt(maxClusters));
xmeans.buildClusterer(id);
status = 50f;
// do clustering
AnalysisLogger.getLogger().debug("XMeans: Clustering ...");
Instances is = xmeans.getClusterCenters();
int nClusters = is.numInstances();
// take results
AnalysisLogger.getLogger().debug("XMeans: Found "+nClusters+" Centroids");
for (Instance i : is) {
DenseInstance di = (DenseInstance) i;
int nCluster = di.numAttributes();
for (int k = 0; k < nCluster; k++) {
AnalysisLogger.getLogger().debug(di.toString(k));
}
AnalysisLogger.getLogger().debug("-------------------------------");
}
int[] clusteringAssignments = xmeans.m_ClusterAssignments;
int[] counters = new int[nClusters];
for (int cluster:clusteringAssignments){
counters[cluster]++;
}
AnalysisLogger.getLogger().debug("XMeans: Building Table");
BuildClusterTable(clusteringAssignments, counters);
} else
AnalysisLogger.getLogger().debug("XMeans: Warning - Empty Training Set");
shutdown();
status = 100f;
}
protected void BuildClusterTable(int[] clusteringAssignments, int[] counters) throws Exception {
String columnsNames = FeaturesColumnNames + "," + clusterColumn + "," + outliersColumn;
int minpoints = Integer.parseInt(minPoints);
AnalysisLogger.getLogger().debug("Analyzing Cluster ->" + " minpoints " + minpoints);
StringBuffer bufferRows = new StringBuffer();
int nrows = samplesVector.length;
int ncols = samplesVector[0].length;
AnalysisLogger.getLogger().debug("Analyzing Cluster ->" + "Building Rows to Insert");
for (int k = 0; k < nrows; k++) {
bufferRows.append("(");
int cindex = clusteringAssignments[k];
boolean isoutlier = (counters[cindex]<minpoints);
for (int j = 0; j < ncols; j++) {
bufferRows.append(samplesVector[k][j]);
bufferRows.append(",");
}
bufferRows.append(cindex + "," + isoutlier + ")");
if (k < nrows - 1) {
bufferRows.append(",");
}
}
//TO-DO: insert row at chunks
AnalysisLogger.getLogger().debug("Analyzing Cluster ->" + "Inserting rows");
if (bufferRows.length() > 0) {
AnalysisLogger.getLogger().debug("XMeans: Writing into DB");
AnalysisLogger.getLogger().debug(DatabaseUtils.insertFromBuffer(OccurrencePointsClusterTable, columnsNames, bufferRows));
DatabaseFactory.executeSQLUpdate(DatabaseUtils.insertFromBuffer(OccurrencePointsClusterTable, columnsNames, bufferRows), dbHibConnection);
AnalysisLogger.getLogger().debug("XMeans: Finished with writing into DB");
} else
AnalysisLogger.getLogger().debug("XMeans: Nothing to write in the buffer");
status = 95f;
AnalysisLogger.getLogger().debug("XMeans: Status: " + status);
}
}

View File

@ -3,6 +3,8 @@ package org.gcube.dataanalysis.ecoengine.test.regression;
import java.util.List;
import org.gcube.dataanalysis.ecoengine.configuration.AlgorithmConfiguration;
import org.gcube.dataanalysis.ecoengine.datatypes.PrimitiveType;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.PrimitiveTypes;
import org.gcube.dataanalysis.ecoengine.interfaces.Clusterer;
import org.gcube.dataanalysis.ecoengine.processing.factories.ClusterersFactory;
@ -15,12 +17,22 @@ public class RegressionTestClusterers {
public static void main(String[] args) throws Exception {
System.out.println("TEST 1");
List<Clusterer> clus = ClusterersFactory.getClusterers(testConfigLocal());
List<Clusterer> clus;
clus = ClusterersFactory.getClusterers(testConfigLocal());
clus.get(0).init();
Regressor.process(clus.get(0));
clus = null;
clus = ClusterersFactory.getClusterers(testConfigLocal2());
clus.get(0).init();
Regressor.process(clus.get(0));
clus = null;
clus = ClusterersFactory.getClusterers(testConfigLocal3());
clus.get(0).init();
Regressor.process(clus.get(0));
clus = null;
}
@ -30,11 +42,45 @@ public static void main(String[] args) throws Exception {
config.setNumberOfResources(1);
config.setAgent("DBSCAN");
config.setParam("OccurrencePointsTable","presence_basking_cluster");
config.setParam("FeaturesColumnNames","centerlat,centerlong");
config.setParam("OccurrencePointsClusterTable","occCluster_2");
config.setParam("FeaturesColumnNames","centerlat"+AlgorithmConfiguration.getListSeparator()+"centerlong");
config.setParam("OccurrencePointsClusterTable","occcluster_dbscan");
config.setParam("epsilon","10");
config.setParam("minPoints","1");
return config;
}
/** Builds the local test configuration for the KMEANS clusterer regression run. */
private static AlgorithmConfiguration testConfigLocal2() {
    AlgorithmConfiguration cfg = Regressor.getConfig();
    cfg.setNumberOfResources(1);
    cfg.setAgent("KMEANS");
    // Input table and the feature columns to cluster on.
    cfg.setParam("OccurrencePointsTable","presence_basking_cluster");
    cfg.setParam("FeaturesColumnNames","centerlat"+AlgorithmConfiguration.getListSeparator()+"centerlong");
    // Output table for the cluster assignments.
    cfg.setParam("OccurrencePointsClusterTable","occcluster_kmeans");
    // KMeans hyper-parameters.
    cfg.setParam("k","50");
    cfg.setParam("max_runs","10");
    cfg.setParam("max_optimization_steps","10");
    cfg.setParam("min_points","2");
    return cfg;
}
/** Builds the local test configuration for the XMEANS clusterer regression run. */
private static AlgorithmConfiguration testConfigLocal3() {
    AlgorithmConfiguration cfg = Regressor.getConfig();
    cfg.setNumberOfResources(1);
    cfg.setAgent("XMEANS");
    // Input table and the feature columns to cluster on.
    cfg.setParam("OccurrencePointsTable","presence_basking_cluster");
    cfg.setParam("FeaturesColumnNames","centerlat"+AlgorithmConfiguration.getListSeparator()+"centerlong");
    // Output table for the cluster assignments.
    cfg.setParam("OccurrencePointsClusterTable","occcluster_xmeans");
    // XMeans hyper-parameters: iteration cap and cluster-count bounds.
    cfg.setParam("maxIterations","1000");
    cfg.setParam("minClusters","10");
    cfg.setParam("maxClusters","50");
    cfg.setParam("min_points","10");
    return cfg;
}
}