Utils for CSV parsing and point-count limits for Clusterers

git-svn-id: https://svn.d4science.research-infrastructures.eu/gcube/trunk/data-analysis/EcologicalEngine@82321 82a268e6-3cf1-43bd-a215-b396298e98cf
Gianpaolo Coro 2013-10-02 17:35:52 +00:00
parent 2438abbd29
commit f1dbb48522
7 changed files with 377 additions and 12 deletions

View File

@@ -224,14 +224,34 @@ public class DBScan implements Clusterer{
t00=System.currentTimeMillis();
// System.out.println("->"+DatabaseUtils.getColumnsElementsStatement(OccurrencePointsTable, FeaturesColumnNames, ""));
FeaturesColumnNames=FeaturesColumnNames.replace(AlgorithmConfiguration.listSeparator, ",");
List<Object> samples = DatabaseFactory.executeSQLQuery(DatabaseUtils.getColumnsElementsStatement(OccurrencePointsTable, FeaturesColumnNames, ""), dbHibConnection);
String [] elements = FeaturesColumnNames.split(",");
// int limit = (int)Math.pow(5000,1d/(double)elements.length);
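// Cap the number of points fetched for clustering. The commented-out
// formulas here and below are earlier heuristics that scaled the cap with
// the number of feature dimensions; the flat cap N is what is actually used.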
int N=4000;
double k = elements.length;
double t=82327;
double logG = Math.log(t)-N;
int limit = N;
// if (k>1)
// limit = (int)Math.round(( Math.log(t)-k*logG )/k );
// limit = (int)Math.round((double)N/k);
// limit = (int)(11d*Math.pow(N,2d/(k+1)));
// limit =(int) ((double)N/(1.3d));
AnalysisLogger.getLogger().debug("Clustering limit: "+limit);
List<Object> samples = DatabaseFactory.executeSQLQuery(DatabaseUtils.getColumnsElementsStatement(OccurrencePointsTable, FeaturesColumnNames, "limit "+limit), dbHibConnection);
int dimensions = elements.length;
int nSamples = samples.size();
samplesVector = new double[nSamples][dimensions];
int ir=0;
for (Object row:samples){
Object[] rowArr = (Object[]) row;
Object[] rowArr = new Object[1];
try{rowArr = (Object[]) row;}
catch(ClassCastException e){
rowArr[0] = ""+row;
}
int ic=0;
for (Object elem:rowArr){
Double feature = null;
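The try/catch above guards against single-column projections: Hibernate typically returns a bare scalar per row for one-column queries and an Object[] otherwise. A minimal standalone sketch of the same normalization (hypothetical helper, not part of this commit):

static Object[] toRow(Object row) {
    if (row instanceof Object[])
        return (Object[]) row;         // multi-column result
    return new Object[] { "" + row };  // single column: wrap the scalar as text
}

An instanceof check like this also avoids throwing and catching a ClassCastException on every single-column row.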
@@ -398,7 +418,7 @@ public class DBScan implements Clusterer{
List<StatisticalType> parameters = new ArrayList<StatisticalType>();
List<TableTemplates> templateOccs = new ArrayList<TableTemplates>();
templateOccs.add(TableTemplates.GENERIC);
InputTable p1 = new InputTable(templateOccs,"OccurrencePointsTable","Occurrence Points Table","occurrences");
InputTable p1 = new InputTable(templateOccs,"OccurrencePointsTable","Occurrence Points Table. Max 4000 points","occurrences");
ColumnTypesList p2 = new ColumnTypesList ("OccurrencePointsTable","FeaturesColumnNames", "column names for the features", false);
PrimitiveType p0 = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, "OccurrencePointsClusterLabel","table name of the resulting distribution","OccCluster_");
ServiceType p3 = new ServiceType(ServiceParameters.RANDOMSTRING, "OccurrencePointsClusterTable","table name of the distribution","occCluster_");
@@ -419,7 +439,7 @@ public class DBScan implements Clusterer{
@Override
public String getDescription() {
return "Clustering with DBScan";
return "Clustering with DBScan. A maximum of 4000 points is allowed.";
}

View File

@@ -118,7 +118,7 @@ public class KMeans extends DBScan{
List<StatisticalType> parameters = new ArrayList<StatisticalType>();
List<TableTemplates> templateOccs = new ArrayList<TableTemplates>();
templateOccs.add(TableTemplates.GENERIC);
InputTable p1 = new InputTable(templateOccs,"OccurrencePointsTable","Occurrence Points Table","occurrences");
InputTable p1 = new InputTable(templateOccs,"OccurrencePointsTable","Occurrence Points Table. Max 4000 points","occurrences");
ColumnTypesList p2 = new ColumnTypesList ("OccurrencePointsTable","FeaturesColumnNames", "column names for the features", false);
PrimitiveType p0 = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, "OccurrencePointsClusterLabel","table name of the resulting distribution","OccCluster_");
ServiceType p3 = new ServiceType(ServiceParameters.RANDOMSTRING, "OccurrencePointsClusterTable","table name of the distribution","occCluster_");
@@ -142,7 +142,7 @@ public class KMeans extends DBScan{
@Override
public String getDescription() {
return "Clustering with KMeans";
return "Clustering with KMeans. A Maximum of 4000 points is allowed.";
}

View File

@@ -0,0 +1,307 @@
package org.gcube.dataanalysis.ecoengine.clustering;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.List;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
import org.gcube.dataanalysis.ecoengine.configuration.AlgorithmConfiguration;
import org.gcube.dataanalysis.ecoengine.datatypes.ColumnTypesList;
import org.gcube.dataanalysis.ecoengine.datatypes.DatabaseType;
import org.gcube.dataanalysis.ecoengine.datatypes.InputTable;
import org.gcube.dataanalysis.ecoengine.datatypes.PrimitiveType;
import org.gcube.dataanalysis.ecoengine.datatypes.ServiceType;
import org.gcube.dataanalysis.ecoengine.datatypes.StatisticalType;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.PrimitiveTypes;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.ServiceParameters;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.TableTemplates;
import org.gcube.dataanalysis.ecoengine.utils.DatabaseFactory;
import org.gcube.dataanalysis.ecoengine.utils.DatabaseUtils;
import org.gcube.dataanalysis.ecoengine.utils.DynamicEnum;
import org.gcube.dataanalysis.ecoengine.utils.ResourceFactory;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.set.SimpleExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DataRow;
import com.rapidminer.example.table.DoubleArrayDataRow;
import com.rapidminer.example.table.ExampleTable;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.operator.IOContainer;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.clustering.Cluster;
import com.rapidminer.operator.clustering.ClusterModel;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.OperatorService;
public class LOF extends DBScan {
String minimal_points_lower_bound = "1";
String minimal_points_upper_bound = "10";
String lof_threshold = "2";
String distance_function = "euclidian distance";
static String lofcolumn = "lof";
static String lofcolumntype = "real";
LOFenum enuFunctions = new LOFenum();
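// deliberately empty: the distance-function constants are injected at
// runtime through DynamicEnum (see getInputParameters below)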
enum LOFenumType {
}
class LOFenum extends DynamicEnum {
public Field[] getFields() {
Field[] fields = LOFenumType.class.getDeclaredFields();
return fields;
}
}
@Override
public void init() throws Exception {
status = 0;
if ((config!=null) && (initrapidminer))
config.initRapidMiner();
AnalysisLogger.getLogger().debug("Initialized Rapid Miner ");
AnalysisLogger.getLogger().debug("Initializing Database Connection");
dbHibConnection=DatabaseUtils.initDBSession(config);
//create the final table
try{
AnalysisLogger.getLogger().debug("dropping table "+OccurrencePointsClusterTable);
String dropStatement = DatabaseUtils.dropTableStatement(OccurrencePointsClusterTable);
AnalysisLogger.getLogger().debug("dropping table "+dropStatement);
DatabaseFactory.executeSQLUpdate(dropStatement, dbHibConnection);
}catch(Exception e){
AnalysisLogger.getLogger().debug("Could not drop table "+OccurrencePointsClusterTable);
}
//create Table
AnalysisLogger.getLogger().debug("Creating table "+OccurrencePointsClusterTable);
String [] features = FeaturesColumnNames.split(AlgorithmConfiguration.getListSeparator());
String columns = "";
for (int i=0;i<features.length;i++){
columns +=features[i]+" real";
if (i<features.length-1)
columns+=",";
}
String createStatement = "create table "+OccurrencePointsClusterTable+" ( "+columns+")";
// String createStatement = new DatabaseUtils(dbHibConnection).buildCreateStatement(OccurrencePointsTable,OccurrencePointsClusterTable);
AnalysisLogger.getLogger().debug("Statement: "+createStatement);
DatabaseFactory.executeSQLUpdate(createStatement, dbHibConnection);
//add two columns: one for the lof score and one for the outlier flag
AnalysisLogger.getLogger().debug("Adding Columns");
DatabaseFactory.executeSQLUpdate(DatabaseUtils.addColumnStatement(OccurrencePointsClusterTable, lofcolumn, lofcolumntype), dbHibConnection);
DatabaseFactory.executeSQLUpdate(DatabaseUtils.addColumnStatement(OccurrencePointsClusterTable, outliersColumn, outliersColumnType), dbHibConnection);
AnalysisLogger.getLogger().debug("Getting Samples");
//build samples
getSamples();
status = 10f;
}
@Override
public void setConfiguration(AlgorithmConfiguration config) {
if (config != null) {
minimal_points_lower_bound = config.getParam("minimal_points_lower_bound");
minimal_points_upper_bound = config.getParam("minimal_points_upper_bound");
distance_function = config.getParam("distance_function");
lof_threshold = config.getParam("lof_threshold");
OccurrencePointsTable = config.getParam("PointsTable").toLowerCase();
OccurrencePointsClusterLabel = config.getParam("PointsClusterLabel");
OccurrencePointsClusterTable = config.getParam("PointsClusterTable").toLowerCase();
FeaturesColumnNames = config.getParam("FeaturesColumnNames");
this.config = config;
}
}
@Override
public void compute() throws Exception {
try {
if ((config == null) || minimal_points_lower_bound == null || minimal_points_upper_bound == null || distance_function == null) {
throw new Exception("LOF: Error: incomplete parameters");
}
status = 10f;
AnalysisLogger.getLogger().debug("LOF: Settin up the cluster");
// take elements and produce example set
com.rapidminer.operator.preprocessing.outlier.LOFOutlierOperator clusterer = (com.rapidminer.operator.preprocessing.outlier.LOFOutlierOperator) OperatorService.createOperator("LOFOutlierDetection");
clusterer.setParameter("minimal_points_lower_bound", minimal_points_lower_bound);
clusterer.setParameter("minimal_points_upper_bound", minimal_points_upper_bound);
clusterer.setParameter("distance_function", distance_function);
IOContainer innerInput = new IOContainer(points);
AnalysisLogger.getLogger().debug("LOF: Clustering...");
long ti = System.currentTimeMillis();
IOContainer output = clusterer.apply(innerInput);
AnalysisLogger.getLogger().debug("LOF: ...ELAPSED CLUSTERING TIME: " + (System.currentTimeMillis() - ti));
AnalysisLogger.getLogger().debug("LOF: ...Clustering Finished");
status = 70f;
IOObject[] outputvector = output.getIOObjects();
BuildClusterTable(outputvector);
} catch (Exception e) {
throw e;
} finally {
shutdown();
status = 100f;
}
}
@Override
protected void BuildClusterTable(IOObject[] outputvector) throws Exception {
StringBuffer bufferRows = new StringBuffer();
SimpleExampleSet output = (SimpleExampleSet) outputvector[0];
MemoryExampleTable met = (MemoryExampleTable) output.getExampleTable();
int numofcolumns = met.getAttributeCount();
int numofrows = met.size();
double lofthr = 2;
if (lof_threshold != null)
try { lofthr = Double.parseDouble(lof_threshold); } catch (Exception e) {}
AnalysisLogger.getLogger().debug("LOF: using LOF threshold: " + lofthr);
for (int i = 0; i < numofrows; i++) {
DataRow dr = met.getDataRow(i);
Attribute outlierAtt = met.getAttribute(numofcolumns - 1);
bufferRows.append("(");
for (int j=0;j<numofcolumns-2;j++){
Attribute att = met.getAttribute(j);
bufferRows.append(dr.get(att)+",");
}
double lofscore = dr.get(outlierAtt);
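// LOF can come back infinite (e.g. when duplicated points make the
// k-distance zero); clamp it so the value fits the table's real column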
if (lofscore>Double.MAX_VALUE)
lofscore = Float.MAX_VALUE;
boolean outlier = (lofscore>=lofthr);
bufferRows.append(lofscore+","+outlier+")");
if (i<numofrows-1)
bufferRows.append(",");
}
AnalysisLogger.getLogger().debug("LOF: Finished in retrieving and building output to write");
String columnsNames = FeaturesColumnNames + ","+lofcolumn+","+outliersColumn;
// System.out.println(DatabaseUtils.insertFromBuffer(OccurrencePointsClusterTable, columnsNames, bufferRows));
if (bufferRows.length() > 0) {
AnalysisLogger.getLogger().debug("Writing into DB");
// AnalysisLogger.getLogger().debug("Query to execute: "+DatabaseUtils.insertFromBuffer(OccurrencePointsClusterTable, columnsNames, bufferRows));
DatabaseFactory.executeSQLUpdate(DatabaseUtils.insertFromBuffer(OccurrencePointsClusterTable, columnsNames, bufferRows), dbHibConnection);
AnalysisLogger.getLogger().debug("Finished with writing into DB");
} else
AnalysisLogger.getLogger().debug("Nothing to write in the buffer");
status = 100;
AnalysisLogger.getLogger().debug("Status: " + status);
}
@Override
public List<StatisticalType> getInputParameters() {
List<StatisticalType> parameters = new ArrayList<StatisticalType>();
List<TableTemplates> templateOccs = new ArrayList<TableTemplates>();
templateOccs.add(TableTemplates.GENERIC);
InputTable p1 = new InputTable(templateOccs, "PointsTable", "Table containing points or observations. Max 4000 points", "pointstable");
ColumnTypesList p2 = new ColumnTypesList("PointsTable", "FeaturesColumnNames", "column Names for the features", false);
PrimitiveType p0 = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, "PointsClusterLabel", "table name of the resulting distribution", "Cluster_");
ServiceType p3 = new ServiceType(ServiceParameters.RANDOMSTRING, "PointsClusterTable", "table name of the distribution", "occcluster_");
PrimitiveType p4 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "minimal_points_lower_bound", "locality (usually called k): minimal number of nearest neighbors", "2");
PrimitiveType p5 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "minimal_points_upper_bound", "maximum number of nearest neighbors to take into account for outliers evaluation", "10");
if (LOFenumType.values().length<2) {
enuFunctions.addEnum(LOFenumType.class, "euclidian distance");
enuFunctions.addEnum(LOFenumType.class, "squared distance");
enuFunctions.addEnum(LOFenumType.class, "cosine distance");
enuFunctions.addEnum(LOFenumType.class, "inverted cosine distance");
enuFunctions.addEnum(LOFenumType.class, "angle");
}
PrimitiveType p6 = new PrimitiveType(Enum.class.getName(), LOFenumType.values(), PrimitiveTypes.ENUMERATED, "distance_function", "the distance function to use in the calculation", "euclidian distance");
PrimitiveType p7 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, "lof_threshold", "the LOF score threshold above which a point is considered an outlier (usually 2)", "2");
parameters.add(p1);
parameters.add(p2);
parameters.add(p0);
parameters.add(p3);
parameters.add(p4);
parameters.add(p5);
parameters.add(p6);
parameters.add(p7);
DatabaseType.addDefaultDBPars(parameters);
return parameters;
}
@Override
public String getDescription() {
return "Local Outlier Factor (LOF). A Maximum of 4000 points is allowed.";
}
ResourceFactory resourceManager;
public String getResourceLoad() {
if (resourceManager == null)
resourceManager = new ResourceFactory();
return resourceManager.getResourceLoad(1);
}
@Override
public String getResources() {
return ResourceFactory.getResources(100f);
}
public static void main(String[] args) throws Exception {
long t0 = System.currentTimeMillis();
AlgorithmConfiguration config = new AlgorithmConfiguration();
config.setConfigPath("./cfg/");
config.setPersistencePath("./");
// config.setParam("PointsTable", "presence_basking_cluster");
// config.setParam("FeaturesColumnNames", "centerlat" + AlgorithmConfiguration.getListSeparator() + "centerlong");
config.setParam("PointsTable", "hcaf_d");
// config.setParam("FeaturesColumnNames", "depthmin" + AlgorithmConfiguration.getListSeparator() + "depthmax");
config.setParam("FeaturesColumnNames", "depthmin" + AlgorithmConfiguration.getListSeparator() + "depthmax"+ AlgorithmConfiguration.getListSeparator()+"depthmean"+
AlgorithmConfiguration.getListSeparator()+"sstanmean"+
AlgorithmConfiguration.getListSeparator()+"sstmnmax"+
AlgorithmConfiguration.getListSeparator()+"sstmnmin"+
AlgorithmConfiguration.getListSeparator()+"sbtanmean"+
AlgorithmConfiguration.getListSeparator()+"salinitymean"+
AlgorithmConfiguration.getListSeparator()+"salinitymax");
// config.setParam("FeaturesColumnNames", "depthmin");
config.setParam("PointsClusterTable", "occCluster_lof");
config.setParam("minimal_points_lower_bound", "1");
config.setParam("minimal_points_upper_bound", "100");
config.setParam("distance_function", "euclidean distance");
config.setParam("DatabaseUserName", "gcube");
config.setParam("DatabasePassword", "d4science2");
config.setParam("DatabaseURL", "jdbc:postgresql://146.48.87.169/testdb");
config.setParam("DatabaseDriver", "org.postgresql.Driver");
LOF cluster = new LOF();
cluster.setConfiguration(config);
cluster.init();
cluster.compute();
System.out.println("ELAPSED " + (System.currentTimeMillis() - t0));
}
}

View File

@@ -119,7 +119,7 @@ public class XMeansWrapper extends DBScan {
List<TableTemplates> templateOccs = new ArrayList<TableTemplates>();
templateOccs.add(TableTemplates.GENERIC);
InputTable p1 = new InputTable(templateOccs, "OccurrencePointsTable", "Occurrence Points Table", "occurrences");
InputTable p1 = new InputTable(templateOccs, "OccurrencePointsTable", "Occurrence Points Table. Max 4000 points", "occurrences");
ColumnTypesList p2 = new ColumnTypesList ("OccurrencePointsTable","FeaturesColumnNames", "column names for the features", false);
PrimitiveType p0 = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, "OccurrencePointsClusterLabel","table name of the resulting distribution","OccCluster_");
ServiceType p3 = new ServiceType(ServiceParameters.RANDOMSTRING, "OccurrencePointsClusterTable", "table name of the distribution", "occCluster_");
@@ -144,7 +144,7 @@ public class XMeansWrapper extends DBScan {
@Override
public String getDescription() {
return "Clustering with XMeans Algorithm: X-Means is K-Means extended by an Improve-Structure part In this part of the algorithm the centers are attempted to be split in its region. ";
return "Clustering with XMeans Algorithm: X-Means is K-Means extended by an Improve-Structure part In this part of the algorithm the centers are attempted to be split in its region. A Maximum of 4000 points is allowed.";
}
@Override

View File

@@ -146,7 +146,7 @@ public class FeedForwardNNFile extends ModelAquamapsNN {
List<TableTemplates> templatesOccurrences = new ArrayList<TableTemplates>();
templatesOccurrences.add(TableTemplates.GENERIC);
InputTable p1 = new InputTable(templatesOccurrences, TrainingDataSet, "a table containing real-valued columns for training the ANN");
InputTable p1 = new InputTable(templatesOccurrences, TrainingDataSet, "a table containing real-valued columns for training the ANN (up to 100,000 points)");
ColumnTypesList p2 = new ColumnTypesList(TrainingDataSet, TrainingDataSetColumns, "column names to use as feature vectors", false);
ColumnType p3 = new ColumnType(TrainingDataSet, TrainingDataSetTargetColumn, "the column to use as target", "probability", false);
PrimitiveTypesList p4 = new PrimitiveTypesList(Integer.class.getName(), PrimitiveTypes.NUMBER, LayersNeurons, "a list of neurons number for each inner layer", true);

View File

@@ -32,7 +32,7 @@ public class OccurrencePointsDuplicatesDeleter extends OccurrencePointsMerger {
templatesOccurrence.add(TableTemplates.OCCURRENCE_SPECIES);
// occurrence points tables
PrimitiveType p0 = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, finalTableNameL, "the name of the produced table", "DeletedOcc_");
InputTable p1 = new InputTable(templatesOccurrence, tableNameF, "the table containing the occurrence points", "");
InputTable p1 = new InputTable(templatesOccurrence, tableNameF, "the table containing the occurrence points (up to 100,000 points)", "");
ColumnType p3 = new ColumnType(tableNameF, longitudeColumn, "column with longitude values", "decimallongitude", false);
ColumnType p4 = new ColumnType(tableNameF, latitudeColumn, "column with latitude values", "decimallatitude", false);
ColumnType p5 = new ColumnType(tableNameF, recordedByColumn, "column with RecordedBy values", "recordedby", false);
@@ -62,7 +62,7 @@ public class OccurrencePointsDuplicatesDeleter extends OccurrencePointsMerger {
@Override
public String getDescription() {
return "An algorithm for deleting similar occurrences in a sets of occurrence points of species coming from the Species Discovery Facility of D4Science";
return "An algorithm for deleting similar occurrences in a sets of occurrence points of species coming from the Species Discovery Facility of D4Science. Works with up to 100 000 points";
}
@Override
@@ -106,7 +106,7 @@ public class OccurrencePointsDuplicatesDeleter extends OccurrencePointsMerger {
public void takeFullRanges() {
// take the elements from sx table
AnalysisLogger.getLogger().info("Taking elements from left table: " + leftTableName);
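// the "limit 100000" clause below caps the number of distinct rows fetched,
// matching the 100,000-point bound stated in the algorithm description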
leftRows = DatabaseFactory.executeSQLQuery(DatabaseUtils.getDinstictElements(tableName, columns.toString(), ""), dbconnection);
leftRows = DatabaseFactory.executeSQLQuery(DatabaseUtils.getDinstictElements(tableName, columns.toString(), "limit 100000"), dbconnection);
}
public void takeRange(int offsetLeft, int numLeft, int offsetRight, int numRight) {

View File

@@ -4,7 +4,10 @@ import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.List;
import org.apache.regexp.RE;
import org.gcube.contentmanagement.graphtools.data.BigSamplesTable;
import org.gcube.dataanalysis.ecoengine.configuration.AlgorithmConfiguration;
@@ -195,4 +198,39 @@ public class Transformations {
System.out.println("sha1: "+Sha1.SHA1(s2));
}
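/**
 * Splits a CSV row into its fields. A double-quoted field may contain the
 * delimiter, and a backslash-escaped quote inside it does not terminate the
 * field; surrounding quotes are stripped from the returned values.
 */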
public static List<String> parseCVSString(String row, String delimiter) throws Exception{
List<String> elements = new ArrayList<String>();
String phrase=row;
int idxdelim = -1;
boolean quot = false;
phrase = phrase.trim();
while ( (idxdelim = phrase.indexOf(delimiter))>=0) {
quot=phrase.startsWith("\"");
if (quot){
phrase = phrase.substring(1);
RE regexp = new RE("[^\\\\]\"");
boolean matching = regexp.match(phrase);
String quoted = "";
if (matching){
int i0 = regexp.getParenStart(0);
quoted= phrase.substring(0,i0+1).trim();
phrase = phrase.substring(i0+2).trim();
}
if (phrase.startsWith(delimiter))
phrase=phrase.substring(1);
elements.add(quoted);
}
else{
elements.add(phrase.substring(0,idxdelim));
phrase= phrase.substring(idxdelim+1).trim();
}
}
elements.add(phrase);
return elements;
}
}
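A quick usage sketch of the parser above (values are illustrative; the method declares throws Exception, so wrap or declare it). A double-quoted field keeps its embedded delimiter as a single element:

List<String> fields = Transformations.parseCVSString("a,\"b,c\",d", ",");
System.out.println(fields); // prints [a, b,c, d]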