272 lines
11 KiB
Java
272 lines
11 KiB
Java
package org.gcube.dataanalysis.ecoengine.transducers;
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
import org.gcube.contentmanagement.lexicalmatcher.utils.DatabaseFactory;
|
|
import org.gcube.dataanalysis.ecoengine.datatypes.ColumnType;
|
|
import org.gcube.dataanalysis.ecoengine.datatypes.DatabaseType;
|
|
import org.gcube.dataanalysis.ecoengine.datatypes.InputTable;
|
|
import org.gcube.dataanalysis.ecoengine.datatypes.PrimitiveType;
|
|
import org.gcube.dataanalysis.ecoengine.datatypes.ServiceType;
|
|
import org.gcube.dataanalysis.ecoengine.datatypes.StatisticalType;
|
|
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.PrimitiveTypes;
|
|
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.ServiceParameters;
|
|
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.TableTemplates;
|
|
import org.gcube.dataanalysis.ecoengine.interfaces.DataAnalysis;
|
|
import org.gcube.dataanalysis.ecoengine.utils.DatabaseUtils;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
public class OccurrencePointsDuplicatesDeleter extends OccurrencePointsMerger {
|
|
|
|
private static Logger logger = LoggerFactory.getLogger(DataAnalysis.class);
|
|
|
|
String tableName;
|
|
List<String> records = new ArrayList<String>();
|
|
|
|
/**
 * Creates a deleter without doing any work of its own; the working fields
 * are read from the configuration in {@link #init()}.
 */
public OccurrencePointsDuplicatesDeleter() {
    super();
}
|
|
|
|
@Override
|
|
public List<StatisticalType> getInputParameters() {
|
|
List<TableTemplates> templatesOccurrence = new ArrayList<TableTemplates>();
|
|
templatesOccurrence.add(TableTemplates.OCCURRENCE_SPECIES);
|
|
// occurrence points tables
|
|
PrimitiveType p0 = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, finalTableNameL, "the name of the produced table", "DeletedOcc_");
|
|
InputTable p1 = new InputTable(templatesOccurrence, tableNameF, "the table containing the occurrence points (up to 100 000 points)", "");
|
|
ColumnType p3 = new ColumnType(tableNameF, longitudeColumn, "column with longitude values", "decimallongitude", false);
|
|
ColumnType p4 = new ColumnType(tableNameF, latitudeColumn, "column with latitude values", "decimallatitude", false);
|
|
ColumnType p5 = new ColumnType(tableNameF, recordedByColumn, "column with RecordedBy values", "recordedby", false);
|
|
ColumnType p6 = new ColumnType(tableNameF, scientificNameColumn, "column with Scientific Names", "scientificname", false);
|
|
ColumnType p7 = new ColumnType(tableNameF, eventDateColumn, "column with EventDate values", "eventdate", false);
|
|
ColumnType p8 = new ColumnType(tableNameF, lastModificationColumn, "column with Modified values", "modified", false);
|
|
ServiceType p9 = new ServiceType(ServiceParameters.RANDOMSTRING, finalTableNameF, "name of the resulting table", "processedOccurrences_");
|
|
PrimitiveType p10 = new PrimitiveType(Float.class.getName(), null, PrimitiveTypes.NUMBER, spatialTolerance, "the tolerance in degree for assessing that two points could be the same", "0.5");
|
|
PrimitiveType p11 = new PrimitiveType(Float.class.getName(), null, PrimitiveTypes.NUMBER, confidence, "the overall acceptance similarity threshold over which two points are the same - from 0 to 100", "80");
|
|
|
|
List<StatisticalType> inputs = new ArrayList<StatisticalType>();
|
|
inputs.add(p0);
|
|
inputs.add(p1);
|
|
inputs.add(p3);
|
|
inputs.add(p4);
|
|
inputs.add(p5);
|
|
inputs.add(p6);
|
|
inputs.add(p7);
|
|
inputs.add(p8);
|
|
inputs.add(p9);
|
|
inputs.add(p10);
|
|
inputs.add(p11);
|
|
|
|
DatabaseType.addDefaultDBPars(inputs);
|
|
return inputs;
|
|
}
|
|
|
|
@Override
|
|
public String getDescription() {
|
|
return "A transducer algorithm that produces a duplicate free table of species occurrence points where duplicates have been identified via user defined comparison thresholds. Works with up to 100 000 points";
|
|
}
|
|
|
|
@Override
|
|
public void init() throws Exception {
|
|
|
|
lonFld = config.getParam(longitudeColumn);
|
|
latFld = config.getParam(latitudeColumn);
|
|
recordedByFld = config.getParam(recordedByColumn);
|
|
scientificNameFld = config.getParam(scientificNameColumn);
|
|
eventDatFld = config.getParam(eventDateColumn);
|
|
modifDatFld = config.getParam(lastModificationColumn);
|
|
tableName = config.getParam(tableNameF);
|
|
rightTableName = tableName;
|
|
leftTableName = tableName;
|
|
finalTableName = config.getParam(finalTableNameF);
|
|
finalTableLabel = config.getParam(finalTableNameL);
|
|
spatialToleranceValue = Float.parseFloat(config.getParam(spatialTolerance));
|
|
confidenceValue = Float.parseFloat(config.getParam(confidence));
|
|
|
|
objectstoinsert = new ArrayList<OccurrencePointsMerger.OccurrenceRecord>();
|
|
objectstodelete = new ArrayList<OccurrencePointsMerger.OccurrenceRecord>();
|
|
records = new ArrayList<String>();
|
|
status = 0;
|
|
}
|
|
|
|
protected boolean isBetterThan(OccurrenceRecord leftOcc, OccurrenceRecord rightOcc) {
|
|
if (((leftOcc.modifdate != null) && (rightOcc.modifdate != null) && leftOcc.modifdate.before(rightOcc.modifdate)) || (leftOcc.modifdate == null) && (rightOcc.modifdate != null))
|
|
return false;
|
|
else if ((leftOcc.modifdate != null) && (rightOcc.modifdate != null) && leftOcc.modifdate.after(rightOcc.modifdate) || (leftOcc.modifdate != null) && (rightOcc.modifdate == null))
|
|
return true;
|
|
else
|
|
return false;
|
|
}
|
|
|
|
@Override
|
|
protected void prepareFinalTable() throws Exception {
|
|
DatabaseFactory.executeSQLUpdate(DatabaseUtils.createBlankTableFromAnotherStatement(tableName, finalTableName), dbconnection);
|
|
}
|
|
|
|
public void takeFullRanges() {
|
|
// take the elements from sx table
|
|
logger.info("Taking elements from left table: " + leftTableName);
|
|
leftRows = DatabaseFactory.executeSQLQuery(DatabaseUtils.getDinstictElements(tableName, columns.toString(),"")+" limit 100000", dbconnection);
|
|
}
|
|
|
|
public void takeRange(int offsetLeft, int numLeft, int offsetRight, int numRight) {
|
|
// take the elements from sx table
|
|
logger.info("Taking elements from left table: " + leftTableName);
|
|
leftRows = DatabaseFactory.executeSQLQuery(DatabaseUtils.getDinstictElements(leftTableName, columns.toString(), "offset " + offsetLeft + " limit " + numLeft), dbconnection);
|
|
}
|
|
|
|
public void computeRange() throws Exception {
|
|
try {
|
|
// for each element in dx
|
|
logger.trace("Processing");
|
|
status = 10;
|
|
int similaritiesCounter = 0;
|
|
int allrows = 0;
|
|
if (leftRows!=null)
|
|
allrows = leftRows.size();
|
|
int rowcounter = 0;
|
|
if (allrows > 0) {
|
|
for (Object row : leftRows) {
|
|
// transform into an occurrence object
|
|
OccurrenceRecord testOcc = row2OccurrenceRecord((Object[]) row);
|
|
// for each element in the white list
|
|
int k = 0;
|
|
int insertedSize = objectstoinsert.size();
|
|
boolean candidate = true;
|
|
|
|
while (k < insertedSize) {
|
|
OccurrenceRecord yetInserted = objectstoinsert.get(k);
|
|
float prob = extProb(yetInserted, testOcc);
|
|
// if the occurrence is better than the the yet inserted then delete the yet inserted and in the end insert the new occ
|
|
if (prob >= confidenceValue) {
|
|
similaritiesCounter++;
|
|
if (isBetterThan(testOcc, yetInserted)) {
|
|
logger.trace("Found a similarity with P=" + prob + " between (" + "\"" + testOcc.scientificName + "\"" + "," + testOcc.x + "\"" + "," + "\"" + testOcc.y + "\"" + "," + "\"" + testOcc.recordedby + "\"" + "," + "\"" + convert2conventionalFormat(testOcc.eventdate) + "\"" + ") VS " + "(" + "\"" + yetInserted.scientificName + "\"" + "," + "\"" + yetInserted.x + "\"" + "," + "\"" + yetInserted.y + "\"" + "," + "\"" + yetInserted.recordedby + "\"" + "," + "\"" + convert2conventionalFormat(yetInserted.eventdate) + "\"" + ")");
|
|
objectstoinsert.remove(k);
|
|
k--;
|
|
insertedSize--;
|
|
|
|
}
|
|
// if there is yet one better then discard the testOcc
|
|
else {
|
|
candidate = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
k++;
|
|
}
|
|
|
|
if (candidate)
|
|
objectstoinsert.add(testOcc);
|
|
|
|
status = Math.min(90, 10f + (80 * ((float) rowcounter) / ((float) allrows)));
|
|
rowcounter++;
|
|
}
|
|
|
|
logger.trace("Found " + similaritiesCounter + " similarities on " + allrows + " distinct elements");
|
|
status = 90;
|
|
// transform the complete list into a table
|
|
persist();
|
|
// close DB connection
|
|
}
|
|
} catch (Exception e) {
|
|
logger.error("error",e);
|
|
throw e;
|
|
} finally {
|
|
shutdown();
|
|
status = 100;
|
|
logger.trace("Occ Points Processing Finished and db closed");
|
|
}
|
|
|
|
}
|
|
|
|
public void computeOLD() throws Exception {
|
|
|
|
try {
|
|
// init DB connection
|
|
logger.trace("Initializing DB Connection");
|
|
dbconnection = DatabaseUtils.initDBSession(config);
|
|
logger.trace("Taking Table Description");
|
|
logger.trace("Creating final table: " + finalTableName);
|
|
// create new merged table
|
|
try {
|
|
DatabaseFactory.executeSQLUpdate(DatabaseUtils.dropTableStatement(finalTableName), dbconnection);
|
|
} catch (Exception e1) {
|
|
}
|
|
logger.trace("Preparing table: " + finalTableName);
|
|
prepareFinalTable();
|
|
logger.trace("Extracting columns from: " + finalTableName);
|
|
extractColumnNames();
|
|
logger.trace("Taken Table Description: " + columns);
|
|
// take distinct elements from table
|
|
logger.trace("Taking elements from table: " + tableName);
|
|
List<Object> rows = DatabaseFactory.executeSQLQuery(DatabaseUtils.getDinstictElements(tableName, columns.toString(), ""), dbconnection);
|
|
// for each element in dx
|
|
logger.trace("Processing");
|
|
status = 10;
|
|
int similaritiesCounter = 0;
|
|
int allrows = rows.size();
|
|
int rowcounter = 0;
|
|
;
|
|
for (Object row : rows) {
|
|
// transform into an occurrence object
|
|
OccurrenceRecord testOcc = row2OccurrenceRecord((Object[]) row);
|
|
// for each element in the white list
|
|
int k = 0;
|
|
int insertedSize = objectstoinsert.size();
|
|
boolean candidate = true;
|
|
|
|
while (k < insertedSize) {
|
|
OccurrenceRecord yetInserted = objectstoinsert.get(k);
|
|
float prob = extProb(yetInserted, testOcc);
|
|
// if the occurrence is better than the the yet inserted then delete the yet inserted and in the end insert the new occ
|
|
if (prob >= confidenceValue) {
|
|
similaritiesCounter++;
|
|
if (isBetterThan(testOcc, yetInserted)) {
|
|
logger.trace("Found a similarity with P=" + prob + " between (" + "\"" + testOcc.scientificName + "\"" + "," + testOcc.x + "\"" + "," + "\"" + testOcc.y + "\"" + "," + "\"" + testOcc.recordedby + "\"" + "," + "\"" + convert2conventionalFormat(testOcc.eventdate) + "\"" + ") VS " + "(" + "\"" + yetInserted.scientificName + "\"" + "," + "\"" + yetInserted.x + "\"" + "," + "\"" + yetInserted.y + "\"" + "," + "\"" + yetInserted.recordedby + "\"" + "," + "\"" + convert2conventionalFormat(yetInserted.eventdate) + "\"" + ")");
|
|
objectstoinsert.remove(k);
|
|
k--;
|
|
insertedSize--;
|
|
|
|
}
|
|
// if there is yet one better then discard the testOcc
|
|
else {
|
|
candidate = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
k++;
|
|
}
|
|
|
|
if (candidate)
|
|
objectstoinsert.add(testOcc);
|
|
|
|
status = Math.min(90, 10f + (80 * ((float) rowcounter) / ((float) allrows)));
|
|
rowcounter++;
|
|
}
|
|
|
|
logger.trace("Found " + similaritiesCounter + " similarities on " + allrows + " distinct elements");
|
|
status = 90;
|
|
// transform the complete list into a table
|
|
persist();
|
|
// close DB connection
|
|
} catch (Exception e) {
|
|
logger.trace("An error occurred " + e.getLocalizedMessage());
|
|
throw e;
|
|
} finally {
|
|
if (dbconnection != null)
|
|
dbconnection.close();
|
|
status = 100;
|
|
logger.trace("Occ Points Processing Finished and db closed");
|
|
}
|
|
}
|
|
|
|
public void postProcess() throws Exception {
|
|
|
|
}
|
|
}
|