This commit is contained in:
Gianpaolo Coro 2012-09-18 08:40:56 +00:00
parent 9fd65becdf
commit 9b2b34b3de
7 changed files with 168 additions and 50 deletions

View File

@ -32,7 +32,7 @@ public class DateGuesser {
}
// private static final String[] formats = { "MM\\dd\\yyyy", "MM\\dd\\yy", "MM/dd/yy", "MM/dd/yyyy", "dd/MM/yy", "dd/MM/yyyy", "dd/MM/yyyy HH:mm:ss", "dd/MM/yy HH:mm:ss", "dd/MM/yyyy HH:mm:ss","MM/yy","MM/yyyy", "yyyy.MM.dd G 'at' HH:mm:ss z", "EEE, MMM d, ''yy", "h:mm a", "hh 'o''clock' a, zzzz", "K:mm a, z", "MM-dd-yy","MM-dd-yyyy", "dd-MMM-yy", "yyyy.MM.dd.HH.mm.ss", "E, dd MMM yyyy HH:mm:ss Z", "yyyyy.MMMMM.dd GGG hh:mm aaa", "EEE, d MMM yyyy HH:mm:ss Z", "yyMMddHHmmssZ", "yyyy-MM-dd'T'HH:mm:ss.SSSZ", "yyyy-MM-dd HH:mm","yyyy-MM-dd","yyyy-MM-dd HH:mm:ss", "h:mm a", "yyyy"};
private static final String[] formats = { "MM\\dd\\yyyy", "MM\\dd\\yy", "MM/dd/yy", "MM/dd/yyyy", "MM/yy","MM/yyyy", "yyyy.MM.dd G 'at' HH:mm:ss z", "MM/dd/yyyy HH:mm:ss","MM/dd/yyyy HH:mm:ss aaa","dd/MM/yyyy HH:mm:ss","EEE, MMM d, ''yy", "h:mm a", "hh 'o''clock' a, zzzz", "K:mm a, z", "MM-dd-yy","MM-dd-yyyy", "dd-MMM-yy", "yyyy.MM.dd.HH.mm.ss", "E, dd MMM yyyy HH:mm:ss Z", "yyyyy.MMMMM.dd GGG hh:mm aaa", "EEE, d MMM yyyy HH:mm:ss Z", "yyMMddHHmmssZ", "yyyy-MM-dd'T'HH:mm:ss.SSSZ", "yyyy-MM-dd HH:mm","yyyy-MM-dd","yyyy-MM-dd HH:mm:ss", "h:mm a", "yyyy"};
private static final String[] formats = { "MM\\dd\\yyyy", "MM\\dd\\yy", "MM/dd/yy", "MM/dd/yyyy", "MM/yy","MM/yyyy", "yyyy.MM.dd G 'at' HH:mm:ss z", "MM/dd/yyyy HH:mm:ss","dd/MM/yyyy HH:mm:ss","EEE, MMM d, ''yy", "h:mm a", "hh 'o''clock' a, zzzz", "K:mm a, z", "MM-dd-yy","MM-dd-yyyy", "dd-MMM-yy", "yyyy.MM.dd.HH.mm.ss", "E, dd MMM yyyy HH:mm:ss Z", "yyyyy.MMMMM.dd GGG hh:mm aaa", "EEE, d MMM yyyy HH:mm:ss Z", "yyMMddHHmmssZ", "yyyy-MM-dd'T'HH:mm:ss.SSSZ", "yyyy-MM-dd HH:mm","yyyy-MM-dd","yyyy-MM-dd HH:mm:ss","MM/dd/yy KK:mm a","MM/dd/yy KK:mm:ss a","h:mm a", "yyyy"};
// Date/time patterns that put the day before the month ("dd/MM/...") where the
// `formats` list puts the month first; the 'alle' literal mirrors the 'at' literal there.
// Several entries are repeated ("dd/MM/yy", "dd/MM/yyyy", "dd/MM/yyyy HH:mm:ss", "h:mm a").
// NOTE(review): presumably the guesser tries these in array order - confirm against the
// consuming code before de-duplicating or reordering.
private static final String[] formatiITA = { "dd\\MM\\yyyy", "dd\\MM\\yy", "dd/MM/yy", "dd/MM/yyyy", "dd/MM/yy", "dd/MM/yyyy","dd/MM/yyyy HH:mm:ss", "dd/MM/yy HH:mm:ss", "dd/MM/yyyy HH:mm:ss", "MM/yy","MM/yyyy","dd.MM.yyyy G 'alle' HH:mm:ss z", "EEE, MMM d, ''yy", "h:mm a", "hh a, zzzz", "K:mm a, z", "dd-MMM-yy", "dd.MM.yyyy.HH.mm.ss", "E, dd MMM yyyy HH:mm:ss Z", "yyyyy.MMMMM.dd GGG hh:mm aaa", "EEE, d MMM yyyy HH:mm:ss Z", "yyMMddHHmmssZ", "yyyy-MM-dd'T'HH:mm:ss.SSSZ", "dd-MMM-yyyy HH:mm", "h:mm a", "yyyy"};

View File

@ -1,5 +1,6 @@
package org.gcube.dataanalysis.ecoengine.modeling;
import java.util.ArrayList;
import java.util.List;
import org.gcube.dataanalysis.ecoengine.configuration.ALG_PROPS;
@ -65,7 +66,8 @@ public class SimpleModeler implements Modeler{
/**
 * Lists the input parameters this modeler asks for.
 *
 * @return a new, empty, mutable list; the wrapped model's parameters are
 *         deliberately not forwarded (see the disabled call below)
 */
@Override
public List<StatisticalType> getInputParameters() {
	// Only one return may remain here: a second statement after a return is
	// unreachable and does not compile. The delegating variant is kept as a
	// note of the previous behaviour.
	// return innermodel.getInputParameters();
	return new ArrayList<StatisticalType>();
}
@Override

View File

@ -1,5 +1,6 @@
package org.gcube.dataanalysis.ecoengine.processing;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
@ -328,7 +329,8 @@ public class LocalSimpleSplitGenerator implements Generator {
/**
 * Lists the input parameters this generator asks for.
 *
 * @return a new, empty, mutable list; the distribution model's parameters are
 *         deliberately not forwarded (see the disabled call below)
 */
@Override
public List<StatisticalType> getInputParameters() {
	// Only one return may remain here: a second statement after a return is
	// unreachable and does not compile. The delegating variant is kept as a
	// note of the previous behaviour.
	// return distributionModel.getInputParameters();
	return new ArrayList<StatisticalType>();
}

View File

@ -1,5 +1,6 @@
package org.gcube.dataanalysis.ecoengine.processing;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.Queue;
@ -497,7 +498,8 @@ public class LocalSplitGenerator implements Generator {
/**
 * Lists the input parameters this generator asks for.
 *
 * @return a new, empty, mutable list; the distribution model's parameters are
 *         deliberately not forwarded (see the disabled call below)
 */
@Override
public List<StatisticalType> getInputParameters() {
	// Only one return may remain here: a second statement after a return is
	// unreachable and does not compile. The delegating variant is kept as a
	// note of the previous behaviour.
	// return distributionModel.getInputParameters();
	return new ArrayList<StatisticalType>();
}

View File

@ -1,12 +1,17 @@
package org.gcube.dataanalysis.ecoengine.transducers;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import org.gcube.contentmanagement.graphtools.utils.DateGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
import org.gcube.contentmanagement.lexicalmatcher.utils.DatabaseFactory;
import org.gcube.contentmanagement.lexicalmatcher.utils.DistanceCalculator;
import org.gcube.dataanalysis.ecoengine.configuration.AlgorithmConfiguration;
import org.gcube.dataanalysis.ecoengine.configuration.INFRASTRUCTURE;
import org.gcube.dataanalysis.ecoengine.datatypes.StatisticalType;
@ -68,10 +73,19 @@ public class OccurrencePointsMerger implements Transducerer{
}
}
/**
 * Renders a calendar instant as text using the pattern
 * {@code MM/dd/yy KK:mm:ss a}.
 *
 * @param date the instant to render; may be {@code null}
 * @return the formatted text, or an empty string when {@code date} is {@code null}
 */
public static String convert2conventionalFormat(Calendar date){
	if (date != null) {
		// a fresh formatter is built per call; no locale or time zone is passed,
		// so the platform defaults apply
		Date instant = new Date(date.getTimeInMillis());
		return new SimpleDateFormat("MM/dd/yy KK:mm:ss a").format(instant);
	}
	return "";
}
boolean displaydateconvert = true;
public OccurrenceRecord row2OccurrenceRecord(Object[] row){
OccurrenceRecord record = new OccurrenceRecord();
int index = 0;
for (Object name:columnsNames){
String name$ = ""+name;
String value$ = ""+row[index];
@ -84,8 +98,35 @@ public class OccurrencePointsMerger implements Transducerer{
else if (name$.equalsIgnoreCase(recordedByFld)){
record.recordedby=value$;
}
else if (name$.equalsIgnoreCase(scientificNameFld)){
record.scientificName=value$;
}
else if (name$.equalsIgnoreCase(eventDatFld)){
record.eventdate=DateGuesser.convertDate(value$);
if ((value$==null) || (value$.length()==0)){
record.eventdate=null;
}
else{
/*
SimpleDateFormat formatter = new SimpleDateFormat("MM/dd/yy KK:mm a",Locale.UK);
try {
Date d = (Date) formatter.parse(value$);
Calendar cal = Calendar.getInstance();
cal.setTime(d);
System.out.println("From "+value$+"->"+(cal.get(Calendar.MONTH)+1)+" "+cal.get(Calendar.DAY_OF_MONTH)+" "+cal.get(Calendar.YEAR)+" "+cal.get(Calendar.HOUR)+" "+cal.get(Calendar.MINUTE));
// System.out.println("->"+cal.toString());
} catch (ParseException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
*/
record.eventdate=DateGuesser.convertDate(value$);
if (displaydateconvert)
{ AnalysisLogger.getLogger().trace("From "+value$+"->"+convert2conventionalFormat(record.eventdate)+" pattern "+DateGuesser.getPattern(value$));
displaydateconvert=false;
}
}
}
else if (name$.equalsIgnoreCase(modifDatFld)){
record.modifdate=DateGuesser.convertDate(value$);
@ -102,11 +143,12 @@ public class OccurrencePointsMerger implements Transducerer{
public String occurrenceRecord2String(OccurrenceRecord record){
StringBuffer buffer =new StringBuffer();
int index = 0;
int k=0;
int nNames = columnsNames.size();
for (Object name:columnsNames){
String name$ = ""+name;
String value$ = null;
String value$ = "''";
if (name$.equalsIgnoreCase(lonFld)){
value$="'"+record.x+"'";
}
@ -114,17 +156,31 @@ public class OccurrencePointsMerger implements Transducerer{
value$="'"+record.y+"'";
}
else if (name$.equalsIgnoreCase(recordedByFld)){
value$="'"+record.recordedby+"'";
if (record.recordedby!=null)
value$="'"+record.recordedby+"'";
}
else if (name$.equalsIgnoreCase(scientificNameFld)){
if (record.scientificName!=null)
value$="'"+record.scientificName+"'";
}
else if (name$.equalsIgnoreCase(eventDatFld)){
value$="'"+record.eventdate.toString()+"'";
if (record.eventdate!=null){
value$="'"+convert2conventionalFormat(record.eventdate)+"'";
// value$="'"+record.eventdate.getTimeInMillis()+"'";
}
}
else if (name$.equalsIgnoreCase(modifDatFld)){
value$="'"+record.modifdate.toString()+"'";
if (record.modifdate!=null){
value$="'"+convert2conventionalFormat(record.modifdate)+"'";
// value$="'"+record.modifdate.getTimeInMillis()+"'";
}
}
else{
if (record.otherValues!=null){
value$ = "'"+record.otherValues.get(k)+"'";
k++;
}
}
else
value$ = "'"+record.otherValues.get(index)+"'";
buffer.append(value$);
if (index<nNames-1){
buffer.append(",");
@ -137,29 +193,27 @@ public class OccurrencePointsMerger implements Transducerer{
}
public static void main(String[] args) {
public static void main(String[] args) throws Exception{
AlgorithmConfiguration config = Regressor.getConfig();
config.setNumberOfResources(1);
config.setParam(longitudeColumn,"presence_basking_cluster");
config.setParam(latitudeColumn,"centerlong"+AlgorithmConfiguration.getListSeparator()+"centerlat");
config.setParam("OccurrencePointsClusterTable","occcluster_xmeans");
config.setParam("maxIterations","1000");
config.setParam("minClusters","20");
config.setParam("maxClusters","30");
config.setParam("min_points","1");
/*
lonFld=config.getParam(longitudeColumn);
latFld=config.getParam(latitudeColumn);
recordedByFld=config.getParam(recordedByColumn);
scientificNameFld=config.getParam(scientificNameColumn);
eventDatFld=config.getParam(eventDateColumn);
modifDatFld=config.getParam(lastModificationColumn);
leftTableName=config.getParam(leftTableNameF);
rightTableName=config.getParam(rightTableNameF);
mergedTableName=config.getParam(mergedTableNameF);
spatialToleranceValue=Float.parseFloat(config.getParam(spatialTolerance));
confidenceValue=Float.parseFloat(config.getParam(confidence));
*/
config.setParam(longitudeColumn,"decimallongitude");
config.setParam(latitudeColumn,"decimallatitude");
config.setParam(recordedByColumn,"recordedby");
config.setParam(scientificNameColumn,"scientificname");
config.setParam(eventDateColumn,"eventdate");
config.setParam(lastModificationColumn,"modified");
config.setParam(rightTableNameF,"whitesharkoccurrences2");
config.setParam(leftTableNameF,"whitesharkoccurrences1");
// config.setParam(rightTableNameF,"whitesharkoccurrences2");
// config.setParam(rightTableNameF,"whitesharkoccurrences1");
config.setParam(mergedTableNameF,"whitesharkoccurrencesmerged");
config.setParam(spatialTolerance,"0.5");
config.setParam(confidence,"0.8");
OccurrencePointsMerger occm = new OccurrencePointsMerger();
occm.setConfiguration(config);
occm.init();
occm.compute();
}
@Override
@ -244,20 +298,51 @@ public class OccurrencePointsMerger implements Transducerer{
return null;
}
protected float extProb(OccurrenceRecord right,OccurrenceRecord left){
return (float)Math.random();
/**
 * Scores two text attributes against each other.
 *
 * @param first  one value; may be {@code null}
 * @param second the other value; may be {@code null}
 * @return {@code 1} when either value is missing, otherwise the result of
 *         {@code DistanceCalculator.CD(false, first, second)} narrowed to float
 */
protected float probabilityStrings(String first, String second){
	boolean bothPresent = first != null && second != null;
	if (!bothPresent) {
		// a missing value is scored as a full match
		return 1f;
	}
	// NOTE(review): the range and meaning of CD's result are defined in
	// DistanceCalculator, outside this view - confirm it is a [0,1] similarity
	DistanceCalculator calculator = new DistanceCalculator();
	return (float) calculator.CD(false, first, second);
}
protected void manageProbability(float probability, OccurrenceRecord leftOcc, OccurrenceRecord rightOcc){
//if over the threshold then add to the complete list of elements
if (probability<confidenceValue)
objectstoinsert.add(rightOcc);
/**
 * Scores two event dates against each other.
 *
 * @param first  one date; may be {@code null}
 * @param second the other date; may be {@code null}
 * @return {@code 1} when either date is missing or when
 *         {@code first.compareTo(second)} is zero, {@code 0} otherwise
 */
protected float probabilityDates(Calendar first, Calendar second){
	if (first == null || second == null) {
		// a missing date is scored as a full match
		return 1f;
	}
	return first.compareTo(second) == 0 ? 1f : 0f;
}
/**
 * Scores how likely two occurrence records describe the same observation.
 *
 * The records are first compared by position: when the straight-line distance
 * between their (x, y) coordinates exceeds {@code spatialToleranceValue} the
 * score is 0. Otherwise the score is the product of the scientific-name,
 * recorded-by and event-date scores, multiplied by 100.
 *
 * @param right one record
 * @param left  the other record
 * @return 0 when the points are farther apart than the tolerance, otherwise
 *         the combined attribute score times 100
 */
protected float extProb(OccurrenceRecord right,OccurrenceRecord left){
	// Euclidean distance in coordinate units. The earlier sqrt(|dx|+|dy|) was
	// not a distance in those units, so comparing it with the tolerance
	// rejected pairs that were well inside it (dx=dy=0.3 scored 0.77).
	float distance = (float) Math.hypot(left.x - right.x, left.y - right.y);
	if (distance > spatialToleranceValue) {
		return 0;
	}
	float pSpecies = probabilityStrings(right.scientificName, left.scientificName);
	float pRecordedBy = probabilityStrings(right.recordedby, left.recordedby);
	float pDates = probabilityDates(right.eventdate, left.eventdate);
	return pSpecies * pRecordedBy * pDates * 100;
}
/**
 * Hook invoked by the merge loop for a right-hand record for which some
 * left-hand record scored at or above {@code confidenceValue}.
 *
 * The body is empty: such a right-hand record is not queued for insertion.
 *
 * @param probability the score that reached the threshold
 * @param leftOcc     the left-hand record that matched
 * @param rightOcc    the right-hand record being processed
 */
protected void manageHighProbability(float probability, OccurrenceRecord leftOcc, OccurrenceRecord rightOcc){
//a match was found at or above the threshold: nothing is added
}
/**
 * Hook invoked by the merge loop for a right-hand record for which no
 * left-hand record reached {@code confidenceValue}.
 *
 * The right-hand record is appended to {@code objectstoinsert}, the list that
 * {@code persist()} turns into an insert statement.
 *
 * @param probability the last score computed for this right-hand record
 * @param leftOcc     unused here; the merge loop passes {@code null} on this path
 * @param rightOcc    the right-hand record to queue
 */
protected void manageLowProbability(float probability, OccurrenceRecord leftOcc, OccurrenceRecord rightOcc){
//no left-hand record reached the threshold: queue the right-hand record for insertion
objectstoinsert.add(rightOcc);
}
protected void persist(){
StringBuffer buffer = new StringBuffer();
int toins = objectstoinsert.size();
int counter = 0;
if (toins>0){
for (OccurrenceRecord record:objectstoinsert){
buffer.append("(");
buffer.append(occurrenceRecord2String(record));
@ -267,8 +352,11 @@ public class OccurrencePointsMerger implements Transducerer{
counter++;
}
String updateQ = DatabaseUtils.insertFromBuffer(mergedTableName,columns.toString(),buffer);
// System.out.println("Update:\n"+updateQ);
DatabaseFactory.executeSQLUpdate(updateQ, dbconnection);
}
}
@Override
@ -285,7 +373,7 @@ public class OccurrencePointsMerger implements Transducerer{
int nCols = columnsNames.size();
columns = new StringBuffer();
for (int i=0;i<nCols;i++){
columns.append(columnsNames);
columns.append("\""+columnsNames.get(i)+"\"");
if (i<nCols-1)
columns.append(",");
}
@ -304,31 +392,52 @@ public class OccurrencePointsMerger implements Transducerer{
//for each element in dx
List<OccurrenceRecord> leftRecords = new ArrayList<OccurrencePointsMerger.OccurrenceRecord>();
AnalysisLogger.getLogger().trace("Processing "+leftTableName+" vs "+rightTableName);
int iterations = 0;
int rightCounter = 0;
int similaritiesCounter = 0;
for (Object rRow:rightRows){
//transform into an occurrence object
OccurrenceRecord rightOcc = row2OccurrenceRecord((Object[])rRow);
//for each element in sx
int k=0;
int leftrecordsSize = 0;
boolean found = false;
float p = 0;
OccurrenceRecord bestleftOcc = null;
for (Object lRow:leftRows){
OccurrenceRecord leftOcc = null;
if (iterations==0){
//only for the first iteration on the left occurrences perform the transformation
if (leftrecordsSize<=k){
//transform into an occurrence object
leftOcc = row2OccurrenceRecord((Object[])lRow);
leftRecords.add(leftOcc);
leftrecordsSize++;
// System.out.println("ADDED "+k+"-th elements size: "+leftRecords.size());
}
else
leftOcc =leftRecords.get(k);
//evaluate P(dx,sx)
float p = extProb(leftOcc,rightOcc);
manageProbability(p, leftOcc, rightOcc);
p = extProb(leftOcc,rightOcc);
if (p>=confidenceValue){
AnalysisLogger.getLogger().trace("Found a similarity between ("+leftOcc.x+","+leftOcc.y+","+leftOcc.recordedby+ ") "+"("+rightOcc.x+","+rightOcc.y+","+rightOcc.recordedby+")");
bestleftOcc=leftOcc;
found=true;
similaritiesCounter++;
AnalysisLogger.getLogger().trace("Found a similarity with P="+p+" between ("+"\""+leftOcc.scientificName+"\""+","+leftOcc.x+"\""+","+"\""+leftOcc.y+"\""+","+"\""+leftOcc.recordedby+"\""+","+"\""+ convert2conventionalFormat(leftOcc.eventdate)+"\""+") VS "+
"("+"\""+rightOcc.scientificName+"\""+","+"\""+rightOcc.x+"\""+","+"\""+rightOcc.y+"\""+","+"\""+rightOcc.recordedby+"\""+","+"\""+ convert2conventionalFormat(rightOcc.eventdate)+"\""+")");
break;
}
k++;
}
iterations++;
rightCounter++;
if (found)
manageHighProbability(p, bestleftOcc, rightOcc);
else
manageLowProbability(p, bestleftOcc, rightOcc);
}
AnalysisLogger.getLogger().trace("Found "+similaritiesCounter+" similarities on "+rightCounter+" elements");
//transform the complete list into a table
persist();
//close DB connection
@ -338,6 +447,7 @@ public class OccurrencePointsMerger implements Transducerer{
finally{
if (dbconnection!=null)
dbconnection.close();
AnalysisLogger.getLogger().trace("Occ Points Processing Finished and db closed");
}
}
}

View File

@ -158,6 +158,7 @@ public class DatabaseUtils {
return "insert into "+table+" ("+columnsNames+") values "+values;
}
/**
 * Builds the text of a {@code COPY ... FROM} statement that uses ';' as the
 * delimiter and 'null string' as the NULL marker.
 *
 * Both arguments are concatenated into the statement as given, without
 * quoting or escaping.
 *
 * @param file  the path placed between single quotes after {@code FROM}
 * @param table the table name placed after {@code COPY}
 * @return the statement text
 */
public static String copyFileToTableStatement (String file, String table){
	StringBuilder statement = new StringBuilder("COPY ");
	statement.append(table);
	statement.append(" FROM '").append(file).append("'");
	statement.append(" DELIMITERS ';' WITH NULL AS 'null string'");
	return statement.toString();
}

View File

@ -268,13 +268,13 @@ public class Operations {
return a * (x - shift) * (x - shift) + b * (x - shift) + c;
}
private static double[] inverseParabol(double a, double b, double c, double y) {
/**
 * Solves {@code a*x*x + b*x + c = |y|} for x.
 *
 * @param a quadratic coefficient; a value of 0 makes both results non-finite
 * @param b linear coefficient
 * @param c constant term
 * @param y target value; only its absolute value is used
 * @return a two-element array: the root taken with the positive square root
 *         first, then the one taken with the negative square root; both are
 *         NaN when the expression under the root is negative
 */
public static double[] inverseParabol(double a, double b, double c, double y) {
	double root = Math.sqrt(b * b + 4 * a * (Math.abs(y) - c));
	double denominator = 2 * a;
	double plusBranch = (-1d * b + root) / denominator;
	double minusBranch = (-1d * b - root) / denominator;
	return new double[] { plusBranch, minusBranch };
}
private static double logaritmicTransformation(double y) {
public static double logaritmicTransformation(double y) {
y = Math.abs(y);
if (y == 0)
return -Double.MAX_VALUE;
@ -356,6 +356,7 @@ public class Operations {
System.out.println("OK");
}
//distributes uniformly elements in parts
public static int[] takeChunks(int numberOfElements, int partitionFactor) {
int[] partitions = new int[1];
if (partitionFactor <= 0) {