diff --git a/src/main/java/org/gcube/contentmanagement/graphtools/utils/DateGuesser.java b/src/main/java/org/gcube/contentmanagement/graphtools/utils/DateGuesser.java index 7c520ed..206b893 100644 --- a/src/main/java/org/gcube/contentmanagement/graphtools/utils/DateGuesser.java +++ b/src/main/java/org/gcube/contentmanagement/graphtools/utils/DateGuesser.java @@ -32,7 +32,7 @@ public class DateGuesser { } // private static final String[] formats = { "MM\\dd\\yyyy", "MM\\dd\\yy", "MM/dd/yy", "MM/dd/yyyy", "dd/MM/yy", "dd/MM/yyyy", "dd/MM/yyyy HH:mm:ss", "dd/MM/yy HH:mm:ss", "dd/MM/yyyy HH:mm:ss","MM/yy","MM/yyyy", "yyyy.MM.dd G 'at' HH:mm:ss z", "EEE, MMM d, ''yy", "h:mm a", "hh 'o''clock' a, zzzz", "K:mm a, z", "MM-dd-yy","MM-dd-yyyy", "dd-MMM-yy", "yyyy.MM.dd.HH.mm.ss", "E, dd MMM yyyy HH:mm:ss Z", "yyyyy.MMMMM.dd GGG hh:mm aaa", "EEE, d MMM yyyy HH:mm:ss Z", "yyMMddHHmmssZ", "yyyy-MM-dd'T'HH:mm:ss.SSSZ", "yyyy-MM-dd HH:mm","yyyy-MM-dd","yyyy-MM-dd HH:mm:ss", "h:mm a", "yyyy"}; - private static final String[] formats = { "MM\\dd\\yyyy", "MM\\dd\\yy", "MM/dd/yy", "MM/dd/yyyy", "MM/yy","MM/yyyy", "yyyy.MM.dd G 'at' HH:mm:ss z", "MM/dd/yyyy HH:mm:ss","MM/dd/yyyy HH:mm:ss aaa","dd/MM/yyyy HH:mm:ss","EEE, MMM d, ''yy", "h:mm a", "hh 'o''clock' a, zzzz", "K:mm a, z", "MM-dd-yy","MM-dd-yyyy", "dd-MMM-yy", "yyyy.MM.dd.HH.mm.ss", "E, dd MMM yyyy HH:mm:ss Z", "yyyyy.MMMMM.dd GGG hh:mm aaa", "EEE, d MMM yyyy HH:mm:ss Z", "yyMMddHHmmssZ", "yyyy-MM-dd'T'HH:mm:ss.SSSZ", "yyyy-MM-dd HH:mm","yyyy-MM-dd","yyyy-MM-dd HH:mm:ss", "h:mm a", "yyyy"}; + private static final String[] formats = { "MM\\dd\\yyyy", "MM\\dd\\yy", "MM/dd/yy", "MM/dd/yyyy", "MM/yy","MM/yyyy", "yyyy.MM.dd G 'at' HH:mm:ss z", "MM/dd/yyyy HH:mm:ss","dd/MM/yyyy HH:mm:ss","EEE, MMM d, ''yy", "h:mm a", "hh 'o''clock' a, zzzz", "K:mm a, z", "MM-dd-yy","MM-dd-yyyy", "dd-MMM-yy", "yyyy.MM.dd.HH.mm.ss", "E, dd MMM yyyy HH:mm:ss Z", "yyyyy.MMMMM.dd GGG hh:mm aaa", "EEE, d MMM yyyy HH:mm:ss Z", "yyMMddHHmmssZ", "yyyy-MM-dd'T'HH:mm:ss.SSSZ", "yyyy-MM-dd HH:mm","yyyy-MM-dd","yyyy-MM-dd HH:mm:ss","MM/dd/yy KK:mm a","MM/dd/yy KK:mm:ss a","h:mm a", "yyyy"}; private static final String[] formatiITA = { "dd\\MM\\yyyy", "dd\\MM\\yy", "dd/MM/yy", "dd/MM/yyyy", "dd/MM/yy", "dd/MM/yyyy","dd/MM/yyyy HH:mm:ss", "dd/MM/yy HH:mm:ss", "dd/MM/yyyy HH:mm:ss", "MM/yy","MM/yyyy","dd.MM.yyyy G 'alle' HH:mm:ss z", "EEE, MMM d, ''yy", "h:mm a", "hh a, zzzz", "K:mm a, z", "dd-MMM-yy", "dd.MM.yyyy.HH.mm.ss", "E, dd MMM yyyy HH:mm:ss Z", "yyyyy.MMMMM.dd GGG hh:mm aaa", "EEE, d MMM yyyy HH:mm:ss Z", "yyMMddHHmmssZ", "yyyy-MM-dd'T'HH:mm:ss.SSSZ", "dd-MMM-yyyy HH:mm", "h:mm a", "yyyy"}; diff --git a/src/main/java/org/gcube/dataanalysis/ecoengine/modeling/SimpleModeler.java b/src/main/java/org/gcube/dataanalysis/ecoengine/modeling/SimpleModeler.java index 380e7c7..9d47f2d 100644 --- a/src/main/java/org/gcube/dataanalysis/ecoengine/modeling/SimpleModeler.java +++ b/src/main/java/org/gcube/dataanalysis/ecoengine/modeling/SimpleModeler.java @@ -1,5 +1,6 @@ package org.gcube.dataanalysis.ecoengine.modeling; +import java.util.ArrayList; import java.util.List; import org.gcube.dataanalysis.ecoengine.configuration.ALG_PROPS; @@ -65,7 +66,8 @@ public class SimpleModeler implements Modeler{ @Override public List getInputParameters() { - return innermodel.getInputParameters(); + return new ArrayList(); +// return innermodel.getInputParameters(); } @Override diff --git a/src/main/java/org/gcube/dataanalysis/ecoengine/processing/LocalSimpleSplitGenerator.java b/src/main/java/org/gcube/dataanalysis/ecoengine/processing/LocalSimpleSplitGenerator.java index 2d27d9f..5e94c55 100644 --- a/src/main/java/org/gcube/dataanalysis/ecoengine/processing/LocalSimpleSplitGenerator.java +++ b/src/main/java/org/gcube/dataanalysis/ecoengine/processing/LocalSimpleSplitGenerator.java @@ -1,5 +1,6 @@ package org.gcube.dataanalysis.ecoengine.processing; +import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Properties; @@ -328,7 +329,8 @@ public class LocalSimpleSplitGenerator implements Generator { @Override public List getInputParameters() { - return distributionModel.getInputParameters(); +// return distributionModel.getInputParameters(); + return new ArrayList(); } diff --git a/src/main/java/org/gcube/dataanalysis/ecoengine/processing/LocalSplitGenerator.java b/src/main/java/org/gcube/dataanalysis/ecoengine/processing/LocalSplitGenerator.java index 5359295..a6c6bbe 100644 --- a/src/main/java/org/gcube/dataanalysis/ecoengine/processing/LocalSplitGenerator.java +++ b/src/main/java/org/gcube/dataanalysis/ecoengine/processing/LocalSplitGenerator.java @@ -1,5 +1,6 @@ package org.gcube.dataanalysis.ecoengine.processing; +import java.util.ArrayList; import java.util.List; import java.util.Properties; import java.util.Queue; @@ -497,7 +498,8 @@ public class LocalSplitGenerator implements Generator { @Override public List getInputParameters() { - return distributionModel.getInputParameters(); + return new ArrayList(); +// return distributionModel.getInputParameters(); } diff --git a/src/main/java/org/gcube/dataanalysis/ecoengine/transducers/OccurrencePointsMerger.java b/src/main/java/org/gcube/dataanalysis/ecoengine/transducers/OccurrencePointsMerger.java index 4fdab45..49da996 100644 --- a/src/main/java/org/gcube/dataanalysis/ecoengine/transducers/OccurrencePointsMerger.java +++ b/src/main/java/org/gcube/dataanalysis/ecoengine/transducers/OccurrencePointsMerger.java @@ -1,12 +1,17 @@ package org.gcube.dataanalysis.ecoengine.transducers; +import java.text.ParseException; +import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Calendar; +import java.util.Date; import java.util.List; +import java.util.Locale; import org.gcube.contentmanagement.graphtools.utils.DateGuesser; import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; import org.gcube.contentmanagement.lexicalmatcher.utils.DatabaseFactory; +import org.gcube.contentmanagement.lexicalmatcher.utils.DistanceCalculator; import org.gcube.dataanalysis.ecoengine.configuration.AlgorithmConfiguration; import org.gcube.dataanalysis.ecoengine.configuration.INFRASTRUCTURE; import org.gcube.dataanalysis.ecoengine.datatypes.StatisticalType; @@ -68,10 +73,19 @@ public class OccurrencePointsMerger implements Transducerer{ } } - + public static String convert2conventionalFormat(Calendar date){ + if (date==null) + return ""; + SimpleDateFormat formatter = new SimpleDateFormat("MM/dd/yy KK:mm:ss a"); + String formattedDate = formatter.format(new Date(date.getTimeInMillis())); + return formattedDate; + +} + boolean displaydateconvert = true; public OccurrenceRecord row2OccurrenceRecord(Object[] row){ OccurrenceRecord record = new OccurrenceRecord(); int index = 0; + for (Object name:columnsNames){ String name$ = ""+name; String value$ = ""+row[index]; @@ -84,8 +98,35 @@ public class OccurrencePointsMerger implements Transducerer{ else if (name$.equalsIgnoreCase(recordedByFld)){ record.recordedby=value$; } + else if (name$.equalsIgnoreCase(scientificNameFld)){ + record.scientificName=value$; + } else if (name$.equalsIgnoreCase(eventDatFld)){ - record.eventdate=DateGuesser.convertDate(value$); + if ((value$==null) || (value$.length()==0)){ + record.eventdate=null; + } + else{ + /* + SimpleDateFormat formatter = new SimpleDateFormat("MM/dd/yy KK:mm a",Locale.UK); + try { + Date d = (Date) formatter.parse(value$); + Calendar cal = Calendar.getInstance(); + cal.setTime(d); + System.out.println("From "+value$+"->"+(cal.get(Calendar.MONTH)+1)+" "+cal.get(Calendar.DAY_OF_MONTH)+" "+cal.get(Calendar.YEAR)+" "+cal.get(Calendar.HOUR)+" "+cal.get(Calendar.MINUTE)); +// System.out.println("->"+cal.toString()); + } catch (ParseException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + */ + record.eventdate=DateGuesser.convertDate(value$); + if (displaydateconvert) + { AnalysisLogger.getLogger().trace("From "+value$+"->"+convert2conventionalFormat(record.eventdate)+" pattern "+DateGuesser.getPattern(value$)); + displaydateconvert=false; + } + + + } } else if (name$.equalsIgnoreCase(modifDatFld)){ record.modifdate=DateGuesser.convertDate(value$); @@ -102,11 +143,12 @@ public class OccurrencePointsMerger implements Transducerer{ public String occurrenceRecord2String(OccurrenceRecord record){ StringBuffer buffer =new StringBuffer(); int index = 0; + int k=0; int nNames = columnsNames.size(); for (Object name:columnsNames){ String name$ = ""+name; - String value$ = null; + String value$ = "''"; if (name$.equalsIgnoreCase(lonFld)){ value$="'"+record.x+"'"; } @@ -114,17 +156,31 @@ public class OccurrencePointsMerger implements Transducerer{ value$="'"+record.y+"'"; } else if (name$.equalsIgnoreCase(recordedByFld)){ - value$="'"+record.recordedby+"'"; + if (record.recordedby!=null) + value$="'"+record.recordedby+"'"; + } + else if (name$.equalsIgnoreCase(scientificNameFld)){ + if (record.scientificName!=null) + value$="'"+record.scientificName+"'"; } else if (name$.equalsIgnoreCase(eventDatFld)){ - value$="'"+record.eventdate.toString()+"'"; + if (record.eventdate!=null){ + value$="'"+convert2conventionalFormat(record.eventdate)+"'"; +// value$="'"+record.eventdate.getTimeInMillis()+"'"; + } } else if (name$.equalsIgnoreCase(modifDatFld)){ - value$="'"+record.modifdate.toString()+"'"; + if (record.modifdate!=null){ + value$="'"+convert2conventionalFormat(record.modifdate)+"'"; +// value$="'"+record.modifdate.getTimeInMillis()+"'"; + } + } + else{ + if (record.otherValues!=null){ + value$ = "'"+record.otherValues.get(k)+"'"; + k++; + } } - else - value$ = "'"+record.otherValues.get(index)+"'"; - buffer.append(value$); if (indexspatialToleranceValue) + probability=0; + else{ + float pSpecies = probabilityStrings(right.scientificName, left.scientificName); + float pRecordedBy= probabilityStrings(right.recordedby, left.recordedby); + float pDates = probabilityDates(right.eventdate, left.eventdate); + probability = pSpecies*pRecordedBy*pDates; + } + + return probability*100; + } + + protected void manageHighProbability(float probability, OccurrenceRecord leftOcc, OccurrenceRecord rightOcc){ + //if over the threshold then don't add + } + + protected void manageLowProbability(float probability, OccurrenceRecord leftOcc, OccurrenceRecord rightOcc){ + //if over the threshold then add to the element + objectstoinsert.add(rightOcc); + } + protected void persist(){ StringBuffer buffer = new StringBuffer(); int toins = objectstoinsert.size(); int counter = 0; + if (toins>0){ for (OccurrenceRecord record:objectstoinsert){ buffer.append("("); buffer.append(occurrenceRecord2String(record)); @@ -267,8 +352,11 @@ public class OccurrencePointsMerger implements Transducerer{ counter++; } + String updateQ = DatabaseUtils.insertFromBuffer(mergedTableName,columns.toString(),buffer); +// System.out.println("Update:\n"+updateQ); DatabaseFactory.executeSQLUpdate(updateQ, dbconnection); + } } @Override @@ -285,7 +373,7 @@ public class OccurrencePointsMerger implements Transducerer{ int nCols = columnsNames.size(); columns = new StringBuffer(); for (int i=0;i leftRecords = new ArrayList(); AnalysisLogger.getLogger().trace("Processing "+leftTableName+" vs "+rightTableName); - int iterations = 0; + int rightCounter = 0; + int similaritiesCounter = 0; for (Object rRow:rightRows){ + //transform into an occurrence object OccurrenceRecord rightOcc = row2OccurrenceRecord((Object[])rRow); //for each element in sx int k=0; + int leftrecordsSize = 0; + boolean found = false; + float p = 0; + OccurrenceRecord bestleftOcc = null; for (Object lRow:leftRows){ OccurrenceRecord leftOcc = null; - if (iterations==0){ + //only for the first iteration on the left occurrences perform the transformation + if (leftrecordsSize<=k){ + //transform into an occurrence object leftOcc = row2OccurrenceRecord((Object[])lRow); leftRecords.add(leftOcc); + leftrecordsSize++; +// System.out.println("ADDED "+k+"-th elements size: "+leftRecords.size()); } else leftOcc =leftRecords.get(k); //evaluate P(dx,sx) - float p = extProb(leftOcc,rightOcc); - manageProbability(p, leftOcc, rightOcc); + p = extProb(leftOcc,rightOcc); + if (p>=confidenceValue){ - AnalysisLogger.getLogger().trace("Found a similarity between ("+leftOcc.x+","+leftOcc.y+","+leftOcc.recordedby+ ") "+"("+rightOcc.x+","+rightOcc.y+","+rightOcc.recordedby+")"); + bestleftOcc=leftOcc; + found=true; + similaritiesCounter++; + AnalysisLogger.getLogger().trace("Found a similarity with P="+p+" between ("+"\""+leftOcc.scientificName+"\""+","+leftOcc.x+"\""+","+"\""+leftOcc.y+"\""+","+"\""+leftOcc.recordedby+"\""+","+"\""+ convert2conventionalFormat(leftOcc.eventdate)+"\""+") VS "+ + "("+"\""+rightOcc.scientificName+"\""+","+"\""+rightOcc.x+"\""+","+"\""+rightOcc.y+"\""+","+"\""+rightOcc.recordedby+"\""+","+"\""+ convert2conventionalFormat(rightOcc.eventdate)+"\""+")"); break; } k++; } - iterations++; + rightCounter++; + if (found) + manageHighProbability(p, bestleftOcc, rightOcc); + else + manageLowProbability(p, bestleftOcc, rightOcc); } + + AnalysisLogger.getLogger().trace("Found "+similaritiesCounter+" similarities on "+rightCounter+" elements"); + //transform the complete list into a table persist(); //close DB connection @@ -338,6 +447,7 @@ public class OccurrencePointsMerger implements Transducerer{ finally{ if (dbconnection!=null) dbconnection.close(); + AnalysisLogger.getLogger().trace("Occ Points Processing Finished and db closed"); } } } diff --git a/src/main/java/org/gcube/dataanalysis/ecoengine/utils/DatabaseUtils.java b/src/main/java/org/gcube/dataanalysis/ecoengine/utils/DatabaseUtils.java index d033d8d..cf53c73 100644 --- a/src/main/java/org/gcube/dataanalysis/ecoengine/utils/DatabaseUtils.java +++ b/src/main/java/org/gcube/dataanalysis/ecoengine/utils/DatabaseUtils.java @@ -158,6 +158,7 @@ public class DatabaseUtils { return "insert into "+table+" ("+columnsNames+") values "+values; } + public static String copyFileToTableStatement (String file, String table){ return "COPY "+table+" FROM '"+file+"' DELIMITERS ';' WITH NULL AS 'null string'"; } diff --git a/src/main/java/org/gcube/dataanalysis/ecoengine/utils/Operations.java b/src/main/java/org/gcube/dataanalysis/ecoengine/utils/Operations.java index 5d71bd7..46a5911 100644 --- a/src/main/java/org/gcube/dataanalysis/ecoengine/utils/Operations.java +++ b/src/main/java/org/gcube/dataanalysis/ecoengine/utils/Operations.java @@ -268,13 +268,13 @@ public class Operations { return a * (x - shift) * (x - shift) + b * (x - shift) + c; } - private static double[] inverseParabol(double a, double b, double c, double y) { + public static double[] inverseParabol(double a, double b, double c, double y) { double[] ret = { (-1d * b + Math.sqrt(b * b + 4 * a * (Math.abs(y) - c))) / (2 * a), (-1d * b - Math.sqrt(b * b + 4 * a * (Math.abs(y) - c))) / (2 * a) }; return ret; } - private static double logaritmicTransformation(double y) { + public static double logaritmicTransformation(double y) { y = Math.abs(y); if (y == 0) return -Double.MAX_VALUE; @@ -356,6 +356,7 @@ public class Operations { System.out.println("OK"); } + //distributes uniformly elements in parts public static int[] takeChunks(int numberOfElements, int partitionFactor) { int[] partitions = new int[1]; if (partitionFactor <= 0) {