git-svn-id: https://svn.d4science.research-infrastructures.eu/gcube/trunk/data-analysis/EcologicalEngine@71257 82a268e6-3cf1-43bd-a215-b396298e98cf
This commit is contained in:
parent
74c040904a
commit
9437d03c0a
|
@ -212,6 +212,11 @@ public class CategoryGuesser {
|
|||
col = null;
|
||||
}
|
||||
|
||||
/**
 * Returns the list currently held in the {@code lastResults} field.
 * The list is returned as-is (no copy is made), so callers share the same instance.
 *
 * @return the stored result list; NOTE(review): the field has no visible initializer,
 *         so this is presumably {@code null} until a calculation has stored its results - confirm
 */
public ArrayList<SingleResult> getLastResults(){
|
||||
return lastResults;
|
||||
}
|
||||
|
||||
// Result list of the most recent calculation: assigned from checkingResults right after the
// "...End Calculation" log message. Declared without an initializer, so it holds null until then.
ArrayList<SingleResult> lastResults;
|
||||
public void runGuesser(String seriesName, String columnName, LexicalEngineConfiguration externalConfig, String CategoryFilter, String ColumnFilter, String SingletonString) throws Exception {
|
||||
SessionFactory dbSession = null;
|
||||
String cfgFileCompletePath = cfgFile;
|
||||
|
@ -304,6 +309,7 @@ public class CategoryGuesser {
|
|||
AnalysisLogger.getLogger().warn("...End Calculation in " + t1 + "ms");
|
||||
|
||||
triesCounter = 0;
|
||||
lastResults=checkingResults;
|
||||
// close the session if it is no longer needed
|
||||
if (oneshotMode)
|
||||
dbSession.close();
|
||||
|
@ -478,7 +484,8 @@ public class CategoryGuesser {
|
|||
}
|
||||
}
|
||||
|
||||
return deleteDuplicates(results);
|
||||
// return deleteDuplicates(results);
|
||||
return results;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -37,11 +37,12 @@ public class ExampleGuesser {
|
|||
conf.setDatabaseDialect("org.hibernate.dialect.PostgreSQLDialect");
|
||||
|
||||
guesser.runGuesser(seriesName, column, conf);
|
||||
ArrayList<SingleResult> classification = guesser.getClassification();
|
||||
ArrayList<SingleResult> classification = guesser.getLastResults();
|
||||
|
||||
//only for debug
|
||||
guesser.showResults(classification);
|
||||
|
||||
guesser.shutdown();
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
|
|
@ -0,0 +1,58 @@
|
|||
package org.gcube.contentmanagement.lexicalmatcher.analysis.test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
|
||||
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.Category;
|
||||
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult;
|
||||
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
|
||||
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
|
||||
|
||||
/**
 * Example program for the lexical matcher: builds a {@link LexicalEngineConfiguration}
 * (reference categories, matching parameters, database connection), runs a
 * {@link CategoryGuesser} on a single string and shows the detailed matches.
 */
public class ExampleLexicalMatcher {
|
||||
|
||||
/**
 * Runs the example with hard-coded inputs. Any exception is caught and its stack trace printed.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
|
||||
|
||||
try {
|
||||
|
||||
// Inputs of the example: the string to match, plus the category name and column name passed to runGuesser.
String singleton = "Faroe Island";
|
||||
String family = "COUNTRY_OLD";
|
||||
String column = "field6";
|
||||
|
||||
CategoryGuesser guesser = new CategoryGuesser();
|
||||
LexicalEngineConfiguration conf = new LexicalEngineConfiguration();
|
||||
|
||||
// Reference categories handed to the configuration via setCategories below.
ArrayList<Category> categories = new ArrayList<Category>();
|
||||
//human name, index, table name, description
|
||||
categories.add(new Category("COUNTRY_OLD","39c98800-dd3c-11e0-b8d1-d1e2e7ba4f9d","rdf39c98800dd3c11e0b8d1d1e2e7ba4f9d","country"));
|
||||
categories.add(new Category("CONTINENT_OLD","1d5d51f0-dd42-11e0-b8d3-d1e2e7ba4f9d","rdf1d5d51f0dd4211e0b8d3d1e2e7ba4f9d","continent reference data"));
|
||||
categories.add(new Category("SPECIES_OLD","0a7fb500-dd3d-11e0-b8d1-d1e2e7ba4f9d","rdf0a7fb500dd3d11e0b8d1d1e2e7ba4f9d","species"));
|
||||
categories.add(new Category("CodeListCountry","4c8d93a0-edc2-11e0-93e4-f6a9821baa29","rdf4c8d93a0edc211e093e4f6a9821baa29","Country"));
|
||||
categories.add(new Category("CL_DIVISION","1140bdf0-dd2c-11e0-9220-ae17b3db32b7","rdf1140bdf0dd2c11e09220ae17b3db32b7","undefined"));
|
||||
categories.add(new Category("CL_ASFIS_TAX","f87360f0-d9f9-11e0-ba05-d9adb0db767c","rdff87360f0d9f911e0ba05d9adb0db767c","undefined"));
|
||||
conf.setCategories(categories);
|
||||
|
||||
//CHANGE THIS TO ENHANCE THE RECALL
|
||||
// NOTE(review): presumably a percentage similarity threshold for accepting a single entry match - confirm.
conf.setEntryAcceptanceThreshold(30);
|
||||
// NOTE(review): the lexical matcher properties documentation states that -1 means
// "all chunks will be analyzed"; presumably the same holds for these two setters - confirm.
conf.setReferenceChunksToTake(-1);
|
||||
conf.setTimeSeriesChunksToTake(-1);
|
||||
conf.setUseSimpleDistance(false);
|
||||
|
||||
//database Parameters
|
||||
conf.setDatabaseUserName("gcube");
|
||||
// NOTE(review): credentials are hard-coded in source; consider reading them from external configuration.
conf.setDatabasePassword("d4science2");
|
||||
conf.setDatabaseURL("jdbc:postgresql://localhost/testdb");
|
||||
conf.setDatabaseDialect("org.hibernate.dialect.PostgreSQLDialect");
|
||||
|
||||
// Argument order of this overload: string to match, configuration, category name, column name.
// NOTE(review): presumably restricts matching to the given category and column - verify against CategoryGuesser.
guesser.runGuesser(singleton, conf, family,column );
|
||||
ArrayList<SingleResult> detailedResults = guesser.getDetailedMatches();
|
||||
AnalysisLogger.getLogger().warn("Detailed Match on Name :"+singleton);
|
||||
|
||||
CategoryGuesser.showResults(detailedResults);
|
||||
|
||||
// NOTE(review): no guesser.shutdown() call is made in this example - confirm whether one is required.
} catch (Exception e) {
|
||||
// The failure is only reported as a stack trace; main then returns.
e.printStackTrace();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -8,9 +8,9 @@ log4j.appender.stdout.layout.ConversionPattern=%d{dd/MM/yyyy HH:mm:ss} %p %t %c
|
|||
|
||||
|
||||
#### Second appender writes to a file
|
||||
log4j.logger.AnalysisLogger=info,AR
|
||||
log4j.logger.AnalysisLogger=trace,AR,stdout
|
||||
log4j.appender.AR=org.apache.log4j.RollingFileAppender
|
||||
log4j.appender.AR.Threshold=info
|
||||
log4j.appender.AR.Threshold=trace
|
||||
log4j.appender.AR.File=${GLOBUS_LOCATION}/logs/Analysis.log
|
||||
log4j.appender.AR.MaxFileSize=1024KB
|
||||
log4j.appender.AR.MaxBackupIndex=2
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
#Percentage threshold for discarding a category
|
||||
categoryDiscardThreshold=0
|
||||
#Percentage threshold for accepting similarity between a single Time series entry and a reference entry
|
||||
entryAcceptanceThreshold=80
|
||||
entryAcceptanceThreshold=50
|
||||
#Size of a comparison chunk
|
||||
chunkSize=25
|
||||
chunkSize=50
|
||||
#Number of chunks to take from the Time Series for comparison against the reference data; if set to -1 all chunks will be analyzed
|
||||
timeSeriesChunksToTake=2
|
||||
timeSeriesChunksToTake=1
|
||||
#Number of chunks to take from the Reference data for comparison against the Time Series elements; if set to -1 all chunks will be analyzed
|
||||
referenceChunksToTake =50
|
||||
referenceChunksToTake=5
|
||||
#Use random choice for chunk selection: true | false
|
||||
randomTake=true
|
||||
#Use Simple String Match as distance calculation
|
||||
|
@ -15,6 +15,6 @@ useSimpleDistance=false
|
|||
#Number Of Threads to use
|
||||
numberOfThreadsToUse=5
|
||||
#If two final scores differ by more than this percentage, prune the lower result
|
||||
categoryDiscardDifferencialThreshold = 2
|
||||
categoryDiscardDifferencialThreshold = 50
|
||||
#maximum difference between a result and the best result
|
||||
singleEntryRecognitionMaxDeviation = 40
|
Loading…
Reference in New Issue