This commit is contained in:
Gianpaolo Coro 2013-03-12 16:44:56 +00:00
parent 74c040904a
commit 9437d03c0a
5 changed files with 76 additions and 10 deletions

View File

@ -212,6 +212,11 @@ public class CategoryGuesser {
col = null; col = null;
} }
/**
 * Gives access to the result list currently held in the {@code lastResults} field.
 *
 * @return the current value of {@code lastResults}; {@code null} while the field
 *         has not been assigned (it is declared without an initializer)
 */
public ArrayList<SingleResult> getLastResults() {
    return this.lastResults;
}
ArrayList<SingleResult> lastResults;
public void runGuesser(String seriesName, String columnName, LexicalEngineConfiguration externalConfig, String CategoryFilter, String ColumnFilter, String SingletonString) throws Exception { public void runGuesser(String seriesName, String columnName, LexicalEngineConfiguration externalConfig, String CategoryFilter, String ColumnFilter, String SingletonString) throws Exception {
SessionFactory dbSession = null; SessionFactory dbSession = null;
String cfgFileCompletePath = cfgFile; String cfgFileCompletePath = cfgFile;
@ -304,6 +309,7 @@ public class CategoryGuesser {
AnalysisLogger.getLogger().warn("...End Calculation in " + t1 + "ms"); AnalysisLogger.getLogger().warn("...End Calculation in " + t1 + "ms");
triesCounter = 0; triesCounter = 0;
lastResults=checkingResults;
// close the session if it is no longer needed // close the session if it is no longer needed
if (oneshotMode) if (oneshotMode)
dbSession.close(); dbSession.close();
@ -478,7 +484,8 @@ public class CategoryGuesser {
} }
} }
return deleteDuplicates(results); // return deleteDuplicates(results);
return results;
} }

View File

@ -37,11 +37,12 @@ public class ExampleGuesser {
conf.setDatabaseDialect("org.hibernate.dialect.PostgreSQLDialect"); conf.setDatabaseDialect("org.hibernate.dialect.PostgreSQLDialect");
guesser.runGuesser(seriesName, column, conf); guesser.runGuesser(seriesName, column, conf);
ArrayList<SingleResult> classification = guesser.getClassification(); ArrayList<SingleResult> classification = guesser.getLastResults();
//only for debug //only for debug
guesser.showResults(classification); guesser.showResults(classification);
guesser.shutdown();
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
} }

View File

@ -0,0 +1,58 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.test;
import java.util.ArrayList;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.Category;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
public class ExampleLexicalMatcher {
public static void main(String[] args) {
try {
String singleton = "Faroe Island";
String family = "COUNTRY_OLD";
String column = "field6";
CategoryGuesser guesser = new CategoryGuesser();
LexicalEngineConfiguration conf = new LexicalEngineConfiguration();
ArrayList<Category> categories = new ArrayList<Category>();
//human name, index, table name, description
categories.add(new Category("COUNTRY_OLD","39c98800-dd3c-11e0-b8d1-d1e2e7ba4f9d","rdf39c98800dd3c11e0b8d1d1e2e7ba4f9d","country"));
categories.add(new Category("CONTINENT_OLD","1d5d51f0-dd42-11e0-b8d3-d1e2e7ba4f9d","rdf1d5d51f0dd4211e0b8d3d1e2e7ba4f9d","continent reference data"));
categories.add(new Category("SPECIES_OLD","0a7fb500-dd3d-11e0-b8d1-d1e2e7ba4f9d","rdf0a7fb500dd3d11e0b8d1d1e2e7ba4f9d","species"));
categories.add(new Category("CodeListCountry","4c8d93a0-edc2-11e0-93e4-f6a9821baa29","rdf4c8d93a0edc211e093e4f6a9821baa29","Country"));
categories.add(new Category("CL_DIVISION","1140bdf0-dd2c-11e0-9220-ae17b3db32b7","rdf1140bdf0dd2c11e09220ae17b3db32b7","undefined"));
categories.add(new Category("CL_ASFIS_TAX","f87360f0-d9f9-11e0-ba05-d9adb0db767c","rdff87360f0d9f911e0ba05d9adb0db767c","undefined"));
conf.setCategories(categories);
//CHANGE THIS TO ENHANCE THE RECALL
conf.setEntryAcceptanceThreshold(30);
conf.setReferenceChunksToTake(-1);
conf.setTimeSeriesChunksToTake(-1);
conf.setUseSimpleDistance(false);
//database Parameters
conf.setDatabaseUserName("gcube");
conf.setDatabasePassword("d4science2");
conf.setDatabaseURL("jdbc:postgresql://localhost/testdb");
conf.setDatabaseDialect("org.hibernate.dialect.PostgreSQLDialect");
guesser.runGuesser(singleton, conf, family,column );
ArrayList<SingleResult> detailedResults = guesser.getDetailedMatches();
AnalysisLogger.getLogger().warn("Detailed Match on Name :"+singleton);
CategoryGuesser.showResults(detailedResults);
} catch (Exception e) {
e.printStackTrace();
}
}
}

View File

@ -8,9 +8,9 @@ log4j.appender.stdout.layout.ConversionPattern=%d{dd/MM/yyyy HH:mm:ss} %p %t %c
#### Second appender writes to a file #### Second appender writes to a file
log4j.logger.AnalysisLogger=info,AR log4j.logger.AnalysisLogger=trace,AR,stdout
log4j.appender.AR=org.apache.log4j.RollingFileAppender log4j.appender.AR=org.apache.log4j.RollingFileAppender
log4j.appender.AR.Threshold=info log4j.appender.AR.Threshold=trace
log4j.appender.AR.File=${GLOBUS_LOCATION}/logs/Analysis.log log4j.appender.AR.File=${GLOBUS_LOCATION}/logs/Analysis.log
log4j.appender.AR.MaxFileSize=1024KB log4j.appender.AR.MaxFileSize=1024KB
log4j.appender.AR.MaxBackupIndex=2 log4j.appender.AR.MaxBackupIndex=2

View File

@ -1,13 +1,13 @@
#Percentage threshold for discarding a category #Percentage threshold for discarding a category
categoryDiscardThreshold=0 categoryDiscardThreshold=0
#Percentage threshold for accepting similarity between a single Time series entry and a reference entry #Percentage threshold for accepting similarity between a single Time series entry and a reference entry
entryAcceptanceThreshold=80 entryAcceptanceThreshold=50
#Size of a comparison chunk #Size of a comparison chunk
chunkSize=25 chunkSize=50
#Number of chunks to take from the time series when comparing against reference data; if set to -1, all chunks will be analyzed #Number of chunks to take from the time series when comparing against reference data; if set to -1, all chunks will be analyzed
timeSeriesChunksToTake=2 timeSeriesChunksToTake=1
#Number of chunks to take from the reference data when comparing against time series elements; if set to -1, all chunks will be analyzed #Number of chunks to take from the reference data when comparing against time series elements; if set to -1, all chunks will be analyzed
referenceChunksToTake =50 referenceChunksToTake=5
#Use random choice for chunk selection: true | false #Use random choice for chunk selection: true | false
randomTake=true randomTake=true
#Use Simple String Match as distance calculation #Use Simple String Match as distance calculation
@ -15,6 +15,6 @@ useSimpleDistance=false
#Number Of Threads to use #Number Of Threads to use
numberOfThreadsToUse=5 numberOfThreadsToUse=5
#if two final scores differ by more than this percentage, prune the lower result #if two final scores differ by more than this percentage, prune the lower result
categoryDiscardDifferencialThreshold = 2 categoryDiscardDifferencialThreshold = 50
#maximum difference between a result and the best result #maximum difference between a result and the best result
singleEntryRecognitionMaxDeviation = 40 singleEntryRecognitionMaxDeviation = 40