From 9437d03c0a55f6da3e7f11439b95f521bf761e16 Mon Sep 17 00:00:00 2001 From: Gianpaolo Coro Date: Tue, 12 Mar 2013 16:44:56 +0000 Subject: [PATCH] git-svn-id: https://svn.d4science.research-infrastructures.eu/gcube/trunk/data-analysis/EcologicalEngine@71257 82a268e6-3cf1-43bd-a215-b396298e98cf --- .../analysis/run/CategoryGuesser.java | 9 ++- .../analysis/test/ExampleGuesser.java | 5 +- .../analysis/test/ExampleLexicalMatcher.java | 58 +++++++++++++++++++ .../resources/lexicalguesser/ALog.properties | 4 +- .../lexicalguesser/lexicalGuesser.properties | 10 ++-- 5 files changed, 76 insertions(+), 10 deletions(-) create mode 100644 src/main/java/org/gcube/contentmanagement/lexicalmatcher/analysis/test/ExampleLexicalMatcher.java diff --git a/src/main/java/org/gcube/contentmanagement/lexicalmatcher/analysis/run/CategoryGuesser.java b/src/main/java/org/gcube/contentmanagement/lexicalmatcher/analysis/run/CategoryGuesser.java index 45a241b..76ed6e3 100644 --- a/src/main/java/org/gcube/contentmanagement/lexicalmatcher/analysis/run/CategoryGuesser.java +++ b/src/main/java/org/gcube/contentmanagement/lexicalmatcher/analysis/run/CategoryGuesser.java @@ -212,6 +212,11 @@ public class CategoryGuesser { col = null; } + public ArrayList getLastResults(){ + return lastResults; + } + + ArrayList lastResults; public void runGuesser(String seriesName, String columnName, LexicalEngineConfiguration externalConfig, String CategoryFilter, String ColumnFilter, String SingletonString) throws Exception { SessionFactory dbSession = null; String cfgFileCompletePath = cfgFile; @@ -304,6 +309,7 @@ public class CategoryGuesser { AnalysisLogger.getLogger().warn("...End Calculation in " + t1 + "ms"); triesCounter = 0; + lastResults=checkingResults; // close session if not more necessary if (oneshotMode) dbSession.close(); @@ -478,7 +484,8 @@ public class CategoryGuesser { } } - return deleteDuplicates(results); +// return deleteDuplicates(results); + return results; } diff --git a/src/main/java/org/gcube/contentmanagement/lexicalmatcher/analysis/test/ExampleGuesser.java b/src/main/java/org/gcube/contentmanagement/lexicalmatcher/analysis/test/ExampleGuesser.java index 56e4200..fefd0da 100644 --- a/src/main/java/org/gcube/contentmanagement/lexicalmatcher/analysis/test/ExampleGuesser.java +++ b/src/main/java/org/gcube/contentmanagement/lexicalmatcher/analysis/test/ExampleGuesser.java @@ -37,11 +37,12 @@ public class ExampleGuesser { conf.setDatabaseDialect("org.hibernate.dialect.PostgreSQLDialect"); guesser.runGuesser(seriesName, column, conf); - ArrayList classification = guesser.getClassification(); + ArrayList classification = guesser.getLastResults(); //only for debug guesser.showResults(classification); - + + guesser.shutdown(); } catch (Exception e) { e.printStackTrace(); } diff --git a/src/main/java/org/gcube/contentmanagement/lexicalmatcher/analysis/test/ExampleLexicalMatcher.java b/src/main/java/org/gcube/contentmanagement/lexicalmatcher/analysis/test/ExampleLexicalMatcher.java new file mode 100644 index 0000000..0781028 --- /dev/null +++ b/src/main/java/org/gcube/contentmanagement/lexicalmatcher/analysis/test/ExampleLexicalMatcher.java @@ -0,0 +1,58 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.test; + +import java.util.ArrayList; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.Category; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult; +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + +public class ExampleLexicalMatcher { + + public static void main(String[] args) { + + try { + + String singleton = "Faroe Island"; + String family = "COUNTRY_OLD"; + String column = "field6"; + + CategoryGuesser guesser = new CategoryGuesser(); + LexicalEngineConfiguration conf = new LexicalEngineConfiguration(); + + ArrayList categories = new ArrayList(); + //human name, index, table name, description + categories.add(new Category("COUNTRY_OLD","39c98800-dd3c-11e0-b8d1-d1e2e7ba4f9d","rdf39c98800dd3c11e0b8d1d1e2e7ba4f9d","country")); + categories.add(new Category("CONTINENT_OLD","1d5d51f0-dd42-11e0-b8d3-d1e2e7ba4f9d","rdf1d5d51f0dd4211e0b8d3d1e2e7ba4f9d","continent reference data")); + categories.add(new Category("SPECIES_OLD","0a7fb500-dd3d-11e0-b8d1-d1e2e7ba4f9d","rdf0a7fb500dd3d11e0b8d1d1e2e7ba4f9d","species")); + categories.add(new Category("CodeListCountry","4c8d93a0-edc2-11e0-93e4-f6a9821baa29","rdf4c8d93a0edc211e093e4f6a9821baa29","Country")); + categories.add(new Category("CL_DIVISION","1140bdf0-dd2c-11e0-9220-ae17b3db32b7","rdf1140bdf0dd2c11e09220ae17b3db32b7","undefined")); + categories.add(new Category("CL_ASFIS_TAX","f87360f0-d9f9-11e0-ba05-d9adb0db767c","rdff87360f0d9f911e0ba05d9adb0db767c","undefined")); + conf.setCategories(categories); + + //CHANGE THIS TO ENHANCE THE RECALL + conf.setEntryAcceptanceThreshold(30); + conf.setReferenceChunksToTake(-1); + conf.setTimeSeriesChunksToTake(-1); + conf.setUseSimpleDistance(false); + + //database Parameters + conf.setDatabaseUserName("gcube"); + conf.setDatabasePassword("d4science2"); + conf.setDatabaseURL("jdbc:postgresql://localhost/testdb"); + conf.setDatabaseDialect("org.hibernate.dialect.PostgreSQLDialect"); + + guesser.runGuesser(singleton, conf, family,column ); + ArrayList detailedResults = guesser.getDetailedMatches(); + AnalysisLogger.getLogger().warn("Detailed Match on Name :"+singleton); + + CategoryGuesser.showResults(detailedResults); + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/main/resources/lexicalguesser/ALog.properties b/src/main/resources/lexicalguesser/ALog.properties index 2229b75..ec5c991 100644 --- a/src/main/resources/lexicalguesser/ALog.properties +++ b/src/main/resources/lexicalguesser/ALog.properties @@ -8,9 +8,9 @@ log4j.appender.stdout.layout.ConversionPattern=%d{dd/MM/yyyy HH:mm:ss} %p %t %c #### Second appender writes to a file -log4j.logger.AnalysisLogger=info,AR +log4j.logger.AnalysisLogger=trace,AR,stdout log4j.appender.AR=org.apache.log4j.RollingFileAppender -log4j.appender.AR.Threshold=info +log4j.appender.AR.Threshold=trace log4j.appender.AR.File=${GLOBUS_LOCATION}/logs/Analysis.log log4j.appender.AR.MaxFileSize=1024KB log4j.appender.AR.MaxBackupIndex=2 diff --git a/src/main/resources/lexicalguesser/lexicalGuesser.properties b/src/main/resources/lexicalguesser/lexicalGuesser.properties index 41b5ddb..5d285b0 100644 --- a/src/main/resources/lexicalguesser/lexicalGuesser.properties +++ b/src/main/resources/lexicalguesser/lexicalGuesser.properties @@ -1,13 +1,13 @@ #Percentage threshold for discarding a category categoryDiscardThreshold=0 #Percentage threshold for accepting similarity between a single Time series entry and a reference entry -entryAcceptanceThreshold=80 +entryAcceptanceThreshold=50 #Size of a comparison chunk -chunkSize=25 +chunkSize=50 #Number of chunks to take from Time series for performing comparison respect to reference data; if set to -1 all chunks will be analyzed -timeSeriesChunksToTake=2 +timeSeriesChunksToTake=1 #Number of chunks to take from Reference for performing comparison Time Series Elements; if set to -1 all chunks will be analyzed -referenceChunksToTake =50 +referenceChunksToTake=5 #Use random choice for chunks selection = true |false randomTake=true #Use Simple String Match as distance calculation @@ -15,6 +15,6 @@ useSimpleDistance=false #Number Of Threads to use numberOfThreadsToUse=5 #if two final scores differ for more than this percentage, prune the lower result -categoryDiscardDifferencialThreshold = 2 +categoryDiscardDifferencialThreshold = 50 #maximum difference between a result and the best result singleEntryRecognitionMaxDeviation = 40 \ No newline at end of file