/**
 * Maps database column type names and raw string values onto the three Java types the lexical
 * matcher works with: {@link String}, {@link Boolean} and {@link BigDecimal}.
 */
public class DataTypeRecognizer {

    // a DB type whose (lower-cased) name contains one of these fragments is classified as Decimal
    private static final String[] decimalType = { "decimal", "integer", "int", "ordinal", "length", "position", "real" };

    // ... as Boolean
    private static final String[] booleanType = { "bool" };

    // ... as String; checked first, so e.g. "varchar" wins over any decimal fragment
    private static final String[] stringType = { "varchar", "char", "string", "text" };

    /**
     * Translates a database type name into the fully qualified name of the matching Java class.
     *
     * @param DBType database type name, matched case-insensitively by substring; may be null
     * @return the class name of {@link String}, {@link BigDecimal} or {@link Boolean};
     *         {@link String} when the type is null or unknown
     */
    public static String transformTypeFromDB(String DBType) {
        if (DBType == null) {
            return String.class.getName();
        }
        // order matters: string types are tested before decimal ones, decimal before boolean
        if (contains(DBType, stringType)) {
            return String.class.getName();
        }
        if (contains(DBType, decimalType)) {
            return BigDecimal.class.getName();
        }
        if (contains(DBType, booleanType)) {
            return Boolean.class.getName();
        }
        return String.class.getName();
    }

    /**
     * Guesses the type of a single value and returns it converted to that type.
     *
     * @param entry raw value; may be null
     * @return a {@link BigDecimal} when the entry parses as a finite double, a {@link Boolean} when
     *         it is "true"/"false" (any case), the entry itself otherwise; null for a null entry
     */
    public static Object guessType(String entry) {
        if (entry == null) {
            // previously this crashed with a NullPointerException raised inside the catch block
            return null;
        }
        try {
            // NaN and infinities parse as doubles but are rejected by BigDecimal.valueOf with a
            // NumberFormatException, so they fall through and stay strings
            return BigDecimal.valueOf(Double.parseDouble(entry));
        } catch (NumberFormatException notANumber) {
            if (entry.equalsIgnoreCase("true") || entry.equalsIgnoreCase("false")) {
                return Boolean.valueOf(Boolean.parseBoolean(entry));
            }
            return entry;
        }
    }

    /**
     * Tells whether the lower-cased element contains any of the given fragments.
     * Lower-casing uses the root locale so the result does not depend on the JVM default locale
     * (with a Turkish default, "INTEGER" would otherwise lower-case to a dotless i and not match).
     */
    private static boolean contains(String element, String[] array) {
        String lowered = element.toLowerCase(java.util.Locale.ROOT);
        for (String arrayElem : array) {
            if (lowered.contains(arrayElem)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Guesses the prevailing type of a list of values by majority vote over
     * {@link #guessType(String)}.
     *
     * @param elementlist values to inspect; null is treated like an empty list, null elements cast no vote
     * @return the class name of the most frequent type; ties are resolved in the order
     *         String, Boolean, BigDecimal, so an empty list yields {@link String}
     */
    public static String guessType(ArrayList<String> elementlist) {

        // 0 = String 1 = Boolean 2 = Decimal
        int[] scores = new int[3];
        String[] types = { String.class.getName(), Boolean.class.getName(), BigDecimal.class.getName() };
        if (elementlist != null) {
            for (String element : elementlist) {
                Object guessedObj = guessType(element);
                if (guessedObj instanceof String) {
                    scores[0]++;
                } else if (guessedObj instanceof Boolean) {
                    scores[1]++;
                } else if (guessedObj instanceof BigDecimal) {
                    scores[2]++;
                }
            }
        }

        // strict '>' keeps the first index on ties
        int maxindex = 0;
        for (int i = 1; i < scores.length; i++) {
            if (scores[i] > scores[maxindex]) {
                maxindex = i;
            }
        }
        return types[maxindex];
    }

    /** Small manual check: five numeric strings should print java.math.BigDecimal. */
    public static void main(String[] args) throws ClassNotFoundException {

        ArrayList<String> prova = new ArrayList<String>();
        for (int i = 0; i < 5; i++) {
            prova.add("1234");
        }

        String classtype = guessType(prova);
        System.out.println(classtype);
    }

}
/**
 * Scoring engine: compares chunks of an unknown series against chunks of the reference
 * categories and keeps three parallel lists (category, score, column) ordered by score.
 *
 * NOTE(review): generic type arguments appear to have been stripped from this copy of the source
 * (raw ArrayList/HashMap are iterated and read as typed collections below) - restore them from
 * the original file before compiling.
 */
public class Engine {

    private String ConfigurationFileNameLocal = "hibernate.cfg.xml";
    private SessionFactory referenceDBSession;

    // parallel lists: entry i of each list describes the same match; kept in descending score order
    public ArrayList bestCategories;
    public ArrayList bestScores;
    public ArrayList bestColumns;
    // scores per category name (looked up by category name in UpdateScores)
    public HashMap scoresTable;
    public String columnFilter;
    private LexicalEngineConfiguration config;
    // set by calcLikeThread when the processed chunk reports isSingleton()
    private TimeSeriesChunk singletonChunk;

    /** Returns the detailed results of the recorded singleton chunk (NPE if none was recorded). */
    public ArrayList getSingletonMatches(){
        return singletonChunk.getDetailedResults();
    }

    /** Returns the singleton entry of the recorded singleton chunk (NPE if none was recorded). */
    public String getSingletonElement(){
        return singletonChunk.getSingletonEntry();
    }

    /** Lazily initialises the DB session factory from the local configuration file and caches it. */
    public SessionFactory getDBSession() throws Exception {

        if (referenceDBSession == null) {
            referenceDBSession = DatabaseFactory.initDBConnection(ConfigurationFileNameLocal);
        }

        return referenceDBSession;
    }

    /**
     * Same as {@link #getDBSession()} but passes an external configuration to the factory.
     * The external configuration is only used on the first call; later calls return the cached session.
     */
    public SessionFactory getDBSession(LexicalEngineConfiguration externalConf) throws Exception {

        if (referenceDBSession == null) {
            referenceDBSession = DatabaseFactory.initDBConnection(ConfigurationFileNameLocal, externalConf);
        }

        return referenceDBSession;
    }

    /**
     * Replaces the configuration and column filter and clears all result collections.
     * configPath is not used here: the configuration file name set by the constructor is kept.
     */
    public void resetEngine(LexicalEngineConfiguration Config,String ColumnFilter,String configPath){
        config = Config;
        scoresTable = new HashMap();
        bestCategories = new ArrayList();
        bestColumns = new ArrayList();
        bestScores = new ArrayList();
        columnFilter = ColumnFilter;
    }

    /**
     * Creates an engine with empty result collections; the hibernate configuration file is
     * resolved as configPath + "/" + the default file name.
     */
    public Engine(LexicalEngineConfiguration Config,String ColumnFilter,String configPath) {
        config = Config;
        scoresTable = new HashMap();
        bestCategories = new ArrayList();
        bestColumns = new ArrayList();
        bestScores = new ArrayList();
        columnFilter = ColumnFilter;
        ConfigurationFileNameLocal = configPath+"/"+ConfigurationFileNameLocal;
    }

    /**
     * Single-threaded comparison: every time-series chunk is compared with every chunk of every
     * reference chunk set, and scores are updated after each reference chunk set.
     * Failures while building the chunk set or comparing chunks are logged and not rethrown.
     */
    public void calcLike(CategoryOrderedList col, String unknownSeriesName, String unknownSeriesColumn) {
        scoresTable = col.getScoresTable();

        // take a time series set of chunks
        TimeSeriesChunkSet tsChunkSet = null;
        try {
            tsChunkSet = new TimeSeriesChunkSet(config.TimeSeriesChunksToTake, config.chunkSize, unknownSeriesName, unknownSeriesColumn,config, this);
        } catch (Exception e) {
            e.printStackTrace();
            AnalysisLogger.getLogger().error("Engine->calcLike-> ERROR could not retrieve time series chunks " + e.getLocalizedMessage());
        }
        // if we took the ts chunk set correctly perform calculation
        if (tsChunkSet != null) {

            // generate the set of reference chunks
            SetOfReferenceChunkSet setRefChunksSet = new SetOfReferenceChunkSet(col.getOrderedList(),config, this);

            TimeSeriesChunk tsChunk = tsChunkSet.nextChunk();
            // for all ts chunks
            while (tsChunk != null) {

                // take a set of chunks from a reference category
                ReferenceChunkSet refChunkSet = setRefChunksSet.getNextChunkSet();
                while (refChunkSet != null) {
                    // take a chunk in the reference chunk set
                    ReferenceChunk refChunk = refChunkSet.nextChunk();
                    while (refChunk != null) {

                        try {
                            tsChunk.compareToReferenceChunk(scoresTable, refChunk);
                        } catch (Exception e) {
                            e.printStackTrace();
                            AnalysisLogger.getLogger().error("Engine->calcLike-> ERROR could not compare time series chunk with reference chunk " + e.getLocalizedMessage());
                        }
                        // take another chunk in the reference chunk set
                        refChunk = refChunkSet.nextChunk();
                    }

                    // check score
                    UpdateScores(refChunkSet.getSeriesName(),false);

                    // take another set of chunks from another reference category
                    refChunkSet = setRefChunksSet.getNextChunkSet();
                }

                tsChunk = tsChunkSet.nextChunk();
            }

        }

    }

    // threadActivity[i] is true while worker slot i is running a ThreadCalculator.
    // NOTE(review): plain (non-volatile) array written by workers and polled by the caller -
    // visibility between threads is not guaranteed; confirm and consider an executor/latch.
    boolean threadActivity[];

    /** Blocks, polling every 10 ms, until worker slot index is marked free. */
    private void wait4Thread(int index){

        // wait until thread is free
        while (threadActivity[index]) {
            try {
                Thread.sleep(10);
            } catch (InterruptedException e) {
                // NOTE(review): interruption is swallowed and the interrupt flag is not restored
            }
        }

    }

    /** Marks worker slot index busy and starts a new thread running a ThreadCalculator for it. */
    private void startNewTCalc(TimeSeriesChunk tsChunk, ReferenceChunkSet refChunkSet,int index){

        threadActivity[index] = true;
        ThreadCalculator tc = new ThreadCalculator(tsChunk, refChunkSet,index);
        Thread t = new Thread(tc);
        t.start();
    }

    /**
     * Multi-threaded variant of {@link #calcLike}: each (time-series chunk, reference chunk set)
     * pair is handed to a worker slot, slots being used round-robin; processing stops after the
     * first chunk that reports isSingleton(), which is recorded in singletonChunk.
     */
    public void calcLikeThread(CategoryOrderedList col, String unknownSeriesName, String unknownSeriesColumn,String singletonString) {
        scoresTable = col.getScoresTable();

        // take a time series set of chunks
        ChunkSet tsChunkSet = null;
        int[] currentThreads = MathFunctions.generateSequence(config.numberOfThreadsToUse);
        int currentThread = 0;
        threadActivity = new boolean [currentThreads.length];
        //initialize to false;
        // NOTE(review): the text between "j" and "calcLike->" on the next line was lost when this
        // copy was extracted (everything from a '<' up to the '>' of a log prefix). The missing
        // span presumably held the loop body, the construction of tsChunkSet and the start of the
        // catch block - restore it from the original file; it cannot be rebuilt from what is visible.
        for (int j=0;jcalcLike-> ERROR could not retrieve time series chunks " + e.getLocalizedMessage());
        }
        // if we took the ts chunk set correctly perform calculation
        if (tsChunkSet != null) {

            // generate the set of reference chunks
            SetOfReferenceChunkSet setRefChunksSet = new SetOfReferenceChunkSet(col.getOrderedList(),config, this);

            TimeSeriesChunk tsChunk = (TimeSeriesChunk)tsChunkSet.nextChunk();

            // NOTE(review): the message says "is null" but the value printed is (tsChunk != null)
            AnalysisLogger.getLogger().debug("tsChunk is null "+(tsChunk != null));
            // for all ts chunks
            while (tsChunk != null) {

                // take a set of chunks from a reference category
                ReferenceChunkSet refChunkSet = setRefChunksSet.getNextChunkSet();
                while (refChunkSet != null) {
                    // wait for the slot to be free, then reuse it for this comparison
                    wait4Thread(currentThreads[currentThread]);
                    startNewTCalc(tsChunk, refChunkSet,currentThreads[currentThread]);

                    // take another set of chunks from another reference category
                    refChunkSet = setRefChunksSet.getNextChunkSet();

                    // round-robin over the worker slots
                    currentThread++;
                    if (currentThread >= currentThreads.length)
                        currentThread = 0;
                }

                //if the chunk is a singleton, don't process other and record the result
                if (tsChunk.isSingleton()){
                    singletonChunk = tsChunk;

                    break;
                }

                tsChunk = (TimeSeriesChunk)tsChunkSet.nextChunk();
            }

            //wait for last threads to finish
            for (int i : currentThreads) {
                // free previous calculation
                wait4Thread(i);
            }

        }

    }

    /**
     * Compares one time-series chunk with every chunk of a reference chunk set (honouring
     * columnFilter), stopping early when the chunk asks to interrupt, then updates the scores
     * for that reference series. Comparison failures are logged and not rethrown.
     * NOTE(review): invoked from worker threads; UpdateScores mutates the shared result lists
     * without synchronization - confirm whether concurrent calls are possible.
     */
    private void makeComparisonsTSChunk2RefChunks(TimeSeriesChunk tsChunk, ReferenceChunkSet refChunkSet) {

        // take a chunk in the reference chunk set
        ReferenceChunk refChunk = refChunkSet.nextChunk();
        while (refChunk != null) {

            try {
                tsChunk.compareToReferenceChunk(scoresTable, refChunk,columnFilter);
            } catch (Exception e) {
                e.printStackTrace();
                AnalysisLogger.getLogger().error("Engine->calcLike-> ERROR could not compare time series chunk with reference chunk " + e.getLocalizedMessage());
            }

            //if the TimeSeries chunk states the processing must be interrupted, don't perform other comparisons
            if (tsChunk.mustInterruptProcess())
                break;

            // take another chunk in the reference chunk set
            refChunk = refChunkSet.nextChunk();

        }
        // check score
        UpdateScores(refChunkSet.getSeriesName(),tsChunk.isSingleton());
    }

    /**
     * Reads the best column and its score for a category and, when the score exceeds
     * config.categoryDiscardThreshold, inserts (category, score, column) into the result lists
     * before the first existing entry with a lower score; then considers the runner-up columns.
     */
    private void UpdateScores(String categoryName, boolean singletonMatch) {

        CategoryScores categoryScore = scoresTable.get(categoryName);
        ArrayList bestCols = categoryScore.findBestList();
        String bestColumn = null;
        double score = 0;
        if (bestCols.size() > 0) {
            bestColumn = bestCols.get(0);
            score = categoryScore.getScore(bestColumn,singletonMatch);
        }

        AnalysisLogger.getLogger().trace("Engine->UpdateScores-> \tBEST SUITABLE COLUMN IS: " + bestColumn);
        AnalysisLogger.getLogger().trace("Engine->UpdateScores-> \tBEST SCORE IS: " + score);

        // order this column
        if (score > config.categoryDiscardThreshold) {

            int index = 0;
            // insert at the right point in the classification
            for (Double dscore : bestScores) {
                if (dscore.doubleValue() < score) {

                    break;
                }
                index++;
            }
            bestCategories.add(index, categoryName);
            bestScores.add(index, score);
            bestColumns.add(index, bestColumn);
            checkAndAddColumns(categoryScore, bestCols, categoryName,singletonMatch);
        }

    }

    /**
     * Adds the remaining best columns (index 1 onward) of a category whose score is positive and
     * at least half of the best column's score, keeping the result lists ordered by score.
     */
    private void checkAndAddColumns(CategoryScores scores, ArrayList bestCols, String categoryName,boolean singletonMatch) {

        int size = bestCols.size();
        double bestScore = scores.getScore(bestCols.get(0),singletonMatch);

        for (int i = 1; i < size; i++) {
            // take the i-th column
            String column = bestCols.get(i);
            if (column != null) {
                // check the score
                double score = scores.getScore(column,singletonMatch);

                // if the score is near the best, add the column
                if ((score > 0) && (score >= (bestScore - 0.5 * bestScore))) {

                    int index = 0;
                    // insert at the right point in the classification
                    for (Double dscore : bestScores) {
                        if (dscore.doubleValue() < score) {

                            break;
                        }
                        index++;
                    }

                    bestColumns.add(index,column);
                    bestScores.add(index,score);
                    bestCategories.add(index,categoryName);
                }
            }
        }

    }

    /** Worker: runs one chunk-vs-reference-set comparison and then marks its slot free. */
    private class ThreadCalculator implements Runnable {
        TimeSeriesChunk tsChunk;
        ReferenceChunkSet refChunksSet;
        int index;

        public ThreadCalculator(TimeSeriesChunk tsChunk, ReferenceChunkSet refChunksSet,int index) {
            this.tsChunk = tsChunk;
            this.refChunksSet = refChunksSet;
            this.index = index;
        }

        public void run() {
            makeComparisonsTSChunk2RefChunks(tsChunk, refChunksSet);
            // NOTE(review): not in a finally block - an unchecked exception above would leave the
            // slot marked busy and wait4Thread would poll forever; confirm
            threadActivity[index]=false;
        }

    }

}
/**
 * Tunable parameters of the lexical matching engine, plus the database connection and
 * reference-table settings.
 *
 * <p>Every field starts in an "unset" state (a sentinel for numbers, null for objects) so that
 * {@link #mergeConfig(LexicalEngineConfiguration)} can overlay only the values another
 * configuration actually defines. Fields are public because other classes read them directly.
 */
public class LexicalEngineConfiguration {

    // "unset" sentinel of the float parameters (the smallest-magnitude negative float)
    private static final float UNSET_FLOAT = -Float.MIN_VALUE;
    // "unset" sentinel of the int parameters; -Integer.MIN_VALUE overflows back to
    // Integer.MIN_VALUE, so this is exactly the value the fields have always been initialised with
    private static final int UNSET_INT = Integer.MIN_VALUE;

    public float categoryDiscardThreshold = UNSET_FLOAT;
    public float entryAcceptanceThreshold = UNSET_FLOAT;
    public float categoryDiscardDifferencialThreshold = UNSET_FLOAT;
    public float singleEntryRecognitionMaxDeviation = UNSET_FLOAT;
    public int chunkSize = UNSET_INT;
    public Boolean randomTake = null;
    // if set to -1 all chunks will be analyzed
    public int TimeSeriesChunksToTake = UNSET_INT;
    public int ReferenceChunksToTake = UNSET_INT;
    public Boolean useSimpleDistance = null;
    public int numberOfThreadsToUse = UNSET_INT;

    // database parameters
    public String databaseDriver = null;
    public String databaseURL = null;
    public String databaseUserName = null;
    public String databasePassword = null;
    public String databaseDialect = null;
    public String databaseIdleConnectionTestPeriod = null;
    public String databaseAutomaticTestTable = null;

    // reference data parameters
    public String referenceTable = null;
    public String referenceColumn = null;
    public String idColumn = null;
    public String nameHuman = null;
    public String description = null;

    /**
     * Loads the engine parameters from a Java properties file.
     *
     * @param absoluteFilePath path of the properties file
     * @throws Exception if the file cannot be read, a numeric property is missing
     *         ({@link IllegalArgumentException} naming the key) or a value is malformed
     */
    public void configure(String absoluteFilePath) throws Exception {
        Properties props = new Properties();
        FileInputStream fis = new FileInputStream(absoluteFilePath);
        try {
            props.load(fis);
        } finally {
            // close even when load fails; previously the stream leaked on any exception
            fis.close();
        }
        categoryDiscardThreshold = Float.parseFloat(requiredProperty(props, "categoryDiscardThreshold"));
        // the field is a float: parse it as one so fractional thresholds are accepted too
        entryAcceptanceThreshold = Float.parseFloat(requiredProperty(props, "entryAcceptanceThreshold"));
        chunkSize = Integer.parseInt(requiredProperty(props, "chunkSize"));
        TimeSeriesChunksToTake = Integer.parseInt(requiredProperty(props, "timeSeriesChunksToTake"));
        ReferenceChunksToTake = Integer.parseInt(requiredProperty(props, "referenceChunksToTake"));
        randomTake = booleanProperty(props, "randomTake");
        useSimpleDistance = booleanProperty(props, "useSimpleDistance");
        numberOfThreadsToUse = Integer.parseInt(requiredProperty(props, "numberOfThreadsToUse"));
        categoryDiscardDifferencialThreshold = Float.parseFloat(requiredProperty(props, "categoryDiscardDifferencialThreshold"));
        singleEntryRecognitionMaxDeviation = Float.parseFloat(requiredProperty(props, "singleEntryRecognitionMaxDeviation"));
    }

    /** Returns the trimmed value of a mandatory property, failing with the key name when absent. */
    private static String requiredProperty(Properties props, String key) {
        String value = props.getProperty(key);
        if (value == null) {
            throw new IllegalArgumentException("Missing required property '" + key + "'");
        }
        // Properties.load keeps trailing blanks, which Integer.parseInt rejects
        return value.trim();
    }

    /** Returns true only when the property is present and equals "true" (ignoring case and blanks). */
    private static boolean booleanProperty(Properties props, String key) {
        String value = props.getProperty(key);
        return value != null && Boolean.parseBoolean(value.trim());
    }

    /**
     * Overlays every value that is set in the given configuration onto this one; values the
     * other configuration leaves unset keep their current value here.
     *
     * @param config configuration to take values from; null is ignored
     */
    public void mergeConfig(LexicalEngineConfiguration config) {
        if (config == null) {
            return;
        }

        if (config.getCategoryDiscardDifferencialThreshold() != UNSET_FLOAT)
            setCategoryDiscardDifferencialThreshold(config.getCategoryDiscardDifferencialThreshold());
        if (config.getSingleEntryRecognitionMaxDeviation() != UNSET_FLOAT)
            setSingleEntryRecognitionMaxDeviation(config.getSingleEntryRecognitionMaxDeviation());
        if (config.getCategoryDiscardThreshold() != UNSET_FLOAT)
            setCategoryDiscardThreshold(config.getCategoryDiscardThreshold());
        if (config.getChunkSize() != UNSET_INT)
            setChunkSize(config.getChunkSize());
        if (config.getEntryAcceptanceThreshold() != UNSET_FLOAT)
            setEntryAcceptanceThreshold(config.getEntryAcceptanceThreshold());
        if (config.getNumberOfThreadsToUse() != UNSET_INT)
            setNumberOfThreadsToUse(config.getNumberOfThreadsToUse());
        if (config.getReferenceChunksToTake() != UNSET_INT)
            setReferenceChunksToTake(config.getReferenceChunksToTake());
        if (config.getTimeSeriesChunksToTake() != UNSET_INT)
            setTimeSeriesChunksToTake(config.getTimeSeriesChunksToTake());
        if (config.randomTake != null)
            setRandomTake(config.isRandomTake());
        if (config.useSimpleDistance != null)
            setUseSimpleDistance(config.isUseSimpleDistance());
        // database information merge
        if (config.databaseDriver != null)
            setDatabaseDriver(config.databaseDriver);
        if (config.databaseDialect != null)
            setDatabaseDialect(config.databaseDialect);
        if (config.databaseAutomaticTestTable != null)
            setDatabaseAutomaticTestTable(config.databaseAutomaticTestTable);
        if (config.databaseIdleConnectionTestPeriod != null)
            setDatabaseIdleConnectionTestPeriod(config.databaseIdleConnectionTestPeriod);
        if (config.databaseUserName != null)
            setDatabaseUserName(config.databaseUserName);
        if (config.databasePassword != null)
            setDatabasePassword(config.databasePassword);
        if (config.databaseURL != null)
            setDatabaseURL(config.databaseURL);
        // reference data merge
        if (config.referenceTable != null)
            setReferenceTable(config.referenceTable);
        if (config.referenceColumn != null)
            setReferenceColumn(config.referenceColumn);
        if (config.idColumn != null)
            setIdColumn(config.idColumn);
        if (config.nameHuman != null)
            setNameHuman(config.nameHuman);
        if (config.description != null)
            setDescription(config.description);
    }

    public void setCategoryDiscardThreshold(float categoryDiscardThreshold) {
        this.categoryDiscardThreshold = categoryDiscardThreshold;
    }

    public float getCategoryDiscardThreshold() {
        return categoryDiscardThreshold;
    }

    public void setEntryAcceptanceThreshold(float entryAcceptanceThreshold) {
        this.entryAcceptanceThreshold = entryAcceptanceThreshold;
    }

    public float getEntryAcceptanceThreshold() {
        return entryAcceptanceThreshold;
    }

    public void setCategoryDiscardDifferencialThreshold(float categoryDiscardDifferencialThreshold) {
        this.categoryDiscardDifferencialThreshold = categoryDiscardDifferencialThreshold;
    }

    public float getCategoryDiscardDifferencialThreshold() {
        return categoryDiscardDifferencialThreshold;
    }

    public void setChunkSize(int chunkSize) {
        this.chunkSize = chunkSize;
    }

    public int getChunkSize() {
        return chunkSize;
    }

    public void setRandomTake(boolean randomTake) {
        this.randomTake = randomTake;
    }

    /** @return the configured flag; false while unset (auto-unboxing the null field used to throw) */
    public boolean isRandomTake() {
        return randomTake != null && randomTake.booleanValue();
    }

    public void setTimeSeriesChunksToTake(int timeSeriesChunksToTake) {
        TimeSeriesChunksToTake = timeSeriesChunksToTake;
    }

    public int getTimeSeriesChunksToTake() {
        return TimeSeriesChunksToTake;
    }

    public void setReferenceChunksToTake(int referenceChunksToTake) {
        ReferenceChunksToTake = referenceChunksToTake;
    }

    public int getReferenceChunksToTake() {
        return ReferenceChunksToTake;
    }

    public void setUseSimpleDistance(boolean useSimpleDistance) {
        this.useSimpleDistance = useSimpleDistance;
    }

    /** @return the configured flag; false while unset (auto-unboxing the null field used to throw) */
    public boolean isUseSimpleDistance() {
        return useSimpleDistance != null && useSimpleDistance.booleanValue();
    }

    public void setNumberOfThreadsToUse(int numberOfThreadsToUse) {
        this.numberOfThreadsToUse = numberOfThreadsToUse;
    }

    public int getNumberOfThreadsToUse() {
        return numberOfThreadsToUse;
    }

    public void setSingleEntryRecognitionMaxDeviation(float singleEntryRecognitionMaxDeviation) {
        this.singleEntryRecognitionMaxDeviation = singleEntryRecognitionMaxDeviation;
    }

    public float getSingleEntryRecognitionMaxDeviation() {
        return singleEntryRecognitionMaxDeviation;
    }

    public void setDatabaseDriver(String databaseDriver) {
        this.databaseDriver = databaseDriver;
    }

    public String getDatabaseDriver() {
        return databaseDriver;
    }

    public void setDatabaseURL(String databaseURL) {
        this.databaseURL = databaseURL;
    }

    public String getDatabaseURL() {
        return databaseURL;
    }

    public void setDatabaseUserName(String databaseUserName) {
        this.databaseUserName = databaseUserName;
    }

    public String getDatabaseUserName() {
        return databaseUserName;
    }

    public void setDatabasePassword(String databasePassword) {
        this.databasePassword = databasePassword;
    }

    public String getDatabasePassword() {
        return databasePassword;
    }

    public void setDatabaseDialect(String databaseDialect) {
        this.databaseDialect = databaseDialect;
    }

    public String getDatabaseDialect() {
        return databaseDialect;
    }

    public void setDatabaseIdleConnectionTestPeriod(String databaseIdleConnectionTestPeriod) {
        this.databaseIdleConnectionTestPeriod = databaseIdleConnectionTestPeriod;
    }

    public String getDatabaseIdleConnectionTestPeriod() {
        return databaseIdleConnectionTestPeriod;
    }

    public void setDatabaseAutomaticTestTable(String databaseAutomaticTestTable) {
        this.databaseAutomaticTestTable = databaseAutomaticTestTable;
    }

    public String getDatabaseAutomaticTestTable() {
        return databaseAutomaticTestTable;
    }

    public String getReferenceTable() {
        return referenceTable;
    }

    public void setReferenceTable(String referenceTable) {
        this.referenceTable = referenceTable;
    }

    public String getReferenceColumn() {
        return referenceColumn;
    }

    public void setReferenceColumn(String referenceColumn) {
        this.referenceColumn = referenceColumn;
    }

    public String getIdColumn() {
        return idColumn;
    }

    public void setIdColumn(String idColumn) {
        this.idColumn = idColumn;
    }

    public String getNameHuman() {
        return nameHuman;
    }

    public void setNameHuman(String nameHuman) {
        this.nameHuman = nameHuman;
    }

    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

}
setIdColumn(String idColumn) { + this.idColumn = idColumn; + } + + + + public String getNameHuman() { + return nameHuman; + } + + + + public void setNameHuman(String nameHuman) { + this.nameHuman = nameHuman; + } + + + + public String getDescription() { + return description; + } + + + + public void setDescription(String description) { + this.description = description; + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/examples/Example1_Species.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/examples/Example1_Species.java new file mode 100644 index 0000000..0e8a6e8 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/examples/Example1_Species.java @@ -0,0 +1,32 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.examples; + + +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + +public class Example1_Species { + + public static void main(String[] args) { + + try { + int attempts = 1; + + + String configPath = "."; + CategoryGuesser guesser = new CategoryGuesser(configPath); + //bench 1 + AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------"); + String seriesName = "import_2c97f580_35a0_11df_b8b3_aa10916debe6"; + String column = "field1"; + String correctFamily = "SPECIES"; + String correctColumn = "SCIENTIFIC_NAME"; + CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n"); + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/examples/Example2_Area.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/examples/Example2_Area.java new file mode 100644 index 0000000..86e6251 --- /dev/null +++ 
b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/examples/Example2_Area.java @@ -0,0 +1,32 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.examples; + + +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + +public class Example2_Area { + + public static void main(String[] args) { + + try { + int attempts = 1; + + + String configPath = "."; + CategoryGuesser guesser = new CategoryGuesser(configPath); + //bench 1 + AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------"); + String seriesName = "import_2c97f580_35a0_11df_b8b3_aa10916debe6"; + String column = "field3"; + String correctFamily = "AREA"; + String correctColumn = "NAME_EN"; + CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n"); + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/examples/Example3_SingleMatchShark.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/examples/Example3_SingleMatchShark.java new file mode 100644 index 0000000..2bd8a3b --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/examples/Example3_SingleMatchShark.java @@ -0,0 +1,48 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.examples; + +import java.util.ArrayList; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult; +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + + +public class Example3_SingleMatchShark { + + public static void main(String[] args) { + + try { 
+ + String configPath = "."; + CategoryGuesser guesser = new CategoryGuesser(configPath); + //bench 1 + AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------"); + String singleton = "shark"; + String family = "species"; + String column = "name_en"; + + LexicalEngineConfiguration conf = new LexicalEngineConfiguration(); + + //CHANGE THIS TO ENHANCE THE RECALL + conf.setEntryAcceptanceThreshold(30); + conf.setReferenceChunksToTake(-1); + conf.setTimeSeriesChunksToTake(-1); + conf.setUseSimpleDistance(false); + + guesser.runGuesser(configPath, singleton, conf, family,column ); + ArrayList detailedResults = guesser.getDetailedMatches(); + + AnalysisLogger.getLogger().warn("Detailed Match on Name :"+singleton); + + CategoryGuesser.showResults(detailedResults); + + AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n"); + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/examples/Example4_SingleMatchMitella.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/examples/Example4_SingleMatchMitella.java new file mode 100644 index 0000000..68f3ce6 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/examples/Example4_SingleMatchMitella.java @@ -0,0 +1,49 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.examples; + +import java.util.ArrayList; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult; +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + + +public class Example4_SingleMatchMitella { + + public static void main(String[] args) { + + try { + + String configPath = "."; + CategoryGuesser guesser = new CategoryGuesser(configPath); + //bench 1 + 
AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------"); + String singleton = "Mitella pollicipes"; +// String singleton = "policipes"; + String family = "species"; + String column = "scientific_name"; + + LexicalEngineConfiguration conf = new LexicalEngineConfiguration(); + + //CHANGE THIS TO ENHANCE THE RECALL + conf.setEntryAcceptanceThreshold(30); + conf.setReferenceChunksToTake(-1); + conf.setTimeSeriesChunksToTake(-1); + conf.setUseSimpleDistance(false); + + guesser.runGuesser(configPath, singleton, conf, family,column ); + ArrayList detailedResults = guesser.getDetailedMatches(); + + AnalysisLogger.getLogger().warn("Detailed Match on Name :"+singleton); + + CategoryGuesser.showResults(detailedResults); + + AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n"); + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/examples/Example5_SingleMatchMitella.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/examples/Example5_SingleMatchMitella.java new file mode 100644 index 0000000..e1bb6b7 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/examples/Example5_SingleMatchMitella.java @@ -0,0 +1,48 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.examples; + +import java.util.ArrayList; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult; +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + + +public class Example5_SingleMatchMitella { + + public static void main(String[] args) { + + try { + + String configPath = "."; + CategoryGuesser guesser = new CategoryGuesser(configPath); + //bench 1 + 
AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------"); + String singleton = "Mirella policepes"; + String family = "species"; + String column = "scientific_name"; + + LexicalEngineConfiguration conf = new LexicalEngineConfiguration(); + + //CHANGE THIS TO ENHANCE THE RECALL + conf.setEntryAcceptanceThreshold(30); + conf.setReferenceChunksToTake(-1); + conf.setTimeSeriesChunksToTake(-1); + conf.setUseSimpleDistance(false); + + guesser.runGuesser(configPath, singleton, conf, family,column ); + ArrayList detailedResults = guesser.getDetailedMatches(); + + AnalysisLogger.getLogger().warn("Detailed Match on Name :"+singleton); + + CategoryGuesser.showResults(detailedResults); + + AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n"); + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/examples/ExampleGuessingExternalCfg.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/examples/ExampleGuessingExternalCfg.java new file mode 100644 index 0000000..d652383 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/examples/ExampleGuessingExternalCfg.java @@ -0,0 +1,64 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.examples; + +import java.util.ArrayList; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult; +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; + +public class ExampleGuessingExternalCfg { + + public static void main(String[] args) { + + try { + + String configPath = "./"; + CategoryGuesser guesser = new CategoryGuesser(configPath); + + + //bench 1 + System.out.println("----------------------BENCH 1-------------------------"); + String seriesName = "import_532bba80_1c8f_11df_a4ee_87804054691e"; + String 
column = "field2"; + LexicalEngineConfiguration conf = new LexicalEngineConfiguration(); + conf.setCategoryDiscardDifferencialThreshold(10); + conf.setCategoryDiscardThreshold(0); + conf.setChunkSize(25); + conf.setEntryAcceptanceThreshold(50); + conf.setNumberOfThreadsToUse(2); + conf.setRandomTake(true); + conf.setReferenceChunksToTake(20); + conf.setTimeSeriesChunksToTake(1); + conf.setUseSimpleDistance(false); + + //database Parameters + conf.setDatabaseUserName("root"); + conf.setDatabasePassword("ash_ash80"); + conf.setDatabaseDriver("com.mysql.jdbc.Driver"); + conf.setDatabaseURL("jdbc:mysql://localhost/timeseries"); + conf.setDatabaseDialect("org.hibernate.dialect.MySQLDialect"); + conf.setDatabaseAutomaticTestTable("connectiontesttable"); + conf.setDatabaseIdleConnectionTestPeriod("3600"); + + //reference parameters + conf.setReferenceTable("reference_table"); + conf.setReferenceColumn("table_name"); + conf.setIdColumn("id"); + conf.setNameHuman("name_human"); + conf.setDescription("description"); + + guesser.init(conf); + + guesser.runGuesser(seriesName, column, conf); + ArrayList results = guesser.getClassification(); + CategoryGuesser.showResults(results); + + System.out.println("--------------------END BENCH 1-----------------------\n"); + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/Category.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/Category.java new file mode 100644 index 0000000..4981482 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/Category.java @@ -0,0 +1,71 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data; + + +import java.math.BigInteger; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.interfaces.Reference; + +public class Category implements Reference { + + + public static void main(String[] args) { + // TODO 
Auto-generated method stub + + } + private String categoryName; + private String categoryIndex; + private String tableName; + private String description; + private BigInteger numberOfElements; + + public Category(String name,String index,String tablename,String descr){ + categoryName=name; + categoryIndex=index; + tableName=tablename; + description=descr; + } + + public void setName(String categoryName) { + this.categoryName = categoryName; + } + + public String getName() { + return categoryName; + } + + public void setIndex(String categoryIndex) { + this.categoryIndex = categoryIndex; + } + + public String getIndex() { + return categoryIndex; + } + + public void setTableName(String tableName) { + this.tableName = tableName; + } + + public String getTableName() { + return tableName; + } + + public void setDescription(String description) { + this.description = description; + } + + public String getDescription() { + return description; + } + + public String toString(){ + return "["+categoryName+": index "+categoryIndex+" table "+tableName+" description "+description+"]"; + } + + public void setNumberOfElements(BigInteger numberOfElements) { + this.numberOfElements = numberOfElements; + } + + public BigInteger getNumberOfElements() { + return numberOfElements; + } +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/CategoryOrderedList.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/CategoryOrderedList.java new file mode 100644 index 0000000..826c5ae --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/CategoryOrderedList.java @@ -0,0 +1,79 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data; + + +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.HashMap; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import 
org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.interfaces.Reference; + +public class CategoryOrderedList { + + // lista ordinata in ordine decrescente + ArrayList orderedList; + HashMap orderedListTable; + private HashMap scoresTable; + + + public void setOrderedList(ArrayList OrderedList){ + orderedList = OrderedList; + } + public HashMap getScoresTable() { + return scoresTable; + } + + public void setCategoryTable( HashMap OrderedListTable ) { + orderedListTable = OrderedListTable ; + } + + public Reference getCategory ( String categoryName ) { + return orderedListTable.get(categoryName); + } + + public ArrayList getOrderedList() { + return orderedList; + } + + LexicalEngineConfiguration config; + + public CategoryOrderedList(LexicalEngineConfiguration Config) { + orderedList = new ArrayList(); + scoresTable = new HashMap(); + config = Config; + orderedListTable = new HashMap(); + } + + public void addCategory(Category c) { + + BigInteger nElements = c.getNumberOfElements(); + int index = 0; + + for (Reference cc : orderedList) { + BigInteger localnum = cc.getNumberOfElements(); + if (localnum.compareTo(nElements) < 0) { + break; + } + index++; + } + orderedList.add(index, c); + scoresTable.put(c.getName(), new CategoryScores(c.getNumberOfElements(),config)); + orderedListTable.put(c.getName(), c); +// scoresTable.put(c.getName(), new CategoryScores()); + } + + public CategoryOrderedList generateNovelList(){ + CategoryOrderedList newCatList = new CategoryOrderedList(config); + newCatList.setOrderedList(orderedList); + newCatList.setCategoryTable(orderedListTable); + + for (String key:scoresTable.keySet()){ + CategoryScores ct = scoresTable.get(key); + CategoryScores ctnew = new CategoryScores(ct.getCategoryElements(), config); + newCatList.getScoresTable().put(key,ctnew); + } + + return newCatList; + } + +} \ No newline at end of file diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/CategoryScores.java 
b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/CategoryScores.java new file mode 100644 index 0000000..285b790 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/CategoryScores.java @@ -0,0 +1,205 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data; + + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.HashMap; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; +import org.gcube.contentmanagement.lexicalmatcher.utils.MathFunctions; + +//score relative to a certain category and column + +public class CategoryScores { + + // column names vs percentage + private HashMap columnsScore; + + private int matchedElements; + private BigInteger maxElements; + private BigInteger categoryElements; + private LexicalEngineConfiguration config; + + public CategoryScores(BigInteger catElements, LexicalEngineConfiguration Config) { + columnsScore = new HashMap(); + matchedElements = 0; + setCategoryElements(catElements); + config = Config; + maxElements = calculateMaxElements(catElements); + } + + public double calculateCoverage(){ + + double bd = new BigDecimal(matchedElements).divide(new BigDecimal(maxElements), 2, BigDecimal.ROUND_FLOOR).doubleValue(); + + //lower poor categories + if (maxElements.compareTo(BigInteger.valueOf(config.chunkSize))<=0) + bd = bd *0.8; + + //To-DO take into observation!!! 
+ //higher very big set coverage + if (categoryElements.compareTo(BigInteger.valueOf(10000))>0) + bd = Math.max(0.01, bd); + + return bd; + } + + private BigInteger calculateMaxElements(BigInteger catElements){ + BigInteger maxElements = BigInteger.ZERO; + + int maxNumberOfChunks = config.ReferenceChunksToTake; + int chunkSize = config.chunkSize; + int numberofcycles=0; + + if (maxNumberOfChunks<0) + return catElements; + try{ + BigDecimal intcycles; + BigDecimal oddcycles; + BigDecimal catElementsDecimal = new BigDecimal(catElements); + BigDecimal[] arraydecimal = catElementsDecimal.divideAndRemainder(new BigDecimal(BigInteger.valueOf(chunkSize))); + intcycles = arraydecimal[0]; + oddcycles = arraydecimal[1]; + numberofcycles = intcycles.intValue(); + if ((numberofcycles==0)&&(oddcycles.intValue() > 0)) { + numberofcycles = numberofcycles + 1; + maxElements = oddcycles.toBigInteger(); + } + else{ + if (numberofcycles>maxNumberOfChunks) + numberofcycles = maxNumberOfChunks; + + maxElements = BigInteger.valueOf(chunkSize).multiply(BigInteger.valueOf(numberofcycles)); + } + + }catch(Exception e){} + + return maxElements; + } + + + public String showScores(){ + return columnsScore.toString()+":"+calculateCoverage(); //+" - "+matchedElements+" vs "+maxElements; + } + + public void incrementScore(String columnName,float increment,boolean doIncrementMathes) { + + Float score = columnsScore.get(columnName); + + if (score==null) + score =new Float(0); + + score = MathFunctions.incrementPerc(score, increment, matchedElements); + + if (doIncrementMathes) + matchedElements ++; + + columnsScore.put(columnName, score); + } + + + public float getScore(String columnName,boolean simpleMatch) { + + if (simpleMatch){ + return getSimpleScore(columnName); + } + else + return getScore(columnName); + } + + + public float getScore(String columnName) { + + Float score = null; + try { +// score = columnsScore.get(columnName)*(float)calculateCoverage(); + score = 
columnsScore.get(columnName); + if (score!=null){ + return score*(float)calculateCoverage(); + } + } catch (Exception e) { + } + return score; + + } + + public float getSimpleScore(String columnName) { + + Float score = null; + try { +// score = columnsScore.get(columnName)*(float)calculateCoverage(); + score = columnsScore.get(columnName); + if (score!=null){ + return score; + } + } catch (Exception e) { + } + return score; + + } + + // take the best performing column + public String findBest() { + + String bestCol = null; + Float bestscore = Float.valueOf(-1); + + for (String column : columnsScore.keySet()) { + + Float score = new Float(0); + try { + score = columnsScore.get(column); + } catch (Exception e) { + AnalysisLogger.getLogger().error("ERROR in getting SCORE " + e.getLocalizedMessage()); + } + if (bestscore.compareTo(score) < 0) { + bestscore = score; + bestCol = column; + } + } + + return bestCol; + } + + // take the best performing columns + public ArrayList findBestList() { + + ArrayList bestCols = new ArrayList(); + + for (String column : columnsScore.keySet()) { + + Float score = new Float(0); + + try { + score = columnsScore.get(column); + } catch (Exception e) { + AnalysisLogger.getLogger().error("ERROR in getting SCORE " + e.getLocalizedMessage()); + } + + // find best place where to put column + int size = bestCols.size(); + int index = size; + for (int i = 0; i < size; i++) { + if (columnsScore.get(bestCols.get(i)).compareTo(score) <= 0) { + index = i; + break; + } + } + bestCols.add(index, column); + + } + + return bestCols; + } + + public void setCategoryElements(BigInteger categoryElements) { + this.categoryElements = categoryElements; + } + + public BigInteger getCategoryElements() { + return categoryElements; + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/CategoryScoresOld.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/CategoryScoresOld.java new file mode 100644 index 
0000000..4f39044 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/CategoryScoresOld.java @@ -0,0 +1,123 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data; + + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.HashMap; + +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + +//score relative to a certain category and column + +public class CategoryScoresOld { + + // column names vs percentage + private HashMap columnsScore; + + private BigDecimal maximumElements; + + public CategoryScoresOld(BigInteger maxelements) { + this.maximumElements = new BigDecimal(maxelements); + columnsScore = new HashMap(); + } + + public void setMaximumElements(BigDecimal MaximumElements) { + maximumElements = MaximumElements; + } + + public void incrementScore(String columnName,float increment) { + + BigDecimal score = columnsScore.get(columnName); + + BigDecimal reciproc = BigDecimal.valueOf(increment); + + if (score == null) { + // build up a new score : 1/TOTAL + score = reciproc; + } else { + score = score.add(reciproc); + } + columnsScore.put(columnName, score); + // AnalysisLogger.getLogger().debug("CategoryOrderedList->checkUnkEntriesOnEntireCategory-> SCORE "+score); + } + + public double getScore(String columnName) { + + double score = 0; + try { + + BigDecimal percentage = columnsScore.get(columnName); + try { + if (percentage == null) + percentage = BigDecimal.ZERO; + + AnalysisLogger.getLogger().trace("getScore -> Score for "+columnName+": " + percentage + " vs " + maximumElements); + percentage = percentage.divide(maximumElements, 2, BigDecimal.ROUND_DOWN); + } catch (ArithmeticException e) { + percentage = BigDecimal.ZERO; + e.printStackTrace(); + } + + score = percentage.doubleValue(); + } catch (Exception e) { + } + return score; + + } + + // take the best performing column + public String findBest() { + + String bestCol = null; 
+ BigDecimal bestscore = BigDecimal.valueOf(-1); + + for (String column : columnsScore.keySet()) { + + BigDecimal score = BigDecimal.ZERO; + try { + score = columnsScore.get(column); + } catch (Exception e) { + AnalysisLogger.getLogger().error("ERROR in getting SCORE " + e.getLocalizedMessage()); + } + if (bestscore.compareTo(score) < 0) { + bestscore = score; + bestCol = column; + } + } + + return bestCol; + } + + // take the best performing columns + public ArrayList findBestList() { + + ArrayList bestCols = new ArrayList(); + + for (String column : columnsScore.keySet()) { + + BigDecimal score = BigDecimal.ZERO; + + try { + score = columnsScore.get(column); + } catch (Exception e) { + AnalysisLogger.getLogger().error("ERROR in getting SCORE " + e.getLocalizedMessage()); + } + + // find best place where to put column + int size = bestCols.size(); + int index = size; + for (int i = 0; i < size; i++) { + if (columnsScore.get(bestCols.get(i)).compareTo(score) <= 0) { + index = i; + break; + } + } + bestCols.add(index, column); + + } + + return bestCols; + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/DBObjectTranslator.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/DBObjectTranslator.java new file mode 100644 index 0000000..7dcbf7c --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/DBObjectTranslator.java @@ -0,0 +1,272 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data; + +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.List; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.DataTypeRecognizer; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; +import org.gcube.contentmanagement.lexicalmatcher.utils.DatabaseFactory; +import org.hibernate.SessionFactory; + +public class DBObjectTranslator { + + public static void main(String[] args) { + + } + + public ArrayList 
relations; + public ArrayList categories; + + public BigInteger totalEntries; + public BigInteger totalCatElements; + public BigInteger totalRelationElements; + + public DBObjectTranslator() { + relations = new ArrayList(); + categories = new ArrayList(); + totalCatElements = BigInteger.ZERO; + totalRelationElements = BigInteger.ZERO; + totalEntries = BigInteger.ZERO; + } + + public BigInteger calculateTotalEntries(SessionFactory dbSession, String timeSeriesName, String timeSeriesColumn) { + + BigInteger count = BigInteger.ZERO; + String query = "select count(*) from (SELECT distinct " + timeSeriesColumn + " FROM " + timeSeriesName + ") r;"; + // String query = "SELECT count(*) FROM " + timeSeriesName.toLowerCase(); + + List resultSet = DatabaseFactory.executeSQLQuery(query, dbSession); + + for (Object result : resultSet) { + + try { + BigInteger resultcount = (BigInteger) result; + totalEntries = totalEntries.add(resultcount); + count = resultcount; + AnalysisLogger.getLogger().trace("DBObjectTranslator->calculateTotalEntries: Time Series " + timeSeriesName + " total " + totalEntries); + } catch (Exception e) { + } + } + + return count; + } + + public ArrayList retrieveTimeSeriesEntries(SessionFactory dbSession, String timeSeriesName, String timeSeriesColumn, BigInteger min, int numberOfElements) { + + // String query = "SELECT distinct "+timeSeriesColumn+" FROM "+timeSeriesName+" r limit "+min+","+numberOfElements; + String query = "SELECT distinct " + timeSeriesColumn + " FROM " + timeSeriesName + " r limit " + numberOfElements + " offset " + min; + AnalysisLogger.getLogger().trace("DBObjectTranslator->query: " + query); + + List resultSet = DatabaseFactory.executeSQLQuery(query, dbSession); + ArrayList column = new ArrayList(); + + for (Object result : resultSet) { + try { + String value = ""; + if (result != null) + value = result.toString(); + + column.add(value); + + // AnalysisLogger.getLogger().debug("DBObjectTranslator->retrieveColumnRange: Column Element 
Added " + value); + } catch (Exception e) { + e.printStackTrace(); + AnalysisLogger.getLogger().trace("DBObjectTranslator->retrieveTimeSeriesEntries: Error in adding entry :" + e.getLocalizedMessage()); + } + } + + AnalysisLogger.getLogger().trace("DBObjectTranslator->retrieveColumnRange: Column " + column.toString()); + + return column; + } + + public ArrayList retrieveEntries(SessionFactory dbSession, String timeSeriesName, BigInteger min, int numberOfElements) { + + // clean previous entries + ArrayList currentEntries = new ArrayList(); + + ArrayList descriptions = new ArrayList(); + ArrayList types = new ArrayList(); + /* + * SELECT table_name,ordinal_position,column_name,data_type, is_nullable,character_maximum_length FROM information_schema.COLUMNS WHERE table_name ='ref_area'; + */ + + String queryDesc = "SELECT table_name,ordinal_position,column_name,data_type, is_nullable,character_maximum_length FROM information_schema.COLUMNS WHERE table_name ='" + timeSeriesName.toLowerCase() + "'"; + + List resultSetDesc = DatabaseFactory.executeSQLQuery(queryDesc, dbSession); + for (Object result : resultSetDesc) { + Object[] resultArray = (Object[]) result; + descriptions.add((String) resultArray[2]); + types.add(DataTypeRecognizer.transformTypeFromDB((String) resultArray[3])); + } + + if (descriptions.size() > 0) { + // String query = "SELECT DISTINCT * FROM " + timeSeriesName + " r where id>=" + min.toString() + " and id<=" + max.toString(); + // String query = "SELECT DISTINCT * FROM " + timeSeriesName + " r limit "+min+","+numberOfElements; + String query = "SELECT DISTINCT * FROM " + timeSeriesName + " r limit " + numberOfElements + " offset " + min; + AnalysisLogger.getLogger().trace("DBObjectTranslator->retrieveEntries: query " + query); + + List resultSet = DatabaseFactory.executeSQLQuery(query, dbSession); + + for (Object result : resultSet) { + Entry entry = new Entry(); + try { + Object[] resultArray = (Object[]) result; + int i = 0; + for (Object res : 
resultArray) { + // build entry + String value = ""; + if (res != null) + value = res.toString(); + + entry.addAttribute(descriptions.get(i), value); + entry.addType(descriptions.get(i), types.get(i)); + i++; + } + // add entry + currentEntries.add(entry); + // AnalysisLogger.getLogger().debug("DBObjectTranslator->retrieveEntries: Entry Added " + entry.toString()); + } catch (Exception e) { + // e.printStackTrace(); + AnalysisLogger.getLogger().trace("DBObjectTranslator->retrieveEntries: Error in adding entry :" + e.getLocalizedMessage()); + } + } + } + +// AnalysisLogger.getLogger().trace("DBObjectTranslator->retrieveEntries: Entries " + currentEntries); + return currentEntries; + } + + public void buildRelationsEdges(SessionFactory dbSession) { + + String query = "select * from relation_table;"; + List resultSet = DatabaseFactory.executeSQLQuery(query, dbSession); + for (Object result : resultSet) { + Object[] resultArray = (Object[]) result; + RelationEdge re = null; + try { + re = new RelationEdge(((String) resultArray[2]), "" + resultArray[0], "" + resultArray[1]); + } catch (Exception e) { + e.printStackTrace(); + } + if (re != null) { + relations.add(re); + AnalysisLogger.getLogger().trace("DBObjectTranslator->buildRelationsEdges: add relation " + re.toString()); + } + } + } + + public void buildCategories(SessionFactory dbSession, String referenceTable, String referenceColumn, String idColumn, String nameHuman, String description) { + + referenceTable = referenceTable == null ? "reference_table" : referenceTable; + referenceColumn = referenceColumn == null ? "table_name" : referenceColumn; + nameHuman = nameHuman == null ? "name_human" : nameHuman; + idColumn = idColumn == null ? "id" : idColumn; + description = description == null ? 
"description" : description; + + String query = "SELECT " + nameHuman + "," + idColumn + "," + referenceColumn + "," + description + " FROM " + referenceTable + " r;"; + List resultSet = DatabaseFactory.executeSQLQuery(query, dbSession); + if (resultSet != null) { + for (Object result : resultSet) { + Object[] resultArray = (Object[]) result; + Category cat = null; + try { + // name_human, id, table_name,description + cat = new Category("" + resultArray[0], "" + resultArray[1], "" + resultArray[2], "" + resultArray[3]); + } catch (Exception e) { + e.printStackTrace(); + } + if (cat != null) { + categories.add(cat); + AnalysisLogger.getLogger().trace("DBObjectTranslator->buildCategories: add category " + cat.toString()); + } + } + } + } + + public Category getCategoryfromIndex(String index) { + + Category cat = null; + for (Category c : categories) { + + if (c.getIndex().equals(index)) { + cat = c; + break; + } + } + + return cat; + } + + public void populateRelationWithCategories() { + + for (RelationEdge re : relations) { + + Category from = getCategoryfromIndex(re.getFrom()); + Category to = getCategoryfromIndex(re.getTo()); + re.setCategoryFrom(from.getName()); + re.setCategoryTo(to.getName()); + AnalysisLogger.getLogger().trace("DBObjectTranslator->populateRelationWithCategories: modified Relation " + re.toString()); + } + } + + public void calculateRelationWeights(SessionFactory dbSession) { + + for (RelationEdge re : relations) { + + String query = "SELECT count(*) FROM " + re.getName().toLowerCase(); + + List resultSet = DatabaseFactory.executeSQLQuery(query, dbSession); + for (Object result : resultSet) { + + try { + BigInteger resultcount = (BigInteger) result; + re.setWeigth(resultcount); + totalRelationElements = totalRelationElements.add(resultcount); + AnalysisLogger.getLogger().trace("DBObjectTranslator->calculateRelationWeights: Relation " + re.getName() + " weight " + re.getWeigth()); + } catch (Exception e) { + } + } + } + } + + public void 
calculateCategoriesWeights(SessionFactory dbSession) { + + for (Category cat : categories) { + + String query = "SELECT count(*) FROM " + cat.getTableName().toLowerCase(); + + List resultSet = DatabaseFactory.executeSQLQuery(query, dbSession); + + for (Object result : resultSet) { + + try { + BigInteger resultcount = (BigInteger) result; + cat.setNumberOfElements(resultcount); + totalCatElements = totalCatElements.add(resultcount); + AnalysisLogger.getLogger().trace("DBObjectTranslator->calculateCategoriesWeights: Category " + cat.getName() + " weight " + cat.getNumberOfElements() + " total " + totalCatElements); + } catch (Exception e) { + } + } + } + } + + public void buildCategoriesStructure(SessionFactory dbSession, String referenceTable, String referenceColumn, String idColumn, String nameHuman, String description) { + buildCategories(dbSession, referenceTable, referenceColumn, idColumn, nameHuman, description); + calculateCategoriesWeights(dbSession); + AnalysisLogger.getLogger().trace("DBObjectTranslator->buildWholeStructure: Total Categories Elements " + totalCatElements + " Total Relation Elements " + totalRelationElements); + } + + public void buildWholeStructure(SessionFactory dbSession, String referenceTable, String referenceColumn, String idColumn, String nameHuman, String description) { + + buildRelationsEdges(dbSession); + buildCategories(dbSession, referenceTable, referenceColumn, idColumn, nameHuman, description); + populateRelationWithCategories(); + calculateRelationWeights(dbSession); + calculateCategoriesWeights(dbSession); + + AnalysisLogger.getLogger().trace("DBObjectTranslator->buildWholeStructure: Total Categories Elements " + totalCatElements + " Total Relation Elements " + totalRelationElements); + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/Entry.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/Entry.java new file mode 100644 index 0000000..3eafcfc --- /dev/null +++ 
b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/Entry.java @@ -0,0 +1,49 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data; + +import java.util.HashMap; + +//a single entry from a category +public class Entry { + + HashMap attributes; + HashMap types; + + public HashMap getAttributes(){ + return attributes; + } + + public HashMap getTypes(){ + return types; + } + + public void addAttribute(String column,String value){ + if (value==null) + value = ""; + + attributes.put(column, value); + } + + public void addType(String column,String value){ + if (value==null) + value = ""; + + types.put(column, value); + } + + public Entry(){ + attributes = new HashMap(); + types = new HashMap(); + } + + public String toString(){ + + StringBuffer returningString = new StringBuffer(); + returningString.append("{"); + for (String att: attributes.keySet()){ + String value = attributes.get(att); + returningString.append(att+"="+value+"|"+types.get(att).toUpperCase()+"; "); + } + returningString.append("}"); + return returningString.toString(); + } +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/RelationEdge.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/RelationEdge.java new file mode 100644 index 0000000..98035e4 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/RelationEdge.java @@ -0,0 +1,71 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data; + +import java.math.BigInteger; + +public class RelationEdge { + + /** + * @param args + */ + public static void main(String[] args) { + // TODO Auto-generated method stub + + } + + private String relationName; + private String indexFrom; + private String indexTo; + private BigInteger weight; + + private String categoryFrom; + private String categoryTo; + + + public BigInteger getWeigth(){ + return weight; + } + + public void setWeigth(BigInteger Weight){ + weight = 
Weight; + } + + public String getTo(){ + return indexTo; + } + public String getFrom(){ + return indexFrom; + } + public String getName(){ + return relationName; + } + public void setName(String name){ + relationName = name; + } + + public RelationEdge(String name,String from,String to){ + relationName = name; + indexFrom = from; + indexTo = to; + } + @Override + public String toString(){ + return "["+relationName+": from "+indexFrom+" to " +indexTo+" nameFrom "+categoryFrom+" nameTo "+categoryTo+"]"; + } + + public void setCategoryFrom(String categoryFrom) { + this.categoryFrom = categoryFrom; + } + + public String getCategoryFrom() { + return categoryFrom; + } + + public void setCategoryTo(String categoryTo) { + this.categoryTo = categoryTo; + } + + public String getCategoryTo() { + return categoryTo; + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/SingleResult.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/SingleResult.java new file mode 100644 index 0000000..2689ce5 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/SingleResult.java @@ -0,0 +1,65 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data; + + +public class SingleResult { + private String category; + private String column; + + private String tablename; + private String familyID; + + private double score; + + public void setCategory(String category) { + this.category = category; + } + public String getCategory() { + return category; + } + public void setColumn(String column) { + this.column = column; + } + public String getColumn() { + return column; + } + public void setScore(double score) { + this.score = score; + } + public double getScore() { + return score; + } + + public String getStringScore() { + double scored = Math.round((int)(score*100))/(double)100; + + return ""+scored; + } + + public String toString(){ + double scored = 
Math.round((int)(score*100))/(double)100; + if (column!=null) + return category+"="+column+":"+scored+" tab:"+tablename+":"+familyID; + else + return category+"="+":"+scored; + } + + public SingleResult (String Category,String Column,double Score, String TableName,String FamilyID){ + category = Category; + column = Column; + score = Score; + tablename = TableName; + familyID = FamilyID; + } + public void setTablename(String tablename) { + this.tablename = tablename; + } + public String getTablename() { + return tablename; + } + public void setFamilyID(String familyID) { + this.familyID = familyID; + } + public String getFamilyID() { + return familyID; + } +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/TSObjectTransformer.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/TSObjectTransformer.java new file mode 100644 index 0000000..0cb9ab0 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/data/TSObjectTransformer.java @@ -0,0 +1,80 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data; + +import java.math.BigDecimal; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.graph.GraphFramer; + + + +public class TSObjectTransformer { + + /** + * @param args + */ + public static void main(String[] args) { + // TODO Auto-generated method stub + + } + + public static CategoryOrderedList transform2List(DBObjectTranslator dbo, LexicalEngineConfiguration config){ + return transform2List(dbo,config,null); + } + + public static CategoryOrderedList transform2List(DBObjectTranslator dbo, LexicalEngineConfiguration config, String filter){ + CategoryOrderedList col = new CategoryOrderedList(config); + for (Category cat:dbo.categories){ + if ((filter==null) || filter.equalsIgnoreCase(cat.getName())) + col.addCategory(cat); + } + return col; + } 
+ + + + public static void transform2Graph(DBObjectTranslator dbo){ + + GraphFramer starter = new GraphFramer("Time Series Graph"); + BigDecimal total = new BigDecimal(dbo.totalCatElements); +// total = new BigDecimal(100).divide(total,2,BigDecimal.ROUND_HALF_UP); + for (Category cat:dbo.categories){ + + BigDecimal bd = new BigDecimal(cat.getNumberOfElements()); + + bd = bd.divide(total,4,BigDecimal.ROUND_HALF_UP); + bd = bd.multiply(new BigDecimal(100)); + bd = bd.setScale(2,BigDecimal.ROUND_HALF_UP); +// double perc = bd.doubleValue()*100; + + String builtname = cat.getName()+":"+bd+"% "; + + starter.graphDisplayer.addVertex(builtname); + } + for (RelationEdge rel:dbo.relations){ + Category cat = dbo.getCategoryfromIndex(rel.getFrom()); + BigDecimal bd = new BigDecimal(cat.getNumberOfElements()); + bd = bd.divide(total,4,BigDecimal.ROUND_HALF_UP); + bd = bd.multiply(new BigDecimal(100)); + bd = bd.setScale(2,BigDecimal.ROUND_HALF_UP); +// double perc = bd.doubleValue()*100; + + String name1 = cat.getName()+":"+bd+"% "; + + cat = dbo.getCategoryfromIndex(rel.getTo()); + bd = new BigDecimal(cat.getNumberOfElements()); + bd = bd.divide(total,4,BigDecimal.ROUND_HALF_UP); + bd = bd.multiply(new BigDecimal(100)); + bd = bd.setScale(2,BigDecimal.ROUND_HALF_UP); +// perc = bd.doubleValue()+100; + + String name2 = cat.getName()+":"+bd+"% "; + starter.graphDisplayer.addEdge(name1,name2,new BigDecimal(rel.getWeigth()).divide(new BigDecimal(dbo.totalCatElements),2,BigDecimal.ROUND_HALF_UP).multiply(new BigDecimal(100)).doubleValue()); +// starter.graphDisplayer.addEdge(name1,name2,0); + } + +// starter.graphDisplayer.generateRandomGraph(); + starter.graphDisplayer.generateUpTo5StarGraph(); + + starter.go(); + } +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/interfaces/Reference.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/interfaces/Reference.java new file mode 100644 index 0000000..d599f5b --- /dev/null +++ 
/**
 * Accessor contract for a reference item: a name, an index, a table name,
 * a description and an element count.
 */
public interface Reference {

    /** @return the name previously set with {@link #setName(String)} */
    String getName();

    /** @param categoryName the name to store */
    void setName(String categoryName);

    /** @return the index previously set with {@link #setIndex(String)} */
    String getIndex();

    /** @param categoryIndex the index to store */
    void setIndex(String categoryIndex);

    /** @return the table name previously set with {@link #setTableName(String)} */
    String getTableName();

    /** @param tableName the table name to store */
    void setTableName(String tableName);

    /** @return the description previously set with {@link #setDescription(String)} */
    String getDescription();

    /** @param description the description to store */
    void setDescription(String description);

    /** @return the element count previously set with {@link #setNumberOfElements(BigInteger)} */
    BigInteger getNumberOfElements();

    /** @param numberOfElements the element count to store */
    void setNumberOfElements(BigInteger numberOfElements);

    /** @return a textual form of this reference */
    String toString();

}
org.gcube.contentmanagement.lexicalmatcher.analysis.core.Engine; +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; +import org.gcube.contentmanagement.lexicalmatcher.utils.MathFunctions; + +public abstract class ChunkSet { + + + protected String seriesName; + protected String seriesColumn; + protected int chunkSize; + private int maxNumberOfChunks; + ArrayList chunkSet; + protected int chunkSetSize; + protected BigInteger numberOfEntries; + protected int chunkIndex; + protected LexicalEngineConfiguration config; + protected Engine engine; + + public ChunkSet(int MaxNumberOfChunks, int ChunkSize, String SeriesName,String SeriesColumn, LexicalEngineConfiguration Config, Engine engine) throws Exception{ + this.engine = engine; + config = Config; + setSeriesName(SeriesName); + setSeriesColumn(SeriesColumn); + setChunkSize(ChunkSize); + maxNumberOfChunks = MaxNumberOfChunks; + generateChunkSet(); + + } + + public ChunkSet(int MaxNumberOfChunks, int ChunkSize, String SeriesName,String SeriesColumn,BigInteger numberOfEntries,LexicalEngineConfiguration Config , Engine engine) throws Exception{ + this.engine = engine; + config = Config; + setSeriesName(SeriesName); + setSeriesColumn(SeriesColumn); + setChunkSize(ChunkSize); + setNumberOfEntries(numberOfEntries); + maxNumberOfChunks = MaxNumberOfChunks; + generateChunkSet(); + + + } + + + + public void generateChunkSet() throws Exception{ + + AnalysisLogger.getLogger().trace("ChunkSet->generateChunkSet-> \tGenerating Chunk Set for " + seriesName+ " "+seriesColumn); + int numberOfChunks = calculateNumberOfCycles(); + //generate chunks to be processed + chunkSet = MathFunctions.generateRandoms(maxNumberOfChunks, 0, numberOfChunks); + chunkIndex = 0; + chunkSetSize = numberOfChunks; + } + + + abstract protected BigDecimal calculateNumberOfElements() throws Exception; + + + protected int calculateNumberOfCycles() 
throws Exception { + + int numberofcycles = 0; + + // calculate total entries in the time series + BigDecimal numberOfElements = calculateNumberOfElements(); + // calculate total cycles of comparison + BigDecimal intcycles; + BigDecimal oddcycles; + BigDecimal[] arraydecimal = numberOfElements.divideAndRemainder(new BigDecimal(BigInteger.valueOf(chunkSize))); + intcycles = arraydecimal[0]; + oddcycles = arraydecimal[1]; + numberofcycles = intcycles.intValue(); + if ((numberofcycles==0)&&(oddcycles.intValue() > 0)) numberofcycles = numberofcycles + 1; + + return numberofcycles; + + } + + public void setSeriesName(String seriesName) { + this.seriesName = seriesName; + } + + + public String getSeriesName() { + return seriesName; + } + + + public void setSeriesColumn(String seriesColumn) { + this.seriesColumn = seriesColumn; + } + + + public String getSeriesColumn() { + return seriesColumn; + } + + + public void setChunkSize(int chunkSize) { + this.chunkSize = chunkSize; + } + + + public int getChunkSize() { + return chunkSize; + } + + public void setNumberOfEntries(BigInteger numberOfEntries) { + this.numberOfEntries = numberOfEntries; + } + + public BigInteger getNumberOfEntries() { + return numberOfEntries; + } + + + + abstract public Object nextChunk(); + + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/chunks/ReferenceChunk.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/chunks/ReferenceChunk.java new file mode 100644 index 0000000..1f2ad15 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/chunks/ReferenceChunk.java @@ -0,0 +1,55 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks; + + +import java.math.BigInteger; +import java.util.ArrayList; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.Engine; +import 
org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.DBObjectTranslator; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.Entry; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; +import org.hibernate.SessionFactory; + +public class ReferenceChunk extends Chunk{ + + + + private String categoryName; + private String categoryTableName; + private ArrayList referenceEntries; + + private BigInteger startPoint; + private int chunkSize; + + public ReferenceChunk(String CategoryName, String CategoryTableName, BigInteger StartPoint, int ChunkSize, Engine engine){ + super(engine); + chunkSize = ChunkSize; + categoryName = CategoryName; + categoryTableName = CategoryTableName; + startPoint = StartPoint; + AnalysisLogger.getLogger().trace("ReferenceChunk-> \t\tTOOK CATEGORY CHUNK FOR CATEGORY: " + categoryName+" - index : "+startPoint); + } + + + //takes references on demand from DB + public ArrayList getReferenceEntries() throws Exception{ + + DBObjectTranslator dbo = new DBObjectTranslator(); + SessionFactory sess = engine.getDBSession(); +// AnalysisLogger.getLogger().debug("ReferenceChunk->getReferenceEntries-> \tCATEGORY CHUNK START : " + startPoint); + referenceEntries = dbo.retrieveEntries(sess, categoryTableName, startPoint, chunkSize); + return referenceEntries; + } + + public void setCategoryName(String categoryName) { + this.categoryName = categoryName; + } + public String getCategoryName() { + return categoryName; + } + + + + + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/chunks/ReferenceChunkSet.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/chunks/ReferenceChunkSet.java new file mode 100644 index 0000000..d847ebf --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/chunks/ReferenceChunkSet.java @@ -0,0 +1,51 @@ +package 
org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks; + + +import java.math.BigDecimal; +import java.math.BigInteger; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.Engine; +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import org.gcube.contentmanagement.lexicalmatcher.utils.MathFunctions; + +public class ReferenceChunkSet extends ChunkSet{ + + + public ReferenceChunkSet(int MaxNumberOfChunks, int ChunkSize, String CategoryName,String CategoryColumn, LexicalEngineConfiguration config, Engine engine) throws Exception{ + super(MaxNumberOfChunks, ChunkSize, CategoryName,CategoryColumn, config, engine); + } + + public ReferenceChunkSet(int MaxNumberOfChunks, int ChunkSize, String CategoryName, String CategoryTable, BigInteger numberOfCategoryElements, LexicalEngineConfiguration config, Engine engine) throws Exception{ + super(MaxNumberOfChunks, ChunkSize, CategoryName, CategoryTable, numberOfCategoryElements, config, engine); + } + + protected BigDecimal calculateNumberOfElements() throws Exception{ + // calculate total entries in the time series + BigDecimal numberOfElements = new BigDecimal(numberOfEntries); + return numberOfElements; + } + + + public ReferenceChunk nextChunk() { + + ReferenceChunk rc = null; + + while (!chunkSet.contains(chunkIndex) && (chunkIndex < chunkSetSize)) { + chunkIndex++; + } + if (chunkIndex < chunkSetSize) { + BigInteger startIndex = MathFunctions.chunk2Index(chunkIndex, chunkSize); + try { + rc = new ReferenceChunk(seriesName, seriesColumn , startIndex, chunkSize, engine); + } catch (Exception e) { + e.printStackTrace(); + } + } + + chunkIndex++; + return rc; + + } + + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/chunks/SetOfReferenceChunkSet.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/chunks/SetOfReferenceChunkSet.java new file mode 100644 index 
0000000..a2020af --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/chunks/SetOfReferenceChunkSet.java @@ -0,0 +1,41 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks; + + +import java.util.ArrayList; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.Engine; +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.interfaces.Reference; + +public class SetOfReferenceChunkSet { + + ArrayList orderedList; + int referenceIndex; + LexicalEngineConfiguration config; + Engine engine; + + public SetOfReferenceChunkSet(ArrayList OrderedList, LexicalEngineConfiguration Config, Engine engine){ + + this.engine = engine; + orderedList = OrderedList; + referenceIndex = 0; + config = Config; + } + + //filter selects only one of the categories + public ReferenceChunkSet getNextChunkSet(){ + ReferenceChunkSet cs = null; + if (orderedList.size()>referenceIndex){ + Reference ref = orderedList.get(referenceIndex); + try{ + cs = new ReferenceChunkSet(config.ReferenceChunksToTake,config.chunkSize,ref.getName(),ref.getTableName(),ref.getNumberOfElements(),config, engine); + }catch (Exception e){ + e.printStackTrace(); + } + referenceIndex++; + } + + return cs; + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/chunks/SingletonChunkSet.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/chunks/SingletonChunkSet.java new file mode 100644 index 0000000..fa98f73 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/chunks/SingletonChunkSet.java @@ -0,0 +1,52 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks; + + +import java.math.BigDecimal; +import java.math.BigInteger; + +import 
org.gcube.contentmanagement.lexicalmatcher.analysis.core.Engine; +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import org.gcube.contentmanagement.lexicalmatcher.utils.MathFunctions; + + +public class SingletonChunkSet extends ChunkSet { + + private String singletonString; + private String ColumnType; + public SingletonChunkSet(String SingletonString, LexicalEngineConfiguration config, Engine engine) throws Exception { + super(1, 1, null, null, config, engine); + singletonString = SingletonString; + ColumnType = null; + } + + protected BigDecimal calculateNumberOfElements() throws Exception { + // calculate total entries in the time series + BigDecimal numberOfElements = BigDecimal.ONE; + return numberOfElements; + } + + public TimeSeriesChunk nextChunk() { + + TimeSeriesChunk tsc = null; + + while (!chunkSet.contains(chunkIndex) && (chunkIndex < chunkSetSize)) { + chunkIndex++; + } + if (chunkIndex < chunkSetSize) { + BigInteger startIndex = MathFunctions.chunk2Index(chunkIndex, chunkSize); + + try { + tsc = new TimeSeriesChunk(singletonString, ColumnType, startIndex, chunkSize, config, engine); + if (ColumnType == null) { + ColumnType = tsc.getColumnType(); + } + } catch (Exception e) { + e.printStackTrace(); + } + } + chunkIndex++; + return tsc; + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/chunks/TimeSeriesChunk.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/chunks/TimeSeriesChunk.java new file mode 100644 index 0000000..86c816a --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/chunks/TimeSeriesChunk.java @@ -0,0 +1,167 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks; + + +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.HashMap; + +import 
org.gcube.contentmanagement.lexicalmatcher.analysis.core.DataTypeRecognizer; +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.Engine; +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.CategoryScores; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.DBObjectTranslator; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.Entry; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; +import org.gcube.contentmanagement.lexicalmatcher.utils.DistanceCalculator; +import org.hibernate.SessionFactory; + +public class TimeSeriesChunk extends Chunk{ + + + private ArrayList columnEntries; + private String columnType; + private LexicalEngineConfiguration config; + private boolean mustInterrupt; + private ArrayList detailedResults; + private String singletonElement; + private boolean isSingleton; + + public String getColumnType(){ + return columnType; + } + + public String getSingletonEntry(){ + return singletonElement; + } + + public ArrayList getDetailedResults(){ + return detailedResults; + } + public boolean isSingleton(){ + return isSingleton; + } + + public TimeSeriesChunk(String timeSeriesName, String timeSeriesColumn, String ColumnType, BigInteger start, int ChunkSize, LexicalEngineConfiguration Config, Engine engine) throws Exception{ + super(engine); + DBObjectTranslator dbo = new DBObjectTranslator(); + SessionFactory sess = engine.getDBSession(); + columnEntries = dbo.retrieveTimeSeriesEntries(sess, timeSeriesName, timeSeriesColumn, start, ChunkSize); + if (ColumnType==null){ + columnType = DataTypeRecognizer.guessType(columnEntries); + AnalysisLogger.getLogger().trace("TimeSeriesChunk-> GUESSED TYPE " + columnType + " FOR COLUMN "+timeSeriesColumn); + } + mustInterrupt = false; + config = 
Config; + isSingleton = false; + } + + public TimeSeriesChunk(String singletonString, String ColumnType, BigInteger start, int ChunkSize, LexicalEngineConfiguration Config, Engine engine) throws Exception{ + super(engine); + columnEntries = new ArrayList(); + columnEntries.add(singletonString); + if (ColumnType==null){ + columnType = DataTypeRecognizer.guessType(columnEntries); + AnalysisLogger.getLogger().trace("TimeSeriesChunk-> GUESSED TYPE " + columnType + " FOR SINGLETON "+singletonString); + } + mustInterrupt = false; + config = Config; + isSingleton = true; + singletonElement = singletonString; + detailedResults = new ArrayList(); + } + + + + public boolean mustInterruptProcess (){ + return this.mustInterrupt; + } + public void compareToReferenceChunk(HashMap scoresTable, ReferenceChunk catChunk) throws Exception { + compareToReferenceChunk(scoresTable, catChunk,null); + } + + // checks an entry set against a reference set + // columnEntries: column elements from unknown column + // cat: category analyzed for candidating to recognized + // referenceEntries: some elements belonging to cat, to be compared to columnEntries + public void compareToReferenceChunk(HashMap scoresTable, ReferenceChunk catChunk,String ColumnFilter) throws Exception { + + + //in the case of a singleton Chunk interrupt computation in case of exact match + + // get category Score for further processing + CategoryScores categoryScores = scoresTable.get(catChunk.getCategoryName()); + //extract Entries from DB + ArrayList categoryEntries = catChunk.getReferenceEntries(); + + for (String timeSeriesElement : columnEntries) { + // for each reference entry + for (Entry referenceEntry : categoryEntries) { + + // take all attributes of a reference entry for confrontation to columns + HashMap attributes = referenceEntry.getAttributes(); + HashMap types = referenceEntry.getTypes(); + boolean anotherReference= true; + + // for each attribute of an entry + for (String referenceColumn : 
attributes.keySet()) { + + // perform calculation only if the column type is the same + if (types.get(referenceColumn).equals(columnType)&&((ColumnFilter==null)||(ColumnFilter.equalsIgnoreCase(referenceColumn)))) { +// AnalysisLogger.getLogger().debug("CategoryOrderedList->checkAllEntriesOnEntireCategory-> REFERENCE COLUMN "+referenceColumn+" HAS TYPE "+types.get(referenceColumn)); + // take the attribute value of the entry + String attribute = attributes.get(referenceColumn); + // calculate the distance between the unknown entry and the attribute + DistanceCalculator d = new DistanceCalculator(); + double percentage = d.CD(config.useSimpleDistance, timeSeriesElement, attribute, isSingleton, isSingleton) * 100f; +// AnalysisLogger.getLogger().debug("CategoryOrderedList->checkUnkEntriesOnEntireCategory-> Percentage between " +timeSeriesElement + " and " + attribute + " is: "+percentage ); + // if they are similar + if (percentage > config.entryAcceptanceThreshold) { +// if (catChunk.getCategoryName().equals("COUNTRY_OLD")) + AnalysisLogger.getLogger().trace("TimeSeriesChunk->compareToCategoryChunk-> \t\tPercentage between " + timeSeriesElement + " vs. 
" + attribute + " is: " + percentage+" in "+catChunk.getCategoryName()+":"+referenceColumn); + + categoryScores.incrementScore(referenceColumn, (float)percentage,anotherReference); + + //if we are in a singleton we have to get the details + if (isSingleton){ + //for singleton match, fulfil details + int index =0; + for (SingleResult sr :detailedResults){ + + Double scoredetail = sr.getScore(); + + if (scoredetailcompareToCategoryChunk-> "+categoryScores.showScores()); + } + //if exact match is reached, exit + if ((percentage==100)&&(isSingleton)) + { + detailedResults = new ArrayList(); + detailedResults.add(new SingleResult(attribute, null, percentage,null,"0")); + mustInterrupt = true; + break; + } + } + } + + }// end for on columns + + if (mustInterrupt) + break; + + }// end for on entries + } + } + + + + + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/chunks/TimeSeriesChunkSet.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/chunks/TimeSeriesChunkSet.java new file mode 100644 index 0000000..f165742 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/chunks/TimeSeriesChunkSet.java @@ -0,0 +1,53 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks; + + +import java.math.BigDecimal; +import java.math.BigInteger; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.Engine; +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.DBObjectTranslator; +import org.gcube.contentmanagement.lexicalmatcher.utils.MathFunctions; +import org.hibernate.SessionFactory; + +public class TimeSeriesChunkSet extends ChunkSet { + + private String ColumnType; + + public TimeSeriesChunkSet(int MaxNumberOfChunks, int ChunkSize, String TimeSeriesName, String TimeSeriesColumn, 
LexicalEngineConfiguration config, Engine engine) throws Exception { + super(MaxNumberOfChunks, ChunkSize, TimeSeriesName, TimeSeriesColumn, config,engine); + ColumnType = null; + } + + protected BigDecimal calculateNumberOfElements() throws Exception { + // calculate total entries in the time series + DBObjectTranslator dbo = new DBObjectTranslator(); + SessionFactory sess = engine.getDBSession(); + BigDecimal numberOfElements = new BigDecimal(dbo.calculateTotalEntries(sess, seriesName, seriesColumn)); + return numberOfElements; + } + + public TimeSeriesChunk nextChunk() { + + TimeSeriesChunk tsc = null; + + while (!chunkSet.contains(chunkIndex) && (chunkIndex < chunkSetSize)) { + chunkIndex++; + } + if (chunkIndex < chunkSetSize) { + BigInteger startIndex = MathFunctions.chunk2Index(chunkIndex, chunkSize); + try { + tsc = new TimeSeriesChunk(seriesName, seriesColumn, ColumnType, startIndex, chunkSize, config, engine); + if (ColumnType == null) { + ColumnType = tsc.getColumnType(); + } + } catch (Exception e) { + e.printStackTrace(); + } + } + chunkIndex++; + return tsc; + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/CustomListenableDirectedWeightedGraph.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/CustomListenableDirectedWeightedGraph.java new file mode 100644 index 0000000..bdda6ad --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/CustomListenableDirectedWeightedGraph.java @@ -0,0 +1,25 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.graph; + +import org.jgrapht.graph.ListenableDirectedWeightedGraph; + +public class CustomListenableDirectedWeightedGraph extends ListenableDirectedWeightedGraph{ + + + public CustomListenableDirectedWeightedGraph(Class arg0) { + super(arg0); + } + + public void setEdgeWeight(E e, double weight) { + super.setEdgeWeight(e, weight); + + 
((CustomWeightedEdge)e).setWeight(weight); + } + + public E addEdge(V o1,V o2) { + E out = super.addEdge(o1,o2); + ((CustomWeightedEdge)out).setEdges(o1,o2); + + return out; + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/CustomWeightedEdge.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/CustomWeightedEdge.java new file mode 100644 index 0000000..3aa5ac1 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/CustomWeightedEdge.java @@ -0,0 +1,27 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.graph; + +import org.jgrapht.graph.DefaultWeightedEdge; + +import com.touchgraph.graphlayout.Edge; + +public class CustomWeightedEdge extends DefaultWeightedEdge{ + + @Override + public String toString(){ + return "["+o1+":"+o2+":"+weight+"%]"; + } + + private double weight; + private Object o1; + private Object o2; + + public void setWeight(double weight){ + this.weight = weight; + } + + public void setEdges(Object o1,Object o2){ + this.o1=o1; + this.o2=o2; + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/CustomWeightedVertex.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/CustomWeightedVertex.java new file mode 100644 index 0000000..9810ccd --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/CustomWeightedVertex.java @@ -0,0 +1,36 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.graph; + +import org.jgrapht.graph.DefaultWeightedEdge; + +import com.touchgraph.graphlayout.Edge; + +public class CustomWeightedVertex { + + @Override + public String toString() { + + return "[" + name + ":" + weight + "%]"; + } + + private double weight; + private String name; + + public CustomWeightedVertex(String name, double 
weight) { + this.weight = weight; + this.name = name; + } + + public CustomWeightedVertex(String name) { + this.weight = 0; + this.name = name; + } + + public boolean equals(CustomWeightedVertex v) { + + if (v.name.equals(name)) + return true; + else + return false; + + } +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/GraphDisplayer.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/GraphDisplayer.java new file mode 100644 index 0000000..8df91c1 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/GraphDisplayer.java @@ -0,0 +1,299 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.graph; + + +import java.awt.Color; +import java.awt.Dimension; +import java.awt.Rectangle; +import java.awt.geom.Rectangle2D; +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; +import java.util.Random; + +import javax.swing.JApplet; + +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; +import org.jgraph.JGraph; +import org.jgraph.graph.DefaultGraphCell; +import org.jgraph.graph.GraphConstants; +import org.jgrapht.ext.JGraphModelAdapter; + +public class GraphDisplayer extends JApplet { + private static final Color DEFAULT_BG_COLOR = Color.decode("#FAFBFF"); + private static final Dimension DEFAULT_SIZE = new Dimension(530, 320); + + private JGraphModelAdapter m_jgAdapter; + + public static int WIDTH = 1000; + public static int HEIGHT = 800; + + public static int WIDTHBOX = 1280; + public static int HEIGHTBOX = 1024; + + private int newxposition; + private int newyposition; + + private CustomListenableDirectedWeightedGraph g; + private int nodesCounter; + private static final int minx = 10; + private static final int miny = 10; + ArrayList VertexNames; + HashMap Edges; + + public void generatePosition(int lastxPosition, int 
lastyposition) { + + int rangex = (int) WIDTH - (int) lastxPosition; + // compute a fraction of the range, 0 <= frac < range + Random a = new Random(); + int newx = lastxPosition + 70 + (int) (rangex * a.nextDouble()); + int epsilon = 1; + int newy = (int) lastyposition + (int) (epsilon * 20f * Math.random()); + if (newx > WIDTH) + newx = WIDTH - 100; + if (newx < lastxPosition - 90) + newx = lastxPosition + 90; + if (newy > HEIGHT) + newy = HEIGHT - 10; + if (newy < 0) + newy = 0; + newxposition = newx; + newyposition = newy; + // System.out.println("LAST X "+lastxPosition+" NEW X "+newxposition); + // System.out.println("LAST Y "+lastyposition+" NEW Y "+newyposition); + } + + public void init() { + AnalysisLogger.getLogger().debug("INIZIALIZZATO!"); + + JGraph jgraph = new JGraph(m_jgAdapter); + + adjustDisplaySettings(jgraph); + getContentPane().add(jgraph); + resize(DEFAULT_SIZE); + + AnalysisLogger.getLogger().debug("RESIZED!"); + } + + public void generateGraph() { + + for (String v : VertexNames) { + genPositionVertex(v); + } + } + + public void generateRandomGraph() { + + for (String v : VertexNames) { + int randx = minx + (int) ((WIDTH - 100) * Math.random()); + int randy = miny + (int) ((HEIGHT - 100) * Math.random()); + positionVertexAt(v, randx, randy); + } + } + + public void generateUpTo5StarGraph() { + + // individua le star + HashMap vertexFrequencies = new HashMap(); + // calcolo le frequenze dei vertici + for (String edge : Edges.values()) { + System.out.println(edge + "-" + vertexFrequencies.get(edge)); + if (vertexFrequencies.get(edge) != null) { + int f = vertexFrequencies.get(edge).intValue(); + vertexFrequencies.put(edge, new Integer(f + 1)); + } else + vertexFrequencies.put(edge, new Integer(0)); + + } + + for (String vertex : VertexNames) { + + if (Edges.get(vertex) == null) { + boolean trovato = false; + // cerco ogni vertice tra gli archi + for (String starvertex : Edges.values()) { + if (vertex.equals(starvertex)) { + trovato = true; + 
break; + } + } + if (!trovato) { + System.out.println("aggiunto vertice isolato " + vertex); + vertexFrequencies.put(vertex, new Integer(0)); + } + } + + } + + System.out.println("FEQS " + vertexFrequencies.toString()); + // ordino le star + ArrayList starList = new ArrayList(); + for (String vertex : vertexFrequencies.keySet()) { + + int freq = vertexFrequencies.get(vertex); + int i = 0; + boolean trovato = false; + for (String element : starList) { + + int referfreq = vertexFrequencies.get(element); + if (referfreq < freq) { + starList.add(i, vertex); + trovato = true; + break; + } + i++; + } + if (!trovato) + starList.add(vertex); + } + + // dispongo le star nel layout + System.out.println(starList.toString()); + int bound = 200; + int[] boundedXIndexex = { bound, WIDTH - bound, bound, WIDTH - bound, WIDTH / 2 }; + int[] boundedYIndexex = { bound, bound, HEIGHT - bound, HEIGHT - bound, HEIGHT / 2 }; + int sizeStar = starList.size(); + // int sizeStar = 1; + + // distribuisco le star sul grafico + for (int i = 0; i < sizeStar; i++) { + + positionVertexAt(starList.get(i), boundedXIndexex[i], boundedYIndexex[i]); + + // calcolo il numero di elementi della stella + int countelems = 0; + for (String edge : Edges.keySet()) { + if (Edges.get(edge).equals(starList.get(i))) { + countelems++; + } + } + + if (countelems > 0) { + double subdivision = 360 / countelems; + double angle = 105f; + double radius = 200f; + System.out.println("Numero di elementi nella stella: " + countelems + " suddivisioni: " + subdivision); + for (String edge : Edges.keySet()) { + // dispongo gli elementi a stella + if (Edges.get(edge).equals(starList.get(i))) { + int currentx = boundedXIndexex[i]; + int currenty = boundedYIndexex[i]; + int epsilonx = (int) (radius * Math.cos(Math.toRadians(angle))); + int epsilony = (int) (radius * Math.sin(Math.toRadians(angle))); + System.out.println("angolo attuale: " + angle + " x0: " + currentx + " y0 " + currenty + " ex " + epsilonx + " ey " + epsilony); + 
positionVertexAt(edge, currentx + epsilonx, currenty + epsilony); + + angle += subdivision; + } + } + } + + } + + } + + private void genPositionVertex(String vertexName) { + + if (nodesCounter > 0) { + if ((nodesCounter % 2) == 0) { + newxposition = 10 + (int) (20f * Math.random()); + newyposition += 100; + } else + generatePosition(newxposition, newyposition); + } + + positionVertexAt(vertexName, newxposition, newyposition); + nodesCounter++; + } + + public GraphDisplayer() { + g = new CustomListenableDirectedWeightedGraph(CustomWeightedEdge.class); + m_jgAdapter = new JGraphModelAdapter(g); + VertexNames = new ArrayList(); + Edges = new HashMap(); + newxposition = minx; + newyposition = miny; + nodesCounter = 0; + } + + public void addVertex(String name) { + g.addVertex(name); + VertexNames.add(name); + } + + public void addEdge(String v1, String v2, double bi) { + CustomWeightedEdge ed = (CustomWeightedEdge)g.addEdge(v1,v2); + g.setEdgeWeight(ed,bi); + Edges.put(v1, v2); + } + + private void adjustDisplaySettings(JGraph jg) { + jg.setPreferredSize(DEFAULT_SIZE); + + Color c = DEFAULT_BG_COLOR; + String colorStr = null; + + try { + colorStr = getParameter("bgcolor"); + } catch (Exception e) { + } + + if (colorStr != null) { + c = Color.decode(colorStr); + } + + jg.setBackground(c); + } + + private void positionVertexAt(Object vertex, int x, int y) { + + // seleziono la cella chiamata vertex + DefaultGraphCell cell = m_jgAdapter.getVertexCell(vertex); + + + // recupero gli attributi della cella + Map attr = cell.getAttributes(); + // recupero i boundaries della cella + Rectangle2D b = GraphConstants.getBounds(attr); + // setto i parametri del nuovo rettangolo + GraphConstants.setBounds(attr, new Rectangle(x, y, (int) (((String)vertex).length()+50+b.getWidth()), (int) b.getHeight())); + // costruisco una nuova cella + Map cellAttr = new HashMap(); + cellAttr.put(cell, attr); + + // posiziono la cella nel grafo + m_jgAdapter.edit(cellAttr, null, null, null); + + } + 
+ public void start() { + repaint(); + + } + + public static void main(String[] args) { + + GraphFramer starter = new GraphFramer("Grafo"); + + // create a visualization using JGraph, via an adapter + String nodi[] = { "ciao", "come", "stai", "oggi", "domani", "dopodomani" }; + for (String nodo : nodi) { + starter.graphDisplayer.addVertex(nodo); + } + + for (int j = 0; j < nodi.length; j++) { + int i0 = (int) (nodi.length * Math.random()); + int i1 = (int) (nodi.length * Math.random()); + System.out.println("i0: " + i0 + " i1: " + i1); + if (i0 != i1) { + starter.graphDisplayer.addEdge(nodi[i0], nodi[i1],0); + } + } + + starter.graphDisplayer.generateGraph(); + + starter.go(); + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/GraphFramer.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/GraphFramer.java new file mode 100644 index 0000000..9db3a9f --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/GraphFramer.java @@ -0,0 +1,40 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.graph; + +import java.awt.Event; +import java.awt.Frame; + +public class GraphFramer extends Frame{ + + public GraphDisplayer graphDisplayer; + + public GraphFramer(String frameName){ + super(frameName); + graphDisplayer = new GraphDisplayer(); + add("Center",graphDisplayer); + + } + + public void go(){ + + graphDisplayer.init(); + + this.resize(GraphDisplayer.WIDTHBOX, GraphDisplayer.HEIGHTBOX); + this.show(); + graphDisplayer.start(); + + } + + public boolean HandleEvent(Event event){ + + if (event.id == Event.WINDOW_DESTROY) + + { + try + {graphDisplayer.stop(); + graphDisplayer.destroy(); + }catch(Exception e){e.printStackTrace();} + System.exit(0); + } + return false; + } +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/GraphGeneratorApplet.java 
b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/GraphGeneratorApplet.java new file mode 100644 index 0000000..b55cd02 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/GraphGeneratorApplet.java @@ -0,0 +1,106 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.graph; + +import java.awt.Color; +import java.awt.Dimension; +import java.awt.Rectangle; +import java.awt.geom.Rectangle2D; + +import java.util.HashMap; +import java.util.Map; + +import javax.swing.JApplet; +import javax.swing.JFrame; + +import org.jgraph.JGraph; +import org.jgraph.graph.AttributeMap; +import org.jgraph.graph.DefaultGraphCell; +import org.jgraph.graph.GraphConstants; + +import org.jgrapht.ListenableGraph; +import org.jgrapht.ext.JGraphModelAdapter; +import org.jgrapht.graph.ListenableDirectedGraph; +import org.jgrapht.graph.DefaultEdge; + + +public class GraphGeneratorApplet extends JApplet { + private static final Color DEFAULT_BG_COLOR = Color.decode("#FAFBFF"); + private static final Dimension DEFAULT_SIZE = new Dimension(530, 320); + + // + private JGraphModelAdapter m_jgAdapter; + + /** + * @see java.applet.Applet#init(). 
+ */ + public void init() { + // create a JGraphT graph + ListenableGraph g = new ListenableDirectedGraph(DefaultEdge.class); + + // create a visualization using JGraph, via an adapter + m_jgAdapter = new JGraphModelAdapter(g); + + JGraph jgraph = new JGraph(m_jgAdapter); + + adjustDisplaySettings(jgraph); + getContentPane().add(jgraph); + resize(DEFAULT_SIZE); + + // add some sample data (graph manipulated via JGraphT) + g.addVertex("v1"); + g.addVertex("v2"); + g.addVertex("v3"); + g.addVertex("v4"); + + g.addEdge("v1", "v2"); + g.addEdge("v2", "v3"); + g.addEdge("v3", "v1"); + g.addEdge("v4", "v3"); + + + + // position vertices nicely within JGraph component + positionVertexAt("v1", 130, 40); + positionVertexAt("v2", 60, 200); + positionVertexAt("v3", 310, 230); + positionVertexAt("v4", 380, 70); + + // that's all there is to org.gcube.contentmanagement.lexicalmatcher!... + } + + private void adjustDisplaySettings(JGraph jg) { + jg.setPreferredSize(DEFAULT_SIZE); + + Color c = DEFAULT_BG_COLOR; + String colorStr = null; + + try { + colorStr = getParameter("bgcolor"); + } catch (Exception e) { + } + + if (colorStr != null) { + c = Color.decode(colorStr); + } + + jg.setBackground(c); + } + + private void positionVertexAt(Object vertex, int x, int y) { + + + //seleziono la cella chiamata vertex + DefaultGraphCell cell = m_jgAdapter.getVertexCell(vertex); + //recupero gli attributi della cella + Map attr = cell.getAttributes(); + //recupero i boundaries della cella + Rectangle2D b = GraphConstants.getBounds(attr); + //setto i parametri del nuovo rettangolo + GraphConstants.setBounds(attr, new Rectangle(x, y, (int)b.getWidth(), (int)b.getHeight())); + //costruisco una nuova cella + Map cellAttr = new HashMap(); + cellAttr.put(cell, attr); + //posiziono la cella nel grafo + m_jgAdapter.edit(cellAttr, null, null, null); + + } +} \ No newline at end of file diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/RelationEdge.java 
b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/RelationEdge.java new file mode 100644 index 0000000..a62c010 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/RelationEdge.java @@ -0,0 +1,73 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.graph; + +import java.math.BigInteger; + +import org.jgrapht.graph.DefaultWeightedEdge; + +public class RelationEdge extends DefaultWeightedEdge{ + + /** + * @param args + */ + public static void main(String[] args) { + // TODO Auto-generated method stub + + } + + private String relationName; + private long indexFrom; + private long indexTo; + private BigInteger weight; + + private String categoryFrom; + private String categoryTo; + + + public BigInteger getWeigth(){ + return weight; + } + + public void setWeigth(BigInteger Weight){ + weight = Weight; + } + + public long getTo(){ + return indexTo; + } + public long getFrom(){ + return indexFrom; + } + public String getName(){ + return relationName; + } + public void setName(String name){ + relationName = name; + } + + public RelationEdge(String name,long from,long to){ + relationName = name; + indexFrom = from; + indexTo = to; + } + @Override + public String toString(){ + return "["+relationName+": from "+indexFrom+" to " +indexTo+" nameFrom "+categoryFrom+" nameTo "+categoryTo+"]"; + } + + public void setCategoryFrom(String categoryFrom) { + this.categoryFrom = categoryFrom; + } + + public String getCategoryFrom() { + return categoryFrom; + } + + public void setCategoryTo(String categoryTo) { + this.categoryTo = categoryTo; + } + + public String getCategoryTo() { + return categoryTo; + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/TreeExtractor.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/TreeExtractor.java new file mode 100644 index 
0000000..75fdfe5 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/guesser/treeStructure/graph/TreeExtractor.java @@ -0,0 +1,68 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.graph; + +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + +import org.hibernate.SessionFactory; + +public class TreeExtractor { + + /** + * @param args + */ + public static void main(String[] args) { + // TODO Auto-generated method stub + + } + TreeNode categoriesTree; + + //recupera l'albero delle categorie + public TreeNode getCategoriesTree(SessionFactory DB){ + return categoriesTree; + } + + //creo un nuovo Albero + public TreeExtractor(){ + categoriesTree = new TreeNode(TreeNode.ROOT); + } + + class TreeNode implements Iterable { + + public static final String ROOT = "ROOT"; + + private Set children; + public String name; + + public TreeNode(String Name) { + children = new HashSet(); + name = Name; + } + + public String getName(){ + return name; + } + + public boolean addChild(TreeNode n) { + return children.add(n); + } + + public boolean removeChild(TreeNode n) { + return children.remove(n); + } + + public Iterator iterator() { + return children.iterator(); + } + + public boolean isLeaf(){ + return ((children==null) || (children.size()==0)); + } + + public boolean isRoot(){ + return (name.equals(ROOT)); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/run/CategoryGuesser.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/run/CategoryGuesser.java new file mode 100644 index 0000000..12326e9 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/run/CategoryGuesser.java @@ -0,0 +1,489 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.run; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.util.ArrayList; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.Engine; +import 
org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.CategoryOrderedList; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.DBObjectTranslator; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.TSObjectTransformer; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.interfaces.Reference; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; +import org.hibernate.SessionFactory; + +public class CategoryGuesser { + + /** + * @param args + * @throws Exception + */ + + private final static int MAXRESULTS = 10; + + public static void showResults(ArrayList results) { + + AnalysisLogger.getLogger().warn("CLASSIFICATION RESULT:\n"); + int i = 1; + for (SingleResult result : results) { + if (result.getColumn() != null) + AnalysisLogger.getLogger().warn(i + ": " + result.getCategory() + " - " + result.getColumn() + " ; SCORE: " + result.getStringScore() + "%"); + else + AnalysisLogger.getLogger().warn(i + ": " + result.getCategory() + " ; SCORE: " + result.getStringScore() + "%"); + + i++; + } + + } + + public static void AccuracyCalc(CategoryGuesser guesser, String configPath, String seriesName, String column, int attempts, String correctFamily, String correctColumn) throws Exception { + AccuracyCalc(null, guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + } + + public static void AccuracyCalc(LexicalEngineConfiguration externalcfg, CategoryGuesser guesser, String configPath, String seriesName, String column, int attempts, String correctFamily, String correctColumn) throws Exception { + + int familyscore = 0; + int columnscore = 0; + // CategoryGuesser guesser = new CategoryGuesser(); + + for (int i = 0; i < attempts; i++) { + + guesser.runGuesser(seriesName, column, 
externalcfg); + ArrayList results = guesser.getClassification(); + String result = results.toString(); + showResults(results); + + AnalysisLogger.getLogger().info("CLASSIFICATION RESULT " + result + " " + CategoryGuesser.resultString(result, correctFamily, correctColumn)); + + if (CategoryGuesser.CheckCompleteResult(result, correctFamily, correctColumn)) + columnscore++; + + if (CategoryGuesser.CheckFamilyResult(result, correctFamily)) + familyscore++; + + } + + double percColumn = ((double) columnscore / (double) attempts) * 100; + double percFamily = ((double) familyscore / (double) attempts) * 100; + + AnalysisLogger.getLogger().info("->ACCURACY ON FAMILY " + correctFamily + ":" + percFamily + " ACCURACY ON COLUMN " + correctColumn + ":" + percColumn); + } + + public static String resultString(String result, String family, String column) { + + result = result.toUpperCase(); + family = family.toUpperCase(); + column = column.toUpperCase(); + + return "FAMILY REC: " + result.contains(family) + " COLUMN REC: " + result.contains(family + "=" + column); + } + + public static boolean CheckCompleteResult(String result, String family, String column) { + + result = result.toUpperCase(); + family = family.toUpperCase(); + column = column.toUpperCase(); + if (result.contains(family + "=" + column)) + return true; + else + return false; + } + + public static boolean CheckFamilyResult(String result, String family) { + + result = result.toUpperCase(); + family = family.toUpperCase(); + + if (result.contains(family + "=")) + return true; + else + return false; + } + + // NOTE: The config path has to contain the two files: lexicalGuesser.properties and ALog.properties + private static final String cfgFile = "lexicalGuesser.properties"; + private static final String LogFile = "ALog.properties"; + // singleton + private CategoryOrderedList col; + private Engine processor; + private CategoryOrderedList originalCol; + private LexicalEngineConfiguration config; + private String 
configPath; + private boolean oneshotMode; + private static final int maxTriesClassification = 3; + private int triesCounter; + + public CategoryGuesser(String ConfigPath) { + + triesCounter = 0; + this.configPath = ConfigPath; + } + + public CategoryGuesser() { + triesCounter = 0; + this.configPath = "."; + } + + public void runGuesser(String seriesName, String columnName, LexicalEngineConfiguration externalConfig) throws Exception { + runGuesser(seriesName, columnName, externalConfig, null, null); + } + + public void runGuesser(String seriesName, String columnName) throws Exception { + runGuesser(seriesName, columnName, null, null, null); + } + + public void runGuesser(String seriesName, String columnName, LexicalEngineConfiguration externalConfig, String CategoryFilter, String ColumnFilter) throws Exception { + runGuesser(seriesName, columnName, externalConfig, CategoryFilter, ColumnFilter, null); + } + + public void runGuesser(String SingletonString, LexicalEngineConfiguration externalConfig, String CategoryFilter, String ColumnFilter) throws Exception { + oneshotMode = true; + runGuesser(null, null, externalConfig, CategoryFilter, ColumnFilter, SingletonString); + } + + public void init(String categoryFilter, String columnFilter, LexicalEngineConfiguration externalConfig) throws Exception { + + String cfgFileCompletePath = configPath + "/" + cfgFile; + AnalysisLogger.setLogger(configPath + "/" + LogFile); + + AnalysisLogger.getLogger().trace("******************INITIALIZING******************"); + + config = new LexicalEngineConfiguration(); + config.configure(cfgFileCompletePath); + + if (externalConfig != null) { + config.mergeConfig(externalConfig); + } + + processor = new Engine(config, columnFilter, configPath); + + SessionFactory dbSession = processor.getDBSession(config); + DBObjectTranslator dbo = new DBObjectTranslator(); + + if (col == null) { + AnalysisLogger.getLogger().trace("******************Order Category******************"); + if (externalConfig 
== null) + externalConfig = new LexicalEngineConfiguration(); + dbo.buildCategoriesStructure(dbSession, externalConfig.getReferenceTable(), externalConfig.getReferenceColumn(), externalConfig.getIdColumn(), externalConfig.getNameHuman(), externalConfig.getDescription()); + col = TSObjectTransformer.transform2List(dbo, config, categoryFilter); + AnalysisLogger.getLogger().trace("***************End Ordering********************"); + originalCol = col.generateNovelList(); + } else { + col = originalCol.generateNovelList(); + } + + oneshotMode = false; + } + + public void initSingleMatcher(LexicalEngineConfiguration externalConfig, String ColumnFilter) throws Exception { + + String cfgFileCompletePath = configPath + "/" + cfgFile; + AnalysisLogger.setLogger(configPath + "/" + LogFile); + + config = new LexicalEngineConfiguration(); + config.configure(cfgFileCompletePath); + + if (externalConfig != null) { + config.mergeConfig(externalConfig); + } + + processor = new Engine(config, ColumnFilter, configPath); + + // in this case, the lexical matcher is invoked once, then it has to be stopped in the end + oneshotMode = true; + } + + public void init(String categoryFilter, String columnFilter) throws Exception { + init(categoryFilter, columnFilter, null); + } + + public void init(LexicalEngineConfiguration externalConfig) throws Exception { + init(null, null, externalConfig); + } + + public void init() throws Exception { + init(null, null, null); + } + + public void refreshReferences() { + col = null; + } + + public void runGuesser(String seriesName, String columnName, LexicalEngineConfiguration externalConfig, String CategoryFilter, String ColumnFilter, String SingletonString) throws Exception { + + String cfgFileCompletePath = configPath + "/" + cfgFile; + AnalysisLogger.setLogger(configPath + "/" + LogFile); + + AnalysisLogger.getLogger().debug("Guessing Table " + seriesName + " column " + columnName); + if (externalConfig != null) { + config = new 
LexicalEngineConfiguration(); + config.configure(cfgFileCompletePath); + config.mergeConfig(externalConfig); + + // NOTE FOR FUTURE OPTIMIZATION: perform the re-init only if there is a change in the Database pointing + processor = new Engine(config, ColumnFilter, configPath); + } else { + if (config == null) { + config = new LexicalEngineConfiguration(); + config.configure(cfgFileCompletePath); + + } + if (processor == null) { + processor = new Engine(config, ColumnFilter, configPath); + } else + processor.resetEngine(config, ColumnFilter, configPath); + } + + SessionFactory dbSession = processor.getDBSession(config); + DBObjectTranslator dbo = new DBObjectTranslator(); + + //modification of 10/10/11 calculate structure each time +// if (col == null) { + AnalysisLogger.getLogger().trace("******************Order Category******************"); + dbo.buildCategoriesStructure(dbSession, config.getReferenceTable(), config.getReferenceColumn(), config.getIdColumn(), config.getNameHuman(), config.getDescription()); + col = TSObjectTransformer.transform2List(dbo, config, CategoryFilter); + AnalysisLogger.getLogger().trace("***************End Ordering********************"); + originalCol = col.generateNovelList(); + /* + } else { + col = originalCol.generateNovelList(); + } + */ + + AnalysisLogger.getLogger().warn("Starting Calculation...wait..."); + + long t0 = System.currentTimeMillis(); + + // processor.calcLike(col,seriesName, columnName); + + processor.calcLikeThread(col, seriesName, columnName, SingletonString); + + // perform processing until the table contains at least one element + ArrayList checkingResults = null; + + // if (oneshotMode) + // checkingResults = getClassification(); + // else + checkingResults = getClassification(); + + while ((checkingResults == null || checkingResults.size() == 0) && (triesCounter < maxTriesClassification)) { + AnalysisLogger.getLogger().warn("..another processing pass is required. 
Attempt number " + (triesCounter + 1)); + triesCounter++; + float differencialThr = config.getCategoryDiscardDifferencialThreshold(); + float acceptanceThr = config.getEntryAcceptanceThreshold(); + // reduce the thresholds of 10 points and recalculate + config.setCategoryDiscardDifferencialThreshold(Math.max(differencialThr - 20, 0)); + config.setEntryAcceptanceThreshold(Math.max(acceptanceThr - 20, 0)); + AnalysisLogger.getLogger().trace("Performing next processing pass"); + runGuesser(seriesName, columnName, null, CategoryFilter, ColumnFilter, SingletonString); + AnalysisLogger.getLogger().debug("End processing pass"); + + // if (oneshotMode) + // checkingResults = getClassification(); + // else + checkingResults = getClassification(); + + if (triesCounter == 0) + break; + } + + long t1 = System.currentTimeMillis() - t0; + + AnalysisLogger.getLogger().warn("...End Calculation in " + t1 + "ms"); + + triesCounter = 0; + // close session if not more necessary + if (oneshotMode) + dbSession.close(); + } + + public ArrayList getClassificationOLD() { + + ArrayList results = new ArrayList(); + int size = processor.bestCategories.size(); + for (int i = 0; i < size; i++) { + results.add(new SingleResult(processor.bestCategories.get(i), processor.bestColumns.get(i), processor.bestScores.get(i), null, "0")); + } + + return results; + } + + public ArrayList getDetailedMatches() { + + if (processor.getSingletonMatches() != null) { + + // use deviation to cut results + float threshold = config.getSingleEntryRecognitionMaxDeviation(); + ArrayList results = processor.getSingletonMatches(); + double minScore = 0; + // get the best result and calculate the threshold + if (results.size() > 0) { + minScore = results.get(0).getScore() - threshold; + } + + // remove poor objects + int size = results.size(); + for (int i = 0; i < size; i++) { + SingleResult sr = results.get(i); + if (sr.getScore() < minScore) { + results.remove(i); + i--; + size--; + } + } + + return 
processor.getSingletonMatches(); + } else + return new ArrayList(); + } + + public String getDetailedSingletonEntry() { + + if (processor.getSingletonElement() != null) { + return processor.getSingletonElement(); + } else + return ""; + } + + public ArrayList getClassificationPlain() { + + ArrayList results = new ArrayList(); + int size = processor.bestCategories.size(); + double maxscore = 0; + + for (int i = 0; i < size; i++) { + double score = processor.bestScores.get(i); + if (maxscore < score) { + maxscore = score; + } + } + + for (int i = 0; i < size; i++) { + + double score = processor.bestScores.get(i); + // normalizing percentages!!! + score = (score / (maxscore + ((size > 1) ? 1 : 0))) * 100; + + if (score > config.categoryDiscardDifferencialThreshold) { + + Reference ref = col.getCategory(processor.bestCategories.get(i)); + + results.add(new SingleResult(processor.bestCategories.get(i), processor.bestColumns.get(i), score, ref.getTableName(), ref.getIndex())); + } + } + + return results; + } + + public ArrayList getClassification() { + + ArrayList results = new ArrayList(); + int size = processor.bestCategories.size(); + double maxscore = 0; + + BigDecimal sumElements = BigDecimal.ZERO; + ArrayList subscores = new ArrayList(); + + // calculate sum of elements and weights; + for (int i = 0; i < size; i++) { + BigInteger catElements = col.getScoresTable().get(processor.bestCategories.get(i)).getCategoryElements(); + sumElements = sumElements.add(new BigDecimal(catElements)); + } +/* + if (sumElements.compareTo(BigDecimal.valueOf(10000)) < 0) + return getClassificationPlain(); +*/ + for (int i = 0; i < size; i++) { + double score = processor.bestScores.get(i); + // multiply for impotance + BigInteger catElements = col.getScoresTable().get(processor.bestCategories.get(i)).getCategoryElements(); + + // AnalysisLogger.getLogger().warn("\t elements "+catElements+" sum "+sumElements); + + double weight = new BigDecimal(catElements).divide(sumElements, 2, 
BigDecimal.ROUND_HALF_UP).doubleValue(); + + if (weight >= 3) + weight = 2 * Math.log(weight * 100) / 10f; + else if ((weight >= 0.5) && (weight <= 1)) + { + weight = Math.log(weight * 100) / 100.00f; + } + else if (weight < 0.05) + weight = 0.05; + + AnalysisLogger.getLogger().warn("WEIGHT FOR CATEGORY " + processor.bestCategories.get(i) + "-" + processor.bestColumns.get(i) + " : " + weight + " SCORE " + score); + + // recalculate weights + score = score * weight; + score = Math.min(1, score); + + if (maxscore < score) { + maxscore = score; + } + + subscores.add(score); + } + // AnalysisLogger.getLogger().warn("MAX SCORE "+maxscore); + + for (int i = 0; i < size; i++) { + + // double score = processor.bestScores.get(i); + double score = subscores.get(i); + + // AnalysisLogger.getLogger().warn("SCORE FOR CATEGORY "+processor.bestCategories.get(i)+" -COLUMN : "+processor.bestColumns.get(i)+" - "+score); + + // normalizing percentages!!! + score = (score / (maxscore + ((size > 1) ? 1 : 0))) * 100; + + // AnalysisLogger.getLogger().warn("SCORE FOR CATEGORY "+processor.bestCategories.get(i)+" -COLUMN : "+processor.bestColumns.get(i)+" - "+score); + if (score > config.categoryDiscardDifferencialThreshold) { + // AnalysisLogger.getLogger().warn("SCORE "+score); + // insert into the right place + int index = results.size(); + int j = 0; + for (SingleResult res : results) { + if (res.getScore() < score) { + index = j; + } + j++; + } + + Reference ref = col.getCategory(processor.bestCategories.get(i)); + SingleResult sr = new SingleResult(processor.bestCategories.get(i), processor.bestColumns.get(i), score, ref.getTableName(), ref.getIndex()); + //control for repetitions + if (isnotRepetition(sr, results)) + results.add(index, sr); + } + } + + //limit the result list after rescoring + int s = results.size(); + if (s>MAXRESULTS){ + int diff = (size-MAXRESULTS); + for (int i=0;i previous) { + + boolean notrepeated = true; + int size = previous.size(); + for (int i = 0; i < 
size; i++) { + SingleResult sr = previous.get(i); + if (sr.getCategory().equalsIgnoreCase(result.getCategory()) && sr.getColumn().equalsIgnoreCase(result.getColumn())) { + notrepeated = true; + break; + } + } + + return notrepeated; + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/run/StarGraphExtraction.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/run/StarGraphExtraction.java new file mode 100644 index 0000000..df43655 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/run/StarGraphExtraction.java @@ -0,0 +1,36 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.run; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.DBObjectTranslator; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.TSObjectTransformer; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; +import org.gcube.contentmanagement.lexicalmatcher.utils.DatabaseFactory; +import org.hibernate.SessionFactory; + +public class StarGraphExtraction { + + /** + * @param args + */ + public static void main(String[] args) { + try { + RunMain(); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + } + + private final static String ConfigurationFileNameLocal = "hibernate.cfg.xml"; + + public static void RunMain() throws Exception{ + AnalysisLogger.setLogger("./ALog.properties"); + + //configurazione DB - inizializzo la sessione e mi connetto + SessionFactory dbSession = DatabaseFactory.initDBConnection(ConfigurationFileNameLocal); + DBObjectTranslator dbo = new DBObjectTranslator(); + dbo.buildWholeStructure(dbSession,null,null,null,null,null); + TSObjectTransformer.transform2Graph(dbo); + + } +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/TestExternalCfgProduction.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/TestExternalCfgProduction.java new file mode 100644 index 
0000000..e476f8f --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/TestExternalCfgProduction.java @@ -0,0 +1,49 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.test; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + +public class TestExternalCfgProduction { + + public static void main(String[] args) { + + try { + int attempts = 1; + + + String configPath = "."; + CategoryGuesser guesser = new CategoryGuesser(configPath); + //bench 1 + AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------"); + + String seriesName = "IMPORT_ecd2e3a0_ee90_11e0_be9e_90f3621758ee"; + String column = "field4"; + + LexicalEngineConfiguration conf = new LexicalEngineConfiguration(); + conf.setReferenceTable("codelist1733371938"); + conf.setReferenceColumn("ifield14"); + conf.setNameHuman("ifield1"); + conf.setIdColumn("ifield0"); + conf.setDescription("ifield2"); + + + //database Parameters + conf.setDatabaseUserName("gcube"); + conf.setDatabasePassword("d4science2"); + conf.setDatabaseDriver("org.postgresql.Driver"); + conf.setDatabaseURL("jdbc:postgresql://localhost/testdb"); + conf.setDatabaseDialect("org.hibernate.dialect.PostgreSQLDialect"); + + guesser.runGuesser(seriesName, column, conf); + guesser.showResults(guesser.getClassification()); + + AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n"); + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/TestSingleExternalCfgProduction.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/TestSingleExternalCfgProduction.java new file mode 100644 index 0000000..19e55a5 --- /dev/null +++ 
b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/TestSingleExternalCfgProduction.java @@ -0,0 +1,64 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.test; + +import java.util.ArrayList; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult; +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + +public class TestSingleExternalCfgProduction { + + public static void main(String[] args) { + + try { + + + String configPath = "."; + CategoryGuesser guesser = new CategoryGuesser(configPath); + + //bench 1 + AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------"); + String singleton = "Faroe Island"; + + String family = "COUNTRY_OLD"; + String column = "field6"; + + LexicalEngineConfiguration conf = new LexicalEngineConfiguration(); + conf.setReferenceTable("codelist1733371938"); + conf.setReferenceColumn("ifield14"); + conf.setNameHuman("ifield1"); + conf.setIdColumn("ifield0"); + conf.setDescription("ifield2"); + + //CHANGE THIS TO ENHANCE THE RECALL + conf.setEntryAcceptanceThreshold(30); + conf.setReferenceChunksToTake(-1); + conf.setTimeSeriesChunksToTake(-1); + conf.setUseSimpleDistance(false); + + //database Parameters + conf.setDatabaseUserName("gcube"); + conf.setDatabasePassword("d4science2"); +// conf.setDatabaseDriver("org.postgresql.Driver"); + conf.setDatabaseURL("jdbc:postgresql://localhost/testdb"); + conf.setDatabaseDialect("org.hibernate.dialect.PostgreSQLDialect"); + + guesser.runGuesser(singleton, conf, family,column ); + + ArrayList detailedResults = guesser.getDetailedMatches(); + + AnalysisLogger.getLogger().warn("Detailed Match on Name :"+singleton); + + CategoryGuesser.showResults(detailedResults); + + AnalysisLogger.getLogger().warn("--------------------END 
BENCH 1-----------------------\n"); + + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTest1.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTest1.java new file mode 100644 index 0000000..735864e --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTest1.java @@ -0,0 +1,58 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + +public class BenchMarkTest1 { + + public static void main(String[] args) { + + try { + int attempts = 1; + + String configPath = "."; + CategoryGuesser guesser = new CategoryGuesser(configPath); + //bench 1 + AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------"); + String seriesName = "import_bdefb470_5cea_11df_a0a6_909e7d074592"; + String column = "field1"; + String correctFamily = "country"; + String correctColumn = "name_en"; + CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n"); + + //bench 2 + AnalysisLogger.getLogger().warn("----------------------BENCH 2-------------------------"); + seriesName = "import_bdefb470_5cea_11df_a0a6_909e7d074592"; + column = "field2"; + correctFamily = "area"; + correctColumn = "name_en"; + CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 2-----------------------\n"); + + //bench 3 + AnalysisLogger.getLogger().warn("----------------------BENCH 3-------------------------"); + seriesName = "import_bdefb470_5cea_11df_a0a6_909e7d074592"; + column = 
"field4"; + correctFamily = "species"; + correctColumn = "scientific_name"; + CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 3-----------------------\n"); + + //bench 4 + AnalysisLogger.getLogger().warn("----------------------BENCH 4-------------------------"); + seriesName = "import_bdefb470_5cea_11df_a0a6_909e7d074592"; + column = "field3"; + correctFamily = "species"; + correctColumn = "scientific_name"; +// CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 4-----------------------\n"); + + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTest2.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTest2.java new file mode 100644 index 0000000..352b4a2 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTest2.java @@ -0,0 +1,54 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + +public class BenchMarkTest2 { + + public static void main(String[] args) { + + try { + int attempts = 1; + + + String configPath = "."; + CategoryGuesser guesser = new CategoryGuesser(configPath); + //bench 1 + AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------"); + String seriesName = "import_2c97f580_35a0_11df_b8b3_aa10916debe6"; + String column = "field1"; + String correctFamily = "SPECIES"; + String correctColumn = "SCIENTIFIC_NAME"; + CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + 
AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n"); + + + + + //bench 2 + AnalysisLogger.getLogger().warn("----------------------BENCH 2-------------------------"); + seriesName = "import_2c97f580_35a0_11df_b8b3_aa10916debe6"; + column = "field2"; + correctFamily = "COUNTRY"; + correctColumn = "ISO_3_CODE"; + CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 2-----------------------\n"); + + + //bench 4 + AnalysisLogger.getLogger().warn("----------------------BENCH 4-------------------------"); + seriesName = "import_2c97f580_35a0_11df_b8b3_aa10916debe6"; + column = "field3"; + correctFamily = "AREA"; + correctColumn = "NAME_EN"; + CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 4-----------------------\n"); + + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTest3.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTest3.java new file mode 100644 index 0000000..cfdd033 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTest3.java @@ -0,0 +1,31 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + +public class BenchMarkTest3 { + + public static void main(String[] args) { + + try { + int attempts = 1; + + + String configPath = "."; + CategoryGuesser guesser = new CategoryGuesser(configPath); + //bench 1 + AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------"); + String seriesName = 
"import_2c97f580_35a0_11df_b8b3_aa10916debe6"; + String column = "field1"; + String correctFamily = "SPECIES"; + String correctColumn = "SCIENTIFIC_NAME"; + CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n"); + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTest4.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTest4.java new file mode 100644 index 0000000..a84356d --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTest4.java @@ -0,0 +1,31 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + +public class BenchMarkTest4 { + + public static void main(String[] args) { + + try { + int attempts = 1; + + + String configPath = "."; + CategoryGuesser guesser = new CategoryGuesser(configPath); + //bench 1 + AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------"); + String seriesName = "import_2c97f580_35a0_11df_b8b3_aa10916debe6"; + String column = "field3"; + String correctFamily = "AREA"; + String correctColumn = "NAME_EN"; + CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n"); + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTest5.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTest5.java new file mode 100644 index 0000000..55f954b --- /dev/null +++ 
b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTest5.java @@ -0,0 +1,31 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + +public class BenchMarkTest5 { + + public static void main(String[] args) { + + try { + int attempts = 1; + + + String configPath = "."; + CategoryGuesser guesser = new CategoryGuesser(configPath); + //bench 1 + AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------"); + String seriesName = "import_532bba80_1c8f_11df_a4ee_87804054691e"; + String column = "field2"; + String correctFamily = "ISSCAAP GROUP"; + String correctColumn = "NAME_EN"; + CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n"); + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTestExternalCfg.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTestExternalCfg.java new file mode 100644 index 0000000..5b3bf5e --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTestExternalCfg.java @@ -0,0 +1,52 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + +public class BenchMarkTestExternalCfg { + + public static void main(String[] args) { + + try { + int attempts = 1; + + + String configPath = "."; + CategoryGuesser guesser = new CategoryGuesser(configPath); + 
//bench 1 + AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------"); + String seriesName = "import_532bba80_1c8f_11df_a4ee_87804054691e"; + String column = "field2"; + String correctFamily = "ISSCAAP GROUP"; + String correctColumn = "NAME_EN"; + LexicalEngineConfiguration conf = new LexicalEngineConfiguration(); + conf.setCategoryDiscardDifferencialThreshold(5); + conf.setCategoryDiscardThreshold(0); + conf.setChunkSize(25); + conf.setEntryAcceptanceThreshold(50); + conf.setNumberOfThreadsToUse(2); + conf.setRandomTake(true); + conf.setReferenceChunksToTake(20); + conf.setTimeSeriesChunksToTake(1); + conf.setUseSimpleDistance(false); + + //database Parameters + conf.setDatabaseUserName("root"); +// conf.setDatabasePassword("password"); + conf.setDatabaseDriver("com.mysql.jdbc.Driver"); + conf.setDatabaseURL("jdbc:mysql://localhost/timeseries"); + conf.setDatabaseDialect("org.hibernate.dialect.MySQLDialect"); + conf.setDatabaseAutomaticTestTable("connectiontesttable"); + conf.setDatabaseIdleConnectionTestPeriod("3600"); + + CategoryGuesser.AccuracyCalc(conf,guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n"); + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTestFilterCategory.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTestFilterCategory.java new file mode 100644 index 0000000..6a76403 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTestFilterCategory.java @@ -0,0 +1,38 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old; + +import java.util.ArrayList; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult; +import 
org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + + +public class BenchMarkTestFilterCategory { + + public static void main(String[] args) { + + try { + + String configPath = "."; + CategoryGuesser guesser = new CategoryGuesser(configPath); + //bench 1 + AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------"); + String seriesName = "ref_order"; + String column = "scientific_name"; + String correctFamily = "order"; + String correctColumn = "scientific_name"; + + guesser.runGuesser(seriesName, column, null, correctFamily, correctColumn); + ArrayList results = guesser.getClassification(); + + CategoryGuesser.showResults(results); + + AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n"); + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTestSingleton.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTestSingleton.java new file mode 100644 index 0000000..b5bf8ce --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTestSingleton.java @@ -0,0 +1,51 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old; + +import java.util.ArrayList; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult; +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + + +public class BenchMarkTestSingleton { + + public static void main(String[] args) { + + try { + + String configPath = "."; + CategoryGuesser guesser = new CategoryGuesser(configPath); + //bench 1 + 
AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------"); + String singleton = "sarda sarda"; +// String singleton = "Mitella pollicipes"; +// String singleton = "policipes"; +// String singleton = ""; + String family = "catalog life"; + String column = "scientific_name"; + + LexicalEngineConfiguration conf = new LexicalEngineConfiguration(); + + //CHANGE THIS TO ENHANCE THE RECALL + conf.setEntryAcceptanceThreshold(30); + conf.setReferenceChunksToTake(-1); + conf.setTimeSeriesChunksToTake(-1); + conf.setUseSimpleDistance(false); + + guesser.runGuesser(singleton, conf, family,column ); + ArrayList detailedResults = guesser.getDetailedMatches(); + + AnalysisLogger.getLogger().warn("Detailed Match on Name :"+singleton); + + CategoryGuesser.showResults(detailedResults); + + AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n"); + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTestTSCountry.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTestTSCountry.java new file mode 100644 index 0000000..70be422 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTestTSCountry.java @@ -0,0 +1,31 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + +public class BenchMarkTestTSCountry { + + public static void main(String[] args) { + + try { + int attempts = 1; + + String configPath = "."; + CategoryGuesser guesser = new CategoryGuesser(configPath); + //bench 1 + AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------"); + String seriesName = "import_bdefb470_5cea_11df_a0a6_909e7d074592"; + String column = "field1"; + String 
correctFamily = "country"; + String correctColumn = "name_en"; + CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n"); + + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTrainingSet.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTrainingSet.java new file mode 100644 index 0000000..4616735 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTrainingSet.java @@ -0,0 +1,88 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + +public class BenchMarkTrainingSet { + + +public static void main(String[] args) { + + try { + String configPath ="."; + int attempts = 1; + CategoryGuesser guesser = new CategoryGuesser(configPath); + + //bench 1 + AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------"); + String seriesName = "ref_commission"; + String column = "name_en"; + String correctFamily = "commission"; + String correctColumn = "name_en"; + CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n"); + + + AnalysisLogger.getLogger().warn("----------------------BENCH 2-------------------------"); + seriesName = "ref_species"; + column = "scientific_name"; + correctFamily = "species"; + correctColumn = "scientific_name"; + CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 
2-----------------------\n"); + + + AnalysisLogger.getLogger().warn("----------------------BENCH 3-------------------------"); + seriesName = "ref_area"; + column = "name_en"; + correctFamily = "area"; + correctColumn = "name_en"; +// CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 3-----------------------\n"); + + + AnalysisLogger.getLogger().warn("----------------------BENCH 4-------------------------"); + seriesName = "ref_ocean"; + column = "name_en"; + correctFamily = "ocean"; + correctColumn = "name_en"; +// CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 4-----------------------\n"); + + + AnalysisLogger.getLogger().warn("----------------------BENCH 5-------------------------"); + seriesName = "ref_geo_region"; + column = "name_en"; + correctFamily = "geo region"; + correctColumn = "name_en"; +// CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 5-----------------------\n"); + + + AnalysisLogger.getLogger().warn("----------------------BENCH 6-------------------------"); + seriesName = "ref_fa_region"; + column = "name_en"; + correctFamily = "fa region"; + correctColumn = "name_en"; +// CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 6-----------------------\n"); + + + AnalysisLogger.getLogger().warn("----------------------BENCH 7-------------------------"); + seriesName = "ref_order"; + column = "scientific_name"; + correctFamily = "order"; + correctColumn = "scientific_name"; +// CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, 
attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 7-----------------------\n"); + + + + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTrainingSetScientificName.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTrainingSetScientificName.java new file mode 100644 index 0000000..e8a0694 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/BenchMarkTrainingSetScientificName.java @@ -0,0 +1,33 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + +public class BenchMarkTrainingSetScientificName { + + +public static void main(String[] args) { + + try { + String configPath ="."; + int attempts = 1; + CategoryGuesser guesser = new CategoryGuesser(configPath); + + //bench 1 + AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------"); + String seriesName = "ref_species"; + String column = "scientific_name"; + String correctFamily = "species"; + String correctColumn = "scientific_name"; + CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn); + AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n"); + + + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/TestExternalCfgProduction.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/TestExternalCfgProduction.java new file mode 100644 index 0000000..23106a8 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/TestExternalCfgProduction.java @@ -0,0 +1,64 @@ +package 
org.gcube.contentmanagement.lexicalmatcher.analysis.test.old; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + +public class TestExternalCfgProduction { + + public static void main(String[] args) { + + try { + int attempts = 1; + + + String configPath = "."; + CategoryGuesser guesser = new CategoryGuesser(configPath); + //bench 1 + AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------"); +// String seriesName = "rdmc366dfe0ddf511e086b1b1c5d6fb1c27"; + String seriesName = "IMPORT_ecd2e3a0_ee90_11e0_be9e_90f3621758ee"; + + String column = "field4"; + LexicalEngineConfiguration conf = new LexicalEngineConfiguration(); + /* + conf.setCategoryDiscardDifferencialThreshold(5); + conf.setCategoryDiscardThreshold(0); + conf.setChunkSize(25); + conf.setEntryAcceptanceThreshold(50); + conf.setNumberOfThreadsToUse(2); + conf.setRandomTake(true); + conf.setReferenceChunksToTake(20); + conf.setTimeSeriesChunksToTake(1); + conf.setUseSimpleDistance(false); + */ + + //database Parameters + conf.setDatabaseUserName("utente"); + conf.setDatabasePassword("d4science"); +// conf.setDatabaseDriver("org.postgresql.Driver"); + conf.setDatabaseURL("jdbc:postgresql://dbtest.next.research-infrastructures.eu/timeseries"); + conf.setDatabaseDialect("org.hibernate.dialect.PostgreSQLDialect"); + conf.setDatabaseAutomaticTestTable("connectiontesttable"); + conf.setDatabaseIdleConnectionTestPeriod("3600"); + conf.setReferenceTable("codelist1733371938"); + conf.setReferenceColumn("ifield14"); + conf.setNameHuman("ifield1"); + conf.setIdColumn("ifield0"); + conf.setDescription("ifield2"); + guesser.runGuesser(seriesName, column, conf); + guesser.showResults(guesser.getClassification()); +// AnalysisLogger.getLogger().warn(); + + + + + 
AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n"); + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/TestSingleExternalCfgProduction.java b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/TestSingleExternalCfgProduction.java new file mode 100644 index 0000000..375f4af --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/analysis/test/old/TestSingleExternalCfgProduction.java @@ -0,0 +1,71 @@ +package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old; + +import java.util.ArrayList; + +import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration; +import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult; +import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser; +import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; + +public class TestSingleExternalCfgProduction { + + public static void main(String[] args) { + + try { + + + String configPath = "."; + CategoryGuesser guesser = new CategoryGuesser(configPath); + //bench 1 + AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------"); + String singleton = "Faroe Island"; +// String singleton = "Mitella pollicipes"; +// String singleton = "policipes"; +// String singleton = ""; +// String family = "rdf0a7fb500dd3d11e0b8d1d1e2e7ba4f9d"; + + String family = "COUNTRY_OLD"; + String column = "field6"; + + LexicalEngineConfiguration conf = new LexicalEngineConfiguration(); + + //CHANGE THIS TO ENHANCE THE RECALL + conf.setEntryAcceptanceThreshold(30); + conf.setReferenceChunksToTake(-1); + conf.setTimeSeriesChunksToTake(-1); + conf.setUseSimpleDistance(false); + //database Parameters + conf.setDatabaseUserName("utente"); + conf.setDatabasePassword("d4science"); +// conf.setDatabaseDriver("org.postgresql.Driver"); + 
conf.setDatabaseURL("jdbc:postgresql://dbtest.next.research-infrastructures.eu/timeseries"); + conf.setDatabaseDialect("org.hibernate.dialect.PostgreSQLDialect"); + conf.setDatabaseAutomaticTestTable("connectiontesttable"); + conf.setDatabaseIdleConnectionTestPeriod("3600"); + + conf.setReferenceTable("codelist1733371938"); + conf.setReferenceColumn("ifield14"); + conf.setNameHuman("ifield1"); + conf.setIdColumn("ifield0"); + conf.setDescription("ifield2"); + + guesser.initSingleMatcher(conf,column ); + + guesser.runGuesser(singleton, null, family,column ); + + ArrayList detailedResults = guesser.getDetailedMatches(); + + AnalysisLogger.getLogger().warn("Detailed Match on Name :"+singleton); + + CategoryGuesser.showResults(detailedResults); + + AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n"); + + + } catch (Exception e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/utils/AnalysisLogger.java b/src/org/gcube/contentmanagement/lexicalmatcher/utils/AnalysisLogger.java new file mode 100644 index 0000000..e6abc67 --- /dev/null +++ b/src/org/gcube/contentmanagement/lexicalmatcher/utils/AnalysisLogger.java @@ -0,0 +1,37 @@ +package org.gcube.contentmanagement.lexicalmatcher.utils; + +import org.apache.log4j.Logger; +import org.apache.log4j.PropertyConfigurator; + +public class AnalysisLogger { + + + private static Logger logger; + private static Logger hibernateLogger; + + public static Logger getLogger(){ + + if (logger == null){ + setLogger("./ALog.properties"); + logger = Logger.getLogger("AnalysisLogger"); + } + + return logger; + } + //in ingresso vuole il path al file di config del log4j + public static void setLogger(String path){ + if (logger == null){ + PropertyConfigurator.configure(path); + } + logger = Logger.getLogger("AnalysisLogger"); + hibernateLogger = Logger.getLogger("hibernate"); + } + + public static void printStackTrace(Exception e){ + + int numberoflines 
= e.getStackTrace().length; + for (int i=0;i nodes = document.selectNodes("//hibernate-configuration/session-factory/property"); + Iterator nodesIterator = nodes.iterator(); + +// System.out.println("--- DATABASE Configuration --- "); + + while (nodesIterator.hasNext()) { + Node currentnode = nodesIterator.next(); + String element = currentnode.valueOf("@name"); + if (element.equals("connection.driver_class")) + if (config.getDatabaseDriver() != null){ + currentnode.setText(config.getDatabaseDriver()); + } + if (element.equals("connection.url")) { + if (config.getDatabaseURL() != null) + currentnode.setText(config.getDatabaseURL()); + } + if (element.equals("connection.username")) { + if (config.getDatabaseUserName() != null) + currentnode.setText(config.getDatabaseUserName()); + } + if (element.equals("connection.password")) { + if (config.getDatabasePassword() != null) + currentnode.setText(config.getDatabasePassword()); + } + if (element.equals("dialect")) { + AnalysisLogger.getLogger().trace("Dialect -> "+config.getDatabaseDialect()); + if (config.getDatabaseDialect() != null) + currentnode.setText(config.getDatabaseDialect()); + } + if (element.equals("c3p0.idleConnectionTestPeriod")) { + if (config.getDatabaseIdleConnectionTestPeriod() != null) + currentnode.setText(config.getDatabaseIdleConnectionTestPeriod()); + } + if (element.equals("c3p0.automaticTestTable")) { + if (config.getDatabaseAutomaticTestTable() != null) + currentnode.setText(config.getDatabaseAutomaticTestTable()); + } + } + + Configuration cfg = new Configuration(); + cfg = cfg.configure(DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(new ByteArrayInputStream(document.asXML().getBytes()))); + cfg.setProperty("hibernate.hbm2ddl.auto", "create"); + + SessionFactory DBSessionFactory = null; + DBSessionFactory = cfg.buildSessionFactory(); + + // close stream + stream.close(); + + + + + return DBSessionFactory; + } + + @SuppressWarnings({"unchecked"}) + public static List 
executeHQLQuery(String query, SessionFactory DBSessionFactory, boolean useSQL) { + + List obj = null; + Session ss = null; + try { + ss = DBSessionFactory.getCurrentSession(); + + ss.beginTransaction(); + + Query qr = null; + + if (useSQL) + qr = ss.createSQLQuery(query); + else + qr = ss.createQuery(query); + + List result = qr.list(); + + ss.getTransaction().commit(); + + /* + if (result == null) + System.out.println("Hibernate doesn't return a valid object when org.gcube.contentmanagement.lexicalmatcher retrieve UserState Object"); + + if (result != null && result.size() == 0) + System.out.println(String.format("found nothing in database")); +*/ + if (result != null && result.size() != 0) { + obj = result; + } + + } catch (Exception e) { + +// System.out.println(String.format("Error while executing query: %1$s %2$s", query, e.getMessage())); + e.printStackTrace(); + rollback(ss); + } + + return obj; + + } + + public static void executeHQLUpdate(String query, SessionFactory DBSessionFactory, boolean useSQL) { +// System.out.println("executing query: " + query); + Session ss = null; + + try { + + ss = DBSessionFactory.getCurrentSession(); +// System.out.println("executing query"); + ss.beginTransaction(); + Query qr = null; + + if (useSQL) + qr = ss.createSQLQuery(query); + else + qr = ss.createQuery(query); + + qr.executeUpdate(); + ss.getTransaction().commit(); + + } catch (Exception e) { + rollback(ss); + e.printStackTrace(); + } + } + + public static void executeSQLUpdate(String query, SessionFactory DBSessionFactory) { + executeHQLUpdate(query, DBSessionFactory, true); + } + + public static List executeSQLQuery(String query, SessionFactory DBSessionFactory) { + return executeHQLQuery(query, DBSessionFactory, true); + } + + public static void rollback(Session ss) { + + try { + if (ss != null && ss.getTransaction() != null) + ss.getTransaction().rollback(); + } catch (Exception ex) { + + } finally { + try { + ss.close(); + } catch (Exception ee) { + } + } + } + 
+    /**
+     * Saves or updates {@code obj} inside its own transaction, using the session
+     * currently bound to the given factory. Does nothing when the factory is {@code null}.
+     *
+     * @param obj the entity handed to {@code saveOrUpdate}
+     * @param DBSessionFactory the factory supplying the current session; may be {@code null}
+     * @throws Exception whatever the session raised, rethrown after {@code rollback(ss)} was called
+     */
+    public static void saveObject(Object obj, SessionFactory DBSessionFactory) throws Exception {
+        if (DBSessionFactory != null) {
+            Session ss = null;
+            try {
+                ss = DBSessionFactory.getCurrentSession();
+                ss.beginTransaction();
+                ss.saveOrUpdate(obj);
+                ss.getTransaction().commit();
+            } catch (Exception e) {
+                // NOTE(review): ss is still null here if getCurrentSession() itself failed;
+                // confirm that rollback tolerates a null session.
+                rollback(ss);
+                throw e;
+            }
+        }
+    }
+
+}
diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/utils/DistanceCalculator.java b/src/org/gcube/contentmanagement/lexicalmatcher/utils/DistanceCalculator.java
new file mode 100644
index 0000000..d5d2c1c
--- /dev/null
+++ b/src/org/gcube/contentmanagement/lexicalmatcher/utils/DistanceCalculator.java
@@ -0,0 +1,189 @@
+package org.gcube.contentmanagement.lexicalmatcher.utils;
+
+import java.util.Locale;
+import java.util.regex.Pattern;
+
+/**
+ * String comparison helpers: a plain Levenshtein edit distance ({@link #LD}) and a
+ * normalised similarity score in the range 0..1 ({@link #CD}).
+ */
+public class DistanceCalculator {
+
+    // Compiled once: treatString runs on both operands of every comparison.
+    private static final Pattern PUNCTUATION = Pattern.compile("[!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-]");
+    private static final Pattern SPACE_RUNS = Pattern.compile("[ ]+");
+
+    // Upper bound for a containment score, so that only equal strings can reach 1.
+    private static final double MAX_CONTAINMENT_SCORE = 0.98;
+
+    // *****************************
+    // Compute Levenshtein distance
+    // *****************************
+
+    /**
+     * Computes the Levenshtein edit distance between two strings.
+     *
+     * @param s first string, must not be {@code null}
+     * @param t second string, must not be {@code null}
+     * @return the minimum number of single-character insertions, deletions and
+     *         substitutions turning {@code s} into {@code t}
+     * @throws NullPointerException if either argument is {@code null}
+     */
+    public int LD(String s, String t) {
+        int n = s.length();
+        int m = t.length();
+        if (n == 0) {
+            return m;
+        }
+        if (m == 0) {
+            return n;
+        }
+
+        // Only the previous and the current row of the distance matrix are ever read,
+        // so two rows of length m + 1 replace the full (n + 1) x (m + 1) table.
+        int[] previousRow = new int[m + 1];
+        int[] currentRow = new int[m + 1];
+        for (int j = 0; j <= m; j++) {
+            previousRow[j] = j;
+        }
+
+        for (int i = 1; i <= n; i++) {
+            char sChar = s.charAt(i - 1);
+            currentRow[0] = i;
+            for (int j = 1; j <= m; j++) {
+                int cost = (sChar == t.charAt(j - 1)) ? 0 : 1;
+                int deletion = previousRow[j] + 1;
+                int insertion = currentRow[j - 1] + 1;
+                int substitution = previousRow[j - 1] + cost;
+                currentRow[j] = Math.min(Math.min(deletion, insertion), substitution);
+            }
+            int[] swap = previousRow;
+            previousRow = currentRow;
+            currentRow = swap;
+        }
+
+        // After the final swap the last computed row is held by previousRow.
+        return previousRow[m];
+    }
+
+    // *****************************
+    // Calculate Complete Distance
+    // *****************************
+
+    /**
+     * Shortcut for {@link #CD(boolean, String, String, boolean, boolean)} with
+     * {@code ignoreCase} and {@code boostMatch} both {@code false}.
+     */
+    public double CD(boolean useSimpleDistance, String h, String t) {
+        return CD(useSimpleDistance, h, t, false, false);
+    }
+
+    /**
+     * Scores how well two strings agree. Despite the name the result is a similarity:
+     * 1 means complete agreement, 0 means no agreement.
+     * <p>
+     * Both strings are first normalised (punctuation removed, runs of spaces collapsed,
+     * trimmed, optionally lower-cased). Then, in order:
+     * <ul>
+     * <li>both {@code null}: 1; exactly one {@code null}: 0;</li>
+     * <li>exactly one empty after normalisation: 0;</li>
+     * <li>equal ignoring case: 1 (this check ignores case even when {@code ignoreCase} is false);</li>
+     * <li>{@code useSimpleDistance}: 0;</li>
+     * <li>one contains the other: covered fraction of the longer string times a match
+     * factor (1.5, or 2 with {@code boostMatch}), capped at 0.98;</li>
+     * <li>otherwise: 1 - Levenshtein distance / length of the longer string.</li>
+     * </ul>
+     *
+     * @param useSimpleDistance when true only the equality checks are applied
+     * @param h first string, may be {@code null}
+     * @param t second string, may be {@code null}
+     * @param ignoreCase lower-case both strings during normalisation
+     * @param boostMatch use the larger match factor for containment
+     * @return the similarity score
+     */
+    public double CD(boolean useSimpleDistance, String h, String t, boolean ignoreCase, boolean boostMatch) {
+        if ((h == null) && (t == null)) {
+            return 1;
+        }
+        if ((h == null) || (t == null)) {
+            return 0;
+        }
+
+        h = treatString(h, ignoreCase);
+        t = treatString(t, ignoreCase);
+        int lt = t.length();
+        int lh = h.length();
+        double matchFactor = boostMatch ? 2.0 : 1.5;
+
+        // exactly one of the two is empty after normalisation
+        if ((lt == 0) != (lh == 0)) {
+            return 0;
+        }
+        if (h.equalsIgnoreCase(t)) {
+            return 1;
+        }
+        if (useSimpleDistance) {
+            return 0;
+        }
+        if (t.contains(h)) {
+            return containmentScore(t, h, matchFactor);
+        }
+        if (h.contains(t)) {
+            return containmentScore(h, t, matchFactor);
+        }
+
+        // similarity derived from the Levenshtein distance
+        int levenDist = LD(h, t);
+        int maxlen = Math.max(lh, lt);
+        return 1 - ((double) levenDist / (double) maxlen);
+    }
+
+    // Fraction of 'container' covered by occurrences of 'contained', boosted by matchFactor and capped.
+    private static double containmentScore(String container, String contained, double matchFactor) {
+        String remainder = container.replace(contained, "");
+        double coverage = 1 - ((double) remainder.length() / (double) container.length());
+        return Math.min(coverage * matchFactor, MAX_CONTAINMENT_SCORE);
+    }
+
+    // Normalises a string: strips punctuation, collapses space runs, trims, optionally lower-cases.
+    private String treatString(String h, boolean ignoreCase) {
+        // remove the punctuation
+        String treated = PUNCTUATION.matcher(h).replaceAll("");
+        // reduce multiple spaces to single spaces
+        treated = SPACE_RUNS.matcher(treated).replaceAll(" ");
+        treated = treated.trim();
+        if (ignoreCase) {
+            // fixed locale: the default-locale variant lower-cases 'I' differently under e.g. Turkish
+            treated = treated.toLowerCase(Locale.ENGLISH);
+        }
+        return treated;
+    }
+
+    public static void main(String[] args) {
+
+        String h = "Mediteranean";
+        String t = "Mediterranean horse mackerel";
+        DistanceCalculator d = new DistanceCalculator();
+        double cd = d.CD(false, h, t, true, true);
+        System.out.println("Distance between "+h+" and "+t+" : " + cd);
+
+    }
+
+}
diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/utils/FileTools.java b/src/org/gcube/contentmanagement/lexicalmatcher/utils/FileTools.java
new file mode 100644
index 0000000..8487309
--- /dev/null
+++ b/src/org/gcube/contentmanagement/lexicalmatcher/utils/FileTools.java
@@ -0,0 +1,89 @@
+package org.gcube.contentmanagement.lexicalmatcher.utils;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
+import java.io.Writer;
+
+import org.dom4j.Document;
+import org.dom4j.io.SAXReader;
+
+/**
+ * Small static helpers for reading and writing text and XML files.
+ */
+public class FileTools {
+
+    /**
+     * Parses the XML file at {@code xmlFilePath} and returns it serialised as a string.
+     *
+     * @param xmlFilePath path of the XML file
+     * @return the document as produced by {@code Document.asXML()}
+     * @throws Exception if the file cannot be opened or parsed
+     */
+    public static String readXMLDoc(String xmlFilePath) throws Exception {
+        FileInputStream stream = new FileInputStream(new File(xmlFilePath));
+        try {
+            // NOTE(review): the reader runs with its default configuration; if the file can come
+            // from an untrusted source, consider disabling DTDs / external entities.
+            SAXReader saxReader = new SAXReader();
+            Document document = saxReader.read(stream);
+            return document.asXML();
+        } finally {
+            // the stream was previously never closed
+            stream.close();
+        }
+    }
+
+    /**
+     * Currently an empty stub: nothing is written.
+     * NOTE(review): callers wanting the file written should use
+     * {@link #saveString(String, String, boolean, String)}; confirm whether this stub can be removed.
+     */
+    public static void saveString2File(String filename, String string2save) throws Exception {
+
+    }
+
+    /**
+     * @return true when {@code filename} exists and is readable
+     */
+    public static boolean checkInput(String filename) {
+        File file = new File(filename);
+        return file.exists() && file.canRead();
+    }
+
+    /**
+     * Tells whether {@code filename} may be written.
+     *
+     * @param filename target path
+     * @param overwrite whether an existing file may be replaced
+     * @return true when the target does not exist, or when it exists, {@code overwrite} is set
+     *         and it is a writable non-directory
+     */
+    public static boolean checkOutput(String filename, boolean overwrite) {
+        File file = new File(filename);
+        if (!file.exists()) {
+            return true;
+        }
+        if (!overwrite) {
+            return false;
+        }
+        return !file.isDirectory() && file.canWrite();
+    }
+
+    /**
+     * Reads a whole text file, terminating every line with {@code "\n"}.
+     *
+     * @param filename file to read
+     * @param encoding charset name used to decode the file
+     * @return the file content, or {@code null} when the file is missing or unreadable
+     * @throws Exception when the encoding is unsupported or reading fails; the I/O error is the cause
+     */
+    public static String loadString(String filename, String encoding) throws Exception {
+        if (!checkInput(filename)) {
+            return null;
+        }
+        FileInputStream stream = null;
+        try {
+            stream = new FileInputStream(filename);
+            BufferedReader in = new BufferedReader(new InputStreamReader(stream, encoding));
+            StringBuilder content = new StringBuilder();
+            String line;
+            while ((line = in.readLine()) != null) {
+                content.append(line).append('\n');
+            }
+            return content.toString();
+        } catch (IOException e) {
+            // UnsupportedEncodingException is an IOException and carries the same message
+            throw new Exception("The file " + filename + " is not in the correct format!", e);
+        } finally {
+            // closing the underlying stream releases the file even when decoding or reading failed
+            if (stream != null) {
+                try {
+                    stream.close();
+                } catch (IOException ignored) {
+                    // nothing useful can be done about a failed close after reading
+                }
+            }
+        }
+    }
+
+    /**
+     * Writes {@code s} to {@code filename}. Silently does nothing when
+     * {@link #checkOutput(String, boolean)} rejects the target.
+     *
+     * @param filename file to write
+     * @param s text to write
+     * @param overwrite whether an existing file may be replaced
+     * @param encoding charset name used to encode the text
+     * @throws Exception when writing fails; the I/O error is the cause
+     */
+    public static void saveString(String filename, String s, boolean overwrite, String encoding) throws Exception {
+        if (!checkOutput(filename, overwrite)) {
+            return;
+        }
+        FileOutputStream stream = null;
+        try {
+            stream = new FileOutputStream(filename);
+            Writer out = new BufferedWriter(new OutputStreamWriter(stream, encoding));
+            out.write(s);
+            // closing flushes the buffer; a failure here must reach the caller
+            out.close();
+        } catch (IOException e) {
+            throw new Exception("The system can not write in " + filename + " because:\n" + e.getMessage(), e);
+        } finally {
+            // releases the file when the writer could not be created or written; harmless otherwise
+            if (stream != null) {
+                try {
+                    stream.close();
+                } catch (IOException ignored) {
+                    // a write failure, if any, has already been reported above
+                }
+            }
+        }
+    }
+
+}
diff --git a/src/org/gcube/contentmanagement/lexicalmatcher/utils/MathFunctions.java b/src/org/gcube/contentmanagement/lexicalmatcher/utils/MathFunctions.java
new file mode 100644
index 0000000..465a5b9
--- /dev/null
+++ b/src/org/gcube/contentmanagement/lexicalmatcher/utils/MathFunctions.java
@@ -0,0 +1,99 @@
+package org.gcube.contentmanagement.lexicalmatcher.utils;
+
+import java.math.BigInteger;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Set;
+
+public class MathFunctions {
+
+    /**
+     * @param args
+     */
+    public static void main(String[] args) {
+        // TODO Auto-generated method stub
+
+    }
+
+    /**
+     * Updates a running percentage / mean with one more sample.
+     *
+     * @param perc the value accumulated so far
+     * @param quantity the new sample
+     * @param N the number of samples accumulated so far
+     * @return {@code quantity} when {@code N} is 0, otherwise
+     *         {@code (perc + quantity / N) * (N / (N + 1))}
+     */
+    public static float incrementPerc(float perc, float quantity, int N){
+
+        if (N==0)
+            return quantity;
+
+        float out = 0;
+        int N_plus_1 = N+1;
+        out = (float)((perc + ((double)quantity / (double)N )) * ((double)N/(double)N_plus_1));
+        return out;
+
+    }
+
+
+    /**
+     * Builds a list of distinct integers.
+     * <ul>
+     * <li>{@code numberOfRandoms == -1}: every integer from {@code min} (inclusive) to
+     * {@code max} (exclusive), in ascending order;</li>
+     * <li>otherwise up to {@code min(numberOfRandoms, max)} distinct random values; when that
+     * count is 0 the list holds the single value 0.</li>
+     * </ul>
+     * NOTE(review): random values are drawn as {@code (int) (max * Math.random()) + min}, i.e. from
+     * {@code [min, min + max)} rather than {@code [min, max)}, and negative draws are dropped
+     * without being replaced. This matches the previous behaviour; confirm it is what callers expect.
+     *
+     * @param numberOfRandoms how many values to draw, or -1 for the full sequence
+     * @param min offset added to every draw (lower bound of the full sequence)
+     * @param max scale of the draw (upper bound, exclusive, of the full sequence)
+     * @return the generated integers
+     */
+    public static ArrayList generateRandoms(int numberOfRandoms, int min, int max) {
+
+        ArrayList<Integer> randomsSet = new ArrayList<Integer>();
+        // if number of randoms is equal to -1 generate all numbers
+        if (numberOfRandoms == -1) {
+            for (int i = min; i < max; i++) {
+                randomsSet.add(i);
+            }
+        } else {
+            int numofrandstogenerate = Math.min(numberOfRandoms, max);
+
+            if (numofrandstogenerate == 0) {
+                randomsSet.add(0);
+            } else {
+                // mirrors the list content; gives constant-time duplicate checks instead of a list scan
+                Set<Integer> alreadyDrawn = new HashSet<Integer>();
+                for (int i = 0; i < numofrandstogenerate; i++) {
+
+                    int RNum = (int) ((max) * Math.random()) + min;
+
+                    // redraw until the value has not been generated before
+                    while (alreadyDrawn.contains(RNum)) {
+                        RNum = (int) ((max) * Math.random()) + min;
+                    }
+
+                    if (RNum >= 0) {
+                        randomsSet.add(RNum);
+                        alreadyDrawn.add(RNum);
+                    }
+                }
+
+            }
+        }
+
+        AnalysisLogger.getLogger().trace("MathFunctions-> generateRandoms " + randomsSet.toString());
+
+        return randomsSet;
+    }
+
+
+    public static int[] generateSequence(int elements) {
+        int [] sequence = new int[elements];
+        for (int i=0;i