This commit is contained in:
Gianpaolo Coro 2012-02-23 17:12:45 +00:00
parent 297676de1e
commit d95cf07f69
57 changed files with 5223 additions and 0 deletions

View File

@ -0,0 +1,117 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.core;
import java.math.BigDecimal;
import java.util.ArrayList;
/**
 * Static helpers that map database column type names and raw string values
 * onto one of three Java types: {@link String}, {@link Boolean} or
 * {@link BigDecimal}. Types are reported as fully-qualified class names.
 */
public class DataTypeRecognizer {

    // if the DB type name contains one of these tokens, the column is classified as Decimal
    private static final String[] decimalType = { "decimal", "integer", "int", "ordinal", "length", "position", "real" };
    // if the DB type name contains one of these tokens, the column is classified as Boolean
    private static final String[] booleanType = { "bool" };
    // if the DB type name contains one of these tokens, the column is classified as String
    private static final String[] stringType = { "varchar", "char", "string", "text" };

    /** Utility class: not meant to be instantiated. */
    private DataTypeRecognizer() {
    }

    /**
     * Translates a database type name into the name of the Java class used to
     * represent it.
     *
     * The string tokens are checked first, then the decimal ones, then the
     * boolean ones; the first group with a token contained in the
     * (case-insensitive) type name wins.
     *
     * @param DBType the database type name; may be {@code null}
     * @return the class name of {@link String}, {@link BigDecimal} or
     *         {@link Boolean}; {@link String} when the type is {@code null}
     *         or not recognized
     */
    public static String transformTypeFromDB(String DBType) {
        if (DBType == null) {
            return String.class.getName();
        }
        if (contains(DBType, stringType)) {
            return String.class.getName();
        }
        if (contains(DBType, decimalType)) {
            return BigDecimal.class.getName();
        }
        if (contains(DBType, booleanType)) {
            return Boolean.class.getName();
        }
        // unknown types are handled as plain strings
        return String.class.getName();
    }

    /**
     * Guesses the type of a single textual entry and returns it converted.
     *
     * @param entry the raw value; may be {@code null}
     * @return a {@link BigDecimal} when the entry parses as a finite double,
     *         a {@link Boolean} when it is "true"/"false" (any case), the
     *         entry itself otherwise; {@code null} for a {@code null} entry
     */
    public static Object guessType(String entry) {
        if (entry == null) {
            // nothing to guess: previously this ended in a NullPointerException
            return null;
        }
        try {
            // NaN and infinities parse as doubles but are rejected by
            // BigDecimal.valueOf, so they fall through like any non-number
            return BigDecimal.valueOf(Double.parseDouble(entry));
        } catch (NumberFormatException notANumber) {
            // not a decimal: try the remaining interpretations below
        }
        if (entry.equalsIgnoreCase("true") || entry.equalsIgnoreCase("false")) {
            return Boolean.valueOf(Boolean.parseBoolean(entry));
        }
        return entry;
    }

    /**
     * Tells whether the lower-cased element contains at least one of the
     * given tokens. Lower-casing uses the root locale so the result does not
     * depend on the JVM default locale (e.g. Turkish dotless i).
     */
    private static boolean contains(String element, String[] array) {
        String lowered = element.toLowerCase(java.util.Locale.ROOT);
        for (String arrayElem : array) {
            if (lowered.contains(arrayElem)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Guesses the prevailing type of a list of entries by majority vote over
     * {@link #guessType(String)}.
     *
     * Ties are resolved in the order String, Boolean, BigDecimal; an empty or
     * {@code null} list therefore yields String. {@code null} entries do not
     * vote.
     *
     * @param elementlist the entries to inspect
     * @return the class name of the most frequent guessed type
     */
    public static String guessType(ArrayList<String> elementlist) {
        // 0 = String 1 = Boolean 2 = Decimal
        int[] scores = new int[3];
        String[] types = { String.class.getName(), Boolean.class.getName(), BigDecimal.class.getName() };

        if (elementlist != null) {
            for (String element : elementlist) {
                Object guessedObj = guessType(element);
                if (guessedObj instanceof String) {
                    scores[0]++;
                } else if (guessedObj instanceof Boolean) {
                    scores[1]++;
                } else if (guessedObj instanceof BigDecimal) {
                    scores[2]++;
                }
            }
        }

        // strict comparison: the first type reaching the maximum wins
        int maxindex = 0;
        for (int i = 1; i < scores.length; i++) {
            if (scores[i] > scores[maxindex]) {
                maxindex = i;
            }
        }
        return types[maxindex];
    }

    /** Small manual check: prints the type guessed for a list of numeric strings. */
    public static void main(String[] args) throws ClassNotFoundException {
        ArrayList<String> prova = new ArrayList<String>();
        for (int i = 0; i < 5; i++) {
            prova.add("1234");
        }
        String classtype = guessType(prova);
        System.out.println(classtype);
    }
}

View File

@ -0,0 +1,350 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.core;
import java.util.ArrayList;
import java.util.HashMap;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.CategoryOrderedList;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.CategoryScores;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks.ChunkSet;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks.ReferenceChunk;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks.ReferenceChunkSet;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks.SetOfReferenceChunkSet;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks.SingletonChunkSet;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks.TimeSeriesChunk;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks.TimeSeriesChunkSet;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
import org.gcube.contentmanagement.lexicalmatcher.utils.DatabaseFactory;
import org.gcube.contentmanagement.lexicalmatcher.utils.MathFunctions;
import org.hibernate.SessionFactory;
/**
 * Compares chunks of an unknown series (or a single entry) against chunks of
 * the reference categories and keeps a ranking of the best matching
 * categories and columns.
 *
 * The ranking is exposed through three parallel lists ({@link #bestCategories},
 * {@link #bestScores}, {@link #bestColumns}) that are kept sorted by
 * descending score. In the threaded variant the ranking is updated by several
 * worker threads, so every update goes through a synchronized method.
 */
public class Engine {

    private String ConfigurationFileNameLocal = "hibernate.cfg.xml";
    private SessionFactory referenceDBSession;

    // parallel lists: element i of each list describes the i-th best match
    public ArrayList<String> bestCategories;
    public ArrayList<Double> bestScores;
    public ArrayList<String> bestColumns;
    public HashMap<String, CategoryScores> scoresTable;
    public String columnFilter;

    private LexicalEngineConfiguration config;
    private TimeSeriesChunk singletonChunk;

    // threadActivity[i] is true while the worker occupying slot i is running
    boolean threadActivity[];
    // guards every read/write of threadActivity so that flag changes made by
    // a worker are guaranteed to become visible to the waiting thread
    private final Object threadActivityLock = new Object();

    /**
     * Creates an engine whose Hibernate configuration file is looked up
     * under the given path.
     *
     * @param Config       the matching parameters
     * @param ColumnFilter column filter handed to the chunk comparison
     * @param configPath   folder containing the Hibernate configuration file
     */
    public Engine(LexicalEngineConfiguration Config, String ColumnFilter, String configPath) {
        initState(Config, ColumnFilter);
        ConfigurationFileNameLocal = configPath + "/" + ConfigurationFileNameLocal;
    }

    /**
     * Clears the ranking and installs a new configuration and column filter.
     * The configuration path is accepted for signature compatibility but the
     * configuration file location is left untouched.
     */
    public void resetEngine(LexicalEngineConfiguration Config, String ColumnFilter, String configPath) {
        initState(Config, ColumnFilter);
    }

    /** Shared initialisation of constructor and {@link #resetEngine}. */
    private void initState(LexicalEngineConfiguration newConfig, String newColumnFilter) {
        config = newConfig;
        scoresTable = new HashMap<String, CategoryScores>();
        bestCategories = new ArrayList<String>();
        bestColumns = new ArrayList<String>();
        bestScores = new ArrayList<Double>();
        columnFilter = newColumnFilter;
    }

    /**
     * @return the detailed results of the last singleton chunk processed by
     *         {@link #calcLikeThread}; throws a NullPointerException when no
     *         singleton has been processed yet
     */
    public ArrayList<SingleResult> getSingletonMatches() {
        return singletonChunk.getDetailedResults();
    }

    /**
     * @return the entry of the last singleton chunk processed by
     *         {@link #calcLikeThread}; throws a NullPointerException when no
     *         singleton has been processed yet
     */
    public String getSingletonElement() {
        return singletonChunk.getSingletonEntry();
    }

    /** Returns the session factory, creating it on first use from the local configuration file. */
    public SessionFactory getDBSession() throws Exception {
        if (referenceDBSession == null) {
            referenceDBSession = DatabaseFactory.initDBConnection(ConfigurationFileNameLocal);
        }
        return referenceDBSession;
    }

    /**
     * Returns the session factory, creating it on first use from the local
     * configuration file combined with the given external configuration.
     */
    public SessionFactory getDBSession(LexicalEngineConfiguration externalConf) throws Exception {
        if (referenceDBSession == null) {
            referenceDBSession = DatabaseFactory.initDBConnection(ConfigurationFileNameLocal, externalConf);
        }
        return referenceDBSession;
    }

    /**
     * Single-threaded comparison of every time series chunk against every
     * chunk of every reference category; the ranking is updated after each
     * reference category has been consumed.
     *
     * @param col                 the reference categories and their score table
     * @param unknownSeriesName   name of the series to classify
     * @param unknownSeriesColumn column of the series to classify
     */
    public void calcLike(CategoryOrderedList col, String unknownSeriesName, String unknownSeriesColumn) {
        scoresTable = col.getScoresTable();

        // take a time series set of chunks
        TimeSeriesChunkSet tsChunkSet = null;
        try {
            tsChunkSet = new TimeSeriesChunkSet(config.TimeSeriesChunksToTake, config.chunkSize, unknownSeriesName, unknownSeriesColumn, config, this);
        } catch (Exception e) {
            e.printStackTrace();
            AnalysisLogger.getLogger().error("Engine->calcLike-> ERROR could not retrieve time series chunks " + e.getLocalizedMessage());
        }

        // without the time series chunks there is nothing to compare
        if (tsChunkSet == null) {
            return;
        }

        // generate the set of reference chunks
        SetOfReferenceChunkSet setRefChunksSet = new SetOfReferenceChunkSet(col.getOrderedList(), config, this);
        TimeSeriesChunk tsChunk = tsChunkSet.nextChunk();
        // for all ts chunks
        while (tsChunk != null) {
            // take a set of chunks from a reference category
            ReferenceChunkSet refChunkSet = setRefChunksSet.getNextChunkSet();
            while (refChunkSet != null) {
                // take a chunk in the reference chunk set
                ReferenceChunk refChunk = refChunkSet.nextChunk();
                while (refChunk != null) {
                    try {
                        tsChunk.compareToReferenceChunk(scoresTable, refChunk);
                    } catch (Exception e) {
                        e.printStackTrace();
                        AnalysisLogger.getLogger().error("Engine->calcLike-> ERROR could not compare time series chunk with reference chunk " + e.getLocalizedMessage());
                    }
                    // take another chunk in the reference chunk set
                    refChunk = refChunkSet.nextChunk();
                }
                // check score
                UpdateScores(refChunkSet.getSeriesName(), false);
                // take another set of chunks from another reference category
                refChunkSet = setRefChunksSet.getNextChunkSet();
            }
            tsChunk = tsChunkSet.nextChunk();
        }
    }

    /** Reads the busy flag of a worker slot under the flag lock. */
    private boolean isThreadBusy(int index) {
        synchronized (threadActivityLock) {
            return threadActivity[index];
        }
    }

    /** Writes the busy flag of a worker slot under the flag lock. */
    private void setThreadBusy(int index, boolean busy) {
        synchronized (threadActivityLock) {
            threadActivity[index] = busy;
        }
    }

    /**
     * Blocks until the worker slot is free, polling every 10 ms. An
     * interruption does not abort the wait (callers rely on the slot being
     * free afterwards) but the interrupt status is restored on exit.
     */
    private void wait4Thread(int index) {
        boolean interrupted = false;
        while (isThreadBusy(index)) {
            try {
                Thread.sleep(10);
            } catch (InterruptedException e) {
                // re-interrupting here would make every following sleep fail
                // immediately, so remember it and restore it after the wait
                interrupted = true;
            }
        }
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }

    /** Marks the slot as busy and starts a worker comparing the chunk with the reference set. */
    private void startNewTCalc(TimeSeriesChunk tsChunk, ReferenceChunkSet refChunkSet, int index) {
        setThreadBusy(index, true);
        ThreadCalculator tc = new ThreadCalculator(tsChunk, refChunkSet, index);
        Thread t = new Thread(tc);
        t.start();
    }

    /**
     * Multi-threaded comparison: each reference category is handed to a
     * worker thread, using the worker slots in round-robin order. When a
     * singleton string is given, only that entry is matched and the chunk is
     * recorded for {@link #getSingletonMatches()}.
     *
     * @param col                 the reference categories and their score table
     * @param unknownSeriesName   name of the series to classify
     * @param unknownSeriesColumn column of the series to classify
     * @param singletonString     single entry to match, or {@code null} to
     *                            classify the series
     */
    public void calcLikeThread(CategoryOrderedList col, String unknownSeriesName, String unknownSeriesColumn, String singletonString) {
        scoresTable = col.getScoresTable();

        // take a time series set of chunks
        ChunkSet tsChunkSet = null;
        int[] currentThreads = MathFunctions.generateSequence(config.numberOfThreadsToUse);
        int currentThread = 0;
        // a fresh boolean array is all false: every slot starts free
        threadActivity = new boolean[currentThreads.length];

        try {
            if (singletonString == null) {
                tsChunkSet = new TimeSeriesChunkSet(config.TimeSeriesChunksToTake, config.chunkSize, unknownSeriesName, unknownSeriesColumn, config, this);
            } else {
                tsChunkSet = new SingletonChunkSet(singletonString, config, this);
            }
        } catch (Exception e) {
            e.printStackTrace();
            AnalysisLogger.getLogger().error("Engine->calcLike-> ERROR could not retrieve time series chunks " + e.getLocalizedMessage());
        }

        // without the time series chunks there is nothing to compare
        if (tsChunkSet == null) {
            return;
        }

        // generate the set of reference chunks
        SetOfReferenceChunkSet setRefChunksSet = new SetOfReferenceChunkSet(col.getOrderedList(), config, this);
        TimeSeriesChunk tsChunk = (TimeSeriesChunk) tsChunkSet.nextChunk();
        // the logged flag now agrees with the message text
        AnalysisLogger.getLogger().debug("tsChunk is null " + (tsChunk == null));

        // for all ts chunks
        while (tsChunk != null) {
            // take a set of chunks from a reference category
            ReferenceChunkSet refChunkSet = setRefChunksSet.getNextChunkSet();
            while (refChunkSet != null) {
                wait4Thread(currentThreads[currentThread]);
                startNewTCalc(tsChunk, refChunkSet, currentThreads[currentThread]);
                // take another set of chunks from another reference category
                refChunkSet = setRefChunksSet.getNextChunkSet();
                currentThread++;
                if (currentThread >= currentThreads.length) {
                    currentThread = 0;
                }
            }
            // if the chunk is a singleton, don't process other and record the result
            if (tsChunk.isSingleton()) {
                singletonChunk = tsChunk;
                break;
            }
            tsChunk = (TimeSeriesChunk) tsChunkSet.nextChunk();
        }

        // wait for last threads to finish
        for (int i : currentThreads) {
            wait4Thread(i);
        }
    }

    /**
     * Compares one time series chunk with all chunks of a reference category
     * (stopping early when the chunk asks for it) and then updates the
     * ranking for that category. Runs on the worker threads.
     */
    private void makeComparisonsTSChunk2RefChunks(TimeSeriesChunk tsChunk, ReferenceChunkSet refChunkSet) {
        // take a chunk in the reference chunk set
        ReferenceChunk refChunk = refChunkSet.nextChunk();
        while (refChunk != null) {
            try {
                tsChunk.compareToReferenceChunk(scoresTable, refChunk, columnFilter);
            } catch (Exception e) {
                e.printStackTrace();
                AnalysisLogger.getLogger().error("Engine->calcLike-> ERROR could not compare time series chunk with reference chunk " + e.getLocalizedMessage());
            }
            // if the TimeSeries chunk states the processing must be interrupted, don't perform other comparisons
            if (tsChunk.mustInterruptProcess()) {
                break;
            }
            // take another chunk in the reference chunk set
            refChunk = refChunkSet.nextChunk();
        }
        // check score
        UpdateScores(refChunkSet.getSeriesName(), tsChunk.isSingleton());
    }

    /**
     * Inserts the best column of a category into the ranking when its score
     * exceeds the discard threshold, followed by the other columns scoring
     * close to it.
     *
     * Synchronized because worker threads call it concurrently and the three
     * ranking lists must be modified together.
     */
    private synchronized void UpdateScores(String categoryName, boolean singletonMatch) {
        CategoryScores categoryScore = scoresTable.get(categoryName);
        ArrayList<String> bestCols = categoryScore.findBestList();
        String bestColumn = null;
        double score = 0;
        if (bestCols.size() > 0) {
            bestColumn = bestCols.get(0);
            score = categoryScore.getScore(bestColumn, singletonMatch);
        }

        AnalysisLogger.getLogger().trace("Engine->UpdateScores-> \tBEST SUITABLE COLUMN IS: " + bestColumn);
        AnalysisLogger.getLogger().trace("Engine->UpdateScores-> \tBEST SCORE IS: " + score);

        // order this column
        if (score > config.categoryDiscardThreshold) {
            int index = rankingPosition(score);
            bestCategories.add(index, categoryName);
            bestScores.add(index, score);
            bestColumns.add(index, bestColumn);
            checkAndAddColumns(categoryScore, bestCols, categoryName, singletonMatch);
        }
    }

    /** Position at which a score must be inserted to keep the ranking in descending order. */
    private int rankingPosition(double score) {
        int index = 0;
        for (Double dscore : bestScores) {
            if (dscore.doubleValue() < score) {
                break;
            }
            index++;
        }
        return index;
    }

    /**
     * Adds to the ranking every further column of the category whose score is
     * positive and at least half of the best column's score. Must be called
     * while holding the engine lock (see {@link #UpdateScores}).
     */
    private void checkAndAddColumns(CategoryScores scores, ArrayList<String> bestCols, String categoryName, boolean singletonMatch) {
        if (bestCols.isEmpty()) {
            // no best column to compare against (possible when the discard
            // threshold is negative): nothing else to add
            return;
        }
        int size = bestCols.size();
        double bestScore = scores.getScore(bestCols.get(0), singletonMatch);
        for (int i = 1; i < size; i++) {
            // take the i-th column
            String column = bestCols.get(i);
            if (column != null) {
                // check the score
                double score = scores.getScore(column, singletonMatch);
                // if the score is near the best, add the column
                if ((score > 0) && (score >= (bestScore - 0.5 * bestScore))) {
                    int index = rankingPosition(score);
                    bestColumns.add(index, column);
                    bestScores.add(index, score);
                    bestCategories.add(index, categoryName);
                }
            }
        }
    }

    /** Worker comparing one time series chunk with one reference category. */
    private class ThreadCalculator implements Runnable {
        TimeSeriesChunk tsChunk;
        ReferenceChunkSet refChunksSet;
        int index;

        public ThreadCalculator(TimeSeriesChunk tsChunk, ReferenceChunkSet refChunksSet, int index) {
            this.tsChunk = tsChunk;
            this.refChunksSet = refChunksSet;
            this.index = index;
        }

        public void run() {
            try {
                makeComparisonsTSChunk2RefChunks(tsChunk, refChunksSet);
            } finally {
                // always free the slot, otherwise a failure in the comparison
                // would leave wait4Thread spinning forever
                setThreadBusy(index, false);
            }
        }
    }
}

View File

@ -0,0 +1,322 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.core;
import java.io.FileInputStream;
import java.util.Properties;
/**
 * Holder of the lexical matcher parameters: matching thresholds, chunking
 * options, database connection settings and reference table description.
 *
 * Every parameter starts in an "unset" state so that {@link #mergeConfig}
 * can overlay only the values that were explicitly given. The unset markers
 * are {@code -Float.MIN_VALUE} for floats, {@code -Integer.MIN_VALUE} for
 * ints (which, because of two's complement overflow, is
 * {@code Integer.MIN_VALUE} itself) and {@code null} for objects.
 */
public class LexicalEngineConfiguration {

    // markers of a parameter that has never been assigned
    private static final float UNSET_FLOAT = -Float.MIN_VALUE;
    private static final int UNSET_INT = -Integer.MIN_VALUE;

    public float categoryDiscardThreshold = UNSET_FLOAT;
    public float entryAcceptanceThreshold = UNSET_FLOAT;
    public float categoryDiscardDifferencialThreshold = UNSET_FLOAT;
    public float singleEntryRecognitionMaxDeviation = UNSET_FLOAT;
    public int chunkSize = UNSET_INT;
    public Boolean randomTake = null;
    // if set to -1 all chunks will be analyzed
    public int TimeSeriesChunksToTake = UNSET_INT;
    public int ReferenceChunksToTake = UNSET_INT;
    public Boolean useSimpleDistance = null;
    public int numberOfThreadsToUse = UNSET_INT;

    // database parameters
    public String databaseDriver = null;
    public String databaseURL = null;
    public String databaseUserName = null;
    public String databasePassword = null;
    public String databaseDialect = null;
    public String databaseIdleConnectionTestPeriod = null;
    public String databaseAutomaticTestTable = null;

    // reference data parameters
    public String referenceTable = null;
    public String referenceColumn = null;
    public String idColumn = null;
    public String nameHuman = null;
    public String description = null;

    /**
     * Loads the matching parameters from a Java properties file. Database
     * and reference table parameters are not read by this method.
     *
     * @param absoluteFilePath location of the properties file
     * @throws Exception when the file cannot be read or a numeric property
     *                   is missing or malformed
     */
    public void configure(String absoluteFilePath) throws Exception {
        Properties props = new Properties();
        FileInputStream fis = new FileInputStream(absoluteFilePath);
        try {
            props.load(fis);
        } finally {
            // release the file handle even when loading fails
            fis.close();
        }
        categoryDiscardThreshold = Float.parseFloat(props.getProperty("categoryDiscardThreshold"));
        // the threshold is a float: accept fractional values as well as integers
        entryAcceptanceThreshold = Float.parseFloat(props.getProperty("entryAcceptanceThreshold"));
        chunkSize = Integer.parseInt(props.getProperty("chunkSize"));
        TimeSeriesChunksToTake = Integer.parseInt(props.getProperty("timeSeriesChunksToTake"));
        ReferenceChunksToTake = Integer.parseInt(props.getProperty("referenceChunksToTake"));
        randomTake = Boolean.parseBoolean(props.getProperty("randomTake"));
        useSimpleDistance = Boolean.parseBoolean(props.getProperty("useSimpleDistance"));
        numberOfThreadsToUse = Integer.parseInt(props.getProperty("numberOfThreadsToUse"));
        categoryDiscardDifferencialThreshold = Float.parseFloat(props.getProperty("categoryDiscardDifferencialThreshold"));
        singleEntryRecognitionMaxDeviation = Float.parseFloat(props.getProperty("singleEntryRecognitionMaxDeviation"));
    }

    /**
     * Overlays on this configuration every parameter that is set in the
     * given one; parameters left unset there keep their current value.
     *
     * @param config the overriding configuration; {@code null} is ignored
     */
    public void mergeConfig(LexicalEngineConfiguration config) {
        if (config == null) {
            return;
        }
        if (config.getCategoryDiscardDifferencialThreshold() != UNSET_FLOAT)
            setCategoryDiscardDifferencialThreshold(config.getCategoryDiscardDifferencialThreshold());
        if (config.getSingleEntryRecognitionMaxDeviation() != UNSET_FLOAT)
            setSingleEntryRecognitionMaxDeviation(config.getSingleEntryRecognitionMaxDeviation());
        if (config.getCategoryDiscardThreshold() != UNSET_FLOAT)
            setCategoryDiscardThreshold(config.getCategoryDiscardThreshold());
        if (config.getChunkSize() != UNSET_INT)
            setChunkSize(config.getChunkSize());
        if (config.getEntryAcceptanceThreshold() != UNSET_FLOAT)
            setEntryAcceptanceThreshold(config.getEntryAcceptanceThreshold());
        if (config.getNumberOfThreadsToUse() != UNSET_INT)
            setNumberOfThreadsToUse(config.getNumberOfThreadsToUse());
        if (config.getReferenceChunksToTake() != UNSET_INT)
            setReferenceChunksToTake(config.getReferenceChunksToTake());
        if (config.getTimeSeriesChunksToTake() != UNSET_INT)
            setTimeSeriesChunksToTake(config.getTimeSeriesChunksToTake());
        if (config.randomTake != null)
            setRandomTake(config.isRandomTake());
        if (config.useSimpleDistance != null)
            setUseSimpleDistance(config.isUseSimpleDistance());

        // database information merge
        if (config.databaseDriver != null)
            setDatabaseDriver(config.databaseDriver);
        if (config.databaseDialect != null)
            setDatabaseDialect(config.databaseDialect);
        if (config.databaseAutomaticTestTable != null)
            setDatabaseAutomaticTestTable(config.databaseAutomaticTestTable);
        if (config.databaseIdleConnectionTestPeriod != null)
            setDatabaseIdleConnectionTestPeriod(config.databaseIdleConnectionTestPeriod);
        if (config.databaseUserName != null)
            setDatabaseUserName(config.databaseUserName);
        if (config.databasePassword != null)
            setDatabasePassword(config.databasePassword);
        if (config.databaseURL != null)
            setDatabaseURL(config.databaseURL);

        // reference data merge
        if (config.referenceTable != null)
            setReferenceTable(config.referenceTable);
        if (config.referenceColumn != null)
            setReferenceColumn(config.referenceColumn);
        if (config.idColumn != null)
            setIdColumn(config.idColumn);
        if (config.nameHuman != null)
            setNameHuman(config.nameHuman);
        if (config.description != null)
            setDescription(config.description);
    }

    public void setCategoryDiscardThreshold(float categoryDiscardThreshold) {
        this.categoryDiscardThreshold = categoryDiscardThreshold;
    }

    public float getCategoryDiscardThreshold() {
        return categoryDiscardThreshold;
    }

    public void setEntryAcceptanceThreshold(float entryAcceptanceThreshold) {
        this.entryAcceptanceThreshold = entryAcceptanceThreshold;
    }

    public float getEntryAcceptanceThreshold() {
        return entryAcceptanceThreshold;
    }

    public void setCategoryDiscardDifferencialThreshold(float categoryDiscardDifferencialThreshold) {
        this.categoryDiscardDifferencialThreshold = categoryDiscardDifferencialThreshold;
    }

    public float getCategoryDiscardDifferencialThreshold() {
        return categoryDiscardDifferencialThreshold;
    }

    public void setChunkSize(int chunkSize) {
        this.chunkSize = chunkSize;
    }

    public int getChunkSize() {
        return chunkSize;
    }

    public void setRandomTake(boolean randomTake) {
        this.randomTake = randomTake;
    }

    /** @return the random-take flag; {@code false} while the flag is unset */
    public boolean isRandomTake() {
        // the field is a nullable Boolean: unboxing it directly would throw
        return randomTake != null && randomTake.booleanValue();
    }

    public void setTimeSeriesChunksToTake(int timeSeriesChunksToTake) {
        TimeSeriesChunksToTake = timeSeriesChunksToTake;
    }

    public int getTimeSeriesChunksToTake() {
        return TimeSeriesChunksToTake;
    }

    public void setReferenceChunksToTake(int referenceChunksToTake) {
        ReferenceChunksToTake = referenceChunksToTake;
    }

    public int getReferenceChunksToTake() {
        return ReferenceChunksToTake;
    }

    public void setUseSimpleDistance(boolean useSimpleDistance) {
        this.useSimpleDistance = useSimpleDistance;
    }

    /** @return the simple-distance flag; {@code false} while the flag is unset */
    public boolean isUseSimpleDistance() {
        // the field is a nullable Boolean: unboxing it directly would throw
        return useSimpleDistance != null && useSimpleDistance.booleanValue();
    }

    public void setNumberOfThreadsToUse(int numberOfThreadsToUse) {
        this.numberOfThreadsToUse = numberOfThreadsToUse;
    }

    public int getNumberOfThreadsToUse() {
        return numberOfThreadsToUse;
    }

    public void setSingleEntryRecognitionMaxDeviation(float singleEntryRecognitionMaxDeviation) {
        this.singleEntryRecognitionMaxDeviation = singleEntryRecognitionMaxDeviation;
    }

    public float getSingleEntryRecognitionMaxDeviation() {
        return singleEntryRecognitionMaxDeviation;
    }

    public void setDatabaseDriver(String databaseDriver) {
        this.databaseDriver = databaseDriver;
    }

    public String getDatabaseDriver() {
        return databaseDriver;
    }

    public void setDatabaseURL(String databaseURL) {
        this.databaseURL = databaseURL;
    }

    public String getDatabaseURL() {
        return databaseURL;
    }

    public void setDatabaseUserName(String databaseUserName) {
        this.databaseUserName = databaseUserName;
    }

    public String getDatabaseUserName() {
        return databaseUserName;
    }

    public void setDatabasePassword(String databasePassword) {
        this.databasePassword = databasePassword;
    }

    public String getDatabasePassword() {
        return databasePassword;
    }

    public void setDatabaseDialect(String databaseDialect) {
        this.databaseDialect = databaseDialect;
    }

    public String getDatabaseDialect() {
        return databaseDialect;
    }

    public void setDatabaseIdleConnectionTestPeriod(String databaseIdleConnectionTestPeriod) {
        this.databaseIdleConnectionTestPeriod = databaseIdleConnectionTestPeriod;
    }

    public String getDatabaseIdleConnectionTestPeriod() {
        return databaseIdleConnectionTestPeriod;
    }

    public void setDatabaseAutomaticTestTable(String databaseAutomaticTestTable) {
        this.databaseAutomaticTestTable = databaseAutomaticTestTable;
    }

    public String getDatabaseAutomaticTestTable() {
        return databaseAutomaticTestTable;
    }

    public String getReferenceTable() {
        return referenceTable;
    }

    public void setReferenceTable(String referenceTable) {
        this.referenceTable = referenceTable;
    }

    public String getReferenceColumn() {
        return referenceColumn;
    }

    public void setReferenceColumn(String referenceColumn) {
        this.referenceColumn = referenceColumn;
    }

    public String getIdColumn() {
        return idColumn;
    }

    public void setIdColumn(String idColumn) {
        this.idColumn = idColumn;
    }

    public String getNameHuman() {
        return nameHuman;
    }

    public void setNameHuman(String nameHuman) {
        this.nameHuman = nameHuman;
    }

    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }
}

View File

@ -0,0 +1,32 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.examples;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Example: runs the accuracy calculation of the category guesser on column
 * "field1" of an imported series, expecting family SPECIES and column
 * SCIENTIFIC_NAME.
 */
public class Example1_Species {

    // location of the configuration files
    private static final String CONFIG_PATH = ".";
    // series and column under test
    private static final String SERIES_NAME = "import_2c97f580_35a0_11df_b8b3_aa10916debe6";
    private static final String SERIES_COLUMN = "field1";
    // expected classification
    private static final String EXPECTED_FAMILY = "SPECIES";
    private static final String EXPECTED_COLUMN = "SCIENTIFIC_NAME";
    // number of guessing attempts
    private static final int ATTEMPTS = 1;

    public static void main(String[] args) {
        try {
            CategoryGuesser categoryGuesser = new CategoryGuesser(CONFIG_PATH);

            // bench 1
            AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------");
            CategoryGuesser.AccuracyCalc(categoryGuesser, CONFIG_PATH, SERIES_NAME, SERIES_COLUMN, ATTEMPTS, EXPECTED_FAMILY, EXPECTED_COLUMN);
            AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n");
        } catch (Exception failure) {
            failure.printStackTrace();
        }
    }
}

View File

@ -0,0 +1,32 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.examples;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Example: runs the accuracy calculation of the category guesser on column
 * "field3" of an imported series, expecting family AREA and column NAME_EN.
 */
public class Example2_Area {

    // location of the configuration files
    private static final String CONFIG_PATH = ".";
    // series and column under test
    private static final String SERIES_NAME = "import_2c97f580_35a0_11df_b8b3_aa10916debe6";
    private static final String SERIES_COLUMN = "field3";
    // expected classification
    private static final String EXPECTED_FAMILY = "AREA";
    private static final String EXPECTED_COLUMN = "NAME_EN";
    // number of guessing attempts
    private static final int ATTEMPTS = 1;

    public static void main(String[] args) {
        try {
            CategoryGuesser categoryGuesser = new CategoryGuesser(CONFIG_PATH);

            // bench 1
            AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------");
            CategoryGuesser.AccuracyCalc(categoryGuesser, CONFIG_PATH, SERIES_NAME, SERIES_COLUMN, ATTEMPTS, EXPECTED_FAMILY, EXPECTED_COLUMN);
            AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n");
        } catch (Exception failure) {
            failure.printStackTrace();
        }
    }
}

View File

@ -0,0 +1,48 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.examples;
import java.util.ArrayList;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Example: matches the single entry "shark" against column "name_en" of the
 * "species" family and shows the detailed matches.
 */
public class Example3_SingleMatchShark {

    // location of the configuration files
    private static final String CONFIG_PATH = ".";
    // entry to look up and where to look for it
    private static final String ENTRY = "shark";
    private static final String FAMILY = "species";
    private static final String FAMILY_COLUMN = "name_en";

    /** Builds the matching parameters used by this example. */
    private static LexicalEngineConfiguration matchingParameters() {
        LexicalEngineConfiguration parameters = new LexicalEngineConfiguration();
        // CHANGE THIS TO ENHANCE THE RECALL
        parameters.setEntryAcceptanceThreshold(30);
        parameters.setReferenceChunksToTake(-1);
        parameters.setTimeSeriesChunksToTake(-1);
        parameters.setUseSimpleDistance(false);
        return parameters;
    }

    public static void main(String[] args) {
        try {
            CategoryGuesser categoryGuesser = new CategoryGuesser(CONFIG_PATH);

            // bench 1
            AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------");
            categoryGuesser.runGuesser(CONFIG_PATH, ENTRY, matchingParameters(), FAMILY, FAMILY_COLUMN);

            ArrayList<SingleResult> matches = categoryGuesser.getDetailedMatches();
            AnalysisLogger.getLogger().warn("Detailed Match on Name :" + ENTRY);
            CategoryGuesser.showResults(matches);
            AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n");
        } catch (Exception failure) {
            failure.printStackTrace();
        }
    }
}

View File

@ -0,0 +1,49 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.examples;
import java.util.ArrayList;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Example: matches the single entry "Mitella pollicipes" against column
 * "scientific_name" of the "species" family and shows the detailed matches.
 */
public class Example4_SingleMatchMitella {

    // location of the configuration files
    private static final String CONFIG_PATH = ".";
    // entry to look up and where to look for it
    private static final String ENTRY = "Mitella pollicipes";
    private static final String FAMILY = "species";
    private static final String FAMILY_COLUMN = "scientific_name";

    /** Builds the matching parameters used by this example. */
    private static LexicalEngineConfiguration matchingParameters() {
        LexicalEngineConfiguration parameters = new LexicalEngineConfiguration();
        // CHANGE THIS TO ENHANCE THE RECALL
        parameters.setEntryAcceptanceThreshold(30);
        parameters.setReferenceChunksToTake(-1);
        parameters.setTimeSeriesChunksToTake(-1);
        parameters.setUseSimpleDistance(false);
        return parameters;
    }

    public static void main(String[] args) {
        try {
            CategoryGuesser categoryGuesser = new CategoryGuesser(CONFIG_PATH);

            // bench 1
            AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------");
            categoryGuesser.runGuesser(CONFIG_PATH, ENTRY, matchingParameters(), FAMILY, FAMILY_COLUMN);

            ArrayList<SingleResult> matches = categoryGuesser.getDetailedMatches();
            AnalysisLogger.getLogger().warn("Detailed Match on Name :" + ENTRY);
            CategoryGuesser.showResults(matches);
            AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n");
        } catch (Exception failure) {
            failure.printStackTrace();
        }
    }
}

View File

@ -0,0 +1,48 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.examples;
import java.util.ArrayList;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Example: matches the single entry "Mirella policepes" against column
 * "scientific_name" of the "species" family and shows the detailed matches.
 */
public class Example5_SingleMatchMitella {

    // location of the configuration files
    private static final String CONFIG_PATH = ".";
    // entry to look up and where to look for it
    private static final String ENTRY = "Mirella policepes";
    private static final String FAMILY = "species";
    private static final String FAMILY_COLUMN = "scientific_name";

    /** Builds the matching parameters used by this example. */
    private static LexicalEngineConfiguration matchingParameters() {
        LexicalEngineConfiguration parameters = new LexicalEngineConfiguration();
        // CHANGE THIS TO ENHANCE THE RECALL
        parameters.setEntryAcceptanceThreshold(30);
        parameters.setReferenceChunksToTake(-1);
        parameters.setTimeSeriesChunksToTake(-1);
        parameters.setUseSimpleDistance(false);
        return parameters;
    }

    public static void main(String[] args) {
        try {
            CategoryGuesser categoryGuesser = new CategoryGuesser(CONFIG_PATH);

            // bench 1
            AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------");
            categoryGuesser.runGuesser(CONFIG_PATH, ENTRY, matchingParameters(), FAMILY, FAMILY_COLUMN);

            ArrayList<SingleResult> matches = categoryGuesser.getDetailedMatches();
            AnalysisLogger.getLogger().warn("Detailed Match on Name :" + ENTRY);
            CategoryGuesser.showResults(matches);
            AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n");
        } catch (Exception failure) {
            failure.printStackTrace();
        }
    }
}

View File

@ -0,0 +1,64 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.examples;
import java.util.ArrayList;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
/**
 * Example: classifies column "field2" of an imported series using a
 * configuration built entirely in code (matching, database and reference
 * table parameters) instead of one read from file.
 */
public class ExampleGuessingExternalCfg {

    // location of the configuration files
    private static final String CONFIG_PATH = "./";
    // series and column to classify
    private static final String SERIES_NAME = "import_532bba80_1c8f_11df_a4ee_87804054691e";
    private static final String SERIES_COLUMN = "field2";

    /** Sets the thresholds and chunking options of the matcher. */
    private static void applyMatchingParameters(LexicalEngineConfiguration parameters) {
        parameters.setCategoryDiscardDifferencialThreshold(10);
        parameters.setCategoryDiscardThreshold(0);
        parameters.setChunkSize(25);
        parameters.setEntryAcceptanceThreshold(50);
        parameters.setNumberOfThreadsToUse(2);
        parameters.setRandomTake(true);
        parameters.setReferenceChunksToTake(20);
        parameters.setTimeSeriesChunksToTake(1);
        parameters.setUseSimpleDistance(false);
    }

    /** Sets the database connection parameters. */
    private static void applyDatabaseParameters(LexicalEngineConfiguration parameters) {
        // NOTE(review): credentials are hard-coded in source; consider
        // supplying them from outside the code base
        parameters.setDatabaseUserName("root");
        parameters.setDatabasePassword("ash_ash80");
        parameters.setDatabaseDriver("com.mysql.jdbc.Driver");
        parameters.setDatabaseURL("jdbc:mysql://localhost/timeseries");
        parameters.setDatabaseDialect("org.hibernate.dialect.MySQLDialect");
        parameters.setDatabaseAutomaticTestTable("connectiontesttable");
        parameters.setDatabaseIdleConnectionTestPeriod("3600");
    }

    /** Sets the names of the reference table and of its columns. */
    private static void applyReferenceParameters(LexicalEngineConfiguration parameters) {
        parameters.setReferenceTable("reference_table");
        parameters.setReferenceColumn("table_name");
        parameters.setIdColumn("id");
        parameters.setNameHuman("name_human");
        parameters.setDescription("description");
    }

    public static void main(String[] args) {
        try {
            CategoryGuesser categoryGuesser = new CategoryGuesser(CONFIG_PATH);

            // bench 1
            System.out.println("----------------------BENCH 1-------------------------");

            LexicalEngineConfiguration parameters = new LexicalEngineConfiguration();
            applyMatchingParameters(parameters);
            applyDatabaseParameters(parameters);
            applyReferenceParameters(parameters);

            categoryGuesser.init(parameters);
            categoryGuesser.runGuesser(SERIES_NAME, SERIES_COLUMN, parameters);

            ArrayList<SingleResult> classification = categoryGuesser.getClassification();
            CategoryGuesser.showResults(classification);
            System.out.println("--------------------END BENCH 1-----------------------\n");
        } catch (Exception failure) {
            failure.printStackTrace();
        }
    }
}

View File

@ -0,0 +1,71 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data;
import java.math.BigInteger;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.interfaces.Reference;
/**
 * A reference category: a name, an index, the table holding its data, a
 * description and the number of elements it contains. The number of
 * elements is not set by the constructor and stays {@code null} until
 * {@link #setNumberOfElements(BigInteger)} is called.
 */
public class Category implements Reference {

    private String categoryName;
    private String categoryIndex;
    private String tableName;
    private String description;
    private BigInteger numberOfElements;

    /**
     * @param name      the category name
     * @param index     the category index
     * @param tablename the table holding the category data
     * @param descr     a description of the category
     */
    public Category(String name, String index, String tablename, String descr) {
        this.categoryName = name;
        this.categoryIndex = index;
        this.tableName = tablename;
        this.description = descr;
    }

    /** Empty entry point kept from the original class. */
    public static void main(String[] args) {
        // TODO Auto-generated method stub
    }

    public String getName() {
        return this.categoryName;
    }

    public void setName(String categoryName) {
        this.categoryName = categoryName;
    }

    public String getIndex() {
        return this.categoryIndex;
    }

    public void setIndex(String categoryIndex) {
        this.categoryIndex = categoryIndex;
    }

    public String getTableName() {
        return this.tableName;
    }

    public void setTableName(String tableName) {
        this.tableName = tableName;
    }

    public String getDescription() {
        return this.description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    public BigInteger getNumberOfElements() {
        return this.numberOfElements;
    }

    public void setNumberOfElements(BigInteger numberOfElements) {
        this.numberOfElements = numberOfElements;
    }

    /** Renders name, index, table and description inside square brackets. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("[").append(categoryName);
        text.append(": index ").append(categoryIndex);
        text.append(" table ").append(tableName);
        text.append(" description ").append(description);
        text.append("]");
        return text.toString();
    }
}

View File

@ -0,0 +1,79 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.HashMap;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.interfaces.Reference;
/**
 * Holds reference categories sorted by decreasing number of elements, plus a
 * lookup table by category name and a score holder for every category.
 */
public class CategoryOrderedList {

    // list kept sorted in descending order (by number of elements)
    ArrayList<Reference> orderedList;
    // category name -> category
    HashMap<String, Reference> orderedListTable;
    // category name -> scores collected for that category
    private HashMap<String, CategoryScores> scoresTable;
    LexicalEngineConfiguration config;

    /**
     * Creates an empty list bound to the given configuration.
     *
     * @param Config configuration handed to every {@link CategoryScores} created by this list
     */
    public CategoryOrderedList(LexicalEngineConfiguration Config) {
        this.orderedList = new ArrayList<Reference>();
        this.scoresTable = new HashMap<String, CategoryScores>();
        this.config = Config;
        this.orderedListTable = new HashMap<String, Reference>();
    }

    public ArrayList<Reference> getOrderedList() {
        return this.orderedList;
    }

    public void setOrderedList(ArrayList<Reference> OrderedList) {
        this.orderedList = OrderedList;
    }

    public HashMap<String, CategoryScores> getScoresTable() {
        return this.scoresTable;
    }

    public void setCategoryTable(HashMap<String, Reference> OrderedListTable) {
        this.orderedListTable = OrderedListTable;
    }

    /** Returns the category registered under the given name, or {@code null} if there is none. */
    public Reference getCategory(String categoryName) {
        return this.orderedListTable.get(categoryName);
    }

    /**
     * Inserts the category before the first stored category that has strictly
     * fewer elements, and registers a fresh score holder and a name lookup for it.
     *
     * @param c category to add; its element count must already be set
     */
    public void addCategory(Category c) {
        BigInteger incoming = c.getNumberOfElements();
        int position = 0;
        // advance past every category that is at least as large as the new one
        while (position < orderedList.size()
                && orderedList.get(position).getNumberOfElements().compareTo(incoming) >= 0) {
            position++;
        }
        orderedList.add(position, c);

        String name = c.getName();
        scoresTable.put(name, new CategoryScores(c.getNumberOfElements(), config));
        orderedListTable.put(name, c);
    }

    /**
     * Creates a list that shares this list's ordered categories and lookup table
     * (same instances, not copies) but owns brand-new, empty score holders.
     *
     * @return the new list
     */
    public CategoryOrderedList generateNovelList() {
        CategoryOrderedList fresh = new CategoryOrderedList(config);
        fresh.setOrderedList(orderedList);
        fresh.setCategoryTable(orderedListTable);
        for (String categoryName : scoresTable.keySet()) {
            BigInteger elements = scoresTable.get(categoryName).getCategoryElements();
            fresh.scoresTable.put(categoryName, new CategoryScores(elements, config));
        }
        return fresh;
    }
}

View File

@ -0,0 +1,205 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.HashMap;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
import org.gcube.contentmanagement.lexicalmatcher.utils.MathFunctions;
//score relative to a certain category and column
public class CategoryScores {
// column names vs percentage
private HashMap<String, Float> columnsScore;
private int matchedElements;
private BigInteger maxElements;
private BigInteger categoryElements;
private LexicalEngineConfiguration config;
public CategoryScores(BigInteger catElements, LexicalEngineConfiguration Config) {
columnsScore = new HashMap<String, Float>();
matchedElements = 0;
setCategoryElements(catElements);
config = Config;
maxElements = calculateMaxElements(catElements);
}
public double calculateCoverage(){
double bd = new BigDecimal(matchedElements).divide(new BigDecimal(maxElements), 2, BigDecimal.ROUND_FLOOR).doubleValue();
//lower poor categories
if (maxElements.compareTo(BigInteger.valueOf(config.chunkSize))<=0)
bd = bd *0.8;
//To-DO take into observation!!!
//higher very big set coverage
if (categoryElements.compareTo(BigInteger.valueOf(10000))>0)
bd = Math.max(0.01, bd);
return bd;
}
private BigInteger calculateMaxElements(BigInteger catElements){
BigInteger maxElements = BigInteger.ZERO;
int maxNumberOfChunks = config.ReferenceChunksToTake;
int chunkSize = config.chunkSize;
int numberofcycles=0;
if (maxNumberOfChunks<0)
return catElements;
try{
BigDecimal intcycles;
BigDecimal oddcycles;
BigDecimal catElementsDecimal = new BigDecimal(catElements);
BigDecimal[] arraydecimal = catElementsDecimal.divideAndRemainder(new BigDecimal(BigInteger.valueOf(chunkSize)));
intcycles = arraydecimal[0];
oddcycles = arraydecimal[1];
numberofcycles = intcycles.intValue();
if ((numberofcycles==0)&&(oddcycles.intValue() > 0)) {
numberofcycles = numberofcycles + 1;
maxElements = oddcycles.toBigInteger();
}
else{
if (numberofcycles>maxNumberOfChunks)
numberofcycles = maxNumberOfChunks;
maxElements = BigInteger.valueOf(chunkSize).multiply(BigInteger.valueOf(numberofcycles));
}
}catch(Exception e){}
return maxElements;
}
public String showScores(){
return columnsScore.toString()+":"+calculateCoverage(); //+" - "+matchedElements+" vs "+maxElements;
}
public void incrementScore(String columnName,float increment,boolean doIncrementMathes) {
Float score = columnsScore.get(columnName);
if (score==null)
score =new Float(0);
score = MathFunctions.incrementPerc(score, increment, matchedElements);
if (doIncrementMathes)
matchedElements ++;
columnsScore.put(columnName, score);
}
public float getScore(String columnName,boolean simpleMatch) {
if (simpleMatch){
return getSimpleScore(columnName);
}
else
return getScore(columnName);
}
public float getScore(String columnName) {
Float score = null;
try {
// score = columnsScore.get(columnName)*(float)calculateCoverage();
score = columnsScore.get(columnName);
if (score!=null){
return score*(float)calculateCoverage();
}
} catch (Exception e) {
}
return score;
}
public float getSimpleScore(String columnName) {
Float score = null;
try {
// score = columnsScore.get(columnName)*(float)calculateCoverage();
score = columnsScore.get(columnName);
if (score!=null){
return score;
}
} catch (Exception e) {
}
return score;
}
// take the best performing column
public String findBest() {
String bestCol = null;
Float bestscore = Float.valueOf(-1);
for (String column : columnsScore.keySet()) {
Float score = new Float(0);
try {
score = columnsScore.get(column);
} catch (Exception e) {
AnalysisLogger.getLogger().error("ERROR in getting SCORE " + e.getLocalizedMessage());
}
if (bestscore.compareTo(score) < 0) {
bestscore = score;
bestCol = column;
}
}
return bestCol;
}
// take the best performing columns
public ArrayList<String> findBestList() {
ArrayList<String> bestCols = new ArrayList<String>();
for (String column : columnsScore.keySet()) {
Float score = new Float(0);
try {
score = columnsScore.get(column);
} catch (Exception e) {
AnalysisLogger.getLogger().error("ERROR in getting SCORE " + e.getLocalizedMessage());
}
// find best place where to put column
int size = bestCols.size();
int index = size;
for (int i = 0; i < size; i++) {
if (columnsScore.get(bestCols.get(i)).compareTo(score) <= 0) {
index = i;
break;
}
}
bestCols.add(index, column);
}
return bestCols;
}
public void setCategoryElements(BigInteger categoryElements) {
this.categoryElements = categoryElements;
}
public BigInteger getCategoryElements() {
return categoryElements;
}
}

View File

@ -0,0 +1,123 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.HashMap;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Earlier score holder for one category: accumulates a value per column and
 * reports it as a fraction of a fixed maximum number of elements.
 */
public class CategoryScoresOld {

    // column names vs accumulated score
    private HashMap<String, BigDecimal> columnsScore;
    private BigDecimal maximumElements;

    /**
     * @param maxelements divisor used by {@link #getScore(String)}
     */
    public CategoryScoresOld(BigInteger maxelements) {
        this.maximumElements = new BigDecimal(maxelements);
        this.columnsScore = new HashMap<String, BigDecimal>();
    }

    public void setMaximumElements(BigDecimal MaximumElements) {
        this.maximumElements = MaximumElements;
    }

    /**
     * Adds the increment to the accumulated score of the column, starting from
     * the increment itself when the column has no score yet.
     */
    public void incrementScore(String columnName, float increment) {
        BigDecimal delta = BigDecimal.valueOf(increment);
        BigDecimal current = columnsScore.get(columnName);
        BigDecimal updated = (current == null) ? delta : current.add(delta);
        columnsScore.put(columnName, updated);
    }

    /**
     * Returns accumulated score / maximum elements, two decimals, rounded down.
     *
     * @return the ratio, or 0 when the column is unknown or the division fails
     */
    public double getScore(String columnName) {
        double result = 0;
        try {
            BigDecimal stored = columnsScore.get(columnName);
            BigDecimal percentage;
            try {
                percentage = (stored == null) ? BigDecimal.ZERO : stored;
                AnalysisLogger.getLogger().trace("getScore -> Score for "+columnName+": " + percentage + " vs " + maximumElements);
                percentage = percentage.divide(maximumElements, 2, BigDecimal.ROUND_DOWN);
            } catch (ArithmeticException e) {
                percentage = BigDecimal.ZERO;
                e.printStackTrace();
            }
            result = percentage.doubleValue();
        } catch (Exception e) {
            // any other failure leaves the result at 0
        }
        return result;
    }

    /** Reads the accumulated score of a column, falling back to zero if the lookup fails. */
    private BigDecimal lookupScore(String column) {
        BigDecimal score = BigDecimal.ZERO;
        try {
            score = columnsScore.get(column);
        } catch (Exception e) {
            AnalysisLogger.getLogger().error("ERROR in getting SCORE " + e.getLocalizedMessage());
        }
        return score;
    }

    /**
     * Takes the best performing column.
     *
     * @return the column with the highest accumulated score, or {@code null} if none exceeds -1
     */
    public String findBest() {
        String winner = null;
        BigDecimal highest = BigDecimal.valueOf(-1);
        for (String column : columnsScore.keySet()) {
            BigDecimal candidate = lookupScore(column);
            if (highest.compareTo(candidate) < 0) {
                highest = candidate;
                winner = column;
            }
        }
        return winner;
    }

    /**
     * Takes the best performing columns.
     *
     * @return all columns ordered by decreasing accumulated score
     */
    public ArrayList<String> findBestList() {
        ArrayList<String> ranked = new ArrayList<String>();
        for (String column : columnsScore.keySet()) {
            BigDecimal candidate = lookupScore(column);
            // walk past every ranked column that is strictly better than the candidate
            int position = 0;
            while (position < ranked.size()
                    && columnsScore.get(ranked.get(position)).compareTo(candidate) > 0) {
                position++;
            }
            ranked.add(position, column);
        }
        return ranked;
    }
}

View File

@ -0,0 +1,272 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.DataTypeRecognizer;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
import org.gcube.contentmanagement.lexicalmatcher.utils.DatabaseFactory;
import org.hibernate.SessionFactory;
public class DBObjectTranslator {
public static void main(String[] args) {
}
public ArrayList<RelationEdge> relations;
public ArrayList<Category> categories;
public BigInteger totalEntries;
public BigInteger totalCatElements;
public BigInteger totalRelationElements;
public DBObjectTranslator() {
relations = new ArrayList<RelationEdge>();
categories = new ArrayList<Category>();
totalCatElements = BigInteger.ZERO;
totalRelationElements = BigInteger.ZERO;
totalEntries = BigInteger.ZERO;
}
public BigInteger calculateTotalEntries(SessionFactory dbSession, String timeSeriesName, String timeSeriesColumn) {
BigInteger count = BigInteger.ZERO;
String query = "select count(*) from (SELECT distinct " + timeSeriesColumn + " FROM " + timeSeriesName + ") r;";
// String query = "SELECT count(*) FROM " + timeSeriesName.toLowerCase();
List<Object> resultSet = DatabaseFactory.executeSQLQuery(query, dbSession);
for (Object result : resultSet) {
try {
BigInteger resultcount = (BigInteger) result;
totalEntries = totalEntries.add(resultcount);
count = resultcount;
AnalysisLogger.getLogger().trace("DBObjectTranslator->calculateTotalEntries: Time Series " + timeSeriesName + " total " + totalEntries);
} catch (Exception e) {
}
}
return count;
}
public ArrayList<String> retrieveTimeSeriesEntries(SessionFactory dbSession, String timeSeriesName, String timeSeriesColumn, BigInteger min, int numberOfElements) {
// String query = "SELECT distinct "+timeSeriesColumn+" FROM "+timeSeriesName+" r limit "+min+","+numberOfElements;
String query = "SELECT distinct " + timeSeriesColumn + " FROM " + timeSeriesName + " r limit " + numberOfElements + " offset " + min;
AnalysisLogger.getLogger().trace("DBObjectTranslator->query: " + query);
List<Object> resultSet = DatabaseFactory.executeSQLQuery(query, dbSession);
ArrayList<String> column = new ArrayList<String>();
for (Object result : resultSet) {
try {
String value = "";
if (result != null)
value = result.toString();
column.add(value);
// AnalysisLogger.getLogger().debug("DBObjectTranslator->retrieveColumnRange: Column Element Added " + value);
} catch (Exception e) {
e.printStackTrace();
AnalysisLogger.getLogger().trace("DBObjectTranslator->retrieveTimeSeriesEntries: Error in adding entry :" + e.getLocalizedMessage());
}
}
AnalysisLogger.getLogger().trace("DBObjectTranslator->retrieveColumnRange: Column " + column.toString());
return column;
}
public ArrayList<Entry> retrieveEntries(SessionFactory dbSession, String timeSeriesName, BigInteger min, int numberOfElements) {
// clean previous entries
ArrayList<Entry> currentEntries = new ArrayList<Entry>();
ArrayList<String> descriptions = new ArrayList<String>();
ArrayList<String> types = new ArrayList<String>();
/*
* SELECT table_name,ordinal_position,column_name,data_type, is_nullable,character_maximum_length FROM information_schema.COLUMNS WHERE table_name ='ref_area';
*/
String queryDesc = "SELECT table_name,ordinal_position,column_name,data_type, is_nullable,character_maximum_length FROM information_schema.COLUMNS WHERE table_name ='" + timeSeriesName.toLowerCase() + "'";
List<Object> resultSetDesc = DatabaseFactory.executeSQLQuery(queryDesc, dbSession);
for (Object result : resultSetDesc) {
Object[] resultArray = (Object[]) result;
descriptions.add((String) resultArray[2]);
types.add(DataTypeRecognizer.transformTypeFromDB((String) resultArray[3]));
}
if (descriptions.size() > 0) {
// String query = "SELECT DISTINCT * FROM " + timeSeriesName + " r where id>=" + min.toString() + " and id<=" + max.toString();
// String query = "SELECT DISTINCT * FROM " + timeSeriesName + " r limit "+min+","+numberOfElements;
String query = "SELECT DISTINCT * FROM " + timeSeriesName + " r limit " + numberOfElements + " offset " + min;
AnalysisLogger.getLogger().trace("DBObjectTranslator->retrieveEntries: query " + query);
List<Object> resultSet = DatabaseFactory.executeSQLQuery(query, dbSession);
for (Object result : resultSet) {
Entry entry = new Entry();
try {
Object[] resultArray = (Object[]) result;
int i = 0;
for (Object res : resultArray) {
// build entry
String value = "";
if (res != null)
value = res.toString();
entry.addAttribute(descriptions.get(i), value);
entry.addType(descriptions.get(i), types.get(i));
i++;
}
// add entry
currentEntries.add(entry);
// AnalysisLogger.getLogger().debug("DBObjectTranslator->retrieveEntries: Entry Added " + entry.toString());
} catch (Exception e) {
// e.printStackTrace();
AnalysisLogger.getLogger().trace("DBObjectTranslator->retrieveEntries: Error in adding entry :" + e.getLocalizedMessage());
}
}
}
// AnalysisLogger.getLogger().trace("DBObjectTranslator->retrieveEntries: Entries " + currentEntries);
return currentEntries;
}
public void buildRelationsEdges(SessionFactory dbSession) {
String query = "select * from relation_table;";
List<Object> resultSet = DatabaseFactory.executeSQLQuery(query, dbSession);
for (Object result : resultSet) {
Object[] resultArray = (Object[]) result;
RelationEdge re = null;
try {
re = new RelationEdge(((String) resultArray[2]), "" + resultArray[0], "" + resultArray[1]);
} catch (Exception e) {
e.printStackTrace();
}
if (re != null) {
relations.add(re);
AnalysisLogger.getLogger().trace("DBObjectTranslator->buildRelationsEdges: add relation " + re.toString());
}
}
}
public void buildCategories(SessionFactory dbSession, String referenceTable, String referenceColumn, String idColumn, String nameHuman, String description) {
referenceTable = referenceTable == null ? "reference_table" : referenceTable;
referenceColumn = referenceColumn == null ? "table_name" : referenceColumn;
nameHuman = nameHuman == null ? "name_human" : nameHuman;
idColumn = idColumn == null ? "id" : idColumn;
description = description == null ? "description" : description;
String query = "SELECT " + nameHuman + "," + idColumn + "," + referenceColumn + "," + description + " FROM " + referenceTable + " r;";
List<Object> resultSet = DatabaseFactory.executeSQLQuery(query, dbSession);
if (resultSet != null) {
for (Object result : resultSet) {
Object[] resultArray = (Object[]) result;
Category cat = null;
try {
// name_human, id, table_name,description
cat = new Category("" + resultArray[0], "" + resultArray[1], "" + resultArray[2], "" + resultArray[3]);
} catch (Exception e) {
e.printStackTrace();
}
if (cat != null) {
categories.add(cat);
AnalysisLogger.getLogger().trace("DBObjectTranslator->buildCategories: add category " + cat.toString());
}
}
}
}
public Category getCategoryfromIndex(String index) {
Category cat = null;
for (Category c : categories) {
if (c.getIndex().equals(index)) {
cat = c;
break;
}
}
return cat;
}
public void populateRelationWithCategories() {
for (RelationEdge re : relations) {
Category from = getCategoryfromIndex(re.getFrom());
Category to = getCategoryfromIndex(re.getTo());
re.setCategoryFrom(from.getName());
re.setCategoryTo(to.getName());
AnalysisLogger.getLogger().trace("DBObjectTranslator->populateRelationWithCategories: modified Relation " + re.toString());
}
}
public void calculateRelationWeights(SessionFactory dbSession) {
for (RelationEdge re : relations) {
String query = "SELECT count(*) FROM " + re.getName().toLowerCase();
List<Object> resultSet = DatabaseFactory.executeSQLQuery(query, dbSession);
for (Object result : resultSet) {
try {
BigInteger resultcount = (BigInteger) result;
re.setWeigth(resultcount);
totalRelationElements = totalRelationElements.add(resultcount);
AnalysisLogger.getLogger().trace("DBObjectTranslator->calculateRelationWeights: Relation " + re.getName() + " weight " + re.getWeigth());
} catch (Exception e) {
}
}
}
}
public void calculateCategoriesWeights(SessionFactory dbSession) {
for (Category cat : categories) {
String query = "SELECT count(*) FROM " + cat.getTableName().toLowerCase();
List<Object> resultSet = DatabaseFactory.executeSQLQuery(query, dbSession);
for (Object result : resultSet) {
try {
BigInteger resultcount = (BigInteger) result;
cat.setNumberOfElements(resultcount);
totalCatElements = totalCatElements.add(resultcount);
AnalysisLogger.getLogger().trace("DBObjectTranslator->calculateCategoriesWeights: Category " + cat.getName() + " weight " + cat.getNumberOfElements() + " total " + totalCatElements);
} catch (Exception e) {
}
}
}
}
public void buildCategoriesStructure(SessionFactory dbSession, String referenceTable, String referenceColumn, String idColumn, String nameHuman, String description) {
buildCategories(dbSession, referenceTable, referenceColumn, idColumn, nameHuman, description);
calculateCategoriesWeights(dbSession);
AnalysisLogger.getLogger().trace("DBObjectTranslator->buildWholeStructure: Total Categories Elements " + totalCatElements + " Total Relation Elements " + totalRelationElements);
}
public void buildWholeStructure(SessionFactory dbSession, String referenceTable, String referenceColumn, String idColumn, String nameHuman, String description) {
buildRelationsEdges(dbSession);
buildCategories(dbSession, referenceTable, referenceColumn, idColumn, nameHuman, description);
populateRelationWithCategories();
calculateRelationWeights(dbSession);
calculateCategoriesWeights(dbSession);
AnalysisLogger.getLogger().trace("DBObjectTranslator->buildWholeStructure: Total Categories Elements " + totalCatElements + " Total Relation Elements " + totalRelationElements);
}
}

View File

@ -0,0 +1,49 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data;
import java.util.HashMap;
/**
 * A single entry from a category: the value of every column plus the type name
 * recorded for that column. Null values and null type names are stored as "".
 */
public class Entry {

    // column name -> value
    HashMap<String, String> attributes;
    // column name -> type name
    HashMap<String, String> types;

    public Entry() {
        attributes = new HashMap<String, String>();
        types = new HashMap<String, String>();
    }

    public HashMap<String, String> getAttributes() {
        return attributes;
    }

    public HashMap<String, String> getTypes() {
        return types;
    }

    /**
     * Stores the value of a column.
     *
     * @param column column name
     * @param value  column value; null is stored as ""
     */
    public void addAttribute(String column, String value) {
        if (value == null)
            value = "";
        attributes.put(column, value);
    }

    /**
     * Stores the type name of a column.
     *
     * @param column column name
     * @param value  type name; null is stored as ""
     */
    public void addType(String column, String value) {
        if (value == null)
            value = "";
        types.put(column, value);
    }

    /**
     * Renders the entry as <code>{column=value|TYPE; ...}</code>. A column that has
     * a value but no recorded type is rendered with an empty type instead of
     * failing with a NullPointerException.
     */
    @Override
    public String toString() {
        StringBuilder returningString = new StringBuilder();
        returningString.append("{");
        for (String att : attributes.keySet()) {
            String type = types.get(att);
            returningString.append(att).append("=").append(attributes.get(att)).append("|");
            returningString.append(type == null ? "" : type.toUpperCase());
            returningString.append("; ");
        }
        returningString.append("}");
        return returningString.toString();
    }
}

View File

@ -0,0 +1,71 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data;
import java.math.BigInteger;
/**
 * A named, weighted relation between two categories, identified by the indexes
 * of its endpoints and, once resolved, by their category names.
 */
public class RelationEdge {

    private String relationName;
    private String indexFrom;
    private String indexTo;
    // stays null until setWeigth is called
    private BigInteger weight;
    // endpoint category names; null until set
    private String categoryFrom;
    private String categoryTo;

    /**
     * @param name relation name
     * @param from index of the source category
     * @param to   index of the target category
     */
    public RelationEdge(String name, String from, String to) {
        this.relationName = name;
        this.indexFrom = from;
        this.indexTo = to;
    }

    /**
     * Empty entry point; it performs no work.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
    }

    public String getName() {
        return this.relationName;
    }

    public void setName(String name) {
        this.relationName = name;
    }

    public String getFrom() {
        return this.indexFrom;
    }

    public String getTo() {
        return this.indexTo;
    }

    public BigInteger getWeigth() {
        return this.weight;
    }

    public void setWeigth(BigInteger Weight) {
        this.weight = Weight;
    }

    public String getCategoryFrom() {
        return this.categoryFrom;
    }

    public void setCategoryFrom(String categoryFrom) {
        this.categoryFrom = categoryFrom;
    }

    public String getCategoryTo() {
        return this.categoryTo;
    }

    public void setCategoryTo(String categoryTo) {
        this.categoryTo = categoryTo;
    }

    /** Renders the edge as {@code [name: from i to j nameFrom a nameTo b]}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("[");
        text.append(relationName);
        text.append(": from ").append(indexFrom);
        text.append(" to ").append(indexTo);
        text.append(" nameFrom ").append(categoryFrom);
        text.append(" nameTo ").append(categoryTo);
        text.append("]");
        return text.toString();
    }
}

View File

@ -0,0 +1,65 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data;
/**
 * One classification result: a category, optionally the matching column, the
 * score, and the table name and family id the match refers to.
 */
public class SingleResult {

    private String category;
    private String column;
    private String tablename;
    private String familyID;
    private double score;

    public SingleResult(String Category, String Column, double Score, String TableName, String FamilyID) {
        category = Category;
        column = Column;
        score = Score;
        tablename = TableName;
        familyID = FamilyID;
    }

    public void setCategory(String category) {
        this.category = category;
    }

    public String getCategory() {
        return category;
    }

    public void setColumn(String column) {
        this.column = column;
    }

    public String getColumn() {
        return column;
    }

    public void setScore(double score) {
        this.score = score;
    }

    public double getScore() {
        return score;
    }

    /**
     * Rounds the score to two decimals. The previous form cast score*100 to int
     * before calling Math.round, which truncated instead of rounding: 0.29 (stored
     * as 28.999999999999996 after the multiplication) came out as 0.28.
     */
    private double roundedScore() {
        return Math.round(score * 100) / (double) 100;
    }

    /**
     * @return the score rounded to two decimals, as text
     */
    public String getStringScore() {
        return "" + roundedScore();
    }

    /**
     * Renders {@code category=column:score tab:table:family}, or
     * {@code category=:score} when no column is set.
     */
    @Override
    public String toString() {
        double scored = roundedScore();
        if (column != null)
            return category + "=" + column + ":" + scored + " tab:" + tablename + ":" + familyID;
        else
            return category + "=" + ":" + scored;
    }

    public void setTablename(String tablename) {
        this.tablename = tablename;
    }

    public String getTablename() {
        return tablename;
    }

    public void setFamilyID(String familyID) {
        this.familyID = familyID;
    }

    public String getFamilyID() {
        return familyID;
    }
}

View File

@ -0,0 +1,80 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data;
import java.math.BigDecimal;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.graph.GraphFramer;
/**
 * Turns the categories and relations loaded by a {@link DBObjectTranslator}
 * into an ordered category list or into a graph.
 */
public class TSObjectTransformer {

    /**
     * Empty entry point; it performs no work.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
    }

    /** Builds the ordered list from all categories of the translator. */
    public static CategoryOrderedList transform2List(DBObjectTranslator dbo, LexicalEngineConfiguration config) {
        return transform2List(dbo, config, null);
    }

    /**
     * Builds the ordered list from the translator's categories.
     *
     * @param filter when not null, only categories whose name equals it (ignoring case) are kept
     */
    public static CategoryOrderedList transform2List(DBObjectTranslator dbo, LexicalEngineConfiguration config, String filter) {
        CategoryOrderedList col = new CategoryOrderedList(config);
        for (Category cat : dbo.categories) {
            if ((filter == null) || filter.equalsIgnoreCase(cat.getName()))
                col.addCategory(cat);
        }
        return col;
    }

    /**
     * Builds the vertex label "name:percentage% " of a category, where percentage
     * is its share of the total with two decimals. The share is 0.00 when the
     * total is zero or the category has no element count, instead of failing.
     */
    private static String vertexLabel(Category cat, BigDecimal total) {
        BigDecimal bd = BigDecimal.ZERO;
        if (cat.getNumberOfElements() != null && total.signum() != 0) {
            bd = new BigDecimal(cat.getNumberOfElements());
            bd = bd.divide(total, 4, BigDecimal.ROUND_HALF_UP);
            bd = bd.multiply(new BigDecimal(100));
        }
        bd = bd.setScale(2, BigDecimal.ROUND_HALF_UP);
        return cat.getName() + ":" + bd + "% ";
    }

    /**
     * Adds one vertex per category and one edge per relation to a GraphFramer
     * titled "Time Series Graph", then starts it. Edge weights are the relation
     * weight as a percentage of the total number of category elements.
     * Relations whose endpoints match no loaded category are skipped.
     */
    public static void transform2Graph(DBObjectTranslator dbo) {
        GraphFramer starter = new GraphFramer("Time Series Graph");
        BigDecimal total = new BigDecimal(dbo.totalCatElements);

        for (Category cat : dbo.categories) {
            starter.graphDisplayer.addVertex(vertexLabel(cat, total));
        }

        for (RelationEdge rel : dbo.relations) {
            Category from = dbo.getCategoryfromIndex(rel.getFrom());
            Category to = dbo.getCategoryfromIndex(rel.getTo());
            // a relation pointing at an unknown category cannot be drawn
            if (from == null || to == null)
                continue;
            double weight = 0;
            if (rel.getWeigth() != null && total.signum() != 0)
                weight = new BigDecimal(rel.getWeigth()).divide(total, 2, BigDecimal.ROUND_HALF_UP).multiply(new BigDecimal(100)).doubleValue();
            // labels must be identical to the vertex labels added above
            starter.graphDisplayer.addEdge(vertexLabel(from, total), vertexLabel(to, total), weight);
        }

        starter.graphDisplayer.generateUpTo5StarGraph();
        starter.go();
    }
}

View File

@ -0,0 +1,19 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.interfaces;
import java.math.BigInteger;
/**
 * Contract of a reference descriptor: a name, an index, a table name, a
 * description and a number of elements, each with a getter and a setter.
 */
public interface Reference {

    /** @return the name of the reference */
    String getName();

    void setName(String categoryName);

    /** @return the index of the reference */
    String getIndex();

    void setIndex(String categoryIndex);

    /** @return the name of the table associated with the reference */
    String getTableName();

    void setTableName(String tableName);

    /** @return the description of the reference */
    String getDescription();

    void setDescription(String description);

    /** @return the number of elements of the reference */
    BigInteger getNumberOfElements();

    void setNumberOfElements(BigInteger numberOfElements);

    /** @return a textual rendering of the reference */
    String toString();
}

View File

@ -0,0 +1,14 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.Engine;
/**
 * Base class of all chunks; it only keeps the engine the chunk works with.
 */
public abstract class Chunk {

    // engine made available to subclasses
    protected Engine engine;

    /**
     * @param engine engine stored for use by subclasses
     */
    public Chunk(Engine engine) {
        this.engine = engine;
    }
}

View File

@ -0,0 +1,128 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.ArrayList;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.Engine;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
import org.gcube.contentmanagement.lexicalmatcher.utils.MathFunctions;
/**
 * A set of chunk indexes to be processed for one series/column. The indexes
 * are produced by {@code MathFunctions.generateRandoms} and walked through by
 * the subclass in {@link #nextChunk()}.
 */
public abstract class ChunkSet {

    protected String seriesName;
    protected String seriesColumn;
    protected int chunkSize;
    private int maxNumberOfChunks;
    // chunk indexes selected for processing
    ArrayList<Integer> chunkSet;
    // total number of chunks computed for the series
    protected int chunkSetSize;
    protected BigInteger numberOfEntries;
    // index of the next chunk to examine
    protected int chunkIndex;
    protected LexicalEngineConfiguration config;
    protected Engine engine;

    /**
     * Creates the set without a known number of entries and generates the chunk indexes.
     *
     * @throws Exception if the number of cycles cannot be calculated
     */
    public ChunkSet(int MaxNumberOfChunks, int ChunkSize, String SeriesName, String SeriesColumn, LexicalEngineConfiguration Config, Engine engine) throws Exception {
        assignCommon(engine, Config, SeriesName, SeriesColumn, ChunkSize);
        this.maxNumberOfChunks = MaxNumberOfChunks;
        generateChunkSet();
    }

    /**
     * Creates the set with a known number of entries and generates the chunk indexes.
     *
     * @throws Exception if the number of cycles cannot be calculated
     */
    public ChunkSet(int MaxNumberOfChunks, int ChunkSize, String SeriesName, String SeriesColumn, BigInteger numberOfEntries, LexicalEngineConfiguration Config, Engine engine) throws Exception {
        assignCommon(engine, Config, SeriesName, SeriesColumn, ChunkSize);
        setNumberOfEntries(numberOfEntries);
        this.maxNumberOfChunks = MaxNumberOfChunks;
        generateChunkSet();
    }

    /** Stores the state shared by both constructors, in the order they used to apply it. */
    private void assignCommon(Engine engine, LexicalEngineConfiguration configuration, String name, String column, int size) {
        this.engine = engine;
        this.config = configuration;
        setSeriesName(name);
        setSeriesColumn(column);
        setChunkSize(size);
    }

    /**
     * Computes the number of chunks, asks {@code MathFunctions.generateRandoms}
     * for the chunk indexes to process and resets the cursor.
     */
    public void generateChunkSet() throws Exception {
        AnalysisLogger.getLogger().trace("ChunkSet->generateChunkSet-> \tGenerating Chunk Set for " + seriesName+ " "+seriesColumn);
        int totalChunks = calculateNumberOfCycles();
        // generate chunks to be processed
        this.chunkSet = MathFunctions.generateRandoms(maxNumberOfChunks, 0, totalChunks);
        this.chunkIndex = 0;
        this.chunkSetSize = totalChunks;
    }

    /** @return the total number of elements to be split into chunks */
    abstract protected BigDecimal calculateNumberOfElements() throws Exception;

    /**
     * Calculates how many chunks the elements make up: the number of full chunks,
     * or 1 when there is less than one full chunk but at least one element.
     */
    protected int calculateNumberOfCycles() throws Exception {
        BigDecimal divisor = new BigDecimal(BigInteger.valueOf(chunkSize));
        BigDecimal[] quotientAndRemainder = calculateNumberOfElements().divideAndRemainder(divisor);
        int fullChunks = quotientAndRemainder[0].intValue();
        if (fullChunks == 0 && quotientAndRemainder[1].intValue() > 0) {
            return 1;
        }
        return fullChunks;
    }

    public String getSeriesName() {
        return this.seriesName;
    }

    public void setSeriesName(String seriesName) {
        this.seriesName = seriesName;
    }

    public String getSeriesColumn() {
        return this.seriesColumn;
    }

    public void setSeriesColumn(String seriesColumn) {
        this.seriesColumn = seriesColumn;
    }

    public int getChunkSize() {
        return this.chunkSize;
    }

    public void setChunkSize(int chunkSize) {
        this.chunkSize = chunkSize;
    }

    public BigInteger getNumberOfEntries() {
        return this.numberOfEntries;
    }

    public void setNumberOfEntries(BigInteger numberOfEntries) {
        this.numberOfEntries = numberOfEntries;
    }

    /** @return the next chunk of the set */
    abstract public Object nextChunk();
}

View File

@ -0,0 +1,55 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks;
import java.math.BigInteger;
import java.util.ArrayList;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.Engine;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.DBObjectTranslator;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.Entry;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
import org.hibernate.SessionFactory;
/**
 * A window of a category table, identified by a start offset and a size, whose
 * entries are read from the database only when requested.
 */
public class ReferenceChunk extends Chunk {

    private String categoryName;
    private String categoryTableName;
    // entries read by the last call to getReferenceEntries
    private ArrayList<Entry> referenceEntries;
    private BigInteger startPoint;
    private int chunkSize;

    /**
     * @param CategoryName      name of the category
     * @param CategoryTableName table the entries are read from
     * @param StartPoint        offset of the first row of the chunk
     * @param ChunkSize         maximum number of rows of the chunk
     * @param engine            engine providing the database session
     */
    public ReferenceChunk(String CategoryName, String CategoryTableName, BigInteger StartPoint, int ChunkSize, Engine engine) {
        super(engine);
        this.chunkSize = ChunkSize;
        this.categoryName = CategoryName;
        this.categoryTableName = CategoryTableName;
        this.startPoint = StartPoint;
        AnalysisLogger.getLogger().trace("ReferenceChunk-> \t\tTOOK CATEGORY CHUNK FOR CATEGORY: " + categoryName+" - index : "+startPoint);
    }

    /**
     * Takes the references on demand from the DB: every call runs a new query
     * through a fresh {@link DBObjectTranslator} and the engine's session.
     *
     * @return the entries of this chunk
     */
    public ArrayList<Entry> getReferenceEntries() throws Exception {
        DBObjectTranslator translator = new DBObjectTranslator();
        SessionFactory session = engine.getDBSession();
        this.referenceEntries = translator.retrieveEntries(session, categoryTableName, startPoint, chunkSize);
        return this.referenceEntries;
    }

    public String getCategoryName() {
        return this.categoryName;
    }

    public void setCategoryName(String categoryName) {
        this.categoryName = categoryName;
    }
}

View File

@ -0,0 +1,51 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks;
import java.math.BigDecimal;
import java.math.BigInteger;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.Engine;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.utils.MathFunctions;
/**
 * Chunk set that hands out {@link ReferenceChunk} instances for a reference category.
 */
public class ReferenceChunkSet extends ChunkSet {

	public ReferenceChunkSet(int MaxNumberOfChunks, int ChunkSize, String CategoryName, String CategoryColumn, LexicalEngineConfiguration config, Engine engine) throws Exception {
		super(MaxNumberOfChunks, ChunkSize, CategoryName, CategoryColumn, config, engine);
	}

	public ReferenceChunkSet(int MaxNumberOfChunks, int ChunkSize, String CategoryName, String CategoryTable, BigInteger numberOfCategoryElements, LexicalEngineConfiguration config, Engine engine) throws Exception {
		super(MaxNumberOfChunks, ChunkSize, CategoryName, CategoryTable, numberOfCategoryElements, config, engine);
	}

	/**
	 * @return the inherited {@code numberOfEntries} value converted to a {@link BigDecimal}
	 */
	protected BigDecimal calculateNumberOfElements() throws Exception {
		return new BigDecimal(numberOfEntries);
	}

	/**
	 * Advances to the next index contained in {@code chunkSet} and builds the chunk for it.
	 *
	 * @return the next chunk, or {@code null} when the set is exhausted or the chunk could not be built
	 */
	public ReferenceChunk nextChunk() {
		// move forward until an index present in chunkSet is found or the end is reached
		for (; !chunkSet.contains(chunkIndex) && (chunkIndex < chunkSetSize); chunkIndex++) {
			// nothing to do: the loop header performs the skip
		}

		ReferenceChunk selected = null;
		boolean exhausted = !(chunkIndex < chunkSetSize);
		if (!exhausted) {
			BigInteger firstElement = MathFunctions.chunk2Index(chunkIndex, chunkSize);
			try {
				selected = new ReferenceChunk(seriesName, seriesColumn, firstElement, chunkSize, engine);
			} catch (Exception e) {
				e.printStackTrace();
			}
		}
		// the index is consumed whether or not a chunk was produced
		chunkIndex++;
		return selected;
	}
}

View File

@ -0,0 +1,41 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks;
import java.util.ArrayList;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.Engine;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.interfaces.Reference;
/**
 * Walks an ordered list of references and produces one {@link ReferenceChunkSet} per reference.
 */
public class SetOfReferenceChunkSet {

	ArrayList<Reference> orderedList;
	int referenceIndex;
	LexicalEngineConfiguration config;
	Engine engine;

	public SetOfReferenceChunkSet(ArrayList<Reference> OrderedList, LexicalEngineConfiguration Config, Engine engine) {
		this.orderedList = OrderedList;
		this.config = Config;
		this.engine = engine;
		this.referenceIndex = 0;
	}

	/**
	 * Builds the chunk set for the reference at the current position and moves to the next one.
	 *
	 * @return the chunk set for the current reference; {@code null} when the list is exhausted or
	 *         when building the chunk set failed (the position still advances in that case)
	 */
	public ReferenceChunkSet getNextChunkSet() {
		if (referenceIndex >= orderedList.size()) {
			return null;
		}

		Reference current = orderedList.get(referenceIndex);
		ReferenceChunkSet next = null;
		try {
			next = new ReferenceChunkSet(config.ReferenceChunksToTake, config.chunkSize, current.getName(), current.getTableName(), current.getNumberOfElements(), config, engine);
		} catch (Exception e) {
			e.printStackTrace();
		}
		referenceIndex++;
		return next;
	}
}

View File

@ -0,0 +1,52 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks;
import java.math.BigDecimal;
import java.math.BigInteger;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.Engine;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.utils.MathFunctions;
/**
 * Chunk set wrapping a single string: it is built with one chunk of size one and always reports
 * exactly one element.
 */
public class SingletonChunkSet extends ChunkSet {

	private String singletonString;
	// column type taken from the first chunk that was built; null until then
	private String ColumnType;

	public SingletonChunkSet(String SingletonString, LexicalEngineConfiguration config, Engine engine) throws Exception {
		super(1, 1, null, null, config, engine);
		this.ColumnType = null;
		this.singletonString = SingletonString;
	}

	/**
	 * @return always {@link BigDecimal#ONE}
	 */
	protected BigDecimal calculateNumberOfElements() throws Exception {
		return BigDecimal.ONE;
	}

	/**
	 * Advances to the next index contained in {@code chunkSet} and builds the singleton chunk.
	 *
	 * @return the next chunk, or {@code null} when the set is exhausted or the chunk could not be built
	 */
	public TimeSeriesChunk nextChunk() {
		// move forward until an index present in chunkSet is found or the end is reached
		for (; !chunkSet.contains(chunkIndex) && (chunkIndex < chunkSetSize); chunkIndex++) {
			// nothing to do: the loop header performs the skip
		}

		TimeSeriesChunk produced = null;
		boolean exhausted = !(chunkIndex < chunkSetSize);
		if (!exhausted) {
			BigInteger firstElement = MathFunctions.chunk2Index(chunkIndex, chunkSize);
			try {
				produced = new TimeSeriesChunk(singletonString, ColumnType, firstElement, chunkSize, config, engine);
				// remember the type reported by the first chunk
				if (ColumnType == null) {
					ColumnType = produced.getColumnType();
				}
			} catch (Exception e) {
				e.printStackTrace();
			}
		}
		// the index is consumed whether or not a chunk was produced
		chunkIndex++;
		return produced;
	}
}

View File

@ -0,0 +1,167 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.HashMap;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.DataTypeRecognizer;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.Engine;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.CategoryScores;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.DBObjectTranslator;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.Entry;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
import org.gcube.contentmanagement.lexicalmatcher.utils.DistanceCalculator;
import org.hibernate.SessionFactory;
/**
 * A chunk of values coming either from a time series column (read from the DB) or from a single
 * string ("singleton"). The values can be compared against a {@link ReferenceChunk} to accumulate
 * per-column similarity scores.
 */
public class TimeSeriesChunk extends Chunk {

	private ArrayList<String> columnEntries;
	private String columnType;
	private LexicalEngineConfiguration config;
	private boolean mustInterrupt;
	// only allocated by the singleton constructor; stays null for DB-backed chunks
	private ArrayList<SingleResult> detailedResults;
	private String singletonElement;
	private boolean isSingleton;

	public String getColumnType() {
		return columnType;
	}

	public String getSingletonEntry() {
		return singletonElement;
	}

	public ArrayList<SingleResult> getDetailedResults() {
		return detailedResults;
	}

	public boolean isSingleton() {
		return isSingleton;
	}

	/**
	 * Builds a chunk by reading entries of a time series column from the DB.
	 *
	 * @param timeSeriesName time series to read from
	 * @param timeSeriesColumn column to read
	 * @param ColumnType type of the column; when {@code null} the type is guessed from the entries
	 * @param start start index handed to the DB translator
	 * @param ChunkSize chunk size handed to the DB translator
	 * @param Config configuration used during comparisons
	 * @param engine engine providing the DB session
	 * @throws Exception propagated from the DB access or the type guessing
	 */
	public TimeSeriesChunk(String timeSeriesName, String timeSeriesColumn, String ColumnType, BigInteger start, int ChunkSize, LexicalEngineConfiguration Config, Engine engine) throws Exception {
		super(engine);
		DBObjectTranslator dbo = new DBObjectTranslator();
		SessionFactory sess = engine.getDBSession();
		columnEntries = dbo.retrieveTimeSeriesEntries(sess, timeSeriesName, timeSeriesColumn, start, ChunkSize);
		if (ColumnType == null) {
			columnType = DataTypeRecognizer.guessType(columnEntries);
			AnalysisLogger.getLogger().trace("TimeSeriesChunk-> GUESSED TYPE " + columnType + " FOR COLUMN " + timeSeriesColumn);
		} else {
			// keep the type supplied by the caller; it was previously dropped, leaving columnType
			// null so that the type check in compareToReferenceChunk never matched
			columnType = ColumnType;
		}
		mustInterrupt = false;
		config = Config;
		isSingleton = false;
	}

	/**
	 * Builds a singleton chunk containing only {@code singletonString}.
	 *
	 * @param singletonString the only entry of the chunk
	 * @param ColumnType type of the entry; when {@code null} the type is guessed from the entry
	 * @param start not used by this constructor
	 * @param ChunkSize not used by this constructor
	 * @param Config configuration used during comparisons
	 * @param engine engine forwarded to the {@code Chunk} super-constructor
	 * @throws Exception propagated from the type guessing
	 */
	public TimeSeriesChunk(String singletonString, String ColumnType, BigInteger start, int ChunkSize, LexicalEngineConfiguration Config, Engine engine) throws Exception {
		super(engine);
		columnEntries = new ArrayList<String>();
		columnEntries.add(singletonString);
		if (ColumnType == null) {
			columnType = DataTypeRecognizer.guessType(columnEntries);
			AnalysisLogger.getLogger().trace("TimeSeriesChunk-> GUESSED TYPE " + columnType + " FOR SINGLETON " + singletonString);
		} else {
			// keep the type supplied by the caller instead of leaving columnType null
			columnType = ColumnType;
		}
		mustInterrupt = false;
		config = Config;
		isSingleton = true;
		singletonElement = singletonString;
		detailedResults = new ArrayList<SingleResult>();
	}

	/**
	 * @return {@code true} once a singleton chunk has found an exact (100%) match
	 */
	public boolean mustInterruptProcess() {
		return this.mustInterrupt;
	}

	/**
	 * Same as {@link #compareToReferenceChunk(HashMap, ReferenceChunk, String)} without a column filter.
	 */
	public void compareToReferenceChunk(HashMap<String, CategoryScores> scoresTable, ReferenceChunk catChunk) throws Exception {
		compareToReferenceChunk(scoresTable, catChunk, null);
	}

	/**
	 * Compares every entry of this chunk with every attribute of every reference entry of
	 * {@code catChunk}. Only reference columns whose type equals this chunk's column type (and that
	 * pass {@code ColumnFilter}, if given) are considered. Each comparison whose percentage exceeds
	 * {@code config.entryAcceptanceThreshold} increments the score of the reference column.
	 * For a singleton chunk the matches are also stored in the detailed results, and an exact match
	 * stops the comparison.
	 *
	 * @param scoresTable scores indexed by category name; the entry for {@code catChunk}'s category is updated
	 * @param catChunk reference chunk to compare against; its entries are fetched from the DB here
	 * @param ColumnFilter when not {@code null}, only the reference column with this name (case-insensitive) is used
	 * @throws Exception propagated from the retrieval of the reference entries
	 */
	public void compareToReferenceChunk(HashMap<String, CategoryScores> scoresTable, ReferenceChunk catChunk, String ColumnFilter) throws Exception {
		// get the category score for further processing
		CategoryScores categoryScores = scoresTable.get(catChunk.getCategoryName());
		// extract the entries from the DB
		ArrayList<Entry> categoryEntries = catChunk.getReferenceEntries();

		for (String timeSeriesElement : columnEntries) {
			for (Entry referenceEntry : categoryEntries) {
				HashMap<String, String> attributes = referenceEntry.getAttributes();
				HashMap<String, String> types = referenceEntry.getTypes();
				boolean anotherReference = true;

				for (String referenceColumn : attributes.keySet()) {
					// a reference column without a declared type can never match; skip it instead of failing
					String referenceType = types.get(referenceColumn);
					boolean sameType = (referenceType != null) && referenceType.equals(columnType);
					boolean columnAllowed = (ColumnFilter == null) || ColumnFilter.equalsIgnoreCase(referenceColumn);
					if (!sameType || !columnAllowed) {
						continue;
					}

					String attribute = attributes.get(referenceColumn);
					// distance between the unknown entry and the reference attribute, as a percentage
					DistanceCalculator d = new DistanceCalculator();
					double percentage = d.CD(config.useSimpleDistance, timeSeriesElement, attribute, isSingleton, isSingleton) * 100f;
					if (!(percentage > config.entryAcceptanceThreshold)) {
						continue;
					}

					AnalysisLogger.getLogger().trace("TimeSeriesChunk->compareToCategoryChunk-> \t\tPercentage between " + timeSeriesElement + " vs. " + attribute + " is: " + percentage + " in " + catChunk.getCategoryName() + ":" + referenceColumn);
					categoryScores.incrementScore(referenceColumn, (float) percentage, anotherReference);

					if (isSingleton) {
						// a singleton keeps the detail of every accepted match
						addDetailedResult(attribute, percentage);
					} else {
						AnalysisLogger.getLogger().trace("TimeSeriesChunk->compareToCategoryChunk-> " + categoryScores.showScores());
					}

					// on an exact match of a singleton only that match is kept and the search stops
					if ((percentage == 100) && (isSingleton)) {
						detailedResults = new ArrayList<SingleResult>();
						detailedResults.add(new SingleResult(attribute, null, percentage, null, "0"));
						mustInterrupt = true;
						break;
					}
				}// end for on columns

				if (mustInterrupt)
					break;
			}// end for on entries
		}
	}

	// inserts a match before the first stored result with a strictly lower score
	private void addDetailedResult(String attribute, double percentage) {
		int index = 0;
		for (SingleResult sr : detailedResults) {
			Double scoredetail = sr.getScore();
			if (scoredetail < percentage) {
				break;
			}
			index++;
		}
		detailedResults.add(index, new SingleResult(attribute, null, percentage, null, "0"));
	}
}

View File

@ -0,0 +1,53 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.chunks;
import java.math.BigDecimal;
import java.math.BigInteger;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.Engine;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.DBObjectTranslator;
import org.gcube.contentmanagement.lexicalmatcher.utils.MathFunctions;
import org.hibernate.SessionFactory;
/**
 * Chunk set that hands out {@link TimeSeriesChunk} instances read from a time series column.
 */
public class TimeSeriesChunkSet extends ChunkSet {

	// column type taken from the first chunk that was built; null until then
	private String ColumnType;

	public TimeSeriesChunkSet(int MaxNumberOfChunks, int ChunkSize, String TimeSeriesName, String TimeSeriesColumn, LexicalEngineConfiguration config, Engine engine) throws Exception {
		super(MaxNumberOfChunks, ChunkSize, TimeSeriesName, TimeSeriesColumn, config, engine);
		this.ColumnType = null;
	}

	/**
	 * @return the total number of entries reported by the DB translator for the series and column
	 * @throws Exception propagated from the DB access
	 */
	protected BigDecimal calculateNumberOfElements() throws Exception {
		DBObjectTranslator translator = new DBObjectTranslator();
		SessionFactory dbSession = engine.getDBSession();
		return new BigDecimal(translator.calculateTotalEntries(dbSession, seriesName, seriesColumn));
	}

	/**
	 * Advances to the next index contained in {@code chunkSet} and builds the chunk for it.
	 *
	 * @return the next chunk, or {@code null} when the set is exhausted or the chunk could not be built
	 */
	public TimeSeriesChunk nextChunk() {
		// move forward until an index present in chunkSet is found or the end is reached
		for (; !chunkSet.contains(chunkIndex) && (chunkIndex < chunkSetSize); chunkIndex++) {
			// nothing to do: the loop header performs the skip
		}

		TimeSeriesChunk produced = null;
		boolean exhausted = !(chunkIndex < chunkSetSize);
		if (!exhausted) {
			BigInteger firstElement = MathFunctions.chunk2Index(chunkIndex, chunkSize);
			try {
				produced = new TimeSeriesChunk(seriesName, seriesColumn, ColumnType, firstElement, chunkSize, config, engine);
				// remember the type reported by the first chunk
				if (ColumnType == null) {
					ColumnType = produced.getColumnType();
				}
			} catch (Exception e) {
				e.printStackTrace();
			}
		}
		// the index is consumed whether or not a chunk was produced
		chunkIndex++;
		return produced;
	}
}

View File

@ -0,0 +1,25 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.graph;
import org.jgrapht.graph.ListenableDirectedWeightedGraph;
public class CustomListenableDirectedWeightedGraph<V,E> extends ListenableDirectedWeightedGraph<V,E>{
public CustomListenableDirectedWeightedGraph(Class arg0) {
super(arg0);
}
public void setEdgeWeight(E e, double weight) {
super.setEdgeWeight(e, weight);
((CustomWeightedEdge)e).setWeight(weight);
}
public E addEdge(V o1,V o2) {
E out = super.addEdge(o1,o2);
((CustomWeightedEdge)out).setEdges(o1,o2);
return out;
}
}

View File

@ -0,0 +1,27 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.graph;
import org.jgrapht.graph.DefaultWeightedEdge;
import com.touchgraph.graphlayout.Edge;
/**
 * Weighted edge that keeps its own copy of the weight and of the two end points, so that they can
 * be rendered by {@link #toString()}.
 */
public class CustomWeightedEdge extends DefaultWeightedEdge {

	private Object o1;
	private Object o2;
	private double weight;

	/**
	 * Stores the two end points shown by {@link #toString()}.
	 */
	public void setEdges(Object o1, Object o2) {
		this.o1 = o1;
		this.o2 = o2;
	}

	/**
	 * Stores the weight shown by {@link #toString()}.
	 */
	public void setWeight(double weight) {
		this.weight = weight;
	}

	/**
	 * @return the edge rendered as {@code [o1:o2:weight%]}
	 */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("[");
		text.append(o1).append(":").append(o2).append(":").append(weight).append("%]");
		return text.toString();
	}
}

View File

@ -0,0 +1,36 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.graph;
import org.jgrapht.graph.DefaultWeightedEdge;
import com.touchgraph.graphlayout.Edge;
public class CustomWeightedVertex {
@Override
public String toString() {
return "[" + name + ":" + weight + "%]";
}
private double weight;
private String name;
public CustomWeightedVertex(String name, double weight) {
this.weight = weight;
this.name = name;
}
public CustomWeightedVertex(String name) {
this.weight = 0;
this.name = name;
}
public boolean equals(CustomWeightedVertex v) {
if (v.name.equals(name))
return true;
else
return false;
}
}

View File

@ -0,0 +1,299 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.graph;
import java.awt.Color;
import java.awt.Dimension;
import java.awt.Rectangle;
import java.awt.geom.Rectangle2D;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
import javax.swing.JApplet;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
import org.jgraph.JGraph;
import org.jgraph.graph.DefaultGraphCell;
import org.jgraph.graph.GraphConstants;
import org.jgrapht.ext.JGraphModelAdapter;
/**
 * Applet that displays a directed weighted graph of named vertices through a JGraph component.
 * Vertices and edges are added with {@link #addVertex(String)} and
 * {@link #addEdge(String, String, double)}; one of the {@code generate*} methods then assigns the
 * vertex positions.
 */
public class GraphDisplayer extends JApplet {
	private static final Color DEFAULT_BG_COLOR = Color.decode("#FAFBFF");
	private static final Dimension DEFAULT_SIZE = new Dimension(530, 320);
	private JGraphModelAdapter m_jgAdapter;
	public static int WIDTH = 1000;
	public static int HEIGHT = 800;
	public static int WIDTHBOX = 1280;
	public static int HEIGHTBOX = 1024;
	private int newxposition;
	private int newyposition;
	private CustomListenableDirectedWeightedGraph g;
	private int nodesCounter;
	private static final int minx = 10;
	private static final int miny = 10;
	// names of the vertices, in insertion order
	ArrayList<String> VertexNames;
	// source vertex -> target vertex; only the last edge added for a source is kept
	HashMap<String, String> Edges;

	/**
	 * Computes a new pseudo-random position to the right of (and slightly below) the given one and
	 * stores it as the current position. The result is clamped to the WIDTH/HEIGHT area.
	 */
	public void generatePosition(int lastxPosition, int lastyposition) {
		int rangex = WIDTH - lastxPosition;
		// shift right by at least 70 plus a random fraction of the remaining horizontal range
		Random a = new Random();
		int newx = lastxPosition + 70 + (int) (rangex * a.nextDouble());
		int newy = lastyposition + (int) (20f * Math.random());
		if (newx > WIDTH)
			newx = WIDTH - 100;
		if (newx < lastxPosition - 90)
			newx = lastxPosition + 90;
		if (newy > HEIGHT)
			newy = HEIGHT - 10;
		if (newy < 0)
			newy = 0;
		newxposition = newx;
		newyposition = newy;
	}

	/**
	 * Creates the JGraph component on top of the model adapter and adds it to the content pane.
	 */
	public void init() {
		AnalysisLogger.getLogger().debug("INIZIALIZZATO!");
		JGraph jgraph = new JGraph(m_jgAdapter);
		adjustDisplaySettings(jgraph);
		getContentPane().add(jgraph);
		resize(DEFAULT_SIZE);
		AnalysisLogger.getLogger().debug("RESIZED!");
	}

	/**
	 * Positions every vertex with the row-based layout of {@code genPositionVertex}.
	 */
	public void generateGraph() {
		for (String v : VertexNames) {
			genPositionVertex(v);
		}
	}

	/**
	 * Positions every vertex at a random point of the drawing area.
	 */
	public void generateRandomGraph() {
		for (String v : VertexNames) {
			int randx = minx + (int) ((WIDTH - 100) * Math.random());
			int randy = miny + (int) ((HEIGHT - 100) * Math.random());
			positionVertexAt(v, randx, randy);
		}
	}

	/**
	 * Lays out the graph as up to five "stars": the most frequent edge targets are placed on five
	 * fixed points (four corners and the centre) and the sources pointing at each of them are
	 * arranged on a circle around it. Stars beyond the fifth are not positioned.
	 */
	public void generateUpTo5StarGraph() {
		// find the stars: count how often each vertex appears as an edge target
		// (the first occurrence is stored as 0, so the value is "occurrences - 1")
		HashMap<String, Integer> vertexFrequencies = new HashMap<String, Integer>();
		for (String edge : Edges.values()) {
			System.out.println(edge + "-" + vertexFrequencies.get(edge));
			if (vertexFrequencies.get(edge) != null) {
				int f = vertexFrequencies.get(edge).intValue();
				vertexFrequencies.put(edge, Integer.valueOf(f + 1));
			} else
				vertexFrequencies.put(edge, Integer.valueOf(0));
		}

		// vertices that are neither a source nor a target are isolated: add them with frequency 0
		for (String vertex : VertexNames) {
			if (Edges.get(vertex) == null) {
				boolean found = false;
				// look for the vertex among the edge targets
				for (String starvertex : Edges.values()) {
					if (vertex.equals(starvertex)) {
						found = true;
						break;
					}
				}
				if (!found) {
					System.out.println("aggiunto vertice isolato " + vertex);
					vertexFrequencies.put(vertex, Integer.valueOf(0));
				}
			}
		}
		System.out.println("FEQS " + vertexFrequencies.toString());

		// order the stars by decreasing frequency
		ArrayList<String> starList = new ArrayList<String>();
		for (String vertex : vertexFrequencies.keySet()) {
			int freq = vertexFrequencies.get(vertex);
			int i = 0;
			boolean inserted = false;
			for (String element : starList) {
				int referfreq = vertexFrequencies.get(element);
				if (referfreq < freq) {
					starList.add(i, vertex);
					inserted = true;
					break;
				}
				i++;
			}
			if (!inserted)
				starList.add(vertex);
		}

		// place the stars in the layout
		System.out.println(starList.toString());
		int bound = 200;
		int[] boundedXIndexex = { bound, WIDTH - bound, bound, WIDTH - bound, WIDTH / 2 };
		int[] boundedYIndexex = { bound, bound, HEIGHT - bound, HEIGHT - bound, HEIGHT / 2 };
		// only as many stars as there are predefined centres can be placed; without this bound a
		// graph with more than five candidates ran past the end of the arrays
		int sizeStar = Math.min(starList.size(), boundedXIndexex.length);
		for (int i = 0; i < sizeStar; i++) {
			positionVertexAt(starList.get(i), boundedXIndexex[i], boundedYIndexex[i]);
			// count the elements of the star
			int countelems = 0;
			for (String edge : Edges.keySet()) {
				if (Edges.get(edge).equals(starList.get(i))) {
					countelems++;
				}
			}
			if (countelems > 0) {
				// floating-point division: integer division truncated the angular step
				double subdivision = 360.0 / countelems;
				double angle = 105f;
				double radius = 200f;
				System.out.println("Numero di elementi nella stella: " + countelems + " suddivisioni: " + subdivision);
				for (String edge : Edges.keySet()) {
					// arrange the elements around the star centre
					if (Edges.get(edge).equals(starList.get(i))) {
						int currentx = boundedXIndexex[i];
						int currenty = boundedYIndexex[i];
						int epsilonx = (int) (radius * Math.cos(Math.toRadians(angle)));
						int epsilony = (int) (radius * Math.sin(Math.toRadians(angle)));
						System.out.println("angolo attuale: " + angle + " x0: " + currentx + " y0 " + currenty + " ex " + epsilonx + " ey " + epsilony);
						positionVertexAt(edge, currentx + epsilonx, currenty + epsilony);
						angle += subdivision;
					}
				}
			}
		}
	}

	// places a vertex at the current position; every second vertex starts a new row 100 pixels below
	private void genPositionVertex(String vertexName) {
		if (nodesCounter > 0) {
			if ((nodesCounter % 2) == 0) {
				newxposition = 10 + (int) (20f * Math.random());
				newyposition += 100;
			} else
				generatePosition(newxposition, newyposition);
		}
		positionVertexAt(vertexName, newxposition, newyposition);
		nodesCounter++;
	}

	/**
	 * Creates an empty graph of {@link CustomWeightedEdge} edges and its JGraph model adapter.
	 */
	public GraphDisplayer() {
		g = new CustomListenableDirectedWeightedGraph(CustomWeightedEdge.class);
		m_jgAdapter = new JGraphModelAdapter(g);
		VertexNames = new ArrayList<String>();
		Edges = new HashMap<String, String>();
		newxposition = minx;
		newyposition = miny;
		nodesCounter = 0;
	}

	/**
	 * Adds a vertex to the graph and remembers its name for the layout methods.
	 */
	public void addVertex(String name) {
		g.addVertex(name);
		VertexNames.add(name);
	}

	/**
	 * Adds the edge {@code v1 -> v2} with weight {@code bi}. When the edge already exists only
	 * its weight is updated.
	 */
	public void addEdge(String v1, String v2, double bi) {
		// the graph does not create a second edge between the same vertices, so reuse the
		// existing one instead of dereferencing the null result of a rejected insertion
		Object existing = g.getEdge(v1, v2);
		CustomWeightedEdge ed = (CustomWeightedEdge) ((existing != null) ? existing : g.addEdge(v1, v2));
		if (ed != null) {
			g.setEdgeWeight(ed, bi);
		}
		Edges.put(v1, v2);
	}

	// applies the preferred size and the background colour (applet parameter "bgcolor" if present)
	private void adjustDisplaySettings(JGraph jg) {
		jg.setPreferredSize(DEFAULT_SIZE);
		Color c = DEFAULT_BG_COLOR;
		String colorStr = null;
		try {
			colorStr = getParameter("bgcolor");
		} catch (Exception ignored) {
			// deliberately ignored: if the parameter cannot be read the default colour is used
		}
		if (colorStr != null) {
			c = Color.decode(colorStr);
		}
		jg.setBackground(c);
	}

	// moves the cell of the given vertex to (x, y), widening it by the name length plus 50 pixels
	private void positionVertexAt(Object vertex, int x, int y) {
		// select the cell associated with the vertex
		DefaultGraphCell cell = m_jgAdapter.getVertexCell(vertex);
		// get the attributes of the cell
		Map attr = cell.getAttributes();
		// get the current bounds of the cell
		Rectangle2D b = GraphConstants.getBounds(attr);
		// set the new rectangle
		GraphConstants.setBounds(attr, new Rectangle(x, y, (int) (((String) vertex).length() + 50 + b.getWidth()), (int) b.getHeight()));
		// build the attribute map for the edit
		Map cellAttr = new HashMap();
		cellAttr.put(cell, attr);
		// apply the new position to the graph model
		m_jgAdapter.edit(cellAttr, null, null, null);
	}

	public void start() {
		repaint();
	}

	/**
	 * Demo: shows a frame with six vertices and a few random edges.
	 */
	public static void main(String[] args) {
		GraphFramer starter = new GraphFramer("Grafo");
		String nodi[] = { "ciao", "come", "stai", "oggi", "domani", "dopodomani" };
		for (String nodo : nodi) {
			starter.graphDisplayer.addVertex(nodo);
		}
		for (int j = 0; j < nodi.length; j++) {
			int i0 = (int) (nodi.length * Math.random());
			int i1 = (int) (nodi.length * Math.random());
			System.out.println("i0: " + i0 + " i1: " + i1);
			if (i0 != i1) {
				starter.graphDisplayer.addEdge(nodi[i0], nodi[i1], 0);
			}
		}
		starter.graphDisplayer.generateGraph();
		starter.go();
	}
}

View File

@ -0,0 +1,40 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.graph;
import java.awt.Event;
import java.awt.Frame;
/**
 * Frame hosting a {@link GraphDisplayer} in its centre.
 */
public class GraphFramer extends Frame {

	public GraphDisplayer graphDisplayer;

	/**
	 * @param frameName title of the frame
	 */
	public GraphFramer(String frameName) {
		super(frameName);
		graphDisplayer = new GraphDisplayer();
		add("Center", graphDisplayer);
	}

	/**
	 * Initializes the displayer, sizes and shows the frame, then starts the displayer.
	 */
	public void go() {
		graphDisplayer.init();
		this.setSize(GraphDisplayer.WIDTHBOX, GraphDisplayer.HEIGHTBOX);
		this.setVisible(true);
		graphDisplayer.start();
	}

	/**
	 * Hooks the window-close handling into the AWT event dispatch. The original method was named
	 * {@code HandleEvent}, so it never overrode this method and was never invoked by AWT.
	 */
	@Override
	public boolean handleEvent(Event event) {
		if (event.id == Event.WINDOW_DESTROY) {
			return HandleEvent(event);
		}
		return super.handleEvent(event);
	}

	/**
	 * On a window-destroy event stops and destroys the displayer and exits the JVM.
	 * Kept under its original name for existing callers.
	 *
	 * @return always {@code false}
	 */
	public boolean HandleEvent(Event event) {
		if (event.id == Event.WINDOW_DESTROY) {
			try {
				graphDisplayer.stop();
				graphDisplayer.destroy();
			} catch (Exception e) {
				e.printStackTrace();
			}
			System.exit(0);
		}
		return false;
	}
}

View File

@ -0,0 +1,106 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.graph;
import java.awt.Color;
import java.awt.Dimension;
import java.awt.Rectangle;
import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.Map;
import javax.swing.JApplet;
import javax.swing.JFrame;
import org.jgraph.JGraph;
import org.jgraph.graph.AttributeMap;
import org.jgraph.graph.DefaultGraphCell;
import org.jgraph.graph.GraphConstants;
import org.jgrapht.ListenableGraph;
import org.jgrapht.ext.JGraphModelAdapter;
import org.jgrapht.graph.ListenableDirectedGraph;
import org.jgrapht.graph.DefaultEdge;
/**
 * Demo applet: builds a small directed graph with four vertices and four edges and shows it
 * through JGraph at fixed positions.
 */
public class GraphGeneratorApplet extends JApplet {

	private static final Color DEFAULT_BG_COLOR = Color.decode("#FAFBFF");
	private static final Dimension DEFAULT_SIZE = new Dimension(530, 320);

	private JGraphModelAdapter m_jgAdapter;

	/**
	 * @see java.applet.Applet#init().
	 */
	public void init() {
		// JGraphT graph and its JGraph visualization, connected through an adapter
		ListenableGraph g = new ListenableDirectedGraph(DefaultEdge.class);
		m_jgAdapter = new JGraphModelAdapter(g);
		JGraph jgraph = new JGraph(m_jgAdapter);
		adjustDisplaySettings(jgraph);
		getContentPane().add(jgraph);
		resize(DEFAULT_SIZE);

		// sample data (graph manipulated via JGraphT)
		String[] sampleVertices = { "v1", "v2", "v3", "v4" };
		for (String vertex : sampleVertices) {
			g.addVertex(vertex);
		}
		String[][] sampleEdges = { { "v1", "v2" }, { "v2", "v3" }, { "v3", "v1" }, { "v4", "v3" } };
		for (String[] edge : sampleEdges) {
			g.addEdge(edge[0], edge[1]);
		}

		// fixed positions of the vertices within the JGraph component, same order as sampleVertices
		int[][] samplePositions = { { 130, 40 }, { 60, 200 }, { 310, 230 }, { 380, 70 } };
		for (int k = 0; k < sampleVertices.length; k++) {
			positionVertexAt(sampleVertices[k], samplePositions[k][0], samplePositions[k][1]);
		}
	}

	// applies the preferred size and the background colour (applet parameter "bgcolor" if present)
	private void adjustDisplaySettings(JGraph jg) {
		jg.setPreferredSize(DEFAULT_SIZE);
		String colorStr = null;
		try {
			colorStr = getParameter("bgcolor");
		} catch (Exception e) {
			// the failure is ignored and the default colour is used
		}
		Color background = (colorStr != null) ? Color.decode(colorStr) : DEFAULT_BG_COLOR;
		jg.setBackground(background);
	}

	// moves the cell of the given vertex to (x, y), keeping its current width and height
	private void positionVertexAt(Object vertex, int x, int y) {
		DefaultGraphCell vertexCell = m_jgAdapter.getVertexCell(vertex);
		Map attributes = vertexCell.getAttributes();
		Rectangle2D currentBounds = GraphConstants.getBounds(attributes);
		Rectangle movedBounds = new Rectangle(x, y, (int) currentBounds.getWidth(), (int) currentBounds.getHeight());
		GraphConstants.setBounds(attributes, movedBounds);

		// the edit is expressed as a map from cell to its new attributes
		Map edits = new HashMap();
		edits.put(vertexCell, attributes);
		m_jgAdapter.edit(edits, null, null, null);
	}
}

View File

@ -0,0 +1,73 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.graph;
import java.math.BigInteger;
import org.jgrapht.graph.DefaultWeightedEdge;
/**
 * Weighted edge describing a named relation between two indexed elements, optionally annotated
 * with the names of the categories it connects.
 */
public class RelationEdge extends DefaultWeightedEdge {

	private String relationName;
	private long indexFrom;
	private long indexTo;
	private BigInteger weight;
	private String categoryFrom;
	private String categoryTo;

	/**
	 * @param name relation name
	 * @param from index of the source element
	 * @param to index of the target element
	 */
	public RelationEdge(String name, long from, long to) {
		this.relationName = name;
		this.indexFrom = from;
		this.indexTo = to;
	}

	/**
	 * @param args not used
	 */
	public static void main(String[] args) {
		// intentionally empty
	}

	public String getName() {
		return this.relationName;
	}

	public void setName(String name) {
		this.relationName = name;
	}

	public long getFrom() {
		return this.indexFrom;
	}

	public long getTo() {
		return this.indexTo;
	}

	public BigInteger getWeigth() {
		return this.weight;
	}

	public void setWeigth(BigInteger Weight) {
		this.weight = Weight;
	}

	public String getCategoryFrom() {
		return this.categoryFrom;
	}

	public void setCategoryFrom(String categoryFrom) {
		this.categoryFrom = categoryFrom;
	}

	public String getCategoryTo() {
		return this.categoryTo;
	}

	public void setCategoryTo(String categoryTo) {
		this.categoryTo = categoryTo;
	}

	/**
	 * @return relation name, indices and category names in a bracketed, human-readable form
	 */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("[");
		text.append(relationName).append(": from ").append(indexFrom);
		text.append(" to ").append(indexTo);
		text.append(" nameFrom ").append(categoryFrom);
		text.append(" nameTo ").append(categoryTo).append("]");
		return text.toString();
	}
}

View File

@ -0,0 +1,68 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.treeStructure.graph;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.hibernate.SessionFactory;
/**
 * Holder of a tree of categories. The tree is created with a single root node.
 */
public class TreeExtractor {

	TreeNode categoriesTree;

	/**
	 * Creates a new tree containing only the root node.
	 */
	public TreeExtractor() {
		this.categoriesTree = new TreeNode(TreeNode.ROOT);
	}

	/**
	 * @param args not used
	 */
	public static void main(String[] args) {
		// intentionally empty
	}

	/**
	 * Returns the categories tree.
	 *
	 * @param DB not used by this method
	 * @return the root node of the tree
	 */
	public TreeNode getCategoriesTree(SessionFactory DB) {
		return this.categoriesTree;
	}

	/**
	 * Named node with an unordered set of children; iterating a node iterates its children.
	 */
	class TreeNode implements Iterable<TreeNode> {

		public static final String ROOT = "ROOT";

		public String name;
		private Set<TreeNode> children;

		public TreeNode(String Name) {
			this.name = Name;
			this.children = new HashSet<TreeNode>();
		}

		public String getName() {
			return this.name;
		}

		/**
		 * @return {@code true} when the name of this node equals {@link #ROOT}
		 */
		public boolean isRoot() {
			return this.name.equals(ROOT);
		}

		/**
		 * @return {@code true} when this node has no children
		 */
		public boolean isLeaf() {
			if (this.children == null) {
				return true;
			}
			return this.children.isEmpty();
		}

		/**
		 * @return {@code true} if the child was not already present
		 */
		public boolean addChild(TreeNode n) {
			return this.children.add(n);
		}

		/**
		 * @return {@code true} if the child was present
		 */
		public boolean removeChild(TreeNode n) {
			return this.children.remove(n);
		}

		public Iterator<TreeNode> iterator() {
			return this.children.iterator();
		}
	}
}

View File

@ -0,0 +1,489 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.run;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.ArrayList;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.Engine;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.CategoryOrderedList;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.DBObjectTranslator;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.TSObjectTransformer;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.interfaces.Reference;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
import org.hibernate.SessionFactory;
public class CategoryGuesser {
/**
* @param args
* @throws Exception
*/
private final static int MAXRESULTS = 10;
/**
 * Logs, at WARN level, a header followed by one numbered line per result with its category,
 * its column (when not null) and its score.
 *
 * @param results results to log, in the order in which they are numbered
 */
public static void showResults(ArrayList<SingleResult> results) {
	AnalysisLogger.getLogger().warn("CLASSIFICATION RESULT:\n");
	int rank = 0;
	for (SingleResult result : results) {
		rank++;
		StringBuilder line = new StringBuilder();
		line.append(rank).append(": ").append(result.getCategory());
		// the column part is only present for results that carry a column
		if (result.getColumn() != null) {
			line.append(" - ").append(result.getColumn());
		}
		line.append(" ; SCORE: ").append(result.getStringScore()).append("%");
		AnalysisLogger.getLogger().warn(line.toString());
	}
}
/**
 * Convenience overload of the accuracy calculation that passes {@code null} as external
 * configuration.
 *
 * @throws Exception propagated from the delegated call
 */
public static void AccuracyCalc(CategoryGuesser guesser, String configPath, String seriesName, String column, int attempts, String correctFamily, String correctColumn) throws Exception {
AccuracyCalc(null, guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn);
}
/**
 * Runs the guesser {@code attempts} times on the given series column and logs how often the
 * textual classification result contained the expected family and the expected family/column pair.
 *
 * @param externalcfg external configuration handed to the guesser, may be {@code null}
 * @param guesser guesser to run
 * @param configPath not used by this method
 * @param seriesName series to classify
 * @param column column of the series to classify
 * @param attempts number of runs
 * @param correctFamily expected family
 * @param correctColumn expected column
 * @throws Exception propagated from the guesser
 */
public static void AccuracyCalc(LexicalEngineConfiguration externalcfg, CategoryGuesser guesser, String configPath, String seriesName, String column, int attempts, String correctFamily, String correctColumn) throws Exception {
	int familyHits = 0;
	int columnHits = 0;

	int attempt = 0;
	while (attempt < attempts) {
		guesser.runGuesser(seriesName, column, externalcfg);
		ArrayList<SingleResult> classification = guesser.getClassification();
		String flattened = classification.toString();
		showResults(classification);
		AnalysisLogger.getLogger().info("CLASSIFICATION RESULT " + flattened + " " + CategoryGuesser.resultString(flattened, correctFamily, correctColumn));

		// a run counts for the column when family and column match, for the family when the family matches
		columnHits += CategoryGuesser.CheckCompleteResult(flattened, correctFamily, correctColumn) ? 1 : 0;
		familyHits += CategoryGuesser.CheckFamilyResult(flattened, correctFamily) ? 1 : 0;
		attempt++;
	}

	double percColumn = ((double) columnHits / (double) attempts) * 100;
	double percFamily = ((double) familyHits / (double) attempts) * 100;
	AnalysisLogger.getLogger().info("->ACCURACY ON FAMILY " + correctFamily + ":" + percFamily + " ACCURACY ON COLUMN " + correctColumn + ":" + percColumn);
}
/**
 * Describes, case-insensitively, whether {@code result} contains the family and whether it
 * contains the pair {@code family=column}.
 *
 * @return a string of the form {@code FAMILY REC: <boolean> COLUMN REC: <boolean>}
 */
public static String resultString(String result, String family, String column) {
	String upperResult = result.toUpperCase();
	String upperFamily = family.toUpperCase();
	String upperColumn = column.toUpperCase();

	boolean familyRecognized = upperResult.contains(upperFamily);
	boolean columnRecognized = upperResult.contains(upperFamily + "=" + upperColumn);
	return "FAMILY REC: " + familyRecognized + " COLUMN REC: " + columnRecognized;
}
/**
 * Checks, case-insensitively, whether {@code result} contains the pair {@code family=column}.
 *
 * @return {@code true} when the pair occurs in {@code result}
 */
public static boolean CheckCompleteResult(String result, String family, String column) {
	String upperResult = result.toUpperCase();
	String upperFamily = family.toUpperCase();
	String upperColumn = column.toUpperCase();
	String expectedPair = upperFamily + "=" + upperColumn;
	return upperResult.contains(expectedPair);
}
/**
 * Checks, case-insensitively, whether {@code result} contains the family followed by {@code =}.
 *
 * @return {@code true} when {@code family=} occurs in {@code result}
 */
public static boolean CheckFamilyResult(String result, String family) {
	String upperResult = result.toUpperCase();
	String expectedPrefix = family.toUpperCase() + "=";
	return upperResult.contains(expectedPrefix);
}
// NOTE: The config path has to contain the two files: lexicalGuesser.properties and ALog.properties
private static final String cfgFile = "lexicalGuesser.properties";
private static final String LogFile = "ALog.properties";
// singleton
private CategoryOrderedList col;
private Engine processor;
private CategoryOrderedList originalCol;
private LexicalEngineConfiguration config;
private String configPath;
private boolean oneshotMode;
private static final int maxTriesClassification = 3;
private int triesCounter;
/**
 * Creates a guesser that reads its configuration files from the given path.
 *
 * @param ConfigPath path used as prefix for the configuration and logger property files
 */
public CategoryGuesser(String ConfigPath) {
	this.configPath = ConfigPath;
	this.triesCounter = 0;
}
/**
 * Creates a guesser that reads its configuration files from the current directory (".").
 */
public CategoryGuesser() {
	this.configPath = ".";
	this.triesCounter = 0;
}
/**
 * Runs the guesser on a series column with an external configuration and no category or
 * column filter.
 *
 * @throws Exception propagated from the delegated call
 */
public void runGuesser(String seriesName, String columnName, LexicalEngineConfiguration externalConfig) throws Exception {
runGuesser(seriesName, columnName, externalConfig, null, null);
}
/**
 * Runs the guesser on a series column without external configuration and without filters.
 *
 * @throws Exception propagated from the delegated call
 */
public void runGuesser(String seriesName, String columnName) throws Exception {
runGuesser(seriesName, columnName, null, null, null);
}
/**
 * Runs the guesser on a series column with the given filters and no singleton string
 * (the last argument of the delegated call is {@code null}).
 *
 * @throws Exception propagated from the delegated call
 */
public void runGuesser(String seriesName, String columnName, LexicalEngineConfiguration externalConfig, String CategoryFilter, String ColumnFilter) throws Exception {
runGuesser(seriesName, columnName, externalConfig, CategoryFilter, ColumnFilter, null);
}
/**
 * Runs the guesser on a single string instead of a table column: series and
 * column names are passed as null and the string is forwarded as singleton.
 * Switches the instance to one-shot mode, so the six-argument overload closes
 * the DB session when it finishes.
 * NOTE(review): oneshotMode is not reset here; in the visible code only init(...) sets it back to false.
 *
 * @param SingletonString the single entry to match
 * @param externalConfig configuration forwarded to the six-argument overload; may be null
 * @param CategoryFilter category filter forwarded unchanged; may be null
 * @param ColumnFilter column filter forwarded unchanged; may be null
 * @throws Exception propagated from the delegated call
 */
public void runGuesser(String SingletonString, LexicalEngineConfiguration externalConfig, String CategoryFilter, String ColumnFilter) throws Exception {
oneshotMode = true;
runGuesser(null, null, externalConfig, CategoryFilter, ColumnFilter, SingletonString);
}
/**
 * Prepares the guesser for repeated use: configures the logger, loads the
 * properties file, merges the optional external configuration, creates the
 * engine and builds (or regenerates) the ordered category list.
 * Leaves the instance in non one-shot mode.
 *
 * @param categoryFilter category filter passed to TSObjectTransformer.transform2List; may be null
 * @param columnFilter column filter passed to the Engine constructor; may be null
 * @param externalConfig configuration merged over the file configuration; may be null
 * @throws Exception propagated from configuration loading or the DB structure building
 */
public void init(String categoryFilter, String columnFilter, LexicalEngineConfiguration externalConfig) throws Exception {
String cfgFileCompletePath = configPath + "/" + cfgFile;
AnalysisLogger.setLogger(configPath + "/" + LogFile);
AnalysisLogger.getLogger().trace("******************INITIALIZING******************");
// always start from the file configuration, then overlay the caller's settings
config = new LexicalEngineConfiguration();
config.configure(cfgFileCompletePath);
if (externalConfig != null) {
config.mergeConfig(externalConfig);
}
processor = new Engine(config, columnFilter, configPath);
SessionFactory dbSession = processor.getDBSession(config);
DBObjectTranslator dbo = new DBObjectTranslator();
if (col == null) {
// first call (or first call after refreshReferences): build the category structure from the DB
AnalysisLogger.getLogger().trace("******************Order Category******************");
if (externalConfig == null)
externalConfig = new LexicalEngineConfiguration();
// NOTE(review): the reference table/column settings are read from externalConfig here, whereas the
// six-argument runGuesser reads them from the merged config - confirm this difference is intended
dbo.buildCategoriesStructure(dbSession, externalConfig.getReferenceTable(), externalConfig.getReferenceColumn(), externalConfig.getIdColumn(), externalConfig.getNameHuman(), externalConfig.getDescription());
col = TSObjectTransformer.transform2List(dbo, config, categoryFilter);
AnalysisLogger.getLogger().trace("***************End Ordering********************");
// keep a copy so that later calls can regenerate the list without rebuilding it
originalCol = col.generateNovelList();
} else {
// NOTE(review): the cached list was built with the categoryFilter of the earlier call; the
// categoryFilter argument of this call is not applied on this path
col = originalCol.generateNovelList();
}
oneshotMode = false;
}
/**
 * Prepares the guesser for a single-string match: configures the logger,
 * loads the properties file, merges the optional external configuration and
 * creates the engine. No category list is built here. Switches the instance
 * to one-shot mode.
 *
 * @param externalConfig configuration merged over the file configuration; may be null
 * @param ColumnFilter column filter passed to the Engine constructor; may be null
 * @throws Exception propagated from configuration loading or engine creation
 */
public void initSingleMatcher(LexicalEngineConfiguration externalConfig, String ColumnFilter) throws Exception {
String cfgFileCompletePath = configPath + "/" + cfgFile;
AnalysisLogger.setLogger(configPath + "/" + LogFile);
config = new LexicalEngineConfiguration();
config.configure(cfgFileCompletePath);
if (externalConfig != null) {
config.mergeConfig(externalConfig);
}
processor = new Engine(config, ColumnFilter, configPath);
// the lexical matcher is invoked only once in this mode, so runGuesser must close the DB session at the end
oneshotMode = true;
}
/**
 * Convenience overload of init with filters and no external configuration.
 *
 * @param categoryFilter category filter forwarded unchanged; may be null
 * @param columnFilter column filter forwarded unchanged; may be null
 * @throws Exception propagated from the delegated call
 */
public void init(String categoryFilter, String columnFilter) throws Exception {
init(categoryFilter, columnFilter, null);
}
/**
 * Convenience overload of init with an external configuration and no filters.
 *
 * @param externalConfig configuration forwarded unchanged; may be null
 * @throws Exception propagated from the delegated call
 */
public void init(LexicalEngineConfiguration externalConfig) throws Exception {
init(null, null, externalConfig);
}
/**
 * Convenience overload of init with no filters and no external configuration.
 *
 * @throws Exception propagated from the delegated call
 */
public void init() throws Exception {
init(null, null, null);
}
/**
 * Discards the current category list. The next call to init(...) then finds
 * col == null and rebuilds the category structure from the database.
 */
public void refreshReferences() {
col = null;
}
/**
 * Core entry point: configures the engine, rebuilds the category structure,
 * runs the matching and, when no classification is produced, retries with
 * progressively lower thresholds.
 *
 * Retry scheme: each retry lowers both thresholds on the shared config by 20
 * (never below 0) and calls this method recursively with a null external
 * configuration, so the lowered thresholds are reused. The recursive call
 * resets triesCounter to 0 before returning, which is what makes the outer
 * loop stop through the "triesCounter == 0" check.
 *
 * @param seriesName name of the table to analyse; null in singleton mode
 * @param columnName name of the column to analyse; null in singleton mode
 * @param externalConfig configuration merged over the file configuration; when null the existing config is reused
 * @param CategoryFilter category filter passed to TSObjectTransformer.transform2List; may be null
 * @param ColumnFilter column filter passed to the engine; may be null
 * @param SingletonString single entry to match instead of a column; may be null
 * @throws Exception propagated from configuration, DB access or the engine
 */
public void runGuesser(String seriesName, String columnName, LexicalEngineConfiguration externalConfig, String CategoryFilter, String ColumnFilter, String SingletonString) throws Exception {
String cfgFileCompletePath = configPath + "/" + cfgFile;
AnalysisLogger.setLogger(configPath + "/" + LogFile);
AnalysisLogger.getLogger().debug("Guessing Table " + seriesName + " column " + columnName);
if (externalConfig != null) {
// an external configuration always forces a fresh config and a fresh engine
config = new LexicalEngineConfiguration();
config.configure(cfgFileCompletePath);
config.mergeConfig(externalConfig);
// NOTE FOR FUTURE OPTIMIZATION: perform the re-init only if there is a change in the Database pointing
processor = new Engine(config, ColumnFilter, configPath);
} else {
// no external configuration: reuse config and engine when they already exist
if (config == null) {
config = new LexicalEngineConfiguration();
config.configure(cfgFileCompletePath);
}
if (processor == null) {
processor = new Engine(config, ColumnFilter, configPath);
} else
processor.resetEngine(config, ColumnFilter, configPath);
}
SessionFactory dbSession = processor.getDBSession(config);
DBObjectTranslator dbo = new DBObjectTranslator();
// modification of 10/10/11: the category structure is recalculated on every call (the caching below is disabled)
// if (col == null) {
AnalysisLogger.getLogger().trace("******************Order Category******************");
dbo.buildCategoriesStructure(dbSession, config.getReferenceTable(), config.getReferenceColumn(), config.getIdColumn(), config.getNameHuman(), config.getDescription());
col = TSObjectTransformer.transform2List(dbo, config, CategoryFilter);
AnalysisLogger.getLogger().trace("***************End Ordering********************");
originalCol = col.generateNovelList();
/*
} else {
col = originalCol.generateNovelList();
}
*/
AnalysisLogger.getLogger().warn("Starting Calculation...wait...");
long t0 = System.currentTimeMillis();
// processor.calcLike(col,seriesName, columnName);
processor.calcLikeThread(col, seriesName, columnName, SingletonString);
// keep processing until the classification contains at least one element or the retries are exhausted
ArrayList<SingleResult> checkingResults = null;
// if (oneshotMode)
// checkingResults = getClassification();
// else
checkingResults = getClassification();
while ((checkingResults == null || checkingResults.size() == 0) && (triesCounter < maxTriesClassification)) {
AnalysisLogger.getLogger().warn("..another processing pass is required. Attempt number " + (triesCounter + 1));
triesCounter++;
float differencialThr = config.getCategoryDiscardDifferencialThreshold();
float acceptanceThr = config.getEntryAcceptanceThreshold();
// lower both thresholds by 20 points (never below 0) and recalculate
config.setCategoryDiscardDifferencialThreshold(Math.max(differencialThr - 20, 0));
config.setEntryAcceptanceThreshold(Math.max(acceptanceThr - 20, 0));
AnalysisLogger.getLogger().trace("Performing next processing pass");
// recursive pass with externalConfig == null so that the lowered thresholds in config are kept
runGuesser(seriesName, columnName, null, CategoryFilter, ColumnFilter, SingletonString);
AnalysisLogger.getLogger().debug("End processing pass");
// if (oneshotMode)
// checkingResults = getClassification();
// else
checkingResults = getClassification();
// the recursive call resets triesCounter to 0 on exit: further retries were already handled there
if (triesCounter == 0)
break;
}
long t1 = System.currentTimeMillis() - t0;
AnalysisLogger.getLogger().warn("...End Calculation in " + t1 + "ms");
triesCounter = 0;
// close the session when it is no longer needed
// NOTE(review): the session is not closed when an exception is thrown above, and in one-shot mode
// a recursive retry pass reaches this close as well - confirm both are acceptable
if (oneshotMode)
dbSession.close();
}
/**
 * Legacy classification: wraps every best category/column/score triple held
 * by the engine into a SingleResult, with a null table name and "0" as index.
 * Scores are copied as they are, without weighting or normalization.
 *
 * @return one SingleResult per best category, in the engine's order
 */
public ArrayList<SingleResult> getClassificationOLD() {
    final int total = processor.bestCategories.size();
    ArrayList<SingleResult> legacy = new ArrayList<SingleResult>();
    int idx = 0;
    while (idx < total) {
        SingleResult entry = new SingleResult(processor.bestCategories.get(idx), processor.bestColumns.get(idx), processor.bestScores.get(idx), null, "0");
        legacy.add(entry);
        idx++;
    }
    return legacy;
}
/**
 * Returns the engine's singleton matches after pruning the poor ones: every
 * match whose score is lower than (score of the first match - configured
 * maximum deviation) is removed from the engine's list in place.
 *
 * @return the pruned list held by the engine, or a new empty list when the engine has no singleton matches
 */
public ArrayList<SingleResult> getDetailedMatches() {
    if (processor.getSingletonMatches() == null) {
        return new ArrayList<SingleResult>();
    }
    // the deviation from the first score defines the cut-off
    float maxDeviation = config.getSingleEntryRecognitionMaxDeviation();
    ArrayList<SingleResult> matches = processor.getSingletonMatches();
    double cutoff = 0;
    if (matches.size() > 0) {
        cutoff = matches.get(0).getScore() - maxDeviation;
    }
    // walk backwards so that removals do not shift the positions still to be visited
    for (int pos = matches.size() - 1; pos >= 0; pos--) {
        if (matches.get(pos).getScore() < cutoff) {
            matches.remove(pos);
        }
    }
    return processor.getSingletonMatches();
}
/**
 * Returns the singleton element held by the engine.
 *
 * @return the engine's singleton element, or an empty string when it is null
 */
public String getDetailedSingletonEntry() {
    return (processor.getSingletonElement() != null) ? processor.getSingletonElement() : "";
}
/**
 * Plain classification: rescales every best score to a percentage of the
 * highest score (the divisor is the highest score, plus 1 when there is more
 * than one candidate) and keeps the candidates whose percentage exceeds the
 * configured category discard differential threshold.
 *
 * @return the accepted candidates, in the engine's order
 */
public ArrayList<SingleResult> getClassificationPlain() {
    final int total = processor.bestCategories.size();
    // first pass: highest raw score
    double top = 0;
    for (int k = 0; k < total; k++) {
        double candidate = processor.bestScores.get(k);
        if (top < candidate) {
            top = candidate;
        }
    }
    final double divisor = top + ((total > 1) ? 1 : 0);
    // second pass: normalize to a percentage and filter
    ArrayList<SingleResult> accepted = new ArrayList<SingleResult>();
    for (int k = 0; k < total; k++) {
        double raw = processor.bestScores.get(k);
        double percent = (raw / divisor) * 100;
        if (percent > config.categoryDiscardDifferencialThreshold) {
            Reference reference = col.getCategory(processor.bestCategories.get(k));
            accepted.add(new SingleResult(processor.bestCategories.get(k), processor.bestColumns.get(k), percent, reference.getTableName(), reference.getIndex()));
        }
    }
    return accepted;
}
/**
 * Builds the final classification from the engine's best categories.
 *
 * Steps:
 * 1. every raw score is multiplied by a weight derived from the share of
 *    reference elements the category holds among all candidates, then capped at 1;
 * 2. the weighted scores are rescaled to a percentage of the highest weighted
 *    score (the divisor is increased by 1 when there is more than one candidate);
 * 3. candidates above the configured category discard differential threshold
 *    are inserted in descending score order, skipping repetitions;
 * 4. the list is cut to at most MAXRESULTS entries, dropping the lowest scores.
 *
 * Fixes over the previous version: the insertion point is now the FIRST entry
 * with a lower score (the search used to keep going and picked the last one,
 * which broke the ordering); the truncation is computed from the size of the
 * result list rather than from the number of candidates (which removed too
 * many entries and could index below zero); a zero element sum no longer
 * makes the division throw.
 *
 * @return the accepted candidates, best score first, at most MAXRESULTS entries
 */
public ArrayList<SingleResult> getClassification() {
    ArrayList<SingleResult> results = new ArrayList<SingleResult>();
    int size = processor.bestCategories.size();
    double maxscore = 0;
    BigDecimal sumElements = BigDecimal.ZERO;
    ArrayList<Double> subscores = new ArrayList<Double>();
    // total number of reference elements over all the candidate categories
    for (int i = 0; i < size; i++) {
        BigInteger catElements = col.getScoresTable().get(processor.bestCategories.get(i)).getCategoryElements();
        sumElements = sumElements.add(new BigDecimal(catElements));
    }
    for (int i = 0; i < size; i++) {
        double score = processor.bestScores.get(i);
        // weight the score by the importance (relative size) of the category
        BigInteger catElements = col.getScoresTable().get(processor.bestCategories.get(i)).getCategoryElements();
        double weight = 0;
        // guard: with an empty element sum the division would throw ArithmeticException
        if (sumElements.signum() != 0) {
            weight = new BigDecimal(catElements).divide(sumElements, 2, BigDecimal.ROUND_HALF_UP).doubleValue();
        }
        // NOTE(review): weight is a share of the total, so the ">= 3" branch looks unreachable - confirm
        if (weight >= 3) {
            weight = 2 * Math.log(weight * 100) / 10f;
        } else if ((weight >= 0.5) && (weight <= 1)) {
            // dominant categories are damped logarithmically
            weight = Math.log(weight * 100) / 100.00f;
        } else if (weight < 0.05) {
            // very small categories get a minimum weight
            weight = 0.05;
        }
        AnalysisLogger.getLogger().warn("WEIGHT FOR CATEGORY " + processor.bestCategories.get(i) + "-" + processor.bestColumns.get(i) + " : " + weight + " SCORE " + score);
        // apply the weight and cap the result at 1
        score = score * weight;
        score = Math.min(1, score);
        if (maxscore < score) {
            maxscore = score;
        }
        subscores.add(score);
    }
    for (int i = 0; i < size; i++) {
        double score = subscores.get(i);
        // normalize to a percentage of the best weighted score
        score = (score / (maxscore + ((size > 1) ? 1 : 0))) * 100;
        if (score > config.categoryDiscardDifferencialThreshold) {
            // descending order: insert before the first entry with a lower score, else append
            int index = results.size();
            for (int j = 0; j < results.size(); j++) {
                if (results.get(j).getScore() < score) {
                    index = j;
                    break;
                }
            }
            Reference ref = col.getCategory(processor.bestCategories.get(i));
            SingleResult sr = new SingleResult(processor.bestCategories.get(i), processor.bestColumns.get(i), score, ref.getTableName(), ref.getIndex());
            // control for repetitions
            if (isnotRepetition(sr, results)) {
                results.add(index, sr);
            }
        }
    }
    // limit the result list after rescoring: drop the lowest-scored tail
    while (results.size() > MAXRESULTS) {
        results.remove(results.size() - 1);
    }
    return results;
}
/**
 * Tells whether a result is new with respect to the results collected so far.
 * Two results are considered the same when both category and column match,
 * ignoring letter case.
 *
 * Fix: the previous version assigned true when a duplicate was found, so it
 * always returned true and repetitions were never filtered out.
 *
 * @param result the candidate result
 * @param previous the results already accepted
 * @return false when previous already contains the same category/column pair, true otherwise
 */
private boolean isnotRepetition(SingleResult result, ArrayList<SingleResult> previous) {
    for (SingleResult seen : previous) {
        boolean sameCategory = seen.getCategory().equalsIgnoreCase(result.getCategory());
        if (sameCategory && seen.getColumn().equalsIgnoreCase(result.getColumn())) {
            return false;
        }
    }
    return true;
}
}

View File

@ -0,0 +1,36 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.run;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.DBObjectTranslator;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.TSObjectTransformer;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
import org.gcube.contentmanagement.lexicalmatcher.utils.DatabaseFactory;
import org.hibernate.SessionFactory;
/**
 * Command-line entry point that reads the whole reference structure from the
 * database and hands it to TSObjectTransformer.transform2Graph.
 */
public class StarGraphExtraction {

    /** Hibernate configuration file used to open the database connection. */
    private final static String ConfigurationFileNameLocal = "hibernate.cfg.xml";

    /**
     * Runs the extraction; any failure is printed to standard error.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            RunMain();
        } catch (Exception e) {
            // command-line tool: report the failure and let the JVM terminate
            e.printStackTrace();
        }
    }

    /**
     * Configures the logger, opens the database connection, builds the whole
     * structure and transforms it into a graph. The session factory is closed
     * when the work is done, also when a step fails.
     *
     * @throws Exception propagated from the database or the transformation
     */
    public static void RunMain() throws Exception {
        AnalysisLogger.setLogger("./ALog.properties");
        // DB configuration: initialise the session and connect
        SessionFactory dbSession = DatabaseFactory.initDBConnection(ConfigurationFileNameLocal);
        try {
            DBObjectTranslator dbo = new DBObjectTranslator();
            dbo.buildWholeStructure(dbSession, null, null, null, null, null);
            TSObjectTransformer.transform2Graph(dbo);
        } finally {
            // release the connection resources that were previously left open
            if (dbSession != null) {
                dbSession.close();
            }
        }
    }
}

View File

@ -0,0 +1,49 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.test;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Manual test: guesses the category of one column of an imported table using
 * a configuration supplied entirely from code, then prints the classification.
 */
public class TestExternalCfgProduction {

    /**
     * Runs the single bench; any failure is printed to standard error.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            String configPath = ".";
            CategoryGuesser guesser = new CategoryGuesser(configPath);
            //bench 1
            AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------");
            String seriesName = "IMPORT_ecd2e3a0_ee90_11e0_be9e_90f3621758ee";
            String column = "field4";
            LexicalEngineConfiguration conf = new LexicalEngineConfiguration();
            // reference code list and the columns to read from it
            conf.setReferenceTable("codelist1733371938");
            conf.setReferenceColumn("ifield14");
            conf.setNameHuman("ifield1");
            conf.setIdColumn("ifield0");
            conf.setDescription("ifield2");
            //database Parameters
            // NOTE(review): credentials are hard-coded in source; consider loading them from local configuration
            conf.setDatabaseUserName("gcube");
            conf.setDatabasePassword("d4science2");
            conf.setDatabaseDriver("org.postgresql.Driver");
            conf.setDatabaseURL("jdbc:postgresql://localhost/testdb");
            conf.setDatabaseDialect("org.hibernate.dialect.PostgreSQLDialect");
            guesser.runGuesser(seriesName, column, conf);
            // showResults is static: call it on the class, as the sibling tests do
            CategoryGuesser.showResults(guesser.getClassification());
            AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

View File

@ -0,0 +1,64 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.test;
import java.util.ArrayList;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Manual test: matches a single string against one category/column of the
 * reference code list, using a configuration supplied from code, and prints
 * the detailed matches.
 */
public class TestSingleExternalCfgProduction {

    /**
     * Runs the single bench; any failure is printed to standard error.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            final CategoryGuesser matcher = new CategoryGuesser(".");
            //bench 1
            AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------");
            final String entry = "Faroe Island";
            final String categoryFilter = "COUNTRY_OLD";
            final String columnFilter = "field6";
            matcher.runGuesser(entry, buildConfiguration(), categoryFilter, columnFilter);
            ArrayList<SingleResult> matches = matcher.getDetailedMatches();
            AnalysisLogger.getLogger().warn("Detailed Match on Name :" + entry);
            CategoryGuesser.showResults(matches);
            AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Builds the external configuration used by the bench (the database driver is deliberately left unset). */
    private static LexicalEngineConfiguration buildConfiguration() {
        LexicalEngineConfiguration settings = new LexicalEngineConfiguration();
        settings.setReferenceTable("codelist1733371938");
        settings.setReferenceColumn("ifield14");
        settings.setNameHuman("ifield1");
        settings.setIdColumn("ifield0");
        settings.setDescription("ifield2");
        // change the acceptance threshold to enhance the recall
        settings.setEntryAcceptanceThreshold(30);
        settings.setReferenceChunksToTake(-1);
        settings.setTimeSeriesChunksToTake(-1);
        settings.setUseSimpleDistance(false);
        // database parameters
        settings.setDatabaseUserName("gcube");
        settings.setDatabasePassword("d4science2");
        settings.setDatabaseURL("jdbc:postgresql://localhost/testdb");
        settings.setDatabaseDialect("org.hibernate.dialect.PostgreSQLDialect");
        return settings;
    }
}

View File

@ -0,0 +1,58 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Manual benchmark over four columns of one imported table. Benches 1-3 run
 * the accuracy calculation; bench 4 only logs its banner lines because its
 * calculation is disabled.
 */
public class BenchMarkTest1 {

    /** Table analysed by every bench of this class. */
    private static final String SERIES = "import_bdefb470_5cea_11df_a0a6_909e7d074592";

    /**
     * Runs the benches in order; the first failure stops the sequence and is printed to standard error.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            final int attempts = 1;
            final String configPath = ".";
            final CategoryGuesser guesser = new CategoryGuesser(configPath);
            bench(1, true, guesser, configPath, "field1", attempts, "country", "name_en");
            bench(2, true, guesser, configPath, "field2", attempts, "area", "name_en");
            bench(3, true, guesser, configPath, "field4", attempts, "species", "scientific_name");
            // bench 4 (field3 -> species/scientific_name) is currently disabled
            bench(4, false, guesser, configPath, "field3", attempts, "species", "scientific_name");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Logs the opening banner, runs the accuracy calculation when enabled, then logs the closing banner. */
    private static void bench(int number, boolean enabled, CategoryGuesser guesser, String configPath, String column, int attempts, String correctFamily, String correctColumn) throws Exception {
        AnalysisLogger.getLogger().warn("----------------------BENCH " + number + "-------------------------");
        if (enabled) {
            CategoryGuesser.AccuracyCalc(guesser, configPath, SERIES, column, attempts, correctFamily, correctColumn);
        }
        AnalysisLogger.getLogger().warn("--------------------END BENCH " + number + "-----------------------\n");
    }
}

View File

@ -0,0 +1,54 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Manual benchmark over three columns of one imported table. The benches are
 * labelled 1, 2 and 4 in the log output.
 */
public class BenchMarkTest2 {

    /** Table analysed by every bench of this class. */
    private static final String SERIES = "import_2c97f580_35a0_11df_b8b3_aa10916debe6";

    /**
     * Runs the benches in order; the first failure stops the sequence and is printed to standard error.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            final int attempts = 1;
            final String configPath = ".";
            final CategoryGuesser guesser = new CategoryGuesser(configPath);
            bench(1, guesser, configPath, "field1", attempts, "SPECIES", "SCIENTIFIC_NAME");
            bench(2, guesser, configPath, "field2", attempts, "COUNTRY", "ISO_3_CODE");
            bench(4, guesser, configPath, "field3", attempts, "AREA", "NAME_EN");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Logs the opening banner, runs the accuracy calculation, then logs the closing banner. */
    private static void bench(int number, CategoryGuesser guesser, String configPath, String column, int attempts, String correctFamily, String correctColumn) throws Exception {
        AnalysisLogger.getLogger().warn("----------------------BENCH " + number + "-------------------------");
        CategoryGuesser.AccuracyCalc(guesser, configPath, SERIES, column, attempts, correctFamily, correctColumn);
        AnalysisLogger.getLogger().warn("--------------------END BENCH " + number + "-----------------------\n");
    }
}

View File

@ -0,0 +1,31 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Manual benchmark: checks that column field1 of the imported table is
 * recognised as SPECIES / SCIENTIFIC_NAME.
 */
public class BenchMarkTest3 {

    /**
     * Runs the bench; any failure is printed to standard error.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            runBench();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Executes bench 1 between its two banner log lines. */
    private static void runBench() throws Exception {
        final int attempts = 1;
        final String configPath = ".";
        final CategoryGuesser guesser = new CategoryGuesser(configPath);
        AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------");
        CategoryGuesser.AccuracyCalc(guesser, configPath, "import_2c97f580_35a0_11df_b8b3_aa10916debe6", "field1", attempts, "SPECIES", "SCIENTIFIC_NAME");
        AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n");
    }
}

View File

@ -0,0 +1,31 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Manual benchmark: checks that column field3 of the imported table is
 * recognised as AREA / NAME_EN.
 */
public class BenchMarkTest4 {

    /**
     * Runs the bench; any failure is printed to standard error.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            runBench();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Executes bench 1 between its two banner log lines. */
    private static void runBench() throws Exception {
        final int attempts = 1;
        final String configPath = ".";
        final CategoryGuesser guesser = new CategoryGuesser(configPath);
        AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------");
        CategoryGuesser.AccuracyCalc(guesser, configPath, "import_2c97f580_35a0_11df_b8b3_aa10916debe6", "field3", attempts, "AREA", "NAME_EN");
        AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n");
    }
}

View File

@ -0,0 +1,31 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Manual benchmark: checks that column field2 of the imported table is
 * recognised as ISSCAAP GROUP / NAME_EN.
 */
public class BenchMarkTest5 {

    /**
     * Runs the bench; any failure is printed to standard error.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            runBench();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Executes bench 1 between its two banner log lines. */
    private static void runBench() throws Exception {
        final int attempts = 1;
        final String configPath = ".";
        final CategoryGuesser guesser = new CategoryGuesser(configPath);
        AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------");
        CategoryGuesser.AccuracyCalc(guesser, configPath, "import_532bba80_1c8f_11df_a4ee_87804054691e", "field2", attempts, "ISSCAAP GROUP", "NAME_EN");
        AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n");
    }
}

View File

@ -0,0 +1,52 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Manual benchmark: runs the accuracy calculation for column field2 of the
 * imported table with every engine and database parameter supplied from code.
 */
public class BenchMarkTestExternalCfg {

    /**
     * Runs the bench; any failure is printed to standard error.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            final int attempts = 1;
            final String configPath = ".";
            final CategoryGuesser guesser = new CategoryGuesser(configPath);
            //bench 1
            AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------");
            final LexicalEngineConfiguration settings = buildConfiguration();
            CategoryGuesser.AccuracyCalc(settings, guesser, configPath, "import_532bba80_1c8f_11df_a4ee_87804054691e", "field2", attempts, "ISSCAAP GROUP", "NAME_EN");
            AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Builds the external configuration used by the bench (no database password is set). */
    private static LexicalEngineConfiguration buildConfiguration() {
        LexicalEngineConfiguration settings = new LexicalEngineConfiguration();
        // engine parameters
        settings.setCategoryDiscardDifferencialThreshold(5);
        settings.setCategoryDiscardThreshold(0);
        settings.setChunkSize(25);
        settings.setEntryAcceptanceThreshold(50);
        settings.setNumberOfThreadsToUse(2);
        settings.setRandomTake(true);
        settings.setReferenceChunksToTake(20);
        settings.setTimeSeriesChunksToTake(1);
        settings.setUseSimpleDistance(false);
        // database parameters
        settings.setDatabaseUserName("root");
        settings.setDatabaseDriver("com.mysql.jdbc.Driver");
        settings.setDatabaseURL("jdbc:mysql://localhost/timeseries");
        settings.setDatabaseDialect("org.hibernate.dialect.MySQLDialect");
        settings.setDatabaseAutomaticTestTable("connectiontesttable");
        settings.setDatabaseIdleConnectionTestPeriod("3600");
        return settings;
    }
}

View File

@ -0,0 +1,38 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old;
import java.util.ArrayList;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Manual benchmark: runs the guesser on ref_order.scientific_name with the
 * category filter "order" and the column filter "scientific_name", then
 * prints the classification.
 */
public class BenchMarkTestFilterCategory {

    /**
     * Runs the bench; any failure is printed to standard error.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            final CategoryGuesser guesser = new CategoryGuesser(".");
            //bench 1
            AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------");
            final String categoryFilter = "order";
            final String columnFilter = "scientific_name";
            guesser.runGuesser("ref_order", "scientific_name", null, categoryFilter, columnFilter);
            ArrayList<SingleResult> classification = guesser.getClassification();
            CategoryGuesser.showResults(classification);
            AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

View File

@ -0,0 +1,51 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old;
import java.util.ArrayList;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Manual benchmark: matches the single string "sarda sarda" against the
 * "catalog life" category, column "scientific_name", and prints the detailed
 * matches.
 */
public class BenchMarkTestSingleton {

    /**
     * Runs the bench; any failure is printed to standard error.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            final CategoryGuesser matcher = new CategoryGuesser(".");
            //bench 1
            AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------");
            // other entries tried in the past: "Mitella pollicipes", "policipes", ""
            final String entry = "sarda sarda";
            final String categoryFilter = "catalog life";
            final String columnFilter = "scientific_name";
            matcher.runGuesser(entry, buildConfiguration(), categoryFilter, columnFilter);
            ArrayList<SingleResult> matches = matcher.getDetailedMatches();
            AnalysisLogger.getLogger().warn("Detailed Match on Name :" + entry);
            CategoryGuesser.showResults(matches);
            AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Builds the external configuration used by the bench. */
    private static LexicalEngineConfiguration buildConfiguration() {
        LexicalEngineConfiguration settings = new LexicalEngineConfiguration();
        // change the acceptance threshold to enhance the recall
        settings.setEntryAcceptanceThreshold(30);
        settings.setReferenceChunksToTake(-1);
        settings.setTimeSeriesChunksToTake(-1);
        settings.setUseSimpleDistance(false);
        return settings;
    }
}

View File

@ -0,0 +1,31 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Manual benchmark: checks that column field1 of the imported table is
 * recognised as country / name_en.
 */
public class BenchMarkTestTSCountry {

    /**
     * Runs the bench; any failure is printed to standard error.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            runBench();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Executes bench 1 between its two banner log lines. */
    private static void runBench() throws Exception {
        final int attempts = 1;
        final String configPath = ".";
        final CategoryGuesser guesser = new CategoryGuesser(configPath);
        AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------");
        CategoryGuesser.AccuracyCalc(guesser, configPath, "import_bdefb470_5cea_11df_a0a6_909e7d074592", "field1", attempts, "country", "name_en");
        AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n");
    }
}

View File

@ -0,0 +1,88 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Manual benchmark over the reference tables themselves. Benches 1 and 2 run
 * the accuracy calculation; benches 3-7 only log their banner lines because
 * their calculation is disabled.
 */
public class BenchMarkTrainingSet {

    /**
     * Runs the benches in order; the first failure stops the sequence and is printed to standard error.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            final String configPath = ".";
            final int attempts = 1;
            final CategoryGuesser guesser = new CategoryGuesser(configPath);
            bench(1, true, guesser, configPath, "ref_commission", "name_en", attempts, "commission", "name_en");
            bench(2, true, guesser, configPath, "ref_species", "scientific_name", attempts, "species", "scientific_name");
            // the remaining benches are currently disabled
            bench(3, false, guesser, configPath, "ref_area", "name_en", attempts, "area", "name_en");
            bench(4, false, guesser, configPath, "ref_ocean", "name_en", attempts, "ocean", "name_en");
            bench(5, false, guesser, configPath, "ref_geo_region", "name_en", attempts, "geo region", "name_en");
            bench(6, false, guesser, configPath, "ref_fa_region", "name_en", attempts, "fa region", "name_en");
            bench(7, false, guesser, configPath, "ref_order", "scientific_name", attempts, "order", "scientific_name");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Logs the opening banner, runs the accuracy calculation when enabled, then logs the closing banner. */
    private static void bench(int number, boolean enabled, CategoryGuesser guesser, String configPath, String seriesName, String column, int attempts, String correctFamily, String correctColumn) throws Exception {
        AnalysisLogger.getLogger().warn("----------------------BENCH " + number + "-------------------------");
        if (enabled) {
            CategoryGuesser.AccuracyCalc(guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn);
        }
        AnalysisLogger.getLogger().warn("--------------------END BENCH " + number + "-----------------------\n");
    }
}

View File

@ -0,0 +1,33 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Manual benchmark: checks that ref_species.scientific_name is recognised as
 * species / scientific_name.
 */
public class BenchMarkTrainingSetScientificName {

    /**
     * Runs the bench; any failure is printed to standard error.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            runBench();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Executes bench 1 between its two banner log lines. */
    private static void runBench() throws Exception {
        final String configPath = ".";
        final int attempts = 1;
        final CategoryGuesser guesser = new CategoryGuesser(configPath);
        AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------");
        CategoryGuesser.AccuracyCalc(guesser, configPath, "ref_species", "scientific_name", attempts, "species", "scientific_name");
        AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n");
    }
}

View File

@ -0,0 +1,64 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Manual driver that runs the {@code CategoryGuesser} on one column of an
 * imported series, using a {@code LexicalEngineConfiguration} filled in
 * programmatically with database and reference-table settings.
 *
 * NOTE(review): the database user name, password and URL are hard-coded below.
 */
public class TestExternalCfgProduction {

    /**
     * Runs the guesser and shows the resulting classification. Any exception is
     * printed to standard error and otherwise ignored.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        try {
            final String cfgFolder = ".";
            final CategoryGuesser categoryGuesser = new CategoryGuesser(cfgFolder);

            // bench 1
            AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------");
            // alternative series: "rdmc366dfe0ddf511e086b1b1c5d6fb1c27"
            final String series = "IMPORT_ecd2e3a0_ee90_11e0_be9e_90f3621758ee";
            final String seriesColumn = "field4";

            final LexicalEngineConfiguration engineCfg = new LexicalEngineConfiguration();
            /*
             * Optional tuning parameters, currently left unset:
             * engineCfg.setCategoryDiscardDifferencialThreshold(5);
             * engineCfg.setCategoryDiscardThreshold(0);
             * engineCfg.setChunkSize(25);
             * engineCfg.setEntryAcceptanceThreshold(50);
             * engineCfg.setNumberOfThreadsToUse(2);
             * engineCfg.setRandomTake(true);
             * engineCfg.setReferenceChunksToTake(20);
             * engineCfg.setTimeSeriesChunksToTake(1);
             * engineCfg.setUseSimpleDistance(false);
             */

            // database parameters
            engineCfg.setDatabaseUserName("utente");
            engineCfg.setDatabasePassword("d4science");
            // engineCfg.setDatabaseDriver("org.postgresql.Driver");
            engineCfg.setDatabaseURL("jdbc:postgresql://dbtest.next.research-infrastructures.eu/timeseries");
            engineCfg.setDatabaseDialect("org.hibernate.dialect.PostgreSQLDialect");
            engineCfg.setDatabaseAutomaticTestTable("connectiontesttable");
            engineCfg.setDatabaseIdleConnectionTestPeriod("3600");

            // reference table layout
            engineCfg.setReferenceTable("codelist1733371938");
            engineCfg.setReferenceColumn("ifield14");
            engineCfg.setNameHuman("ifield1");
            engineCfg.setIdColumn("ifield0");
            engineCfg.setDescription("ifield2");

            categoryGuesser.runGuesser(series, seriesColumn, engineCfg);
            categoryGuesser.showResults(categoryGuesser.getClassification());

            AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n");
        } catch (Exception failure) {
            failure.printStackTrace();
        }
    }
}

View File

@ -0,0 +1,71 @@
package org.gcube.contentmanagement.lexicalmatcher.analysis.test.old;
import java.util.ArrayList;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult;
import org.gcube.contentmanagement.lexicalmatcher.analysis.run.CategoryGuesser;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
/**
 * Manual driver that matches a single string against one column of one
 * family through the {@code CategoryGuesser}, using a programmatically filled
 * {@code LexicalEngineConfiguration}, and then shows the detailed matches.
 *
 * NOTE(review): the database user name, password and URL are hard-coded below.
 */
public class TestSingleExternalCfgProduction {

    /**
     * Runs the single-entry match. Any exception is printed to standard error
     * and otherwise ignored.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        try {
            final String cfgFolder = ".";
            final CategoryGuesser categoryGuesser = new CategoryGuesser(cfgFolder);

            // bench 1
            AnalysisLogger.getLogger().warn("----------------------BENCH 1-------------------------");
            // other entries tried in the past: "Mitella pollicipes", "policipes", ""
            final String entry = "Faroe Island";
            // alternative family: "rdf0a7fb500dd3d11e0b8d1d1e2e7ba4f9d"
            final String familyName = "COUNTRY_OLD";
            final String familyColumn = "field6";

            final LexicalEngineConfiguration engineCfg = new LexicalEngineConfiguration();
            // CHANGE THIS TO ENHANCE THE RECALL
            engineCfg.setEntryAcceptanceThreshold(30);
            engineCfg.setReferenceChunksToTake(-1);
            engineCfg.setTimeSeriesChunksToTake(-1);
            engineCfg.setUseSimpleDistance(false);

            // database parameters
            engineCfg.setDatabaseUserName("utente");
            engineCfg.setDatabasePassword("d4science");
            // engineCfg.setDatabaseDriver("org.postgresql.Driver");
            engineCfg.setDatabaseURL("jdbc:postgresql://dbtest.next.research-infrastructures.eu/timeseries");
            engineCfg.setDatabaseDialect("org.hibernate.dialect.PostgreSQLDialect");
            engineCfg.setDatabaseAutomaticTestTable("connectiontesttable");
            engineCfg.setDatabaseIdleConnectionTestPeriod("3600");

            // reference table layout
            engineCfg.setReferenceTable("codelist1733371938");
            engineCfg.setReferenceColumn("ifield14");
            engineCfg.setNameHuman("ifield1");
            engineCfg.setIdColumn("ifield0");
            engineCfg.setDescription("ifield2");

            categoryGuesser.initSingleMatcher(engineCfg, familyColumn);
            categoryGuesser.runGuesser(entry, null, familyName, familyColumn);

            final ArrayList<SingleResult> matches = categoryGuesser.getDetailedMatches();
            AnalysisLogger.getLogger().warn("Detailed Match on Name :" + entry);
            CategoryGuesser.showResults(matches);
            AnalysisLogger.getLogger().warn("--------------------END BENCH 1-----------------------\n");
        } catch (Exception failure) {
            failure.printStackTrace();
        }
    }
}

View File

@ -0,0 +1,37 @@
package org.gcube.contentmanagement.lexicalmatcher.utils;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
/**
 * Static holder for the log4j loggers used by the lexical matcher.
 */
public class AnalysisLogger {

    private static Logger logger;
    // assigned in setLogger but never read inside this class
    private static Logger hibernateLogger;

    /**
     * Returns the logger named "AnalysisLogger". If no logger has been set
     * yet, log4j is first configured from "./ALog.properties".
     *
     * @return the shared logger
     */
    public static Logger getLogger() {
        if (logger == null) {
            // setLogger both configures log4j and assigns the static field
            setLogger("./ALog.properties");
        }
        return logger;
    }

    /**
     * Configures log4j (only if no logger has been set yet) and (re)binds the
     * "AnalysisLogger" and "hibernate" loggers.
     *
     * @param path path to the log4j configuration file
     */
    public static void setLogger(String path) {
        if (logger == null) {
            PropertyConfigurator.configure(path);
        }
        logger = Logger.getLogger("AnalysisLogger");
        hibernateLogger = Logger.getLogger("hibernate");
    }

    /**
     * Logs every stack-trace element of the given exception at ERROR level,
     * one log entry per element.
     *
     * @param e the exception whose stack trace is logged; {@code null} is ignored
     */
    public static void printStackTrace(Exception e) {
        if (e == null) {
            return;
        }
        // go through getLogger() so that a call made before any explicit
        // initialisation does not fail on a null static field
        Logger target = getLogger();
        StackTraceElement[] frames = e.getStackTrace();
        for (StackTraceElement frame : frames) {
            target.error(frame);
        }
    }
}

View File

@ -0,0 +1,207 @@
package org.gcube.contentmanagement.lexicalmatcher.utils;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.util.Iterator;
import java.util.List;
import javax.xml.parsers.DocumentBuilderFactory;
import org.dom4j.Document;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.hibernate.Query;
import org.hibernate.Session;
import org.hibernate.SessionFactory;
import org.hibernate.cfg.Configuration;
/**
 * Static helpers that build a Hibernate {@link SessionFactory} from an XML
 * configuration file and run queries/updates inside a transaction on the
 * factory's current session.
 */
public class DatabaseFactory {

    /**
     * Builds a session factory from the given Hibernate XML configuration file.
     *
     * @param configurationFile path to the Hibernate XML configuration
     * @return the session factory built from that configuration
     * @throws Exception if the file cannot be read or parsed, or the factory cannot be built
     */
    public static SessionFactory initDBConnection(String configurationFile) throws Exception {
        String xml = FileTools.readXMLDoc(configurationFile);
        // NOTE(review): getBytes() uses the platform charset; confirm it agrees
        // with the encoding declared in the XML text
        ByteArrayInputStream xmlBytes = new ByteArrayInputStream(xml.getBytes());
        Configuration cfg = new Configuration();
        cfg = cfg.configure(DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(xmlBytes));
        return cfg.buildSessionFactory();
    }

    /**
     * Builds a session factory from the given Hibernate XML configuration file,
     * replacing the connection driver, URL, user name, password, dialect and the
     * two c3p0 properties with the non-null values found in {@code config}.
     * When {@code config} is {@code null} this behaves like
     * {@link #initDBConnection(String)}.
     *
     * @param configurationFile path to the Hibernate XML configuration
     * @param config overriding values; may be {@code null}
     * @return the session factory built from the patched configuration
     * @throws Exception if the file cannot be read or parsed, or the factory cannot be built
     */
    @SuppressWarnings({"unchecked"})
    public static SessionFactory initDBConnection(String configurationFile, LexicalEngineConfiguration config) throws Exception {
        if (config == null)
            return initDBConnection(configurationFile);

        // parse the configuration file; the stream is released even when parsing fails
        Document document;
        FileInputStream stream = new FileInputStream(new File(configurationFile));
        try {
            document = new SAXReader().read(stream);
        } finally {
            stream.close();
        }

        List<Node> nodes = document.selectNodes("//hibernate-configuration/session-factory/property");
        Iterator<Node> nodesIterator = nodes.iterator();
        while (nodesIterator.hasNext()) {
            Node currentnode = nodesIterator.next();
            String element = currentnode.valueOf("@name");

            if (element.equals("connection.driver_class")) {
                overrideText(currentnode, config.getDatabaseDriver());
            } else if (element.equals("connection.url")) {
                overrideText(currentnode, config.getDatabaseURL());
            } else if (element.equals("connection.username")) {
                overrideText(currentnode, config.getDatabaseUserName());
            } else if (element.equals("connection.password")) {
                overrideText(currentnode, config.getDatabasePassword());
            } else if (element.equals("dialect")) {
                AnalysisLogger.getLogger().trace("Dialect -> " + config.getDatabaseDialect());
                overrideText(currentnode, config.getDatabaseDialect());
            } else if (element.equals("c3p0.idleConnectionTestPeriod")) {
                overrideText(currentnode, config.getDatabaseIdleConnectionTestPeriod());
            } else if (element.equals("c3p0.automaticTestTable")) {
                overrideText(currentnode, config.getDatabaseAutomaticTestTable());
            }
        }

        // NOTE(review): getBytes() uses the platform charset; confirm it agrees
        // with the encoding declared by document.asXML()
        ByteArrayInputStream xmlBytes = new ByteArrayInputStream(document.asXML().getBytes());
        Configuration cfg = new Configuration();
        cfg = cfg.configure(DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(xmlBytes));
        // NOTE(review): "create" asks Hibernate to (re)create the schema of mapped
        // entities at start-up - confirm this is intended for this code path
        cfg.setProperty("hibernate.hbm2ddl.auto", "create");
        return cfg.buildSessionFactory();
    }

    // Replaces the text of a configuration node when an overriding value is available.
    private static void overrideText(Node node, String value) {
        if (value != null) {
            node.setText(value);
        }
    }

    /**
     * Runs a query inside a transaction on the factory's current session.
     * Failures are printed to standard error and rolled back; they are not
     * propagated to the caller.
     *
     * @param query the query text
     * @param DBSessionFactory the factory providing the current session
     * @param useSQL {@code true} to run the text as native SQL, {@code false} as HQL
     * @return the result list, or {@code null} when the query fails or yields no rows
     */
    @SuppressWarnings({"unchecked"})
    public static List<Object> executeHQLQuery(String query, SessionFactory DBSessionFactory, boolean useSQL) {
        List<Object> obj = null;
        Session ss = null;
        try {
            ss = DBSessionFactory.getCurrentSession();
            ss.beginTransaction();
            Query qr = useSQL ? ss.createSQLQuery(query) : ss.createQuery(query);
            List<Object> result = qr.list();
            ss.getTransaction().commit();
            // an empty result is reported as null, exactly like a failure
            if (result != null && !result.isEmpty()) {
                obj = result;
            }
        } catch (Exception e) {
            e.printStackTrace();
            rollback(ss);
        }
        return obj;
    }

    /**
     * Runs an update statement inside a transaction on the factory's current
     * session. Failures are rolled back and printed to standard error; they
     * are not propagated to the caller.
     *
     * @param query the statement text
     * @param DBSessionFactory the factory providing the current session
     * @param useSQL {@code true} to run the text as native SQL, {@code false} as HQL
     */
    public static void executeHQLUpdate(String query, SessionFactory DBSessionFactory, boolean useSQL) {
        Session ss = null;
        try {
            ss = DBSessionFactory.getCurrentSession();
            ss.beginTransaction();
            Query qr = useSQL ? ss.createSQLQuery(query) : ss.createQuery(query);
            qr.executeUpdate();
            ss.getTransaction().commit();
        } catch (Exception e) {
            rollback(ss);
            e.printStackTrace();
        }
    }

    /**
     * Runs a native SQL update; shorthand for
     * {@link #executeHQLUpdate(String, SessionFactory, boolean)} with {@code useSQL = true}.
     */
    public static void executeSQLUpdate(String query, SessionFactory DBSessionFactory) {
        executeHQLUpdate(query, DBSessionFactory, true);
    }

    /**
     * Runs a native SQL query; shorthand for
     * {@link #executeHQLQuery(String, SessionFactory, boolean)} with {@code useSQL = true}.
     */
    public static List<Object> executeSQLQuery(String query, SessionFactory DBSessionFactory) {
        return executeHQLQuery(query, DBSessionFactory, true);
    }

    /**
     * Best-effort rollback of the session's transaction followed by closing the
     * session. Every failure is deliberately ignored: this is only called while
     * another error is already being handled.
     *
     * @param ss the session to roll back; {@code null} is ignored
     */
    public static void rollback(Session ss) {
        if (ss == null) {
            return;
        }
        try {
            if (ss.getTransaction() != null)
                ss.getTransaction().rollback();
        } catch (Exception ignored) {
            // nothing more can be done for a failed rollback
        } finally {
            try {
                ss.close();
            } catch (Exception ignored) {
                // the session may already be closed
            }
        }
    }

    /**
     * Saves or updates an object inside a transaction on the factory's current
     * session. Does nothing when the factory is {@code null}.
     *
     * @param obj the object to persist
     * @param DBSessionFactory the factory providing the current session; may be {@code null}
     * @throws Exception the original failure, rethrown after the rollback attempt
     */
    public static void saveObject(Object obj, SessionFactory DBSessionFactory) throws Exception {
        if (DBSessionFactory == null) {
            return;
        }
        Session ss = null;
        try {
            ss = DBSessionFactory.getCurrentSession();
            ss.beginTransaction();
            ss.saveOrUpdate(obj);
            ss.getTransaction().commit();
        } catch (Exception e) {
            rollback(ss);
            throw e;
        }
    }
}

View File

@ -0,0 +1,189 @@
package org.gcube.contentmanagement.lexicalmatcher.utils;
/**
 * Computes string similarity scores: the plain Levenshtein distance
 * ({@link #LD}) and a normalised "complete distance" ({@link #CD}) in which
 * 1 means full agreement and 0 means no agreement.
 */
public class DistanceCalculator {

    // punctuation characters stripped before comparing (compiled once instead of per call)
    private static final java.util.regex.Pattern PUNCTUATION =
            java.util.regex.Pattern.compile("[!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-]");
    // runs of blanks collapsed into a single blank
    private static final java.util.regex.Pattern BLANK_RUNS = java.util.regex.Pattern.compile("[ ]+");

    /**
     * Computes the Levenshtein (edit) distance between two strings.
     *
     * @param s first string, must not be {@code null}
     * @param t second string, must not be {@code null}
     * @return the minimum number of single-character insertions, deletions and
     *         substitutions that turn {@code s} into {@code t}
     */
    public int LD(String s, String t) {
        int n = s.length();
        int m = t.length();
        if (n == 0) {
            return m;
        }
        if (m == 0) {
            return n;
        }

        // only the previous and the current row of the distance matrix are needed
        int[] previous = new int[m + 1];
        int[] current = new int[m + 1];
        for (int j = 0; j <= m; j++) {
            previous[j] = j;
        }

        for (int i = 1; i <= n; i++) {
            char sChar = s.charAt(i - 1);
            current[0] = i;
            for (int j = 1; j <= m; j++) {
                int cost = (sChar == t.charAt(j - 1)) ? 0 : 1;
                int deletion = previous[j] + 1;
                int insertion = current[j - 1] + 1;
                int substitution = previous[j - 1] + cost;
                current[j] = Math.min(Math.min(deletion, insertion), substitution);
            }
            int[] swap = previous;
            previous = current;
            current = swap;
        }
        // after the final swap the last computed row is in "previous"
        return previous[m];
    }

    /**
     * Computes the complete distance with case-sensitive containment and no
     * match boost; see {@link #CD(boolean, String, String, boolean, boolean)}.
     */
    public double CD(boolean useSimpleDistance, String h, String t) {
        return CD(useSimpleDistance, h, t, false, false);
    }

    /**
     * Computes a similarity score between two strings. The output is a
     * fraction: 1 means complete agreement between the inputs.
     *
     * Both strings are first stripped of punctuation, blank-normalised and
     * trimmed. The score is then:
     * <ul>
     * <li>1 when both inputs are {@code null}, 0 when only one is;</li>
     * <li>0 when exactly one cleaned string is empty;</li>
     * <li>1 when the cleaned strings are equal ignoring case (regardless of {@code ignoreCase});</li>
     * <li>0 for any other pair when {@code useSimpleDistance} is set;</li>
     * <li>when one cleaned string contains the other: the covered fraction of
     *     the container times 1.5 (2 with {@code boostMatch}), capped at 0.98;</li>
     * <li>otherwise 1 - LD / (length of the longer cleaned string).</li>
     * </ul>
     *
     * @param useSimpleDistance when {@code true} only exact (case-insensitive) matches score above 0
     * @param h first string, may be {@code null}
     * @param t second string, may be {@code null}
     * @param ignoreCase when {@code true} both strings are lower-cased before containment and Levenshtein checks
     * @param boostMatch when {@code true} the containment score is multiplied by 2 instead of 1.5
     * @return the similarity score
     */
    public double CD(boolean useSimpleDistance, String h, String t, boolean ignoreCase, boolean boostMatch) {
        if (h == null && t == null) {
            return 1;
        }
        if (h == null || t == null) {
            return 0;
        }

        h = treatString(h, ignoreCase);
        t = treatString(t, ignoreCase);
        int lt = t.length();
        int lh = h.length();
        double matchFactor = boostMatch ? 2.0 : 1.5;

        // exactly one of the two cleaned strings is empty
        if ((lt == 0) != (lh == 0)) {
            return 0;
        }
        if (h.equalsIgnoreCase(t)) {
            return 1;
        }
        if (useSimpleDistance) {
            return 0;
        }
        if (t.contains(h)) {
            return containmentScore(t, h, matchFactor);
        }
        if (h.contains(t)) {
            return containmentScore(h, t, matchFactor);
        }

        // fraction derived from the Levenshtein distance
        int levenDist = LD(h, t);
        int maxlen = Math.max(lh, lt);
        return 1 - ((double) levenDist / (double) maxlen);
    }

    // Fraction of "container" covered by occurrences of "part", boosted by "factor" and capped at 0.98.
    private double containmentScore(String container, String part, double factor) {
        String remainder = container.replace(part, "");
        double covered = 1 - ((double) remainder.length() / (double) container.length());
        return Math.min(covered * factor, 0.98);
    }

    // Removes punctuation, collapses multiple blanks, trims and optionally lower-cases.
    private String treatString(String h, boolean ignoreCase) {
        h = PUNCTUATION.matcher(h).replaceAll("");
        h = BLANK_RUNS.matcher(h).replaceAll(" ");
        h = h.trim();
        if (ignoreCase) {
            // Locale.ROOT keeps the result independent of the JVM default locale
            // (under a Turkish locale "I" would otherwise become a dotless i)
            h = h.toLowerCase(java.util.Locale.ROOT);
        }
        return h;
    }

    /**
     * Small demo printing the complete distance between two sample strings.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        String h = "Mediteranean";
        String t = "Mediterranean horse mackerel";
        DistanceCalculator d = new DistanceCalculator();
        double cd = d.CD(false, h, t, true, true);
        System.out.println("Distance between " + h + " and " + t + " : " + cd);
    }
}

View File

@ -0,0 +1,89 @@
package org.gcube.contentmanagement.lexicalmatcher.utils;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import org.dom4j.Document;
import org.dom4j.io.SAXReader;
/**
 * Static helpers for reading and writing text and XML files.
 */
public class FileTools {

    /**
     * Parses an XML file and returns it re-serialised as a string.
     *
     * @param xmlFilePath path of the XML file
     * @return the document as produced by {@code Document.asXML()}
     * @throws Exception if the file cannot be opened or parsed
     */
    public static String readXMLDoc(String xmlFilePath) throws Exception {
        FileInputStream stream = new FileInputStream(new File(xmlFilePath));
        try {
            Document document = new SAXReader().read(stream);
            return document.asXML();
        } finally {
            // release the file handle whether or not parsing succeeded
            stream.close();
        }
    }

    /**
     * Currently a no-op: the method has an empty body and writes nothing.
     *
     * @param filename unused
     * @param string2save unused
     */
    public static void saveString2File(String filename, String string2save) throws Exception {
    }

    /**
     * Tells whether a file can be used as input.
     *
     * @param filename path to check
     * @return {@code true} when the path exists and is readable
     */
    public static boolean checkInput(String filename) {
        File file = new File(filename);
        return file.exists() && file.canRead();
    }

    /**
     * Tells whether a file can be used as output.
     *
     * @param filename path to check
     * @param overwrite whether an existing file may be replaced
     * @return {@code true} when the path does not exist yet, or when it exists,
     *         {@code overwrite} is set and it is a writable non-directory
     */
    public static boolean checkOutput(String filename, boolean overwrite) {
        File file = new File(filename);
        if (!file.exists()) {
            return true;
        }
        if (!overwrite) {
            return false;
        }
        return !file.isDirectory() && file.canWrite();
    }

    /**
     * Reads a whole text file. Every line read is terminated with "\n" in the
     * result, whatever line terminator the file used.
     *
     * @param filename path of the file to read
     * @param encoding name of the charset used to decode the file
     * @return the file content, or {@code null} when the file does not exist or is not readable
     * @throws Exception when the encoding is unsupported or reading fails; the
     *         underlying exception is attached as the cause
     */
    public static String loadString(String filename, String encoding) throws Exception {
        if (!checkInput(filename)) {
            return null;
        }
        try {
            FileInputStream raw = new FileInputStream(filename);
            try {
                BufferedReader in = new BufferedReader(new InputStreamReader(raw, encoding));
                StringBuilder text = new StringBuilder();
                String line;
                while ((line = in.readLine()) != null) {
                    text.append(line).append("\n");
                }
                return text.toString();
            } finally {
                // closing the underlying stream also covers the case where the
                // reader could not be created (unsupported encoding)
                raw.close();
            }
        } catch (IOException e) {
            // UnsupportedEncodingException is an IOException and ends up here too
            throw new Exception("The file " + filename + " is not in the correct format!", e);
        }
    }

    /**
     * Writes a string to a file. Nothing is written (and no error is raised)
     * when {@link #checkOutput(String, boolean)} rejects the target.
     *
     * @param filename path of the file to write
     * @param s text to write
     * @param overwrite whether an existing file may be replaced
     * @param encoding name of the charset used to encode the text
     * @throws Exception when the encoding is unsupported or writing fails; the
     *         underlying exception is attached as the cause
     */
    public static void saveString(String filename, String s, boolean overwrite, String encoding) throws Exception {
        if (!checkOutput(filename, overwrite)) {
            return;
        }
        try {
            FileOutputStream raw = new FileOutputStream(filename);
            try {
                Writer out = new BufferedWriter(new OutputStreamWriter(raw, encoding));
                out.write(s);
                // flushes the buffered text and closes the underlying stream
                out.close();
            } finally {
                // harmless after a successful close; releases the handle on failure
                raw.close();
            }
        } catch (IOException e) {
            throw new Exception("The system can not write in " + filename + " because:\n" + e.getMessage(), e);
        }
    }
}

View File

@ -0,0 +1,99 @@
package org.gcube.contentmanagement.lexicalmatcher.utils;
import java.math.BigInteger;
import java.util.ArrayList;
/**
 * Small numeric helpers: running mean update, random index generation,
 * index sequences, chunk offsets and arithmetic mean.
 */
public class MathFunctions {

    /**
     * Empty entry point; does nothing.
     *
     * @param args not used
     */
    public static void main(String[] args) {
    }

    /**
     * Updates a running mean (or percentage) with one more sample without
     * keeping all previous samples. Computes
     * {@code (perc + quantity / N) * (N / (N + 1))}, which is algebraically
     * {@code (N * perc + quantity) / (N + 1)}.
     *
     * @param perc mean of the first {@code N} samples
     * @param quantity the new sample
     * @param N number of samples already included in {@code perc}
     * @return the updated mean; {@code quantity} itself when {@code N} is 0
     */
    public static float incrementPerc(float perc, float quantity, int N) {
        if (N == 0)
            return quantity;
        int N_plus_1 = N + 1;
        return (float) ((perc + ((double) quantity / (double) N)) * ((double) N / (double) N_plus_1));
    }

    /**
     * Generates distinct random integers in {@code [min, max)}.
     *
     * When {@code numberOfRandoms} is -1 every integer of {@code [min, max)} is
     * returned in ascending order. Otherwise at most {@code numberOfRandoms}
     * values (never more than the size of the range) are drawn without
     * repetition; negative draws are discarded. When the number of values to
     * draw is exactly 0 the result is the single element 0.
     *
     * @param numberOfRandoms how many values to draw, or -1 for the whole range
     * @param min inclusive lower bound
     * @param max exclusive upper bound
     * @return the generated values
     */
    public static ArrayList<Integer> generateRandoms(int numberOfRandoms, int min, int max) {
        ArrayList<Integer> randomsSet = new ArrayList<Integer>();
        if (numberOfRandoms == -1) {
            // -1 means "take everything"
            for (int i = min; i < max; i++) {
                randomsSet.add(i);
            }
        } else {
            // number of distinct values available in [min, max); drawing more
            // than this without repetition would never terminate
            int span = max - min;
            int numofrandstogenerate = Math.min(numberOfRandoms, span);
            if (numofrandstogenerate == 0) {
                randomsSet.add(0);
            } else {
                for (int i = 0; i < numofrandstogenerate; i++) {
                    int RNum;
                    do {
                        // scale by the width of the range so the draw stays below max
                        RNum = min + (int) (span * Math.random());
                    } while (randomsSet.contains(RNum));
                    if (RNum >= 0)
                        randomsSet.add(RNum);
                }
            }
        }
        AnalysisLogger.getLogger().trace("MathFunctions-> generateRandoms " + randomsSet.toString());
        return randomsSet;
    }

    /**
     * Builds the sequence 0, 1, ..., {@code elements - 1}.
     *
     * @param elements length of the sequence
     * @return an array whose i-th element is i
     */
    public static int[] generateSequence(int elements) {
        int[] sequence = new int[elements];
        for (int i = 0; i < elements; i++) {
            sequence[i] = i;
        }
        return sequence;
    }

    /**
     * Returns {@code chunkIndex * chunkSize} computed without int overflow.
     *
     * @param chunkIndex index of the chunk
     * @param chunkSize number of elements per chunk
     * @return the product as a {@link BigInteger}
     */
    public static BigInteger chunk2Index(int chunkIndex, int chunkSize) {
        return BigInteger.valueOf(chunkIndex).multiply(BigInteger.valueOf(chunkSize));
    }

    /**
     * Calculates the arithmetic mean of the given values.
     *
     * @param p the values
     * @return the mean; NaN for an empty array
     */
    public static double mean(double[] p) {
        double sum = 0;
        for (double value : p) {
            sum += value;
        }
        return sum / p.length;
    }
}