// ecological-engine/src/main/java/org/gcube/contentmanagement/lexicalmatcher/analysis/run/CategoryGuesser.java
//
// 531 lines
// 18 KiB
// Java

package org.gcube.contentmanagement.lexicalmatcher.analysis.run;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.math.RoundingMode;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Locale;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.Engine;
import org.gcube.contentmanagement.lexicalmatcher.analysis.core.LexicalEngineConfiguration;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.Category;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.CategoryOrderedList;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.DBObjectTranslator;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.SingleResult;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.data.TSObjectTransformer;
import org.gcube.contentmanagement.lexicalmatcher.analysis.guesser.interfaces.Reference;
import org.hibernate.SessionFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class CategoryGuesser {
// SLF4J logger shared by all static and instance methods of this class.
private static Logger logger = LoggerFactory.getLogger(CategoryGuesser.class);
// Maximum number of entries getClassification() keeps after rescoring.
private final static int MAXRESULTS = 10;
/**
 * Writes a classification result list to the log at WARN level, one numbered line per entry.
 * The column name is included only for entries that carry one.
 *
 * @param results entries to print; numbering starts at 1 and follows list order
 */
public static void showResults(ArrayList<SingleResult> results) {
    logger.warn("CLASSIFICATION RESULT:\n");
    int position = 0;
    for (SingleResult entry : results) {
        position++;
        StringBuilder line = new StringBuilder();
        line.append(position).append(": ").append(entry.getCategory());
        if (entry.getColumn() != null) {
            line.append(" - ").append(entry.getColumn());
        }
        line.append(" ; SCORE: ").append(entry.getStringScore()).append("%");
        logger.warn(line.toString());
    }
}
/**
 * Accuracy check without an external configuration: forwards to the eight-argument
 * overload with a null configuration.
 *
 * @param guesser       guesser instance to exercise
 * @param configPath    forwarded unchanged
 * @param seriesName    table to classify
 * @param column        column to classify
 * @param attempts      number of classification runs
 * @param correctFamily expected category name
 * @param correctColumn expected column name
 * @throws Exception whatever the underlying classification run throws
 */
public static void AccuracyCalc(CategoryGuesser guesser, String configPath, String seriesName, String column, int attempts, String correctFamily, String correctColumn) throws Exception {
    final LexicalEngineConfiguration noExternalConfig = null;
    CategoryGuesser.AccuracyCalc(noExternalConfig, guesser, configPath, seriesName, column, attempts, correctFamily, correctColumn);
}
/**
 * Runs the guesser {@code attempts} times on the same table/column and logs how often
 * the expected family and the expected family/column pair appear in the textual form
 * of the classification.
 *
 * @param externalcfg   optional configuration passed to every run (may be null)
 * @param guesser       guesser instance to exercise
 * @param configPath    not used by this method body
 * @param seriesName    table to classify
 * @param column        column to classify
 * @param attempts      number of classification runs; also the divisor of the percentages
 * @param correctFamily expected category name
 * @param correctColumn expected column name
 * @throws Exception whatever the classification run throws
 */
public static void AccuracyCalc(LexicalEngineConfiguration externalcfg, CategoryGuesser guesser, String configPath, String seriesName, String column, int attempts, String correctFamily, String correctColumn) throws Exception {
    int familyHits = 0;
    int columnHits = 0;
    int run = 0;
    while (run < attempts) {
        guesser.runGuesser(seriesName, column, externalcfg);
        ArrayList<SingleResult> classification = guesser.getClassification();
        // all checks below work on the list's toString() rendering
        String rendered = classification.toString();
        showResults(classification);
        String summary = CategoryGuesser.resultString(rendered, correctFamily, correctColumn);
        logger.info("CLASSIFICATION RESULT " + rendered + " " + summary);
        boolean pairFound = CategoryGuesser.CheckCompleteResult(rendered, correctFamily, correctColumn);
        if (pairFound) {
            columnHits++;
        }
        boolean familyFound = CategoryGuesser.CheckFamilyResult(rendered, correctFamily);
        if (familyFound) {
            familyHits++;
        }
        run++;
    }
    double columnRatio = (double) columnHits / (double) attempts;
    double familyRatio = (double) familyHits / (double) attempts;
    double percColumn = columnRatio * 100;
    double percFamily = familyRatio * 100;
    logger.info("->ACCURACY ON FAMILY " + correctFamily + ":" + percFamily + " ACCURACY ON COLUMN " + correctColumn + ":" + percColumn);
}
/**
 * Builds a short human-readable summary telling whether the expected family and the
 * expected {@code family=column} pair occur in a textual classification result.
 * Matching is case-insensitive and uses {@link Locale#ROOT} so that the outcome does not
 * depend on the JVM default locale (e.g. Turkish dotted/dotless i).
 *
 * @param result textual rendering of the classification
 * @param family expected category name
 * @param column expected column name
 * @return a string of the form {@code "FAMILY REC: <bool> COLUMN REC: <bool>"}
 */
public static String resultString(String result, String family, String column) {
    final String haystack = result.toUpperCase(Locale.ROOT);
    final String familyKey = family.toUpperCase(Locale.ROOT);
    final String columnKey = column.toUpperCase(Locale.ROOT);
    // NOTE(review): the family flag here matches the bare family name, whereas
    // CheckFamilyResult matches "family=" - confirm which of the two is intended.
    return "FAMILY REC: " + haystack.contains(familyKey) + " COLUMN REC: " + haystack.contains(familyKey + "=" + columnKey);
}
/**
 * Tells whether the textual classification contains the pair {@code family=column},
 * ignoring case. Case folding uses {@link Locale#ROOT} so the answer is the same on
 * every JVM default locale.
 *
 * @param result textual rendering of the classification
 * @param family expected category name
 * @param column expected column name
 * @return true when {@code FAMILY=COLUMN} occurs in the upper-cased result
 */
public static boolean CheckCompleteResult(String result, String family, String column) {
    final String haystack = result.toUpperCase(Locale.ROOT);
    final String needle = family.toUpperCase(Locale.ROOT) + "=" + column.toUpperCase(Locale.ROOT);
    return haystack.contains(needle);
}
/**
 * Tells whether the textual classification contains {@code family=}, ignoring case.
 * Case folding uses {@link Locale#ROOT} so the answer is the same on every JVM default
 * locale.
 *
 * @param result textual rendering of the classification
 * @param family expected category name
 * @return true when {@code FAMILY=} occurs in the upper-cased result
 */
public static boolean CheckFamilyResult(String result, String family) {
    final String haystack = result.toUpperCase(Locale.ROOT);
    final String needle = family.toUpperCase(Locale.ROOT) + "=";
    return haystack.contains(needle);
}
// NOTE: The config path has to contain the two files: lexicalGuesser.properties and ALog.properties
// Location of the guesser properties, passed to LexicalEngineConfiguration.configureByStream/configure.
private static final String cfgFile = "lexicalguesser/lexicalGuesser.properties";
// Location of the logging properties; not referenced by any method visible in this class.
private static final String LogFile = "lexicalguesser/ALog.properties";
// Per-instance working state (the original comment here read "singleton").
// NOTE(review): nothing in this class enforces a single instance - confirm the intent.
// Category list used by the current run; rebuilt by init()/runGuesser(), cleared by refreshReferences().
private CategoryOrderedList col;
// Matching engine; (re)created or reset at the start of each run.
private Engine processor;
// Copy of col taken via generateNovelList() right after it is built; init() regenerates col from it.
private CategoryOrderedList originalCol;
// Effective configuration: file settings merged with the optional external configuration.
private LexicalEngineConfiguration config;
// When true, runGuesser() closes the DB session once it has finished.
private boolean oneshotMode;
// Maximum number of extra passes runGuesser() performs when a classification comes back empty.
private static final int maxTriesClassification = 3;
// Number of extra passes performed so far in the current runGuesser() invocation chain.
private int triesCounter;
/**
 * Creates a guesser with no retry pass recorded; configuration and engine are set up
 * later by init(...) or runGuesser(...).
 */
public CategoryGuesser() {
    this.triesCounter = 0;
}
/**
 * Classifies a table column with an optional external configuration and without
 * category or column filters.
 *
 * @param seriesName     table to classify
 * @param columnName     column to classify
 * @param externalConfig configuration merged over the file settings (may be null)
 * @throws Exception whatever the full runGuesser overload throws
 */
public void runGuesser(String seriesName, String columnName, LexicalEngineConfiguration externalConfig) throws Exception {
    final String noCategoryFilter = null;
    final String noColumnFilter = null;
    this.runGuesser(seriesName, columnName, externalConfig, noCategoryFilter, noColumnFilter);
}
/**
 * Classifies a table column using only the file-based configuration and no filters.
 *
 * @param seriesName table to classify
 * @param columnName column to classify
 * @throws Exception whatever the full runGuesser overload throws
 */
public void runGuesser(String seriesName, String columnName) throws Exception {
    final LexicalEngineConfiguration noExternalConfig = null;
    final String noCategoryFilter = null;
    final String noColumnFilter = null;
    this.runGuesser(seriesName, columnName, noExternalConfig, noCategoryFilter, noColumnFilter);
}
/**
 * Classifies a table column with optional configuration and filters; no single-entry
 * string is supplied.
 *
 * @param seriesName     table to classify
 * @param columnName     column to classify
 * @param externalConfig configuration merged over the file settings (may be null)
 * @param CategoryFilter category filter handed to the category list builder (may be null)
 * @param ColumnFilter   column filter handed to the engine (may be null)
 * @throws Exception whatever the full runGuesser overload throws
 */
public void runGuesser(String seriesName, String columnName, LexicalEngineConfiguration externalConfig, String CategoryFilter, String ColumnFilter) throws Exception {
    final String noSingleton = null;
    this.runGuesser(seriesName, columnName, externalConfig, CategoryFilter, ColumnFilter, noSingleton);
}
/**
 * Classifies a single string instead of a table column. Switches the guesser to
 * one-shot mode first, so the DB session is closed when the run ends.
 *
 * @param SingletonString the entry to match
 * @param externalConfig  configuration merged over the file settings (may be null)
 * @param CategoryFilter  category filter handed to the category list builder (may be null)
 * @param ColumnFilter    column filter handed to the engine (may be null)
 * @throws Exception whatever the full runGuesser overload throws
 */
public void runGuesser(String SingletonString, LexicalEngineConfiguration externalConfig, String CategoryFilter, String ColumnFilter) throws Exception {
    this.oneshotMode = true;
    final String noSeries = null;
    final String noColumn = null;
    this.runGuesser(noSeries, noColumn, externalConfig, CategoryFilter, ColumnFilter, SingletonString);
}
/**
 * Loads the configuration, creates the engine and prepares the category list.
 * The list is built only when none is cached ({@code col == null}); otherwise a fresh
 * copy is generated from the stored original. Always leaves one-shot mode switched off.
 *
 * @param categoryFilter category filter handed to the category list builder (may be null)
 * @param columnFilter   column filter handed to the engine (may be null)
 * @param externalConfig configuration merged over the file settings (may be null)
 * @throws Exception propagated from configuration loading, engine setup or category building
 */
public void init(String categoryFilter, String columnFilter, LexicalEngineConfiguration externalConfig) throws Exception {
    logger.trace("******************INITIALIZING******************");
    config = new LexicalEngineConfiguration();
    config.configureByStream(cfgFile);
    if (externalConfig != null) {
        config.mergeConfig(externalConfig);
    }
    processor = new Engine(config, columnFilter);
    SessionFactory dbSession = processor.getDBSession(config);
    DBObjectTranslator dbo = new DBObjectTranslator();
    if (col != null) {
        // a list is already cached: just regenerate a working copy
        col = originalCol.generateNovelList();
    } else {
        logger.trace("******************Order Category******************");
        // NOTE(review): reference metadata is read from the external configuration (or an
        // empty one when none was given), not from the merged config used by runGuesser - confirm.
        LexicalEngineConfiguration categorySource = (externalConfig == null) ? new LexicalEngineConfiguration() : externalConfig;
        boolean hasExternalCategories = categorySource.getCategories() != null && categorySource.getCategories().size() > 0;
        if (hasExternalCategories) {
            dbo.categories = categorySource.getCategories();
        } else {
            dbo.buildCategoriesStructure(dbSession, categorySource.getReferenceTable(), categorySource.getReferenceColumn(), categorySource.getIdColumn(), categorySource.getNameHuman(), categorySource.getDescription());
        }
        col = TSObjectTransformer.transform2List(dbo, config, categoryFilter);
        logger.trace("***************End Ordering********************");
        originalCol = col.generateNovelList();
    }
    oneshotMode = false;
}
/**
 * Prepares the guesser for a single-entry match: loads the configuration, creates the
 * engine and turns one-shot mode on so that the matcher is stopped at the end of the run.
 *
 * @param externalConfig configuration merged over the file settings (may be null)
 * @param ColumnFilter   column filter handed to the engine (may be null)
 * @throws Exception propagated from configuration loading or engine setup
 */
public void initSingleMatcher(LexicalEngineConfiguration externalConfig, String ColumnFilter) throws Exception {
    LexicalEngineConfiguration loaded = new LexicalEngineConfiguration();
    this.config = loaded;
    loaded.configureByStream(cfgFile);
    if (externalConfig != null) {
        loaded.mergeConfig(externalConfig);
    }
    this.processor = new Engine(loaded, ColumnFilter);
    // the lexical matcher is invoked once here, so it has to be stopped when the run ends
    this.oneshotMode = true;
}
/**
 * Initializes the guesser with filters and no external configuration.
 *
 * @param categoryFilter category filter (may be null)
 * @param columnFilter   column filter (may be null)
 * @throws Exception whatever the three-argument init throws
 */
public void init(String categoryFilter, String columnFilter) throws Exception {
    final LexicalEngineConfiguration noExternalConfig = null;
    this.init(categoryFilter, columnFilter, noExternalConfig);
}
/**
 * Initializes the guesser with an external configuration and no filters.
 *
 * @param externalConfig configuration merged over the file settings (may be null)
 * @throws Exception whatever the three-argument init throws
 */
public void init(LexicalEngineConfiguration externalConfig) throws Exception {
    final String noCategoryFilter = null;
    final String noColumnFilter = null;
    this.init(noCategoryFilter, noColumnFilter, externalConfig);
}
/**
 * Initializes the guesser with the file-based configuration only and no filters.
 *
 * @throws Exception whatever the three-argument init throws
 */
public void init() throws Exception {
    final String noFilter = null;
    final LexicalEngineConfiguration noExternalConfig = null;
    this.init(noFilter, noFilter, noExternalConfig);
}
/**
 * Drops the cached category list so that the next init(...) rebuilds it instead of
 * copying the stored original.
 */
public void refreshReferences() {
    this.col = null;
}
/** Classification stored by the most recent runGuesser call that completed; null before that. */
ArrayList<SingleResult> lastResults;

/**
 * @return the classification stored by the last completed runGuesser call, or null if
 *         no run has completed yet
 */
public ArrayList<SingleResult> getLastResults() {
    return this.lastResults;
}
/**
 * Runs a complete classification pass: prepares configuration and engine, rebuilds the
 * category list, launches the matching and, when the classification comes back empty,
 * lowers the thresholds and retries (recursively) up to {@code maxTriesClassification}
 * times. The final classification is stored for {@link #getLastResults()}.
 *
 * <p>Fixes over the previous version:
 * <ul>
 * <li>the category structure is now also built when no external configuration is given
 *     (before, that case left the translator without any category);</li>
 * <li>thresholds lowered for a retry pass survive the configuration rebuild that happens
 *     when an external configuration is supplied;</li>
 * <li>the retry counter is reset and the one-shot session is closed even when the pass
 *     fails with an exception.</li>
 * </ul>
 *
 * @param seriesName      table to classify (null in single-entry mode)
 * @param columnName      column to classify (null in single-entry mode)
 * @param externalConfig  configuration merged over the file settings (may be null)
 * @param CategoryFilter  category filter handed to the category list builder (may be null)
 * @param ColumnFilter    column filter handed to the engine (may be null)
 * @param SingletonString single entry to match, or null to match a table column
 * @throws Exception propagated from configuration loading, engine setup or matching
 */
public void runGuesser(String seriesName, String columnName, LexicalEngineConfiguration externalConfig, String CategoryFilter, String ColumnFilter, String SingletonString) throws Exception {
    SessionFactory dbSession = null;
    try {
        String cfgFileCompletePath = cfgFile;
        logger.debug("Guessing Table " + seriesName + " column " + columnName);
        if (externalConfig != null) {
            // A non-zero counter means this call is a retry pass started by the loop below.
            // The configuration is rebuilt from scratch here, so remember the thresholds
            // that the retry logic has just lowered and re-apply them after the merge.
            boolean retryPass = (triesCounter > 0) && (config != null);
            float loweredDifferencialThr = retryPass ? config.getCategoryDiscardDifferencialThreshold() : 0f;
            float loweredAcceptanceThr = retryPass ? config.getEntryAcceptanceThreshold() : 0f;
            config = new LexicalEngineConfiguration();
            config.configureByStream(cfgFileCompletePath);
            config.mergeConfig(externalConfig);
            if (retryPass) {
                config.setCategoryDiscardDifferencialThreshold(loweredDifferencialThr);
                config.setEntryAcceptanceThreshold(loweredAcceptanceThr);
            }
            // NOTE FOR FUTURE OPTIMIZATION: perform the re-init only if there is a change in the Database pointing
            processor = new Engine(config, ColumnFilter);
        } else {
            if (config == null) {
                config = new LexicalEngineConfiguration();
                // NOTE(review): every other load in this class uses configureByStream - confirm configure is intended here.
                config.configure(cfgFileCompletePath);
            }
            if (processor == null) {
                processor = new Engine(config, ColumnFilter);
            } else {
                processor.resetEngine(config, ColumnFilter);
            }
        }
        dbSession = processor.getDBSession(config);
        DBObjectTranslator dbo = new DBObjectTranslator();
        // modification of 10/10/11: the category structure is calculated on every run
        logger.trace("******************Order Category******************");
        ArrayList<Category> externalcategories = (externalConfig != null) ? externalConfig.getCategories() : null;
        if ((externalcategories != null) && (externalcategories.size() > 0)) {
            dbo.categories = externalConfig.getCategories();
            dbo.calculateCategoriesWeights(dbSession);
        } else {
            // also reached when there is no external configuration at all
            dbo.buildCategoriesStructure(dbSession, config.getReferenceTable(), config.getReferenceColumn(), config.getIdColumn(), config.getNameHuman(), config.getDescription());
        }
        col = TSObjectTransformer.transform2List(dbo, config, CategoryFilter);
        logger.trace("***************End Ordering********************");
        originalCol = col.generateNovelList();
        logger.warn("Starting Calculation...wait...");
        long t0 = System.currentTimeMillis();
        processor.calcLikeThread(col, seriesName, columnName, SingletonString);
        // keep processing until the classification contains at least one element
        ArrayList<SingleResult> checkingResults = getClassification();
        while ((checkingResults == null || checkingResults.size() == 0) && (triesCounter < maxTriesClassification)) {
            logger.warn("..another processing pass is required. Attempt number " + (triesCounter + 1));
            triesCounter++;
            float differencialThr = config.getCategoryDiscardDifferencialThreshold();
            float acceptanceThr = config.getEntryAcceptanceThreshold();
            // reduce both thresholds by 20 points (never below zero) and recalculate
            config.setCategoryDiscardDifferencialThreshold(Math.max(differencialThr - 20, 0));
            config.setEntryAcceptanceThreshold(Math.max(acceptanceThr - 20, 0));
            logger.trace("Performing next processing pass");
            runGuesser(seriesName, columnName, externalConfig, CategoryFilter, ColumnFilter, SingletonString);
            logger.debug("End processing pass");
            checkingResults = getClassification();
            // the nested call resets the counter when it ends, having already
            // performed any further retries itself
            if (triesCounter == 0)
                break;
        }
        long t1 = System.currentTimeMillis() - t0;
        logger.warn("...End Calculation in " + t1 + "ms");
        lastResults = checkingResults;
    } finally {
        triesCounter = 0;
        // close the session when it is no longer necessary
        if (oneshotMode && dbSession != null)
            dbSession.close();
    }
}
/**
 * Legacy result builder: wraps every best category/column/score triple of the engine
 * in a SingleResult, without any rescoring, normalization or filtering.
 *
 * @return one entry per best category, in engine order
 */
public ArrayList<SingleResult> getClassificationOLD() {
    ArrayList<SingleResult> plain = new ArrayList<SingleResult>();
    final int total = processor.bestCategories.size();
    int k = 0;
    while (k < total) {
        plain.add(new SingleResult(processor.bestCategories.get(k), processor.bestColumns.get(k), processor.bestScores.get(k), null, "0"));
        k++;
    }
    return plain;
}
/**
 * Returns the engine's single-entry matches after removing, in place, every match whose
 * score is more than the configured maximum deviation below the score of the first match.
 *
 * @return the pruned match list held by the engine, or a new empty list when the engine
 *         has no single-entry matches
 */
public ArrayList<SingleResult> getDetailedMatches() {
    if (processor.getSingletonMatches() == null) {
        return new ArrayList<SingleResult>();
    }
    // the deviation defines how far below the first score a match may fall
    final float maxDeviation = config.getSingleEntryRecognitionMaxDeviation();
    ArrayList<SingleResult> matches = processor.getSingletonMatches();
    double cutoff = 0;
    if (matches.size() > 0) {
        cutoff = matches.get(0).getScore() - maxDeviation;
    }
    // walk backwards so that removals do not shift the entries still to be visited
    for (int idx = matches.size() - 1; idx >= 0; idx--) {
        if (matches.get(idx).getScore() < cutoff) {
            matches.remove(idx);
        }
    }
    return processor.getSingletonMatches();
}
/**
 * @return the single entry reported by the engine, or an empty string when there is none
 */
public String getDetailedSingletonEntry() {
    String element = processor.getSingletonElement();
    return (element == null) ? "" : element;
}
/**
 * Builds the classification from the raw engine scores: each score is turned into a
 * percentage of the best score (the divisor is increased by one when there is more than
 * one candidate) and only entries above the category discard threshold are kept.
 *
 * @return the retained entries, in engine order
 */
public ArrayList<SingleResult> getClassificationPlain() {
    ArrayList<SingleResult> kept = new ArrayList<SingleResult>();
    final int total = processor.bestCategories.size();
    // highest raw score among the candidates
    double best = 0;
    for (int k = 0; k < total; k++) {
        double raw = processor.bestScores.get(k);
        if (raw > best) {
            best = raw;
        }
    }
    final double divisor = best + ((total > 1) ? 1 : 0);
    for (int k = 0; k < total; k++) {
        double raw = processor.bestScores.get(k);
        // normalize to a percentage
        double percentage = (raw / divisor) * 100;
        if (!(percentage > config.categoryDiscardDifferencialThreshold)) {
            continue;
        }
        Reference ref = col.getCategory(processor.bestCategories.get(k));
        kept.add(new SingleResult(processor.bestCategories.get(k), processor.bestColumns.get(k), percentage, ref.getTableName(), ref.getIndex()));
    }
    return kept;
}
/**
 * Builds the classification from the engine's best candidates.
 *
 * <p>Each raw score is multiplied by a weight derived from the share of reference
 * elements belonging to the candidate's category, capped at 1, then expressed as a
 * percentage of the best weighted score (the divisor is increased by one when there is
 * more than one candidate). Entries above the category discard threshold are inserted so
 * that the list stays sorted by descending score, repetitions are skipped, and the list
 * is finally cut to at most {@code MAXRESULTS} entries.
 *
 * <p>Fixes over the previous version: the insertion point is the first lower-scored
 * entry (the scan used to run to the last one, breaking the ordering); the truncation is
 * computed from the result list size rather than from the candidate count; a zero total
 * of reference elements no longer causes a division by zero.
 *
 * @return the retained entries, best score first; never null
 */
public ArrayList<SingleResult> getClassification() {
    ArrayList<SingleResult> results = new ArrayList<SingleResult>();
    int size = processor.bestCategories.size();
    double maxscore = 0;
    BigDecimal sumElements = BigDecimal.ZERO;
    ArrayList<Double> subscores = new ArrayList<Double>(size);
    // total number of elements over the categories of all candidates
    for (int i = 0; i < size; i++) {
        BigInteger catElements = col.getScoresTable().get(processor.bestCategories.get(i)).getCategoryElements();
        sumElements = sumElements.add(new BigDecimal(catElements));
    }
    // with no elements at all a share cannot be computed; treat every share as zero
    final boolean noElements = sumElements.signum() == 0;
    for (int i = 0; i < size; i++) {
        double score = processor.bestScores.get(i);
        // weight the score by the importance (element share) of its category
        BigInteger catElements = col.getScoresTable().get(processor.bestCategories.get(i)).getCategoryElements();
        double weight = noElements ? 0 : new BigDecimal(catElements).divide(sumElements, 2, RoundingMode.HALF_UP).doubleValue();
        // NOTE(review): weight is a share of the sum, so presumably it never reaches 3 and
        // this first branch is dead - confirm the intended scale of the thresholds.
        if (weight >= 3)
            weight = 2 * Math.log(weight * 100) / 10f;
        else if ((weight >= 0.5) && (weight <= 1)) {
            weight = Math.log(weight * 100) / 100.00f;
        } else if (weight < 0.05)
            weight = 0.05;
        logger.warn("WEIGHT FOR CATEGORY " + processor.bestCategories.get(i) + "-" + processor.bestColumns.get(i) + " : " + weight + " SCORE " + score);
        // weighted score, capped at 1
        score = Math.min(1, score * weight);
        if (maxscore < score) {
            maxscore = score;
        }
        subscores.add(score);
    }
    for (int i = 0; i < size; i++) {
        // normalize the weighted score to a percentage
        double score = (subscores.get(i) / (maxscore + ((size > 1) ? 1 : 0))) * 100;
        if (score > config.categoryDiscardDifferencialThreshold) {
            // insert before the first entry with a lower score to keep descending order
            int index = results.size();
            for (int j = 0; j < results.size(); j++) {
                if (results.get(j).getScore() < score) {
                    index = j;
                    break;
                }
            }
            Reference ref = col.getCategory(processor.bestCategories.get(i));
            SingleResult sr = new SingleResult(processor.bestCategories.get(i), processor.bestColumns.get(i), score, ref.getTableName(), ref.getIndex());
            // control for repetitions
            if (isnotRepetition(sr, results))
                results.add(index, sr);
        }
    }
    // limit the result list after rescoring, dropping the lowest-scored entries
    while (results.size() > MAXRESULTS) {
        results.remove(results.size() - 1);
    }
    return results;
}
/**
 * Returns the entries of {@code results} without duplicates, keeping the first
 * occurrence of each and preserving the original order. Two entries are duplicates when
 * their {@code toString()} renderings are equal.
 *
 * <p>Fix: the previous version appended the (always null) map lookup result instead of
 * the entry itself, producing a list of nulls.
 *
 * @param results entries to filter
 * @return a new list holding the first occurrence of every distinct entry
 */
private ArrayList<SingleResult> deleteDuplicates(ArrayList<SingleResult> results) {
    HashMap<String, SingleResult> seen = new HashMap<String, SingleResult>();
    ArrayList<SingleResult> unique = new ArrayList<SingleResult>();
    for (SingleResult res : results) {
        String key = res.toString();
        if (!seen.containsKey(key)) {
            seen.put(key, res);
            unique.add(res);
        }
    }
    return unique;
}
/**
 * Tells whether {@code result} is new with respect to the entries already collected.
 * An entry is a repetition when both its category and its column match an existing
 * entry, ignoring case; two absent (null) values count as matching.
 *
 * <p>Fix: the previous version set the flag to {@code true} on a match and therefore
 * always answered "not a repetition"; it also failed on entries without a column.
 *
 * @param result   candidate entry
 * @param previous entries collected so far
 * @return false when an entry with the same category and column is already present
 */
private boolean isnotRepetition(SingleResult result, ArrayList<SingleResult> previous) {
    for (SingleResult sr : previous) {
        if (sameIgnoreCase(sr.getCategory(), result.getCategory()) && sameIgnoreCase(sr.getColumn(), result.getColumn())) {
            return false;
        }
    }
    return true;
}

/** Null-safe, case-insensitive string equality. */
private static boolean sameIgnoreCase(String a, String b) {
    return (a == null) ? (b == null) : a.equalsIgnoreCase(b);
}
/**
 * Closes the engine's DB session if it is still open. This is a best-effort operation:
 * it does nothing when no engine has been created yet, and a failure while closing is
 * logged instead of being propagated (it used to be silently discarded).
 */
public void shutdown() {
    if (processor == null) {
        return;
    }
    try {
        if (processor.getDBSession() != null && !processor.getDBSession().isClosed())
            processor.getDBSession().close();
    } catch (Exception e) {
        logger.warn("Could not close the database session during shutdown", e);
    }
}
}