package org.gcube.dataanalysis.executor.nodes.transducers.bionym.abstracts; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Timer; import java.util.TimerTask; import org.gcube.contentmanagement.graphtools.utils.MathFunctions; import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; import org.gcube.contentmanagement.lexicalmatcher.utils.DistanceCalculator; import org.gcube.contentmanagement.lexicalmatcher.utils.FileTools; import org.gcube.dataanalysis.ecoengine.configuration.AlgorithmConfiguration; import org.gcube.dataanalysis.ecoengine.utils.Transformations; public class RealTimeMatcher { public String PARSER; public String INPUT_DATA_SOURCE_ID; public String INPUT_DATA_ID; public String INPUT_DATA; public String PREPARSED_INPUT_DATA; public String PARSED_SCIENTIFIC_NAME; public String PARSED_AUTHORITY; public String POST_PARSED_SCIENTIFIC_NAME; public String POST_PARSED_AUTHORITY; static HashMap> officialTafsMap; static int cachCleaningTime = 2 * 60 * 60 *1000; static String outputheaders = "SOURCE_DATASOURCE_ID;SOURCE_ID;SOURCE_DATA;PRE_PARSED_SOURCE_DATA;PARSED_SCIENTIFIC_NAME;PARSED_AUTHORITY;PARSER;POST_PARSED_SCIENTIFIC_NAME;POST_PARSED_AUTHORITY;MATCHING_SCORE;TARGET_DATA_SOURCE;TARGET_DATA_ID;TARGET_DATA_SCIENTIFIC_NAME;TARGET_DATA_AUTHORITY;TARGET_DATA_KINGDOM;TARGET_DATA_PHYLUM;TARGET_DATA_CLASS;TARGET_DATA_ORDER;TARGET_DATA_FAMILY;TARGET_DATA_GENUS;TARGET_DATA_SPECIES;TARGET_DATA_VERNACULAR_NAMES"; // MATCHING_SCORE TARGET_DATA_SOURCE TARGET_DATA_ID TARGET_DATA_SCIENTIFIC_NAME TARGET_DATA_AUTHORITY TARGET_DATA_KINGDOM TARGET_DATA_PHYLUM TARGET_DATA_CLASS TARGET_DATA_ORDER TARGET_DATA_FAMILY TARGET_DATA_GENUS TARGET_DATA_SPECIES TARGET_DATA_VERNACULAR_NAMES public RealTimeMatcher(){ //TODO: scheduler // databasecheckScheduler = new Timer(); databasecheckScheduler.schedule(new DatabaseController(), 0, refreshTime); } private class TafsCacheCleaner extends TimerTask { @Override public void run() { AnalysisLogger.getLogger().debug("RealTimeMatcher: Cache cleaned"); officialTafsMap = null; System.gc(); } } class TafInfo { double MATCHING_SCORE ; String TARGET_DATA_SOURCE ; String TARGET_DATA_ID; String TARGET_DATA_SCIENTIFIC_NAME; String TARGET_DATA_AUTHORITY; String TARGET_DATA_KINGDOM; String TARGET_DATA_PHYLUM; String TARGET_DATA_CLASS; String TARGET_DATA_ORDER; String TARGET_DATA_FAMILY; String TARGET_DATA_GENUS; String TARGET_DATA_SPECIES; String TARGET_DATA_VERNACULAR_NAMES; } private HashMap getCurrentTaf(String tafFile) throws Exception { try { String file = FileTools.loadString(tafFile, "UTF-8"); String[] tafrows = file.split("\n"); HashMap tafMap; if (officialTafsMap==null){ officialTafsMap = new HashMap>(); Timer cachecleaner = new Timer(); cachecleaner.schedule(new TafsCacheCleaner(), cachCleaningTime); } if (officialTafsMap.get(tafFile)!=null) return officialTafsMap.get(tafFile); tafMap = new HashMap(); for (String row : tafrows) { List elements = Transformations.parseCVSString(row, "\t"); TafInfo tafInfo = new TafInfo(); tafInfo.TARGET_DATA_ID = elements.get(0); tafInfo.TARGET_DATA_SCIENTIFIC_NAME = elements.get(37); tafInfo.TARGET_DATA_AUTHORITY = elements.get(41); tafInfo.TARGET_DATA_KINGDOM = elements.get(1); tafInfo.TARGET_DATA_PHYLUM = elements.get(5); tafInfo.TARGET_DATA_CLASS = elements.get(9); tafInfo.TARGET_DATA_ORDER = elements.get(13); tafInfo.TARGET_DATA_FAMILY = elements.get(17); tafInfo.TARGET_DATA_GENUS = elements.get(21); tafInfo.TARGET_DATA_SPECIES = elements.get(29); tafInfo.TARGET_DATA_VERNACULAR_NAMES = ""; tafMap.put((tafInfo.TARGET_DATA_SCIENTIFIC_NAME + " " + tafInfo.TARGET_DATA_AUTHORITY).toLowerCase(), tafInfo); } officialTafsMap.put(tafFile, tafMap); return tafMap; } catch (Exception e) { e.printStackTrace(); throw new Exception("Error: could not read Taf file"); } } private void getCurrentInput(String postParserFile) throws Exception { try { String file = FileTools.loadString(postParserFile, "UTF-8"); String row = file.split("\n")[1]; List elements = Transformations.parseCVSString(row, ";"); PARSER = elements.get(0); INPUT_DATA_SOURCE_ID = elements.get(1); INPUT_DATA_ID = elements.get(2); INPUT_DATA = elements.get(3); PREPARSED_INPUT_DATA = elements.get(4); PARSED_SCIENTIFIC_NAME = elements.get(5); PARSED_AUTHORITY = elements.get(6); POST_PARSED_SCIENTIFIC_NAME = elements.get(7); POST_PARSED_AUTHORITY = elements.get(8); } catch (Exception e) { throw new Exception("Error: could not read post-parsing file"); } } public void match(String taffile, String tafName, String postParserFile, String outputFile, double threshold, int maxvalues) throws Exception { AnalysisLogger.getLogger().debug("RealTimeMatcher started with the following parameters: "); AnalysisLogger.getLogger().debug("taffile: "+taffile); AnalysisLogger.getLogger().debug("tafName: "+tafName); AnalysisLogger.getLogger().debug("postParserFile: "+postParserFile); AnalysisLogger.getLogger().debug("outputFile: "+outputFile); AnalysisLogger.getLogger().debug("threshold: "+threshold); AnalysisLogger.getLogger().debug("maxvalues: "+maxvalues); long t0 = System.currentTimeMillis(); getCurrentInput(postParserFile); long t1 = System.currentTimeMillis(); AnalysisLogger.getLogger().debug("Current Input rebuilt in "+(t1-t0)); /* Properties p = PropertyLoader.loadProperties("cache/cache.ccf", ClassLoader.getSystemClassLoader()); CompositeCacheManager ccm = CompositeCacheManager.getUnconfiguredInstance(); ccm.configure(p); CacheAccess cache = JCS.getInstance("zone"); cache.put("hello", "world"); System.out.println("Retrieved: "+cache.get("hello")); */ HashMap scientificnames = getCurrentTaf(taffile); // TafInfo info = scientificnames.get((POST_PARSED_SCIENTIFIC_NAME + " " + POST_PARSED_AUTHORITY).toLowerCase()); long t2 = System.currentTimeMillis(); AnalysisLogger.getLogger().debug("TAF rebuilt in "+(t2-t1)); /* if (info != null) { AnalysisLogger.getLogger().debug(info.TARGET_DATA_SCIENTIFIC_NAME+" "+info.TARGET_DATA_AUTHORITY); return; } */ DistanceCalculator dc = new DistanceCalculator(); List bestTafs= new ArrayList(); int bestTafList = 0; for (TafInfo testInfo:scientificnames.values()){ double snameScore = dc.CD(false, POST_PARSED_SCIENTIFIC_NAME, testInfo.TARGET_DATA_SCIENTIFIC_NAME, true, false); double authScore = 0.5; if (POST_PARSED_AUTHORITY!=null && testInfo.TARGET_DATA_AUTHORITY!=null && POST_PARSED_AUTHORITY.length()>0 && testInfo.TARGET_DATA_AUTHORITY.length()>0) authScore=dc.CD(false, POST_PARSED_AUTHORITY, testInfo.TARGET_DATA_AUTHORITY, true, false); else if (POST_PARSED_AUTHORITY==null || POST_PARSED_AUTHORITY.length() == 0 ) authScore=1; if (authScore*snameScore>threshold){ testInfo.MATCHING_SCORE=(authScore*snameScore); insertTaf(bestTafs,testInfo,bestTafList); bestTafList++; } } long t3 = System.currentTimeMillis(); AnalysisLogger.getLogger().debug("Scientific Names rebuilt in "+(t3-t2)); AnalysisLogger.getLogger().debug("Results"); int belements = 0; BufferedWriter bw = new BufferedWriter(new FileWriter(new File(outputFile))); bw.write(outputheaders+"\n"); // static String outputheaders = "SOURCE_DATASOURCE_ID;SOURCE_ID;SOURCE_DATA;PRE_PARSED_SOURCE_DATA;PARSED_SCIENTIFIC_NAME;PARSED_AUTHORITY;PARSER;POST_PARSED_SCIENTIFIC_NAME;POST_PARSED_AUTHORITY;MATCHING_SCORE;TARGET_DATA_SOURCE;TARGET_DATA_ID;TARGET_DATA_SCIENTIFIC_NAME;TARGET_DATA_AUTHORITY;TARGET_DATA_KINGDOM;TARGET_DATA_PHYLUM;TARGET_DATA_CLASS;TARGET_DATA_ORDER;TARGET_DATA_FAMILY;TARGET_DATA_GENUS;TARGET_DATA_SPECIES;TARGET_DATA_VERNACULAR_NAMES"; for (TafInfo b:bestTafs){ AnalysisLogger.getLogger().debug(b.TARGET_DATA_SCIENTIFIC_NAME+" "+b.TARGET_DATA_AUTHORITY+" "+b.MATCHING_SCORE); bw.write(q(INPUT_DATA_SOURCE_ID)+";"+q(INPUT_DATA_ID)+";"+ q(INPUT_DATA)+";"+q(PREPARSED_INPUT_DATA)+";"+q(PARSED_SCIENTIFIC_NAME)+";"+ q(PARSED_AUTHORITY)+";"+q(PARSER)+";"+q(POST_PARSED_SCIENTIFIC_NAME)+";"+ q(POST_PARSED_AUTHORITY)+";"+MathFunctions.roundDecimal(b.MATCHING_SCORE,2)+";"+ q(tafName)+";"+ q(b.TARGET_DATA_ID)+";"+q(b.TARGET_DATA_SCIENTIFIC_NAME)+";"+q(b.TARGET_DATA_AUTHORITY)+";"+ q(b.TARGET_DATA_KINGDOM)+";"+q(b.TARGET_DATA_PHYLUM)+";"+q(b.TARGET_DATA_CLASS)+";"+ q(b.TARGET_DATA_ORDER)+";"+q(b.TARGET_DATA_FAMILY)+";"+q(b.TARGET_DATA_GENUS)+";"+ q(b.TARGET_DATA_SPECIES)+";"+q(b.TARGET_DATA_VERNACULAR_NAMES)+ "\n"); if (belements>maxvalues) break; belements++; } bw.close(); } private String q(String in){ return "\""+in+"\""; } private void insertTaf(List elements, TafInfo toInsert, int elementsLen){ int counter = 0; for (TafInfo e:elements){ if (e.MATCHING_SCORE