ecological-engine-smart-exe.../src/main/java/org/gcube/dataanalysis/executor/nodes/transducers/bionym/BionymWorkflow.java

430 lines
18 KiB
Java
Executable File

package org.gcube.dataanalysis.executor.nodes.transducers.bionym;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.List;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
import org.gcube.dataanalysis.ecoengine.configuration.ALG_PROPS;
import org.gcube.dataanalysis.ecoengine.configuration.AlgorithmConfiguration;
import org.gcube.dataanalysis.ecoengine.datatypes.ColumnType;
import org.gcube.dataanalysis.ecoengine.datatypes.DatabaseType;
import org.gcube.dataanalysis.ecoengine.datatypes.InputTable;
import org.gcube.dataanalysis.ecoengine.datatypes.OutputTable;
import org.gcube.dataanalysis.ecoengine.datatypes.PrimitiveType;
import org.gcube.dataanalysis.ecoengine.datatypes.ServiceType;
import org.gcube.dataanalysis.ecoengine.datatypes.StatisticalType;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.PrimitiveTypes;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.ServiceParameters;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.TableTemplates;
import org.gcube.dataanalysis.ecoengine.interfaces.ActorNode;
import org.gcube.dataanalysis.ecoengine.utils.DatabaseFactory;
import org.gcube.dataanalysis.ecoengine.utils.DatabaseUtils;
import org.gcube.dataanalysis.ecoengine.utils.Transformations;
import org.gcube.dataanalysis.ecoengine.utils.Tuple;
import org.gcube.dataanalysis.executor.generators.D4ScienceDistributedProcessing;
import org.gcube.dataanalysis.executor.scripts.OSCommand;
import org.hibernate.SessionFactory;
import com.thoughtworks.xstream.XStream;
/**
 * Distributed-processing node for the BiOnym taxon name matching workflow.
 *
 * Raw taxa names are read from a column of an input table, optionally
 * preprocessed, matched through {@code CometMatcherManager} and written as
 * (inputname, suggestion, score) rows into a destination table.
 */
public class BionymWorkflow extends ActorNode {
// Not referenced by the code visible in this file.
protected AlgorithmConfiguration currentconfig;
// Database session factory; opened in setup() and executeNode(), closed in stop() and executeNode().
protected SessionFactory dbconnection;
// DDL template for the destination table; %1$s is replaced by the table name.
private static String createOutputTable = "CREATE TABLE %1$s (inputname character varying(255), suggestion character varying(255), score real)";
// Number of distinct raw names found by setup(); reported by getNumberOfRightElements().
int rawnamescount = 0;
// Not referenced by the code visible in this file.
public int prevbroadcastTimePeriod;
// Not referenced by the code visible in this file.
public int prevmaxNumberOfStages;
// Value of D4ScienceDistributedProcessing.maxMessagesAllowedPerJob saved by setup() and restored by postProcess().
public int prevmaxMessages;
// Name of the table receiving the matches.
String destinationTable;
// Configuration key holding the destination table name.
public static String destinationTableParam = "OutputTable";
// Configuration key holding the human-readable label of the destination table.
public static String destinationTableLable = "OutputTableLabel";
// Name of the table containing the raw taxa names.
String originTable;
// Configuration key holding the origin table name.
public static String originTableParam = "RawTaxaNamesTable";
// Column of the origin table that contains the raw names.
String rawnamesColumn;
// Configuration key holding the raw names column.
public static String rawnamesColumnParam = "RawNamesColumn";
// Parser name, passed as-is to EVBPreprocessing.preprocess and CometMatcherManager.match.
String parser;
// Configuration key holding the parser name.
public static String parserParam = "Parser";
// Reference dataset name, passed as-is to CometMatcherManager.match.
String reference;
// Configuration key holding the reference dataset name.
public static String referenceParam = "ReferenceDataset";
// Name of a CometMatcherManager.Weights constant; converted to a numeric weight in executeNode().
String soundexweight;
// Configuration key holding the soundex/edit-distance preference.
public static String soundexweightParam = "SoundexVSEditDist";
// Name of an EVBPreprocessing.Preprocessors constant.
String preprocessor;
// Configuration key holding the preprocessor name.
public static String doPreprocessParam = "Preprocess";
// Progress indicator returned by getInternalStatus(): 0 when executeNode() starts, 1 when it ends.
float status;
// Configuration key holding the maximum number of matches per raw name.
public static String maxMatchesParam= "MaxMatches";
// Maximum number of matches requested from the matcher; 10 unless configured otherwise.
int maxMatches=10;
// Configuration key holding the minimum score a match must reach to be reported.
public static String pruningThresholdParam= "PruningThreshold";
// Matches scoring below this value are not written to the output; 0.4 unless configured otherwise.
float pruningThreshold=0.4f;
// Column list used when building the INSERT statement for the destination table.
static String headers = "inputname,suggestion,score";
// Set by postProcess(); when still false, stop() drops the destination table.
boolean haspostprocessed = false;
/**
 * Declares the processing properties of this algorithm.
 *
 * @return a one-element array containing {@code ALG_PROPS.PHENOMENON_VS_PARALLEL_PHENOMENON}
 */
@Override
public ALG_PROPS[] getProperties() {
    return new ALG_PROPS[] { ALG_PROPS.PHENOMENON_VS_PARALLEL_PHENOMENON };
}
/**
 * @return the identifier of this algorithm, {@code "BIONYM"}
 */
@Override
public String getName() {
    final String algorithmName = "BIONYM";
    return algorithmName;
}
/**
 * @return the human-readable description of the algorithm
 */
@Override
public String getDescription() {
    final String description = "An algorithm implementing BiOnym, a flexible workflow approach to taxon name matching. "
            + "The workflow allows to activate several taxa names matching algorithms and to get the list of possible transcriptions "
            + "for a list of input raw species names with possible authorship indication.";
    return description;
}
/**
 * Describes the parameters accepted by the algorithm: the input table and
 * column, the output table name and label, the reference dataset, the
 * preprocessor, the soundex/edit-distance preference, the parser, the pruning
 * threshold and the maximum number of matches. The default database
 * parameters are appended at the end.
 *
 * @return the ordered list of input parameter descriptors
 */
@Override
public List<StatisticalType> getInputParameters() {
    List<TableTemplates> acceptedTemplates = new ArrayList<TableTemplates>();
    acceptedTemplates.add(TableTemplates.GENERIC);

    // Input table and the column holding the raw names.
    InputTable rawNamesTable = new InputTable(acceptedTemplates, originTableParam, "Input table containing raw taxa names that you want to match", "byonym");
    ColumnType rawNamesColumnType = new ColumnType(originTableParam, rawnamesColumnParam, "The column containing the raw taxa names with or without authoship information", "rawnames", false);

    // Output table name and its label.
    ServiceType outputTableName = new ServiceType(ServiceParameters.RANDOMSTRING, destinationTableParam, "name of the table that will contain the matches", "bion_");
    PrimitiveType outputTableLabel = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, destinationTableLable, "Name of the table which will contain the matches", "bionout");

    // Enumerated choices.
    PrimitiveType referenceChoice = new PrimitiveType(Enum.class.getName(), CometMatcherManager.Reference.values(), PrimitiveTypes.ENUMERATED, referenceParam, "The reference dataset to use", String.valueOf(CometMatcherManager.Reference.FISHBASE));
    PrimitiveType preprocessorChoice = new PrimitiveType(Enum.class.getName(), EVBPreprocessing.Preprocessors.values(), PrimitiveTypes.ENUMERATED, doPreprocessParam, "Set a preprocessing approach for the raw strings", String.valueOf(EVBPreprocessing.Preprocessors.EXPERT_RULES));
    PrimitiveType weightChoice = new PrimitiveType(Enum.class.getName(), CometMatcherManager.Weights.values(), PrimitiveTypes.ENUMERATED, soundexweightParam, "Set the use of soundex vs edit distance approaches to string matching", String.valueOf(CometMatcherManager.Weights.EDIT_DISTANCE));
    PrimitiveType parserChoice = new PrimitiveType(Enum.class.getName(), CometMatcherManager.Parsers.values(), PrimitiveTypes.ENUMERATED, parserParam, "Set the genus-species-author parser to use", String.valueOf(CometMatcherManager.Parsers.SIMPLE));

    // Numeric tuning parameters.
    PrimitiveType pruningChoice = new PrimitiveType(Float.class.getName(), null, PrimitiveTypes.NUMBER, pruningThresholdParam,"Pruning threshold for the output scores (from 0 to 1)","0.4");
    PrimitiveType maxMatchesChoice = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, maxMatchesParam,"Maximum number of matches to report per raw string","10");

    // Registration order: the output table name comes before the column descriptor.
    StatisticalType[] ordered = {
            rawNamesTable, outputTableName, rawNamesColumnType, outputTableLabel,
            referenceChoice, preprocessorChoice, weightChoice, parserChoice,
            pruningChoice, maxMatchesChoice };
    List<StatisticalType> parameters = new ArrayList<StatisticalType>();
    for (StatisticalType descriptor : ordered) {
        parameters.add(descriptor);
    }
    DatabaseType.addDefaultDBPars(parameters);
    return parameters;
}
/**
 * Describes the table produced by the algorithm.
 *
 * @return an {@code OutputTable} descriptor built from the label key and the
 *         current destination table name
 */
@Override
public StatisticalType getOutput() {
    List<TableTemplates> outputTemplates = new ArrayList<TableTemplates>();
    outputTemplates.add(TableTemplates.GENERIC);
    return new OutputTable(outputTemplates, destinationTableLable, destinationTable, "Output table");
}
/**
 * Per-node initialisation hook; this implementation has an empty body.
 *
 * @param config the algorithm configuration (not used here)
 */
@Override
public void initSingleNode(AlgorithmConfiguration config) {
}
/**
 * @return the current value of the {@code status} field
 */
@Override
public float getInternalStatus() {
    return this.status;
}
// Tuples whose score parsed to exactly 1; filterMatchedTuples() only ever appends to this list.
List<Tuple<String>> matchedTuples = new ArrayList<Tuple<String>>();
// Tuples that did not score exactly 1 in the most recent filterMatchedTuples() call.
List<Tuple<String>> unmatchedTuples = new ArrayList<Tuple<String>>();
/**
 * Splits the given tuples into exact matches and everything else.
 *
 * A tuple counts as an exact match when it has more than two elements and its
 * third element parses to the number 1. Exact matches are appended to
 * {@code matchedTuples}; all other tuples go into a freshly created
 * {@code unmatchedTuples} list. A score that cannot be parsed is treated as 0
 * after its stack trace is printed.
 *
 * @param tuples the tuples to classify
 */
public void filterMatchedTuples(List<Tuple<String>> tuples) {
    // The unmatched list is rebuilt on every call; the matched list keeps growing.
    unmatchedTuples = new ArrayList<Tuple<String>>();
    for (Tuple<String> candidate : tuples) {
        double parsedScore = 0;
        if (candidate.getElements().size() > 2) {
            try {
                parsedScore = Double.parseDouble(candidate.getElements().get(2));
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        List<Tuple<String>> target = (parsedScore == 1) ? matchedTuples : unmatchedTuples;
        target.add(candidate);
    }
}
/**
 * Runs the matching workflow for a single raw species name and renders the
 * surviving matches as SQL value tuples.
 *
 * The name is preprocessed according to {@code preprocessor}, exact matches
 * (score 1) are set aside, the remaining candidates are sent to the Comet
 * matcher, and every resulting tuple whose score reaches
 * {@code pruningThreshold} is emitted as
 * {@code ('<raw name>','<suggestion>[ (<author>)]','<score>')}.
 *
 * The parser and reference dataset are taken from the {@code parser} and
 * {@code reference} fields of this instance.
 *
 * @param rawspeciesname   the raw name to match
 * @param sandboxfolder    working folder passed to the preprocessor and matcher
 * @param preprocessor     name of an {@code EVBPreprocessing.Preprocessors} constant
 * @param enablematchers   only its length is consulted: the Comet matcher runs for
 *                         index 0 whatever the flag value is
 *                         (NOTE(review): confirm whether the flag was meant to gate it)
 * @param soundexweightF   weight handed to the Comet matcher
 * @param maxResults       maximum number of results requested from the Comet matcher
 * @param pruningThreshold minimum score a tuple needs to be emitted
 * @return comma-separated value tuples, or an empty buffer when nothing passes the threshold
 * @throws Exception if preprocessing or matching fails, or a score is not a valid number
 */
public StringBuffer executeBionymWorkflow(String rawspeciesname, String sandboxfolder, String preprocessor, boolean[] enablematchers, float soundexweightF, int maxResults, float pruningThreshold) throws Exception {
    // The match lists are instance state: start clean so that matches found for a
    // previously processed name are not reported again under this one.
    matchedTuples = new ArrayList<Tuple<String>>();
    unmatchedTuples = new ArrayList<Tuple<String>>();

    List<String> rawnames = new ArrayList<String>();
    rawnames.add(rawspeciesname);

    // preprocessing
    List<Tuple<String>> preprocessednames;
    switch (EVBPreprocessing.Preprocessors.valueOf(preprocessor)) {
    case EXPERT_RULES:
        EVBPreprocessing preprocess = new EVBPreprocessing();
        preprocessednames = preprocess.preprocess(parser, sandboxfolder, rawnames);
        break;
    default:
        preprocessednames = EVBPreprocessing.populateTuples(rawnames);
        break;
    }
    filterMatchedTuples(preprocessednames);

    for (int i = 0; i < enablematchers.length; i++) {
        // standard WF
        if (i == 0) {
            CometMatcherManager comet = new CometMatcherManager();
            List<Tuple<String>> cometoutput = comet.match(parser, reference, sandboxfolder, unmatchedTuples, soundexweightF, maxResults);
            filterMatchedTuples(cometoutput);
        }
    }

    // add all non exact matches
    matchedTuples.addAll(unmatchedTuples);

    String inputLiteral = escapeSqlLiteral(rawspeciesname);
    StringBuffer sb = new StringBuffer();
    for (Tuple<String> t : matchedTuples) {
        String scoreS = t.getElements().get(2);
        float score = (scoreS != null) ? Float.parseFloat(scoreS) : 0f;
        if (score >= pruningThreshold) {
            String spname = t.getElements().get(0);
            String authorname = t.getElements().get(1);
            if (authorname != null && authorname.length() > 0)
                spname += " (" + authorname + ")";
            // Separator goes before every tuple but the first, so pruned tuples
            // can never leave a dangling comma behind.
            if (sb.length() > 0)
                sb.append(",");
            // A missing score was treated as 0 above; emit it as 0 rather than the text "null".
            String scoreLiteral = (scoreS != null) ? scoreS : "0";
            sb.append("('" + inputLiteral + "','" + escapeSqlLiteral(spname) + "','" + escapeSqlLiteral(scoreLiteral) + "')");
        }
    }
    return sb;
}

/**
 * Makes a value safe to embed between single quotes in an SQL statement by
 * doubling every single quote. A null value is rendered as the text "null".
 */
private static String escapeSqlLiteral(String value) {
    return String.valueOf(value).replace("'", "''");
}
/**
 * Processes one slice of the raw names on a worker node.
 *
 * The configuration is restored from {@code nodeConfigurationFileObject} inside
 * {@code sandboxFolder}, the distinct raw names are read from the origin table,
 * and the names in positions {@code [rightStartIndex, rightStartIndex +
 * numberOfRightElementsToProcess)} are matched one by one; the matches of each
 * name are inserted into the destination table. The requested range is clamped
 * to the names actually retrieved.
 *
 * @param leftStartIndex                 not used by this implementation
 * @param numberOfLeftElementsToProcess  not used by this implementation
 * @param rightStartIndex                index of the first raw name to process
 * @param numberOfRightElementsToProcess number of raw names to process
 * @param duplicate                      not used by this implementation
 * @param sandboxFolder                  working folder of the node
 * @param nodeConfigurationFileObject    name of the serialized configuration file
 * @param logfileNameToProduce           not used by this implementation
 * @return 0 on success, -1 if any exception occurred
 */
@Override
public int executeNode(int leftStartIndex, int numberOfLeftElementsToProcess, int rightStartIndex, int numberOfRightElementsToProcess,boolean duplicate, String sandboxFolder, String nodeConfigurationFileObject, String logfileNameToProduce) {
    try {
        status = 0;
        long t0 = System.currentTimeMillis();
        // rebuild variables
        System.out.println("Restoring configuration");
        AlgorithmConfiguration config = Transformations.restoreConfig(new File(sandboxFolder, nodeConfigurationFileObject).getAbsolutePath());
        config.setConfigPath(sandboxFolder);
        dbconnection = DatabaseUtils.initDBSession(config);
        destinationTable = config.getParam(destinationTableParam);
        originTable = config.getParam(originTableParam);
        rawnamesColumn = config.getParam(rawnamesColumnParam);
        parser = config.getParam(parserParam);
        reference = config.getParam(referenceParam);
        soundexweight = config.getParam(soundexweightParam);
        preprocessor = config.getParam(doPreprocessParam);
        String maxMatchesS = config.getParam(maxMatchesParam);
        maxMatches = (maxMatchesS == null) ? 10 : Integer.parseInt(maxMatchesS);
        String pruningThrS = config.getParam(pruningThresholdParam);
        pruningThreshold = (pruningThrS == null) ? 0.4f : Float.parseFloat(pruningThrS);
        System.out.println("Destination Table: " + destinationTable);
        System.out.println("Origin Table: " + originTable);
        System.out.println("Column of names: " + rawnamesColumn);
        System.out.println("Parser to use: " + parser);
        System.out.println("Reference Dataset: " + reference);
        System.out.println("Soundex Preference: " + soundexweight);
        System.out.println("Preprocessor:" + preprocessor);
        System.out.println("Pruning threshold:" + pruningThreshold);
        System.out.println("Number of Matches:" + maxMatches);
        float soundexweightF = soundexWeightFor(soundexweight);

        // retrieve the list of names to process
        System.out.println("Retrieving names to process");
        List<Object> rawnames = DatabaseFactory.executeSQLQuery(DatabaseUtils.getDinstictElements(originTable, rawnamesColumn, ""), dbconnection);
        System.out.println("Retrieved a total of "+rawnames.size()+" species");
        int end = rightStartIndex + numberOfRightElementsToProcess;
        System.out.println("Processing from "+rightStartIndex+" to "+end);
        // Clamp the slice to what was actually retrieved, so a range that runs past
        // the list processes the available names instead of aborting the whole node.
        int firstIndex = Math.max(0, rightStartIndex);
        int lastIndexExclusive = Math.min(end, rawnames.size());
        List<String> rawnamesFiltered = new ArrayList<String>();
        for (int i = firstIndex; i < lastIndexExclusive; i++) {
            rawnamesFiltered.add("" + rawnames.get(i));
        }

        // prepare the environment; failure here is reported but not fatal
        try {
            OSCommand.ExecuteGetLine("chmod +x *", null);
        } catch (Exception e) {
            System.out.println("WARNING: could not change the permissions");
        }

        int rawscounter = 0;
        System.out.println("Processing raw names");
        for (String rawname : rawnamesFiltered) {
            // Quotes are stripped because the name is embedded in SQL text further down.
            rawname = rawname.replace("'", "").replace("\"", "");
            System.out.println("Processing species: "+rawname);
            StringBuffer sb = executeBionymWorkflow(rawname, sandboxFolder, preprocessor, new boolean[] { true }, soundexweightF, maxMatches, pruningThreshold);
            System.out.println("Processed species: "+rawname);
            if (sb.length() > 0) {
                rawscounter++;
                System.out.println("Inserting results onto the table "+destinationTable);
                String insertQuery = DatabaseUtils.insertFromBuffer(destinationTable, headers, sb);
                System.out.println("Insert Query: " + insertQuery);
                System.out.println("Inserting values for " + rawname);
                DatabaseFactory.executeSQLUpdate(insertQuery, dbconnection);
                System.out.println("Successfully Inserted values for " + rawname);
            }
        }
        System.out.println("The procedure finished successfully. Processed " + rawscounter + " species.");
        System.out.println("Elapsed Time " + (System.currentTimeMillis() - t0) + " ms");
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("warning: error in node execution " + e.getLocalizedMessage());
        System.err.println("Error in node execution " + e.getLocalizedMessage());
        return -1;
    } finally {
        if (dbconnection != null) {
            try {
                dbconnection.close();
            } catch (Exception e) {
                // Closing is best effort, but the failure should at least be visible.
                System.out.println("WARNING: could not close the database connection: " + e.getLocalizedMessage());
            }
        }
        status = 1f;
    }
    return 0;
}

/**
 * Translates the name of a {@code CometMatcherManager.Weights} constant into the
 * numeric soundex weight: 1 for SOUNDEX, 0 for EDIT_DISTANCE, 0.5 otherwise.
 */
private static float soundexWeightFor(String weightName) {
    switch (CometMatcherManager.Weights.valueOf(weightName)) {
    case SOUNDEX:
        return 1f;
    case EDIT_DISTANCE:
        return 0f;
    case MIXED:
    default:
        return 0.5f;
    }
}
/**
 * Prepares a computation: opens the database session, counts the distinct raw
 * names to process, (re)creates the destination table and limits
 * {@code D4ScienceDistributedProcessing.maxMessagesAllowedPerJob} to 1 after
 * saving its previous value.
 *
 * @param config the algorithm configuration holding table names and database parameters
 * @throws Exception if the database cannot be reached or the table cannot be created
 */
@Override
public void setup(AlgorithmConfiguration config) throws Exception {
    haspostprocessed = false;
    AnalysisLogger.getLogger().info("Initializing DB Connection");
    dbconnection = DatabaseUtils.initDBSession(config);

    destinationTable = config.getParam(destinationTableParam);
    originTable = config.getParam(originTableParam);
    rawnamesColumn = config.getParam(rawnamesColumnParam);

    // Only the number of distinct names is needed here.
    rawnamescount = DatabaseFactory.executeSQLQuery(DatabaseUtils.getDinstictElements(originTable, rawnamesColumn, ""), dbconnection).size();

    AnalysisLogger.getLogger().info("Creating Destination Table " + destinationTable);
    // Drop any previous table of the same name; a failure is logged as the table not existing.
    try {
        DatabaseFactory.executeSQLUpdate(DatabaseUtils.dropTableStatement(destinationTable), dbconnection);
    } catch (Exception e) {
        AnalysisLogger.getLogger().info("Table " + destinationTable + " did not exist");
    }
    String createStatement = String.format(createOutputTable, destinationTable);
    DatabaseFactory.executeSQLUpdate(createStatement, dbconnection);

    // Remember the global setting so that postProcess() can restore it.
    prevmaxMessages = D4ScienceDistributedProcessing.maxMessagesAllowedPerJob;
    D4ScienceDistributedProcessing.maxMessagesAllowedPerJob = 1;
    AnalysisLogger.getLogger().info("Destination Table Created! Addressing " + rawnamescount + " names");
}
/**
 * @return the number of distinct raw names counted by {@code setup}
 */
@Override
public int getNumberOfRightElements() {
    return this.rawnamescount;
}
/**
 * @return always 1
 */
@Override
public int getNumberOfLeftElements() {
    final int leftElements = 1;
    return leftElements;
}
/**
 * Shuts the computation down. When {@code postProcess} has not run, the
 * destination table is dropped; in every case the database session is closed
 * if one is open.
 */
@Override
public void stop() {
    if (haspostprocessed) {
        AnalysisLogger.getLogger().info("The procedure has correctly postprocessed: shutting down the connection!");
    } else {
        // Not post-processed: abort the computation by removing the database table.
        try {
            AnalysisLogger.getLogger().info("The procedure did NOT correctly postprocessed ....Removing Table " + destinationTable + " because of computation stop!");
            DatabaseFactory.executeSQLUpdate(DatabaseUtils.dropTableStatement(destinationTable), dbconnection);
        } catch (Exception e) {
            AnalysisLogger.getLogger().info("Table " + destinationTable + " did not exist");
        }
    }

    if (dbconnection == null) {
        return;
    }
    try {
        dbconnection.close();
    } catch (Exception e) {
        // Closing is best effort; a failure here is ignored.
    }
}
/**
 * Marks the computation as post-processed and restores the value of
 * {@code D4ScienceDistributedProcessing.maxMessagesAllowedPerJob} saved by {@code setup}.
 *
 * @param manageDuplicates not used by this implementation
 * @param manageFault      not used by this implementation
 */
@Override
public void postProcess(boolean manageDuplicates, boolean manageFault) {
    this.haspostprocessed = true;
    D4ScienceDistributedProcessing.maxMessagesAllowedPerJob = this.prevmaxMessages;
}
/**
 * Manual test driver: builds a configuration, serializes it with XStream into
 * the sandbox folder, then runs {@code setup} and {@code executeNode} for the
 * first raw name of the input table.
 *
 * NOTE(review): the database credentials and host are hard-coded below; they
 * should come from an external, non-versioned source.
 *
 * @param args ignored
 * @throws Exception if the configuration cannot be written or the setup fails
 */
public static void mainTEST(String[] args) throws Exception {
    AlgorithmConfiguration config = new AlgorithmConfiguration();
    config.setConfigPath("./cfg/");
    String sandbox = "./PARALLEL_PROCESSING";
    String configfile = "testconfig.cfg";
    config.setPersistencePath(sandbox);
    /*
    config.setParam("DatabaseUserName", "gcube");
    config.setParam("DatabasePassword", "d4science2");
    config.setParam("DatabaseURL", "jdbc:postgresql://146.48.87.169/testdb");
    config.setParam("DatabaseDriver", "org.postgresql.Driver");
    */
    config.setParam("DatabaseUserName","utente");
    config.setParam("DatabasePassword","d4science");
    config.setParam("DatabaseURL","jdbc:postgresql://statistical-manager.d.d4science.research-infrastructures.eu/testdb");
    config.setParam(BionymWorkflow.destinationTableParam, "taxamatchoutputlocal");
    config.setParam(BionymWorkflow.destinationTableLable, "taxamatchoutputlabel");
    config.setParam(BionymWorkflow.originTableParam, "taxamatchinput");
    config.setParam(BionymWorkflow.rawnamesColumnParam, "rawstrings");
    config.setParam(BionymWorkflow.parserParam, CometMatcherManager.Parsers.SIMPLE.name());
    config.setParam(BionymWorkflow.referenceParam, CometMatcherManager.Reference.ASFIS.name());
    config.setParam(BionymWorkflow.soundexweightParam, CometMatcherManager.Weights.EDIT_DISTANCE.name());
    config.setParam(BionymWorkflow.doPreprocessParam , EVBPreprocessing.Preprocessors.EXPERT_RULES.name());
    config.setParam(BionymWorkflow.maxMatchesParam , "10");
    AnalysisLogger.setLogger(config.getConfigPath() + AlgorithmConfiguration.defaultLoggerFile);

    // Close the writer even if serialization or writing fails.
    BufferedWriter oos = new BufferedWriter(new FileWriter(new File(sandbox, configfile)));
    try {
        oos.write(new XStream().toXML(config));
    } finally {
        oos.close();
    }

    new BionymWorkflow().setup(config);
    // new BionymWorkflow().executeNode(0, 1, 0, 915, false, sandbox, configfile, "test.log");
    new BionymWorkflow().executeNode(0, 1, 0, 1, false, sandbox, configfile, "test.log");
}
}