ecological-engine-smart-exe.../src/main/java/org/gcube/dataanalysis/executor/nodes/transducers/bionym/BionymWorkflow.java

package org.gcube.dataanalysis.executor.nodes.transducers.bionym;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.List;

import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
import org.gcube.dataanalysis.ecoengine.configuration.ALG_PROPS;
import org.gcube.dataanalysis.ecoengine.configuration.AlgorithmConfiguration;
import org.gcube.dataanalysis.ecoengine.datatypes.ColumnType;
import org.gcube.dataanalysis.ecoengine.datatypes.DatabaseType;
import org.gcube.dataanalysis.ecoengine.datatypes.InputTable;
import org.gcube.dataanalysis.ecoengine.datatypes.OutputTable;
import org.gcube.dataanalysis.ecoengine.datatypes.PrimitiveType;
import org.gcube.dataanalysis.ecoengine.datatypes.ServiceType;
import org.gcube.dataanalysis.ecoengine.datatypes.StatisticalType;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.PrimitiveTypes;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.ServiceParameters;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.TableTemplates;
import org.gcube.dataanalysis.ecoengine.interfaces.ActorNode;
import org.gcube.dataanalysis.ecoengine.utils.DatabaseFactory;
import org.gcube.dataanalysis.ecoengine.utils.DatabaseUtils;
import org.gcube.dataanalysis.ecoengine.utils.Transformations;
import org.gcube.dataanalysis.ecoengine.utils.Tuple;
import org.gcube.dataanalysis.executor.generators.D4ScienceDistributedProcessing;
import org.gcube.dataanalysis.executor.scripts.OSCommand;
import org.hibernate.SessionFactory;

import com.thoughtworks.xstream.XStream;

public class BionymWorkflow extends ActorNode {
	protected AlgorithmConfiguration currentconfig;
	protected SessionFactory dbconnection;
	private static String createOutputTable = "CREATE TABLE %1$s (inputname character varying(255),   suggestion character varying(255), score real)";
	int rawnamescount = 0;
	public int prevbroadcastTimePeriod;
	public int prevmaxNumberOfStages;
	public int prevmaxMessages;

	String destinationTable;
	public static String destinationTableParam = "OutputTable";
	public static String destinationTableLable = "OutputTableLabel";
	String originTable;
	public static String originTableParam = "RawTaxaNamesTable";
	String rawnamesColumn;
	public static String rawnamesColumnParam = "RawNamesColumn";
	String parser;
	public static String parserParam = "Parser";
	String reference;
	public static String referenceParam = "ReferenceDataset";
	String soundexweight;
	public static String soundexweightParam = "SoundexVSEditDist";
	String preprocessor;
	public static String doPreprocessParam = "Preprocess";
	float status;
	public static String maxMatchesParam= "MaxMatches";
	int maxMatches=10;
	public static String pruningThresholdParam= "PruningThreshold";
	float pruningThreshold=0.4f;

	static String headers = "inputname,suggestion,score";
	boolean haspostprocessed = false;

	@Override
	public ALG_PROPS[] getProperties() {
		ALG_PROPS[] p = { ALG_PROPS.PHENOMENON_VS_PARALLEL_PHENOMENON };
		return p;
	}

	@Override
	public String getName() {
		return "BIONYM";
	}

	@Override
	public String getDescription() {
		return "An algorithm implementing BiOnym, a flexible workflow approach to taxon name matching. The workflow allows to activate several taxa names matching algorithms and to get the list of possible transcriptions for a list of input raw species names with possible authorship indication.";
	}

	@Override
	public List<StatisticalType> getInputParameters() {
		List<TableTemplates> templateLWRInput = new ArrayList<TableTemplates>();
		templateLWRInput.add(TableTemplates.GENERIC);

		InputTable p1 = new InputTable(templateLWRInput, originTableParam, "Input table containing raw taxa names that you want to match", "byonym");
		ColumnType p2 = new ColumnType(originTableParam, rawnamesColumnParam, "The column containing the raw taxa names with or without authoship information", "rawnames", false);
		ServiceType p3 = new ServiceType(ServiceParameters.RANDOMSTRING, destinationTableParam, "name of the table that will contain the matches", "bion_");
		PrimitiveType p4 = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, destinationTableLable, "Name of the table which will contain the matches", "bionout");
		PrimitiveType p5 = new PrimitiveType(Enum.class.getName(), CometMatcherManager.Reference.values(), PrimitiveTypes.ENUMERATED, referenceParam, "The reference dataset to use", "" + CometMatcherManager.Reference.FISHBASE);
		PrimitiveType p6 = new PrimitiveType(Enum.class.getName(), EVBPreprocessing.Preprocessors.values(), PrimitiveTypes.ENUMERATED, doPreprocessParam, "Set a preprocessing approach for the raw strings", "" + EVBPreprocessing.Preprocessors.EXPERT_RULES);
		PrimitiveType p7 = new PrimitiveType(Enum.class.getName(), CometMatcherManager.Weights.values(), PrimitiveTypes.ENUMERATED, soundexweightParam, "Set the use of soundex vs edit distance approaches to string matching", "" + CometMatcherManager.Weights.EDIT_DISTANCE);
		PrimitiveType p8 = new PrimitiveType(Enum.class.getName(), CometMatcherManager.Parsers.values(), PrimitiveTypes.ENUMERATED, parserParam, "Set the genus-species-author parser to use", "" + CometMatcherManager.Parsers.SIMPLE);
		PrimitiveType p9 = new PrimitiveType(Float.class.getName(), null, PrimitiveTypes.NUMBER, pruningThresholdParam,"Pruning threshold for the output scores (from 0 to 1)","0.4");
		PrimitiveType p10 = new PrimitiveType(Integer.class.getName(), null, PrimitiveTypes.NUMBER, maxMatchesParam,"Maximum number of matches to report per raw string","10");


		List<StatisticalType> parameters = new ArrayList<StatisticalType>();
		parameters.add(p1);
		parameters.add(p3);
		parameters.add(p2);
		parameters.add(p4);
		parameters.add(p5);
		parameters.add(p6);
		parameters.add(p7);
		parameters.add(p8);
		parameters.add(p9);
		parameters.add(p10);

		DatabaseType.addDefaultDBPars(parameters);

		return parameters;

	}

	@Override
	public StatisticalType getOutput() {
		List<TableTemplates> template = new ArrayList<TableTemplates>();
		template.add(TableTemplates.GENERIC);
		OutputTable p = new OutputTable(template, destinationTableLable, destinationTable, "Output  table");
		return p;
	}

	@Override
	public void initSingleNode(AlgorithmConfiguration config) {


	}

	@Override
	public float getInternalStatus() {

		return status;
	}

	List<Tuple<String>> matchedTuples = new ArrayList<Tuple<String>>();
	List<Tuple<String>> unmatchedTuples = new ArrayList<Tuple<String>>();

	public void filterMatchedTuples(List<Tuple<String>> tuples){
		unmatchedTuples = null;
		unmatchedTuples = new ArrayList<Tuple<String>>();

		for (Tuple<String> tuple:tuples){
			double score = 0;
			if (tuple.getElements().size()>2){
				try{score = Double.parseDouble(tuple.getElements().get(2));}catch(Exception e){
					e.printStackTrace();
				}
			}
			if (score==1)
				matchedTuples.add(tuple);
			else
				unmatchedTuples.add(tuple);
		}

	}

	public StringBuffer executeBionymWorkflow(String rawspeciesname, String sandboxfolder, String preprocessor, boolean[] enablematchers, float soundexweightF, int maxResults, float pruningThreshold) throws Exception {
		StringBuffer sb = new StringBuffer();
		List<String> rawnames = new ArrayList<String>();
		rawnames.add(rawspeciesname);

		List<Tuple<String>> preprocessednames = new ArrayList<Tuple<String>>();

		// preprocessing
		switch (EVBPreprocessing.Preprocessors.valueOf(preprocessor)) {
		case EXPERT_RULES:
			EVBPreprocessing preprocess = new EVBPreprocessing();
			preprocessednames = preprocess.preprocess(parser, sandboxfolder, rawnames);
			break;
		default:
			preprocessednames = EVBPreprocessing.populateTuples(rawnames);
			break;
		}

		filterMatchedTuples(preprocessednames);

		for (int i = 0; i < enablematchers.length; i++) {
			// standard WF
			if (i == 0) {
				CometMatcherManager comet = new CometMatcherManager();
				List<Tuple<String>> cometoutput = comet.match(parser, reference, sandboxfolder, unmatchedTuples, soundexweightF,maxResults);
				filterMatchedTuples(cometoutput);
			}
		}

		//add all non exact matches
		matchedTuples.addAll(unmatchedTuples);
		int msize = matchedTuples.size();

		if (msize>0) {

			for (int i = 0; i < msize; i++) {
				Tuple<String> t = matchedTuples.get(i);
				String scoreS = t.getElements().get(2);
				Float score  =  (scoreS !=null)? Float.parseFloat(scoreS):0;

				if (score>=pruningThreshold){
					String spname = t.getElements().get(0);
					String authorname = t.getElements().get(1);
					if (authorname.length()>0)
						spname +=" ("+authorname+")";

					sb.append("('" + rawspeciesname + "','" + spname + "','" + scoreS + "')");
					if (i < msize - 1)
						sb.append(",");
				}
			}
		}
		String sbstring = sb.toString().trim();
		int ssize = sbstring.length();

		if (sbstring.endsWith(",")){
			System.out.println("Deleting final comma..");
			sb= new StringBuffer(sbstring.substring(0,ssize-1));
		}
		return sb;
	}

	@Override
	public int executeNode(int leftStartIndex, int numberOfLeftElementsToProcess, int rightStartIndex, int numberOfRightElementsToProcess,boolean duplicate, String sandboxFolder, String nodeConfigurationFileObject, String logfileNameToProduce) {
		try {
			status = 0;
			long t0 = System.currentTimeMillis();
			// rebuild variables
			System.out.println("Restoring configuration");
			AlgorithmConfiguration config = Transformations.restoreConfig(new File(sandboxFolder, nodeConfigurationFileObject).getAbsolutePath());
			config.setConfigPath(sandboxFolder);
			dbconnection = DatabaseUtils.initDBSession(config);
			destinationTable = config.getParam(destinationTableParam);
			originTable = config.getParam(originTableParam);
			rawnamesColumn = config.getParam(rawnamesColumnParam);
			parser = config.getParam(parserParam);
			reference = config.getParam(referenceParam);
			soundexweight = config.getParam(soundexweightParam);
			preprocessor = config.getParam(doPreprocessParam);
			String maxMatchesS = config.getParam(maxMatchesParam);
			maxMatches= (maxMatchesS==null)?10:Integer.parseInt(maxMatchesS);
			String pruningThrS = config.getParam(pruningThresholdParam);
			pruningThreshold = (pruningThrS==null)?0.4f:Float.parseFloat(pruningThrS);

			System.out.println("Destination Table: " + destinationTable);
			System.out.println("Origin Table: " + originTable);
			System.out.println("Column of names: " + rawnamesColumn);
			System.out.println("Parser to use: " + parser);
			System.out.println("Reference Dataset: " + reference);
			System.out.println("Soundex Preference: " + soundexweight);
			System.out.println("Preprocessor:" + preprocessor);
			System.out.println("Pruning threshold:" + pruningThreshold);
			System.out.println("Number of Matches:" + maxMatches);

			float soundexweightF = 0.5f;
			switch (CometMatcherManager.Weights.valueOf(soundexweight)) {
			case SOUNDEX:
				soundexweightF = 1f;
				break;
			case EDIT_DISTANCE:
				soundexweightF = 0f;
				break;
			case MIXED:
				soundexweightF = 0.5f;
				break;
			default:
				soundexweightF = 0.5f;
				break;
			}

			// retrieve the list of names to process
			System.out.println("Retrieving names to process");
			List<Object> rawnames = DatabaseFactory.executeSQLQuery(DatabaseUtils.getDinstictElements(originTable, rawnamesColumn, ""), dbconnection);
			System.out.println("Retrieved a total of "+rawnames.size()+" species");

			int end = rightStartIndex + numberOfRightElementsToProcess;

			System.out.println("Processing from "+rightStartIndex+" to "+end);
			List<String> rawnamesFiltered = new ArrayList<String>();
			for (int i = rightStartIndex; i < end; i++) {
				rawnamesFiltered.add("" + rawnames.get(i));
			}
			// prepare the environment
			try {
//				OSCommand.ExecuteGetLine("chmod +x * | whoami", null);
				OSCommand.ExecuteGetLine("chmod +x *", null);
			} catch (Exception e) {
				System.out.println("WARNING: could not change the permissions");
			}
			int rawscounter = 0;
			System.out.println("Processing raw names");
			for (String rawname : rawnamesFiltered) {
				rawname  = rawname.replace("'", "").replace("\"", "");
				System.out.println("Processing species: "+rawname);
				StringBuffer sb = executeBionymWorkflow(rawname, sandboxFolder, preprocessor, new boolean[] { true }, soundexweightF,maxMatches,pruningThreshold);
				System.out.println("Processed species: "+rawname);
				if (sb.length() > 0) {
					rawscounter++;
					System.out.println("Inserting results onto the table "+destinationTable);
					String insertQuery = DatabaseUtils.insertFromBuffer(destinationTable, headers, sb);
					System.out.println("Insert Query: " + insertQuery);
					System.out.println("Inserting values for " + rawname);
					DatabaseFactory.executeSQLUpdate(insertQuery, dbconnection);
					System.out.println("Successfully Inserted values for " + rawname);
				}
//				Thread.sleep(10000);
			}

			System.out.println("The procedure finished successfully. Processed " + rawscounter + " species.");
			System.out.println("Elapsed Time " + (System.currentTimeMillis() - t0) + " ms");
		} catch (Exception e) {
			e.printStackTrace();
			System.out.println("warning: error in node execution " + e.getLocalizedMessage());
			System.err.println("Error in node execution " + e.getLocalizedMessage());
			return -1;
		} finally {
			if (dbconnection != null)
				try {
					dbconnection.close();
				} catch (Exception e) {
				}
			status = 1f;
		}

		return 0;
	}

	@Override
	public void setup(AlgorithmConfiguration config) throws Exception {
		haspostprocessed = false;
		AnalysisLogger.getLogger().info("Initializing DB Connection");
		dbconnection = DatabaseUtils.initDBSession(config);
		destinationTable = config.getParam(destinationTableParam);
		originTable = config.getParam(originTableParam);
		rawnamesColumn = config.getParam(rawnamesColumnParam);

		List<Object> rawnames = DatabaseFactory.executeSQLQuery(DatabaseUtils.getDinstictElements(originTable, rawnamesColumn, ""), dbconnection);
		rawnamescount = rawnames.size();
//		rawnamescount =1;
		AnalysisLogger.getLogger().info("Creating Destination Table " + destinationTable);
		try {
			DatabaseFactory.executeSQLUpdate(DatabaseUtils.dropTableStatement(destinationTable), dbconnection);
		} catch (Exception e) {
			AnalysisLogger.getLogger().info("Table " + destinationTable + " did not exist");
		}

		DatabaseFactory.executeSQLUpdate(String.format(createOutputTable, destinationTable), dbconnection);

		prevmaxMessages=D4ScienceDistributedProcessing.maxMessagesAllowedPerJob;
		D4ScienceDistributedProcessing.maxMessagesAllowedPerJob=1;

		AnalysisLogger.getLogger().info("Destination Table Created! Addressing " + rawnamescount + " names");

	}

	@Override
	public int getNumberOfRightElements() {
		return rawnamescount;
	}

	@Override
	public int getNumberOfLeftElements() {
		return 1;
	}

	@Override
	public void stop() {
		// if has not postprocessed, then abort the computations by removing the database table
		if (!haspostprocessed) {
			try {
				AnalysisLogger.getLogger().info("The procedure did NOT correctly postprocessed ....Removing Table " + destinationTable + " because of computation stop!");
				DatabaseFactory.executeSQLUpdate(DatabaseUtils.dropTableStatement(destinationTable), dbconnection);
			} catch (Exception e) {
				AnalysisLogger.getLogger().info("Table " + destinationTable + " did not exist");
			}
		} else
			AnalysisLogger.getLogger().info("The procedure has correctly postprocessed: shutting down the connection!");
		if (dbconnection != null)
			try {
				dbconnection.close();
			} catch (Exception e) {
			}
	}

	@Override
	public void postProcess(boolean manageDuplicates, boolean manageFault) {
		D4ScienceDistributedProcessing.maxMessagesAllowedPerJob=prevmaxMessages;
		haspostprocessed = true;
	}


	public static void mainTEST(String[] args) throws Exception {
		AlgorithmConfiguration config = new AlgorithmConfiguration();

		config.setConfigPath("./cfg/");
		String sandbox = "./PARALLEL_PROCESSING";
		String configfile = "testconfig.cfg";
		config.setPersistencePath(sandbox);
		/*
		config.setParam("DatabaseUserName", "gcube");
		config.setParam("DatabasePassword", "d4science2");
		config.setParam("DatabaseURL", "jdbc:postgresql://146.48.87.169/testdb");
		config.setParam("DatabaseDriver", "org.postgresql.Driver");
*/
		config.setParam("DatabaseUserName","utente");
		config.setParam("DatabasePassword","d4science");
		config.setParam("DatabaseURL","jdbc:postgresql://statistical-manager.d.d4science.research-infrastructures.eu/testdb");

		config.setParam(BionymWorkflow.destinationTableParam, "taxamatchoutputlocal");
		config.setParam(BionymWorkflow.destinationTableLable, "taxamatchoutputlabel");
		config.setParam(BionymWorkflow.originTableParam, "taxamatchinput");
		config.setParam(BionymWorkflow.rawnamesColumnParam, "rawstrings");
		config.setParam(BionymWorkflow.parserParam, CometMatcherManager.Parsers.SIMPLE.name());
		config.setParam(BionymWorkflow.referenceParam, CometMatcherManager.Reference.ASFIS.name());
		config.setParam(BionymWorkflow.soundexweightParam, CometMatcherManager.Weights.EDIT_DISTANCE.name());
		config.setParam(BionymWorkflow.doPreprocessParam , EVBPreprocessing.Preprocessors.EXPERT_RULES.name());
		config.setParam(BionymWorkflow.maxMatchesParam , "10");

		AnalysisLogger.setLogger(config.getConfigPath() + AlgorithmConfiguration.defaultLoggerFile);

		BufferedWriter oos = new BufferedWriter(new FileWriter(new File(sandbox, configfile)));
		oos.write(new XStream().toXML(config));
		oos.close();

		new BionymWorkflow().setup(config);

//		new BionymWorkflow().executeNode(0, 1, 0, 915, false, sandbox, configfile, "test.log");
		new BionymWorkflow().executeNode(0, 1, 0, 1, false, sandbox, configfile, "test.log");
	}

}