package org.gcube.dataanalysis.executor.nodes.transducers.bionym;

import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
import org.gcube.dataanalysis.ecoengine.configuration.AlgorithmConfiguration;
import org.gcube.dataanalysis.ecoengine.configuration.INFRASTRUCTURE;
import org.gcube.dataanalysis.ecoengine.datatypes.PrimitiveType;
import org.gcube.dataanalysis.ecoengine.datatypes.StatisticalType;
import org.gcube.dataanalysis.ecoengine.datatypes.enumtypes.PrimitiveTypes;
import org.gcube.dataanalysis.ecoengine.interfaces.Transducerer;
import org.gcube.dataanalysis.ecoengine.utils.ResourceFactory;
import org.gcube.dataanalysis.executor.generators.D4ScienceDistributedProcessing;
import org.gcube.dataanalysis.executor.nodes.transducers.bionym.abstracts.MatcherOutput;
import org.gcube.dataanalysis.executor.nodes.transducers.bionym.abstracts.SingleEntry;
import org.gcube.dataanalysis.executor.nodes.transducers.bionym.abstracts.YasmeenMatcher;
import org.gcube.dataanalysis.executor.nodes.transducers.bionym.implementations.workflows.BiOnymWF;
import org.gcube.dataanalysis.executor.nodes.transducers.bionym.interfaces.Matcher;
import org.gcube.dataanalysis.executor.nodes.transducers.bionym.utils.YasmeenGlobalParameters;
import org.gcube.dataanalysis.executor.scripts.OSCommand;

public class BionymLocalTransducer implements Transducerer {

    // BionymFlexibleWorkflowTransducer bionymwf = new BionymFlexibleWorkflowTransducer();
    public float status = 0f;
    AlgorithmConfiguration config = null;
    public static String speciesName = "SpeciesAuthorName";
    public LinkedHashMap outputmap = new LinkedHashMap();

    @Override
    public List getInputParameters() {
        // reuse the parameters of the distributed workflow version, but replace its first
        // four inputs with a single raw species name (possibly carrying authorship)
        List types = new BionymFlexibleWorkflowTransducer().getInputParameters();
        PrimitiveType p1 = new PrimitiveType(String.class.getName(), null, PrimitiveTypes.STRING, speciesName,
                "The scientific name of the species, possibly with authorship", "Gadus morhua (Linnaeus, 1758)");
        types.remove(0);
        types.remove(0);
        types.remove(0);
        types.remove(0);
        types.add(0, p1);
        return types;
    }

    @Override
    public String getResourceLoad() {
        ResourceFactory resourceManager = new ResourceFactory();
        return resourceManager.getResourceLoad(1);
    }

    public String getResources() {
        if ((status > 0) && (status < 100))
            return ResourceFactory.getResources(100f);
        else
            return ResourceFactory.getResources(0f);
    }

    @Override
    public float getStatus() {
        return status;
    }

    @Override
    public INFRASTRUCTURE getInfrastructure() {
        return INFRASTRUCTURE.LOCAL;
    }

    @Override
    public StatisticalType getOutput() {
        PrimitiveType p = new PrimitiveType(Map.class.getName(), PrimitiveType.stringMap2StatisticalMap(outputmap),
                PrimitiveTypes.MAP, "RetrievedTaxaNames", "Retrieved Taxa Names");
        return p;
    }

    @Override
    public void init() throws Exception {
    }

    @Override
    public void setConfiguration(AlgorithmConfiguration config) {
        this.config = config;
    }

    @Override
    public void shutdown() {
    }

    @Override
    public String getDescription() {
        return "A fast version of the algorithm implementing BiOnym, a flexible workflow approach to taxon name matching. "
                + "The workflow can activate several taxon name matching algorithms and returns the list of possible "
                + "transcriptions for a list of raw input species names, with possible authorship indication.";
    }

    public void compute() throws Exception {
        status = 10f;
        String uuid = ("" + UUID.randomUUID()).replace("-", "");
        String inputParserFile = "inputParser" + uuid + ".txt";
        String outputParserFile = "outputParser" + uuid + ".txt";
        String sandboxFolder = "";
        try {
            String parser = config.getParam(YasmeenGlobalParameters.parserNameParam);
            String accuracyvsspeed = config.getParam(YasmeenGlobalParameters.performanceParam);
            String reference = config.getParam(YasmeenGlobalParameters.taxaAuthorityFileParam);
            String doPreprocess = config.getParam(YasmeenGlobalParameters.activatePreParsingProcessing);
            String usestemming = config.getParam(YasmeenGlobalParameters.useStemmedGenusAndSpecies);
            String overallMaxResults = config.getParam(YasmeenGlobalParameters.overallMaxResults);
            String inputSpecies = config.getParam(speciesName);
            sandboxFolder = new File(config.getPersistencePath(), D4ScienceDistributedProcessing.defaultContainerFolder).getAbsolutePath();
            AnalysisLogger.getLogger().debug("BiOnymLocal-> Species Name: " + inputSpecies);
            AnalysisLogger.getLogger().debug("BiOnymLocal-> Parser to use: " + parser);
            AnalysisLogger.getLogger().debug("BiOnymLocal-> Accuracy vs Speed: " + accuracyvsspeed);
            AnalysisLogger.getLogger().debug("BiOnymLocal-> Reference Dataset: " + reference);
            AnalysisLogger.getLogger().debug("BiOnymLocal-> Do Preprocessing: " + doPreprocess);
            AnalysisLogger.getLogger().debug("BiOnymLocal-> Use Stemming: " + usestemming);
            AnalysisLogger.getLogger().debug("BiOnymLocal-> Overall MaxResults: " + overallMaxResults);

            // prepare the WF
            HashMap globalparameters = new HashMap();
            globalparameters.put(YasmeenGlobalParameters.parserInputFileParam, inputParserFile);
            globalparameters.put(YasmeenGlobalParameters.parserOutputFileParam, outputParserFile);
            globalparameters.put(YasmeenGlobalParameters.activatePreParsingProcessing, doPreprocess);
            globalparameters.put(YasmeenGlobalParameters.parserNameParam, parser);
            globalparameters.put(YasmeenGlobalParameters.performanceParam, accuracyvsspeed);
            globalparameters.put(YasmeenGlobalParameters.taxaAuthorityFileParam, reference);
            globalparameters.put(YasmeenGlobalParameters.useStemmedGenusAndSpecies, usestemming);
            globalparameters.put(YasmeenGlobalParameters.staticFilesFolderParam, config.getConfigPath());

            // retrieve the list of names to process: strip enclosing quotes from the single input name
            inputSpecies = inputSpecies.replaceAll("^'", "").replaceAll("'$", "");
            List rawnamesFiltered = new ArrayList();
            rawnamesFiltered.add(inputSpecies);

            // prepare the environment
            try {
                OSCommand.ExecuteGetLine("chmod +x *", null);
            } catch (Exception e) {
                AnalysisLogger.getLogger().debug("BiOnymLocal-> WARNING: could not change the permissions");
            }

            // maximum number of matches to report (defaults to 10 when not specified)
            int overallMR = 10;
            if (overallMaxResults != null)
                overallMR = Integer.parseInt(overallMaxResults);

            AnalysisLogger.getLogger().debug("BiOnymLocal-> Executing WF");
            BiOnymWF bionym = new BiOnymWF(sandboxFolder, overallMR, globalparameters);

            // rebuild the matchers
            List matchers = new BionymFlexibleWorkflowTransducer().buildMatcherList(config, sandboxFolder, globalparameters);
            if (matchers != null)
                bionym.resetMatchers(matchers);

            AnalysisLogger.getLogger().debug("BiOnymLocal-> WorkFlow: ");
            int mcounter = 1;
            for (Matcher matcher : matchers) {
                AnalysisLogger.getLogger().debug("BiOnymLocal-> " + mcounter + ": " + matcher);
                mcounter++;
            }

            status = 20f;
            MatcherOutput output =
                    bionym.executeChainedWorkflow(rawnamesFiltered);
            AnalysisLogger.getLogger().debug("BiOnymLocal-> Workflow Executed");
            status = 70f;

            int nEntries = output.getEntriesNumber();
            outputmap.put("HEADER", BionymFlexibleWorkflowTransducer.headers.toString());
            for (int i = 0; i < nEntries; i++) {
                SingleEntry se = output.getEntry(i);
                // "SOURCE_DATA,TARGET_DATA_SCIENTIFIC_NAME,TARGET_DATA_AUTHORITY,MATCHING_SCORE,TARGET_DATA_SOURCE,TARGET_DATA_ID";
                String[] srow = new String[6];
                srow[0] = "\"" + se.originalName + "\"";
                srow[1] = "\"" + se.targetScientificName + "\"";
                srow[2] = "\"" + se.targetAuthor + "\"";
                srow[3] = "\"" + se.matchingScore + "\"";
                srow[4] = "\"" + reference + "\"";
                srow[5] = "\"" + se.targetID + "\"";
                outputmap.put("" + (i + 1), Arrays.toString(srow));
            }
            if (nEntries == 0)
                AnalysisLogger.getLogger().debug("BiOnymLocal-> Warning no output found!");

            AnalysisLogger.getLogger().debug("BiOnymLocal-> map of outputs:\n" + outputmap);
        } catch (Exception e) {
            e.printStackTrace();
            throw e;
        } finally {
            new File(sandboxFolder, inputParserFile).delete();
            new File(sandboxFolder, outputParserFile).delete();
            AnalysisLogger.getLogger().debug("BiOnymLocal-> deleting auxiliary files");
            AnalysisLogger.getLogger().debug("BiOnymLocal-> shutting down");
            status = 100f;
        }
    }
}
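
/*
 * Illustrative usage sketch (not part of the class). It shows one way the transducer
 * might be driven programmatically, assuming the standard AlgorithmConfiguration
 * setters (setConfigPath, setPersistencePath, setParam) and the parameter names read
 * in compute() above. The concrete values ("./cfg/", "SIMPLE", "ASFIS", "MAX_ACCURACY")
 * are placeholders, not verified defaults of the YASMEEN components.
 *
 *   AlgorithmConfiguration config = new AlgorithmConfiguration();
 *   config.setConfigPath("./cfg/");      // folder containing the static YASMEEN files
 *   config.setPersistencePath("./");     // parent of the sandbox working folder
 *   config.setParam(BionymLocalTransducer.speciesName, "Gadus morhua (Linnaeus, 1758)");
 *   config.setParam(YasmeenGlobalParameters.parserNameParam, "SIMPLE");
 *   config.setParam(YasmeenGlobalParameters.taxaAuthorityFileParam, "ASFIS");
 *   config.setParam(YasmeenGlobalParameters.performanceParam, "MAX_ACCURACY");
 *   config.setParam(YasmeenGlobalParameters.activatePreParsingProcessing, "true");
 *   config.setParam(YasmeenGlobalParameters.useStemmedGenusAndSpecies, "false");
 *
 *   BionymLocalTransducer transducer = new BionymLocalTransducer();
 *   transducer.setConfiguration(config);
 *   transducer.init();
 *   transducer.compute();
 *   StatisticalType matches = transducer.getOutput(); // "HEADER" entry plus one quoted row per match
 */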