package org.gcube.dataanalysis.executor.nodes.transducers.bionym;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import org.gcube.dataanalysis.ecoengine.utils.Tuple;

// NOTE(review): this copy of the file appears to have lost every span that started with '<'
// (generic type arguments, the conditions and bodies of the two index loops in preprocess, the
// end of preprocess and the header of the method that follows it). The code tokens below are
// kept exactly as they were found; restore the missing text from version control before compiling.

/**
 * Regex-based cleaning rules for taxonomic name strings, plus a driver that applies the
 * pre-parse rules, runs the Comet parser through {@code CometMatcherManager} and then
 * post-processes the parsed names.
 *
 * The rules are stored twice: as {pattern, replacement, flag} triples
 * ({@link #preparsecleaning}, {@link #postparsecleaning}) and as parallel pattern/replacement
 * arrays (the {@code *origins} / {@code *targets} fields). The two forms list the same patterns
 * and replacements in the same order.
 */
public class EVBPreprocessing {

    // "preparsecleaning";"Pre-parsing cleaning";"Does not require knowledge of where the individual components of the complete namestring start or end"
    // Each row is {regex, replacement, flag}.
    // NOTE(review): the third element is presumably the bit mask understood by
    // appliesToScientificName (bit value 1) and appliesToAuthorship (bit value 2), i.e. "3" = both
    // and "1" = scientific name only; nothing visible here parses it - confirm against the consumer.
    public static String[][] preparsecleaning = {
        // NOTE(review): as displayed, pattern and replacement are both a plain space; the pattern
        // may originally have been a different whitespace character - confirm.
        {" ", " ","3"},//space
        {""+(char)10," ","3"}, //line feed (char 10) -> space
        {" {2,}"," ","3"}, //consecutive spaces
        {"^ ", "","3"},//leading space
        {" $", "","3"},//trailing space
        {"[?]", "", "1"},//uncertain identification
        {" v(ar)?\\.? "," v. ","1" },//standardise variety indication
        // NOTE(review): the '.' before the final '?' is unescaped, so it matches any character
        // (the variety pattern above escapes it as \\.?); looks unintended - confirm.
        {" f(orm(a)?)?.? ", " f. ","1"}//standardise form indication
    };

    // Post-parse rules in the same {regex, replacement, flag} layout.
    public static String[][] postparsecleaning = {
        {" sp[\\.]?( ?[1-9a-zA-Z])?$","","1"}//remove temporary species indication
    };

    // Pre-parse patterns; index-aligned with preparsecleaningtargets.
    public static String[] preparsecleaningorigins = {
        " ", //space (see the NOTE on the first row of preparsecleaning)
        ""+(char)10, //line feed (char 10)
        " {2,}", //consecutive spaces
        "^ ", //leading space
        " $", //trailing space
        "[?]", //uncertain identification
        " v(ar)?\\.? ", //standardise variety indication
        " f(orm(a)?)?.? " //standardise form indication (unescaped '.', see NOTE above)
    };

    // Replacements for preparsecleaningorigins, same order.
    public static String[] preparsecleaningtargets = {
        " ", //space
        " ", //line feed replaced by a space
        " ", //consecutive spaces
        "", //leading space
        "", //trailing space
        "", //uncertain identification
        " v. ", //standardise variety indication
        " f. " //standardise form indication
    };

    //"postparsecleaning";"Post-parsing cleaning";"Does require knowledge of where the individual components of the complete namestring start or end; assumes namestring is split in name proper and authority"
    public static String[] postparsecleaningorigin = {
        " sp[\\.]?( ?[1-9a-zA-Z])?$" //remove temporary species indication
    };

    // Replacements for postparsecleaningorigin, same order.
    public static String[] postparsecleaningtargets = {
        "" //remove temporary species indication
    };

    //"fuzzymatch";"Fuzzy matching";"Based on original idea from Tony Rees"
    // These two arrays are not referenced by any code visible in this file.
    public static String[] fuzzymatchorigins = {
        "h", //remove all characters h
        "y", //all y to i
        "s|k" //all s and k to c
    };

    public static String[] fuzzymatchtargets = {
        // NOTE(review): this is a string of two apostrophe characters, not the empty string, so a
        // plain replaceAll would insert '' rather than remove the h; whether that is intended
        // depends on the consumer, which is not visible here - confirm.
        "''", //remove all characters h
        "i", //all y to i
        "c" //all s and k to c
    };

    /** Available preprocessing strategies. */
    public static enum Preprocessors{
        EXPERT_RULES,
        NONE
    }

    /**
     * Tests the lowest bit (value 1) of a rule flag.
     *
     * @param regexProperty bit mask of a rule
     * @return true when bit value 1 is set
     */
    public static boolean appliesToScientificName(int regexProperty){
        int t = (1 & regexProperty);
        return t>0;
    }

    /**
     * Tests the second bit (value 2) of a rule flag.
     *
     * @param regexProperty bit mask of a rule
     * @return true when bit value 2 is set
     */
    public static boolean appliesToAuthorship(int regexProperty){
        int t = (2 & regexProperty);
        return t>0;
    }

    /**
     * Wraps every raw name in a Tuple built from the name and an empty string, keeping the
     * input order.
     *
     * NOTE(review): the generic type arguments of the signature and of the local list appear to
     * have been stripped from this copy.
     *
     * @param rawnames names to wrap
     * @return a new list with one Tuple per input name
     */
    public static List> populateTuples(List rawnames){
        List> preprocessednames = new ArrayList>();
        for (String rawn:rawnames){
            preprocessednames.add(new Tuple(rawn,""));
        }
        return preprocessednames;
    }

    /**
     * Cleans the given names, feeds them to the Comet parser and post-processes the parser output.
     *
     * Visible steps: remove stale parser input/output CSV files from the sandbox folder, copy the
     * input list, apply the pre-parse cleaning (loop damaged in this copy), dump the cleaned names
     * with CometMatcherManager.dumpCometInput, run CometMatcherManager.cometParse and read the
     * result back with CometMatcherManager.parseCometParserOutput. The post-processing loop and the
     * return statement are not visible in this copy.
     *
     * @param parser passed through to CometMatcherManager.cometParse
     * @param sandboxFolder directory that holds inputEVBParser.csv and outputEVBParser.csv; also
     *        passed to CometMatcherManager.cometParse
     * @param rawnamesFiltered names to process; copied, the list itself is not modified by the
     *        visible code
     * @throws Exception declared by the signature; the visible code does not catch failures of the
     *         CometMatcherManager calls
     */
    public List> preprocess(String parser, String sandboxFolder, List rawnamesFiltered) throws Exception{
        File FParserinputFile = new File(sandboxFolder,"inputEVBParser.csv");
        File FParseroutputFile = new File(sandboxFolder,"outputEVBParser.csv");
        // Best-effort removal of files left by a previous run: the boolean result of delete() is
        // ignored and any exception is swallowed.
        try{
            FParserinputFile.delete();
        }catch(Exception e){ }
        try{
            FParseroutputFile.delete();
        }catch(Exception e){ }
        String parserinputFile = FParserinputFile.getAbsolutePath();
        String parseroutputFile = FParseroutputFile.getAbsolutePath();
        // Work on a copy so the caller's list is left untouched.
        List preprocessedrawnames = new ArrayList(rawnamesFiltered);
        //apply evb preprocess
        int namessize = preprocessedrawnames.size();
        System.out.println("Applying preprocessing to strings");
        // NOTE(review): the loop below is truncated in this copy - everything between "i" and the
        // tail of a print statement is missing. Presumably it applied preparsecleaningorigins /
        // preparsecleaningtargets to each name; restore from version control.
        for (int i=0;i"+preparsecleaningtargets[j]+"="+preprocessedrawname);
            }
        }

        System.out.println("Applying parsing");
        CometMatcherManager.dumpCometInput(parserinputFile, populateTuples(preprocessedrawnames));
        CometMatcherManager.cometParse(sandboxFolder,parser, parseroutputFile, parserinputFile);
        List> parsedNames = CometMatcherManager.parseCometParserOutput(parseroutputFile);
        int pnamessize = parsedNames.size();
        System.out.println("Applying postprocessing to "+pnamessize+" strings");
        // NOTE(review): truncated again - the post-processing loop, the end of preprocess and the
        // header of the next method are missing. What follows looks like the tail of a
        // commented-out section (note the unmatched comment terminator) and the last statement of
        // that next method.
        for (int i=0;i normalized = CometMatcherManager.parseCometParserOutput("./PARALLEL_PROCESSING/outs.csv");
        for (String norm:normalized){
            System.out.println(norm);
        }
        */
        // Quick manual check of the variety rule on a sample name.
        System.out.println("Gadus var. morhua".replaceAll(" v(ar)?\\.? ", " v. "));
    }
}