ecological-engine-smart-exe.../src/main/java/org/gcube/dataanalysis/executor/nodes/transducers/bionym/EVBPreprocessing.java

163 lines
5.3 KiB
Java
Executable File
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package org.gcube.dataanalysis.executor.nodes.transducers.bionym;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import org.gcube.dataanalysis.ecoengine.utils.Tuple;
/**
 * Regex-based cleaning of raw name strings around an external parsing step.
 *
 * <p>{@link #preprocess(String, String, List)} applies the pre-parse rules to every raw name,
 * hands the cleaned names to the parser through {@code CometMatcherManager}, and then applies
 * the post-parse rules to the parsed name component of every returned tuple.
 *
 * <p>The rule tables are public mutable static arrays; they are kept that way because code
 * outside this file may read them.
 */
public class EVBPreprocessing {

    // "preparsecleaning";"Pre-parsing cleaning";"Does not require knowledge of where the individual components of the complete namestring start or end"
    // Each row is {regex, replacement, flags}. This table is not read inside this class.
    // NOTE(review): the flags column looks like the bitmask decoded by appliesToScientificName (bit 1)
    // and appliesToAuthorship (bit 2) - confirm against the callers.
    // NOTE(review): the first rule's pattern is expected to hold a non-standard whitespace character
    // (the rule would otherwise be a no-op); verify the literal survived copy/paste intact.
    public static String[][] preparsecleaning = {
        {" ", " ","3"}, // space
        {""+(char)10," ","3"}, // line feed to space
        {" {2,}"," ","3"}, // consecutive spaces
        {"^ ", "","3"}, // leading space
        {" $", "","3"}, // trailing space
        {"[?]", "", "1"}, // uncertain identification
        {" v(ar)?\\.? "," v. ","1" }, // standardise variety indication
        {" f(orm(a)?)?\\.? ", " f. ","1"} // standardise form indication (dot escaped: optional literal period)
    };

    // Same layout as preparsecleaning; not read inside this class.
    public static String[][] postparsecleaning = {
        {" sp[\\.]?( ?[1-9a-zA-Z])?$","","1"} // remove temporary species indication
    };

    // Patterns of the pre-parse rules, index-aligned with preparsecleaningtargets.
    public static String[] preparsecleaningorigins = {
        " ", // space
        ""+(char)10, // line feed to space
        " {2,}", // consecutive spaces
        "^ ", // leading space
        " $", // trailing space
        "[?]", // uncertain identification
        " v(ar)?\\.? ", // standardise variety indication
        " f(orm(a)?)?\\.? " // standardise form indication (dot escaped: optional literal period)
    };

    // Replacements of the pre-parse rules, index-aligned with preparsecleaningorigins.
    public static String[] preparsecleaningtargets = {
        " ", // space
        " ", // line feed to space
        " ", // consecutive spaces
        "", // leading space
        "", // trailing space
        "", // uncertain identification
        " v. ", // standardise variety indication
        " f. " // standardise form indication
    };

    //"postparsecleaning";"Post-parsing cleaning";"Does require knowledge of where the individual components of the complete namestring start or end; assumes namestring is split in name proper and authority"
    // Patterns of the post-parse rules, index-aligned with postparsecleaningtargets.
    public static String[] postparsecleaningorigin = {
        " sp[\\.]?( ?[1-9a-zA-Z])?$" // remove temporary species indication
    };

    // Replacements of the post-parse rules.
    public static String[] postparsecleaningtargets = {
        "" // remove temporary species indication
    };

    //"fuzzymatch";"Fuzzy matching";"Based on original idea from Tony Rees"
    // Not used inside this class.
    public static String[] fuzzymatchorigins = {
        "h", // remove all characters h
        "y", // all y to i
        "s|k" // all s and k to c
    };

    // NOTE(review): the first target is two apostrophes, not an empty string, although the rule is
    // described as "remove all characters h". Left unchanged because the consumer is not visible here;
    // confirm whether "" was intended.
    public static String[] fuzzymatchtargets = {
        "''", // remove all characters h
        "i", // all y to i
        "c" // all s and k to c
    };

    /** Available preprocessing strategies. */
    public static enum Preprocessors {
        EXPERT_RULES,
        NONE
    }

    /**
     * Tells whether bit 1 is set in the given rule flags.
     *
     * @param regexProperty rule flags as an integer bitmask
     * @return {@code true} when the lowest bit is set
     */
    public static boolean appliesToScientificName(int regexProperty) {
        return (regexProperty & 1) != 0;
    }

    /**
     * Tells whether bit 2 is set in the given rule flags.
     *
     * @param regexProperty rule flags as an integer bitmask
     * @return {@code true} when the second-lowest bit is set
     */
    public static boolean appliesToAuthorship(int regexProperty) {
        return (regexProperty & 2) != 0;
    }

    /**
     * Wraps every raw name into a two-element tuple whose second element is the empty string.
     *
     * @param rawnames names to wrap
     * @return a new list with one tuple per input name, in input order
     */
    public static List<Tuple<String>> populateTuples(List<String> rawnames) {
        List<Tuple<String>> preprocessednames = new ArrayList<Tuple<String>>(rawnames.size());
        for (String rawname : rawnames) {
            preprocessednames.add(new Tuple<String>(rawname, ""));
        }
        return preprocessednames;
    }

    /**
     * Cleans the raw names, runs the parser on them and cleans the parsed names.
     *
     * <p>The input list is copied and never modified. Any previous {@code inputEVBParser.csv} and
     * {@code outputEVBParser.csv} in the sandbox folder are removed on a best-effort basis before
     * the parser input is written.
     *
     * @param parser parser identifier, passed through to {@code CometMatcherManager.cometParse}
     * @param sandboxFolder folder in which the parser input and output files are placed
     * @param rawnamesFiltered raw name strings to process
     * @return the tuples produced by the parser, with the post-parse rules applied to element 0
     * @throws Exception if dumping the input, running the parser or reading its output fails
     */
    public List<Tuple<String>> preprocess(String parser, String sandboxFolder, List<String> rawnamesFiltered) throws Exception {
        File parserInput = new File(sandboxFolder, "inputEVBParser.csv");
        File parserOutput = new File(sandboxFolder, "outputEVBParser.csv");
        deleteQuietly(parserInput);
        deleteQuietly(parserOutput);

        String parserinputFile = parserInput.getAbsolutePath();
        String parseroutputFile = parserOutput.getAbsolutePath();

        // Work on a copy so the caller's list is left untouched.
        List<String> preprocessedrawnames = new ArrayList<String>(rawnamesFiltered);

        System.out.println("Applying preprocessing to strings");
        for (int i = 0; i < preprocessedrawnames.size(); i++) {
            String cleaned = applyRules(preprocessedrawnames.get(i), preparsecleaningorigins, preparsecleaningtargets);
            // Strings are immutable: the cleaned value has to be stored back, otherwise the
            // parser would receive the uncleaned names.
            preprocessedrawnames.set(i, cleaned);
        }

        System.out.println("Applying parsing");
        CometMatcherManager.dumpCometInput(parserinputFile, populateTuples(preprocessedrawnames));
        CometMatcherManager.cometParse(sandboxFolder, parser, parseroutputFile, parserinputFile);
        List<Tuple<String>> parsedNames = CometMatcherManager.parseCometParserOutput(parseroutputFile);

        int pnamessize = parsedNames.size();
        System.out.println("Applying postprocessing to "+pnamessize+" strings");
        for (Tuple<String> parsed : parsedNames) {
            String parsedName = parsed.getElements().get(0);
            String parsedAuthor = parsed.getElements().get(1);
            System.out.println("Parsed name: "+parsedName+" author: "+parsedAuthor);
            // Store the cleaned name back into the tuple so the caller actually receives it.
            // NOTE(review): assumes getElements() exposes the tuple's live, mutable list - confirm.
            parsed.getElements().set(0, applyRules(parsedName, postparsecleaningorigin, postparsecleaningtargets));
        }
        return parsedNames;
    }

    /** Applies each regex in {@code origins}, in order, replacing matches with the same-index target. */
    private static String applyRules(String value, String[] origins, String[] targets) {
        String result = value;
        for (int j = 0; j < origins.length; j++) {
            result = result.replaceAll(origins[j], targets[j]);
        }
        return result;
    }

    /** Deletes the file if possible; a missing file or a denied deletion is not an error here. */
    private static void deleteQuietly(File file) {
        try {
            file.delete();
        } catch (SecurityException ignored) {
            // Best effort only: a stale file that cannot be removed is left in place.
        }
    }

    /** Manual smoke check of the variety rule; prints the normalised example name. */
    public static void main(String[] args) throws Exception {
        System.out.println("Gadus var. morhua".replaceAll(" v(ar)?\\.? ", " v. "));
    }
}