diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/authorpids/MakeReportSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/authorpids/MakeReportSparkJob.java index 8f209d6c94..0dae3197fc 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/authorpids/MakeReportSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/authorpids/MakeReportSparkJob.java @@ -95,23 +95,16 @@ public class MakeReportSparkJob implements Serializable { } - /** - * It cleans the association with the orcid to authors whose name and surname do not match the one in the orcid sequence file - * First step the join between the authoritative information and the prepared Info on orcid id. If the name surname of the authors do not match - * (or the surname and the first letter of the name) null is returned. If they match the resultInfo instance is returned. - * Second step the result info returned are grouped by key (the key is the id of the result) and a new result with as many authors as those - * having a correct association with the orcid is returned. - * Third step a join between the result and the authoritavide authors result is done on the result id. - * Each author in the original result with a pid of type orcid is match against the list of the authoritative authors. If the author - * in the original result has his pid among those in the authoritative result than the pid is retained, else is removed - * @param spark the stpark session - * @param inputPath the path where to find the result to emend - * @param entityClazz the class of the result considered - * @param outputPath the path where to write the emended result - * @param preparedInfoPath the path where to find the selected information for the result - * @param authoritative the authoritative association orcid name surname - * @param - */ + + private static void addInList(List list, String to_add){ + for (String word: to_add.split(" ")){ + if(word.length() >= 2){ + list.add(word); + } + } + + + } private static void makeReport(SparkSession spark, String inputPath, Class entityClazz, String outputPath, String preparedInfoPath, @@ -145,14 +138,46 @@ public class MakeReportSparkJob implements Serializable { if(!Optional.ofNullable(oa.getSurname()).isPresent()){ return new Tuple2<>("missing", reportInfo); } - final String handledOsurname = StringUtils.stripAccents(oa.getSurname().toLowerCase().trim()).replace("-", " "); - final String handledSurname = StringUtils.stripAccents(ri.getSurname().toLowerCase().trim()).replace("-", " "); + final String handledOsurname = StringUtils.stripAccents(oa.getSurname().toLowerCase().trim()) + .replace("-", " ").replace(".", ""); + final String handledSurname = StringUtils.stripAccents(ri.getSurname().toLowerCase().trim()) + .replace("-", " ").replace(".", ""); if (!handledOsurname .equalsIgnoreCase(handledSurname)) { if (!handledOsurname.contains(handledSurname) && !handledSurname.contains(handledOsurname)) { + //check if the words composing the name and the surname are the same or one list contains the other. + //do for words of lenght bigger than two + String handledOname = ""; + if(Optional.ofNullable(oa.getName()).isPresent()){ + handledOname = StringUtils.stripAccents(oa.getName().toLowerCase().trim()) + .replace("-", " ").replace(".", ""); + } + final String handledName = StringUtils.stripAccents(ri.getName().toLowerCase().trim()) + .replace("-", " ").replace(".", ""); + + final List orcidList = new ArrayList<>(); + final List paperList = new ArrayList<>(); + + addInList(orcidList, handledOname); + addInList(orcidList, handledOsurname); + + addInList(paperList, handledSurname); + addInList(paperList, handledName); + + if(orcidList.size()<= paperList.size()){ + if(searchIn(paperList, orcidList)){ + return new Tuple2<>("check", reportInfo); + } + }else{ + if(searchIn(orcidList, paperList)){ + return new Tuple2<>("check", reportInfo); + } + } + + //todo add another level of checking (use levenstein) return new Tuple2<>("wrong", reportInfo); } - return new Tuple2<>("check", reportInfo); + return new Tuple2<>("right", reportInfo); } return new Tuple2<>("right", reportInfo); @@ -167,6 +192,21 @@ public class MakeReportSparkJob implements Serializable { } + /** + * searches in list1 all the words of list 2 + * @param list1 the list where to search for the words + * @param list2 the list containing the words to be searched + * @return true if all the words in list 2 are contained in list1 + */ + private static boolean searchIn(List list1, List list2) { + for(String word:list2){ + if (!list1.contains(word)){ + return false; + } + } + return true; + } + private static void writeSet(Dataset> dataset, String outputPath){ dataset.groupByKey((MapFunction, String>) value -> value._2().getOid() , Encoders.STRING()) .mapGroups((MapGroupsFunction, ReportInfo>) (oid, tuple2Iterator) ->{