added one level of checking (search all the words of name surname in orcid and in paper)

2020-11-04 18:30:09 +01:00 · 2020-11-04 18:30:09 +01:00 · fff512a87a
parent 44cf0b712f
commit fff512a87a
1 changed files with 60 additions and 20 deletions
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/authorpids/MakeReportSparkJob.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/authorpids/MakeReportSparkJob.java
@ -95,23 +95,16 @@ public class MakeReportSparkJob implements Serializable {

    }

-    /**
-     * It cleans the association with the orcid to authors whose name and surname do not match the one in the orcid sequence file
-     * First step the join between the authoritative information and the prepared Info on orcid id. If the name surname of the authors do not match
-     * (or the surname and the first letter of the name) null is returned. If they match the resultInfo instance is returned.
-     * Second step the result info returned are grouped by key (the key is the id of the result) and a new result with as many authors as those
-     * having a correct association with the orcid is returned.
-     * Third step a join between the result and the authoritavide authors result is done on the result id.
-     * Each author in the original result with a pid of type orcid is match against the list of the authoritative authors. If the author
-     * in the original result has his pid among those in the authoritative result than the pid is retained, else is removed
-     * @param spark the stpark session
-     * @param inputPath the path where to find the result to emend
-     * @param entityClazz the class of the result considered
-     * @param outputPath the path where to write the emended result
-     * @param preparedInfoPath the path where to find the selected information for the result
-     * @param authoritative the authoritative association orcid name surname
-     * @param <I>
-     */
+
+    /**
+     * Splits {@code to_add} on single space characters and appends to {@code list}
+     * every resulting token that is at least two characters long. Shorter tokens
+     * (including the empty strings produced by consecutive spaces) are dropped.
+     *
+     * @param list the list that receives the retained tokens
+     * @param to_add the space separated string to tokenize
+     */
+    private static void addInList(List<String> list, String to_add){
+        final String[] tokens = to_add.split(" ");
+        for (int i = 0; i < tokens.length; i++) {
+            final String token = tokens[i];
+            if (token.length() < 2) {
+                // too short to be retained: skip it
+                continue;
+            }
+            list.add(token);
+        }
+    }

    private static <I extends Result> void makeReport(SparkSession spark, String inputPath, Class<I> entityClazz,
                                                      String outputPath, String preparedInfoPath,
@ -145,14 +138,46 @@ public class MakeReportSparkJob implements Serializable {
                    if(!Optional.ofNullable(oa.getSurname()).isPresent()){
                        return new Tuple2<>("missing", reportInfo);
                    }
-                    final String handledOsurname = StringUtils.stripAccents(oa.getSurname().toLowerCase().trim()).replace("-", " ");
-                    final String handledSurname = StringUtils.stripAccents(ri.getSurname().toLowerCase().trim()).replace("-", " ");
+                    final String handledOsurname = StringUtils.stripAccents(oa.getSurname().toLowerCase().trim())
+                            .replace("-", " ").replace(".", "");
+                    final String handledSurname = StringUtils.stripAccents(ri.getSurname().toLowerCase().trim())
+                            .replace("-", " ").replace(".", "");
                    if (!handledOsurname
                            .equalsIgnoreCase(handledSurname)) {
                        if (!handledOsurname.contains(handledSurname) && !handledSurname.contains(handledOsurname)) {
+                            // check whether the words composing the name and the surname are the same, or whether one list contains the other.
+                            // only words of at least two characters are considered
+                            String handledOname = "";
+                            if(Optional.ofNullable(oa.getName()).isPresent()){
+                                handledOname = StringUtils.stripAccents(oa.getName().toLowerCase().trim())
+                                        .replace("-", " ").replace(".", "");
+                            }
+                            final String handledName = StringUtils.stripAccents(ri.getName().toLowerCase().trim())
+                                    .replace("-", " ").replace(".", "");
+
+                            final List<String> orcidList = new ArrayList<>();
+                            final List<String> paperList = new ArrayList<>();
+
+                            addInList(orcidList, handledOname);
+                            addInList(orcidList, handledOsurname);
+
+                            addInList(paperList, handledSurname);
+                            addInList(paperList, handledName);
+
+                            if(orcidList.size()<= paperList.size()){
+                                if(searchIn(paperList, orcidList)){
+                                    return new Tuple2<>("check", reportInfo);
+                                }
+                            }else{
+                                if(searchIn(orcidList, paperList)){
+                                    return new Tuple2<>("check", reportInfo);
+                                }
+                            }
+
+                            // TODO: add another level of checking (use Levenshtein distance)
                            return new Tuple2<>("wrong", reportInfo);
                        }
-                        return new Tuple2<>("check", reportInfo);
+                        return new Tuple2<>("right", reportInfo);
                    }

                    return new Tuple2<>("right", reportInfo);
@ -167,6 +192,21 @@ public class MakeReportSparkJob implements Serializable {

    }

+    /**
+     * Checks whether every word of {@code list2} also occurs in {@code list1}.
+     * An empty {@code list2} yields {@code true}.
+     *
+     * @param list1 the list in which the words are looked up
+     * @param list2 the list holding the words to look for
+     * @return {@code true} if each word of {@code list2} is contained in {@code list1},
+     *         {@code false} as soon as one of them is missing
+     */
+    private static boolean searchIn(List<String> list1, List<String> list2) {
+        // allMatch stops at the first word that is not found in list1
+        return list2.stream().allMatch(candidate -> list1.contains(candidate));
+    }
+
+
    private static void writeSet(Dataset<Tuple2<String, ReportInfo>> dataset, String outputPath){
        dataset.groupByKey((MapFunction<Tuple2<String,ReportInfo>, String>) value -> value._2().getOid() , Encoders.STRING())
                .mapGroups((MapGroupsFunction<String, Tuple2<String, ReportInfo>, ReportInfo>) (oid, tuple2Iterator) ->{