Added one more level of checking: compare all the words of the name and surname in the ORCID record with those in the paper

This commit is contained in:
Miriam Baglioni 2020-11-04 18:30:09 +01:00
parent 44cf0b712f
commit fff512a87a
1 changed files with 60 additions and 20 deletions

View File

@ -95,23 +95,16 @@ public class MakeReportSparkJob implements Serializable {
* It removes the ORCID association from authors whose name and surname do not match those in the ORCID sequence file.
* First step: the authoritative information is joined with the prepared info on the ORCID id. If the name and surname of the author do not match
* (or the surname and the first letter of the name), null is returned. If they match, the resultInfo instance is returned.
* Second step: the returned result infos are grouped by key (the key is the id of the result), and a new result is returned with as many authors as those
* having a correct association with the ORCID.
* Third step: the result is joined with the authoritative authors result on the result id.
* Each author in the original result with a pid of type orcid is matched against the list of the authoritative authors. If the pid of the author
* in the original result is among those in the authoritative result, then the pid is retained; otherwise it is removed.
* @param spark the Spark session
* @param inputPath the path where to find the result to emend
* @param entityClazz the class of the result considered
* @param outputPath the path where to write the emended result
* @param preparedInfoPath the path where to find the selected information for the result
* @param authoritative the authoritative association orcid name surname
* @param <I>
private static void addInList(List<String> list, String to_add){
for (String word: to_add.split(" ")){
if(word.length() >= 2){
private static <I extends Result> void makeReport(SparkSession spark, String inputPath, Class<I> entityClazz,
String outputPath, String preparedInfoPath,
@ -145,14 +138,46 @@ public class MakeReportSparkJob implements Serializable {
return new Tuple2<>("missing", reportInfo);
final String handledOsurname = StringUtils.stripAccents(oa.getSurname().toLowerCase().trim()).replace("-", " ");
final String handledSurname = StringUtils.stripAccents(ri.getSurname().toLowerCase().trim()).replace("-", " ");
final String handledOsurname = StringUtils.stripAccents(oa.getSurname().toLowerCase().trim())
.replace("-", " ").replace(".", "");
final String handledSurname = StringUtils.stripAccents(ri.getSurname().toLowerCase().trim())
.replace("-", " ").replace(".", "");
if (!handledOsurname
.equalsIgnoreCase(handledSurname)) {
if (!handledOsurname.contains(handledSurname) && !handledSurname.contains(handledOsurname)) {
//check whether the words composing the name and the surname are the same, or whether one list contains the other.
//only words of length at least two are considered
String handledOname = "";
handledOname = StringUtils.stripAccents(oa.getName().toLowerCase().trim())
.replace("-", " ").replace(".", "");
final String handledName = StringUtils.stripAccents(ri.getName().toLowerCase().trim())
.replace("-", " ").replace(".", "");
final List<String> orcidList = new ArrayList<>();
final List<String> paperList = new ArrayList<>();
addInList(orcidList, handledOname);
addInList(orcidList, handledOsurname);
addInList(paperList, handledSurname);
addInList(paperList, handledName);
if(orcidList.size()<= paperList.size()){
if(searchIn(paperList, orcidList)){
return new Tuple2<>("check", reportInfo);
if(searchIn(orcidList, paperList)){
return new Tuple2<>("check", reportInfo);
//TODO add another level of checking (use Levenshtein distance)
return new Tuple2<>("wrong", reportInfo);
return new Tuple2<>("check", reportInfo);
return new Tuple2<>("right", reportInfo);
return new Tuple2<>("right", reportInfo);
@ -167,6 +192,21 @@ public class MakeReportSparkJob implements Serializable {
* searches in list1 all the words of list 2
* @param list1 the list where to search for the words
* @param list2 the list containing the words to be searched
* @return true if all the words in list 2 are contained in list1
private static boolean searchIn(List<String> list1, List<String> list2) {
for(String word:list2){
if (!list1.contains(word)){
return false;
return true;
private static void writeSet(Dataset<Tuple2<String, ReportInfo>> dataset, String outputPath){
dataset.groupByKey((MapFunction<Tuple2<String,ReportInfo>, String>) value -> value._2().getOid() , Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Tuple2<String, ReportInfo>, ReportInfo>) (oid, tuple2Iterator) ->{