forked from D-Net/dnet-hadoop
added one level of checking (search all the words of name surname in orcid and in paper)
This commit is contained in:
parent
44cf0b712f
commit
fff512a87a
|
@ -95,23 +95,16 @@ public class MakeReportSparkJob implements Serializable {
|
|||
|
||||
}
|
||||
|
||||
/**
|
||||
* It cleans the association with the orcid to authors whose name and surname do not match the one in the orcid sequence file
|
||||
* First step the join between the authoritative information and the prepared Info on orcid id. If the name surname of the authors do not match
|
||||
* (or the surname and the first letter of the name) null is returned. If they match the resultInfo instance is returned.
|
||||
* Second step the result info returned are grouped by key (the key is the id of the result) and a new result with as many authors as those
|
||||
* having a correct association with the orcid is returned.
|
||||
* Third step a join between the result and the authoritative authors result is done on the result id.
|
||||
* Each author in the original result with a pid of type orcid is matched against the list of the authoritative authors. If the author
|
||||
* in the original result has his pid among those in the authoritative result then the pid is retained, otherwise it is removed
|
||||
* @param spark the spark session
|
||||
* @param inputPath the path where to find the result to emend
|
||||
* @param entityClazz the class of the result considered
|
||||
* @param outputPath the path where to write the emended result
|
||||
* @param preparedInfoPath the path where to find the selected information for the result
|
||||
* @param authoritative the authoritative association orcid name surname
|
||||
* @param <I>
|
||||
*/
|
||||
|
||||
private static void addInList(List<String> list, String to_add){
|
||||
for (String word: to_add.split(" ")){
|
||||
if(word.length() >= 2){
|
||||
list.add(word);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
private static <I extends Result> void makeReport(SparkSession spark, String inputPath, Class<I> entityClazz,
|
||||
String outputPath, String preparedInfoPath,
|
||||
|
@ -145,14 +138,46 @@ public class MakeReportSparkJob implements Serializable {
|
|||
if(!Optional.ofNullable(oa.getSurname()).isPresent()){
|
||||
return new Tuple2<>("missing", reportInfo);
|
||||
}
|
||||
final String handledOsurname = StringUtils.stripAccents(oa.getSurname().toLowerCase().trim()).replace("-", " ");
|
||||
final String handledSurname = StringUtils.stripAccents(ri.getSurname().toLowerCase().trim()).replace("-", " ");
|
||||
final String handledOsurname = StringUtils.stripAccents(oa.getSurname().toLowerCase().trim())
|
||||
.replace("-", " ").replace(".", "");
|
||||
final String handledSurname = StringUtils.stripAccents(ri.getSurname().toLowerCase().trim())
|
||||
.replace("-", " ").replace(".", "");
|
||||
if (!handledOsurname
|
||||
.equalsIgnoreCase(handledSurname)) {
|
||||
if (!handledOsurname.contains(handledSurname) && !handledSurname.contains(handledOsurname)) {
|
||||
//check if the words composing the name and the surname are the same or one list contains the other.
|
||||
//only words with a length of at least two characters are considered
|
||||
String handledOname = "";
|
||||
if(Optional.ofNullable(oa.getName()).isPresent()){
|
||||
handledOname = StringUtils.stripAccents(oa.getName().toLowerCase().trim())
|
||||
.replace("-", " ").replace(".", "");
|
||||
}
|
||||
final String handledName = StringUtils.stripAccents(ri.getName().toLowerCase().trim())
|
||||
.replace("-", " ").replace(".", "");
|
||||
|
||||
final List<String> orcidList = new ArrayList<>();
|
||||
final List<String> paperList = new ArrayList<>();
|
||||
|
||||
addInList(orcidList, handledOname);
|
||||
addInList(orcidList, handledOsurname);
|
||||
|
||||
addInList(paperList, handledSurname);
|
||||
addInList(paperList, handledName);
|
||||
|
||||
if(orcidList.size()<= paperList.size()){
|
||||
if(searchIn(paperList, orcidList)){
|
||||
return new Tuple2<>("check", reportInfo);
|
||||
}
|
||||
}else{
|
||||
if(searchIn(orcidList, paperList)){
|
||||
return new Tuple2<>("check", reportInfo);
|
||||
}
|
||||
}
|
||||
|
||||
//todo add another level of checking (use Levenshtein distance)
|
||||
return new Tuple2<>("wrong", reportInfo);
|
||||
}
|
||||
return new Tuple2<>("check", reportInfo);
|
||||
return new Tuple2<>("right", reportInfo);
|
||||
}
|
||||
|
||||
return new Tuple2<>("right", reportInfo);
|
||||
|
@ -167,6 +192,21 @@ public class MakeReportSparkJob implements Serializable {
|
|||
|
||||
}
|
||||
|
||||
/**
|
||||
* searches in list1 all the words of list 2
|
||||
* @param list1 the list where to search for the words
|
||||
* @param list2 the list containing the words to be searched
|
||||
* @return true if all the words in list 2 are contained in list1
|
||||
*/
|
||||
private static boolean searchIn(List<String> list1, List<String> list2) {
|
||||
for(String word:list2){
|
||||
if (!list1.contains(word)){
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private static void writeSet(Dataset<Tuple2<String, ReportInfo>> dataset, String outputPath){
|
||||
dataset.groupByKey((MapFunction<Tuple2<String,ReportInfo>, String>) value -> value._2().getOid() , Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, Tuple2<String, ReportInfo>, ReportInfo>) (oid, tuple2Iterator) ->{
|
||||
|
|
Loading…
Reference in New Issue