126 lines
4.9 KiB
Java
126 lines
4.9 KiB
Java
package eu.dnetlib.support;
|
|
|
|
import com.clearspring.analytics.util.Lists;
|
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
|
import eu.dnetlib.featureextraction.Utilities;
|
|
import org.apache.spark.api.java.JavaPairRDD;
|
|
import org.apache.spark.api.java.JavaRDD;
|
|
import org.apache.spark.ml.linalg.DenseVector;
|
|
import org.codehaus.jackson.map.DeserializationConfig;
|
|
import org.codehaus.jackson.map.ObjectMapper;
|
|
import scala.Tuple2;
|
|
|
|
import javax.rmi.CORBA.Util;
|
|
import java.math.BigInteger;
|
|
import java.security.MessageDigest;
|
|
import java.security.NoSuchAlgorithmException;
|
|
import java.util.*;
|
|
import java.util.stream.Collectors;
|
|
|
|
public class AuthorsFactory {
|
|
|
|
public static JavaRDD<Author> extractAuthorsFromPublications(JavaRDD<Publication> publications, JavaPairRDD<String, Map<String, double[]>> topics) {
|
|
|
|
//read topics
|
|
JavaPairRDD<Publication, Map<String, double[]>> publicationWithEmbeddings = publications
|
|
.mapToPair(p -> new Tuple2<>(p.getId(), p))
|
|
.join(topics)
|
|
.mapToPair(Tuple2::_2);
|
|
|
|
return publicationWithEmbeddings.flatMap(AuthorsFactory::createAuthors);
|
|
|
|
}
|
|
|
|
public static Iterator<Author> createAuthors(Tuple2<Publication, Map<String, double[]>> publicationWithEmbeddings){
|
|
List<CoAuthor> baseCoAuthors = publicationWithEmbeddings._1()
|
|
.getAuthor()
|
|
.stream()
|
|
.map(a -> new CoAuthor(a.getFullname(), a.getName()!=null?a.getName():"", a.getSurname()!=null?a.getSurname():"", a.getPid().size()>0? a.getPid().get(0).getValue():""))
|
|
.collect(Collectors.toList());
|
|
|
|
List<Author> authors = new ArrayList<>();
|
|
for(eu.dnetlib.dhp.schema.oaf.Author a : publicationWithEmbeddings._1().getAuthor()) {
|
|
|
|
//prepare orcid
|
|
String orcid = a.getPid().size()>0? a.getPid().get(0).getValue() : "";
|
|
//prepare coauthors
|
|
List<CoAuthor> coAuthors = Lists.newArrayList(baseCoAuthors);
|
|
coAuthors.remove(new CoAuthor(a.getFullname(), a.getName() != null ? a.getName() : "", a.getSurname() != null ? a.getSurname() : "", a.getPid().size() > 0 ? a.getPid().get(0).getValue() : ""));
|
|
|
|
//prepare raw author id
|
|
String id = "author::" + getMd5(a.getFullname().concat(publicationWithEmbeddings._1().getId()));
|
|
|
|
//prepare embeddings
|
|
authors.add(new Author(
|
|
a.getFullname(),
|
|
a.getName(),
|
|
a.getSurname(),
|
|
coAuthors,
|
|
orcid,
|
|
id,
|
|
publicationWithEmbeddings._2(),
|
|
publicationWithEmbeddings._1().getId())
|
|
);
|
|
}
|
|
|
|
return authors.iterator();
|
|
}
|
|
|
|
public static String getMd5(String input)
|
|
{
|
|
try {
|
|
|
|
// Static getInstance method is called with hashing MD5
|
|
MessageDigest md = MessageDigest.getInstance("MD5");
|
|
|
|
// digest() method is called to calculate message digest
|
|
// of an input digest() return array of byte
|
|
byte[] messageDigest = md.digest(input.getBytes());
|
|
|
|
// Convert byte array into signum representation
|
|
BigInteger no = new BigInteger(1, messageDigest);
|
|
|
|
// Convert message digest into hex value
|
|
String hashtext = no.toString(16);
|
|
while (hashtext.length() < 32) {
|
|
hashtext = "0" + hashtext;
|
|
}
|
|
return hashtext;
|
|
}
|
|
|
|
// For specifying wrong message digest algorithms
|
|
catch (NoSuchAlgorithmException e) {
|
|
throw new RuntimeException(e);
|
|
}
|
|
}
|
|
|
|
public static List<String> getLNFI(Author a) {
|
|
final List<String> res = Lists.newArrayList();
|
|
|
|
if (a.isAccurate()) {
|
|
String lastName = Utilities.normalize(a.getLastname());
|
|
String firstName = Utilities.normalize(a.getFirstname());
|
|
String firstInitial = firstName.length()>0? firstName.substring(0,1) : "";
|
|
|
|
res.add(firstInitial.concat(lastName));
|
|
}
|
|
else { // is not accurate, meaning it has no defined name and surname
|
|
List<String> fullname = Arrays.asList(Utilities.normalize(a.getFullname()).split(" "));
|
|
if (fullname.size() == 1) {
|
|
res.add(Utilities.normalize(a.getFullname()).toLowerCase());
|
|
}
|
|
else if (fullname.size() == 2) {
|
|
res.add(fullname.get(0).substring(0,1).concat(fullname.get(1)).toLowerCase());
|
|
res.add(fullname.get(1).substring(0,1).concat(fullname.get(0)).toLowerCase());
|
|
}
|
|
else {
|
|
res.add(fullname.get(0).substring(0,1).concat(fullname.get(fullname.size()-1)).toLowerCase());
|
|
res.add(fullname.get(fullname.size()-1).substring(0,1).concat(fullname.get(0)).toLowerCase());
|
|
}
|
|
}
|
|
|
|
return res.stream().map(k -> k.replaceAll(" ","")).collect(Collectors.toList());
|
|
}
|
|
|
|
}
|