// File: dnet-and/dnet-feature-extraction/src/main/java/eu/dnetlib/support/AuthorsFactory.java
// (126 lines, 4.9 KiB, Java)
package eu.dnetlib.support;
import com.clearspring.analytics.util.Lists;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.featureextraction.Utilities;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;
import java.util.stream.Collectors;
import javax.rmi.CORBA.Util;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.linalg.DenseVector;
import org.codehaus.jackson.map.DeserializationConfig;
import org.codehaus.jackson.map.ObjectMapper;
import scala.Tuple2;
/**
 * Static factory methods that turn publications (plus their topic embeddings)
 * into {@link Author} records, and helpers to derive deterministic author ids
 * and LNFI blocking keys.
 */
public class AuthorsFactory {

    /**
     * Joins publications with their topic embeddings (keyed by publication id)
     * and flattens the result into one {@link Author} per (author, publication) pair.
     *
     * @param publications publications to extract authors from
     * @param topics       pair RDD keyed by publication id, holding per-topic embedding vectors
     * @return RDD of authors enriched with coauthors, orcid, raw id and embeddings
     */
    public static JavaRDD<Author> extractAuthorsFromPublications(JavaRDD<Publication> publications, JavaPairRDD<String, Map<String, double[]>> topics) {
        // key publications by id so they can be joined with the topic embeddings
        JavaPairRDD<Publication, Map<String, double[]>> publicationWithEmbeddings = publications
                .mapToPair(p -> new Tuple2<>(p.getId(), p))
                .join(topics)
                .mapToPair(Tuple2::_2);

        return publicationWithEmbeddings.flatMap(AuthorsFactory::createAuthors);
    }

    /**
     * Creates one {@link Author} per author of the publication, attaching:
     * the coauthor list (all other authors of the same publication), the first
     * PID value as orcid (empty string when absent), a deterministic raw id
     * derived from fullname + publication id, and the topic embeddings.
     *
     * @param publicationWithEmbeddings publication paired with its topic embeddings
     * @return iterator over the created authors
     */
    public static Iterator<Author> createAuthors(Tuple2<Publication, Map<String, double[]>> publicationWithEmbeddings) {
        List<CoAuthor> baseCoAuthors = publicationWithEmbeddings._1()
                .getAuthor()
                .stream()
                .map(AuthorsFactory::asCoAuthor)
                .collect(Collectors.toList());

        List<Author> authors = new ArrayList<>();
        for (eu.dnetlib.dhp.schema.oaf.Author a : publicationWithEmbeddings._1().getAuthor()) {
            // first PID is treated as the orcid; empty string when no PID is available
            String orcid = firstPid(a);

            // coauthors = every author of the publication except the current one
            List<CoAuthor> coAuthors = Lists.newArrayList(baseCoAuthors);
            coAuthors.remove(asCoAuthor(a));

            // deterministic raw id: same (fullname, publication id) always yields the same id
            String id = "author::" + getMd5(a.getFullname().concat(publicationWithEmbeddings._1().getId()));

            authors.add(new Author(
                    a.getFullname(),
                    a.getName(),
                    a.getSurname(),
                    coAuthors,
                    orcid,
                    id,
                    publicationWithEmbeddings._2(),
                    publicationWithEmbeddings._1().getId())
            );
        }
        return authors.iterator();
    }

    // Converts an OAF author into a CoAuthor, substituting empty strings for missing
    // fields. Extracted: the original duplicated this expression in two places.
    private static CoAuthor asCoAuthor(eu.dnetlib.dhp.schema.oaf.Author a) {
        return new CoAuthor(
                a.getFullname(),
                a.getName() != null ? a.getName() : "",
                a.getSurname() != null ? a.getSurname() : "",
                firstPid(a));
    }

    // Returns the value of the author's first PID, or "" when the author has none.
    // Also guards against a null PID list, which the original code assumed non-null.
    private static String firstPid(eu.dnetlib.dhp.schema.oaf.Author a) {
        return (a.getPid() != null && !a.getPid().isEmpty()) ? a.getPid().get(0).getValue() : "";
    }

    /**
     * Computes the MD5 digest of the input as a 32-character, zero-padded,
     * lowercase hex string.
     *
     * @param input string to hash (encoded as UTF-8)
     * @return 32-character hexadecimal MD5 digest
     * @throws RuntimeException wrapping {@link NoSuchAlgorithmException} if MD5 is
     *                          unavailable (never on compliant JVMs — MD5 is mandated by the JCA spec)
     */
    public static String getMd5(String input) {
        try {
            MessageDigest md = MessageDigest.getInstance("MD5");
            // Fix: encode with an explicit charset — the no-arg getBytes() uses the
            // platform default, which would make the "deterministic" author ids
            // differ across machines for non-ASCII names.
            byte[] messageDigest = md.digest(input.getBytes(StandardCharsets.UTF_8));
            // Zero-padded hex of the positive-signum BigInteger built from the digest
            // (replaces the original manual while-loop padding; output is identical).
            return String.format("%032x", new BigInteger(1, messageDigest));
        } catch (NoSuchAlgorithmException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Builds LNFI (first initial + last name) keys for an author.
     *
     * <p>When the author is "accurate" (has separate name/surname), a single
     * key is produced; otherwise keys are derived from the normalized fullname
     * tokens, trying both orderings since the name order is unknown.
     *
     * @param a the author to derive keys for
     * @return list of lowercase, space-free LNFI keys
     */
    public static List<String> getLNFI(Author a) {
        final List<String> res = Lists.newArrayList();
        if (a.isAccurate()) {
            String lastName = Utilities.normalize(a.getLastname());
            String firstName = Utilities.normalize(a.getFirstname());
            String firstInitial = firstName.length() > 0 ? firstName.substring(0, 1) : "";
            res.add(firstInitial.concat(lastName));
        } else { // not accurate: no separate name/surname, derive keys from the fullname
            // NOTE(review): split(" ") can yield empty tokens if normalize() leaves
            // consecutive spaces; substring(0, 1) would then throw — presumably
            // normalize() collapses whitespace; confirm against Utilities.
            List<String> fullname = Arrays.asList(Utilities.normalize(a.getFullname()).split(" "));
            if (fullname.size() == 1) {
                res.add(Utilities.normalize(a.getFullname()).toLowerCase());
            } else if (fullname.size() == 2) {
                // two tokens: key for each (initial of one + the other) ordering
                res.add(fullname.get(0).substring(0, 1).concat(fullname.get(1)).toLowerCase());
                res.add(fullname.get(1).substring(0, 1).concat(fullname.get(0)).toLowerCase());
            } else {
                // three or more tokens: combine first and last token in both orderings
                res.add(fullname.get(0).substring(0, 1).concat(fullname.get(fullname.size() - 1)).toLowerCase());
                res.add(fullname.get(fullname.size() - 1).substring(0, 1).concat(fullname.get(0)).toLowerCase());
            }
        }
        // strip any residual spaces from the keys
        return res.stream().map(k -> k.replaceAll(" ", "")).collect(Collectors.toList());
    }
}