// File: dnet-and/dnet-feature-extraction/src/main/java/eu/dnetlib/support/AuthorsFactory.java
// (126 lines, 4.9 KiB, Java)
package eu.dnetlib.support;
import com.clearspring.analytics.util.Lists;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.featureextraction.Utilities;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;
import java.util.stream.Collectors;
import javax.rmi.CORBA.Util;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.linalg.DenseVector;
import org.codehaus.jackson.map.DeserializationConfig;
import org.codehaus.jackson.map.ObjectMapper;
import scala.Tuple2;
/**
 * Static factory methods that turn publications (plus their topic embeddings)
 * into {@link Author} records, and helpers to derive deterministic author ids
 * and LNFI blocking keys.
 */
public class AuthorsFactory {

    /**
     * Joins publications with their topic embeddings (keyed by publication id)
     * and flattens the result into one {@link Author} per (author, publication) pair.
     *
     * @param publications publications to extract authors from
     * @param topics       pair RDD keyed by publication id, holding per-topic embedding vectors
     * @return RDD of authors enriched with coauthors, orcid, raw id and embeddings
     */
    public static JavaRDD<Author> extractAuthorsFromPublications(JavaRDD<Publication> publications, JavaPairRDD<String, Map<String, double[]>> topics) {
        // key publications by id so they can be joined with the topic embeddings
        JavaPairRDD<Publication, Map<String, double[]>> publicationWithEmbeddings = publications
                .mapToPair(p -> new Tuple2<>(p.getId(), p))
                .join(topics)
                .mapToPair(Tuple2::_2);

        return publicationWithEmbeddings.flatMap(AuthorsFactory::createAuthors);
    }

    /**
     * Creates one {@link Author} per author of the publication, attaching:
     * the coauthor list (all other authors of the same publication), the first
     * PID value as orcid (empty string when absent), a deterministic raw id
     * derived from fullname + publication id, and the topic embeddings.
     *
     * @param publicationWithEmbeddings publication paired with its topic embeddings
     * @return iterator over the created authors
     */
    public static Iterator<Author> createAuthors(Tuple2<Publication, Map<String, double[]>> publicationWithEmbeddings) {
        List<CoAuthor> baseCoAuthors = publicationWithEmbeddings._1()
                .getAuthor()
                .stream()
                .map(AuthorsFactory::asCoAuthor)
                .collect(Collectors.toList());

        List<Author> authors = new ArrayList<>();
        for (eu.dnetlib.dhp.schema.oaf.Author a : publicationWithEmbeddings._1().getAuthor()) {
            // first PID is treated as the orcid; empty string when no PID is available
            String orcid = firstPid(a);

            // coauthors = every author of the publication except the current one
            List<CoAuthor> coAuthors = Lists.newArrayList(baseCoAuthors);
            coAuthors.remove(asCoAuthor(a));

            // deterministic raw id: same (fullname, publication id) always yields the same id
            String id = "author::" + getMd5(a.getFullname().concat(publicationWithEmbeddings._1().getId()));

            authors.add(new Author(
                    a.getFullname(),
                    a.getName(),
                    a.getSurname(),
                    coAuthors,
                    orcid,
                    id,
                    publicationWithEmbeddings._2(),
                    publicationWithEmbeddings._1().getId())
            );
        }
        return authors.iterator();
    }

    // Converts an OAF author into a CoAuthor, substituting empty strings for missing
    // fields. Extracted: the original duplicated this expression in two places.
    private static CoAuthor asCoAuthor(eu.dnetlib.dhp.schema.oaf.Author a) {
        return new CoAuthor(
                a.getFullname(),
                a.getName() != null ? a.getName() : "",
                a.getSurname() != null ? a.getSurname() : "",
                firstPid(a));
    }

    // Returns the value of the author's first PID, or "" when the author has none.
    // Also guards against a null PID list, which the original code assumed non-null.
    private static String firstPid(eu.dnetlib.dhp.schema.oaf.Author a) {
        return (a.getPid() != null && !a.getPid().isEmpty()) ? a.getPid().get(0).getValue() : "";
    }

    /**
     * Computes the MD5 digest of the input as a 32-character, zero-padded,
     * lowercase hex string.
     *
     * @param input string to hash (encoded as UTF-8)
     * @return 32-character hexadecimal MD5 digest
     * @throws RuntimeException wrapping {@link NoSuchAlgorithmException} if MD5 is
     *                          unavailable (never on compliant JVMs — MD5 is mandated by the JCA spec)
     */
    public static String getMd5(String input) {
        try {
            MessageDigest md = MessageDigest.getInstance("MD5");
            // Fix: encode with an explicit charset — the no-arg getBytes() uses the
            // platform default, which would make the "deterministic" author ids
            // differ across machines for non-ASCII names.
            byte[] messageDigest = md.digest(input.getBytes(StandardCharsets.UTF_8));
            // Zero-padded hex of the positive-signum BigInteger built from the digest
            // (replaces the original manual while-loop padding; output is identical).
            return String.format("%032x", new BigInteger(1, messageDigest));
        } catch (NoSuchAlgorithmException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Builds LNFI (first initial + last name) keys for an author.
     *
     * <p>When the author is "accurate" (has separate name/surname), a single
     * key is produced; otherwise keys are derived from the normalized fullname
     * tokens, trying both orderings since the name order is unknown.
     *
     * @param a the author to derive keys for
     * @return list of lowercase, space-free LNFI keys
     */
    public static List<String> getLNFI(Author a) {
        final List<String> res = Lists.newArrayList();
        if (a.isAccurate()) {
            String lastName = Utilities.normalize(a.getLastname());
            String firstName = Utilities.normalize(a.getFirstname());
            String firstInitial = firstName.length() > 0 ? firstName.substring(0, 1) : "";
            res.add(firstInitial.concat(lastName));
        } else { // not accurate: no separate name/surname, derive keys from the fullname
            // NOTE(review): split(" ") can yield empty tokens if normalize() leaves
            // consecutive spaces; substring(0, 1) would then throw — presumably
            // normalize() collapses whitespace; confirm against Utilities.
            List<String> fullname = Arrays.asList(Utilities.normalize(a.getFullname()).split(" "));
            if (fullname.size() == 1) {
                res.add(Utilities.normalize(a.getFullname()).toLowerCase());
            } else if (fullname.size() == 2) {
                // two tokens: key for each (initial of one + the other) ordering
                res.add(fullname.get(0).substring(0, 1).concat(fullname.get(1)).toLowerCase());
                res.add(fullname.get(1).substring(0, 1).concat(fullname.get(0)).toLowerCase());
            } else {
                // three or more tokens: combine first and last token in both orderings
                res.add(fullname.get(0).substring(0, 1).concat(fullname.get(fullname.size() - 1)).toLowerCase());
                res.add(fullname.get(fullname.size() - 1).substring(0, 1).concat(fullname.get(0)).toLowerCase());
            }
        }
        // strip any residual spaces from the keys
        return res.stream().map(k -> k.replaceAll(" ", "")).collect(Collectors.toList());
    }
}