ORCID Enrichment and Download #364

Merged
claudio.atzori merged 14 commits from orcid_import into beta 2023-12-01 15:05:45 +01:00
5 changed files with 268 additions and 300 deletions
Showing only changes of commit cdfb7588dd - Show all commits

View File

@ -4,38 +4,13 @@ package eu.dnetlib.dhp.oa.merge;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import com.wcohen.ss.JaroWinkler; import com.wcohen.ss.JaroWinkler;
import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.model.Person; import eu.dnetlib.pace.model.Person;
import scala.Tuple2; import scala.Tuple2;
class SimilarityCellInfo implements Comparable<SimilarityCellInfo> {
public int authorPosition = 0;
public int orcidPosition = 0;
public double maxColumnSimilarity = 0.0;
public SimilarityCellInfo() {
}
public void setValues(final int authPos, final int orcidPos, final double similarity) {
this.authorPosition = authPos;
this.orcidPosition = orcidPos;
this.maxColumnSimilarity = similarity;
}
@Override
public int compareTo(@NotNull SimilarityCellInfo o) {
return Double.compare(maxColumnSimilarity, o.maxColumnSimilarity);
}
}
public class AuthorMerger { public class AuthorMerger {
@ -183,6 +158,7 @@ public class AuthorMerger {
/** /**
* This method tries to figure out when two authors are the same in the context * This method tries to figure out when two authors are the same in the context
* of ORCID enrichment * of ORCID enrichment
*
* @param left Author in the OAF entity * @param left Author in the OAF entity
* @param right Author ORCID * @param right Author ORCID
* @return based on a heuristic on the names of the authors if they are the same. * @return based on a heuristic on the names of the authors if they are the same.
@ -238,6 +214,7 @@ public class AuthorMerger {
/** /**
* Method to enrich ORCID information in one list of authors based on another list * Method to enrich ORCID information in one list of authors based on another list
*
* @param baseAuthor the Author List in the OAF Entity * @param baseAuthor the Author List in the OAF Entity
* @param orcidAuthor The list of ORCID Author intersected * @param orcidAuthor The list of ORCID Author intersected
* @return The Author List of the OAF Entity enriched with the orcid Author * @return The Author List of the OAF Entity enriched with the orcid Author

View File

@ -92,7 +92,6 @@ object SparkGenerateDoiBoost {
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.save(s"$workingDirPath/firstJoin") .save(s"$workingDirPath/firstJoin")
logger.info("Phase 2) Join Result with MAG") logger.info("Phase 2) Join Result with MAG")
val sj: Dataset[(String, Publication)] = val sj: Dataset[(String, Publication)] =
spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p)) spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p))

View File

@ -6,13 +6,10 @@ import org.junit.jupiter.api.Test
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import org.apache.spark.sql.functions._ import org.apache.spark.sql.functions._
class EnrichOrcidTest { class EnrichOrcidTest {
val log: Logger = LoggerFactory.getLogger(getClass) val log: Logger = LoggerFactory.getLogger(getClass)
def test() = { def test() = {
val spark = SparkSession.builder().master("local[*]").getOrCreate() val spark = SparkSession.builder().master("local[*]").getOrCreate()
// spark.sparkContext.setLogLevel("ERROR") // spark.sparkContext.setLogLevel("ERROR")
@ -63,7 +60,6 @@ class EnrichOrcidTest {
// }).filter(author => author != null) // }).filter(author => author != null)
// }) // })
Encoders Encoders
import spark.implicits._ import spark.implicits._
@ -76,10 +72,6 @@ class EnrichOrcidTest {
// //
// .show() // .show()
} }
} }