[stats wf] indicators across stats dbs & updates in the org ids #248
|
@ -4,38 +4,13 @@ package eu.dnetlib.dhp.oa.merge;
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.jetbrains.annotations.NotNull;
|
|
||||||
|
|
||||||
import com.wcohen.ss.JaroWinkler;
|
import com.wcohen.ss.JaroWinkler;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
import eu.dnetlib.pace.model.Person;
|
import eu.dnetlib.pace.model.Person;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
class SimilarityCellInfo implements Comparable<SimilarityCellInfo> {
|
|
||||||
|
|
||||||
public int authorPosition = 0;
|
|
||||||
public int orcidPosition = 0;
|
|
||||||
|
|
||||||
public double maxColumnSimilarity = 0.0;
|
|
||||||
|
|
||||||
public SimilarityCellInfo() {
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setValues(final int authPos, final int orcidPos, final double similarity) {
|
|
||||||
this.authorPosition = authPos;
|
|
||||||
this.orcidPosition = orcidPos;
|
|
||||||
this.maxColumnSimilarity = similarity;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int compareTo(@NotNull SimilarityCellInfo o) {
|
|
||||||
return Double.compare(maxColumnSimilarity, o.maxColumnSimilarity);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public class AuthorMerger {
|
public class AuthorMerger {
|
||||||
|
|
||||||
|
@ -183,6 +158,7 @@ public class AuthorMerger {
|
||||||
/**
|
/**
|
||||||
* This method tries to figure out when two authors are the same in the context
|
* This method tries to figure out when two authors are the same in the context
|
||||||
* of ORCID enrichment
|
* of ORCID enrichment
|
||||||
|
*
|
||||||
* @param left Author in the OAF entity
|
* @param left Author in the OAF entity
|
||||||
* @param right Author ORCID
|
* @param right Author ORCID
|
||||||
* @return based on a heuristic on the names of the authors if they are the same.
|
* @return based on a heuristic on the names of the authors if they are the same.
|
||||||
|
@ -238,6 +214,7 @@ public class AuthorMerger {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Method to enrich ORCID information in one list of authors based on another list
|
* Method to enrich ORCID information in one list of authors based on another list
|
||||||
|
*
|
||||||
* @param baseAuthor the Author List in the OAF Entity
|
* @param baseAuthor the Author List in the OAF Entity
|
||||||
* @param orcidAuthor The list of ORCID Author intersected
|
* @param orcidAuthor The list of ORCID Author intersected
|
||||||
* @return The Author List of the OAF Entity enriched with the orcid Author
|
* @return The Author List of the OAF Entity enriched with the orcid Author
|
||||||
|
|
|
@ -92,7 +92,6 @@ object SparkGenerateDoiBoost {
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.save(s"$workingDirPath/firstJoin")
|
.save(s"$workingDirPath/firstJoin")
|
||||||
|
|
||||||
|
|
||||||
logger.info("Phase 2) Join Result with MAG")
|
logger.info("Phase 2) Join Result with MAG")
|
||||||
val sj: Dataset[(String, Publication)] =
|
val sj: Dataset[(String, Publication)] =
|
||||||
spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p))
|
spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p))
|
||||||
|
|
|
@ -6,13 +6,10 @@ import org.junit.jupiter.api.Test
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
import org.apache.spark.sql.functions._
|
import org.apache.spark.sql.functions._
|
||||||
|
|
||||||
|
|
||||||
class EnrichOrcidTest {
|
class EnrichOrcidTest {
|
||||||
|
|
||||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def test() = {
|
def test() = {
|
||||||
val spark = SparkSession.builder().master("local[*]").getOrCreate()
|
val spark = SparkSession.builder().master("local[*]").getOrCreate()
|
||||||
// spark.sparkContext.setLogLevel("ERROR")
|
// spark.sparkContext.setLogLevel("ERROR")
|
||||||
|
@ -63,7 +60,6 @@ class EnrichOrcidTest {
|
||||||
// }).filter(author => author != null)
|
// }).filter(author => author != null)
|
||||||
// })
|
// })
|
||||||
|
|
||||||
|
|
||||||
Encoders
|
Encoders
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
|
@ -76,10 +72,6 @@ class EnrichOrcidTest {
|
||||||
//
|
//
|
||||||
// .show()
|
// .show()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue