forked from antonis.lempesis/dnet-hadoop
formatted code
This commit is contained in:
parent
b78d2b71f0
commit
57e2c4b749
|
@ -3,8 +3,7 @@ package eu.dnetlib.dhp.datacite
|
||||||
import org.json4s.jackson.JsonMethods.{compact, parse, render}
|
import org.json4s.jackson.JsonMethods.{compact, parse, render}
|
||||||
import org.json4s.{DefaultFormats, JValue}
|
import org.json4s.{DefaultFormats, JValue}
|
||||||
|
|
||||||
class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until: Long = -1)
|
class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until: Long = -1) extends AbstractRestClient {
|
||||||
extends AbstractRestClient {
|
|
||||||
|
|
||||||
override def extractInfo(input: String): Unit = {
|
override def extractInfo(input: String): Unit = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
|
|
@ -327,9 +327,7 @@ object DataciteToOAFTransformation {
|
||||||
a.setFullname(c.name.orNull)
|
a.setFullname(c.name.orNull)
|
||||||
a.setName(c.givenName.orNull)
|
a.setName(c.givenName.orNull)
|
||||||
a.setSurname(c.familyName.orNull)
|
a.setSurname(c.familyName.orNull)
|
||||||
if (
|
if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
|
||||||
c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null
|
|
||||||
) {
|
|
||||||
a.setPid(
|
a.setPid(
|
||||||
c.nameIdentifiers.get
|
c.nameIdentifiers.get
|
||||||
.map(ni => {
|
.map(ni => {
|
||||||
|
@ -395,9 +393,7 @@ object DataciteToOAFTransformation {
|
||||||
.find(d => d.dateType.get.equalsIgnoreCase("issued"))
|
.find(d => d.dateType.get.equalsIgnoreCase("issued"))
|
||||||
.map(d => extract_date(d.date.get))
|
.map(d => extract_date(d.date.get))
|
||||||
val a_date: Option[String] = dates
|
val a_date: Option[String] = dates
|
||||||
.filter(d =>
|
.filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
|
||||||
d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available")
|
|
||||||
)
|
|
||||||
.map(d => extract_date(d.date.get))
|
.map(d => extract_date(d.date.get))
|
||||||
.find(d => d != null && d.isDefined)
|
.find(d => d != null && d.isDefined)
|
||||||
.map(d => d.get)
|
.map(d => d.get)
|
||||||
|
|
|
@ -2,12 +2,7 @@ package eu.dnetlib.dhp.sx.bio.pubmed
|
||||||
|
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.{
|
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
|
||||||
GraphCleaningFunctions,
|
|
||||||
IdentifierFactory,
|
|
||||||
OafMapperUtils,
|
|
||||||
PidType
|
|
||||||
}
|
|
||||||
import eu.dnetlib.dhp.schema.oaf._
|
import eu.dnetlib.dhp.schema.oaf._
|
||||||
import collection.JavaConverters._
|
import collection.JavaConverters._
|
||||||
|
|
||||||
|
@ -169,9 +164,7 @@ object PubMedToOaf {
|
||||||
pubmedInstance.setInstancetype(cojbCategory)
|
pubmedInstance.setInstancetype(cojbCategory)
|
||||||
} else {
|
} else {
|
||||||
val i_type = article.getPublicationTypes.asScala
|
val i_type = article.getPublicationTypes.asScala
|
||||||
.map(s =>
|
.map(s => getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, s.getValue))
|
||||||
getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, s.getValue)
|
|
||||||
)
|
|
||||||
.find(q => q != null)
|
.find(q => q != null)
|
||||||
if (i_type.isDefined)
|
if (i_type.isDefined)
|
||||||
pubmedInstance.setInstancetype(i_type.get)
|
pubmedInstance.setInstancetype(i_type.get)
|
||||||
|
|
|
@ -59,8 +59,7 @@ object SparkGenerateDoiBoost {
|
||||||
val workingDirPath = parser.get("workingPath")
|
val workingDirPath = parser.get("workingPath")
|
||||||
val openaireOrganizationPath = parser.get("openaireOrganizationPath")
|
val openaireOrganizationPath = parser.get("openaireOrganizationPath")
|
||||||
|
|
||||||
val crossrefAggregator = new Aggregator[(String, Publication), Publication, Publication]
|
val crossrefAggregator = new Aggregator[(String, Publication), Publication, Publication] with Serializable {
|
||||||
with Serializable {
|
|
||||||
override def zero: Publication = new Publication
|
override def zero: Publication = new Publication
|
||||||
|
|
||||||
override def reduce(b: Publication, a: (String, Publication)): Publication = {
|
override def reduce(b: Publication, a: (String, Publication)): Publication = {
|
||||||
|
|
|
@ -438,11 +438,10 @@ case object Crossref2Oaf {
|
||||||
funders.foreach(funder => {
|
funders.foreach(funder => {
|
||||||
if (funder.DOI.isDefined && funder.DOI.get.nonEmpty) {
|
if (funder.DOI.isDefined && funder.DOI.get.nonEmpty) {
|
||||||
funder.DOI.get match {
|
funder.DOI.get match {
|
||||||
case "10.13039/100010663" | "10.13039/100010661" | "10.13039/501100007601" |
|
case "10.13039/100010663" | "10.13039/100010661" | "10.13039/501100007601" | "10.13039/501100000780" |
|
||||||
"10.13039/501100000780" | "10.13039/100010665" =>
|
"10.13039/100010665" =>
|
||||||
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
||||||
case "10.13039/100011199" | "10.13039/100004431" | "10.13039/501100004963" |
|
case "10.13039/100011199" | "10.13039/100004431" | "10.13039/501100004963" | "10.13039/501100000780" =>
|
||||||
"10.13039/501100000780" =>
|
|
||||||
generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
|
generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
|
||||||
case "10.13039/501100000781" =>
|
case "10.13039/501100000781" =>
|
||||||
generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
|
generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
|
||||||
|
@ -512,8 +511,7 @@ case object Crossref2Oaf {
|
||||||
case "European Union's" =>
|
case "European Union's" =>
|
||||||
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
||||||
generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
|
generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
|
||||||
case "The French National Research Agency (ANR)" |
|
case "The French National Research Agency (ANR)" | "The French National Research Agency" =>
|
||||||
"The French National Research Agency" =>
|
|
||||||
generateSimpleRelationFromAward(funder, "anr_________", a => a)
|
generateSimpleRelationFromAward(funder, "anr_________", a => a)
|
||||||
case "CONICYT, Programa de Formación de Capital Humano Avanzado" =>
|
case "CONICYT, Programa de Formación de Capital Humano Avanzado" =>
|
||||||
generateSimpleRelationFromAward(funder, "conicytf____", extractECAward)
|
generateSimpleRelationFromAward(funder, "conicytf____", extractECAward)
|
||||||
|
|
|
@ -15,9 +15,7 @@ object SparkProcessMAG {
|
||||||
def getDistinctResults(d: Dataset[MagPapers]): Dataset[MagPapers] = {
|
def getDistinctResults(d: Dataset[MagPapers]): Dataset[MagPapers] = {
|
||||||
d.where(col("Doi").isNotNull)
|
d.where(col("Doi").isNotNull)
|
||||||
.groupByKey(mp => DoiBoostMappingUtil.normalizeDoi(mp.Doi))(Encoders.STRING)
|
.groupByKey(mp => DoiBoostMappingUtil.normalizeDoi(mp.Doi))(Encoders.STRING)
|
||||||
.reduceGroups((p1: MagPapers, p2: MagPapers) =>
|
.reduceGroups((p1: MagPapers, p2: MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1, p2))
|
||||||
ConversionUtil.choiceLatestMagArtitcle(p1, p2)
|
|
||||||
)
|
|
||||||
.map(_._2)(Encoders.product[MagPapers])
|
.map(_._2)(Encoders.product[MagPapers])
|
||||||
.map(mp => {
|
.map(mp => {
|
||||||
MagPapers(
|
MagPapers(
|
||||||
|
|
|
@ -223,9 +223,7 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
val collectedFromList = result.getCollectedfrom.asScala
|
val collectedFromList = result.getCollectedfrom.asScala
|
||||||
assert(
|
assert(
|
||||||
collectedFromList.exists(c =>
|
collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")),
|
||||||
c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")
|
|
||||||
),
|
|
||||||
"Wrong collected from assertion"
|
"Wrong collected from assertion"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -301,9 +299,7 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
val collectedFromList = result.getCollectedfrom.asScala
|
val collectedFromList = result.getCollectedfrom.asScala
|
||||||
assert(
|
assert(
|
||||||
collectedFromList.exists(c =>
|
collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")),
|
||||||
c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")
|
|
||||||
),
|
|
||||||
"Wrong collected from assertion"
|
"Wrong collected from assertion"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -435,9 +431,7 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
val collectedFromList = result.getCollectedfrom.asScala
|
val collectedFromList = result.getCollectedfrom.asScala
|
||||||
assert(
|
assert(
|
||||||
collectedFromList.exists(c =>
|
collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")),
|
||||||
c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")
|
|
||||||
),
|
|
||||||
"Wrong collected from assertion"
|
"Wrong collected from assertion"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -553,9 +547,7 @@ class CrossrefMappingTest {
|
||||||
println(mapper.writeValueAsString(item))
|
println(mapper.writeValueAsString(item))
|
||||||
|
|
||||||
assertTrue(
|
assertTrue(
|
||||||
item.getInstance().asScala exists (i =>
|
item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://www.springer.com/vor"))
|
||||||
i.getLicense.getValue.equals("https://www.springer.com/vor")
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
assertTrue(
|
assertTrue(
|
||||||
item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("CLOSED"))
|
item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("CLOSED"))
|
||||||
|
@ -590,9 +582,7 @@ class CrossrefMappingTest {
|
||||||
)
|
)
|
||||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN")))
|
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN")))
|
||||||
assertTrue(
|
assertTrue(
|
||||||
item.getInstance().asScala exists (i =>
|
item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid)
|
||||||
i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||||
println(mapper.writeValueAsString(item))
|
println(mapper.writeValueAsString(item))
|
||||||
|
@ -627,9 +617,7 @@ class CrossrefMappingTest {
|
||||||
)
|
)
|
||||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN")))
|
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN")))
|
||||||
assertTrue(
|
assertTrue(
|
||||||
item.getInstance().asScala exists (i =>
|
item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid)
|
||||||
i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||||
println(mapper.writeValueAsString(item))
|
println(mapper.writeValueAsString(item))
|
||||||
|
|
|
@ -2,14 +2,7 @@ package eu.dnetlib.dhp.sx.graph
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.schema.oaf.{
|
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset}
|
||||||
OtherResearchProduct,
|
|
||||||
Publication,
|
|
||||||
Relation,
|
|
||||||
Result,
|
|
||||||
Software,
|
|
||||||
Dataset => OafDataset
|
|
||||||
}
|
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||||
|
|
|
@ -90,8 +90,7 @@ object PangaeaUtils {
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
def getDatasetAggregator()
|
def getDatasetAggregator(): Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] =
|
||||||
: Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] =
|
|
||||||
new Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] {
|
new Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] {
|
||||||
|
|
||||||
override def zero: PangaeaDataModel = null
|
override def zero: PangaeaDataModel = null
|
||||||
|
|
|
@ -1,11 +1,7 @@
|
||||||
package eu.dnetlib.dhp.oa.graph.hostedbymap
|
package eu.dnetlib.dhp.oa.graph.hostedbymap
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.oa.graph.hostedbymap.SparkPrepareHostedByInfoToApply.{
|
import eu.dnetlib.dhp.oa.graph.hostedbymap.SparkPrepareHostedByInfoToApply.{joinResHBM, prepareResultInfo, toEntityInfo}
|
||||||
joinResHBM,
|
|
||||||
prepareResultInfo,
|
|
||||||
toEntityInfo
|
|
||||||
}
|
|
||||||
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
|
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
|
||||||
|
|
|
@ -51,8 +51,7 @@ class TestPreprocess extends java.io.Serializable {
|
||||||
ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.eissn.equals("2253-900X")).count == 1
|
ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.eissn.equals("2253-900X")).count == 1
|
||||||
)
|
)
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.officialname.equals("Thémata"))
|
ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.officialname.equals("Thémata")).count == 1
|
||||||
.count == 1
|
|
||||||
)
|
)
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ds.filter(hbi =>
|
ds.filter(hbi =>
|
||||||
|
|
|
@ -189,11 +189,7 @@ class ResolveEntitiesTest extends Serializable {
|
||||||
|
|
||||||
var ct = pubDS.count()
|
var ct = pubDS.count()
|
||||||
var et = pubDS
|
var et = pubDS
|
||||||
.filter(p =>
|
.filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty))
|
||||||
p.getTitle != null && p.getTitle.asScala.forall(t =>
|
|
||||||
t.getValue != null && t.getValue.nonEmpty
|
|
||||||
)
|
|
||||||
)
|
|
||||||
.count()
|
.count()
|
||||||
|
|
||||||
assertEquals(ct, et)
|
assertEquals(ct, et)
|
||||||
|
@ -208,11 +204,7 @@ class ResolveEntitiesTest extends Serializable {
|
||||||
.count()
|
.count()
|
||||||
ct = datDS.count()
|
ct = datDS.count()
|
||||||
et = datDS
|
et = datDS
|
||||||
.filter(p =>
|
.filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty))
|
||||||
p.getTitle != null && p.getTitle.asScala.forall(t =>
|
|
||||||
t.getValue != null && t.getValue.nonEmpty
|
|
||||||
)
|
|
||||||
)
|
|
||||||
.count()
|
.count()
|
||||||
assertEquals(ct, et)
|
assertEquals(ct, et)
|
||||||
|
|
||||||
|
@ -226,11 +218,7 @@ class ResolveEntitiesTest extends Serializable {
|
||||||
.count()
|
.count()
|
||||||
ct = softDS.count()
|
ct = softDS.count()
|
||||||
et = softDS
|
et = softDS
|
||||||
.filter(p =>
|
.filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty))
|
||||||
p.getTitle != null && p.getTitle.asScala.forall(t =>
|
|
||||||
t.getValue != null && t.getValue.nonEmpty
|
|
||||||
)
|
|
||||||
)
|
|
||||||
.count()
|
.count()
|
||||||
assertEquals(ct, et)
|
assertEquals(ct, et)
|
||||||
|
|
||||||
|
@ -245,11 +233,7 @@ class ResolveEntitiesTest extends Serializable {
|
||||||
|
|
||||||
ct = orpDS.count()
|
ct = orpDS.count()
|
||||||
et = orpDS
|
et = orpDS
|
||||||
.filter(p =>
|
.filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty))
|
||||||
p.getTitle != null && p.getTitle.asScala.forall(t =>
|
|
||||||
t.getValue != null && t.getValue.nonEmpty
|
|
||||||
)
|
|
||||||
)
|
|
||||||
.count()
|
.count()
|
||||||
assertEquals(ct, et)
|
assertEquals(ct, et)
|
||||||
|
|
||||||
|
|
|
@ -56,9 +56,7 @@ class ScholixGraphTest extends AbstractVocabularyTest {
|
||||||
assertNotNull(result)
|
assertNotNull(result)
|
||||||
|
|
||||||
assertEquals(result.size, items.size)
|
assertEquals(result.size, items.size)
|
||||||
val d = result.find(s =>
|
val d = result.find(s => s.getLocalIdentifier.asScala.exists(i => i.getUrl == null || i.getUrl.isEmpty))
|
||||||
s.getLocalIdentifier.asScala.exists(i => i.getUrl == null || i.getUrl.isEmpty)
|
|
||||||
)
|
|
||||||
assertFalse(d.isDefined)
|
assertFalse(d.isDefined)
|
||||||
println(mapper.writeValueAsString(result.head))
|
println(mapper.writeValueAsString(result.head))
|
||||||
|
|
||||||
|
@ -74,9 +72,7 @@ class ScholixGraphTest extends AbstractVocabularyTest {
|
||||||
val result: List[(Relation, ScholixSummary)] = inputRelations.lines
|
val result: List[(Relation, ScholixSummary)] = inputRelations.lines
|
||||||
.sliding(2)
|
.sliding(2)
|
||||||
.map(s => (s.head, s(1)))
|
.map(s => (s.head, s(1)))
|
||||||
.map(p =>
|
.map(p => (mapper.readValue(p._1, classOf[Relation]), mapper.readValue(p._2, classOf[ScholixSummary])))
|
||||||
(mapper.readValue(p._1, classOf[Relation]), mapper.readValue(p._2, classOf[ScholixSummary]))
|
|
||||||
)
|
|
||||||
.toList
|
.toList
|
||||||
assertNotNull(result)
|
assertNotNull(result)
|
||||||
assertTrue(result.nonEmpty)
|
assertTrue(result.nonEmpty)
|
||||||
|
|
Loading…
Reference in New Issue