master #59
|
@ -0,0 +1,21 @@
|
||||||
|
style = defaultWithAlign
|
||||||
|
|
||||||
|
align.openParenCallSite = false
|
||||||
|
align.openParenDefnSite = false
|
||||||
|
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
|
||||||
|
continuationIndent.callSite = 2
|
||||||
|
continuationIndent.defnSite = 2
|
||||||
|
danglingParentheses = true
|
||||||
|
indentOperator = spray
|
||||||
|
maxColumn = 120
|
||||||
|
newlines.alwaysBeforeTopLevelStatements = true
|
||||||
|
project.excludeFilters = [".*\\.sbt"]
|
||||||
|
// NOTE(review): each line below assigns the same key `rewrite.rules`. In HOCON a
// later array assignment replaces an earlier one rather than appending, so
// presumably only the last rule (PreferCurlyFors) is in effect — TODO confirm.
// If all rules are intended, list them in one array (or use `+=`), after checking
// that the scalafmt version in use accepts SortImports together with
// ExpandImportSelectors.
rewrite.rules = [AvoidInfix]
|
||||||
|
rewrite.rules = [ExpandImportSelectors]
|
||||||
|
rewrite.rules = [RedundantBraces]
|
||||||
|
rewrite.rules = [RedundantParens]
|
||||||
|
rewrite.rules = [SortImports]
|
||||||
|
rewrite.rules = [SortModifiers]
|
||||||
|
rewrite.rules = [PreferCurlyFors]
|
||||||
|
spaces.inImportCurlyBraces = false
|
||||||
|
unindentTopLevelOperators = true
|
|
@ -2,58 +2,57 @@ package eu.dnetlib.dhp.application
|
||||||
|
|
||||||
import scala.io.Source
|
import scala.io.Source
|
||||||
|
|
||||||
/**
|
/** This is the main Interface SparkApplication
|
||||||
* This is the main Interface SparkApplication
|
|
||||||
* from which all Spark Scala classes should inherit
|
* from which all Spark Scala classes should inherit
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
trait SparkScalaApplication {
|
trait SparkScalaApplication {
|
||||||
/**
|
|
||||||
* This is the path in the classpath of the json
|
/** This is the path in the classpath of the json
|
||||||
* that describes all the arguments needed to run
|
* that describes all the arguments needed to run
|
||||||
*/
|
*/
|
||||||
val propertyPath: String
|
val propertyPath: String
|
||||||
|
|
||||||
/**
|
/** Utility to parse the arguments using the
|
||||||
* Utility to parse the arguments using the
|
|
||||||
* property json in the classpath identified from
|
* property json in the classpath identified from
|
||||||
* the variable propertyPath
|
* the variable propertyPath
|
||||||
*
|
*
|
||||||
* @param args the list of arguments
|
* @param args the list of arguments
|
||||||
*/
|
*/
|
||||||
def parseArguments(args: Array[String]): ArgumentApplicationParser = {
|
def parseArguments(args: Array[String]): ArgumentApplicationParser = {
|
||||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString)
|
val parser = new ArgumentApplicationParser(
|
||||||
|
Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
parser
|
parser
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** Here all the spark applications runs this method
|
||||||
* Here all the spark applications runs this method
|
|
||||||
* where the whole logic of the spark node is defined
|
* where the whole logic of the spark node is defined
|
||||||
*/
|
*/
|
||||||
def run(): Unit
|
def run(): Unit
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.SparkSession
|
import org.apache.spark.sql.SparkSession
|
||||||
import org.slf4j.Logger
|
import org.slf4j.Logger
|
||||||
|
|
||||||
abstract class AbstractScalaApplication (val propertyPath:String, val args:Array[String], log:Logger) extends SparkScalaApplication {
|
abstract class AbstractScalaApplication(
|
||||||
|
val propertyPath: String,
|
||||||
|
val args: Array[String],
|
||||||
|
log: Logger
|
||||||
|
) extends SparkScalaApplication {
|
||||||
|
|
||||||
var parser: ArgumentApplicationParser = null
|
var parser: ArgumentApplicationParser = null
|
||||||
|
|
||||||
var spark: SparkSession = null
|
var spark: SparkSession = null
|
||||||
|
|
||||||
|
|
||||||
def initialize(): SparkScalaApplication = {
|
def initialize(): SparkScalaApplication = {
|
||||||
parser = parseArguments(args)
|
parser = parseArguments(args)
|
||||||
spark = createSparkSession()
|
spark = createSparkSession()
|
||||||
this
|
this
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** Utility for creating a spark session starting from parser
|
||||||
* Utility for creating a spark session starting from parser
|
|
||||||
*
|
*
|
||||||
* @return a spark Session
|
* @return a spark Session
|
||||||
*/
|
*/
|
||||||
|
@ -63,7 +62,9 @@ abstract class AbstractScalaApplication (val propertyPath:String, val args:Array
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val master = parser.get("master")
|
val master = parser.get("master")
|
||||||
log.info(s"Creating Spark session: Master: $master")
|
log.info(s"Creating Spark session: Master: $master")
|
||||||
SparkSession.builder().config(conf)
|
SparkSession
|
||||||
|
.builder()
|
||||||
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(master)
|
.master(master)
|
||||||
.getOrCreate()
|
.getOrCreate()
|
||||||
|
|
|
@ -14,7 +14,6 @@ import scala.io.Source
|
||||||
|
|
||||||
object ScholixUtils extends Serializable {
|
object ScholixUtils extends Serializable {
|
||||||
|
|
||||||
|
|
||||||
val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier"
|
val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier"
|
||||||
|
|
||||||
val DATE_RELATION_KEY: String = "RelationDate"
|
val DATE_RELATION_KEY: String = "RelationDate"
|
||||||
|
@ -24,7 +23,11 @@ object ScholixUtils extends Serializable {
|
||||||
case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {}
|
case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {}
|
||||||
|
|
||||||
val relations: Map[String, RelationVocabulary] = {
|
val relations: Map[String, RelationVocabulary] = {
|
||||||
val input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")).mkString
|
val input = Source
|
||||||
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
|
||||||
lazy val json: json4s.JValue = parse(input)
|
lazy val json: json4s.JValue = parse(input)
|
||||||
|
@ -32,13 +35,14 @@ object ScholixUtils extends Serializable {
|
||||||
json.extract[Map[String, RelationVocabulary]]
|
json.extract[Map[String, RelationVocabulary]]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def extractRelationDate(relation: Relation): String = {
|
def extractRelationDate(relation: Relation): String = {
|
||||||
|
|
||||||
if (relation.getProperties == null || !relation.getProperties.isEmpty)
|
if (relation.getProperties == null || !relation.getProperties.isEmpty)
|
||||||
null
|
null
|
||||||
else {
|
else {
|
||||||
val date = relation.getProperties.asScala.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey)).map(p => p.getValue)
|
val date = relation.getProperties.asScala
|
||||||
|
.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey))
|
||||||
|
.map(p => p.getValue)
|
||||||
if (date.isDefined)
|
if (date.isDefined)
|
||||||
date.get
|
date.get
|
||||||
else
|
else
|
||||||
|
@ -58,16 +62,14 @@ object ScholixUtils extends Serializable {
|
||||||
def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = {
|
def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = {
|
||||||
new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName)
|
new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName)
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def generateScholixResourceFromResult(r: Result): ScholixResource = {
|
def generateScholixResourceFromResult(r: Result): ScholixResource = {
|
||||||
generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
|
generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
|
||||||
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable {
|
new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable {
|
||||||
override def zero: RelatedEntities = null
|
override def zero: RelatedEntities = null
|
||||||
|
|
||||||
override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = {
|
override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = {
|
||||||
|
@ -77,13 +79,20 @@ object ScholixUtils extends Serializable {
|
||||||
if (b == null)
|
if (b == null)
|
||||||
RelatedEntities(a._1, relatedDataset, relatedPublication)
|
RelatedEntities(a._1, relatedDataset, relatedPublication)
|
||||||
else
|
else
|
||||||
RelatedEntities(a._1, b.relatedDataset + relatedDataset, b.relatedPublication + relatedPublication)
|
RelatedEntities(
|
||||||
|
a._1,
|
||||||
|
b.relatedDataset + relatedDataset,
|
||||||
|
b.relatedPublication + relatedPublication
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = {
|
override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = {
|
||||||
if (b1 != null && b2 != null)
|
if (b1 != null && b2 != null)
|
||||||
RelatedEntities(b1.id, b1.relatedDataset + b2.relatedDataset, b1.relatedPublication + b2.relatedPublication)
|
RelatedEntities(
|
||||||
|
b1.id,
|
||||||
|
b1.relatedDataset + b2.relatedDataset,
|
||||||
|
b1.relatedPublication + b2.relatedPublication
|
||||||
|
)
|
||||||
else if (b1 != null)
|
else if (b1 != null)
|
||||||
b1
|
b1
|
||||||
else
|
else
|
||||||
|
@ -97,18 +106,16 @@ object ScholixUtils extends Serializable {
|
||||||
override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
|
override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
|
||||||
}
|
}
|
||||||
|
|
||||||
|
val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] =
|
||||||
val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] = new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable {
|
new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable {
|
||||||
override def zero: Scholix = null
|
override def zero: Scholix = null
|
||||||
|
|
||||||
|
|
||||||
def scholix_complete(s: Scholix): Boolean = {
|
def scholix_complete(s: Scholix): Boolean = {
|
||||||
if (s == null || s.getIdentifier == null) {
|
if (s == null || s.getIdentifier == null) {
|
||||||
false
|
false
|
||||||
} else if (s.getSource == null || s.getTarget == null) {
|
} else if (s.getSource == null || s.getTarget == null) {
|
||||||
false
|
false
|
||||||
}
|
} else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
|
||||||
else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
|
|
||||||
false
|
false
|
||||||
else
|
else
|
||||||
true
|
true
|
||||||
|
@ -129,7 +136,6 @@ object ScholixUtils extends Serializable {
|
||||||
override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def createInverseScholixRelation(scholix: Scholix): Scholix = {
|
def createInverseScholixRelation(scholix: Scholix): Scholix = {
|
||||||
val s = new Scholix
|
val s = new Scholix
|
||||||
s.setPublicationDate(scholix.getPublicationDate)
|
s.setPublicationDate(scholix.getPublicationDate)
|
||||||
|
@ -138,16 +144,19 @@ object ScholixUtils extends Serializable {
|
||||||
s.setRelationship(inverseRelationShip(scholix.getRelationship))
|
s.setRelationship(inverseRelationShip(scholix.getRelationship))
|
||||||
s.setSource(scholix.getTarget)
|
s.setSource(scholix.getTarget)
|
||||||
s.setTarget(scholix.getSource)
|
s.setTarget(scholix.getSource)
|
||||||
s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
|
s.setIdentifier(
|
||||||
|
DHPUtils.md5(
|
||||||
|
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
|
||||||
|
)
|
||||||
|
)
|
||||||
s
|
s
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
|
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
|
||||||
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
|
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
|
||||||
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map {
|
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
|
||||||
d => new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers)
|
new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers)
|
||||||
}(collection.breakOut)
|
}(collection.breakOut)
|
||||||
l
|
l
|
||||||
} else List()
|
} else List()
|
||||||
|
@ -155,8 +164,11 @@ object ScholixUtils extends Serializable {
|
||||||
|
|
||||||
def extractCollectedFrom(summary: ScholixSummary): List[ScholixEntityId] = {
|
def extractCollectedFrom(summary: ScholixSummary): List[ScholixEntityId] = {
|
||||||
if (summary.getDatasources != null && !summary.getDatasources.isEmpty) {
|
if (summary.getDatasources != null && !summary.getDatasources.isEmpty) {
|
||||||
val l: List[ScholixEntityId] = summary.getDatasources.asScala.map {
|
val l: List[ScholixEntityId] = summary.getDatasources.asScala.map { d =>
|
||||||
d => new ScholixEntityId(d.getDatasourceName, List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava)
|
new ScholixEntityId(
|
||||||
|
d.getDatasourceName,
|
||||||
|
List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava
|
||||||
|
)
|
||||||
}(collection.breakOut)
|
}(collection.breakOut)
|
||||||
l
|
l
|
||||||
} else List()
|
} else List()
|
||||||
|
@ -165,17 +177,16 @@ object ScholixUtils extends Serializable {
|
||||||
def extractCollectedFrom(relation: Relation): List[ScholixEntityId] = {
|
def extractCollectedFrom(relation: Relation): List[ScholixEntityId] = {
|
||||||
if (relation.getCollectedfrom != null && !relation.getCollectedfrom.isEmpty) {
|
if (relation.getCollectedfrom != null && !relation.getCollectedfrom.isEmpty) {
|
||||||
|
|
||||||
|
val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map { c =>
|
||||||
val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map {
|
new ScholixEntityId(
|
||||||
c =>
|
c.getValue,
|
||||||
|
List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava
|
||||||
new ScholixEntityId(c.getValue, List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava)
|
)
|
||||||
}.toList
|
}.toList
|
||||||
l
|
l
|
||||||
} else List()
|
} else List()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = {
|
def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = {
|
||||||
val s = new Scholix
|
val s = new Scholix
|
||||||
s.setPublicationDate(scholix.getPublicationDate)
|
s.setPublicationDate(scholix.getPublicationDate)
|
||||||
|
@ -184,11 +195,14 @@ object ScholixUtils extends Serializable {
|
||||||
s.setRelationship(scholix.getRelationship)
|
s.setRelationship(scholix.getRelationship)
|
||||||
s.setSource(scholix.getSource)
|
s.setSource(scholix.getSource)
|
||||||
s.setTarget(generateScholixResourceFromSummary(target))
|
s.setTarget(generateScholixResourceFromSummary(target))
|
||||||
s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
|
s.setIdentifier(
|
||||||
|
DHPUtils.md5(
|
||||||
|
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
|
||||||
|
)
|
||||||
|
)
|
||||||
s
|
s
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def generateCompleteScholix(scholix: Scholix, target: ScholixResource): Scholix = {
|
def generateCompleteScholix(scholix: Scholix, target: ScholixResource): Scholix = {
|
||||||
val s = new Scholix
|
val s = new Scholix
|
||||||
s.setPublicationDate(scholix.getPublicationDate)
|
s.setPublicationDate(scholix.getPublicationDate)
|
||||||
|
@ -197,11 +211,14 @@ object ScholixUtils extends Serializable {
|
||||||
s.setRelationship(scholix.getRelationship)
|
s.setRelationship(scholix.getRelationship)
|
||||||
s.setSource(scholix.getSource)
|
s.setSource(scholix.getSource)
|
||||||
s.setTarget(target)
|
s.setTarget(target)
|
||||||
s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
|
s.setIdentifier(
|
||||||
|
DHPUtils.md5(
|
||||||
|
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
|
||||||
|
)
|
||||||
|
)
|
||||||
s
|
s
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def generateScholixResourceFromSummary(summaryObject: ScholixSummary): ScholixResource = {
|
def generateScholixResourceFromSummary(summaryObject: ScholixSummary): ScholixResource = {
|
||||||
val r = new ScholixResource
|
val r = new ScholixResource
|
||||||
r.setIdentifier(summaryObject.getLocalIdentifier)
|
r.setIdentifier(summaryObject.getLocalIdentifier)
|
||||||
|
@ -214,7 +231,8 @@ object ScholixUtils extends Serializable {
|
||||||
r.setTitle(summaryObject.getTitle.get(0))
|
r.setTitle(summaryObject.getTitle.get(0))
|
||||||
|
|
||||||
if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) {
|
if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) {
|
||||||
val l: List[ScholixEntityId] = summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
|
val l: List[ScholixEntityId] =
|
||||||
|
summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
|
||||||
if (l.nonEmpty)
|
if (l.nonEmpty)
|
||||||
r.setCreator(l.asJava)
|
r.setCreator(l.asJava)
|
||||||
}
|
}
|
||||||
|
@ -222,20 +240,27 @@ object ScholixUtils extends Serializable {
|
||||||
if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty)
|
if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty)
|
||||||
r.setPublicationDate(summaryObject.getDate.get(0))
|
r.setPublicationDate(summaryObject.getDate.get(0))
|
||||||
if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) {
|
if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) {
|
||||||
val plist: List[ScholixEntityId] = summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
|
val plist: List[ScholixEntityId] =
|
||||||
|
summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
|
||||||
|
|
||||||
if (plist.nonEmpty)
|
if (plist.nonEmpty)
|
||||||
r.setPublisher(plist.asJava)
|
r.setPublisher(plist.asJava)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (summaryObject.getDatasources != null && !summaryObject.getDatasources.isEmpty) {
|
if (summaryObject.getDatasources != null && !summaryObject.getDatasources.isEmpty) {
|
||||||
|
|
||||||
val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala.map(c => new ScholixCollectedFrom(
|
val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala
|
||||||
new ScholixEntityId(c.getDatasourceName, List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava)
|
.map(c =>
|
||||||
, "collected", "complete"
|
new ScholixCollectedFrom(
|
||||||
|
new ScholixEntityId(
|
||||||
)).toList
|
c.getDatasourceName,
|
||||||
|
List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava
|
||||||
|
),
|
||||||
|
"collected",
|
||||||
|
"complete"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.toList
|
||||||
|
|
||||||
if (l.nonEmpty)
|
if (l.nonEmpty)
|
||||||
r.setCollectedFrom(l.asJava)
|
r.setCollectedFrom(l.asJava)
|
||||||
|
@ -244,8 +269,6 @@ object ScholixUtils extends Serializable {
|
||||||
r
|
r
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def scholixFromSource(relation: Relation, source: ScholixResource): Scholix = {
|
def scholixFromSource(relation: Relation, source: ScholixResource): Scholix = {
|
||||||
if (relation == null || source == null)
|
if (relation == null || source == null)
|
||||||
return null
|
return null
|
||||||
|
@ -262,7 +285,6 @@ object ScholixUtils extends Serializable {
|
||||||
|
|
||||||
s.setPublicationDate(d)
|
s.setPublicationDate(d)
|
||||||
|
|
||||||
|
|
||||||
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
|
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
|
||||||
s.setPublisher(source.getPublisher)
|
s.setPublisher(source.getPublisher)
|
||||||
}
|
}
|
||||||
|
@ -270,13 +292,14 @@ object ScholixUtils extends Serializable {
|
||||||
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
|
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
|
||||||
if (semanticRelation == null)
|
if (semanticRelation == null)
|
||||||
return null
|
return null
|
||||||
s.setRelationship(new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse))
|
s.setRelationship(
|
||||||
|
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
|
||||||
|
)
|
||||||
s.setSource(source)
|
s.setSource(source)
|
||||||
|
|
||||||
s
|
s
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = {
|
def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = {
|
||||||
|
|
||||||
if (relation == null || source == null)
|
if (relation == null || source == null)
|
||||||
|
@ -298,11 +321,9 @@ object ScholixUtils extends Serializable {
|
||||||
|
|
||||||
s.setPublicationDate(d)
|
s.setPublicationDate(d)
|
||||||
|
|
||||||
|
|
||||||
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
|
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
|
||||||
val l: List[ScholixEntityId] = source.getPublisher.asScala
|
val l: List[ScholixEntityId] = source.getPublisher.asScala
|
||||||
.map {
|
.map { p =>
|
||||||
p =>
|
|
||||||
new ScholixEntityId(p, null)
|
new ScholixEntityId(p, null)
|
||||||
}(collection.breakOut)
|
}(collection.breakOut)
|
||||||
|
|
||||||
|
@ -313,16 +334,19 @@ object ScholixUtils extends Serializable {
|
||||||
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
|
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
|
||||||
if (semanticRelation == null)
|
if (semanticRelation == null)
|
||||||
return null
|
return null
|
||||||
s.setRelationship(new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse))
|
s.setRelationship(
|
||||||
|
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
|
||||||
|
)
|
||||||
s.setSource(generateScholixResourceFromSummary(source))
|
s.setSource(generateScholixResourceFromSummary(source))
|
||||||
|
|
||||||
s
|
s
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def findURLForPID(
|
||||||
def findURLForPID(pidValue: List[StructuredProperty], urls: List[String]): List[(StructuredProperty, String)] = {
|
pidValue: List[StructuredProperty],
|
||||||
pidValue.map {
|
urls: List[String]
|
||||||
p =>
|
): List[(StructuredProperty, String)] = {
|
||||||
|
pidValue.map { p =>
|
||||||
val pv = p.getValue
|
val pv = p.getValue
|
||||||
|
|
||||||
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
|
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
|
||||||
|
@ -330,14 +354,17 @@ object ScholixUtils extends Serializable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
|
def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
|
||||||
if (r.getInstance() == null || r.getInstance().isEmpty)
|
if (r.getInstance() == null || r.getInstance().isEmpty)
|
||||||
return List()
|
return List()
|
||||||
r.getInstance().asScala.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
|
r.getInstance()
|
||||||
|
.asScala
|
||||||
|
.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
|
||||||
.filter(i => i.getPid != null && i.getUrl != null)
|
.filter(i => i.getPid != null && i.getUrl != null)
|
||||||
.flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
|
.flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
|
||||||
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2)).distinct.toList
|
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2))
|
||||||
|
.distinct
|
||||||
|
.toList
|
||||||
}
|
}
|
||||||
|
|
||||||
def resultToSummary(r: Result): ScholixSummary = {
|
def resultToSummary(r: Result): ScholixSummary = {
|
||||||
|
@ -371,7 +398,12 @@ object ScholixUtils extends Serializable {
|
||||||
s.setAuthor(authors.asJava)
|
s.setAuthor(authors.asJava)
|
||||||
}
|
}
|
||||||
if (r.getInstance() != null) {
|
if (r.getInstance() != null) {
|
||||||
val dt: List[String] = r.getInstance().asScala.filter(i => i.getDateofacceptance != null).map(i => i.getDateofacceptance.getValue).toList
|
val dt: List[String] = r
|
||||||
|
.getInstance()
|
||||||
|
.asScala
|
||||||
|
.filter(i => i.getDateofacceptance != null)
|
||||||
|
.map(i => i.getDateofacceptance.getValue)
|
||||||
|
.toList
|
||||||
if (dt.nonEmpty)
|
if (dt.nonEmpty)
|
||||||
s.setDate(dt.distinct.asJava)
|
s.setDate(dt.distinct.asJava)
|
||||||
}
|
}
|
||||||
|
@ -382,7 +414,9 @@ object ScholixUtils extends Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (r.getSubject != null && !r.getSubject.isEmpty) {
|
if (r.getSubject != null && !r.getSubject.isEmpty) {
|
||||||
val subjects: List[SchemeValue] = r.getSubject.asScala.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue)).toList
|
val subjects: List[SchemeValue] = r.getSubject.asScala
|
||||||
|
.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))
|
||||||
|
.toList
|
||||||
if (subjects.nonEmpty)
|
if (subjects.nonEmpty)
|
||||||
s.setSubject(subjects.asJava)
|
s.setSubject(subjects.asJava)
|
||||||
}
|
}
|
||||||
|
@ -391,7 +425,9 @@ object ScholixUtils extends Serializable {
|
||||||
s.setPublisher(List(r.getPublisher.getValue).asJava)
|
s.setPublisher(List(r.getPublisher.getValue).asJava)
|
||||||
|
|
||||||
if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) {
|
if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) {
|
||||||
val cf: List[CollectedFromType] = r.getCollectedfrom.asScala.map(c => new CollectedFromType(c.getValue, c.getKey, "complete")).toList
|
val cf: List[CollectedFromType] = r.getCollectedfrom.asScala
|
||||||
|
.map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))
|
||||||
|
.toList
|
||||||
if (cf.nonEmpty)
|
if (cf.nonEmpty)
|
||||||
s.setDatasources(cf.distinct.asJava)
|
s.setDatasources(cf.distinct.asJava)
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,15 +7,13 @@ import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode}
|
||||||
|
|
||||||
object CollectionUtils {
|
object CollectionUtils {
|
||||||
|
|
||||||
/**
|
/** This method in pipeline to the transformation phase,
|
||||||
* This method in pipeline to the transformation phase,
|
|
||||||
* generates relations in both directions; typically it should be a phase of flatMap
|
* generates relations in both directions; typically it should be a phase of flatMap
|
||||||
*
|
*
|
||||||
* @param i input OAF
|
* @param i input OAF
|
||||||
* @return
|
* @return
|
||||||
* If the input OAF is an entity -> List(i)
|
* If the input OAF is an entity -> List(i)
|
||||||
* If the input OAF is a relation -> List(relation, inverseRelation)
|
* If the input OAF is a relation -> List(relation, inverseRelation)
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
def fixRelations(i: Oaf): List[Oaf] = {
|
def fixRelations(i: Oaf): List[Oaf] = {
|
||||||
|
|
|
@ -6,7 +6,6 @@ import org.apache.http.client.methods.{HttpGet, HttpPost, HttpUriRequest}
|
||||||
import org.apache.http.entity.StringEntity
|
import org.apache.http.entity.StringEntity
|
||||||
import org.apache.http.impl.client.HttpClientBuilder
|
import org.apache.http.impl.client.HttpClientBuilder
|
||||||
|
|
||||||
|
|
||||||
abstract class AbstractRestClient extends Iterator[String] {
|
abstract class AbstractRestClient extends Iterator[String] {
|
||||||
|
|
||||||
var buffer: List[String] = List()
|
var buffer: List[String] = List()
|
||||||
|
@ -16,12 +15,10 @@ abstract class AbstractRestClient extends Iterator[String] {
|
||||||
|
|
||||||
var complete: Boolean = false
|
var complete: Boolean = false
|
||||||
|
|
||||||
|
|
||||||
def extractInfo(input: String): Unit
|
def extractInfo(input: String): Unit
|
||||||
|
|
||||||
protected def getBufferData(): Unit
|
protected def getBufferData(): Unit
|
||||||
|
|
||||||
|
|
||||||
def doHTTPGETRequest(url: String): String = {
|
def doHTTPGETRequest(url: String): String = {
|
||||||
val httpGet = new HttpGet(url)
|
val httpGet = new HttpGet(url)
|
||||||
doHTTPRequest(httpGet)
|
doHTTPRequest(httpGet)
|
||||||
|
@ -43,7 +40,6 @@ abstract class AbstractRestClient extends Iterator[String] {
|
||||||
buffer.nonEmpty && current_index < buffer.size
|
buffer.nonEmpty && current_index < buffer.size
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
override def next(): String = {
|
override def next(): String = {
|
||||||
val next_item: String = buffer(current_index)
|
val next_item: String = buffer(current_index)
|
||||||
current_index = current_index + 1
|
current_index = current_index + 1
|
||||||
|
@ -52,13 +48,14 @@ abstract class AbstractRestClient extends Iterator[String] {
|
||||||
next_item
|
next_item
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private def doHTTPRequest[A <: HttpUriRequest](r: A): String = {
|
private def doHTTPRequest[A <: HttpUriRequest](r: A): String = {
|
||||||
val timeout = 60; // seconds
|
val timeout = 60; // seconds
|
||||||
val config = RequestConfig.custom()
|
val config = RequestConfig
|
||||||
|
.custom()
|
||||||
.setConnectTimeout(timeout * 1000)
|
.setConnectTimeout(timeout * 1000)
|
||||||
.setConnectionRequestTimeout(timeout * 1000)
|
.setConnectionRequestTimeout(timeout * 1000)
|
||||||
.setSocketTimeout(timeout * 1000).build()
|
.setSocketTimeout(timeout * 1000)
|
||||||
|
.build()
|
||||||
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||||
try {
|
try {
|
||||||
var tries = 4
|
var tries = 4
|
||||||
|
@ -69,8 +66,7 @@ abstract class AbstractRestClient extends Iterator[String] {
|
||||||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||||
if (response.getStatusLine.getStatusCode > 400) {
|
if (response.getStatusLine.getStatusCode > 400) {
|
||||||
tries -= 1
|
tries -= 1
|
||||||
}
|
} else
|
||||||
else
|
|
||||||
return IOUtils.toString(response.getEntity.getContent)
|
return IOUtils.toString(response.getEntity.getContent)
|
||||||
} catch {
|
} catch {
|
||||||
case e: Throwable =>
|
case e: Throwable =>
|
||||||
|
|
|
@ -24,7 +24,9 @@ class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -
|
||||||
|
|
||||||
override def getBufferData(): Unit = {
|
override def getBufferData(): Unit = {
|
||||||
if (!complete) {
|
if (!complete) {
|
||||||
val response = if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get) else doHTTPGETRequest(get_url())
|
val response =
|
||||||
|
if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get)
|
||||||
|
else doHTTPGETRequest(get_url())
|
||||||
extractInfo(response)
|
extractInfo(response)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,8 +10,7 @@ import java.util.Locale
|
||||||
import java.util.regex.Pattern
|
import java.util.regex.Pattern
|
||||||
import scala.io.Source
|
import scala.io.Source
|
||||||
|
|
||||||
/**
|
/** This class represent the dataModel of the input Dataset of Datacite
|
||||||
* This class represent the dataModel of the input Dataset of Datacite
|
|
||||||
* @param doi THE DOI
|
* @param doi THE DOI
|
||||||
* @param timestamp timestamp of last update date
|
* @param timestamp timestamp of last update date
|
||||||
* @param isActive the record is active or deleted
|
* @param isActive the record is active or deleted
|
||||||
|
@ -23,11 +22,26 @@ case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: S
|
||||||
The following class are utility class used for the mapping from
|
The following class are utility class used for the mapping from
|
||||||
json datacite to OAF Shema
|
json datacite to OAF Shema
|
||||||
*/
|
*/
|
||||||
case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {}
|
case class RelatedIdentifierType(
|
||||||
|
relationType: String,
|
||||||
|
relatedIdentifier: String,
|
||||||
|
relatedIdentifierType: String
|
||||||
|
) {}
|
||||||
|
|
||||||
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
|
case class NameIdentifiersType(
|
||||||
|
nameIdentifierScheme: Option[String],
|
||||||
|
schemeUri: Option[String],
|
||||||
|
nameIdentifier: Option[String]
|
||||||
|
) {}
|
||||||
|
|
||||||
case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
|
case class CreatorType(
|
||||||
|
nameType: Option[String],
|
||||||
|
nameIdentifiers: Option[List[NameIdentifiersType]],
|
||||||
|
name: Option[String],
|
||||||
|
familyName: Option[String],
|
||||||
|
givenName: Option[String],
|
||||||
|
affiliation: Option[List[String]]
|
||||||
|
) {}
|
||||||
|
|
||||||
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
|
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
|
||||||
|
|
||||||
|
@ -35,16 +49,20 @@ case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {
|
||||||
|
|
||||||
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
|
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
|
||||||
|
|
||||||
case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
|
case class FundingReferenceType(
|
||||||
|
funderIdentifierType: Option[String],
|
||||||
|
awardTitle: Option[String],
|
||||||
|
awardUri: Option[String],
|
||||||
|
funderName: Option[String],
|
||||||
|
funderIdentifier: Option[String],
|
||||||
|
awardNumber: Option[String]
|
||||||
|
) {}
|
||||||
|
|
||||||
case class DateType(date: Option[String], dateType: Option[String]) {}
|
case class DateType(date: Option[String], dateType: Option[String]) {}
|
||||||
|
|
||||||
case class OAFRelations(relation: String, inverse: String, relType: String)
|
case class OAFRelations(relation: String, inverse: String, relType: String)
|
||||||
|
|
||||||
|
class DataciteModelConstants extends Serializable {}
|
||||||
class DataciteModelConstants extends Serializable {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
object DataciteModelConstants {
|
object DataciteModelConstants {
|
||||||
|
|
||||||
|
@ -55,51 +73,147 @@ object DataciteModelConstants {
|
||||||
val SUBJ_CLASS = "keywords"
|
val SUBJ_CLASS = "keywords"
|
||||||
val DATACITE_NAME = "Datacite"
|
val DATACITE_NAME = "Datacite"
|
||||||
val dataInfo: DataInfo = dataciteDataInfo("0.9")
|
val dataInfo: DataInfo = dataciteDataInfo("0.9")
|
||||||
val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)
|
|
||||||
|
val DATACITE_COLLECTED_FROM: KeyValue =
|
||||||
|
OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)
|
||||||
|
|
||||||
val subRelTypeMapping: Map[String, OAFRelations] = Map(
|
val subRelTypeMapping: Map[String, OAFRelations] = Map(
|
||||||
ModelConstants.REFERENCES -> OAFRelations(ModelConstants.REFERENCES, ModelConstants.IS_REFERENCED_BY, ModelConstants.RELATIONSHIP),
|
ModelConstants.REFERENCES -> OAFRelations(
|
||||||
ModelConstants.IS_REFERENCED_BY -> OAFRelations(ModelConstants.IS_REFERENCED_BY,ModelConstants.REFERENCES, ModelConstants.RELATIONSHIP),
|
ModelConstants.REFERENCES,
|
||||||
|
ModelConstants.IS_REFERENCED_BY,
|
||||||
ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(ModelConstants.IS_SUPPLEMENTED_BY,ModelConstants.IS_SUPPLEMENT_TO,ModelConstants.SUPPLEMENT),
|
ModelConstants.RELATIONSHIP
|
||||||
ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(ModelConstants.IS_SUPPLEMENT_TO,ModelConstants.IS_SUPPLEMENTED_BY,ModelConstants.SUPPLEMENT),
|
),
|
||||||
|
ModelConstants.IS_REFERENCED_BY -> OAFRelations(
|
||||||
ModelConstants.HAS_PART -> OAFRelations(ModelConstants.HAS_PART,ModelConstants.IS_PART_OF, ModelConstants.PART),
|
ModelConstants.IS_REFERENCED_BY,
|
||||||
ModelConstants.IS_PART_OF -> OAFRelations(ModelConstants.IS_PART_OF,ModelConstants.HAS_PART, ModelConstants.PART),
|
ModelConstants.REFERENCES,
|
||||||
|
ModelConstants.RELATIONSHIP
|
||||||
ModelConstants.IS_VERSION_OF-> OAFRelations(ModelConstants.IS_VERSION_OF,ModelConstants.HAS_VERSION,ModelConstants.VERSION),
|
),
|
||||||
ModelConstants.HAS_VERSION-> OAFRelations(ModelConstants.HAS_VERSION,ModelConstants.IS_VERSION_OF,ModelConstants.VERSION),
|
ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(
|
||||||
|
ModelConstants.IS_SUPPLEMENTED_BY,
|
||||||
ModelConstants.IS_IDENTICAL_TO -> OAFRelations(ModelConstants.IS_IDENTICAL_TO,ModelConstants.IS_IDENTICAL_TO, ModelConstants.RELATIONSHIP),
|
ModelConstants.IS_SUPPLEMENT_TO,
|
||||||
|
ModelConstants.SUPPLEMENT
|
||||||
ModelConstants.IS_CONTINUED_BY -> OAFRelations(ModelConstants.IS_CONTINUED_BY,ModelConstants.CONTINUES, ModelConstants.RELATIONSHIP),
|
),
|
||||||
ModelConstants.CONTINUES -> OAFRelations(ModelConstants.CONTINUES,ModelConstants.IS_CONTINUED_BY, ModelConstants.RELATIONSHIP),
|
ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(
|
||||||
|
ModelConstants.IS_SUPPLEMENT_TO,
|
||||||
ModelConstants.IS_NEW_VERSION_OF-> OAFRelations(ModelConstants.IS_NEW_VERSION_OF,ModelConstants.IS_PREVIOUS_VERSION_OF, ModelConstants.VERSION),
|
ModelConstants.IS_SUPPLEMENTED_BY,
|
||||||
ModelConstants.IS_PREVIOUS_VERSION_OF ->OAFRelations(ModelConstants.IS_PREVIOUS_VERSION_OF,ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
|
ModelConstants.SUPPLEMENT
|
||||||
|
),
|
||||||
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(ModelConstants.IS_DOCUMENTED_BY,ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP),
|
ModelConstants.HAS_PART -> OAFRelations(
|
||||||
ModelConstants.DOCUMENTS -> OAFRelations(ModelConstants.DOCUMENTS,ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP),
|
ModelConstants.HAS_PART,
|
||||||
|
ModelConstants.IS_PART_OF,
|
||||||
ModelConstants.IS_SOURCE_OF -> OAFRelations(ModelConstants.IS_SOURCE_OF,ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
|
ModelConstants.PART
|
||||||
ModelConstants.IS_DERIVED_FROM -> OAFRelations(ModelConstants.IS_DERIVED_FROM,ModelConstants.IS_SOURCE_OF, ModelConstants.VERSION),
|
),
|
||||||
|
ModelConstants.IS_PART_OF -> OAFRelations(
|
||||||
ModelConstants.CITES -> OAFRelations(ModelConstants.CITES,ModelConstants.IS_CITED_BY, ModelConstants.CITATION),
|
ModelConstants.IS_PART_OF,
|
||||||
ModelConstants.IS_CITED_BY -> OAFRelations(ModelConstants.IS_CITED_BY,ModelConstants.CITES, ModelConstants.CITATION),
|
ModelConstants.HAS_PART,
|
||||||
|
ModelConstants.PART
|
||||||
ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(ModelConstants.IS_VARIANT_FORM_OF,ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
|
),
|
||||||
ModelConstants.IS_OBSOLETED_BY -> OAFRelations(ModelConstants.IS_OBSOLETED_BY,ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
|
ModelConstants.IS_VERSION_OF -> OAFRelations(
|
||||||
|
ModelConstants.IS_VERSION_OF,
|
||||||
ModelConstants.REVIEWS -> OAFRelations(ModelConstants.REVIEWS,ModelConstants.IS_REVIEWED_BY, ModelConstants.REVIEW),
|
ModelConstants.HAS_VERSION,
|
||||||
ModelConstants.IS_REVIEWED_BY -> OAFRelations(ModelConstants.IS_REVIEWED_BY,ModelConstants.REVIEWS, ModelConstants.REVIEW),
|
ModelConstants.VERSION
|
||||||
|
),
|
||||||
ModelConstants.DOCUMENTS -> OAFRelations(ModelConstants.DOCUMENTS,ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP),
|
ModelConstants.HAS_VERSION -> OAFRelations(
|
||||||
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(ModelConstants.IS_DOCUMENTED_BY,ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP),
|
ModelConstants.HAS_VERSION,
|
||||||
|
ModelConstants.IS_VERSION_OF,
|
||||||
ModelConstants.COMPILES -> OAFRelations(ModelConstants.COMPILES,ModelConstants.IS_COMPILED_BY, ModelConstants.RELATIONSHIP),
|
ModelConstants.VERSION
|
||||||
ModelConstants.IS_COMPILED_BY -> OAFRelations(ModelConstants.IS_COMPILED_BY,ModelConstants.COMPILES, ModelConstants.RELATIONSHIP)
|
),
|
||||||
|
ModelConstants.IS_IDENTICAL_TO -> OAFRelations(
|
||||||
|
ModelConstants.IS_IDENTICAL_TO,
|
||||||
|
ModelConstants.IS_IDENTICAL_TO,
|
||||||
|
ModelConstants.RELATIONSHIP
|
||||||
|
),
|
||||||
|
ModelConstants.IS_CONTINUED_BY -> OAFRelations(
|
||||||
|
ModelConstants.IS_CONTINUED_BY,
|
||||||
|
ModelConstants.CONTINUES,
|
||||||
|
ModelConstants.RELATIONSHIP
|
||||||
|
),
|
||||||
|
ModelConstants.CONTINUES -> OAFRelations(
|
||||||
|
ModelConstants.CONTINUES,
|
||||||
|
ModelConstants.IS_CONTINUED_BY,
|
||||||
|
ModelConstants.RELATIONSHIP
|
||||||
|
),
|
||||||
|
ModelConstants.IS_NEW_VERSION_OF -> OAFRelations(
|
||||||
|
ModelConstants.IS_NEW_VERSION_OF,
|
||||||
|
ModelConstants.IS_PREVIOUS_VERSION_OF,
|
||||||
|
ModelConstants.VERSION
|
||||||
|
),
|
||||||
|
ModelConstants.IS_PREVIOUS_VERSION_OF -> OAFRelations(
|
||||||
|
ModelConstants.IS_PREVIOUS_VERSION_OF,
|
||||||
|
ModelConstants.IS_NEW_VERSION_OF,
|
||||||
|
ModelConstants.VERSION
|
||||||
|
),
|
||||||
|
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
|
||||||
|
ModelConstants.IS_DOCUMENTED_BY,
|
||||||
|
ModelConstants.DOCUMENTS,
|
||||||
|
ModelConstants.RELATIONSHIP
|
||||||
|
),
|
||||||
|
ModelConstants.DOCUMENTS -> OAFRelations(
|
||||||
|
ModelConstants.DOCUMENTS,
|
||||||
|
ModelConstants.IS_DOCUMENTED_BY,
|
||||||
|
ModelConstants.RELATIONSHIP
|
||||||
|
),
|
||||||
|
ModelConstants.IS_SOURCE_OF -> OAFRelations(
|
||||||
|
ModelConstants.IS_SOURCE_OF,
|
||||||
|
ModelConstants.IS_DERIVED_FROM,
|
||||||
|
ModelConstants.VERSION
|
||||||
|
),
|
||||||
|
ModelConstants.IS_DERIVED_FROM -> OAFRelations(
|
||||||
|
ModelConstants.IS_DERIVED_FROM,
|
||||||
|
ModelConstants.IS_SOURCE_OF,
|
||||||
|
ModelConstants.VERSION
|
||||||
|
),
|
||||||
|
ModelConstants.CITES -> OAFRelations(
|
||||||
|
ModelConstants.CITES,
|
||||||
|
ModelConstants.IS_CITED_BY,
|
||||||
|
ModelConstants.CITATION
|
||||||
|
),
|
||||||
|
ModelConstants.IS_CITED_BY -> OAFRelations(
|
||||||
|
ModelConstants.IS_CITED_BY,
|
||||||
|
ModelConstants.CITES,
|
||||||
|
ModelConstants.CITATION
|
||||||
|
),
|
||||||
|
ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(
|
||||||
|
ModelConstants.IS_VARIANT_FORM_OF,
|
||||||
|
ModelConstants.IS_DERIVED_FROM,
|
||||||
|
ModelConstants.VERSION
|
||||||
|
),
|
||||||
|
ModelConstants.IS_OBSOLETED_BY -> OAFRelations(
|
||||||
|
ModelConstants.IS_OBSOLETED_BY,
|
||||||
|
ModelConstants.IS_NEW_VERSION_OF,
|
||||||
|
ModelConstants.VERSION
|
||||||
|
),
|
||||||
|
ModelConstants.REVIEWS -> OAFRelations(
|
||||||
|
ModelConstants.REVIEWS,
|
||||||
|
ModelConstants.IS_REVIEWED_BY,
|
||||||
|
ModelConstants.REVIEW
|
||||||
|
),
|
||||||
|
ModelConstants.IS_REVIEWED_BY -> OAFRelations(
|
||||||
|
ModelConstants.IS_REVIEWED_BY,
|
||||||
|
ModelConstants.REVIEWS,
|
||||||
|
ModelConstants.REVIEW
|
||||||
|
),
|
||||||
|
ModelConstants.DOCUMENTS -> OAFRelations(
|
||||||
|
ModelConstants.DOCUMENTS,
|
||||||
|
ModelConstants.IS_DOCUMENTED_BY,
|
||||||
|
ModelConstants.RELATIONSHIP
|
||||||
|
),
|
||||||
|
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
|
||||||
|
ModelConstants.IS_DOCUMENTED_BY,
|
||||||
|
ModelConstants.DOCUMENTS,
|
||||||
|
ModelConstants.RELATIONSHIP
|
||||||
|
),
|
||||||
|
ModelConstants.COMPILES -> OAFRelations(
|
||||||
|
ModelConstants.COMPILES,
|
||||||
|
ModelConstants.IS_COMPILED_BY,
|
||||||
|
ModelConstants.RELATIONSHIP
|
||||||
|
),
|
||||||
|
ModelConstants.IS_COMPILED_BY -> OAFRelations(
|
||||||
|
ModelConstants.IS_COMPILED_BY,
|
||||||
|
ModelConstants.COMPILES,
|
||||||
|
ModelConstants.RELATIONSHIP
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
val datacite_filter: List[String] = {
|
val datacite_filter: List[String] = {
|
||||||
val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
|
val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
|
||||||
|
@ -107,28 +221,58 @@ object DataciteModelConstants {
|
||||||
Source.fromInputStream(stream).getLines().toList
|
Source.fromInputStream(stream).getLines().toList
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(
|
||||||
|
false,
|
||||||
|
null,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
|
||||||
|
trust
|
||||||
|
)
|
||||||
|
|
||||||
def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(false,null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, trust)
|
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern(
|
||||||
|
"[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
|
||||||
|
Locale.ENGLISH
|
||||||
|
)
|
||||||
|
|
||||||
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
|
val df_it: DateTimeFormatter =
|
||||||
val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
|
DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
|
||||||
|
|
||||||
val funder_regex: List[(Pattern, String)] = List(
|
val funder_regex: List[(Pattern, String)] = List(
|
||||||
(Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
|
(
|
||||||
(Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
|
Pattern.compile(
|
||||||
|
"(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)",
|
||||||
|
Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
|
||||||
|
),
|
||||||
|
"40|corda__h2020::"
|
||||||
|
),
|
||||||
|
(
|
||||||
|
Pattern.compile(
|
||||||
|
"(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)",
|
||||||
|
Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
|
||||||
|
),
|
||||||
|
"40|corda_______::"
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
val Date_regex: List[Pattern] = List(
|
val Date_regex: List[Pattern] = List(
|
||||||
//Y-M-D
|
//Y-M-D
|
||||||
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
|
Pattern.compile(
|
||||||
|
"(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])",
|
||||||
|
Pattern.MULTILINE
|
||||||
|
),
|
||||||
//M-D-Y
|
//M-D-Y
|
||||||
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
|
Pattern.compile(
|
||||||
|
"((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
|
||||||
|
Pattern.MULTILINE
|
||||||
|
),
|
||||||
//D-M-Y
|
//D-M-Y
|
||||||
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
|
Pattern.compile(
|
||||||
|
"(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
|
||||||
|
Pattern.MULTILINE
|
||||||
|
),
|
||||||
//Y
|
//Y
|
||||||
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
|
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,14 +20,11 @@ import java.time.format.DateTimeFormatter
|
||||||
import java.util.{Date, Locale}
|
import java.util.{Date, Locale}
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
|
|
||||||
|
|
||||||
object DataciteToOAFTransformation {
|
object DataciteToOAFTransformation {
|
||||||
|
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
|
/** This method should skip record if json contains invalid text
|
||||||
/**
|
|
||||||
* This method should skip record if json contains invalid text
|
|
||||||
* defined in gile datacite_filter
|
* defined in gile datacite_filter
|
||||||
*
|
*
|
||||||
* @param json
|
* @param json
|
||||||
|
@ -74,30 +71,30 @@ object DataciteToOAFTransformation {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def embargo_end(embargo_end_date: String): Boolean = {
|
def embargo_end(embargo_end_date: String): Boolean = {
|
||||||
val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
|
val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
|
||||||
val td = LocalDate.now()
|
val td = LocalDate.now()
|
||||||
td.isAfter(dt)
|
td.isAfter(dt)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def extract_date(input: String): Option[String] = {
|
def extract_date(input: String): Option[String] = {
|
||||||
val d = Date_regex.map(pattern => {
|
val d = Date_regex
|
||||||
|
.map(pattern => {
|
||||||
val matcher = pattern.matcher(input)
|
val matcher = pattern.matcher(input)
|
||||||
if (matcher.find())
|
if (matcher.find())
|
||||||
matcher.group(0)
|
matcher.group(0)
|
||||||
else
|
else
|
||||||
null
|
null
|
||||||
}
|
})
|
||||||
).find(s => s != null)
|
.find(s => s != null)
|
||||||
|
|
||||||
if (d.isDefined) {
|
if (d.isDefined) {
|
||||||
val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
|
val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
|
||||||
try {
|
try {
|
||||||
return Some(LocalDate.parse(a_date, df_en).toString)
|
return Some(LocalDate.parse(a_date, df_en).toString)
|
||||||
} catch {
|
} catch {
|
||||||
case _: Throwable => try {
|
case _: Throwable =>
|
||||||
|
try {
|
||||||
return Some(LocalDate.parse(a_date, df_it).toString)
|
return Some(LocalDate.parse(a_date, df_it).toString)
|
||||||
} catch {
|
} catch {
|
||||||
case _: Throwable =>
|
case _: Throwable =>
|
||||||
|
@ -118,31 +115,63 @@ object DataciteToOAFTransformation {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def getTypeQualifier(
|
||||||
def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
|
resourceType: String,
|
||||||
|
resourceTypeGeneral: String,
|
||||||
|
schemaOrg: String,
|
||||||
|
vocabularies: VocabularyGroup
|
||||||
|
): (Qualifier, Qualifier) = {
|
||||||
if (resourceType != null && resourceType.nonEmpty) {
|
if (resourceType != null && resourceType.nonEmpty) {
|
||||||
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
|
val typeQualifier =
|
||||||
|
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
|
||||||
if (typeQualifier != null)
|
if (typeQualifier != null)
|
||||||
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
|
return (
|
||||||
|
typeQualifier,
|
||||||
|
vocabularies.getSynonymAsQualifier(
|
||||||
|
ModelConstants.DNET_RESULT_TYPOLOGIES,
|
||||||
|
typeQualifier.getClassid
|
||||||
|
)
|
||||||
|
)
|
||||||
}
|
}
|
||||||
if (schemaOrg != null && schemaOrg.nonEmpty) {
|
if (schemaOrg != null && schemaOrg.nonEmpty) {
|
||||||
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
|
val typeQualifier =
|
||||||
|
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
|
||||||
if (typeQualifier != null)
|
if (typeQualifier != null)
|
||||||
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
|
return (
|
||||||
|
typeQualifier,
|
||||||
|
vocabularies.getSynonymAsQualifier(
|
||||||
|
ModelConstants.DNET_RESULT_TYPOLOGIES,
|
||||||
|
typeQualifier.getClassid
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
}
|
}
|
||||||
if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
|
if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
|
||||||
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceTypeGeneral)
|
val typeQualifier = vocabularies.getSynonymAsQualifier(
|
||||||
|
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||||
|
resourceTypeGeneral
|
||||||
|
)
|
||||||
if (typeQualifier != null)
|
if (typeQualifier != null)
|
||||||
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
|
return (
|
||||||
|
typeQualifier,
|
||||||
|
vocabularies.getSynonymAsQualifier(
|
||||||
|
ModelConstants.DNET_RESULT_TYPOLOGIES,
|
||||||
|
typeQualifier.getClassid
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
}
|
}
|
||||||
null
|
null
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def getResult(
|
||||||
def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): Result = {
|
resourceType: String,
|
||||||
val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
|
resourceTypeGeneral: String,
|
||||||
|
schemaOrg: String,
|
||||||
|
vocabularies: VocabularyGroup
|
||||||
|
): Result = {
|
||||||
|
val typeQualifiers: (Qualifier, Qualifier) =
|
||||||
|
getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
|
||||||
if (typeQualifiers == null)
|
if (typeQualifiers == null)
|
||||||
return null
|
return null
|
||||||
val i = new Instance
|
val i = new Instance
|
||||||
|
@ -168,7 +197,6 @@ object DataciteToOAFTransformation {
|
||||||
null
|
null
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def available_date(input: String): Boolean = {
|
def available_date(input: String): Boolean = {
|
||||||
|
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
@ -182,9 +210,7 @@ object DataciteToOAFTransformation {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** As describe in ticket #6377
|
||||||
/**
|
|
||||||
* As describe in ticket #6377
|
|
||||||
* when the result come from figshare we need to remove subject
|
* when the result come from figshare we need to remove subject
|
||||||
* and set Access rights OPEN.
|
* and set Access rights OPEN.
|
||||||
*
|
*
|
||||||
|
@ -193,7 +219,10 @@ object DataciteToOAFTransformation {
|
||||||
def fix_figshare(r: Result): Unit = {
|
def fix_figshare(r: Result): Unit = {
|
||||||
|
|
||||||
if (r.getInstance() != null) {
|
if (r.getInstance() != null) {
|
||||||
val hosted_by_figshare = r.getInstance().asScala.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
|
val hosted_by_figshare = r
|
||||||
|
.getInstance()
|
||||||
|
.asScala
|
||||||
|
.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
|
||||||
if (hosted_by_figshare) {
|
if (hosted_by_figshare) {
|
||||||
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
|
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
|
||||||
val l: List[StructuredProperty] = List()
|
val l: List[StructuredProperty] = List()
|
||||||
|
@ -201,10 +230,8 @@ object DataciteToOAFTransformation {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
|
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
|
||||||
val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
|
val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
|
||||||
s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
|
s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
|
||||||
|
@ -214,7 +241,13 @@ object DataciteToOAFTransformation {
|
||||||
OafMapperUtils.structuredProperty(dt, q, null)
|
OafMapperUtils.structuredProperty(dt, q, null)
|
||||||
}
|
}
|
||||||
|
|
||||||
def generateRelation(sourceId: String, targetId: String, relClass: String, cf: KeyValue, di: DataInfo): Relation = {
|
def generateRelation(
|
||||||
|
sourceId: String,
|
||||||
|
targetId: String,
|
||||||
|
relClass: String,
|
||||||
|
cf: KeyValue,
|
||||||
|
di: DataInfo
|
||||||
|
): Relation = {
|
||||||
|
|
||||||
val r = new Relation
|
val r = new Relation
|
||||||
r.setSource(sourceId)
|
r.setSource(sourceId)
|
||||||
|
@ -226,7 +259,6 @@ object DataciteToOAFTransformation {
|
||||||
r.setDataInfo(di)
|
r.setDataInfo(di)
|
||||||
r
|
r
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
|
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
|
||||||
|
@ -238,14 +270,18 @@ object DataciteToOAFTransformation {
|
||||||
val grantId = m.matcher(awardUri).replaceAll("$2")
|
val grantId = m.matcher(awardUri).replaceAll("$2")
|
||||||
val targetId = s"$p${DHPUtils.md5(grantId)}"
|
val targetId = s"$p${DHPUtils.md5(grantId)}"
|
||||||
List(generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo))
|
List(generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo))
|
||||||
}
|
} else
|
||||||
else
|
|
||||||
List()
|
List()
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def generateOAF(
|
||||||
def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = {
|
input: String,
|
||||||
|
ts: Long,
|
||||||
|
dateOfCollection: Long,
|
||||||
|
vocabularies: VocabularyGroup,
|
||||||
|
exportLinks: Boolean
|
||||||
|
): List[Oaf] = {
|
||||||
if (skip_record(input))
|
if (skip_record(input))
|
||||||
return List()
|
return List()
|
||||||
|
|
||||||
|
@ -253,7 +289,8 @@ object DataciteToOAFTransformation {
|
||||||
lazy val json = parse(input)
|
lazy val json = parse(input)
|
||||||
|
|
||||||
val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
|
val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
|
||||||
val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
|
val resourceTypeGeneral =
|
||||||
|
(json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
|
||||||
val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)
|
val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)
|
||||||
|
|
||||||
val doi = (json \ "attributes" \ "doi").extract[String]
|
val doi = (json \ "attributes" \ "doi").extract[String]
|
||||||
|
@ -265,8 +302,12 @@ object DataciteToOAFTransformation {
|
||||||
if (result == null)
|
if (result == null)
|
||||||
return List()
|
return List()
|
||||||
|
|
||||||
|
val doi_q = OafMapperUtils.qualifier(
|
||||||
val doi_q = OafMapperUtils.qualifier("doi", "doi", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES)
|
"doi",
|
||||||
|
"doi",
|
||||||
|
ModelConstants.DNET_PID_TYPES,
|
||||||
|
ModelConstants.DNET_PID_TYPES
|
||||||
|
)
|
||||||
val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
|
val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
|
||||||
result.setPid(List(pid).asJava)
|
result.setPid(List(pid).asJava)
|
||||||
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
|
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
|
||||||
|
@ -275,48 +316,70 @@ object DataciteToOAFTransformation {
|
||||||
val d = new Date(dateOfCollection * 1000)
|
val d = new Date(dateOfCollection * 1000)
|
||||||
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
|
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
|
||||||
|
|
||||||
|
|
||||||
result.setDateofcollection(ISO8601FORMAT.format(d))
|
result.setDateofcollection(ISO8601FORMAT.format(d))
|
||||||
result.setDateoftransformation(ISO8601FORMAT.format(d))
|
result.setDateoftransformation(ISO8601FORMAT.format(d))
|
||||||
result.setDataInfo(dataInfo)
|
result.setDataInfo(dataInfo)
|
||||||
|
|
||||||
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
|
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
|
||||||
|
|
||||||
|
|
||||||
val authors = creators.zipWithIndex.map { case (c, idx) =>
|
val authors = creators.zipWithIndex.map { case (c, idx) =>
|
||||||
val a = new Author
|
val a = new Author
|
||||||
a.setFullname(c.name.orNull)
|
a.setFullname(c.name.orNull)
|
||||||
a.setName(c.givenName.orNull)
|
a.setName(c.givenName.orNull)
|
||||||
a.setSurname(c.familyName.orNull)
|
a.setSurname(c.familyName.orNull)
|
||||||
if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
|
if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
|
||||||
a.setPid(c.nameIdentifiers.get.map(ni => {
|
a.setPid(
|
||||||
val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(ModelConstants.DNET_PID_TYPES, ni.nameIdentifierScheme.get.toLowerCase()) else null
|
c.nameIdentifiers.get
|
||||||
|
.map(ni => {
|
||||||
|
val q =
|
||||||
|
if (ni.nameIdentifierScheme.isDefined)
|
||||||
|
vocabularies.getTermAsQualifier(
|
||||||
|
ModelConstants.DNET_PID_TYPES,
|
||||||
|
ni.nameIdentifierScheme.get.toLowerCase()
|
||||||
|
)
|
||||||
|
else null
|
||||||
if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
|
if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
|
||||||
OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
|
OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
|
||||||
}
|
} else
|
||||||
else
|
|
||||||
null
|
null
|
||||||
|
|
||||||
}
|
})
|
||||||
|
.asJava
|
||||||
)
|
)
|
||||||
.asJava)
|
|
||||||
}
|
}
|
||||||
if (c.affiliation.isDefined)
|
if (c.affiliation.isDefined)
|
||||||
a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava)
|
a.setAffiliation(
|
||||||
|
c.affiliation.get
|
||||||
|
.filter(af => af.nonEmpty)
|
||||||
|
.map(af => OafMapperUtils.field(af, dataInfo))
|
||||||
|
.asJava
|
||||||
|
)
|
||||||
a.setRank(idx + 1)
|
a.setRank(idx + 1)
|
||||||
a
|
a
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
|
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
|
||||||
|
|
||||||
result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => {
|
result.setTitle(
|
||||||
|
titles
|
||||||
|
.filter(t => t.title.nonEmpty)
|
||||||
|
.map(t => {
|
||||||
if (t.titleType.isEmpty) {
|
if (t.titleType.isEmpty) {
|
||||||
OafMapperUtils.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
|
OafMapperUtils
|
||||||
|
.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
|
||||||
} else {
|
} else {
|
||||||
OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, ModelConstants.DNET_DATACITE_TITLE, ModelConstants.DNET_DATACITE_TITLE, null)
|
OafMapperUtils.structuredProperty(
|
||||||
|
t.title.get,
|
||||||
|
t.titleType.get,
|
||||||
|
t.titleType.get,
|
||||||
|
ModelConstants.DNET_DATACITE_TITLE,
|
||||||
|
ModelConstants.DNET_DATACITE_TITLE,
|
||||||
|
null
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}).asJava)
|
})
|
||||||
|
.asJava
|
||||||
|
)
|
||||||
|
|
||||||
if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
|
if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
|
||||||
return List()
|
return List()
|
||||||
|
@ -337,46 +400,81 @@ object DataciteToOAFTransformation {
|
||||||
|
|
||||||
if (a_date.isDefined) {
|
if (a_date.isDefined) {
|
||||||
if (doi.startsWith("10.14457"))
|
if (doi.startsWith("10.14457"))
|
||||||
result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null))
|
result.setEmbargoenddate(
|
||||||
|
OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null)
|
||||||
|
)
|
||||||
else
|
else
|
||||||
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
|
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
|
||||||
}
|
}
|
||||||
if (i_date.isDefined && i_date.get.isDefined) {
|
if (i_date.isDefined && i_date.get.isDefined) {
|
||||||
if (doi.startsWith("10.14457")) {
|
if (doi.startsWith("10.14457")) {
|
||||||
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null))
|
result.setDateofacceptance(
|
||||||
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null))
|
OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)
|
||||||
}
|
)
|
||||||
else {
|
result
|
||||||
|
.getInstance()
|
||||||
|
.get(0)
|
||||||
|
.setDateofacceptance(
|
||||||
|
OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)
|
||||||
|
)
|
||||||
|
} else {
|
||||||
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
|
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
|
||||||
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
|
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
|
||||||
}
|
}
|
||||||
}
|
} else if (publication_year != null) {
|
||||||
else if (publication_year != null) {
|
|
||||||
if (doi.startsWith("10.14457")) {
|
if (doi.startsWith("10.14457")) {
|
||||||
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null))
|
result.setDateofacceptance(
|
||||||
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null))
|
OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)
|
||||||
|
)
|
||||||
|
result
|
||||||
|
.getInstance()
|
||||||
|
.get(0)
|
||||||
|
.setDateofacceptance(
|
||||||
|
OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)
|
||||||
|
)
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
|
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
|
||||||
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
|
result
|
||||||
|
.getInstance()
|
||||||
|
.get(0)
|
||||||
|
.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
result.setRelevantdate(
|
||||||
result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
|
dates
|
||||||
|
.filter(d => d.date.isDefined && d.dateType.isDefined)
|
||||||
.map(d => (extract_date(d.date.get), d.dateType.get))
|
.map(d => (extract_date(d.date.get), d.dateType.get))
|
||||||
.filter(d => d._1.isDefined)
|
.filter(d => d._1.isDefined)
|
||||||
.map(d => (d._1.get, vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())))
|
.map(d =>
|
||||||
|
(
|
||||||
|
d._1.get,
|
||||||
|
vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())
|
||||||
|
)
|
||||||
|
)
|
||||||
.filter(d => d._2 != null)
|
.filter(d => d._2 != null)
|
||||||
.map(d => generateOAFDate(d._1, d._2)).asJava)
|
.map(d => generateOAFDate(d._1, d._2))
|
||||||
|
.asJava
|
||||||
|
)
|
||||||
|
|
||||||
val subjects = (json \\ "subjects").extract[List[SubjectType]]
|
val subjects = (json \\ "subjects").extract[List[SubjectType]]
|
||||||
|
|
||||||
result.setSubject(subjects.filter(s => s.subject.nonEmpty)
|
result.setSubject(
|
||||||
|
subjects
|
||||||
|
.filter(s => s.subject.nonEmpty)
|
||||||
.map(s =>
|
.map(s =>
|
||||||
OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
|
OafMapperUtils.structuredProperty(
|
||||||
).asJava)
|
s.subject.get,
|
||||||
|
SUBJ_CLASS,
|
||||||
|
SUBJ_CLASS,
|
||||||
|
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||||
|
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||||
|
null
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.asJava
|
||||||
|
)
|
||||||
|
|
||||||
result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
|
result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
|
||||||
|
|
||||||
|
@ -384,22 +482,22 @@ object DataciteToOAFTransformation {
|
||||||
|
|
||||||
result.setDescription(
|
result.setDescription(
|
||||||
descriptions
|
descriptions
|
||||||
.filter(d => d.description.isDefined).
|
.filter(d => d.description.isDefined)
|
||||||
map(d =>
|
.map(d => OafMapperUtils.field(d.description.get, null))
|
||||||
OafMapperUtils.field(d.description.get, null)
|
.filter(s => s != null)
|
||||||
).filter(s => s != null).asJava)
|
.asJava
|
||||||
|
)
|
||||||
|
|
||||||
val publisher = (json \\ "publisher").extractOrElse[String](null)
|
val publisher = (json \\ "publisher").extractOrElse[String](null)
|
||||||
if (publisher != null)
|
if (publisher != null)
|
||||||
result.setPublisher(OafMapperUtils.field(publisher, null))
|
result.setPublisher(OafMapperUtils.field(publisher, null))
|
||||||
|
|
||||||
|
|
||||||
val language: String = (json \\ "language").extractOrElse[String](null)
|
val language: String = (json \\ "language").extractOrElse[String](null)
|
||||||
|
|
||||||
if (language != null)
|
if (language != null)
|
||||||
result.setLanguage(vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language))
|
result.setLanguage(
|
||||||
|
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language)
|
||||||
|
)
|
||||||
|
|
||||||
val instance = result.getInstance().get(0)
|
val instance = result.getInstance().get(0)
|
||||||
|
|
||||||
|
@ -410,9 +508,12 @@ object DataciteToOAFTransformation {
|
||||||
JField("rightsUri", JString(rightsUri)) <- rightsList
|
JField("rightsUri", JString(rightsUri)) <- rightsList
|
||||||
} yield rightsUri
|
} yield rightsUri
|
||||||
|
|
||||||
val aRights: Option[AccessRight] = accessRights.map(r => {
|
val aRights: Option[AccessRight] = accessRights
|
||||||
|
.map(r => {
|
||||||
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
|
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
|
||||||
}).find(q => q != null).map(q => {
|
})
|
||||||
|
.find(q => q != null)
|
||||||
|
.map(q => {
|
||||||
val a = new AccessRight
|
val a = new AccessRight
|
||||||
a.setClassid(q.getClassid)
|
a.setClassid(q.getClassid)
|
||||||
a.setClassname(q.getClassname)
|
a.setClassname(q.getClassname)
|
||||||
|
@ -421,18 +522,34 @@ object DataciteToOAFTransformation {
|
||||||
a
|
a
|
||||||
})
|
})
|
||||||
|
|
||||||
|
val access_rights_qualifier =
|
||||||
val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
|
if (aRights.isDefined) aRights.get
|
||||||
|
else
|
||||||
|
OafMapperUtils.accessRight(
|
||||||
|
ModelConstants.UNKNOWN,
|
||||||
|
ModelConstants.NOT_AVAILABLE,
|
||||||
|
ModelConstants.DNET_ACCESS_MODES,
|
||||||
|
ModelConstants.DNET_ACCESS_MODES
|
||||||
|
)
|
||||||
|
|
||||||
if (client.isDefined) {
|
if (client.isDefined) {
|
||||||
|
|
||||||
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID), ModelConstants.UNKNOWN_REPOSITORY.getValue))
|
instance.setHostedby(
|
||||||
|
OafMapperUtils.keyValue(
|
||||||
|
generateDSId(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID),
|
||||||
|
ModelConstants.UNKNOWN_REPOSITORY.getValue
|
||||||
|
)
|
||||||
|
)
|
||||||
instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
|
instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
|
||||||
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
|
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
|
||||||
instance.setAccessright(access_rights_qualifier)
|
instance.setAccessright(access_rights_qualifier)
|
||||||
instance.setPid(result.getPid)
|
instance.setPid(result.getPid)
|
||||||
val license = accessRights
|
val license = accessRights
|
||||||
.find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"))
|
.find(r =>
|
||||||
|
r.startsWith("http") && r.matches(
|
||||||
|
".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"
|
||||||
|
)
|
||||||
|
)
|
||||||
if (license.isDefined)
|
if (license.isDefined)
|
||||||
instance.setLicense(OafMapperUtils.field(license.get, null))
|
instance.setLicense(OafMapperUtils.field(license.get, null))
|
||||||
}
|
}
|
||||||
|
@ -443,7 +560,8 @@ object DataciteToOAFTransformation {
|
||||||
} yield awardUri
|
} yield awardUri
|
||||||
|
|
||||||
result.setId(IdentifierFactory.createIdentifier(result))
|
result.setId(IdentifierFactory.createIdentifier(result))
|
||||||
var relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
|
var relations: List[Relation] =
|
||||||
|
awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
|
||||||
|
|
||||||
fix_figshare(result)
|
fix_figshare(result)
|
||||||
|
|
||||||
|
@ -458,20 +576,27 @@ object DataciteToOAFTransformation {
|
||||||
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
|
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
|
||||||
} yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
|
} yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
|
||||||
|
|
||||||
relations = relations ::: generateRelations(rels, result.getId, if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null)
|
relations = relations ::: generateRelations(
|
||||||
|
rels,
|
||||||
|
result.getId,
|
||||||
|
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null
|
||||||
|
)
|
||||||
}
|
}
|
||||||
if (relations != null && relations.nonEmpty) {
|
if (relations != null && relations.nonEmpty) {
|
||||||
List(result) ::: relations
|
List(result) ::: relations
|
||||||
}
|
} else
|
||||||
else
|
|
||||||
List(result)
|
List(result)
|
||||||
}
|
}
|
||||||
|
|
||||||
private def generateRelations(rels: List[RelatedIdentifierType], id: String, date: String): List[Relation] = {
|
private def generateRelations(
|
||||||
|
rels: List[RelatedIdentifierType],
|
||||||
|
id: String,
|
||||||
|
date: String
|
||||||
|
): List[Relation] = {
|
||||||
rels
|
rels
|
||||||
.filter(r =>
|
.filter(r =>
|
||||||
subRelTypeMapping.contains(r.relationType) && (
|
subRelTypeMapping
|
||||||
r.relatedIdentifierType.equalsIgnoreCase("doi") ||
|
.contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
|
||||||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
|
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
|
||||||
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
|
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
|
||||||
)
|
)
|
||||||
|
@ -490,19 +615,19 @@ object DataciteToOAFTransformation {
|
||||||
rel.setProperties(List(dateProps).asJava)
|
rel.setProperties(List(dateProps).asJava)
|
||||||
|
|
||||||
rel.setSource(id)
|
rel.setSource(id)
|
||||||
rel.setTarget(DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType))
|
rel.setTarget(
|
||||||
|
DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
|
||||||
|
)
|
||||||
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
|
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
|
||||||
rel.getCollectedfrom.asScala.map(c => c.getValue).toList
|
rel.getCollectedfrom.asScala.map(c => c.getValue).toList
|
||||||
rel
|
rel
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def generateDSId(input: String): String = {
|
def generateDSId(input: String): String = {
|
||||||
val b = StringUtils.substringBefore(input, "::")
|
val b = StringUtils.substringBefore(input, "::")
|
||||||
val a = StringUtils.substringAfter(input, "::")
|
val a = StringUtils.substringAfter(input, "::")
|
||||||
s"10|$b::${DHPUtils.md5(a)}"
|
s"10|$b::${DHPUtils.md5(a)}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -12,10 +12,10 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
|
class GenerateDataciteDatasetSpark(propertyPath: String, args: Array[String], log: Logger)
|
||||||
|
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
|
||||||
|
|
||||||
class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log:Logger) extends AbstractScalaApplication(propertyPath, args, log:Logger) {
|
/** Here all the spark applications runs this method
|
||||||
/**
|
|
||||||
* Here all the spark applications runs this method
|
|
||||||
* where the whole logic of the spark node is defined
|
* where the whole logic of the spark node is defined
|
||||||
*/
|
*/
|
||||||
override def run(): Unit = {
|
override def run(): Unit = {
|
||||||
|
@ -46,27 +46,34 @@ class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log
|
||||||
reportTotalSize(targetPath, outputBasePath)
|
reportTotalSize(targetPath, outputBasePath)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** For working with MDStore we need to store in a file on hdfs the size of
|
||||||
/**
|
|
||||||
* For working with MDStore we need to store in a file on hdfs the size of
|
|
||||||
* the current dataset
|
* the current dataset
|
||||||
* @param targetPath
|
* @param targetPath
|
||||||
* @param outputBasePath
|
* @param outputBasePath
|
||||||
*/
|
*/
|
||||||
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
|
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
|
||||||
val total_items = spark.read.text(targetPath).count()
|
val total_items = spark.read.text(targetPath).count()
|
||||||
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$total_items", outputBasePath + MDSTORE_SIZE_PATH)
|
writeHdfsFile(
|
||||||
|
spark.sparkContext.hadoopConfiguration,
|
||||||
|
s"$total_items",
|
||||||
|
outputBasePath + MDSTORE_SIZE_PATH
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** Generate the transformed and cleaned OAF Dataset from the native one
|
||||||
* Generate the transformed and cleaned OAF Dataset from the native one
|
*
|
||||||
|
|
||||||
* @param sourcePath sourcePath of the native Dataset in format JSON/Datacite
|
* @param sourcePath sourcePath of the native Dataset in format JSON/Datacite
|
||||||
* @param exportLinks If true it generates unresolved links
|
* @param exportLinks If true it generates unresolved links
|
||||||
* @param vocabularies vocabularies for cleaning
|
* @param vocabularies vocabularies for cleaning
|
||||||
* @param targetPath the targetPath of the result Dataset
|
* @param targetPath the targetPath of the result Dataset
|
||||||
*/
|
*/
|
||||||
def generateDataciteDataset(sourcePath: String, exportLinks: Boolean, vocabularies: VocabularyGroup, targetPath: String, spark:SparkSession):Unit = {
|
def generateDataciteDataset(
|
||||||
|
sourcePath: String,
|
||||||
|
exportLinks: Boolean,
|
||||||
|
vocabularies: VocabularyGroup,
|
||||||
|
targetPath: String,
|
||||||
|
spark: SparkSession
|
||||||
|
): Unit = {
|
||||||
require(spark != null)
|
require(spark != null)
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
|
@ -74,21 +81,30 @@ class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log
|
||||||
|
|
||||||
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||||
CollectionUtils.saveDataset(
|
CollectionUtils.saveDataset(
|
||||||
spark.read.load(sourcePath).as[DataciteType]
|
spark.read
|
||||||
|
.load(sourcePath)
|
||||||
|
.as[DataciteType]
|
||||||
.filter(d => d.isActive)
|
.filter(d => d.isActive)
|
||||||
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks))
|
.flatMap(d =>
|
||||||
|
DataciteToOAFTransformation
|
||||||
|
.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks)
|
||||||
|
)
|
||||||
.filter(d => d != null),
|
.filter(d => d != null),
|
||||||
targetPath)
|
targetPath
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
object GenerateDataciteDatasetSpark {
|
object GenerateDataciteDatasetSpark {
|
||||||
|
|
||||||
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
|
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
new GenerateDataciteDatasetSpark("/eu/dnetlib/dhp/datacite/generate_dataset_params.json", args, log).initialize().run()
|
new GenerateDataciteDatasetSpark(
|
||||||
|
"/eu/dnetlib/dhp/datacite/generate_dataset_params.json",
|
||||||
|
args,
|
||||||
|
log
|
||||||
|
).initialize().run()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,7 +22,6 @@ object ImportDatacite {
|
||||||
|
|
||||||
val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass)
|
val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass)
|
||||||
|
|
||||||
|
|
||||||
def convertAPIStringToDataciteItem(input: String): DataciteType = {
|
def convertAPIStringToDataciteItem(input: String): DataciteType = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
lazy val json: org.json4s.JValue = parse(input)
|
lazy val json: org.json4s.JValue = parse(input)
|
||||||
|
@ -32,14 +31,26 @@ object ImportDatacite {
|
||||||
|
|
||||||
val timestamp_string = (json \ "attributes" \ "updated").extract[String]
|
val timestamp_string = (json \ "attributes" \ "updated").extract[String]
|
||||||
val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME)
|
val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME)
|
||||||
DataciteType(doi = doi, timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000, isActive = isActive, json = input)
|
DataciteType(
|
||||||
|
doi = doi,
|
||||||
|
timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000,
|
||||||
|
isActive = isActive,
|
||||||
|
json = input
|
||||||
|
)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
|
||||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json")).mkString)
|
val parser = new ArgumentApplicationParser(
|
||||||
|
Source
|
||||||
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val master = parser.get("master")
|
val master = parser.get("master")
|
||||||
|
|
||||||
|
@ -60,7 +71,8 @@ object ImportDatacite {
|
||||||
val spkipImport = parser.get("skipImport")
|
val spkipImport = parser.get("skipImport")
|
||||||
log.info(s"skipImport is $spkipImport")
|
log.info(s"skipImport is $spkipImport")
|
||||||
|
|
||||||
val spark: SparkSession = SparkSession.builder()
|
val spark: SparkSession = SparkSession
|
||||||
|
.builder()
|
||||||
.appName(ImportDatacite.getClass.getSimpleName)
|
.appName(ImportDatacite.getClass.getSimpleName)
|
||||||
.master(master)
|
.master(master)
|
||||||
.getOrCreate()
|
.getOrCreate()
|
||||||
|
@ -78,8 +90,8 @@ object ImportDatacite {
|
||||||
|
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
|
val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] =
|
||||||
val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] = new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
|
new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
|
||||||
|
|
||||||
override def zero: DataciteType = null
|
override def zero: DataciteType = null
|
||||||
|
|
||||||
|
@ -110,13 +122,16 @@ object ImportDatacite {
|
||||||
|
|
||||||
println(s"last Timestamp is $ts")
|
println(s"last Timestamp is $ts")
|
||||||
|
|
||||||
val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
|
val cnt =
|
||||||
|
if ("true".equalsIgnoreCase(spkipImport)) 1
|
||||||
|
else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
|
||||||
|
|
||||||
println(s"Imported from Datacite API $cnt documents")
|
println(s"Imported from Datacite API $cnt documents")
|
||||||
|
|
||||||
if (cnt > 0) {
|
if (cnt > 0) {
|
||||||
|
|
||||||
val inputRdd: RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[Int], classOf[Text])
|
val inputRdd: RDD[DataciteType] = sc
|
||||||
|
.sequenceFile(targetPath, classOf[Int], classOf[Text])
|
||||||
.map(s => s._2.toString)
|
.map(s => s._2.toString)
|
||||||
.map(s => convertAPIStringToDataciteItem(s))
|
.map(s => convertAPIStringToDataciteItem(s))
|
||||||
spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset")
|
spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset")
|
||||||
|
@ -129,7 +144,9 @@ object ImportDatacite {
|
||||||
.agg(dataciteAggregator.toColumn)
|
.agg(dataciteAggregator.toColumn)
|
||||||
.map(s => s._2)
|
.map(s => s._2)
|
||||||
.repartition(4000)
|
.repartition(4000)
|
||||||
.write.mode(SaveMode.Overwrite).save(s"${dataciteDump}_updated")
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"${dataciteDump}_updated")
|
||||||
|
|
||||||
val fs = FileSystem.get(sc.hadoopConfiguration)
|
val fs = FileSystem.get(sc.hadoopConfiguration)
|
||||||
fs.delete(new Path(s"$dataciteDump"), true)
|
fs.delete(new Path(s"$dataciteDump"), true)
|
||||||
|
@ -137,14 +154,24 @@ object ImportDatacite {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs: Int): Long = {
|
private def writeSequenceFile(
|
||||||
|
hdfsTargetPath: Path,
|
||||||
|
timestamp: Long,
|
||||||
|
conf: Configuration,
|
||||||
|
bs: Int
|
||||||
|
): Long = {
|
||||||
var from: Long = timestamp * 1000
|
var from: Long = timestamp * 1000
|
||||||
val delta: Long = 100000000L
|
val delta: Long = 100000000L
|
||||||
var client: DataciteAPIImporter = null
|
var client: DataciteAPIImporter = null
|
||||||
val now: Long = System.currentTimeMillis()
|
val now: Long = System.currentTimeMillis()
|
||||||
var i = 0
|
var i = 0
|
||||||
try {
|
try {
|
||||||
val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text]))
|
val writer = SequenceFile.createWriter(
|
||||||
|
conf,
|
||||||
|
SequenceFile.Writer.file(hdfsTargetPath),
|
||||||
|
SequenceFile.Writer.keyClass(classOf[IntWritable]),
|
||||||
|
SequenceFile.Writer.valueClass(classOf[Text])
|
||||||
|
)
|
||||||
try {
|
try {
|
||||||
var start: Long = System.currentTimeMillis
|
var start: Long = System.currentTimeMillis
|
||||||
while (from < now) {
|
while (from < now) {
|
||||||
|
@ -153,16 +180,16 @@ object ImportDatacite {
|
||||||
val key: IntWritable = new IntWritable(i)
|
val key: IntWritable = new IntWritable(i)
|
||||||
val value: Text = new Text
|
val value: Text = new Text
|
||||||
while (client.hasNext) {
|
while (client.hasNext) {
|
||||||
key.set({
|
key.set {
|
||||||
i += 1;
|
i += 1;
|
||||||
i - 1
|
i - 1
|
||||||
})
|
}
|
||||||
value.set(client.next())
|
value.set(client.next())
|
||||||
writer.append(key, value)
|
writer.append(key, value)
|
||||||
writer.hflush()
|
writer.hflush()
|
||||||
if (i % 1000 == 0) {
|
if (i % 1000 == 0) {
|
||||||
end = System.currentTimeMillis
|
end = System.currentTimeMillis
|
||||||
val time = (end - start) / 1000.0F
|
val time = (end - start) / 1000.0f
|
||||||
println(s"Imported $i in $time seconds")
|
println(s"Imported $i in $time seconds")
|
||||||
start = System.currentTimeMillis
|
start = System.currentTimeMillis
|
||||||
}
|
}
|
||||||
|
@ -174,8 +201,7 @@ object ImportDatacite {
|
||||||
case e: Throwable =>
|
case e: Throwable =>
|
||||||
println("Error", e)
|
println("Error", e)
|
||||||
} finally if (writer != null) writer.close()
|
} finally if (writer != null) writer.close()
|
||||||
}
|
} catch {
|
||||||
catch {
|
|
||||||
case e: Throwable =>
|
case e: Throwable =>
|
||||||
log.error("Error", e)
|
log.error("Error", e)
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,7 +17,13 @@ object SparkDownloadUpdateDatacite {
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
|
||||||
val conf = new SparkConf
|
val conf = new SparkConf
|
||||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")).mkString)
|
val parser = new ArgumentApplicationParser(
|
||||||
|
Source
|
||||||
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val master = parser.get("master")
|
val master = parser.get("master")
|
||||||
val sourcePath = parser.get("sourcePath")
|
val sourcePath = parser.get("sourcePath")
|
||||||
|
@ -26,8 +32,9 @@ object SparkDownloadUpdateDatacite {
|
||||||
val hdfsuri = parser.get("namenode")
|
val hdfsuri = parser.get("namenode")
|
||||||
log.info(s"namenode is $hdfsuri")
|
log.info(s"namenode is $hdfsuri")
|
||||||
|
|
||||||
|
val spark: SparkSession = SparkSession
|
||||||
val spark: SparkSession = SparkSession.builder().config(conf)
|
.builder()
|
||||||
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(master)
|
.master(master)
|
||||||
.getOrCreate()
|
.getOrCreate()
|
||||||
|
@ -37,13 +44,18 @@ object SparkDownloadUpdateDatacite {
|
||||||
|
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
|
val maxDate: String = spark.read
|
||||||
val maxDate: String = spark.read.load(workingPath).as[Oaf].filter(s => s.isInstanceOf[Result]).map(r => r.asInstanceOf[Result].getDateofcollection).select(max("value")).first().getString(0)
|
.load(workingPath)
|
||||||
|
.as[Oaf]
|
||||||
|
.filter(s => s.isInstanceOf[Result])
|
||||||
|
.map(r => r.asInstanceOf[Result].getDateofcollection)
|
||||||
|
.select(max("value"))
|
||||||
|
.first()
|
||||||
|
.getString(0)
|
||||||
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
|
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
|
||||||
val string_to_date = ISO8601FORMAT.parse(maxDate)
|
val string_to_date = ISO8601FORMAT.parse(maxDate)
|
||||||
val ts = string_to_date.getTime
|
val ts = string_to_date.getTime
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -12,13 +12,36 @@ object BioDBToOAF {
|
||||||
|
|
||||||
case class EBILinkItem(id: Long, links: String) {}
|
case class EBILinkItem(id: Long, links: String) {}
|
||||||
|
|
||||||
case class EBILinks(relType: String, date: String, title: String, pmid: String, targetPid: String, targetPidType: String, targetUrl: String) {}
|
case class EBILinks(
|
||||||
|
relType: String,
|
||||||
|
date: String,
|
||||||
|
title: String,
|
||||||
|
pmid: String,
|
||||||
|
targetPid: String,
|
||||||
|
targetPidType: String,
|
||||||
|
targetUrl: String
|
||||||
|
) {}
|
||||||
|
|
||||||
case class UniprotDate(date: String, date_info: String) {}
|
case class UniprotDate(date: String, date_info: String) {}
|
||||||
|
|
||||||
case class ScholixResolved(pid: String, pidType: String, typology: String, tilte: List[String], datasource: List[String], date: List[String], authors: List[String]) {}
|
case class ScholixResolved(
|
||||||
|
pid: String,
|
||||||
|
pidType: String,
|
||||||
|
typology: String,
|
||||||
|
tilte: List[String],
|
||||||
|
datasource: List[String],
|
||||||
|
date: List[String],
|
||||||
|
authors: List[String]
|
||||||
|
) {}
|
||||||
|
|
||||||
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
|
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(
|
||||||
|
false,
|
||||||
|
null,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
|
||||||
|
"0.9"
|
||||||
|
)
|
||||||
val SUBJ_CLASS = "Keywords"
|
val SUBJ_CLASS = "Keywords"
|
||||||
|
|
||||||
val DATE_RELATION_KEY = "RelationDate"
|
val DATE_RELATION_KEY = "RelationDate"
|
||||||
|
@ -35,16 +58,35 @@ object BioDBToOAF {
|
||||||
"geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
|
"geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
val collectedFromMap: Map[String, KeyValue] = {
|
val collectedFromMap: Map[String, KeyValue] = {
|
||||||
val PDBCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2", "Protein Data Bank")
|
val PDBCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||||
val enaCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::c2a591f440598b63d854556beaf01591", "European Nucleotide Archive")
|
"10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2",
|
||||||
val ncbiCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6", "NCBI Nucleotide")
|
"Protein Data Bank"
|
||||||
val UNIPROTCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::296e1abaf1302897a6838d3588cd0310", "UniProtKB/Swiss-Prot")
|
)
|
||||||
val ElsevierCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
|
val enaCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||||
val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature")
|
"10|re3data_____::c2a591f440598b63d854556beaf01591",
|
||||||
val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::83e60e09c222f206c725385f53d7e567c", "EMBL-EBIs Protein Data Bank in Europe (PDBe)")
|
"European Nucleotide Archive"
|
||||||
val pubmedCollectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
|
)
|
||||||
|
val ncbiCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||||
|
"10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6",
|
||||||
|
"NCBI Nucleotide"
|
||||||
|
)
|
||||||
|
val UNIPROTCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||||
|
"10|re3data_____::296e1abaf1302897a6838d3588cd0310",
|
||||||
|
"UniProtKB/Swiss-Prot"
|
||||||
|
)
|
||||||
|
val ElsevierCollectedFrom: KeyValue =
|
||||||
|
OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
|
||||||
|
val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||||
|
"10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e",
|
||||||
|
"Springer Nature"
|
||||||
|
)
|
||||||
|
val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||||
|
"10|opendoar____::83e60e09c222f206c725385f53d7e567c",
|
||||||
|
"EMBL-EBIs Protein Data Bank in Europe (PDBe)"
|
||||||
|
)
|
||||||
|
val pubmedCollectedFrom: KeyValue =
|
||||||
|
OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
|
||||||
|
|
||||||
UNIPROTCollectedFrom.setDataInfo(DATA_INFO)
|
UNIPROTCollectedFrom.setDataInfo(DATA_INFO)
|
||||||
PDBCollectedFrom.setDataInfo(DATA_INFO)
|
PDBCollectedFrom.setDataInfo(DATA_INFO)
|
||||||
|
@ -80,18 +122,32 @@ object BioDBToOAF {
|
||||||
|
|
||||||
val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String])
|
val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String])
|
||||||
|
|
||||||
createRelation(target_pid, target_pid_type, generate_unresolved_id(source_pid, source_pid_type), collectedFromMap("elsevier"), "relationship", relation_semantic, date)
|
createRelation(
|
||||||
|
target_pid,
|
||||||
|
target_pid_type,
|
||||||
|
generate_unresolved_id(source_pid, source_pid_type),
|
||||||
|
collectedFromMap("elsevier"),
|
||||||
|
"relationship",
|
||||||
|
relation_semantic,
|
||||||
|
date
|
||||||
|
)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def scholixResolvedToOAF(input: ScholixResolved): Oaf = {
|
def scholixResolvedToOAF(input: ScholixResolved): Oaf = {
|
||||||
|
|
||||||
val d = new Dataset
|
val d = new Dataset
|
||||||
|
|
||||||
d.setPid(
|
d.setPid(
|
||||||
List(
|
List(
|
||||||
OafMapperUtils.structuredProperty(input.pid.toLowerCase, input.pidType.toLowerCase, input.pidType.toLowerCase, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
|
OafMapperUtils.structuredProperty(
|
||||||
|
input.pid.toLowerCase,
|
||||||
|
input.pidType.toLowerCase,
|
||||||
|
input.pidType.toLowerCase,
|
||||||
|
ModelConstants.DNET_PID_TYPES,
|
||||||
|
ModelConstants.DNET_PID_TYPES,
|
||||||
|
DATA_INFO
|
||||||
|
)
|
||||||
).asJava
|
).asJava
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -101,7 +157,15 @@ object BioDBToOAF {
|
||||||
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.pid.toLowerCase}", true))
|
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.pid.toLowerCase}", true))
|
||||||
|
|
||||||
if (input.tilte != null && input.tilte.nonEmpty)
|
if (input.tilte != null && input.tilte.nonEmpty)
|
||||||
d.setTitle(List(OafMapperUtils.structuredProperty(input.tilte.head, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
|
d.setTitle(
|
||||||
|
List(
|
||||||
|
OafMapperUtils.structuredProperty(
|
||||||
|
input.tilte.head,
|
||||||
|
ModelConstants.MAIN_TITLE_QUALIFIER,
|
||||||
|
DATA_INFO
|
||||||
|
)
|
||||||
|
).asJava
|
||||||
|
)
|
||||||
|
|
||||||
d.setOriginalId(List(input.pid).asJava)
|
d.setOriginalId(List(input.pid).asJava)
|
||||||
val i = new Instance
|
val i = new Instance
|
||||||
|
@ -113,9 +177,23 @@ object BioDBToOAF {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (input.pidType.equalsIgnoreCase("clinicaltrials.gov"))
|
if (input.pidType.equalsIgnoreCase("clinicaltrials.gov"))
|
||||||
i.setInstancetype(OafMapperUtils.qualifier("0037", "Clinical Trial", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
i.setInstancetype(
|
||||||
|
OafMapperUtils.qualifier(
|
||||||
|
"0037",
|
||||||
|
"Clinical Trial",
|
||||||
|
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||||
|
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||||
|
)
|
||||||
|
)
|
||||||
else
|
else
|
||||||
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
i.setInstancetype(
|
||||||
|
OafMapperUtils.qualifier(
|
||||||
|
"0046",
|
||||||
|
"Bioentity",
|
||||||
|
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||||
|
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
if (input.datasource == null || input.datasource.isEmpty)
|
if (input.datasource == null || input.datasource.isEmpty)
|
||||||
return null
|
return null
|
||||||
|
@ -141,7 +219,6 @@ object BioDBToOAF {
|
||||||
d
|
d
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def uniprotToOAF(input: String): List[Oaf] = {
|
def uniprotToOAF(input: String): List[Oaf] = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
lazy val json = parse(input)
|
lazy val json = parse(input)
|
||||||
|
@ -151,7 +228,14 @@ object BioDBToOAF {
|
||||||
|
|
||||||
d.setPid(
|
d.setPid(
|
||||||
List(
|
List(
|
||||||
OafMapperUtils.structuredProperty(pid, "uniprot", "uniprot", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
|
OafMapperUtils.structuredProperty(
|
||||||
|
pid,
|
||||||
|
"uniprot",
|
||||||
|
"uniprot",
|
||||||
|
ModelConstants.DNET_PID_TYPES,
|
||||||
|
ModelConstants.DNET_PID_TYPES,
|
||||||
|
DATA_INFO
|
||||||
|
)
|
||||||
).asJava
|
).asJava
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -162,14 +246,25 @@ object BioDBToOAF {
|
||||||
val title: String = (json \ "title").extractOrElse[String](null)
|
val title: String = (json \ "title").extractOrElse[String](null)
|
||||||
|
|
||||||
if (title != null)
|
if (title != null)
|
||||||
d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
|
d.setTitle(
|
||||||
|
List(
|
||||||
|
OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)
|
||||||
|
).asJava
|
||||||
|
)
|
||||||
|
|
||||||
d.setOriginalId(List(pid).asJava)
|
d.setOriginalId(List(pid).asJava)
|
||||||
val i = new Instance
|
val i = new Instance
|
||||||
|
|
||||||
i.setPid(d.getPid)
|
i.setPid(d.getPid)
|
||||||
i.setUrl(List(s"https://www.uniprot.org/uniprot/$pid").asJava)
|
i.setUrl(List(s"https://www.uniprot.org/uniprot/$pid").asJava)
|
||||||
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
i.setInstancetype(
|
||||||
|
OafMapperUtils.qualifier(
|
||||||
|
"0046",
|
||||||
|
"Bioentity",
|
||||||
|
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||||
|
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
i.setCollectedfrom(collectedFromMap("uniprot"))
|
i.setCollectedfrom(collectedFromMap("uniprot"))
|
||||||
d.setInstance(List(i).asJava)
|
d.setInstance(List(i).asJava)
|
||||||
|
@ -182,12 +277,21 @@ object BioDBToOAF {
|
||||||
|
|
||||||
val subjects: List[String] = (json \\ "subjects").extractOrElse[List[String]](null)
|
val subjects: List[String] = (json \\ "subjects").extractOrElse[List[String]](null)
|
||||||
|
|
||||||
|
|
||||||
if (subjects != null) {
|
if (subjects != null) {
|
||||||
d.setSubject(
|
d.setSubject(
|
||||||
subjects.map(s =>
|
subjects
|
||||||
OafMapperUtils.structuredProperty(s, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
|
.map(s =>
|
||||||
).asJava)
|
OafMapperUtils.structuredProperty(
|
||||||
|
s,
|
||||||
|
SUBJ_CLASS,
|
||||||
|
SUBJ_CLASS,
|
||||||
|
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||||
|
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||||
|
null
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.asJava
|
||||||
|
)
|
||||||
}
|
}
|
||||||
var i_date: Option[UniprotDate] = None
|
var i_date: Option[UniprotDate] = None
|
||||||
|
|
||||||
|
@ -197,14 +301,23 @@ object BioDBToOAF {
|
||||||
i.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
|
i.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
|
||||||
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
|
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
|
||||||
}
|
}
|
||||||
val relevant_dates: List[StructuredProperty] = dates.filter(d => !d.date_info.contains("entry version"))
|
val relevant_dates: List[StructuredProperty] = dates
|
||||||
.map(date => OafMapperUtils.structuredProperty(date.date, ModelConstants.UNKNOWN, ModelConstants.UNKNOWN, ModelConstants.DNET_DATACITE_DATE, ModelConstants.DNET_DATACITE_DATE, DATA_INFO))
|
.filter(d => !d.date_info.contains("entry version"))
|
||||||
|
.map(date =>
|
||||||
|
OafMapperUtils.structuredProperty(
|
||||||
|
date.date,
|
||||||
|
ModelConstants.UNKNOWN,
|
||||||
|
ModelConstants.UNKNOWN,
|
||||||
|
ModelConstants.DNET_DATACITE_DATE,
|
||||||
|
ModelConstants.DNET_DATACITE_DATE,
|
||||||
|
DATA_INFO
|
||||||
|
)
|
||||||
|
)
|
||||||
if (relevant_dates != null && relevant_dates.nonEmpty)
|
if (relevant_dates != null && relevant_dates.nonEmpty)
|
||||||
d.setRelevantdate(relevant_dates.asJava)
|
d.setRelevantdate(relevant_dates.asJava)
|
||||||
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
|
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
val references_pmid: List[String] = for {
|
val references_pmid: List[String] = for {
|
||||||
JObject(reference) <- json \ "references"
|
JObject(reference) <- json \ "references"
|
||||||
JField("PubMed", JString(pid)) <- reference
|
JField("PubMed", JString(pid)) <- reference
|
||||||
|
@ -215,27 +328,46 @@ object BioDBToOAF {
|
||||||
JField(" DOI", JString(pid)) <- reference
|
JField(" DOI", JString(pid)) <- reference
|
||||||
} yield pid
|
} yield pid
|
||||||
|
|
||||||
|
|
||||||
if (references_pmid != null && references_pmid.nonEmpty) {
|
if (references_pmid != null && references_pmid.nonEmpty) {
|
||||||
val rel = createRelation(references_pmid.head, "pmid", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
|
val rel = createRelation(
|
||||||
|
references_pmid.head,
|
||||||
|
"pmid",
|
||||||
|
d.getId,
|
||||||
|
collectedFromMap("uniprot"),
|
||||||
|
ModelConstants.RELATIONSHIP,
|
||||||
|
ModelConstants.IS_RELATED_TO,
|
||||||
|
if (i_date.isDefined) i_date.get.date else null
|
||||||
|
)
|
||||||
rel.getCollectedfrom
|
rel.getCollectedfrom
|
||||||
List(d, rel)
|
List(d, rel)
|
||||||
}
|
} else if (references_doi != null && references_doi.nonEmpty) {
|
||||||
else if (references_doi != null && references_doi.nonEmpty) {
|
val rel = createRelation(
|
||||||
val rel = createRelation(references_doi.head, "doi", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
|
references_doi.head,
|
||||||
|
"doi",
|
||||||
|
d.getId,
|
||||||
|
collectedFromMap("uniprot"),
|
||||||
|
ModelConstants.RELATIONSHIP,
|
||||||
|
ModelConstants.IS_RELATED_TO,
|
||||||
|
if (i_date.isDefined) i_date.get.date else null
|
||||||
|
)
|
||||||
List(d, rel)
|
List(d, rel)
|
||||||
}
|
} else
|
||||||
else
|
|
||||||
List(d)
|
List(d)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def generate_unresolved_id(pid: String, pidType: String): String = {
|
def generate_unresolved_id(pid: String, pidType: String): String = {
|
||||||
s"unresolved::$pid::$pidType"
|
s"unresolved::$pid::$pidType"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def createRelation(
|
||||||
def createRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, subRelType: String, relClass: String, date: String): Relation = {
|
pid: String,
|
||||||
|
pidType: String,
|
||||||
|
sourceId: String,
|
||||||
|
collectedFrom: KeyValue,
|
||||||
|
subRelType: String,
|
||||||
|
relClass: String,
|
||||||
|
date: String
|
||||||
|
): Relation = {
|
||||||
|
|
||||||
val rel = new Relation
|
val rel = new Relation
|
||||||
rel.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
|
rel.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
|
||||||
|
@ -248,7 +380,6 @@ object BioDBToOAF {
|
||||||
rel.setSource(sourceId)
|
rel.setSource(sourceId)
|
||||||
rel.setTarget(s"unresolved::$pid::$pidType")
|
rel.setTarget(s"unresolved::$pid::$pidType")
|
||||||
|
|
||||||
|
|
||||||
val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
|
val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
|
||||||
|
|
||||||
rel.setProperties(List(dateProps).asJava)
|
rel.setProperties(List(dateProps).asJava)
|
||||||
|
@ -259,12 +390,24 @@ object BioDBToOAF {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def createSupplementaryRelation(
|
||||||
def createSupplementaryRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, date: String): Relation = {
|
pid: String,
|
||||||
createRelation(pid, pidType, sourceId, collectedFrom, ModelConstants.SUPPLEMENT, ModelConstants.IS_SUPPLEMENT_TO, date)
|
pidType: String,
|
||||||
|
sourceId: String,
|
||||||
|
collectedFrom: KeyValue,
|
||||||
|
date: String
|
||||||
|
): Relation = {
|
||||||
|
createRelation(
|
||||||
|
pid,
|
||||||
|
pidType,
|
||||||
|
sourceId,
|
||||||
|
collectedFrom,
|
||||||
|
ModelConstants.SUPPLEMENT,
|
||||||
|
ModelConstants.IS_SUPPLEMENT_TO,
|
||||||
|
date
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def pdbTOOaf(input: String): List[Oaf] = {
|
def pdbTOOaf(input: String): List[Oaf] = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
lazy val json = parse(input)
|
lazy val json = parse(input)
|
||||||
|
@ -277,7 +420,14 @@ object BioDBToOAF {
|
||||||
|
|
||||||
d.setPid(
|
d.setPid(
|
||||||
List(
|
List(
|
||||||
OafMapperUtils.structuredProperty(pdb, "pdb", "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
|
OafMapperUtils.structuredProperty(
|
||||||
|
pdb,
|
||||||
|
"pdb",
|
||||||
|
"Protein Data Bank Identifier",
|
||||||
|
ModelConstants.DNET_PID_TYPES,
|
||||||
|
ModelConstants.DNET_PID_TYPES,
|
||||||
|
DATA_INFO
|
||||||
|
)
|
||||||
).asJava
|
).asJava
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -290,13 +440,16 @@ object BioDBToOAF {
|
||||||
|
|
||||||
if (title == null)
|
if (title == null)
|
||||||
return List()
|
return List()
|
||||||
d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
|
d.setTitle(
|
||||||
|
List(
|
||||||
|
OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)
|
||||||
|
).asJava
|
||||||
|
)
|
||||||
|
|
||||||
val authors: List[String] = (json \ "authors").extractOrElse[List[String]](null)
|
val authors: List[String] = (json \ "authors").extractOrElse[List[String]](null)
|
||||||
|
|
||||||
if (authors != null) {
|
if (authors != null) {
|
||||||
val convertedAuthors = authors.zipWithIndex.map { a =>
|
val convertedAuthors = authors.zipWithIndex.map { a =>
|
||||||
|
|
||||||
val res = new Author
|
val res = new Author
|
||||||
res.setFullname(a._1)
|
res.setFullname(a._1)
|
||||||
res.setRank(a._2 + 1)
|
res.setRank(a._2 + 1)
|
||||||
|
@ -310,7 +463,14 @@ object BioDBToOAF {
|
||||||
|
|
||||||
i.setPid(d.getPid)
|
i.setPid(d.getPid)
|
||||||
i.setUrl(List(s"https://www.rcsb.org/structure/$pdb").asJava)
|
i.setUrl(List(s"https://www.rcsb.org/structure/$pdb").asJava)
|
||||||
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
i.setInstancetype(
|
||||||
|
OafMapperUtils.qualifier(
|
||||||
|
"0046",
|
||||||
|
"Bioentity",
|
||||||
|
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||||
|
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
i.setCollectedfrom(collectedFromMap("pdb"))
|
i.setCollectedfrom(collectedFromMap("pdb"))
|
||||||
d.setInstance(List(i).asJava)
|
d.setInstance(List(i).asJava)
|
||||||
|
@ -323,7 +483,6 @@ object BioDBToOAF {
|
||||||
List(d)
|
List(d)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def extractEBILinksFromDump(input: String): EBILinkItem = {
|
def extractEBILinksFromDump(input: String): EBILinkItem = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
lazy val json = parse(input)
|
lazy val json = parse(input)
|
||||||
|
@ -333,14 +492,14 @@ object BioDBToOAF {
|
||||||
EBILinkItem(pmid.toLong, compact(render(links)))
|
EBILinkItem(pmid.toLong, compact(render(links)))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def EBITargetLinksFilter(input: EBILinks): Boolean = {
|
def EBITargetLinksFilter(input: EBILinks): Boolean = {
|
||||||
|
|
||||||
input.targetPidType.equalsIgnoreCase("ena") || input.targetPidType.equalsIgnoreCase("pdb") || input.targetPidType.equalsIgnoreCase("uniprot")
|
input.targetPidType.equalsIgnoreCase("ena") || input.targetPidType.equalsIgnoreCase(
|
||||||
|
"pdb"
|
||||||
|
) || input.targetPidType.equalsIgnoreCase("uniprot")
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def parse_ebi_links(input: String): List[EBILinks] = {
|
def parse_ebi_links(input: String): List[EBILinks] = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
lazy val json = parse(input)
|
lazy val json = parse(input)
|
||||||
|
@ -357,25 +516,46 @@ object BioDBToOAF {
|
||||||
JField("IDURL", JString(idUrl)) <- identifier
|
JField("IDURL", JString(idUrl)) <- identifier
|
||||||
JField("ID", JString(id)) <- identifier
|
JField("ID", JString(id)) <- identifier
|
||||||
|
|
||||||
} yield EBILinks(relation, GraphCleaningFunctions.cleanDate(publicationDate), title, pmid, id, idScheme, idUrl)
|
} yield EBILinks(
|
||||||
|
relation,
|
||||||
|
GraphCleaningFunctions.cleanDate(publicationDate),
|
||||||
|
title,
|
||||||
|
pmid,
|
||||||
|
id,
|
||||||
|
idScheme,
|
||||||
|
idUrl
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def convertEBILinksToOaf(input: EBILinks): List[Oaf] = {
|
def convertEBILinksToOaf(input: EBILinks): List[Oaf] = {
|
||||||
val d = new Dataset
|
val d = new Dataset
|
||||||
d.setCollectedfrom(List(collectedFromMap("ebi")).asJava)
|
d.setCollectedfrom(List(collectedFromMap("ebi")).asJava)
|
||||||
d.setDataInfo(DATA_INFO)
|
d.setDataInfo(DATA_INFO)
|
||||||
d.setTitle(List(OafMapperUtils.structuredProperty(input.title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
|
d.setTitle(
|
||||||
|
List(
|
||||||
|
OafMapperUtils.structuredProperty(
|
||||||
|
input.title,
|
||||||
|
ModelConstants.MAIN_TITLE_QUALIFIER,
|
||||||
|
DATA_INFO
|
||||||
|
)
|
||||||
|
).asJava
|
||||||
|
)
|
||||||
|
|
||||||
val nsPrefix = input.targetPidType.toLowerCase.padTo(12, '_')
|
val nsPrefix = input.targetPidType.toLowerCase.padTo(12, '_')
|
||||||
|
|
||||||
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.targetPid.toLowerCase}", true))
|
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.targetPid.toLowerCase}", true))
|
||||||
d.setOriginalId(List(input.targetPid.toLowerCase).asJava)
|
d.setOriginalId(List(input.targetPid.toLowerCase).asJava)
|
||||||
|
|
||||||
|
|
||||||
d.setPid(
|
d.setPid(
|
||||||
List(
|
List(
|
||||||
OafMapperUtils.structuredProperty(input.targetPid.toLowerCase, input.targetPidType.toLowerCase, "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
|
OafMapperUtils.structuredProperty(
|
||||||
|
input.targetPid.toLowerCase,
|
||||||
|
input.targetPidType.toLowerCase,
|
||||||
|
"Protein Data Bank Identifier",
|
||||||
|
ModelConstants.DNET_PID_TYPES,
|
||||||
|
ModelConstants.DNET_PID_TYPES,
|
||||||
|
DATA_INFO
|
||||||
|
)
|
||||||
).asJava
|
).asJava
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -383,13 +563,35 @@ object BioDBToOAF {
|
||||||
|
|
||||||
i.setPid(d.getPid)
|
i.setPid(d.getPid)
|
||||||
i.setUrl(List(input.targetUrl).asJava)
|
i.setUrl(List(input.targetUrl).asJava)
|
||||||
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
i.setInstancetype(
|
||||||
|
OafMapperUtils.qualifier(
|
||||||
|
"0046",
|
||||||
|
"Bioentity",
|
||||||
|
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||||
|
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
i.setCollectedfrom(collectedFromMap("ebi"))
|
i.setCollectedfrom(collectedFromMap("ebi"))
|
||||||
d.setInstance(List(i).asJava)
|
d.setInstance(List(i).asJava)
|
||||||
i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO))
|
i.setDateofacceptance(
|
||||||
d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO))
|
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO)
|
||||||
|
)
|
||||||
|
d.setDateofacceptance(
|
||||||
|
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO)
|
||||||
|
)
|
||||||
|
|
||||||
List(d, createRelation(input.pmid, "pmid", d.getId, collectedFromMap("ebi"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, GraphCleaningFunctions.cleanDate(input.date)))
|
List(
|
||||||
|
d,
|
||||||
|
createRelation(
|
||||||
|
input.pmid,
|
||||||
|
"pmid",
|
||||||
|
d.getId,
|
||||||
|
collectedFromMap("ebi"),
|
||||||
|
ModelConstants.RELATIONSHIP,
|
||||||
|
ModelConstants.IS_RELATED_TO,
|
||||||
|
GraphCleaningFunctions.cleanDate(input.date)
|
||||||
|
)
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,7 +14,11 @@ object SparkTransformBioDatabaseToOAF {
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json")
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val database: String = parser.get("database")
|
val database: String = parser.get("database")
|
||||||
log.info("database: {}", database)
|
log.info("database: {}", database)
|
||||||
|
@ -29,20 +33,33 @@ object SparkTransformBioDatabaseToOAF {
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
val sc = spark.sparkContext
|
val sc = spark.sparkContext
|
||||||
|
|
||||||
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
database.toUpperCase() match {
|
database.toUpperCase() match {
|
||||||
case "UNIPROT" =>
|
case "UNIPROT" =>
|
||||||
CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))), targetPath)
|
CollectionUtils.saveDataset(
|
||||||
|
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))),
|
||||||
|
targetPath
|
||||||
|
)
|
||||||
case "PDB" =>
|
case "PDB" =>
|
||||||
CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))), targetPath)
|
CollectionUtils.saveDataset(
|
||||||
|
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))),
|
||||||
|
targetPath
|
||||||
|
)
|
||||||
case "SCHOLIX" =>
|
case "SCHOLIX" =>
|
||||||
CollectionUtils.saveDataset(spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)), targetPath)
|
CollectionUtils.saveDataset(
|
||||||
|
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)),
|
||||||
|
targetPath
|
||||||
|
)
|
||||||
case "CROSSREF_LINKS" =>
|
case "CROSSREF_LINKS" =>
|
||||||
CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))), targetPath)
|
CollectionUtils.saveDataset(
|
||||||
|
spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))),
|
||||||
|
targetPath
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -24,11 +24,12 @@ import scala.xml.pull.XMLEventReader
|
||||||
|
|
||||||
object SparkCreateBaselineDataFrame {
|
object SparkCreateBaselineDataFrame {
|
||||||
|
|
||||||
|
|
||||||
def requestBaseLineUpdatePage(maxFile: String): List[(String, String)] = {
|
def requestBaseLineUpdatePage(maxFile: String): List[(String, String)] = {
|
||||||
val data = requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/")
|
val data = requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/")
|
||||||
|
|
||||||
val result = data.lines.filter(l => l.startsWith("<a href=")).map { l =>
|
val result = data.lines
|
||||||
|
.filter(l => l.startsWith("<a href="))
|
||||||
|
.map { l =>
|
||||||
val end = l.lastIndexOf("\">")
|
val end = l.lastIndexOf("\">")
|
||||||
val start = l.indexOf("<a href=\"")
|
val start = l.indexOf("<a href=\"")
|
||||||
|
|
||||||
|
@ -36,19 +37,24 @@ object SparkCreateBaselineDataFrame {
|
||||||
l.substring(start + 9, end - start)
|
l.substring(start + 9, end - start)
|
||||||
else
|
else
|
||||||
""
|
""
|
||||||
}.filter(s => s.endsWith(".gz")).filter(s => s > maxFile).map(s => (s, s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s")).toList
|
}
|
||||||
|
.filter(s => s.endsWith(".gz"))
|
||||||
|
.filter(s => s > maxFile)
|
||||||
|
.map(s => (s, s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s"))
|
||||||
|
.toList
|
||||||
|
|
||||||
result
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def downloadBaselinePart(url: String): InputStream = {
|
def downloadBaselinePart(url: String): InputStream = {
|
||||||
val r = new HttpGet(url)
|
val r = new HttpGet(url)
|
||||||
val timeout = 60; // seconds
|
val timeout = 60; // seconds
|
||||||
val config = RequestConfig.custom()
|
val config = RequestConfig
|
||||||
|
.custom()
|
||||||
.setConnectTimeout(timeout * 1000)
|
.setConnectTimeout(timeout * 1000)
|
||||||
.setConnectionRequestTimeout(timeout * 1000)
|
.setConnectionRequestTimeout(timeout * 1000)
|
||||||
.setSocketTimeout(timeout * 1000).build()
|
.setSocketTimeout(timeout * 1000)
|
||||||
|
.build()
|
||||||
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||||
val response = client.execute(r)
|
val response = client.execute(r)
|
||||||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||||
|
@ -59,10 +65,12 @@ object SparkCreateBaselineDataFrame {
|
||||||
def requestPage(url: String): String = {
|
def requestPage(url: String): String = {
|
||||||
val r = new HttpGet(url)
|
val r = new HttpGet(url)
|
||||||
val timeout = 60; // seconds
|
val timeout = 60; // seconds
|
||||||
val config = RequestConfig.custom()
|
val config = RequestConfig
|
||||||
|
.custom()
|
||||||
.setConnectTimeout(timeout * 1000)
|
.setConnectTimeout(timeout * 1000)
|
||||||
.setConnectionRequestTimeout(timeout * 1000)
|
.setConnectionRequestTimeout(timeout * 1000)
|
||||||
.setSocketTimeout(timeout * 1000).build()
|
.setSocketTimeout(timeout * 1000)
|
||||||
|
.build()
|
||||||
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||||
try {
|
try {
|
||||||
var tries = 4
|
var tries = 4
|
||||||
|
@ -73,8 +81,7 @@ object SparkCreateBaselineDataFrame {
|
||||||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||||
if (response.getStatusLine.getStatusCode > 400) {
|
if (response.getStatusLine.getStatusCode > 400) {
|
||||||
tries -= 1
|
tries -= 1
|
||||||
}
|
} else
|
||||||
else
|
|
||||||
return IOUtils.toString(response.getEntity.getContent)
|
return IOUtils.toString(response.getEntity.getContent)
|
||||||
} catch {
|
} catch {
|
||||||
case e: Throwable =>
|
case e: Throwable =>
|
||||||
|
@ -90,10 +97,8 @@ object SparkCreateBaselineDataFrame {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def downloadBaseLineUpdate(baselinePath: String, hdfsServerUri: String): Unit = {
|
def downloadBaseLineUpdate(baselinePath: String, hdfsServerUri: String): Unit = {
|
||||||
|
|
||||||
|
|
||||||
val conf = new Configuration
|
val conf = new Configuration
|
||||||
conf.set("fs.defaultFS", hdfsServerUri)
|
conf.set("fs.defaultFS", hdfsServerUri)
|
||||||
val fs = FileSystem.get(conf)
|
val fs = FileSystem.get(conf)
|
||||||
|
@ -122,8 +127,8 @@ object SparkCreateBaselineDataFrame {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
val pmArticleAggregator: Aggregator[(String, PMArticle), PMArticle, PMArticle] =
|
||||||
val pmArticleAggregator: Aggregator[(String, PMArticle), PMArticle, PMArticle] = new Aggregator[(String, PMArticle), PMArticle, PMArticle] with Serializable {
|
new Aggregator[(String, PMArticle), PMArticle, PMArticle] with Serializable {
|
||||||
override def zero: PMArticle = new PMArticle
|
override def zero: PMArticle = new PMArticle
|
||||||
|
|
||||||
override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle = {
|
override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle = {
|
||||||
|
@ -142,11 +147,16 @@ object SparkCreateBaselineDataFrame {
|
||||||
override def outputEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
|
override def outputEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEBILinksToOaf.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
SparkEBILinksToOaf.getClass.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val isLookupUrl: String = parser.get("isLookupUrl")
|
val isLookupUrl: String = parser.get("isLookupUrl")
|
||||||
log.info("isLookupUrl: {}", isLookupUrl)
|
log.info("isLookupUrl: {}", isLookupUrl)
|
||||||
|
@ -162,7 +172,6 @@ object SparkCreateBaselineDataFrame {
|
||||||
val skipUpdate = parser.get("skipUpdate")
|
val skipUpdate = parser.get("skipUpdate")
|
||||||
log.info("skipUpdate: {}", skipUpdate)
|
log.info("skipUpdate: {}", skipUpdate)
|
||||||
|
|
||||||
|
|
||||||
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
|
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
|
||||||
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
|
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
|
@ -170,7 +179,8 @@ object SparkCreateBaselineDataFrame {
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
|
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
val sc = spark.sparkContext
|
val sc = spark.sparkContext
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
@ -183,20 +193,30 @@ object SparkCreateBaselineDataFrame {
|
||||||
if (!"true".equalsIgnoreCase(skipUpdate)) {
|
if (!"true".equalsIgnoreCase(skipUpdate)) {
|
||||||
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
|
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
|
||||||
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
|
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
|
||||||
val ds: Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i => {
|
val ds: Dataset[PMArticle] = spark.createDataset(
|
||||||
|
k.filter(i => i._1.endsWith(".gz"))
|
||||||
|
.flatMap(i => {
|
||||||
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
|
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
|
||||||
new PMParser(xml)
|
new PMParser(xml)
|
||||||
}))
|
})
|
||||||
ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder)).groupByKey(_._1)
|
)
|
||||||
|
ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder))
|
||||||
|
.groupByKey(_._1)
|
||||||
.agg(pmArticleAggregator.toColumn)
|
.agg(pmArticleAggregator.toColumn)
|
||||||
.map(p => p._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
|
.map(p => p._2)
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingPath/baseline_dataset")
|
||||||
}
|
}
|
||||||
|
|
||||||
val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle]
|
val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle]
|
||||||
CollectionUtils.saveDataset(exported_dataset
|
CollectionUtils.saveDataset(
|
||||||
.map(a => PubMedToOaf.convert(a, vocabularies)).as[Oaf]
|
exported_dataset
|
||||||
|
.map(a => PubMedToOaf.convert(a, vocabularies))
|
||||||
|
.as[Oaf]
|
||||||
.filter(p => p != null),
|
.filter(p => p != null),
|
||||||
targetPath)
|
targetPath
|
||||||
|
)
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,10 +25,12 @@ object SparkDownloadEBILinks {
|
||||||
def requestPage(url: String): String = {
|
def requestPage(url: String): String = {
|
||||||
val r = new HttpGet(url)
|
val r = new HttpGet(url)
|
||||||
val timeout = 60; // seconds
|
val timeout = 60; // seconds
|
||||||
val config = RequestConfig.custom()
|
val config = RequestConfig
|
||||||
|
.custom()
|
||||||
.setConnectTimeout(timeout * 1000)
|
.setConnectTimeout(timeout * 1000)
|
||||||
.setConnectionRequestTimeout(timeout * 1000)
|
.setConnectionRequestTimeout(timeout * 1000)
|
||||||
.setSocketTimeout(timeout * 1000).build()
|
.setSocketTimeout(timeout * 1000)
|
||||||
|
.build()
|
||||||
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||||
try {
|
try {
|
||||||
var tries = 4
|
var tries = 4
|
||||||
|
@ -39,8 +41,7 @@ object SparkDownloadEBILinks {
|
||||||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||||
if (response.getStatusLine.getStatusCode > 400) {
|
if (response.getStatusLine.getStatusCode > 400) {
|
||||||
tries -= 1
|
tries -= 1
|
||||||
}
|
} else
|
||||||
else
|
|
||||||
return IOUtils.toString(response.getEntity.getContent)
|
return IOUtils.toString(response.getEntity.getContent)
|
||||||
} catch {
|
} catch {
|
||||||
case e: Throwable =>
|
case e: Throwable =>
|
||||||
|
@ -66,14 +67,19 @@ object SparkDownloadEBILinks {
|
||||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val MAX_ITEM_PER_PARTITION = 20000
|
val MAX_ITEM_PER_PARTITION = 20000
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json")
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
|
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
|
@ -87,22 +93,40 @@ object SparkDownloadEBILinks {
|
||||||
log.info(s"workingPath -> $workingPath")
|
log.info(s"workingPath -> $workingPath")
|
||||||
|
|
||||||
log.info("Getting max pubmedId where the links have already requested")
|
log.info("Getting max pubmedId where the links have already requested")
|
||||||
val links: Dataset[EBILinkItem] = spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
|
val links: Dataset[EBILinkItem] =
|
||||||
|
spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
|
||||||
val lastPMIDRequested = links.map(l => l.id).select(max("value")).first.getLong(0)
|
val lastPMIDRequested = links.map(l => l.id).select(max("value")).first.getLong(0)
|
||||||
|
|
||||||
log.info("Retrieving PMID to request links")
|
log.info("Retrieving PMID to request links")
|
||||||
val pubmed = spark.read.load(s"$sourcePath/baseline_dataset").as[PMArticle]
|
val pubmed = spark.read.load(s"$sourcePath/baseline_dataset").as[PMArticle]
|
||||||
pubmed.map(p => p.getPmid.toLong).where(s"value > $lastPMIDRequested").write.mode(SaveMode.Overwrite).save(s"$workingPath/id_to_request")
|
pubmed
|
||||||
|
.map(p => p.getPmid.toLong)
|
||||||
|
.where(s"value > $lastPMIDRequested")
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingPath/id_to_request")
|
||||||
|
|
||||||
val pmidToReq: Dataset[Long] = spark.read.load(s"$workingPath/id_to_request").as[Long]
|
val pmidToReq: Dataset[Long] = spark.read.load(s"$workingPath/id_to_request").as[Long]
|
||||||
|
|
||||||
val total = pmidToReq.count()
|
val total = pmidToReq.count()
|
||||||
|
|
||||||
spark.createDataset(pmidToReq.rdd.repartition((total / MAX_ITEM_PER_PARTITION).toInt).map(pmid => createEBILinks(pmid)).filter(l => l != null)).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_update")
|
spark
|
||||||
|
.createDataset(
|
||||||
|
pmidToReq.rdd
|
||||||
|
.repartition((total / MAX_ITEM_PER_PARTITION).toInt)
|
||||||
|
.map(pmid => createEBILinks(pmid))
|
||||||
|
.filter(l => l != null)
|
||||||
|
)
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingPath/links_update")
|
||||||
|
|
||||||
val updates: Dataset[EBILinkItem] = spark.read.load(s"$workingPath/links_update").as[EBILinkItem]
|
val updates: Dataset[EBILinkItem] =
|
||||||
|
spark.read.load(s"$workingPath/links_update").as[EBILinkItem]
|
||||||
|
|
||||||
links.union(updates).groupByKey(_.id)
|
links
|
||||||
|
.union(updates)
|
||||||
|
.groupByKey(_.id)
|
||||||
.reduceGroups { (x, y) =>
|
.reduceGroups { (x, y) =>
|
||||||
if (x == null || x.links == null)
|
if (x == null || x.links == null)
|
||||||
y
|
y
|
||||||
|
@ -112,6 +136,10 @@ object SparkDownloadEBILinks {
|
||||||
x
|
x
|
||||||
else
|
else
|
||||||
y
|
y
|
||||||
}.map(_._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_final")
|
}
|
||||||
|
.map(_._2)
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingPath/links_final")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,15 +15,19 @@ object SparkEBILinksToOaf {
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json")
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
|
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
val sourcePath = parser.get("sourcePath")
|
val sourcePath = parser.get("sourcePath")
|
||||||
|
@ -32,11 +36,17 @@ object SparkEBILinksToOaf {
|
||||||
log.info(s"targetPath -> $targetPath")
|
log.info(s"targetPath -> $targetPath")
|
||||||
implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||||
|
|
||||||
val ebLinks: Dataset[EBILinkItem] = spark.read.load(sourcePath).as[EBILinkItem].filter(l => l.links != null && l.links.startsWith("{"))
|
val ebLinks: Dataset[EBILinkItem] = spark.read
|
||||||
|
.load(sourcePath)
|
||||||
|
.as[EBILinkItem]
|
||||||
|
.filter(l => l.links != null && l.links.startsWith("{"))
|
||||||
|
|
||||||
CollectionUtils.saveDataset(ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
|
CollectionUtils.saveDataset(
|
||||||
|
ebLinks
|
||||||
|
.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
|
||||||
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
|
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
|
||||||
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)),
|
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)),
|
||||||
targetPath)
|
targetPath
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,10 +3,7 @@ package eu.dnetlib.dhp.sx.bio.pubmed
|
||||||
import scala.xml.MetaData
|
import scala.xml.MetaData
|
||||||
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
|
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
|
||||||
|
|
||||||
|
/** @param xml
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @param xml
|
|
||||||
*/
|
*/
|
||||||
class PMParser(xml: XMLEventReader) extends Iterator[PMArticle] {
|
class PMParser(xml: XMLEventReader) extends Iterator[PMArticle] {
|
||||||
|
|
||||||
|
@ -29,10 +26,8 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
|
||||||
s.head.text
|
s.head.text
|
||||||
else
|
else
|
||||||
null
|
null
|
||||||
|
} else null
|
||||||
}
|
}
|
||||||
else null
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def validate_Date(year: String, month: String, day: String): String = {
|
def validate_Date(year: String, month: String, day: String): String = {
|
||||||
try {
|
try {
|
||||||
|
@ -45,7 +40,6 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
|
||||||
|
|
||||||
def generateNextArticle(): PMArticle = {
|
def generateNextArticle(): PMArticle = {
|
||||||
|
|
||||||
|
|
||||||
var currentSubject: PMSubject = null
|
var currentSubject: PMSubject = null
|
||||||
var currentAuthor: PMAuthor = null
|
var currentAuthor: PMAuthor = null
|
||||||
var currentJournal: PMJournal = null
|
var currentJournal: PMJournal = null
|
||||||
|
@ -56,11 +50,6 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
|
||||||
var currentDay = "01"
|
var currentDay = "01"
|
||||||
var currentArticleType: String = null
|
var currentArticleType: String = null
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
while (xml.hasNext) {
|
while (xml.hasNext) {
|
||||||
xml.next match {
|
xml.next match {
|
||||||
case EvElemStart(_, label, attrs, _) =>
|
case EvElemStart(_, label, attrs, _) =>
|
||||||
|
@ -83,7 +72,8 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
|
||||||
case "Author" => currentArticle.getAuthors.add(currentAuthor)
|
case "Author" => currentArticle.getAuthors.add(currentAuthor)
|
||||||
case "Journal" => currentArticle.setJournal(currentJournal)
|
case "Journal" => currentArticle.setJournal(currentJournal)
|
||||||
case "Grant" => currentArticle.getGrants.add(currentGrant)
|
case "Grant" => currentArticle.getGrants.add(currentGrant)
|
||||||
case "PubMedPubDate" => if (currentArticle.getDate== null)
|
case "PubMedPubDate" =>
|
||||||
|
if (currentArticle.getDate == null)
|
||||||
currentArticle.setDate(validate_Date(currentYear, currentMonth, currentDay))
|
currentArticle.setDate(validate_Date(currentYear, currentMonth, currentDay))
|
||||||
case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
|
case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
|
||||||
case "DescriptorName" => currentArticle.getSubjects.add(currentSubject)
|
case "DescriptorName" => currentArticle.getSubjects.add(currentSubject)
|
||||||
|
@ -106,7 +96,8 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
|
||||||
currentArticle.setDescription(currentArticle.getDescription + text.trim)
|
currentArticle.setDescription(currentArticle.getDescription + text.trim)
|
||||||
}
|
}
|
||||||
case "PMID" => currentArticle.setPmid(text.trim)
|
case "PMID" => currentArticle.setPmid(text.trim)
|
||||||
case "ArticleId" => if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim)
|
case "ArticleId" =>
|
||||||
|
if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim)
|
||||||
case "Language" => currentArticle.setLanguage(text.trim)
|
case "Language" => currentArticle.setLanguage(text.trim)
|
||||||
case "ISSN" => currentJournal.setIssn(text.trim)
|
case "ISSN" => currentJournal.setIssn(text.trim)
|
||||||
case "GrantID" => currentGrant.setGrantID(text.trim)
|
case "GrantID" => currentGrant.setGrantID(text.trim)
|
||||||
|
@ -122,7 +113,8 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
|
||||||
if (currentAuthor != null)
|
if (currentAuthor != null)
|
||||||
currentAuthor.setLastName(text.trim)
|
currentAuthor.setLastName(text.trim)
|
||||||
}
|
}
|
||||||
case "ForeName" => if (currentAuthor != null)
|
case "ForeName" =>
|
||||||
|
if (currentAuthor != null)
|
||||||
currentAuthor.setForeName(text.trim)
|
currentAuthor.setForeName(text.trim)
|
||||||
case "Title" =>
|
case "Title" =>
|
||||||
if (currentJournal.getTitle == null)
|
if (currentJournal.getTitle == null)
|
||||||
|
@ -139,8 +131,3 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
|
||||||
null
|
null
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -9,21 +9,29 @@ import collection.JavaConverters._
|
||||||
import java.util.regex.Pattern
|
import java.util.regex.Pattern
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
object PubMedToOaf {
|
object PubMedToOaf {
|
||||||
|
|
||||||
val SUBJ_CLASS = "keywords"
|
val SUBJ_CLASS = "keywords"
|
||||||
|
|
||||||
val urlMap = Map(
|
val urlMap = Map(
|
||||||
"pmid" -> "https://pubmed.ncbi.nlm.nih.gov/",
|
"pmid" -> "https://pubmed.ncbi.nlm.nih.gov/",
|
||||||
"doi" -> "https://dx.doi.org/"
|
"doi" -> "https://dx.doi.org/"
|
||||||
)
|
)
|
||||||
val dataInfo: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
|
|
||||||
val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
|
|
||||||
|
|
||||||
|
val dataInfo: DataInfo = OafMapperUtils.dataInfo(
|
||||||
|
false,
|
||||||
|
null,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
|
||||||
|
"0.9"
|
||||||
|
)
|
||||||
|
|
||||||
/**
|
val collectedFrom: KeyValue =
|
||||||
* Cleaning the DOI Applying regex in order to
|
OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
|
||||||
|
|
||||||
|
/** Cleans the DOI by applying a regex in order to
|
||||||
* remove doi starting with URL
|
* remove doi starting with URL
|
||||||
*
|
*
|
||||||
* @param doi input DOI
|
* @param doi input DOI
|
||||||
|
@ -33,7 +41,6 @@ object PubMedToOaf {
|
||||||
|
|
||||||
val regex = "^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$"
|
val regex = "^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$"
|
||||||
|
|
||||||
|
|
||||||
val pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE)
|
val pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE)
|
||||||
val matcher = pattern.matcher(doi)
|
val matcher = pattern.matcher(doi)
|
||||||
|
|
||||||
|
@ -43,9 +50,7 @@ object PubMedToOaf {
|
||||||
null
|
null
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** Creates an instance of a class that extends Result,
|
||||||
*
|
|
||||||
* Create an instance of class extends Result
|
|
||||||
* starting from OAF instanceType value
|
* starting from OAF instanceType value
|
||||||
*
|
*
|
||||||
* @param cobjQualifier OAF instance type
|
* @param cobjQualifier OAF instance type
|
||||||
|
@ -53,7 +58,11 @@ object PubMedToOaf {
|
||||||
* @return the correct instance
|
* @return the correct instance
|
||||||
*/
|
*/
|
||||||
def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = {
|
def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = {
|
||||||
val result_typologies = getVocabularyTerm(ModelConstants.DNET_RESULT_TYPOLOGIES, vocabularies, cobjQualifier.getClassid)
|
val result_typologies = getVocabularyTerm(
|
||||||
|
ModelConstants.DNET_RESULT_TYPOLOGIES,
|
||||||
|
vocabularies,
|
||||||
|
cobjQualifier.getClassid
|
||||||
|
)
|
||||||
result_typologies.getClassid match {
|
result_typologies.getClassid match {
|
||||||
case "dataset" => new Dataset
|
case "dataset" => new Dataset
|
||||||
case "publication" => new Publication
|
case "publication" => new Publication
|
||||||
|
@ -64,8 +73,7 @@ object PubMedToOaf {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** Maps the PubMed journal info into the OAF Journal
|
||||||
* Mapping the Pubmedjournal info into the OAF Journale
|
|
||||||
*
|
*
|
||||||
* @param j the pubmedJournal
|
* @param j the pubmedJournal
|
||||||
* @return the OAF Journal
|
* @return the OAF Journal
|
||||||
|
@ -83,27 +91,26 @@ object PubMedToOaf {
|
||||||
journal.setIss(j.getIssue)
|
journal.setIss(j.getIssue)
|
||||||
journal
|
journal
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** Looks up a term in the vocabulary: the synonym match is preferred, falling back to the term match
|
||||||
*
|
|
||||||
* Find vocabulary term into synonyms and term in the vocabulary
|
|
||||||
*
|
*
|
||||||
* @param vocabularyName the input vocabulary name
|
* @param vocabularyName the input vocabulary name
|
||||||
* @param vocabularies all the vocabularies
|
* @param vocabularies all the vocabularies
|
||||||
* @param term the term to find
|
* @param term the term to find
|
||||||
* @return the cleaned term value
|
* @return the cleaned term value
|
||||||
*/
|
*/
|
||||||
def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = {
|
def getVocabularyTerm(
|
||||||
|
vocabularyName: String,
|
||||||
|
vocabularies: VocabularyGroup,
|
||||||
|
term: String
|
||||||
|
): Qualifier = {
|
||||||
val a = vocabularies.getSynonymAsQualifier(vocabularyName, term)
|
val a = vocabularies.getSynonymAsQualifier(vocabularyName, term)
|
||||||
val b = vocabularies.getTermAsQualifier(vocabularyName, term)
|
val b = vocabularies.getTermAsQualifier(vocabularyName, term)
|
||||||
if (a == null) b else a
|
if (a == null) b else a
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Map the Pubmed Article into the OAF instance
|
||||||
/**
|
|
||||||
* Map the Pubmed Article into the OAF instance
|
|
||||||
*
|
*
|
||||||
* @param article the pubmed articles
|
* @param article the pubmed article
|
||||||
* @param vocabularies the vocabularies
|
* @param vocabularies the vocabularies
|
||||||
|
@ -114,9 +121,17 @@ object PubMedToOaf {
|
||||||
if (article.getPublicationTypes == null)
|
if (article.getPublicationTypes == null)
|
||||||
return null
|
return null
|
||||||
|
|
||||||
|
|
||||||
// MAP PMID into pid with classid = classname = pmid
|
// MAP PMID into pid with classid = classname = pmid
|
||||||
val pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
|
val pidList: List[StructuredProperty] = List(
|
||||||
|
OafMapperUtils.structuredProperty(
|
||||||
|
article.getPmid,
|
||||||
|
PidType.pmid.toString,
|
||||||
|
PidType.pmid.toString,
|
||||||
|
ModelConstants.DNET_PID_TYPES,
|
||||||
|
ModelConstants.DNET_PID_TYPES,
|
||||||
|
dataInfo
|
||||||
|
)
|
||||||
|
)
|
||||||
if (pidList == null)
|
if (pidList == null)
|
||||||
return null
|
return null
|
||||||
|
|
||||||
|
@ -125,7 +140,14 @@ object PubMedToOaf {
|
||||||
if (article.getDoi != null) {
|
if (article.getDoi != null) {
|
||||||
val normalizedPid = cleanDoi(article.getDoi)
|
val normalizedPid = cleanDoi(article.getDoi)
|
||||||
if (normalizedPid != null)
|
if (normalizedPid != null)
|
||||||
alternateIdentifier = OafMapperUtils.structuredProperty(normalizedPid, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)
|
alternateIdentifier = OafMapperUtils.structuredProperty(
|
||||||
|
normalizedPid,
|
||||||
|
PidType.doi.toString,
|
||||||
|
PidType.doi.toString,
|
||||||
|
ModelConstants.DNET_PID_TYPES,
|
||||||
|
ModelConstants.DNET_PID_TYPES,
|
||||||
|
dataInfo
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
// INSTANCE MAPPING
|
// INSTANCE MAPPING
|
||||||
|
@ -133,10 +155,12 @@ object PubMedToOaf {
|
||||||
|
|
||||||
// If the article contains the typology Journal Article then we apply this type
|
// If the article contains the typology Journal Article then we apply this type
|
||||||
//else We have to find a terms that match the vocabulary otherwise we discard it
|
// else we have to find a term that matches the vocabulary; otherwise we discard it
|
||||||
val ja = article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue))
|
val ja =
|
||||||
|
article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue))
|
||||||
val pubmedInstance = new Instance
|
val pubmedInstance = new Instance
|
||||||
if (ja.isDefined) {
|
if (ja.isDefined) {
|
||||||
val cojbCategory = getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue)
|
val cojbCategory =
|
||||||
|
getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue)
|
||||||
pubmedInstance.setInstancetype(cojbCategory)
|
pubmedInstance.setInstancetype(cojbCategory)
|
||||||
} else {
|
} else {
|
||||||
val i_type = article.getPublicationTypes.asScala
|
val i_type = article.getPublicationTypes.asScala
|
||||||
|
@ -155,7 +179,9 @@ object PubMedToOaf {
|
||||||
if (alternateIdentifier != null)
|
if (alternateIdentifier != null)
|
||||||
pubmedInstance.setAlternateIdentifier(List(alternateIdentifier).asJava)
|
pubmedInstance.setAlternateIdentifier(List(alternateIdentifier).asJava)
|
||||||
result.setInstance(List(pubmedInstance).asJava)
|
result.setInstance(List(pubmedInstance).asJava)
|
||||||
pubmedInstance.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection.breakOut)
|
pubmedInstance.getPid.asScala
|
||||||
|
.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid))
|
||||||
|
.map(p => p.getValue)(collection.breakOut)
|
||||||
//CREATE URL From pmid
|
//CREATE URL From pmid
|
||||||
val urlLists: List[String] = pidList
|
val urlLists: List[String] = pidList
|
||||||
.map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue))
|
.map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue))
|
||||||
|
@ -165,7 +191,9 @@ object PubMedToOaf {
|
||||||
pubmedInstance.setUrl(urlLists.asJava)
|
pubmedInstance.setUrl(urlLists.asJava)
|
||||||
|
|
||||||
//ASSIGN DateofAcceptance
|
//ASSIGN DateofAcceptance
|
||||||
pubmedInstance.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
|
pubmedInstance.setDateofacceptance(
|
||||||
|
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)
|
||||||
|
)
|
||||||
//ASSIGN COLLECTEDFROM
|
//ASSIGN COLLECTEDFROM
|
||||||
pubmedInstance.setCollectedfrom(collectedFrom)
|
pubmedInstance.setCollectedfrom(collectedFrom)
|
||||||
result.setPid(pidList.asJava)
|
result.setPid(pidList.asJava)
|
||||||
|
@ -173,7 +201,6 @@ object PubMedToOaf {
|
||||||
//END INSTANCE MAPPING
|
//END INSTANCE MAPPING
|
||||||
//--------------------------------------------------------------------------------------
|
//--------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
// JOURNAL MAPPING
|
// JOURNAL MAPPING
|
||||||
//--------------------------------------------------------------------------------------
|
//--------------------------------------------------------------------------------------
|
||||||
if (article.getJournal != null && result.isInstanceOf[Publication])
|
if (article.getJournal != null && result.isInstanceOf[Publication])
|
||||||
|
@ -182,31 +209,48 @@ object PubMedToOaf {
|
||||||
//END JOURNAL MAPPING
|
//END JOURNAL MAPPING
|
||||||
//--------------------------------------------------------------------------------------
|
//--------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
// RESULT MAPPING
|
// RESULT MAPPING
|
||||||
//--------------------------------------------------------------------------------------
|
//--------------------------------------------------------------------------------------
|
||||||
result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
|
result.setDateofacceptance(
|
||||||
|
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)
|
||||||
|
)
|
||||||
|
|
||||||
if (article.getTitle == null || article.getTitle.isEmpty)
|
if (article.getTitle == null || article.getTitle.isEmpty)
|
||||||
return null
|
return null
|
||||||
result.setTitle(List(OafMapperUtils.structuredProperty(article.getTitle, ModelConstants.MAIN_TITLE_QUALIFIER, dataInfo)).asJava)
|
result.setTitle(
|
||||||
|
List(
|
||||||
|
OafMapperUtils.structuredProperty(
|
||||||
|
article.getTitle,
|
||||||
|
ModelConstants.MAIN_TITLE_QUALIFIER,
|
||||||
|
dataInfo
|
||||||
|
)
|
||||||
|
).asJava
|
||||||
|
)
|
||||||
|
|
||||||
if (article.getDescription != null && article.getDescription.nonEmpty)
|
if (article.getDescription != null && article.getDescription.nonEmpty)
|
||||||
result.setDescription(List(OafMapperUtils.field(article.getDescription, dataInfo)).asJava)
|
result.setDescription(List(OafMapperUtils.field(article.getDescription, dataInfo)).asJava)
|
||||||
|
|
||||||
if (article.getLanguage != null) {
|
if (article.getLanguage != null) {
|
||||||
|
|
||||||
val term = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, article.getLanguage)
|
val term =
|
||||||
|
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, article.getLanguage)
|
||||||
if (term != null)
|
if (term != null)
|
||||||
result.setLanguage(term)
|
result.setLanguage(term)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s =>
|
||||||
val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s => OafMapperUtils.structuredProperty(s.getValue, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, dataInfo))(collection.breakOut)
|
OafMapperUtils.structuredProperty(
|
||||||
|
s.getValue,
|
||||||
|
SUBJ_CLASS,
|
||||||
|
SUBJ_CLASS,
|
||||||
|
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||||
|
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||||
|
dataInfo
|
||||||
|
)
|
||||||
|
)(collection.breakOut)
|
||||||
if (subjects != null)
|
if (subjects != null)
|
||||||
result.setSubject(subjects.asJava)
|
result.setSubject(subjects.asJava)
|
||||||
|
|
||||||
|
|
||||||
val authors: List[Author] = article.getAuthors.asScala.zipWithIndex.map { case (a, index) =>
|
val authors: List[Author] = article.getAuthors.asScala.zipWithIndex.map { case (a, index) =>
|
||||||
val author = new Author()
|
val author = new Author()
|
||||||
author.setName(a.getForeName)
|
author.setName(a.getForeName)
|
||||||
|
@ -216,15 +260,12 @@ object PubMedToOaf {
|
||||||
author
|
author
|
||||||
}(collection.breakOut)
|
}(collection.breakOut)
|
||||||
|
|
||||||
|
|
||||||
if (authors != null && authors.nonEmpty)
|
if (authors != null && authors.nonEmpty)
|
||||||
result.setAuthor(authors.asJava)
|
result.setAuthor(authors.asJava)
|
||||||
result.setOriginalId(pidList.map(s => s.getValue).asJava)
|
result.setOriginalId(pidList.map(s => s.getValue).asJava)
|
||||||
|
|
||||||
|
|
||||||
result.setId(article.getPmid)
|
result.setId(article.getPmid)
|
||||||
|
|
||||||
|
|
||||||
// END RESULT MAPPING
|
// END RESULT MAPPING
|
||||||
//--------------------------------------------------------------------------------------
|
//--------------------------------------------------------------------------------------
|
||||||
val id = IdentifierFactory.createIdentifier(result)
|
val id = IdentifierFactory.createIdentifier(result)
|
||||||
|
@ -234,5 +275,4 @@ object PubMedToOaf {
|
||||||
result
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,7 +17,8 @@ import org.slf4j.{Logger, LoggerFactory}
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
import java.text.SimpleDateFormat
|
import java.text.SimpleDateFormat
|
||||||
|
|
||||||
class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:Logger) extends AbstractScalaApplication(propertyPath, args, log:Logger) {
|
class SparkRetrieveDataciteDelta(propertyPath: String, args: Array[String], log: Logger)
|
||||||
|
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
|
||||||
|
|
||||||
val ISO_DATE_PATTERN = "yyyy-MM-dd'T'HH:mm:ssZ"
|
val ISO_DATE_PATTERN = "yyyy-MM-dd'T'HH:mm:ssZ"
|
||||||
val simpleFormatter = new SimpleDateFormat(ISO_DATE_PATTERN)
|
val simpleFormatter = new SimpleDateFormat(ISO_DATE_PATTERN)
|
||||||
|
@ -28,16 +29,13 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
|
||||||
val RESOLVED_REL_PATH_NAME = "resolvedRelation"
|
val RESOLVED_REL_PATH_NAME = "resolvedRelation"
|
||||||
val SCHOLIX_PATH_NAME = "scholix"
|
val SCHOLIX_PATH_NAME = "scholix"
|
||||||
|
|
||||||
|
|
||||||
def scholixResourcePath(workingPath: String) = s"$workingPath/$SCHOLIX_RESOURCE_PATH_NAME"
|
def scholixResourcePath(workingPath: String) = s"$workingPath/$SCHOLIX_RESOURCE_PATH_NAME"
|
||||||
def dataciteOAFPath(workingPath: String) = s"$workingPath/$DATACITE_OAF_PATH_NAME"
|
def dataciteOAFPath(workingPath: String) = s"$workingPath/$DATACITE_OAF_PATH_NAME"
|
||||||
def pidMapPath(workingPath: String) = s"$workingPath/$PID_MAP_PATH_NAME"
|
def pidMapPath(workingPath: String) = s"$workingPath/$PID_MAP_PATH_NAME"
|
||||||
def resolvedRelationPath(workingPath: String) = s"$workingPath/$RESOLVED_REL_PATH_NAME"
|
def resolvedRelationPath(workingPath: String) = s"$workingPath/$RESOLVED_REL_PATH_NAME"
|
||||||
def scholixPath(workingPath: String) = s"$workingPath/$SCHOLIX_PATH_NAME"
|
def scholixPath(workingPath: String) = s"$workingPath/$SCHOLIX_PATH_NAME"
|
||||||
|
|
||||||
|
/** Utility to parse Date in ISO8601 to epochMillis
|
||||||
/**
|
|
||||||
* Utility to parse Date in ISO8601 to epochMillis
|
|
||||||
* @param inputDate The String represents an input date in ISO8601
|
* @param inputDate The String represents an input date in ISO8601
|
||||||
* @return The relative epochMillis of parsed date
|
* @return The relative epochMillis of parsed date
|
||||||
*/
|
*/
|
||||||
|
@ -45,9 +43,7 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
|
||||||
simpleFormatter.parse(inputDate).getTime
|
simpleFormatter.parse(inputDate).getTime
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** This method tries to retrieve the last collection date from all datacite
|
||||||
/**
|
|
||||||
* This method tries to retrieve the last collection date from all datacite
|
|
||||||
* records in HDFS.
|
* records in HDFS.
|
||||||
* This method should be called before indexing scholexplorer to retrieve
|
* This method should be called before indexing scholexplorer to retrieve
|
||||||
* the delta of Datacite record to download, since from the generation of
|
* the delta of Datacite record to download, since from the generation of
|
||||||
|
@ -63,16 +59,23 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
|
||||||
implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
|
implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
val entitiesDS = spark.read.load(s"$entitiesPath/*").as[Oaf].filter(o =>o.isInstanceOf[Result]).map(r => r.asInstanceOf[Result])
|
val entitiesDS = spark.read
|
||||||
|
.load(s"$entitiesPath/*")
|
||||||
|
.as[Oaf]
|
||||||
|
.filter(o => o.isInstanceOf[Result])
|
||||||
|
.map(r => r.asInstanceOf[Result])
|
||||||
|
|
||||||
val date = entitiesDS.filter(r => r.getDateofcollection!= null).map(_.getDateofcollection).select(max("value")).first.getString(0)
|
val date = entitiesDS
|
||||||
|
.filter(r => r.getDateofcollection != null)
|
||||||
|
.map(_.getDateofcollection)
|
||||||
|
.select(max("value"))
|
||||||
|
.first
|
||||||
|
.getString(0)
|
||||||
|
|
||||||
ISO8601toEpochMillis(date) / 1000
|
ISO8601toEpochMillis(date) / 1000
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** The method of update Datacite relationships on Scholexplorer
|
||||||
/**
|
|
||||||
* The method of update Datacite relationships on Scholexplorer
|
|
||||||
* needs some utilities data structures
|
* needs some utilities data structures
|
||||||
* One is the scholixResource DS that stores all the nodes in the Scholix Graph
|
* One is the scholixResource DS that stores all the nodes in the Scholix Graph
|
||||||
* in format ScholixResource
|
* in format ScholixResource
|
||||||
|
@ -80,19 +83,26 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
|
||||||
* @param workingPath the working path
|
* @param workingPath the working path
|
||||||
* @param spark the spark session
|
* @param spark the spark session
|
||||||
*/
|
*/
|
||||||
def generateScholixResource(summaryPath:String, workingPath: String, spark:SparkSession) :Unit = {
|
def generateScholixResource(
|
||||||
|
summaryPath: String,
|
||||||
|
workingPath: String,
|
||||||
|
spark: SparkSession
|
||||||
|
): Unit = {
|
||||||
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
|
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
|
||||||
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
|
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
|
||||||
|
|
||||||
log.info("Convert All summary to ScholixResource")
|
log.info("Convert All summary to ScholixResource")
|
||||||
spark.read.load(summaryPath).as[ScholixSummary]
|
spark.read
|
||||||
|
.load(summaryPath)
|
||||||
|
.as[ScholixSummary]
|
||||||
.map(ScholixUtils.generateScholixResourceFromSummary)(scholixResourceEncoder)
|
.map(ScholixUtils.generateScholixResourceFromSummary)(scholixResourceEncoder)
|
||||||
.filter(r => r.getIdentifier != null && r.getIdentifier.size > 0)
|
.filter(r => r.getIdentifier != null && r.getIdentifier.size > 0)
|
||||||
.write.mode(SaveMode.Overwrite).save(s"${scholixResourcePath(workingPath)}_native")
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"${scholixResourcePath(workingPath)}_native")
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** This method convert the new Datacite Resource into Scholix Resource
|
||||||
* This method convert the new Datacite Resource into Scholix Resource
|
|
||||||
* Needed to fill the source and the type of Scholix Relationships
|
* Needed to fill the source and the type of Scholix Relationships
|
||||||
* @param workingPath the Working Path
|
* @param workingPath the Working Path
|
||||||
* @param spark The spark Session
|
* @param spark The spark Session
|
||||||
|
@ -103,25 +113,28 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
|
||||||
implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
|
implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
spark.read.load(dataciteOAFPath(workingPath)).as[Oaf]
|
spark.read
|
||||||
|
.load(dataciteOAFPath(workingPath))
|
||||||
|
.as[Oaf]
|
||||||
.filter(_.isInstanceOf[Result])
|
.filter(_.isInstanceOf[Result])
|
||||||
.map(_.asInstanceOf[Result])
|
.map(_.asInstanceOf[Result])
|
||||||
.map(ScholixUtils.generateScholixResourceFromResult)
|
.map(ScholixUtils.generateScholixResourceFromResult)
|
||||||
.filter(r => r.getIdentifier != null && r.getIdentifier.size > 0)
|
.filter(r => r.getIdentifier != null && r.getIdentifier.size > 0)
|
||||||
.write.mode(SaveMode.Overwrite).save(s"${scholixResourcePath(workingPath)}_update")
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"${scholixResourcePath(workingPath)}_update")
|
||||||
|
|
||||||
val update = spark.read.load(s"${scholixResourcePath(workingPath)}_update").as[ScholixResource]
|
val update = spark.read.load(s"${scholixResourcePath(workingPath)}_update").as[ScholixResource]
|
||||||
val native = spark.read.load(s"${scholixResourcePath(workingPath)}_native").as[ScholixResource]
|
val native = spark.read.load(s"${scholixResourcePath(workingPath)}_native").as[ScholixResource]
|
||||||
val graph = update.union(native)
|
val graph = update
|
||||||
|
.union(native)
|
||||||
.groupByKey(_.getDnetIdentifier)
|
.groupByKey(_.getDnetIdentifier)
|
||||||
.reduceGroups((a, b) => if (a != null && a.getDnetIdentifier != null) a else b)
|
.reduceGroups((a, b) => if (a != null && a.getDnetIdentifier != null) a else b)
|
||||||
.map(_._2)
|
.map(_._2)
|
||||||
graph.write.mode(SaveMode.Overwrite).save(s"${scholixResourcePath(workingPath)}_graph")
|
graph.write.mode(SaveMode.Overwrite).save(s"${scholixResourcePath(workingPath)}_graph")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** This method get and Transform only datacite records with
|
||||||
/**
|
|
||||||
* This method get and Transform only datacite records with
|
|
||||||
* timestamp greater than timestamp
|
* timestamp greater than timestamp
|
||||||
* @param datacitePath the datacite input Path
|
* @param datacitePath the datacite input Path
|
||||||
* @param timestamp the timestamp
|
* @param timestamp the timestamp
|
||||||
|
@ -130,31 +143,44 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
|
||||||
* @param vocabularies Vocabularies needed for transformation
|
* @param vocabularies Vocabularies needed for transformation
|
||||||
*/
|
*/
|
||||||
|
|
||||||
def getDataciteUpdate(datacitePath:String, timestamp:Long, workingPath:String, spark:SparkSession,vocabularies: VocabularyGroup): Long = {
|
def getDataciteUpdate(
|
||||||
|
datacitePath: String,
|
||||||
|
timestamp: Long,
|
||||||
|
workingPath: String,
|
||||||
|
spark: SparkSession,
|
||||||
|
vocabularies: VocabularyGroup
|
||||||
|
): Long = {
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
val ds = spark.read.load(datacitePath).as[DataciteType]
|
val ds = spark.read.load(datacitePath).as[DataciteType]
|
||||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||||
val total = ds.filter(_.timestamp >= timestamp).count()
|
val total = ds.filter(_.timestamp >= timestamp).count()
|
||||||
if (total > 0) {
|
if (total > 0) {
|
||||||
ds.filter(_.timestamp >= timestamp)
|
ds.filter(_.timestamp >= timestamp)
|
||||||
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks = true))
|
.flatMap(d =>
|
||||||
.flatMap(i => fixRelations(i)).filter(i => i != null)
|
DataciteToOAFTransformation
|
||||||
.write.mode(SaveMode.Overwrite).save(dataciteOAFPath(workingPath))
|
.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks = true)
|
||||||
|
)
|
||||||
|
.flatMap(i => fixRelations(i))
|
||||||
|
.filter(i => i != null)
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(dataciteOAFPath(workingPath))
|
||||||
}
|
}
|
||||||
total
|
total
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** After added the new ScholixResource, we need to update the scholix Pid Map
|
||||||
* After added the new ScholixResource, we need to update the scholix Pid Map
|
|
||||||
* to intersected with the new Datacite Relations
|
* to intersected with the new Datacite Relations
|
||||||
|
*
|
||||||
* @param workingPath The working Path starting from save the new Map
|
* @param workingPath The working Path starting from save the new Map
|
||||||
* @param spark the spark session
|
* @param spark the spark session
|
||||||
*/
|
*/
|
||||||
def generatePidMap(workingPath: String, spark: SparkSession): Unit = {
|
def generatePidMap(workingPath: String, spark: SparkSession): Unit = {
|
||||||
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
|
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
spark.read.load(s"${scholixResourcePath(workingPath)}_graph").as[ScholixResource]
|
spark.read
|
||||||
|
.load(s"${scholixResourcePath(workingPath)}_graph")
|
||||||
|
.as[ScholixResource]
|
||||||
.flatMap(r =>
|
.flatMap(r =>
|
||||||
r.getIdentifier.asScala
|
r.getIdentifier.asScala
|
||||||
.map(i => DHPUtils.generateUnresolvedIdentifier(i.getIdentifier, i.getSchema))
|
.map(i => DHPUtils.generateUnresolvedIdentifier(i.getIdentifier, i.getSchema))
|
||||||
|
@ -163,11 +189,12 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
|
||||||
.groupByKey(_._1)
|
.groupByKey(_._1)
|
||||||
.reduceGroups((a, b) => if (a != null && a._2 != null) a else b)
|
.reduceGroups((a, b) => if (a != null && a._2 != null) a else b)
|
||||||
.map(_._2)(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
.map(_._2)(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||||
.write.mode(SaveMode.Overwrite).save(pidMapPath(workingPath))
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(pidMapPath(workingPath))
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** This method resolve the datacite relation and filter the resolved
|
||||||
* This method resolve the datacite relation and filter the resolved
|
|
||||||
* relation
|
* relation
|
||||||
* @param workingPath the working path
|
* @param workingPath the working path
|
||||||
* @param spark the spark session
|
* @param spark the spark session
|
||||||
|
@ -180,7 +207,9 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
|
||||||
|
|
||||||
val pidMap = spark.read.load(pidMapPath(workingPath)).as[(String, String)]
|
val pidMap = spark.read.load(pidMapPath(workingPath)).as[(String, String)]
|
||||||
|
|
||||||
val unresolvedRelations:Dataset[(String,Relation)] = spark.read.load(dataciteOAFPath(workingPath)).as[Oaf]
|
val unresolvedRelations: Dataset[(String, Relation)] = spark.read
|
||||||
|
.load(dataciteOAFPath(workingPath))
|
||||||
|
.as[Oaf]
|
||||||
.filter(_.isInstanceOf[Relation])
|
.filter(_.isInstanceOf[Relation])
|
||||||
.map(_.asInstanceOf[Relation])
|
.map(_.asInstanceOf[Relation])
|
||||||
.map { r =>
|
.map { r =>
|
||||||
|
@ -202,15 +231,12 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
|
||||||
r
|
r
|
||||||
})(relationEncoder)
|
})(relationEncoder)
|
||||||
.filter(r => !(r.getSource.startsWith("unresolved") || r.getTarget.startsWith("unresolved")))
|
.filter(r => !(r.getSource.startsWith("unresolved") || r.getTarget.startsWith("unresolved")))
|
||||||
.write.mode(SaveMode.Overwrite)
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
.save(resolvedRelationPath(workingPath))
|
.save(resolvedRelationPath(workingPath))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** This method generate scholix starting from resolved relation
|
||||||
|
|
||||||
/**
|
|
||||||
* This method generate scholix starting from resolved relation
|
|
||||||
*
|
|
||||||
*
|
*
|
||||||
* @param workingPath
|
* @param workingPath
|
||||||
* @param spark
|
* @param spark
|
||||||
|
@ -220,35 +246,44 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
|
||||||
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
||||||
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
|
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
|
||||||
implicit val relationEncoder: Encoder[Relation] = Encoders.kryo[Relation]
|
implicit val relationEncoder: Encoder[Relation] = Encoders.kryo[Relation]
|
||||||
implicit val intermediateEncoder :Encoder[(String,Scholix)] = Encoders.tuple(Encoders.STRING, scholixEncoder)
|
implicit val intermediateEncoder: Encoder[(String, Scholix)] =
|
||||||
|
Encoders.tuple(Encoders.STRING, scholixEncoder)
|
||||||
|
|
||||||
|
val relations: Dataset[(String, Relation)] = spark.read
|
||||||
|
.load(resolvedRelationPath(workingPath))
|
||||||
|
.as[Relation]
|
||||||
|
.map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relationEncoder))
|
||||||
|
|
||||||
val relations:Dataset[(String, Relation)] = spark.read.load(resolvedRelationPath(workingPath)).as[Relation].map(r =>(r.getSource,r))(Encoders.tuple(Encoders.STRING, relationEncoder))
|
val id_summary: Dataset[(String, ScholixResource)] = spark.read
|
||||||
|
.load(s"${scholixResourcePath(workingPath)}_graph")
|
||||||
val id_summary:Dataset[(String,ScholixResource)] = spark.read.load(s"${scholixResourcePath(workingPath)}_graph").as[ScholixResource].map(r => (r.getDnetIdentifier,r))(Encoders.tuple(Encoders.STRING, scholixResourceEncoder))
|
.as[ScholixResource]
|
||||||
|
.map(r => (r.getDnetIdentifier, r))(Encoders.tuple(Encoders.STRING, scholixResourceEncoder))
|
||||||
|
|
||||||
id_summary.cache()
|
id_summary.cache()
|
||||||
|
|
||||||
relations.joinWith(id_summary, relations("_1").equalTo(id_summary("_1")),"inner")
|
relations
|
||||||
|
.joinWith(id_summary, relations("_1").equalTo(id_summary("_1")), "inner")
|
||||||
.map(t => (t._1._2.getTarget, ScholixUtils.scholixFromSource(t._1._2, t._2._2)))
|
.map(t => (t._1._2.getTarget, ScholixUtils.scholixFromSource(t._1._2, t._2._2)))
|
||||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/scholix_one_verse")
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingPath/scholix_one_verse")
|
||||||
|
|
||||||
val source_scholix:Dataset[(String, Scholix)] =spark.read.load(s"$workingPath/scholix_one_verse").as[(String,Scholix)]
|
val source_scholix: Dataset[(String, Scholix)] =
|
||||||
|
spark.read.load(s"$workingPath/scholix_one_verse").as[(String, Scholix)]
|
||||||
|
|
||||||
source_scholix.joinWith(id_summary, source_scholix("_1").equalTo(id_summary("_1")),"inner")
|
source_scholix
|
||||||
|
.joinWith(id_summary, source_scholix("_1").equalTo(id_summary("_1")), "inner")
|
||||||
.map(t => {
|
.map(t => {
|
||||||
val target: ScholixResource = t._2._2
|
val target: ScholixResource = t._2._2
|
||||||
val scholix: Scholix = t._1._2
|
val scholix: Scholix = t._1._2
|
||||||
ScholixUtils.generateCompleteScholix(scholix, target)
|
ScholixUtils.generateCompleteScholix(scholix, target)
|
||||||
})(scholixEncoder).write.mode(SaveMode.Overwrite).save(s"$workingPath/scholix")
|
})(scholixEncoder)
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingPath/scholix")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Here all the spark applications runs this method
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Here all the spark applications runs this method
|
|
||||||
* where the whole logic of the spark node is defined
|
* where the whole logic of the spark node is defined
|
||||||
*/
|
*/
|
||||||
override def run(): Unit = {
|
override def run(): Unit = {
|
||||||
|
@ -268,7 +303,6 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
|
||||||
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
|
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
|
||||||
require(vocabularies != null)
|
require(vocabularies != null)
|
||||||
|
|
||||||
|
|
||||||
val updateDS: Boolean = "true".equalsIgnoreCase(parser.get("updateDS"))
|
val updateDS: Boolean = "true".equalsIgnoreCase(parser.get("updateDS"))
|
||||||
log.info(s"updateDS is '$updateDS'")
|
log.info(s"updateDS is '$updateDS'")
|
||||||
|
|
||||||
|
@ -277,15 +311,18 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
|
||||||
generateScholixResource(s"$sourcePath/provision/summaries", workingPath, spark)
|
generateScholixResource(s"$sourcePath/provision/summaries", workingPath, spark)
|
||||||
log.info("Retrieve last entities collected From starting from scholix Graph")
|
log.info("Retrieve last entities collected From starting from scholix Graph")
|
||||||
lastCollectionDate = retrieveLastCollectedFrom(spark, s"$sourcePath/entities")
|
lastCollectionDate = retrieveLastCollectedFrom(spark, s"$sourcePath/entities")
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
|
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
|
||||||
fs.delete(new Path(s"${scholixResourcePath(workingPath)}_native"), true)
|
fs.delete(new Path(s"${scholixResourcePath(workingPath)}_native"), true)
|
||||||
fs.rename(new Path(s"${scholixResourcePath(workingPath)}_graph"), new Path(s"${scholixResourcePath(workingPath)}_native"))
|
fs.rename(
|
||||||
|
new Path(s"${scholixResourcePath(workingPath)}_graph"),
|
||||||
|
new Path(s"${scholixResourcePath(workingPath)}_native")
|
||||||
|
)
|
||||||
lastCollectionDate = retrieveLastCollectedFrom(spark, dataciteOAFPath(workingPath))
|
lastCollectionDate = retrieveLastCollectedFrom(spark, dataciteOAFPath(workingPath))
|
||||||
}
|
}
|
||||||
|
|
||||||
val numRecords = getDataciteUpdate(datacitePath, lastCollectionDate, workingPath, spark, vocabularies)
|
val numRecords =
|
||||||
|
getDataciteUpdate(datacitePath, lastCollectionDate, workingPath, spark, vocabularies)
|
||||||
if (numRecords > 0) {
|
if (numRecords > 0) {
|
||||||
addMissingScholixResource(workingPath, spark)
|
addMissingScholixResource(workingPath, spark)
|
||||||
generatePidMap(workingPath, spark)
|
generatePidMap(workingPath, spark)
|
||||||
|
@ -295,11 +332,14 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
object SparkRetrieveDataciteDelta {
|
object SparkRetrieveDataciteDelta {
|
||||||
val log: Logger = LoggerFactory.getLogger(SparkRetrieveDataciteDelta.getClass)
|
val log: Logger = LoggerFactory.getLogger(SparkRetrieveDataciteDelta.getClass)
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
new SparkRetrieveDataciteDelta("/eu/dnetlib/dhp/sx/graph/retrieve_datacite_delta_params.json", args, log).initialize().run()
|
new SparkRetrieveDataciteDelta(
|
||||||
|
"/eu/dnetlib/dhp/sx/graph/retrieve_datacite_delta_params.json",
|
||||||
|
args,
|
||||||
|
log
|
||||||
|
).initialize().run()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
package eu.dnetlib.dhp.datacite
|
package eu.dnetlib.dhp.datacite
|
||||||
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
|
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
|
||||||
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||||
|
@ -37,7 +36,6 @@ class DataciteToOAFTest extends AbstractVocabularyTest{
|
||||||
FileUtils.deleteDirectory(workingDir.toFile)
|
FileUtils.deleteDirectory(workingDir.toFile)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testDateMapping: Unit = {
|
def testDateMapping: Unit = {
|
||||||
val inputDate = "2021-07-14T11:52:54+0000"
|
val inputDate = "2021-07-14T11:52:54+0000"
|
||||||
|
@ -45,24 +43,21 @@ class DataciteToOAFTest extends AbstractVocabularyTest{
|
||||||
val dt = ISO8601FORMAT.parse(inputDate)
|
val dt = ISO8601FORMAT.parse(inputDate)
|
||||||
println(dt.getTime)
|
println(dt.getTime)
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testConvert(): Unit = {
|
def testConvert(): Unit = {
|
||||||
|
|
||||||
|
|
||||||
val path = getClass.getResource("/eu/dnetlib/dhp/actionmanager/datacite/dataset").getPath
|
val path = getClass.getResource("/eu/dnetlib/dhp/actionmanager/datacite/dataset").getPath
|
||||||
|
|
||||||
val conf = new SparkConf()
|
val conf = new SparkConf()
|
||||||
val spark:SparkSession = SparkSession.builder().config(conf)
|
val spark: SparkSession = SparkSession
|
||||||
|
.builder()
|
||||||
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master("local[*]")
|
.master("local[*]")
|
||||||
.getOrCreate()
|
.getOrCreate()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||||
val instance = new GenerateDataciteDatasetSpark(null, null, log)
|
val instance = new GenerateDataciteDatasetSpark(null, null, log)
|
||||||
val targetPath = s"$workingDir/result"
|
val targetPath = s"$workingDir/result"
|
||||||
|
@ -73,30 +68,31 @@ class DataciteToOAFTest extends AbstractVocabularyTest{
|
||||||
|
|
||||||
val nativeSize = spark.read.load(path).count()
|
val nativeSize = spark.read.load(path).count()
|
||||||
|
|
||||||
|
|
||||||
assertEquals(100, nativeSize)
|
assertEquals(100, nativeSize)
|
||||||
|
|
||||||
val result: Dataset[Oaf] = spark.read.load(targetPath).as[Oaf]
|
val result: Dataset[Oaf] = spark.read.load(targetPath).as[Oaf]
|
||||||
|
|
||||||
|
result
|
||||||
result.map(s => s.getClass.getSimpleName).groupBy(col("value").alias("class")).agg(count("value").alias("Total")).show(false)
|
.map(s => s.getClass.getSimpleName)
|
||||||
|
.groupBy(col("value").alias("class"))
|
||||||
|
.agg(count("value").alias("Total"))
|
||||||
|
.show(false)
|
||||||
|
|
||||||
val t = spark.read.load(targetPath).count()
|
val t = spark.read.load(targetPath).count()
|
||||||
|
|
||||||
assertTrue(t > 0)
|
assertTrue(t > 0)
|
||||||
|
|
||||||
|
|
||||||
spark.stop()
|
spark.stop()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testMapping(): Unit = {
|
def testMapping(): Unit = {
|
||||||
val record =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/record.json")).mkString
|
val record = Source
|
||||||
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/record.json")
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
|
|
||||||
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
|
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
|
||||||
val res: List[Oaf] = DataciteToOAFTransformation.generateOAF(record, 0L, 0L, vocabularies, true)
|
val res: List[Oaf] = DataciteToOAFTransformation.generateOAF(record, 0L, 0L, vocabularies, true)
|
||||||
|
@ -107,8 +103,6 @@ class DataciteToOAFTest extends AbstractVocabularyTest{
|
||||||
|
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
|
@ -22,7 +22,6 @@ import scala.xml.pull.XMLEventReader
|
||||||
@ExtendWith(Array(classOf[MockitoExtension]))
|
@ExtendWith(Array(classOf[MockitoExtension]))
|
||||||
class BioScholixTest extends AbstractVocabularyTest {
|
class BioScholixTest extends AbstractVocabularyTest {
|
||||||
|
|
||||||
|
|
||||||
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
|
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
|
||||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
|
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
|
||||||
|
|
||||||
|
@ -38,53 +37,55 @@ class BioScholixTest extends AbstractVocabularyTest{
|
||||||
}
|
}
|
||||||
|
|
||||||
object GzFileIterator {
|
object GzFileIterator {
|
||||||
|
|
||||||
def apply(is: InputStream, encoding: String) = {
|
def apply(is: InputStream, encoding: String) = {
|
||||||
new BufferedReaderIterator(
|
new BufferedReaderIterator(
|
||||||
new BufferedReader(
|
new BufferedReader(new InputStreamReader(new GZIPInputStream(is), encoding))
|
||||||
new InputStreamReader(
|
)
|
||||||
new GZIPInputStream(
|
|
||||||
is), encoding)))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testEBIData() = {
|
def testEBIData() = {
|
||||||
val inputXML = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")).mkString
|
val inputXML = Source
|
||||||
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
|
||||||
|
.mkString
|
||||||
val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
|
val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
|
||||||
new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
|
new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testPubmedToOaf(): Unit = {
|
def testPubmedToOaf(): Unit = {
|
||||||
assertNotNull(vocabularies)
|
assertNotNull(vocabularies)
|
||||||
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
|
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
|
||||||
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump")).mkString
|
val records: String = Source
|
||||||
val r:List[Oaf] = records.lines.toList.map(s=>mapper.readValue(s, classOf[PMArticle])).map(a => PubMedToOaf.convert(a, vocabularies))
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump"))
|
||||||
|
.mkString
|
||||||
|
val r: List[Oaf] = records.lines.toList
|
||||||
|
.map(s => mapper.readValue(s, classOf[PMArticle]))
|
||||||
|
.map(a => PubMedToOaf.convert(a, vocabularies))
|
||||||
assertEquals(10, r.size)
|
assertEquals(10, r.size)
|
||||||
assertTrue(r.map(p => p.asInstanceOf[Result]).flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid)).exists(p => "0037".equalsIgnoreCase(p)))
|
assertTrue(
|
||||||
|
r.map(p => p.asInstanceOf[Result])
|
||||||
|
.flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid))
|
||||||
|
.exists(p => "0037".equalsIgnoreCase(p))
|
||||||
|
)
|
||||||
println(mapper.writeValueAsString(r.head))
|
println(mapper.writeValueAsString(r.head))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testPDBToOAF(): Unit = {
|
def testPDBToOAF(): Unit = {
|
||||||
|
|
||||||
assertNotNull(vocabularies)
|
assertNotNull(vocabularies)
|
||||||
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
|
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
|
||||||
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pdb_dump")).mkString
|
val records: String = Source
|
||||||
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pdb_dump"))
|
||||||
|
.mkString
|
||||||
records.lines.foreach(s => assertTrue(s.nonEmpty))
|
records.lines.foreach(s => assertTrue(s.nonEmpty))
|
||||||
|
|
||||||
val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.pdbTOOaf(o))
|
val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.pdbTOOaf(o))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
assertTrue(result.nonEmpty)
|
assertTrue(result.nonEmpty)
|
||||||
result.foreach(r => assertNotNull(r))
|
result.foreach(r => assertNotNull(r))
|
||||||
|
|
||||||
|
@ -93,20 +94,19 @@ class BioScholixTest extends AbstractVocabularyTest{
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testUNIprotToOAF(): Unit = {
|
def testUNIprotToOAF(): Unit = {
|
||||||
|
|
||||||
assertNotNull(vocabularies)
|
assertNotNull(vocabularies)
|
||||||
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
|
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
|
||||||
|
|
||||||
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump")).mkString
|
val records: String = Source
|
||||||
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump"))
|
||||||
|
.mkString
|
||||||
records.lines.foreach(s => assertTrue(s.nonEmpty))
|
records.lines.foreach(s => assertTrue(s.nonEmpty))
|
||||||
|
|
||||||
val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.uniprotToOAF(o))
|
val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.uniprotToOAF(o))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
assertTrue(result.nonEmpty)
|
assertTrue(result.nonEmpty)
|
||||||
result.foreach(r => assertNotNull(r))
|
result.foreach(r => assertNotNull(r))
|
||||||
|
|
||||||
|
@ -115,7 +115,14 @@ class BioScholixTest extends AbstractVocabularyTest{
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
case class EBILinks(relType:String, date:String, title:String, pmid:String, targetPid:String, targetPidType:String) {}
|
case class EBILinks(
|
||||||
|
relType: String,
|
||||||
|
date: String,
|
||||||
|
title: String,
|
||||||
|
pmid: String,
|
||||||
|
targetPid: String,
|
||||||
|
targetPidType: String
|
||||||
|
) {}
|
||||||
|
|
||||||
def parse_ebi_links(input: String): List[EBILinks] = {
|
def parse_ebi_links(input: String): List[EBILinks] = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
@ -135,14 +142,14 @@ class BioScholixTest extends AbstractVocabularyTest{
|
||||||
} yield EBILinks(relation, publicationDate, title, pmid, id, idScheme)
|
} yield EBILinks(relation, publicationDate, title, pmid, id, idScheme)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testCrossrefLinksToOAF(): Unit = {
|
def testCrossrefLinksToOAF(): Unit = {
|
||||||
|
|
||||||
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/crossref_links")).mkString
|
val records: String = Source
|
||||||
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/crossref_links"))
|
||||||
|
.mkString
|
||||||
records.lines.foreach(s => assertTrue(s.nonEmpty))
|
records.lines.foreach(s => assertTrue(s.nonEmpty))
|
||||||
|
|
||||||
|
|
||||||
val result: List[Oaf] = records.lines.map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList
|
val result: List[Oaf] = records.lines.map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList
|
||||||
|
|
||||||
assertNotNull(result)
|
assertNotNull(result)
|
||||||
|
@ -154,24 +161,30 @@ class BioScholixTest extends AbstractVocabularyTest{
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testEBILinksToOAF(): Unit = {
|
def testEBILinksToOAF(): Unit = {
|
||||||
val iterator = GzFileIterator(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/ebi_links.gz"), "UTF-8")
|
val iterator = GzFileIterator(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/ebi_links.gz"),
|
||||||
|
"UTF-8"
|
||||||
|
)
|
||||||
val data = iterator.next()
|
val data = iterator.next()
|
||||||
|
|
||||||
val res = BioDBToOAF.parse_ebi_links(BioDBToOAF.extractEBILinksFromDump(data).links).filter(BioDBToOAF.EBITargetLinksFilter).flatMap(BioDBToOAF.convertEBILinksToOaf)
|
val res = BioDBToOAF
|
||||||
|
.parse_ebi_links(BioDBToOAF.extractEBILinksFromDump(data).links)
|
||||||
|
.filter(BioDBToOAF.EBITargetLinksFilter)
|
||||||
|
.flatMap(BioDBToOAF.convertEBILinksToOaf)
|
||||||
print(res.length)
|
print(res.length)
|
||||||
|
|
||||||
|
|
||||||
println(mapper.writeValueAsString(res.head))
|
println(mapper.writeValueAsString(res.head))
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def scholixResolvedToOAF(): Unit = {
|
def scholixResolvedToOAF(): Unit = {
|
||||||
|
|
||||||
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved")).mkString
|
val records: String = Source
|
||||||
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved")
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
records.lines.foreach(s => assertTrue(s.nonEmpty))
|
records.lines.foreach(s => assertTrue(s.nonEmpty))
|
||||||
|
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
@ -181,7 +194,6 @@ class BioScholixTest extends AbstractVocabularyTest{
|
||||||
json.extract[ScholixResolved]
|
json.extract[ScholixResolved]
|
||||||
}.toList
|
}.toList
|
||||||
|
|
||||||
|
|
||||||
val result: List[Oaf] = l.map(s => BioDBToOAF.scholixResolvedToOAF(s))
|
val result: List[Oaf] = l.map(s => BioDBToOAF.scholixResolvedToOAF(s))
|
||||||
|
|
||||||
assertTrue(result.nonEmpty)
|
assertTrue(result.nonEmpty)
|
||||||
|
|
|
@ -16,10 +16,22 @@ import java.time.LocalDate
|
||||||
import java.time.format.DateTimeFormatter
|
import java.time.format.DateTimeFormatter
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
|
|
||||||
|
case class HostedByItemType(
|
||||||
|
id: String,
|
||||||
|
officialname: String,
|
||||||
|
issn: String,
|
||||||
|
eissn: String,
|
||||||
|
lissn: String,
|
||||||
|
openAccess: Boolean
|
||||||
|
) {}
|
||||||
|
|
||||||
case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
|
case class DoiBoostAffiliation(
|
||||||
|
PaperId: Long,
|
||||||
case class DoiBoostAffiliation(PaperId:Long, AffiliationId:Long, GridId:Option[String], OfficialPage:Option[String], DisplayName:Option[String]){}
|
AffiliationId: Long,
|
||||||
|
GridId: Option[String],
|
||||||
|
OfficialPage: Option[String],
|
||||||
|
DisplayName: Option[String]
|
||||||
|
) {}
|
||||||
|
|
||||||
object DoiBoostMappingUtil {
|
object DoiBoostMappingUtil {
|
||||||
|
|
||||||
|
@ -43,7 +55,17 @@ object DoiBoostMappingUtil {
|
||||||
val DOI_PREFIX_REGEX = "(^10\\.|\\/10.)"
|
val DOI_PREFIX_REGEX = "(^10\\.|\\/10.)"
|
||||||
val DOI_PREFIX = "10."
|
val DOI_PREFIX = "10."
|
||||||
|
|
||||||
val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")
|
val invalidName = List(
|
||||||
|
",",
|
||||||
|
"none none",
|
||||||
|
"none, none",
|
||||||
|
"none &na;",
|
||||||
|
"(:null)",
|
||||||
|
"test test test",
|
||||||
|
"test test",
|
||||||
|
"test",
|
||||||
|
"&na; &na;"
|
||||||
|
)
|
||||||
|
|
||||||
def toActionSet(item: Oaf): (String, String) = {
|
def toActionSet(item: Oaf): (String, String) = {
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
|
@ -75,7 +97,6 @@ object DoiBoostMappingUtil {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def toHostedByItem(input: String): (String, HostedByItemType) = {
|
def toHostedByItem(input: String): (String, HostedByItemType) = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
|
||||||
|
@ -84,7 +105,6 @@ object DoiBoostMappingUtil {
|
||||||
(c.keys.head, c.values.head)
|
(c.keys.head, c.values.head)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def toISSNPair(publication: Publication): (String, Publication) = {
|
def toISSNPair(publication: Publication): (String, Publication) = {
|
||||||
val issn = if (publication.getJournal == null) null else publication.getJournal.getIssnPrinted
|
val issn = if (publication.getJournal == null) null else publication.getJournal.getIssnPrinted
|
||||||
val eissn = if (publication.getJournal == null) null else publication.getJournal.getIssnOnline
|
val eissn = if (publication.getJournal == null) null else publication.getJournal.getIssnOnline
|
||||||
|
@ -100,26 +120,24 @@ object DoiBoostMappingUtil {
|
||||||
(publication.getId, publication)
|
(publication.getId, publication)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def generateGridAffiliationId(gridId: String): String = {
|
def generateGridAffiliationId(gridId: String): String = {
|
||||||
s"20|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}"
|
s"20|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def fixResult(result: Dataset): Dataset = {
|
def fixResult(result: Dataset): Dataset = {
|
||||||
val instanceType = extractInstance(result)
|
val instanceType = extractInstance(result)
|
||||||
if (instanceType.isDefined) {
|
if (instanceType.isDefined) {
|
||||||
result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
|
result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
|
||||||
}
|
}
|
||||||
result.getInstance().asScala.foreach(i => {
|
result
|
||||||
|
.getInstance()
|
||||||
|
.asScala
|
||||||
|
.foreach(i => {
|
||||||
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
|
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
|
||||||
})
|
})
|
||||||
result
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def decideAccessRight(lic: Field[String], date: String): AccessRight = {
|
def decideAccessRight(lic: Field[String], date: String): AccessRight = {
|
||||||
if (lic == null) {
|
if (lic == null) {
|
||||||
//Default value Unknown
|
//Default value Unknown
|
||||||
|
@ -127,7 +145,8 @@ object DoiBoostMappingUtil {
|
||||||
}
|
}
|
||||||
val license: String = lic.getValue
|
val license: String = lic.getValue
|
||||||
//CC licenses
|
//CC licenses
|
||||||
if(license.startsWith("cc") ||
|
if (
|
||||||
|
license.startsWith("cc") ||
|
||||||
license.startsWith("http://creativecommons.org/licenses") ||
|
license.startsWith("http://creativecommons.org/licenses") ||
|
||||||
license.startsWith("https://creativecommons.org/licenses") ||
|
license.startsWith("https://creativecommons.org/licenses") ||
|
||||||
|
|
||||||
|
@ -137,7 +156,8 @@ object DoiBoostMappingUtil {
|
||||||
license.equals("http://pubs.acs.org/page/policy/authorchoice_ccbyncnd_termsofuse.html") ||
|
license.equals("http://pubs.acs.org/page/policy/authorchoice_ccbyncnd_termsofuse.html") ||
|
||||||
|
|
||||||
//APA (considered OPEN also by Unpaywall)
|
//APA (considered OPEN also by Unpaywall)
|
||||||
license.equals("http://www.apa.org/pubs/journals/resources/open-access.aspx")){
|
license.equals("http://www.apa.org/pubs/journals/resources/open-access.aspx")
|
||||||
|
) {
|
||||||
|
|
||||||
val oaq: AccessRight = getOpenAccessQualifier()
|
val oaq: AccessRight = getOpenAccessQualifier()
|
||||||
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
|
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
|
||||||
|
@ -145,7 +165,11 @@ object DoiBoostMappingUtil {
|
||||||
}
|
}
|
||||||
|
|
||||||
//OUP (BUT ONLY AFTER 12 MONTHS FROM THE PUBLICATION DATE, OTHERWISE THEY ARE EMBARGOED)
|
//OUP (BUT ONLY AFTER 12 MONTHS FROM THE PUBLICATION DATE, OTHERWISE THEY ARE EMBARGOED)
|
||||||
if(license.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")){
|
if (
|
||||||
|
license.equals(
|
||||||
|
"https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"
|
||||||
|
)
|
||||||
|
) {
|
||||||
val now = java.time.LocalDate.now
|
val now = java.time.LocalDate.now
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
@ -154,20 +178,19 @@ object DoiBoostMappingUtil {
|
||||||
val oaq: AccessRight = getOpenAccessQualifier()
|
val oaq: AccessRight = getOpenAccessQualifier()
|
||||||
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
|
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
|
||||||
return oaq
|
return oaq
|
||||||
}
|
} else {
|
||||||
else{
|
|
||||||
return getEmbargoedAccessQualifier()
|
return getEmbargoedAccessQualifier()
|
||||||
}
|
}
|
||||||
} catch {
|
} catch {
|
||||||
case e: Exception => {
|
case e: Exception => {
|
||||||
try {
|
try {
|
||||||
val pub_date = LocalDate.parse(date, DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'"))
|
val pub_date =
|
||||||
|
LocalDate.parse(date, DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'"))
|
||||||
if (((now.toEpochDay - pub_date.toEpochDay) / 365.0) > 1) {
|
if (((now.toEpochDay - pub_date.toEpochDay) / 365.0) > 1) {
|
||||||
val oaq: AccessRight = getOpenAccessQualifier()
|
val oaq: AccessRight = getOpenAccessQualifier()
|
||||||
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
|
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
|
||||||
return oaq
|
return oaq
|
||||||
}
|
} else {
|
||||||
else{
|
|
||||||
return getEmbargoedAccessQualifier()
|
return getEmbargoedAccessQualifier()
|
||||||
}
|
}
|
||||||
} catch {
|
} catch {
|
||||||
|
@ -183,34 +206,56 @@ object DoiBoostMappingUtil {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def getOpenAccessQualifier(): AccessRight = {
|
def getOpenAccessQualifier(): AccessRight = {
|
||||||
|
|
||||||
OafMapperUtils.accessRight(ModelConstants.ACCESS_RIGHT_OPEN,"Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
|
OafMapperUtils.accessRight(
|
||||||
|
ModelConstants.ACCESS_RIGHT_OPEN,
|
||||||
|
"Open Access",
|
||||||
|
ModelConstants.DNET_ACCESS_MODES,
|
||||||
|
ModelConstants.DNET_ACCESS_MODES
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
def getRestrictedQualifier(): AccessRight = {
|
def getRestrictedQualifier(): AccessRight = {
|
||||||
OafMapperUtils.accessRight( "RESTRICTED","Restricted",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
|
OafMapperUtils.accessRight(
|
||||||
|
"RESTRICTED",
|
||||||
|
"Restricted",
|
||||||
|
ModelConstants.DNET_ACCESS_MODES,
|
||||||
|
ModelConstants.DNET_ACCESS_MODES
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def getUnknownQualifier(): AccessRight = {
|
def getUnknownQualifier(): AccessRight = {
|
||||||
OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
|
OafMapperUtils.accessRight(
|
||||||
|
ModelConstants.UNKNOWN,
|
||||||
|
ModelConstants.NOT_AVAILABLE,
|
||||||
|
ModelConstants.DNET_ACCESS_MODES,
|
||||||
|
ModelConstants.DNET_ACCESS_MODES
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def getEmbargoedAccessQualifier(): AccessRight = {
|
def getEmbargoedAccessQualifier(): AccessRight = {
|
||||||
OafMapperUtils.accessRight("EMBARGO","Embargo",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
|
OafMapperUtils.accessRight(
|
||||||
|
"EMBARGO",
|
||||||
|
"Embargo",
|
||||||
|
ModelConstants.DNET_ACCESS_MODES,
|
||||||
|
ModelConstants.DNET_ACCESS_MODES
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
def getClosedAccessQualifier(): AccessRight = {
|
def getClosedAccessQualifier(): AccessRight = {
|
||||||
OafMapperUtils.accessRight("CLOSED","Closed Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
|
OafMapperUtils.accessRight(
|
||||||
|
"CLOSED",
|
||||||
|
"Closed Access",
|
||||||
|
ModelConstants.DNET_ACCESS_MODES,
|
||||||
|
ModelConstants.DNET_ACCESS_MODES
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def extractInstance(r: Result): Option[Instance] = {
|
def extractInstance(r: Result): Option[Instance] = {
|
||||||
r.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
|
r.getInstance()
|
||||||
|
.asScala
|
||||||
|
.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
|
||||||
}
|
}
|
||||||
|
|
||||||
def fixPublication(input: ((String, Publication), (String, HostedByItemType))): Publication = {
|
def fixPublication(input: ((String, Publication), (String, HostedByItemType))): Publication = {
|
||||||
|
@ -222,10 +267,16 @@ object DoiBoostMappingUtil {
|
||||||
val instanceType: Option[Instance] = extractInstance(publication)
|
val instanceType: Option[Instance] = extractInstance(publication)
|
||||||
|
|
||||||
if (instanceType.isDefined) {
|
if (instanceType.isDefined) {
|
||||||
publication.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
|
publication
|
||||||
|
.getInstance()
|
||||||
|
.asScala
|
||||||
|
.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
|
||||||
}
|
}
|
||||||
|
|
||||||
publication.getInstance().asScala.foreach(i => {
|
publication
|
||||||
|
.getInstance()
|
||||||
|
.asScala
|
||||||
|
.foreach(i => {
|
||||||
var hb = new KeyValue
|
var hb = new KeyValue
|
||||||
if (item != null) {
|
if (item != null) {
|
||||||
hb.setValue(item.officialname)
|
hb.setValue(item.officialname)
|
||||||
|
@ -235,8 +286,7 @@ object DoiBoostMappingUtil {
|
||||||
i.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
|
i.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
hb = ModelConstants.UNKNOWN_REPOSITORY
|
hb = ModelConstants.UNKNOWN_REPOSITORY
|
||||||
}
|
}
|
||||||
i.setHostedby(hb)
|
i.setHostedby(hb)
|
||||||
|
@ -270,17 +320,22 @@ object DoiBoostMappingUtil {
|
||||||
if (publication.getTitle == null || publication.getTitle.size == 0)
|
if (publication.getTitle == null || publication.getTitle.size == 0)
|
||||||
return false
|
return false
|
||||||
|
|
||||||
|
val s = publication.getTitle.asScala.count(p =>
|
||||||
val s = publication.getTitle.asScala.count(p => p.getValue != null
|
p.getValue != null
|
||||||
&& p.getValue.nonEmpty && !p.getValue.equalsIgnoreCase("[NO TITLE AVAILABLE]"))
|
&& p.getValue.nonEmpty && !p.getValue.equalsIgnoreCase("[NO TITLE AVAILABLE]")
|
||||||
|
)
|
||||||
|
|
||||||
if (s == 0)
|
if (s == 0)
|
||||||
return false
|
return false
|
||||||
|
|
||||||
// fixes #4360 (test publisher)
|
// fixes #4360 (test publisher)
|
||||||
val publisher = if (publication.getPublisher != null) publication.getPublisher.getValue else null
|
val publisher =
|
||||||
|
if (publication.getPublisher != null) publication.getPublisher.getValue else null
|
||||||
|
|
||||||
if (publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher.equalsIgnoreCase("CrossRef Test Account"))) {
|
if (
|
||||||
|
publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher
|
||||||
|
.equalsIgnoreCase("CrossRef Test Account"))
|
||||||
|
) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -288,18 +343,12 @@ object DoiBoostMappingUtil {
|
||||||
if (publication.getAuthor == null || publication.getAuthor.size() == 0)
|
if (publication.getAuthor == null || publication.getAuthor.size() == 0)
|
||||||
return false
|
return false
|
||||||
|
|
||||||
|
|
||||||
//filter invalid author
|
//filter invalid author
|
||||||
val authors = publication.getAuthor.asScala.map(s => {
|
val authors = publication.getAuthor.asScala.map(s => {
|
||||||
if (s.getFullname.nonEmpty) {
|
if (s.getFullname.nonEmpty) {
|
||||||
s.getFullname
|
s.getFullname
|
||||||
}
|
} else
|
||||||
else
|
s"${s.getName} ${s.getSurname}"
|
||||||
s"${
|
|
||||||
s.getName
|
|
||||||
} ${
|
|
||||||
s.getSurname
|
|
||||||
}"
|
|
||||||
})
|
})
|
||||||
|
|
||||||
val c = authors.count(isValidAuthorName)
|
val c = authors.count(isValidAuthorName)
|
||||||
|
@ -307,13 +356,16 @@ object DoiBoostMappingUtil {
|
||||||
return false
|
return false
|
||||||
|
|
||||||
// fixes #4368
|
// fixes #4368
|
||||||
if (authors.count(s => s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase(publication.getPublisher.getValue))
|
if (
|
||||||
|
authors.count(s => s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase(
|
||||||
|
publication.getPublisher.getValue
|
||||||
|
)
|
||||||
|
)
|
||||||
return false
|
return false
|
||||||
|
|
||||||
true
|
true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def isValidAuthorName(fullName: String): Boolean = {
|
def isValidAuthorName(fullName: String): Boolean = {
|
||||||
if (fullName == null || fullName.isEmpty)
|
if (fullName == null || fullName.isEmpty)
|
||||||
return false
|
return false
|
||||||
|
@ -322,20 +374,30 @@ object DoiBoostMappingUtil {
|
||||||
true
|
true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def generateDataInfo(trust: String): DataInfo = {
|
def generateDataInfo(trust: String): DataInfo = {
|
||||||
val di = new DataInfo
|
val di = new DataInfo
|
||||||
di.setDeletedbyinference(false)
|
di.setDeletedbyinference(false)
|
||||||
di.setInferred(false)
|
di.setInferred(false)
|
||||||
di.setInvisible(false)
|
di.setInvisible(false)
|
||||||
di.setTrust(trust)
|
di.setTrust(trust)
|
||||||
di.setProvenanceaction(OafMapperUtils.qualifier(ModelConstants.SYSIMPORT_ACTIONSET,ModelConstants.SYSIMPORT_ACTIONSET, ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS))
|
di.setProvenanceaction(
|
||||||
|
OafMapperUtils.qualifier(
|
||||||
|
ModelConstants.SYSIMPORT_ACTIONSET,
|
||||||
|
ModelConstants.SYSIMPORT_ACTIONSET,
|
||||||
|
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||||
|
ModelConstants.DNET_PROVENANCE_ACTIONS
|
||||||
|
)
|
||||||
|
)
|
||||||
di
|
di
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def createSP(
|
||||||
|
value: String,
|
||||||
def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String): StructuredProperty = {
|
classId: String,
|
||||||
|
className: String,
|
||||||
|
schemeId: String,
|
||||||
|
schemeName: String
|
||||||
|
): StructuredProperty = {
|
||||||
val sp = new StructuredProperty
|
val sp = new StructuredProperty
|
||||||
sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName))
|
sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName))
|
||||||
sp.setValue(value)
|
sp.setValue(value)
|
||||||
|
@ -343,9 +405,14 @@ object DoiBoostMappingUtil {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def createSP(
|
||||||
|
value: String,
|
||||||
def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String, dataInfo: DataInfo): StructuredProperty = {
|
classId: String,
|
||||||
|
className: String,
|
||||||
|
schemeId: String,
|
||||||
|
schemeName: String,
|
||||||
|
dataInfo: DataInfo
|
||||||
|
): StructuredProperty = {
|
||||||
val sp = new StructuredProperty
|
val sp = new StructuredProperty
|
||||||
sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName))
|
sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName))
|
||||||
sp.setValue(value)
|
sp.setValue(value)
|
||||||
|
@ -362,9 +429,12 @@ object DoiBoostMappingUtil {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def createSP(
|
||||||
|
value: String,
|
||||||
def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
|
classId: String,
|
||||||
|
schemeId: String,
|
||||||
|
dataInfo: DataInfo
|
||||||
|
): StructuredProperty = {
|
||||||
val sp = new StructuredProperty
|
val sp = new StructuredProperty
|
||||||
sp.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId))
|
sp.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId))
|
||||||
sp.setValue(value)
|
sp.setValue(value)
|
||||||
|
@ -382,7 +452,6 @@ object DoiBoostMappingUtil {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def createUnpayWallCollectedFrom(): KeyValue = {
|
def createUnpayWallCollectedFrom(): KeyValue = {
|
||||||
|
|
||||||
val cf = new KeyValue
|
val cf = new KeyValue
|
||||||
|
@ -401,15 +470,11 @@ object DoiBoostMappingUtil {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def generateIdentifier(oaf: Result, doi: String): String = {
|
def generateIdentifier(oaf: Result, doi: String): String = {
|
||||||
val id = DHPUtils.md5(doi.toLowerCase)
|
val id = DHPUtils.md5(doi.toLowerCase)
|
||||||
s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}"
|
s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def createMAGCollectedFrom(): KeyValue = {
|
def createMAGCollectedFrom(): KeyValue = {
|
||||||
|
|
||||||
val cf = new KeyValue
|
val cf = new KeyValue
|
||||||
|
@ -424,7 +489,6 @@ object DoiBoostMappingUtil {
|
||||||
tmp.setValue(value)
|
tmp.setValue(value)
|
||||||
tmp
|
tmp
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def isEmpty(x: String) = x == null || x.trim.isEmpty
|
def isEmpty(x: String) = x == null || x.trim.isEmpty
|
||||||
|
@ -432,7 +496,10 @@ object DoiBoostMappingUtil {
|
||||||
def normalizeDoi(input: String): String = {
|
def normalizeDoi(input: String): String = {
|
||||||
if (input == null)
|
if (input == null)
|
||||||
return null
|
return null
|
||||||
val replaced = input.replaceAll("(?:\\n|\\r|\\t|\\s)", "").toLowerCase.replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX)
|
val replaced = input
|
||||||
|
.replaceAll("(?:\\n|\\r|\\t|\\s)", "")
|
||||||
|
.toLowerCase
|
||||||
|
.replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX)
|
||||||
if (isEmpty(replaced))
|
if (isEmpty(replaced))
|
||||||
return null
|
return null
|
||||||
|
|
||||||
|
@ -446,9 +513,6 @@ object DoiBoostMappingUtil {
|
||||||
|
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,22 +17,29 @@ object SparkGenerateDOIBoostActionSet {
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json")
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
||||||
implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
|
implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
|
||||||
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
|
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
|
||||||
implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
|
implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
|
||||||
implicit val mapEncoderAS: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)
|
implicit val mapEncoderAS: Encoder[(String, String)] =
|
||||||
|
Encoders.tuple(Encoders.STRING, Encoders.STRING)
|
||||||
|
|
||||||
implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] = Encoders.kryo[AtomicAction[OafDataset]]
|
implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] =
|
||||||
|
Encoders.kryo[AtomicAction[OafDataset]]
|
||||||
|
|
||||||
val dbPublicationPath = parser.get("dbPublicationPath")
|
val dbPublicationPath = parser.get("dbPublicationPath")
|
||||||
val dbDatasetPath = parser.get("dbDatasetPath")
|
val dbDatasetPath = parser.get("dbDatasetPath")
|
||||||
|
@ -41,35 +48,61 @@ object SparkGenerateDOIBoostActionSet {
|
||||||
val dbOrganizationPath = parser.get("dbOrganizationPath")
|
val dbOrganizationPath = parser.get("dbOrganizationPath")
|
||||||
val sequenceFilePath = parser.get("sFilePath")
|
val sequenceFilePath = parser.get("sFilePath")
|
||||||
|
|
||||||
val asDataset = spark.read.load(dbDatasetPath).as[OafDataset]
|
val asDataset = spark.read
|
||||||
|
.load(dbDatasetPath)
|
||||||
|
.as[OafDataset]
|
||||||
.filter(p => p != null || p.getId != null)
|
.filter(p => p != null || p.getId != null)
|
||||||
.map(d => DoiBoostMappingUtil.fixResult(d))
|
.map(d => DoiBoostMappingUtil.fixResult(d))
|
||||||
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
.map(d => DoiBoostMappingUtil.toActionSet(d))(
|
||||||
|
Encoders.tuple(Encoders.STRING, Encoders.STRING)
|
||||||
|
)
|
||||||
|
|
||||||
|
val asPublication = spark.read
|
||||||
val asPublication = spark.read.load(dbPublicationPath).as[Publication]
|
.load(dbPublicationPath)
|
||||||
|
.as[Publication]
|
||||||
.filter(p => p != null || p.getId != null)
|
.filter(p => p != null || p.getId != null)
|
||||||
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
.map(d => DoiBoostMappingUtil.toActionSet(d))(
|
||||||
|
Encoders.tuple(Encoders.STRING, Encoders.STRING)
|
||||||
|
)
|
||||||
|
|
||||||
|
val asOrganization = spark.read
|
||||||
|
.load(dbOrganizationPath)
|
||||||
|
.as[Organization]
|
||||||
|
.map(d => DoiBoostMappingUtil.toActionSet(d))(
|
||||||
|
Encoders.tuple(Encoders.STRING, Encoders.STRING)
|
||||||
|
)
|
||||||
|
|
||||||
val asOrganization = spark.read.load(dbOrganizationPath).as[Organization]
|
val asCRelation = spark.read
|
||||||
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
.load(crossRefRelation)
|
||||||
|
.as[Relation]
|
||||||
|
|
||||||
val asCRelation = spark.read.load(crossRefRelation).as[Relation]
|
|
||||||
.filter(r => r != null && r.getSource != null && r.getTarget != null)
|
.filter(r => r != null && r.getSource != null && r.getTarget != null)
|
||||||
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
.map(d => DoiBoostMappingUtil.toActionSet(d))(
|
||||||
|
Encoders.tuple(Encoders.STRING, Encoders.STRING)
|
||||||
|
)
|
||||||
|
|
||||||
|
val asRelAffiliation = spark.read
|
||||||
|
.load(dbaffiliationRelationPath)
|
||||||
|
.as[Relation]
|
||||||
|
.map(d => DoiBoostMappingUtil.toActionSet(d))(
|
||||||
|
Encoders.tuple(Encoders.STRING, Encoders.STRING)
|
||||||
|
)
|
||||||
|
|
||||||
val asRelAffiliation = spark.read.load(dbaffiliationRelationPath).as[Relation]
|
val d: Dataset[(String, String)] = asDataset
|
||||||
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
.union(asPublication)
|
||||||
|
.union(asOrganization)
|
||||||
|
.union(asCRelation)
|
||||||
val d: Dataset[(String, String)] = asDataset.union(asPublication).union(asOrganization).union(asCRelation).union(asRelAffiliation)
|
.union(asRelAffiliation)
|
||||||
|
|
||||||
|
|
||||||
d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])
|
|
||||||
|
|
||||||
|
d.rdd
|
||||||
|
.repartition(6000)
|
||||||
|
.map(s => (new Text(s._1), new Text(s._2)))
|
||||||
|
.saveAsHadoopFile(
|
||||||
|
s"$sequenceFilePath",
|
||||||
|
classOf[Text],
|
||||||
|
classOf[Text],
|
||||||
|
classOf[SequenceFileOutputFormat[Text, Text]],
|
||||||
|
classOf[GzipCodec]
|
||||||
|
)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -15,8 +15,8 @@ import org.json4s.JsonAST.{JField, JObject, JString}
|
||||||
import org.json4s.jackson.JsonMethods.parse
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
object SparkGenerateDoiBoost {
|
|
||||||
|
|
||||||
|
object SparkGenerateDoiBoost {
|
||||||
|
|
||||||
def extractIdGRID(input: String): List[(String, String)] = {
|
def extractIdGRID(input: String): List[(String, String)] = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
@ -35,19 +35,23 @@ object SparkGenerateDoiBoost {
|
||||||
grids.map(g => (id, s"unresolved::grid::${g.toLowerCase}"))(collection.breakOut)
|
grids.map(g => (id, s"unresolved::grid::${g.toLowerCase}"))(collection.breakOut)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
|
||||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json")
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
|
@ -65,8 +69,7 @@ object SparkGenerateDoiBoost {
|
||||||
a._2.setId(a._1)
|
a._2.setId(a._1)
|
||||||
return a._2
|
return a._2
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
if (a != null && a._2 != null) {
|
if (a != null && a._2 != null) {
|
||||||
b.mergeFrom(a._2)
|
b.mergeFrom(a._2)
|
||||||
b.setId(a._1)
|
b.setId(a._1)
|
||||||
|
@ -82,8 +85,7 @@ object SparkGenerateDoiBoost {
|
||||||
if (b1 == null) {
|
if (b1 == null) {
|
||||||
if (b2 != null)
|
if (b2 != null)
|
||||||
return b2
|
return b2
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
if (b2 != null) {
|
if (b2 != null) {
|
||||||
b1.mergeFrom(b2)
|
b1.mergeFrom(b2)
|
||||||
val authors = AuthorMerger.mergeAuthor(b1.getAuthor, b2.getAuthor)
|
val authors = AuthorMerger.mergeAuthor(b1.getAuthor, b2.getAuthor)
|
||||||
|
@ -103,17 +105,19 @@ object SparkGenerateDoiBoost {
|
||||||
override def outputEncoder: Encoder[Publication] = Encoders.kryo[Publication]
|
override def outputEncoder: Encoder[Publication] = Encoders.kryo[Publication]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
||||||
implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
|
implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
|
||||||
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
|
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
|
||||||
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
|
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] =
|
||||||
|
Encoders.tuple(Encoders.STRING, mapEncoderPub)
|
||||||
implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
|
implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
|
||||||
|
|
||||||
logger.info("Phase 2) Join Crossref with UnpayWall")
|
logger.info("Phase 2) Join Crossref with UnpayWall")
|
||||||
|
|
||||||
val crossrefPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p => (p.getId, p))
|
val crossrefPublication: Dataset[(String, Publication)] =
|
||||||
val uwPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/uwPublication").as[Publication].map(p => (p.getId, p))
|
spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p => (p.getId, p))
|
||||||
|
val uwPublication: Dataset[(String, Publication)] =
|
||||||
|
spark.read.load(s"$workingDirPath/uwPublication").as[Publication].map(p => (p.getId, p))
|
||||||
|
|
||||||
def applyMerge(item: ((String, Publication), (String, Publication))): Publication = {
|
def applyMerge(item: ((String, Publication), (String, Publication))): Publication = {
|
||||||
val crossrefPub = item._1._2
|
val crossrefPub = item._1._2
|
||||||
|
@ -127,53 +131,95 @@ object SparkGenerateDoiBoost {
|
||||||
crossrefPub
|
crossrefPub
|
||||||
}
|
}
|
||||||
|
|
||||||
crossrefPublication.joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/firstJoin")
|
crossrefPublication
|
||||||
|
.joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")), "left")
|
||||||
|
.map(applyMerge)
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingDirPath/firstJoin")
|
||||||
logger.info("Phase 3) Join Result with ORCID")
|
logger.info("Phase 3) Join Result with ORCID")
|
||||||
val fj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p))
|
val fj: Dataset[(String, Publication)] =
|
||||||
val orcidPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p))
|
spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p))
|
||||||
fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/secondJoin")
|
val orcidPublication: Dataset[(String, Publication)] =
|
||||||
|
spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p))
|
||||||
|
fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left")
|
||||||
|
.map(applyMerge)
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingDirPath/secondJoin")
|
||||||
|
|
||||||
logger.info("Phase 4) Join Result with MAG")
|
logger.info("Phase 4) Join Result with MAG")
|
||||||
val sj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p))
|
val sj: Dataset[(String, Publication)] =
|
||||||
|
spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p))
|
||||||
|
|
||||||
val magPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p))
|
val magPublication: Dataset[(String, Publication)] =
|
||||||
sj.joinWith(magPublication, sj("_1").equalTo(magPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublication")
|
spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p))
|
||||||
|
sj.joinWith(magPublication, sj("_1").equalTo(magPublication("_1")), "left")
|
||||||
|
.map(applyMerge)
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingDirPath/doiBoostPublication")
|
||||||
|
|
||||||
|
val doiBoostPublication: Dataset[(String, Publication)] = spark.read
|
||||||
|
.load(s"$workingDirPath/doiBoostPublication")
|
||||||
|
.as[Publication]
|
||||||
|
.filter(p => DoiBoostMappingUtil.filterPublication(p))
|
||||||
|
.map(DoiBoostMappingUtil.toISSNPair)(tupleForJoinEncoder)
|
||||||
|
|
||||||
val doiBoostPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublication").as[Publication].filter(p => DoiBoostMappingUtil.filterPublication(p)).map(DoiBoostMappingUtil.toISSNPair)(tupleForJoinEncoder)
|
val hostedByDataset: Dataset[(String, HostedByItemType)] = spark.createDataset(
|
||||||
|
spark.sparkContext.textFile(hostedByMapPath).map(DoiBoostMappingUtil.toHostedByItem)
|
||||||
|
)
|
||||||
|
|
||||||
val hostedByDataset: Dataset[(String, HostedByItemType)] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath).map(DoiBoostMappingUtil.toHostedByItem))
|
doiBoostPublication
|
||||||
|
.joinWith(hostedByDataset, doiBoostPublication("_1").equalTo(hostedByDataset("_1")), "left")
|
||||||
|
|
||||||
doiBoostPublication.joinWith(hostedByDataset, doiBoostPublication("_1").equalTo(hostedByDataset("_1")), "left")
|
|
||||||
.map(DoiBoostMappingUtil.fixPublication)
|
.map(DoiBoostMappingUtil.fixPublication)
|
||||||
.map(p => (p.getId, p))
|
.map(p => (p.getId, p))
|
||||||
.groupByKey(_._1)
|
.groupByKey(_._1)
|
||||||
.agg(crossrefAggregator.toColumn)
|
.agg(crossrefAggregator.toColumn)
|
||||||
.map(p => p._2)
|
.map(p => p._2)
|
||||||
.write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationFiltered")
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingDirPath/doiBoostPublicationFiltered")
|
||||||
|
|
||||||
val affiliationPath = parser.get("affiliationPath")
|
val affiliationPath = parser.get("affiliationPath")
|
||||||
val paperAffiliationPath = parser.get("paperAffiliationPath")
|
val paperAffiliationPath = parser.get("paperAffiliationPath")
|
||||||
|
|
||||||
val affiliation = spark.read.load(affiliationPath).select(col("AffiliationId"), col("GridId"), col("OfficialPage"), col("DisplayName"))
|
val affiliation = spark.read
|
||||||
|
.load(affiliationPath)
|
||||||
val paperAffiliation = spark.read.load(paperAffiliationPath).select(col("AffiliationId").alias("affId"), col("PaperId"))
|
.select(col("AffiliationId"), col("GridId"), col("OfficialPage"), col("DisplayName"))
|
||||||
|
|
||||||
|
val paperAffiliation = spark.read
|
||||||
|
.load(paperAffiliationPath)
|
||||||
|
.select(col("AffiliationId").alias("affId"), col("PaperId"))
|
||||||
|
|
||||||
val a: Dataset[DoiBoostAffiliation] = paperAffiliation
|
val a: Dataset[DoiBoostAffiliation] = paperAffiliation
|
||||||
.joinWith(affiliation, paperAffiliation("affId").equalTo(affiliation("AffiliationId")))
|
.joinWith(affiliation, paperAffiliation("affId").equalTo(affiliation("AffiliationId")))
|
||||||
.select(col("_1.PaperId"), col("_2.AffiliationId"), col("_2.GridId"), col("_2.OfficialPage"), col("_2.DisplayName")).as[DoiBoostAffiliation]
|
.select(
|
||||||
|
col("_1.PaperId"),
|
||||||
|
col("_2.AffiliationId"),
|
||||||
|
col("_2.GridId"),
|
||||||
|
col("_2.OfficialPage"),
|
||||||
|
col("_2.DisplayName")
|
||||||
|
)
|
||||||
|
.as[DoiBoostAffiliation]
|
||||||
|
|
||||||
|
val magPubs: Dataset[(String, Publication)] = spark.read
|
||||||
|
.load(s"$workingDirPath/doiBoostPublicationFiltered")
|
||||||
|
.as[Publication]
|
||||||
|
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(
|
||||||
|
tupleForJoinEncoder
|
||||||
|
)
|
||||||
|
.filter(s => s._1 != null)
|
||||||
|
|
||||||
val magPubs: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublicationFiltered").as[Publication]
|
magPubs
|
||||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(tupleForJoinEncoder).filter(s => s._1 != null)
|
.joinWith(a, magPubs("_1").equalTo(a("PaperId")))
|
||||||
|
.flatMap(item => {
|
||||||
|
|
||||||
magPubs.joinWith(a, magPubs("_1").equalTo(a("PaperId"))).flatMap(item => {
|
|
||||||
val pub: Publication = item._1._2
|
val pub: Publication = item._1._2
|
||||||
val affiliation = item._2
|
val affiliation = item._2
|
||||||
val affId: String = if (affiliation.GridId.isDefined) s"unresolved::grid::${affiliation.GridId.get.toLowerCase}" else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString)
|
val affId: String =
|
||||||
|
if (affiliation.GridId.isDefined)
|
||||||
|
s"unresolved::grid::${affiliation.GridId.get.toLowerCase}"
|
||||||
|
else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString)
|
||||||
val r: Relation = new Relation
|
val r: Relation = new Relation
|
||||||
r.setSource(pub.getId)
|
r.setSource(pub.getId)
|
||||||
r.setTarget(affId)
|
r.setTarget(affId)
|
||||||
|
@ -191,10 +237,15 @@ object SparkGenerateDoiBoost {
|
||||||
r1.setDataInfo(pub.getDataInfo)
|
r1.setDataInfo(pub.getDataInfo)
|
||||||
r1.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
|
r1.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
|
||||||
List(r, r1)
|
List(r, r1)
|
||||||
})(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved")
|
})(mapEncoderRel)
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved")
|
||||||
|
|
||||||
|
val unresolvedRels: Dataset[(String, Relation)] = spark.read
|
||||||
val unresolvedRels: Dataset[(String, Relation)] = spark.read.load(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved").as[Relation].map(r => {
|
.load(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved")
|
||||||
|
.as[Relation]
|
||||||
|
.map(r => {
|
||||||
|
|
||||||
if (r.getSource.startsWith("unresolved"))
|
if (r.getSource.startsWith("unresolved"))
|
||||||
(r.getSource, r)
|
(r.getSource, r)
|
||||||
|
@ -204,9 +255,16 @@ object SparkGenerateDoiBoost {
|
||||||
("resolved", r)
|
("resolved", r)
|
||||||
})(Encoders.tuple(Encoders.STRING, mapEncoderRel))
|
})(Encoders.tuple(Encoders.STRING, mapEncoderRel))
|
||||||
|
|
||||||
val openaireOrganization: Dataset[(String, String)] = spark.read.text(openaireOrganizationPath).as[String].flatMap(s => extractIdGRID(s)).groupByKey(_._2).reduceGroups((x, y) => if (x != null) x else y).map(_._2)
|
val openaireOrganization: Dataset[(String, String)] = spark.read
|
||||||
|
.text(openaireOrganizationPath)
|
||||||
|
.as[String]
|
||||||
|
.flatMap(s => extractIdGRID(s))
|
||||||
|
.groupByKey(_._2)
|
||||||
|
.reduceGroups((x, y) => if (x != null) x else y)
|
||||||
|
.map(_._2)
|
||||||
|
|
||||||
unresolvedRels.joinWith(openaireOrganization, unresolvedRels("_1").equalTo(openaireOrganization("_2")))
|
unresolvedRels
|
||||||
|
.joinWith(openaireOrganization, unresolvedRels("_1").equalTo(openaireOrganization("_2")))
|
||||||
.map { x =>
|
.map { x =>
|
||||||
val currentRels = x._1._2
|
val currentRels = x._1._2
|
||||||
val currentOrgs = x._2
|
val currentOrgs = x._2
|
||||||
|
@ -216,9 +274,15 @@ object SparkGenerateDoiBoost {
|
||||||
else
|
else
|
||||||
currentRels.setTarget(currentOrgs._1)
|
currentRels.setTarget(currentOrgs._1)
|
||||||
currentRels
|
currentRels
|
||||||
}.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved")).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation")
|
}
|
||||||
|
.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved"))
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingDirPath/doiBoostPublicationAffiliation")
|
||||||
|
|
||||||
magPubs.joinWith(a, magPubs("_1").equalTo(a("PaperId"))).map(item => {
|
magPubs
|
||||||
|
.joinWith(a, magPubs("_1").equalTo(a("PaperId")))
|
||||||
|
.map(item => {
|
||||||
val affiliation = item._2
|
val affiliation = item._2
|
||||||
if (affiliation.GridId.isEmpty) {
|
if (affiliation.GridId.isEmpty) {
|
||||||
val o = new Organization
|
val o = new Organization
|
||||||
|
@ -232,10 +296,13 @@ object SparkGenerateDoiBoost {
|
||||||
o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get))
|
o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get))
|
||||||
o.setCountry(ModelConstants.UNKNOWN_COUNTRY)
|
o.setCountry(ModelConstants.UNKNOWN_COUNTRY)
|
||||||
o
|
o
|
||||||
}
|
} else
|
||||||
else
|
|
||||||
null
|
null
|
||||||
}).filter(o => o != null).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostOrganization")
|
})
|
||||||
|
.filter(o => o != null)
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingDirPath/doiBoostOrganization")
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,11 +22,16 @@ case class CrossrefDT(doi: String, json:String, timestamp: Long) {}
|
||||||
|
|
||||||
case class mappingAffiliation(name: String) {}
|
case class mappingAffiliation(name: String) {}
|
||||||
|
|
||||||
case class mappingAuthor(given: Option[String], family: String, sequence:Option[String], ORCID: Option[String], affiliation: Option[mappingAffiliation]) {}
|
case class mappingAuthor(
|
||||||
|
given: Option[String],
|
||||||
|
family: String,
|
||||||
|
sequence: Option[String],
|
||||||
|
ORCID: Option[String],
|
||||||
|
affiliation: Option[mappingAffiliation]
|
||||||
|
) {}
|
||||||
|
|
||||||
case class mappingFunder(name: String, DOI: Option[String], award: Option[List[String]]) {}
|
case class mappingFunder(name: String, DOI: Option[String], award: Option[List[String]]) {}
|
||||||
|
|
||||||
|
|
||||||
case object Crossref2Oaf {
|
case object Crossref2Oaf {
|
||||||
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
|
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
|
||||||
|
|
||||||
|
@ -56,7 +61,6 @@ case object Crossref2Oaf {
|
||||||
"dataset" -> "dataset"
|
"dataset" -> "dataset"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
val mappingCrossrefSubType = Map(
|
val mappingCrossrefSubType = Map(
|
||||||
"book-section" -> "0013 Part of book or chapter of book",
|
"book-section" -> "0013 Part of book or chapter of book",
|
||||||
"book" -> "0002 Book",
|
"book" -> "0002 Book",
|
||||||
|
@ -100,7 +104,6 @@ case object Crossref2Oaf {
|
||||||
val originalIds = new util.ArrayList(tmp.filter(id => id != null).asJava)
|
val originalIds = new util.ArrayList(tmp.filter(id => id != null).asJava)
|
||||||
result.setOriginalId(originalIds)
|
result.setOriginalId(originalIds)
|
||||||
|
|
||||||
|
|
||||||
// Add DataInfo
|
// Add DataInfo
|
||||||
result.setDataInfo(generateDataInfo())
|
result.setDataInfo(generateDataInfo())
|
||||||
|
|
||||||
|
@ -114,55 +117,105 @@ case object Crossref2Oaf {
|
||||||
if (publisher != null && publisher.nonEmpty)
|
if (publisher != null && publisher.nonEmpty)
|
||||||
result.setPublisher(asField(publisher))
|
result.setPublisher(asField(publisher))
|
||||||
|
|
||||||
|
|
||||||
// TITLE
|
// TITLE
|
||||||
val mainTitles = for {JString(title) <- json \ "title" if title.nonEmpty} yield createSP(title, "main title", ModelConstants.DNET_DATACITE_TITLE)
|
val mainTitles =
|
||||||
val originalTitles = for {JString(title) <- json \ "original-title" if title.nonEmpty} yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
|
for { JString(title) <- json \ "title" if title.nonEmpty } yield createSP(
|
||||||
val shortTitles = for {JString(title) <- json \ "short-title" if title.nonEmpty} yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
|
title,
|
||||||
val subtitles = for {JString(title) <- json \ "subtitle" if title.nonEmpty} yield createSP(title, "subtitle", ModelConstants.DNET_DATACITE_TITLE)
|
"main title",
|
||||||
|
ModelConstants.DNET_DATACITE_TITLE
|
||||||
|
)
|
||||||
|
val originalTitles = for {
|
||||||
|
JString(title) <- json \ "original-title" if title.nonEmpty
|
||||||
|
} yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
|
||||||
|
val shortTitles = for {
|
||||||
|
JString(title) <- json \ "short-title" if title.nonEmpty
|
||||||
|
} yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
|
||||||
|
val subtitles =
|
||||||
|
for { JString(title) <- json \ "subtitle" if title.nonEmpty } yield createSP(
|
||||||
|
title,
|
||||||
|
"subtitle",
|
||||||
|
ModelConstants.DNET_DATACITE_TITLE
|
||||||
|
)
|
||||||
result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava)
|
result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava)
|
||||||
|
|
||||||
// DESCRIPTION
|
// DESCRIPTION
|
||||||
val descriptionList = for {JString(description) <- json \ "abstract"} yield asField(description)
|
val descriptionList =
|
||||||
|
for { JString(description) <- json \ "abstract" } yield asField(description)
|
||||||
result.setDescription(descriptionList.asJava)
|
result.setDescription(descriptionList.asJava)
|
||||||
|
|
||||||
// Source
|
// Source
|
||||||
val sourceList = for {JString(source) <- json \ "source" if source!= null && source.nonEmpty} yield asField(source)
|
val sourceList = for {
|
||||||
|
JString(source) <- json \ "source" if source != null && source.nonEmpty
|
||||||
|
} yield asField(source)
|
||||||
result.setSource(sourceList.asJava)
|
result.setSource(sourceList.asJava)
|
||||||
|
|
||||||
//RELEVANT DATE Mapping
|
//RELEVANT DATE Mapping
|
||||||
val createdDate = generateDate((json \ "created" \ "date-time").extract[String], (json \ "created" \ "date-parts").extract[List[List[Int]]], "created", ModelConstants.DNET_DATACITE_DATE)
|
val createdDate = generateDate(
|
||||||
val postedDate = generateDate((json \ "posted" \ "date-time").extractOrElse[String](null), (json \ "posted" \ "date-parts").extract[List[List[Int]]], "available", ModelConstants.DNET_DATACITE_DATE)
|
(json \ "created" \ "date-time").extract[String],
|
||||||
val acceptedDate = generateDate((json \ "accepted" \ "date-time").extractOrElse[String](null), (json \ "accepted" \ "date-parts").extract[List[List[Int]]], "accepted", ModelConstants.DNET_DATACITE_DATE)
|
(json \ "created" \ "date-parts").extract[List[List[Int]]],
|
||||||
val publishedPrintDate = generateDate((json \ "published-print" \ "date-time").extractOrElse[String](null), (json \ "published-print" \ "date-parts").extract[List[List[Int]]], "published-print", ModelConstants.DNET_DATACITE_DATE)
|
"created",
|
||||||
val publishedOnlineDate = generateDate((json \ "published-online" \ "date-time").extractOrElse[String](null), (json \ "published-online" \ "date-parts").extract[List[List[Int]]], "published-online", ModelConstants.DNET_DATACITE_DATE)
|
ModelConstants.DNET_DATACITE_DATE
|
||||||
|
)
|
||||||
|
val postedDate = generateDate(
|
||||||
|
(json \ "posted" \ "date-time").extractOrElse[String](null),
|
||||||
|
(json \ "posted" \ "date-parts").extract[List[List[Int]]],
|
||||||
|
"available",
|
||||||
|
ModelConstants.DNET_DATACITE_DATE
|
||||||
|
)
|
||||||
|
val acceptedDate = generateDate(
|
||||||
|
(json \ "accepted" \ "date-time").extractOrElse[String](null),
|
||||||
|
(json \ "accepted" \ "date-parts").extract[List[List[Int]]],
|
||||||
|
"accepted",
|
||||||
|
ModelConstants.DNET_DATACITE_DATE
|
||||||
|
)
|
||||||
|
val publishedPrintDate = generateDate(
|
||||||
|
(json \ "published-print" \ "date-time").extractOrElse[String](null),
|
||||||
|
(json \ "published-print" \ "date-parts").extract[List[List[Int]]],
|
||||||
|
"published-print",
|
||||||
|
ModelConstants.DNET_DATACITE_DATE
|
||||||
|
)
|
||||||
|
val publishedOnlineDate = generateDate(
|
||||||
|
(json \ "published-online" \ "date-time").extractOrElse[String](null),
|
||||||
|
(json \ "published-online" \ "date-parts").extract[List[List[Int]]],
|
||||||
|
"published-online",
|
||||||
|
ModelConstants.DNET_DATACITE_DATE
|
||||||
|
)
|
||||||
|
|
||||||
val issuedDate = extractDate((json \ "issued" \ "date-time").extractOrElse[String](null), (json \ "issued" \ "date-parts").extract[List[List[Int]]])
|
val issuedDate = extractDate(
|
||||||
|
(json \ "issued" \ "date-time").extractOrElse[String](null),
|
||||||
|
(json \ "issued" \ "date-parts").extract[List[List[Int]]]
|
||||||
|
)
|
||||||
if (StringUtils.isNotBlank(issuedDate)) {
|
if (StringUtils.isNotBlank(issuedDate)) {
|
||||||
result.setDateofacceptance(asField(issuedDate))
|
result.setDateofacceptance(asField(issuedDate))
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
result.setDateofacceptance(asField(createdDate.getValue))
|
result.setDateofacceptance(asField(createdDate.getValue))
|
||||||
}
|
}
|
||||||
result.setRelevantdate(List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate).filter(p => p != null).asJava)
|
result.setRelevantdate(
|
||||||
|
List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate)
|
||||||
|
.filter(p => p != null)
|
||||||
|
.asJava
|
||||||
|
)
|
||||||
|
|
||||||
//Mapping Subject
|
//Mapping Subject
|
||||||
val subjectList: List[String] = (json \ "subject").extractOrElse[List[String]](List())
|
val subjectList: List[String] = (json \ "subject").extractOrElse[List[String]](List())
|
||||||
|
|
||||||
if (subjectList.nonEmpty) {
|
if (subjectList.nonEmpty) {
|
||||||
result.setSubject(subjectList.map(s=> createSP(s, "keywords", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava)
|
result.setSubject(
|
||||||
|
subjectList.map(s => createSP(s, "keywords", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
//Mapping Author
|
//Mapping Author
|
||||||
val authorList: List[mappingAuthor] = (json \ "author").extractOrElse[List[mappingAuthor]](List())
|
val authorList: List[mappingAuthor] =
|
||||||
|
(json \ "author").extractOrElse[List[mappingAuthor]](List())
|
||||||
|
|
||||||
|
val sorted_list = authorList.sortWith((a: mappingAuthor, b: mappingAuthor) =>
|
||||||
|
a.sequence.isDefined && a.sequence.get.equalsIgnoreCase("first")
|
||||||
|
)
|
||||||
|
|
||||||
|
result.setAuthor(sorted_list.zipWithIndex.map { case (a, index) =>
|
||||||
val sorted_list = authorList.sortWith((a:mappingAuthor, b:mappingAuthor) => a.sequence.isDefined && a.sequence.get.equalsIgnoreCase("first"))
|
generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull, index)
|
||||||
|
}.asJava)
|
||||||
result.setAuthor(sorted_list.zipWithIndex.map{case (a, index) => generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull, index)}.asJava)
|
|
||||||
|
|
||||||
// Mapping instance
|
// Mapping instance
|
||||||
val instance = new Instance()
|
val instance = new Instance()
|
||||||
|
@ -179,9 +232,9 @@ case object Crossref2Oaf {
|
||||||
instance.setLicense(d._1)
|
instance.setLicense(d._1)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
instance.setLicense(l.head._1)
|
||||||
}
|
}
|
||||||
else{
|
|
||||||
instance.setLicense(l.head._1)}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ticket #6281 added pid to Instance
|
// Ticket #6281 added pid to Instance
|
||||||
|
@ -191,18 +244,39 @@ case object Crossref2Oaf {
|
||||||
|
|
||||||
if (has_review != JNothing) {
|
if (has_review != JNothing) {
|
||||||
instance.setRefereed(
|
instance.setRefereed(
|
||||||
OafMapperUtils.qualifier("0001", "peerReviewed", ModelConstants.DNET_REVIEW_LEVELS, ModelConstants.DNET_REVIEW_LEVELS))
|
OafMapperUtils.qualifier(
|
||||||
|
"0001",
|
||||||
|
"peerReviewed",
|
||||||
|
ModelConstants.DNET_REVIEW_LEVELS,
|
||||||
|
ModelConstants.DNET_REVIEW_LEVELS
|
||||||
|
)
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
instance.setAccessright(decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue))
|
instance.setAccessright(
|
||||||
instance.setInstancetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue)
|
||||||
result.setResourcetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
)
|
||||||
|
instance.setInstancetype(
|
||||||
|
OafMapperUtils.qualifier(
|
||||||
|
cobjCategory.substring(0, 4),
|
||||||
|
cobjCategory.substring(5),
|
||||||
|
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||||
|
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||||
|
)
|
||||||
|
)
|
||||||
|
result.setResourcetype(
|
||||||
|
OafMapperUtils.qualifier(
|
||||||
|
cobjCategory.substring(0, 4),
|
||||||
|
cobjCategory.substring(5),
|
||||||
|
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||||
|
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
instance.setCollectedfrom(createCrossrefCollectedFrom())
|
instance.setCollectedfrom(createCrossrefCollectedFrom())
|
||||||
if (StringUtils.isNotBlank(issuedDate)) {
|
if (StringUtils.isNotBlank(issuedDate)) {
|
||||||
instance.setDateofacceptance(asField(issuedDate))
|
instance.setDateofacceptance(asField(issuedDate))
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
instance.setDateofacceptance(asField(createdDate.getValue))
|
instance.setDateofacceptance(asField(createdDate.getValue))
|
||||||
}
|
}
|
||||||
val s: List[String] = List("https://doi.org/" + doi)
|
val s: List[String] = List("https://doi.org/" + doi)
|
||||||
|
@ -210,8 +284,7 @@ case object Crossref2Oaf {
|
||||||
// if (links.nonEmpty) {
|
// if (links.nonEmpty) {
|
||||||
// instance.setUrl(links.asJava)
|
// instance.setUrl(links.asJava)
|
||||||
// }
|
// }
|
||||||
if(s.nonEmpty)
|
if (s.nonEmpty) {
|
||||||
{
|
|
||||||
instance.setUrl(s.asJava)
|
instance.setUrl(s.asJava)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -236,7 +309,6 @@ case object Crossref2Oaf {
|
||||||
result
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def generateAuhtor(given: String, family: String, orcid: String, index: Int): Author = {
|
def generateAuhtor(given: String, family: String, orcid: String, index: Int): Author = {
|
||||||
val a = new Author
|
val a = new Author
|
||||||
a.setName(given)
|
a.setName(given)
|
||||||
|
@ -244,7 +316,16 @@ case object Crossref2Oaf {
|
||||||
a.setFullname(s"$given $family")
|
a.setFullname(s"$given $family")
|
||||||
a.setRank(index + 1)
|
a.setRank(index + 1)
|
||||||
if (StringUtils.isNotBlank(orcid))
|
if (StringUtils.isNotBlank(orcid))
|
||||||
a.setPid(List(createSP(orcid, ModelConstants.ORCID_PENDING, ModelConstants.DNET_PID_TYPES, generateDataInfo())).asJava)
|
a.setPid(
|
||||||
|
List(
|
||||||
|
createSP(
|
||||||
|
orcid,
|
||||||
|
ModelConstants.ORCID_PENDING,
|
||||||
|
ModelConstants.DNET_PID_TYPES,
|
||||||
|
generateDataInfo()
|
||||||
|
)
|
||||||
|
).asJava
|
||||||
|
)
|
||||||
|
|
||||||
a
|
a
|
||||||
}
|
}
|
||||||
|
@ -255,29 +336,35 @@ case object Crossref2Oaf {
|
||||||
|
|
||||||
var resultList: List[Oaf] = List()
|
var resultList: List[Oaf] = List()
|
||||||
|
|
||||||
|
|
||||||
val objectType = (json \ "type").extractOrElse[String](null)
|
val objectType = (json \ "type").extractOrElse[String](null)
|
||||||
val objectSubType = (json \ "subtype").extractOrElse[String](null)
|
val objectSubType = (json \ "subtype").extractOrElse[String](null)
|
||||||
if (objectType == null)
|
if (objectType == null)
|
||||||
return resultList
|
return resultList
|
||||||
|
|
||||||
|
|
||||||
val result = generateItemFromType(objectType, objectSubType)
|
val result = generateItemFromType(objectType, objectSubType)
|
||||||
if (result == null)
|
if (result == null)
|
||||||
return List()
|
return List()
|
||||||
val cOBJCategory = mappingCrossrefSubType.getOrElse(objectType, mappingCrossrefSubType.getOrElse(objectSubType, "0038 Other literature type"))
|
val cOBJCategory = mappingCrossrefSubType.getOrElse(
|
||||||
|
objectType,
|
||||||
|
mappingCrossrefSubType.getOrElse(objectSubType, "0038 Other literature type")
|
||||||
|
)
|
||||||
mappingResult(result, json, cOBJCategory)
|
mappingResult(result, json, cOBJCategory)
|
||||||
if (result == null || result.getId == null)
|
if (result == null || result.getId == null)
|
||||||
return List()
|
return List()
|
||||||
|
|
||||||
|
val funderList: List[mappingFunder] =
|
||||||
val funderList: List[mappingFunder] = (json \ "funder").extractOrElse[List[mappingFunder]](List())
|
(json \ "funder").extractOrElse[List[mappingFunder]](List())
|
||||||
|
|
||||||
if (funderList.nonEmpty) {
|
if (funderList.nonEmpty) {
|
||||||
resultList = resultList ::: mappingFunderToRelations(funderList, result.getId, createCrossrefCollectedFrom(), result.getDataInfo, result.getLastupdatetimestamp)
|
resultList = resultList ::: mappingFunderToRelations(
|
||||||
|
funderList,
|
||||||
|
result.getId,
|
||||||
|
createCrossrefCollectedFrom(),
|
||||||
|
result.getDataInfo,
|
||||||
|
result.getLastupdatetimestamp
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
result match {
|
result match {
|
||||||
case publication: Publication => convertPublication(publication, json, cOBJCategory)
|
case publication: Publication => convertPublication(publication, json, cOBJCategory)
|
||||||
case dataset: Dataset => convertDataset(dataset)
|
case dataset: Dataset => convertDataset(dataset)
|
||||||
|
@ -287,22 +374,24 @@ case object Crossref2Oaf {
|
||||||
resultList
|
resultList
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def mappingFunderToRelations(
|
||||||
def mappingFunderToRelations(funders: List[mappingFunder], sourceId: String, cf: KeyValue, di: DataInfo, ts: Long): List[Relation] = {
|
funders: List[mappingFunder],
|
||||||
|
sourceId: String,
|
||||||
|
cf: KeyValue,
|
||||||
|
di: DataInfo,
|
||||||
|
ts: Long
|
||||||
|
): List[Relation] = {
|
||||||
|
|
||||||
val queue = new mutable.Queue[Relation]
|
val queue = new mutable.Queue[Relation]
|
||||||
|
|
||||||
|
|
||||||
def snsfRule(award: String): String = {
|
def snsfRule(award: String): String = {
|
||||||
val tmp1 = StringUtils.substringAfter(award, "_")
|
val tmp1 = StringUtils.substringAfter(award, "_")
|
||||||
val tmp2 = StringUtils.substringBefore(tmp1, "/")
|
val tmp2 = StringUtils.substringBefore(tmp1, "/")
|
||||||
logger.debug(s"From $award to $tmp2")
|
logger.debug(s"From $award to $tmp2")
|
||||||
tmp2
|
tmp2
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def extractECAward(award: String): String = {
|
def extractECAward(award: String): String = {
|
||||||
val awardECRegex: Regex = "[0-9]{4,9}".r
|
val awardECRegex: Regex = "[0-9]{4,9}".r
|
||||||
if (awardECRegex.findAllIn(award).hasNext)
|
if (awardECRegex.findAllIn(award).hasNext)
|
||||||
|
@ -310,7 +399,6 @@ case object Crossref2Oaf {
|
||||||
null
|
null
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def generateRelation(sourceId: String, targetId: String, relClass: String): Relation = {
|
def generateRelation(sourceId: String, targetId: String, relClass: String): Relation = {
|
||||||
|
|
||||||
val r = new Relation
|
val r = new Relation
|
||||||
|
@ -324,89 +412,111 @@ case object Crossref2Oaf {
|
||||||
r.setLastupdatetimestamp(ts)
|
r.setLastupdatetimestamp(ts)
|
||||||
r
|
r
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def generateSimpleRelationFromAward(
|
||||||
def generateSimpleRelationFromAward(funder: mappingFunder, nsPrefix: String, extractField: String => String): Unit = {
|
funder: mappingFunder,
|
||||||
|
nsPrefix: String,
|
||||||
|
extractField: String => String
|
||||||
|
): Unit = {
|
||||||
if (funder.award.isDefined && funder.award.get.nonEmpty)
|
if (funder.award.isDefined && funder.award.get.nonEmpty)
|
||||||
funder.award.get.map(extractField).filter(a => a!= null && a.nonEmpty).foreach(
|
funder.award.get
|
||||||
award => {
|
.map(extractField)
|
||||||
|
.filter(a => a != null && a.nonEmpty)
|
||||||
|
.foreach(award => {
|
||||||
val targetId = getProjectId(nsPrefix, DHPUtils.md5(award))
|
val targetId = getProjectId(nsPrefix, DHPUtils.md5(award))
|
||||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||||
}
|
})
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def getProjectId(nsPrefix: String, targetId: String): String = {
|
def getProjectId(nsPrefix: String, targetId: String): String = {
|
||||||
s"40|$nsPrefix::$targetId"
|
s"40|$nsPrefix::$targetId"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (funders != null)
|
if (funders != null)
|
||||||
funders.foreach(funder => {
|
funders.foreach(funder => {
|
||||||
if (funder.DOI.isDefined && funder.DOI.get.nonEmpty) {
|
if (funder.DOI.isDefined && funder.DOI.get.nonEmpty) {
|
||||||
funder.DOI.get match {
|
funder.DOI.get match {
|
||||||
case "10.13039/100010663" |
|
case "10.13039/100010663" | "10.13039/100010661" | "10.13039/501100007601" | "10.13039/501100000780" |
|
||||||
"10.13039/100010661" |
|
"10.13039/100010665" =>
|
||||||
"10.13039/501100007601" |
|
|
||||||
"10.13039/501100000780" |
|
|
||||||
"10.13039/100010665" => generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
|
||||||
case "10.13039/100011199" |
|
|
||||||
"10.13039/100004431" |
|
|
||||||
"10.13039/501100004963" |
|
|
||||||
"10.13039/501100000780" => generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
|
|
||||||
case "10.13039/501100000781" => generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
|
|
||||||
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
||||||
case "10.13039/100000001" => generateSimpleRelationFromAward(funder, "nsf_________", a => a)
|
case "10.13039/100011199" | "10.13039/100004431" | "10.13039/501100004963" | "10.13039/501100000780" =>
|
||||||
case "10.13039/501100001665" => generateSimpleRelationFromAward(funder, "anr_________", a => a)
|
generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
|
||||||
case "10.13039/501100002341" => generateSimpleRelationFromAward(funder, "aka_________", a => a)
|
case "10.13039/501100000781" =>
|
||||||
case "10.13039/501100001602" => generateSimpleRelationFromAward(funder, "aka_________", a => a.replace("SFI", ""))
|
generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
|
||||||
case "10.13039/501100000923" => generateSimpleRelationFromAward(funder, "arc_________", a => a)
|
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
||||||
case "10.13039/501100000038"=> val targetId = getProjectId("nserc_______" , "1e5e62235d094afd01cd56e65112fc63")
|
case "10.13039/100000001" =>
|
||||||
|
generateSimpleRelationFromAward(funder, "nsf_________", a => a)
|
||||||
|
case "10.13039/501100001665" =>
|
||||||
|
generateSimpleRelationFromAward(funder, "anr_________", a => a)
|
||||||
|
case "10.13039/501100002341" =>
|
||||||
|
generateSimpleRelationFromAward(funder, "aka_________", a => a)
|
||||||
|
case "10.13039/501100001602" =>
|
||||||
|
generateSimpleRelationFromAward(funder, "aka_________", a => a.replace("SFI", ""))
|
||||||
|
case "10.13039/501100000923" =>
|
||||||
|
generateSimpleRelationFromAward(funder, "arc_________", a => a)
|
||||||
|
case "10.13039/501100000038" =>
|
||||||
|
val targetId = getProjectId("nserc_______", "1e5e62235d094afd01cd56e65112fc63")
|
||||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||||
case "10.13039/501100000155"=> val targetId = getProjectId("sshrc_______" , "1e5e62235d094afd01cd56e65112fc63")
|
case "10.13039/501100000155" =>
|
||||||
|
val targetId = getProjectId("sshrc_______", "1e5e62235d094afd01cd56e65112fc63")
|
||||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||||
case "10.13039/501100000024"=> val targetId = getProjectId("cihr________" , "1e5e62235d094afd01cd56e65112fc63")
|
case "10.13039/501100000024" =>
|
||||||
|
val targetId = getProjectId("cihr________", "1e5e62235d094afd01cd56e65112fc63")
|
||||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||||
case "10.13039/501100002848" => generateSimpleRelationFromAward(funder, "conicytf____", a => a)
|
case "10.13039/501100002848" =>
|
||||||
case "10.13039/501100003448" => generateSimpleRelationFromAward(funder, "gsrt________", extractECAward)
|
generateSimpleRelationFromAward(funder, "conicytf____", a => a)
|
||||||
case "10.13039/501100010198" => generateSimpleRelationFromAward(funder, "sgov________", a=>a)
|
case "10.13039/501100003448" =>
|
||||||
case "10.13039/501100004564" => generateSimpleRelationFromAward(funder, "mestd_______", extractECAward)
|
generateSimpleRelationFromAward(funder, "gsrt________", extractECAward)
|
||||||
case "10.13039/501100003407" => generateSimpleRelationFromAward(funder, "miur________", a=>a)
|
case "10.13039/501100010198" =>
|
||||||
|
generateSimpleRelationFromAward(funder, "sgov________", a => a)
|
||||||
|
case "10.13039/501100004564" =>
|
||||||
|
generateSimpleRelationFromAward(funder, "mestd_______", extractECAward)
|
||||||
|
case "10.13039/501100003407" =>
|
||||||
|
generateSimpleRelationFromAward(funder, "miur________", a => a)
|
||||||
val targetId = getProjectId("miur________", "1e5e62235d094afd01cd56e65112fc63")
|
val targetId = getProjectId("miur________", "1e5e62235d094afd01cd56e65112fc63")
|
||||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||||
case "10.13039/501100006588" |
|
case "10.13039/501100006588" | "10.13039/501100004488" =>
|
||||||
"10.13039/501100004488" => generateSimpleRelationFromAward(funder, "irb_hr______", a=>a.replaceAll("Project No.", "").replaceAll("HRZZ-","") )
|
generateSimpleRelationFromAward(
|
||||||
case "10.13039/501100006769"=> generateSimpleRelationFromAward(funder, "rsf_________", a=>a)
|
funder,
|
||||||
case "10.13039/501100001711"=> generateSimpleRelationFromAward(funder, "snsf________", snsfRule)
|
"irb_hr______",
|
||||||
case "10.13039/501100004410"=> generateSimpleRelationFromAward(funder, "tubitakf____", a =>a)
|
a => a.replaceAll("Project No.", "").replaceAll("HRZZ-", "")
|
||||||
case "10.10.13039/100004440"=> generateSimpleRelationFromAward(funder, "wt__________", a =>a)
|
)
|
||||||
case "10.13039/100004440"=> val targetId = getProjectId("wt__________" , "1e5e62235d094afd01cd56e65112fc63")
|
case "10.13039/501100006769" =>
|
||||||
|
generateSimpleRelationFromAward(funder, "rsf_________", a => a)
|
||||||
|
case "10.13039/501100001711" =>
|
||||||
|
generateSimpleRelationFromAward(funder, "snsf________", snsfRule)
|
||||||
|
case "10.13039/501100004410" =>
|
||||||
|
generateSimpleRelationFromAward(funder, "tubitakf____", a => a)
|
||||||
|
case "10.10.13039/100004440" =>
|
||||||
|
generateSimpleRelationFromAward(funder, "wt__________", a => a)
|
||||||
|
case "10.13039/100004440" =>
|
||||||
|
val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
|
||||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||||
|
|
||||||
case _ => logger.debug("no match for " + funder.DOI.get)
|
case _ => logger.debug("no match for " + funder.DOI.get)
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
funder.name match {
|
funder.name match {
|
||||||
case "European Union’s Horizon 2020 research and innovation program" => generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
case "European Union’s Horizon 2020 research and innovation program" =>
|
||||||
|
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
||||||
case "European Union's" =>
|
case "European Union's" =>
|
||||||
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
||||||
generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
|
generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
|
||||||
case "The French National Research Agency (ANR)" |
|
case "The French National Research Agency (ANR)" | "The French National Research Agency" =>
|
||||||
"The French National Research Agency" => generateSimpleRelationFromAward(funder, "anr_________", a => a)
|
generateSimpleRelationFromAward(funder, "anr_________", a => a)
|
||||||
case "CONICYT, Programa de Formación de Capital Humano Avanzado" => generateSimpleRelationFromAward(funder, "conicytf____", extractECAward)
|
case "CONICYT, Programa de Formación de Capital Humano Avanzado" =>
|
||||||
case "Wellcome Trust Masters Fellowship" => val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
|
generateSimpleRelationFromAward(funder, "conicytf____", extractECAward)
|
||||||
|
case "Wellcome Trust Masters Fellowship" =>
|
||||||
|
val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
|
||||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||||
case _ => logger.debug("no match for " + funder.name)
|
case _ => logger.debug("no match for " + funder.name)
|
||||||
|
@ -414,8 +524,7 @@ case object Crossref2Oaf {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
})
|
||||||
)
|
|
||||||
queue.toList
|
queue.toList
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -423,12 +532,10 @@ case object Crossref2Oaf {
|
||||||
// TODO check if there are other info to map into the Dataset
|
// TODO check if there are other info to map into the Dataset
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def convertPublication(publication: Publication, json: JValue, cobjCategory: String): Unit = {
|
def convertPublication(publication: Publication, json: JValue, cobjCategory: String): Unit = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
val containerTitles = for { JString(ct) <- json \ "container-title" } yield ct
|
val containerTitles = for { JString(ct) <- json \ "container-title" } yield ct
|
||||||
|
|
||||||
|
|
||||||
//Mapping book
|
//Mapping book
|
||||||
if (cobjCategory.toLowerCase.contains("book")) {
|
if (cobjCategory.toLowerCase.contains("book")) {
|
||||||
val ISBN = for { JString(isbn) <- json \ "ISBN" } yield isbn
|
val ISBN = for { JString(isbn) <- json \ "ISBN" } yield isbn
|
||||||
|
@ -438,14 +545,14 @@ case object Crossref2Oaf {
|
||||||
val l: List[Field[String]] = publication.getSource.asScala.toList
|
val l: List[Field[String]] = publication.getSource.asScala.toList
|
||||||
val ll: List[Field[String]] = l ::: List(asField(source))
|
val ll: List[Field[String]] = l ::: List(asField(source))
|
||||||
publication.setSource(ll.asJava)
|
publication.setSource(ll.asJava)
|
||||||
}
|
} else
|
||||||
else
|
|
||||||
publication.setSource(List(asField(source)).asJava)
|
publication.setSource(List(asField(source)).asJava)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Mapping Journal
|
// Mapping Journal
|
||||||
|
|
||||||
val issnInfos = for {JArray(issn_types) <- json \ "issn-type"
|
val issnInfos = for {
|
||||||
|
JArray(issn_types) <- json \ "issn-type"
|
||||||
JObject(issn_type) <- issn_types
|
JObject(issn_type) <- issn_types
|
||||||
JField("type", JString(tp)) <- issn_type
|
JField("type", JString(tp)) <- issn_type
|
||||||
JField("value", JString(vl)) <- issn_type
|
JField("value", JString(vl)) <- issn_type
|
||||||
|
@ -494,7 +601,12 @@ case object Crossref2Oaf {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def generateDate(dt: String, datePart: List[List[Int]], classId: String, schemeId: String): StructuredProperty = {
|
def generateDate(
|
||||||
|
dt: String,
|
||||||
|
datePart: List[List[Int]],
|
||||||
|
classId: String,
|
||||||
|
schemeId: String
|
||||||
|
): StructuredProperty = {
|
||||||
val dp = extractDate(dt, datePart)
|
val dp = extractDate(dt, datePart)
|
||||||
if (StringUtils.isNotBlank(dp))
|
if (StringUtils.isNotBlank(dp))
|
||||||
return createSP(dp, classId, schemeId)
|
return createSP(dp, classId, schemeId)
|
||||||
|
|
|
@ -16,7 +16,6 @@ object CrossrefDataset {
|
||||||
|
|
||||||
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
|
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
|
||||||
|
|
||||||
|
|
||||||
def to_item(input: String): CrossrefDT = {
|
def to_item(input: String): CrossrefDT = {
|
||||||
|
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
@ -29,19 +28,24 @@ object CrossrefDataset {
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
|
||||||
|
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
CrossrefDataset.getClass.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(SparkMapDumpIntoOAF.getClass.getSimpleName)
|
.appName(SparkMapDumpIntoOAF.getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
|
|
||||||
val crossrefAggregator = new Aggregator[CrossrefDT, CrossrefDT, CrossrefDT] with Serializable {
|
val crossrefAggregator = new Aggregator[CrossrefDT, CrossrefDT, CrossrefDT] with Serializable {
|
||||||
|
|
||||||
override def zero: CrossrefDT = null
|
override def zero: CrossrefDT = null
|
||||||
|
@ -52,7 +56,6 @@ object CrossrefDataset {
|
||||||
if (a == null)
|
if (a == null)
|
||||||
return b
|
return b
|
||||||
|
|
||||||
|
|
||||||
if (a.timestamp > b.timestamp) {
|
if (a.timestamp > b.timestamp) {
|
||||||
return a
|
return a
|
||||||
}
|
}
|
||||||
|
@ -80,19 +83,24 @@ object CrossrefDataset {
|
||||||
|
|
||||||
val workingPath: String = parser.get("workingPath")
|
val workingPath: String = parser.get("workingPath")
|
||||||
|
|
||||||
|
|
||||||
val main_ds: Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT]
|
val main_ds: Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT]
|
||||||
|
|
||||||
|
|
||||||
val update =
|
val update =
|
||||||
spark.createDataset(spark.sparkContext.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text])
|
spark.createDataset(
|
||||||
|
spark.sparkContext
|
||||||
|
.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text])
|
||||||
.map(i => CrossrefImporter.decompressBlob(i._2.toString))
|
.map(i => CrossrefImporter.decompressBlob(i._2.toString))
|
||||||
.map(i => to_item(i)))
|
.map(i => to_item(i))
|
||||||
|
)
|
||||||
|
|
||||||
main_ds.union(update).groupByKey(_.doi)
|
main_ds
|
||||||
|
.union(update)
|
||||||
|
.groupByKey(_.doi)
|
||||||
.agg(crossrefAggregator.toColumn)
|
.agg(crossrefAggregator.toColumn)
|
||||||
.map(s => s._2)
|
.map(s => s._2)
|
||||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/crossref_ds_updated")
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingPath/crossref_ds_updated")
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,7 +18,6 @@ object GenerateCrossrefDataset {
|
||||||
|
|
||||||
implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT]
|
implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT]
|
||||||
|
|
||||||
|
|
||||||
def crossrefElement(meta: String): CrossrefDT = {
|
def crossrefElement(meta: String): CrossrefDT = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
lazy val json: json4s.JValue = parse(meta)
|
lazy val json: json4s.JValue = parse(meta)
|
||||||
|
@ -30,13 +29,23 @@ object GenerateCrossrefDataset {
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val conf = new SparkConf
|
val conf = new SparkConf
|
||||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json")).mkString)
|
val parser = new ArgumentApplicationParser(
|
||||||
|
Source
|
||||||
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val master = parser.get("master")
|
val master = parser.get("master")
|
||||||
val sourcePath = parser.get("sourcePath")
|
val sourcePath = parser.get("sourcePath")
|
||||||
val targetPath = parser.get("targetPath")
|
val targetPath = parser.get("targetPath")
|
||||||
|
|
||||||
val spark: SparkSession = SparkSession.builder().config(conf)
|
val spark: SparkSession = SparkSession
|
||||||
|
.builder()
|
||||||
|
.config(conf)
|
||||||
.appName(UnpackCrtossrefEntries.getClass.getSimpleName)
|
.appName(UnpackCrtossrefEntries.getClass.getSimpleName)
|
||||||
.master(master)
|
.master(master)
|
||||||
.getOrCreate()
|
.getOrCreate()
|
||||||
|
@ -44,12 +53,14 @@ object GenerateCrossrefDataset {
|
||||||
|
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
|
|
||||||
val tmp: RDD[String] = sc.textFile(sourcePath, 6000)
|
val tmp: RDD[String] = sc.textFile(sourcePath, 6000)
|
||||||
|
|
||||||
spark.createDataset(tmp)
|
spark
|
||||||
|
.createDataset(tmp)
|
||||||
.map(entry => crossrefElement(entry))
|
.map(entry => crossrefElement(entry))
|
||||||
.write.mode(SaveMode.Overwrite).save(targetPath)
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(targetPath)
|
||||||
// .map(meta => crossrefElement(meta))
|
// .map(meta => crossrefElement(meta))
|
||||||
// .toDS.as[CrossrefDT]
|
// .toDS.as[CrossrefDT]
|
||||||
// .write.mode(SaveMode.Overwrite).save(targetPath)
|
// .write.mode(SaveMode.Overwrite).save(targetPath)
|
||||||
|
|
|
@ -8,7 +8,6 @@ import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql._
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
|
|
||||||
case class Reference(author: String, firstPage: String) {}
|
case class Reference(author: String, firstPage: String) {}
|
||||||
|
|
||||||
object SparkMapDumpIntoOAF {
|
object SparkMapDumpIntoOAF {
|
||||||
|
@ -19,14 +18,21 @@ object SparkMapDumpIntoOAF {
|
||||||
|
|
||||||
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
|
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkMapDumpIntoOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_crossref_dump_to_oaf_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
SparkMapDumpIntoOAF.getClass.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/doiboost/convert_crossref_dump_to_oaf_params.json"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(SparkMapDumpIntoOAF.getClass.getSimpleName)
|
.appName(SparkMapDumpIntoOAF.getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||||
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
|
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
|
||||||
|
@ -35,19 +41,34 @@ object SparkMapDumpIntoOAF {
|
||||||
|
|
||||||
val targetPath = parser.get("targetPath")
|
val targetPath = parser.get("targetPath")
|
||||||
|
|
||||||
spark.read.load(parser.get("sourcePath")).as[CrossrefDT]
|
spark.read
|
||||||
|
.load(parser.get("sourcePath"))
|
||||||
|
.as[CrossrefDT]
|
||||||
.flatMap(k => Crossref2Oaf.convert(k.json))
|
.flatMap(k => Crossref2Oaf.convert(k.json))
|
||||||
.filter(o => o != null)
|
.filter(o => o != null)
|
||||||
.write.mode(SaveMode.Overwrite).save(s"$targetPath/mixObject")
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$targetPath/mixObject")
|
||||||
|
|
||||||
val ds: Dataset[Oaf] = spark.read.load(s"$targetPath/mixObject").as[Oaf]
|
val ds: Dataset[Oaf] = spark.read.load(s"$targetPath/mixObject").as[Oaf]
|
||||||
|
|
||||||
ds.filter(o => o.isInstanceOf[Publication]).map(o => o.asInstanceOf[Publication]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefPublication")
|
ds.filter(o => o.isInstanceOf[Publication])
|
||||||
|
.map(o => o.asInstanceOf[Publication])
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$targetPath/crossrefPublication")
|
||||||
|
|
||||||
ds.filter(o => o.isInstanceOf[Relation]).map(o => o.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefRelation")
|
ds.filter(o => o.isInstanceOf[Relation])
|
||||||
|
.map(o => o.asInstanceOf[Relation])
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$targetPath/crossrefRelation")
|
||||||
|
|
||||||
ds.filter(o => o.isInstanceOf[OafDataset]).map(o => o.asInstanceOf[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefDataset")
|
ds.filter(o => o.isInstanceOf[OafDataset])
|
||||||
|
.map(o => o.asInstanceOf[OafDataset])
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$targetPath/crossrefDataset")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,7 +16,6 @@ object UnpackCrtossrefEntries {
|
||||||
|
|
||||||
val log: Logger = LoggerFactory.getLogger(UnpackCrtossrefEntries.getClass)
|
val log: Logger = LoggerFactory.getLogger(UnpackCrtossrefEntries.getClass)
|
||||||
|
|
||||||
|
|
||||||
def extractDump(input: String): List[String] = {
|
def extractDump(input: String): List[String] = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
lazy val json: json4s.JValue = parse(input)
|
lazy val json: json4s.JValue = parse(input)
|
||||||
|
@ -24,28 +23,36 @@ object UnpackCrtossrefEntries {
|
||||||
val a = (json \ "items").extract[JArray]
|
val a = (json \ "items").extract[JArray]
|
||||||
a.arr.map(s => compact(render(s)))
|
a.arr.map(s => compact(render(s)))
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val conf = new SparkConf
|
val conf = new SparkConf
|
||||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json")).mkString)
|
val parser = new ArgumentApplicationParser(
|
||||||
|
Source
|
||||||
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val master = parser.get("master")
|
val master = parser.get("master")
|
||||||
val sourcePath = parser.get("sourcePath")
|
val sourcePath = parser.get("sourcePath")
|
||||||
val targetPath = parser.get("targetPath")
|
val targetPath = parser.get("targetPath")
|
||||||
|
|
||||||
val spark: SparkSession = SparkSession.builder().config(conf)
|
val spark: SparkSession = SparkSession
|
||||||
|
.builder()
|
||||||
|
.config(conf)
|
||||||
.appName(UnpackCrtossrefEntries.getClass.getSimpleName)
|
.appName(UnpackCrtossrefEntries.getClass.getSimpleName)
|
||||||
.master(master)
|
.master(master)
|
||||||
.getOrCreate()
|
.getOrCreate()
|
||||||
val sc: SparkContext = spark.sparkContext
|
val sc: SparkContext = spark.sparkContext
|
||||||
|
|
||||||
sc.wholeTextFiles(sourcePath, 6000).flatMap(d => extractDump(d._2))
|
sc.wholeTextFiles(sourcePath, 6000)
|
||||||
|
.flatMap(d => extractDump(d._2))
|
||||||
.saveAsTextFile(targetPath, classOf[GzipCodec])
|
.saveAsTextFile(targetPath, classOf[GzipCodec])
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
package eu.dnetlib.doiboost.mag
|
package eu.dnetlib.doiboost.mag
|
||||||
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty}
|
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty}
|
||||||
|
@ -14,45 +13,121 @@ import scala.collection.JavaConverters._
|
||||||
import scala.collection.mutable
|
import scala.collection.mutable
|
||||||
import scala.util.matching.Regex
|
import scala.util.matching.Regex
|
||||||
|
|
||||||
|
case class MagPapers(
|
||||||
case class MagPapers(PaperId: Long, Rank: Integer, Doi: String,
|
PaperId: Long,
|
||||||
DocType: String, PaperTitle: String, OriginalTitle: String,
|
Rank: Integer,
|
||||||
BookTitle: String, Year: Option[Integer], Date: Option[java.sql.Timestamp], Publisher: String,
|
Doi: String,
|
||||||
JournalId: Option[Long], ConferenceSeriesId: Option[Long], ConferenceInstanceId: Option[Long],
|
DocType: String,
|
||||||
Volume: String, Issue: String, FirstPage: String, LastPage: String,
|
PaperTitle: String,
|
||||||
ReferenceCount: Option[Long], CitationCount: Option[Long], EstimatedCitation: Option[Long],
|
OriginalTitle: String,
|
||||||
OriginalVenue: String, FamilyId: Option[Long], CreatedDate: java.sql.Timestamp) {}
|
BookTitle: String,
|
||||||
|
Year: Option[Integer],
|
||||||
|
Date: Option[java.sql.Timestamp],
|
||||||
|
Publisher: String,
|
||||||
|
JournalId: Option[Long],
|
||||||
|
ConferenceSeriesId: Option[Long],
|
||||||
|
ConferenceInstanceId: Option[Long],
|
||||||
|
Volume: String,
|
||||||
|
Issue: String,
|
||||||
|
FirstPage: String,
|
||||||
|
LastPage: String,
|
||||||
|
ReferenceCount: Option[Long],
|
||||||
|
CitationCount: Option[Long],
|
||||||
|
EstimatedCitation: Option[Long],
|
||||||
|
OriginalVenue: String,
|
||||||
|
FamilyId: Option[Long],
|
||||||
|
CreatedDate: java.sql.Timestamp
|
||||||
|
) {}
|
||||||
|
|
||||||
case class MagPaperAbstract(PaperId: Long, IndexedAbstract: String) {}
|
case class MagPaperAbstract(PaperId: Long, IndexedAbstract: String) {}
|
||||||
|
|
||||||
case class MagAuthor(AuthorId: Long, Rank: Option[Int], NormalizedName: Option[String], DisplayName: Option[String], LastKnownAffiliationId: Option[Long], PaperCount: Option[Long], CitationCount: Option[Long], CreatedDate: Option[java.sql.Timestamp]) {}
|
case class MagAuthor(
|
||||||
|
AuthorId: Long,
|
||||||
|
Rank: Option[Int],
|
||||||
|
NormalizedName: Option[String],
|
||||||
|
DisplayName: Option[String],
|
||||||
|
LastKnownAffiliationId: Option[Long],
|
||||||
|
PaperCount: Option[Long],
|
||||||
|
CitationCount: Option[Long],
|
||||||
|
CreatedDate: Option[java.sql.Timestamp]
|
||||||
|
) {}
|
||||||
|
|
||||||
case class MagAffiliation(AffiliationId: Long, Rank: Int, NormalizedName: String, DisplayName: String, GridId: String, OfficialPage: String, WikiPage: String, PaperCount: Long, CitationCount: Long, Latitude: Option[Float], Longitude: Option[Float], CreatedDate: java.sql.Timestamp) {}
|
case class MagAffiliation(
|
||||||
|
AffiliationId: Long,
|
||||||
case class MagPaperAuthorAffiliation(PaperId: Long, AuthorId: Long, AffiliationId: Option[Long], AuthorSequenceNumber: Int, OriginalAuthor: String, OriginalAffiliation: String) {}
|
Rank: Int,
|
||||||
|
NormalizedName: String,
|
||||||
|
DisplayName: String,
|
||||||
|
GridId: String,
|
||||||
|
OfficialPage: String,
|
||||||
|
WikiPage: String,
|
||||||
|
PaperCount: Long,
|
||||||
|
CitationCount: Long,
|
||||||
|
Latitude: Option[Float],
|
||||||
|
Longitude: Option[Float],
|
||||||
|
CreatedDate: java.sql.Timestamp
|
||||||
|
) {}
|
||||||
|
|
||||||
|
case class MagPaperAuthorAffiliation(
|
||||||
|
PaperId: Long,
|
||||||
|
AuthorId: Long,
|
||||||
|
AffiliationId: Option[Long],
|
||||||
|
AuthorSequenceNumber: Int,
|
||||||
|
OriginalAuthor: String,
|
||||||
|
OriginalAffiliation: String
|
||||||
|
) {}
|
||||||
|
|
||||||
case class MagAuthorAffiliation(author: MagAuthor, affiliation: String, sequenceNumber: Int)
|
case class MagAuthorAffiliation(author: MagAuthor, affiliation: String, sequenceNumber: Int)
|
||||||
|
|
||||||
case class MagPaperWithAuthorList(PaperId: Long, authors: List[MagAuthorAffiliation]) {}
|
case class MagPaperWithAuthorList(PaperId: Long, authors: List[MagAuthorAffiliation]) {}
|
||||||
|
|
||||||
case class MagPaperAuthorDenormalized(PaperId: Long, author: MagAuthor, affiliation:String, sequenceNumber:Int) {}
|
case class MagPaperAuthorDenormalized(
|
||||||
|
PaperId: Long,
|
||||||
|
author: MagAuthor,
|
||||||
|
affiliation: String,
|
||||||
|
sequenceNumber: Int
|
||||||
|
) {}
|
||||||
|
|
||||||
case class MagPaperUrl(PaperId: Long, SourceType: Option[Int], SourceUrl: Option[String], LanguageCode: Option[String]) {}
|
case class MagPaperUrl(
|
||||||
|
PaperId: Long,
|
||||||
|
SourceType: Option[Int],
|
||||||
|
SourceUrl: Option[String],
|
||||||
|
LanguageCode: Option[String]
|
||||||
|
) {}
|
||||||
|
|
||||||
case class MagUrlInstance(SourceUrl: String) {}
|
case class MagUrlInstance(SourceUrl: String) {}
|
||||||
|
|
||||||
case class MagUrl(PaperId: Long, instances: List[MagUrlInstance])
|
case class MagUrl(PaperId: Long, instances: List[MagUrlInstance])
|
||||||
|
|
||||||
case class MagSubject(FieldOfStudyId:Long, DisplayName:String, MainType:Option[String], Score:Float){}
|
case class MagSubject(
|
||||||
|
FieldOfStudyId: Long,
|
||||||
|
DisplayName: String,
|
||||||
|
MainType: Option[String],
|
||||||
|
Score: Float
|
||||||
|
) {}
|
||||||
|
|
||||||
case class MagFieldOfStudy(PaperId: Long, subjects: List[MagSubject]) {}
|
case class MagFieldOfStudy(PaperId: Long, subjects: List[MagSubject]) {}
|
||||||
|
|
||||||
case class MagJournal(JournalId: Long, Rank: Option[Int], NormalizedName: Option[String], DisplayName: Option[String], Issn: Option[String], Publisher: Option[String], Webpage: Option[String], PaperCount: Option[Long], CitationCount: Option[Long], CreatedDate: Option[java.sql.Timestamp]) {}
|
case class MagJournal(
|
||||||
|
JournalId: Long,
|
||||||
|
Rank: Option[Int],
|
||||||
|
NormalizedName: Option[String],
|
||||||
|
DisplayName: Option[String],
|
||||||
|
Issn: Option[String],
|
||||||
|
Publisher: Option[String],
|
||||||
|
Webpage: Option[String],
|
||||||
|
PaperCount: Option[Long],
|
||||||
|
CitationCount: Option[Long],
|
||||||
|
CreatedDate: Option[java.sql.Timestamp]
|
||||||
|
) {}
|
||||||
|
|
||||||
|
case class MagConferenceInstance(
|
||||||
case class MagConferenceInstance(ci:Long, DisplayName:Option[String], Location:Option[String], StartDate:Option[java.sql.Timestamp], EndDate:Option[java.sql.Timestamp], PaperId:Long){}
|
ci: Long,
|
||||||
|
DisplayName: Option[String],
|
||||||
|
Location: Option[String],
|
||||||
|
StartDate: Option[java.sql.Timestamp],
|
||||||
|
EndDate: Option[java.sql.Timestamp],
|
||||||
|
PaperId: Long
|
||||||
|
) {}
|
||||||
|
|
||||||
case object ConversionUtil {
|
case object ConversionUtil {
|
||||||
|
|
||||||
|
@ -65,7 +140,6 @@ case object ConversionUtil {
|
||||||
null
|
null
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def mergePublication(a: Publication, b: Publication): Publication = {
|
def mergePublication(a: Publication, b: Publication): Publication = {
|
||||||
if ((a != null) && (b != null)) {
|
if ((a != null) && (b != null)) {
|
||||||
a.mergeFrom(b)
|
a.mergeFrom(b)
|
||||||
|
@ -74,7 +148,6 @@ case object ConversionUtil {
|
||||||
if (a == null) b else a
|
if (a == null) b else a
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def choiceLatestMagArtitcle(p1: MagPapers, p2: MagPapers): MagPapers = {
|
def choiceLatestMagArtitcle(p1: MagPapers, p2: MagPapers): MagPapers = {
|
||||||
|
@ -93,8 +166,9 @@ case object ConversionUtil {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def updatePubsWithDescription(
|
||||||
def updatePubsWithDescription(inputItem:((String, Publication), MagPaperAbstract)) : Publication = {
|
inputItem: ((String, Publication), MagPaperAbstract)
|
||||||
|
): Publication = {
|
||||||
val pub = inputItem._1._2
|
val pub = inputItem._1._2
|
||||||
val abst = inputItem._2
|
val abst = inputItem._2
|
||||||
if (abst != null) {
|
if (abst != null) {
|
||||||
|
@ -104,8 +178,9 @@ case object ConversionUtil {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def updatePubsWithConferenceInfo(
|
||||||
def updatePubsWithConferenceInfo(inputItem:((String, Publication), MagConferenceInstance)) : Publication = {
|
inputItem: ((String, Publication), MagConferenceInstance)
|
||||||
|
): Publication = {
|
||||||
val publication: Publication = inputItem._1._2
|
val publication: Publication = inputItem._1._2
|
||||||
val ci: MagConferenceInstance = inputItem._2
|
val ci: MagConferenceInstance = inputItem._2
|
||||||
|
|
||||||
|
@ -115,9 +190,10 @@ case object ConversionUtil {
|
||||||
if (ci.Location.isDefined)
|
if (ci.Location.isDefined)
|
||||||
j.setConferenceplace(ci.Location.get)
|
j.setConferenceplace(ci.Location.get)
|
||||||
j.setName(ci.DisplayName.get)
|
j.setName(ci.DisplayName.get)
|
||||||
if (ci.StartDate.isDefined && ci.EndDate.isDefined)
|
if (ci.StartDate.isDefined && ci.EndDate.isDefined) {
|
||||||
{
|
j.setConferencedate(
|
||||||
j.setConferencedate(s"${ci.StartDate.get.toString.substring(0,10)} - ${ci.EndDate.get.toString.substring(0,10)}")
|
s"${ci.StartDate.get.toString.substring(0, 10)} - ${ci.EndDate.get.toString.substring(0, 10)}"
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
publication.setJournal(j)
|
publication.setJournal(j)
|
||||||
|
@ -135,16 +211,34 @@ case object ConversionUtil {
|
||||||
val classid = "MAG"
|
val classid = "MAG"
|
||||||
|
|
||||||
val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap(s => {
|
val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap(s => {
|
||||||
val s1 = createSP(s.DisplayName, classid,className, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES)
|
val s1 = createSP(
|
||||||
|
s.DisplayName,
|
||||||
|
classid,
|
||||||
|
className,
|
||||||
|
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||||
|
ModelConstants.DNET_SUBJECT_TYPOLOGIES
|
||||||
|
)
|
||||||
val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString)
|
val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString)
|
||||||
var resList: List[StructuredProperty] = List(s1)
|
var resList: List[StructuredProperty] = List(s1)
|
||||||
if (s.MainType.isDefined) {
|
if (s.MainType.isDefined) {
|
||||||
val maintp = s.MainType.get
|
val maintp = s.MainType.get
|
||||||
val s2 = createSP(s.MainType.get, classid,className, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES)
|
val s2 = createSP(
|
||||||
|
s.MainType.get,
|
||||||
|
classid,
|
||||||
|
className,
|
||||||
|
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||||
|
ModelConstants.DNET_SUBJECT_TYPOLOGIES
|
||||||
|
)
|
||||||
s2.setDataInfo(di)
|
s2.setDataInfo(di)
|
||||||
resList = resList ::: List(s2)
|
resList = resList ::: List(s2)
|
||||||
if (maintp.contains(".")) {
|
if (maintp.contains(".")) {
|
||||||
val s3 = createSP(maintp.split("\\.").head, classid,className, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES)
|
val s3 = createSP(
|
||||||
|
maintp.split("\\.").head,
|
||||||
|
classid,
|
||||||
|
className,
|
||||||
|
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||||
|
ModelConstants.DNET_SUBJECT_TYPOLOGIES
|
||||||
|
)
|
||||||
s3.setDataInfo(di)
|
s3.setDataInfo(di)
|
||||||
resList = resList ::: List(s3)
|
resList = resList ::: List(s3)
|
||||||
}
|
}
|
||||||
|
@ -156,25 +250,27 @@ case object ConversionUtil {
|
||||||
publication
|
publication
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def addInstances(a: (Publication, MagUrl)): Publication = {
|
def addInstances(a: (Publication, MagUrl)): Publication = {
|
||||||
val pub = a._1
|
val pub = a._1
|
||||||
val urls = a._2
|
val urls = a._2
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
val i = new Instance
|
val i = new Instance
|
||||||
|
|
||||||
|
|
||||||
if (urls != null) {
|
if (urls != null) {
|
||||||
|
|
||||||
val l:List[String] = urls.instances.filter(k=>k.SourceUrl.nonEmpty).map(k=>k.SourceUrl):::List(s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}")
|
val l: List[String] = urls.instances
|
||||||
|
.filter(k => k.SourceUrl.nonEmpty)
|
||||||
|
.map(k => k.SourceUrl) ::: List(
|
||||||
|
s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}"
|
||||||
|
)
|
||||||
|
|
||||||
i.setUrl(l.asJava)
|
i.setUrl(l.asJava)
|
||||||
}
|
} else
|
||||||
else
|
i.setUrl(
|
||||||
i.setUrl(List(s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}").asJava)
|
List(
|
||||||
|
s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}"
|
||||||
|
).asJava
|
||||||
|
)
|
||||||
|
|
||||||
// Ticket #6281 added pid to Instance
|
// Ticket #6281 added pid to Instance
|
||||||
i.setPid(pub.getPid)
|
i.setPid(pub.getPid)
|
||||||
|
@ -184,13 +280,13 @@ case object ConversionUtil {
|
||||||
pub
|
pub
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def transformPaperAbstract(input: MagPaperAbstract): MagPaperAbstract = {
|
def transformPaperAbstract(input: MagPaperAbstract): MagPaperAbstract = {
|
||||||
MagPaperAbstract(input.PaperId, convertInvertedIndexString(input.IndexedAbstract))
|
MagPaperAbstract(input.PaperId, convertInvertedIndexString(input.IndexedAbstract))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def createOAFFromJournalAuthorPaper(
|
||||||
def createOAFFromJournalAuthorPaper(inputParams: ((MagPapers, MagJournal), MagPaperWithAuthorList)): Publication = {
|
inputParams: ((MagPapers, MagJournal), MagPaperWithAuthorList)
|
||||||
|
): Publication = {
|
||||||
val paper = inputParams._1._1
|
val paper = inputParams._1._1
|
||||||
val journal = inputParams._1._2
|
val journal = inputParams._1._2
|
||||||
val authors = inputParams._2
|
val authors = inputParams._2
|
||||||
|
@ -206,31 +302,37 @@ case object ConversionUtil {
|
||||||
pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub))
|
pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub))
|
||||||
|
|
||||||
val mainTitles = createSP(paper.PaperTitle, "main title", ModelConstants.DNET_DATACITE_TITLE)
|
val mainTitles = createSP(paper.PaperTitle, "main title", ModelConstants.DNET_DATACITE_TITLE)
|
||||||
val originalTitles = createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
|
val originalTitles =
|
||||||
|
createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
|
||||||
pub.setTitle(List(mainTitles, originalTitles).asJava)
|
pub.setTitle(List(mainTitles, originalTitles).asJava)
|
||||||
|
|
||||||
pub.setSource(List(asField(paper.BookTitle)).asJava)
|
pub.setSource(List(asField(paper.BookTitle)).asJava)
|
||||||
|
|
||||||
val authorsOAF = authors.authors.map { f: MagAuthorAffiliation =>
|
val authorsOAF = authors.authors.map { f: MagAuthorAffiliation =>
|
||||||
|
|
||||||
val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author
|
val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author
|
||||||
a.setRank(f.sequenceNumber)
|
a.setRank(f.sequenceNumber)
|
||||||
if (f.author.DisplayName.isDefined)
|
if (f.author.DisplayName.isDefined)
|
||||||
a.setFullname(f.author.DisplayName.get)
|
a.setFullname(f.author.DisplayName.get)
|
||||||
if (f.affiliation != null)
|
if (f.affiliation != null)
|
||||||
a.setAffiliation(List(asField(f.affiliation)).asJava)
|
a.setAffiliation(List(asField(f.affiliation)).asJava)
|
||||||
a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", ModelConstants.DNET_PID_TYPES)).asJava)
|
a.setPid(
|
||||||
|
List(
|
||||||
|
createSP(
|
||||||
|
s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}",
|
||||||
|
"URL",
|
||||||
|
ModelConstants.DNET_PID_TYPES
|
||||||
|
)
|
||||||
|
).asJava
|
||||||
|
)
|
||||||
a
|
a
|
||||||
}
|
}
|
||||||
pub.setAuthor(authorsOAF.asJava)
|
pub.setAuthor(authorsOAF.asJava)
|
||||||
|
|
||||||
|
|
||||||
if (paper.Date != null && paper.Date.isDefined) {
|
if (paper.Date != null && paper.Date.isDefined) {
|
||||||
pub.setDateofacceptance(asField(paper.Date.get.toString.substring(0, 10)))
|
pub.setDateofacceptance(asField(paper.Date.get.toString.substring(0, 10)))
|
||||||
}
|
}
|
||||||
pub.setPublisher(asField(paper.Publisher))
|
pub.setPublisher(asField(paper.Publisher))
|
||||||
|
|
||||||
|
|
||||||
if (journal != null && journal.DisplayName.isDefined) {
|
if (journal != null && journal.DisplayName.isDefined) {
|
||||||
val j = new Journal
|
val j = new Journal
|
||||||
|
|
||||||
|
@ -250,8 +352,9 @@ case object ConversionUtil {
|
||||||
pub
|
pub
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def createOAF(
|
||||||
def createOAF(inputParams: ((MagPapers, MagPaperWithAuthorList), MagPaperAbstract)): Publication = {
|
inputParams: ((MagPapers, MagPaperWithAuthorList), MagPaperAbstract)
|
||||||
|
): Publication = {
|
||||||
|
|
||||||
val paper = inputParams._1._1
|
val paper = inputParams._1._1
|
||||||
val authors = inputParams._1._2
|
val authors = inputParams._1._2
|
||||||
|
@ -268,19 +371,17 @@ case object ConversionUtil {
|
||||||
pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub))
|
pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub))
|
||||||
|
|
||||||
val mainTitles = createSP(paper.PaperTitle, "main title", ModelConstants.DNET_DATACITE_TITLE)
|
val mainTitles = createSP(paper.PaperTitle, "main title", ModelConstants.DNET_DATACITE_TITLE)
|
||||||
val originalTitles = createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
|
val originalTitles =
|
||||||
|
createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
|
||||||
pub.setTitle(List(mainTitles, originalTitles).asJava)
|
pub.setTitle(List(mainTitles, originalTitles).asJava)
|
||||||
|
|
||||||
pub.setSource(List(asField(paper.BookTitle)).asJava)
|
pub.setSource(List(asField(paper.BookTitle)).asJava)
|
||||||
|
|
||||||
|
|
||||||
if (description != null) {
|
if (description != null) {
|
||||||
pub.setDescription(List(asField(description.IndexedAbstract)).asJava)
|
pub.setDescription(List(asField(description.IndexedAbstract)).asJava)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
val authorsOAF = authors.authors.map { f: MagAuthorAffiliation =>
|
val authorsOAF = authors.authors.map { f: MagAuthorAffiliation =>
|
||||||
|
|
||||||
val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author
|
val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author
|
||||||
|
|
||||||
a.setFullname(f.author.DisplayName.get)
|
a.setFullname(f.author.DisplayName.get)
|
||||||
|
@ -288,26 +389,30 @@ case object ConversionUtil {
|
||||||
if (f.affiliation != null)
|
if (f.affiliation != null)
|
||||||
a.setAffiliation(List(asField(f.affiliation)).asJava)
|
a.setAffiliation(List(asField(f.affiliation)).asJava)
|
||||||
|
|
||||||
|
a.setPid(
|
||||||
a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", ModelConstants.DNET_PID_TYPES)).asJava)
|
List(
|
||||||
|
createSP(
|
||||||
|
s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}",
|
||||||
|
"URL",
|
||||||
|
ModelConstants.DNET_PID_TYPES
|
||||||
|
)
|
||||||
|
).asJava
|
||||||
|
)
|
||||||
|
|
||||||
a
|
a
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (paper.Date != null) {
|
if (paper.Date != null) {
|
||||||
pub.setDateofacceptance(asField(paper.Date.toString.substring(0, 10)))
|
pub.setDateofacceptance(asField(paper.Date.toString.substring(0, 10)))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub.setAuthor(authorsOAF.asJava)
|
pub.setAuthor(authorsOAF.asJava)
|
||||||
|
|
||||||
|
|
||||||
pub
|
pub
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def convertInvertedIndexString(json_input: String): String = {
|
def convertInvertedIndexString(json_input: String): String = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
lazy val json: json4s.JValue = parse(json_input)
|
lazy val json: json4s.JValue = parse(json_input)
|
||||||
|
|
|
@ -8,6 +8,7 @@ import org.apache.spark.sql.{SaveMode, SparkSession}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
object SparkImportMagIntoDataset {
|
object SparkImportMagIntoDataset {
|
||||||
|
|
||||||
val datatypedict = Map(
|
val datatypedict = Map(
|
||||||
"bool" -> BooleanType,
|
"bool" -> BooleanType,
|
||||||
"int" -> IntegerType,
|
"int" -> IntegerType,
|
||||||
|
@ -19,32 +20,232 @@ object SparkImportMagIntoDataset {
|
||||||
"DateTime" -> DateType
|
"DateTime" -> DateType
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
val stream = Map(
|
val stream = Map(
|
||||||
"Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Iso3166Code:string", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
|
"Affiliations" -> Tuple2(
|
||||||
"AuthorExtendedAttributes" -> Tuple2("mag/AuthorExtendedAttributes.txt", Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")),
|
"mag/Affiliations.txt",
|
||||||
"Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
Seq(
|
||||||
"ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
|
"AffiliationId:long",
|
||||||
"ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
"Rank:uint",
|
||||||
"EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")),
|
"NormalizedName:string",
|
||||||
"FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")),
|
"DisplayName:string",
|
||||||
"FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")),
|
"GridId:string",
|
||||||
"FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
"OfficialPage:string",
|
||||||
"Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
"WikiPage:string",
|
||||||
"PaperAbstractsInvertedIndex" -> Tuple2("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", "IndexedAbstract:string")),
|
"PaperCount:long",
|
||||||
"PaperAuthorAffiliations" -> Tuple2("mag/PaperAuthorAffiliations.txt", Seq("PaperId:long", "AuthorId:long", "AffiliationId:long?", "AuthorSequenceNumber:uint", "OriginalAuthor:string", "OriginalAffiliation:string")),
|
"PaperFamilyCount:long",
|
||||||
"PaperCitationContexts" -> Tuple2("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")),
|
"CitationCount:long",
|
||||||
"PaperExtendedAttributes" -> Tuple2("mag/PaperExtendedAttributes.txt", Seq("PaperId:long", "AttributeType:int", "AttributeValue:string")),
|
"Iso3166Code:string",
|
||||||
"PaperFieldsOfStudy" -> Tuple2("advanced/PaperFieldsOfStudy.txt", Seq("PaperId:long", "FieldOfStudyId:long", "Score:float")),
|
"Latitude:float?",
|
||||||
"PaperMeSH" -> Tuple2("advanced/PaperMeSH.txt", Seq("PaperId:long", "DescriptorUI:string", "DescriptorName:string", "QualifierUI:string", "QualifierName:string", "IsMajorTopic:bool")),
|
"Longitude:float?",
|
||||||
"PaperRecommendations" -> Tuple2("advanced/PaperRecommendations.txt", Seq("PaperId:long", "RecommendedPaperId:long", "Score:float")),
|
"CreatedDate:DateTime"
|
||||||
"PaperReferences" -> Tuple2("mag/PaperReferences.txt", Seq("PaperId:long", "PaperReferenceId:long")),
|
)
|
||||||
"PaperResources" -> Tuple2("mag/PaperResources.txt", Seq("PaperId:long", "ResourceType:int", "ResourceUrl:string", "SourceUrl:string", "RelationshipType:int")),
|
),
|
||||||
"PaperUrls" -> Tuple2("mag/PaperUrls.txt", Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")),
|
"AuthorExtendedAttributes" -> Tuple2(
|
||||||
"Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "OnlineDate:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "FamilyRank:uint?", "DocSubTypes:string", "CreatedDate:DateTime")),
|
"mag/AuthorExtendedAttributes.txt",
|
||||||
"RelatedFieldOfStudy" -> Tuple2("advanced/RelatedFieldOfStudy.txt", Seq("FieldOfStudyId1:long", "Type1:string", "FieldOfStudyId2:long", "Type2:string", "Rank:float"))
|
Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")
|
||||||
|
),
|
||||||
|
"Authors" -> Tuple2(
|
||||||
|
"mag/Authors.txt",
|
||||||
|
Seq(
|
||||||
|
"AuthorId:long",
|
||||||
|
"Rank:uint",
|
||||||
|
"NormalizedName:string",
|
||||||
|
"DisplayName:string",
|
||||||
|
"LastKnownAffiliationId:long?",
|
||||||
|
"PaperCount:long",
|
||||||
|
"PaperFamilyCount:long",
|
||||||
|
"CitationCount:long",
|
||||||
|
"CreatedDate:DateTime"
|
||||||
|
)
|
||||||
|
),
|
||||||
|
"ConferenceInstances" -> Tuple2(
|
||||||
|
"mag/ConferenceInstances.txt",
|
||||||
|
Seq(
|
||||||
|
"ConferenceInstanceId:long",
|
||||||
|
"NormalizedName:string",
|
||||||
|
"DisplayName:string",
|
||||||
|
"ConferenceSeriesId:long",
|
||||||
|
"Location:string",
|
||||||
|
"OfficialUrl:string",
|
||||||
|
"StartDate:DateTime?",
|
||||||
|
"EndDate:DateTime?",
|
||||||
|
"AbstractRegistrationDate:DateTime?",
|
||||||
|
"SubmissionDeadlineDate:DateTime?",
|
||||||
|
"NotificationDueDate:DateTime?",
|
||||||
|
"FinalVersionDueDate:DateTime?",
|
||||||
|
"PaperCount:long",
|
||||||
|
"PaperFamilyCount:long",
|
||||||
|
"CitationCount:long",
|
||||||
|
"Latitude:float?",
|
||||||
|
"Longitude:float?",
|
||||||
|
"CreatedDate:DateTime"
|
||||||
|
)
|
||||||
|
),
|
||||||
|
"ConferenceSeries" -> Tuple2(
|
||||||
|
"mag/ConferenceSeries.txt",
|
||||||
|
Seq(
|
||||||
|
"ConferenceSeriesId:long",
|
||||||
|
"Rank:uint",
|
||||||
|
"NormalizedName:string",
|
||||||
|
"DisplayName:string",
|
||||||
|
"PaperCount:long",
|
||||||
|
"PaperFamilyCount:long",
|
||||||
|
"CitationCount:long",
|
||||||
|
"CreatedDate:DateTime"
|
||||||
|
)
|
||||||
|
),
|
||||||
|
"EntityRelatedEntities" -> Tuple2(
|
||||||
|
"advanced/EntityRelatedEntities.txt",
|
||||||
|
Seq(
|
||||||
|
"EntityId:long",
|
||||||
|
"EntityType:string",
|
||||||
|
"RelatedEntityId:long",
|
||||||
|
"RelatedEntityType:string",
|
||||||
|
"RelatedType:int",
|
||||||
|
"Score:float"
|
||||||
|
)
|
||||||
|
),
|
||||||
|
"FieldOfStudyChildren" -> Tuple2(
|
||||||
|
"advanced/FieldOfStudyChildren.txt",
|
||||||
|
Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")
|
||||||
|
),
|
||||||
|
"FieldOfStudyExtendedAttributes" -> Tuple2(
|
||||||
|
"advanced/FieldOfStudyExtendedAttributes.txt",
|
||||||
|
Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")
|
||||||
|
),
|
||||||
|
"FieldsOfStudy" -> Tuple2(
|
||||||
|
"advanced/FieldsOfStudy.txt",
|
||||||
|
Seq(
|
||||||
|
"FieldOfStudyId:long",
|
||||||
|
"Rank:uint",
|
||||||
|
"NormalizedName:string",
|
||||||
|
"DisplayName:string",
|
||||||
|
"MainType:string",
|
||||||
|
"Level:int",
|
||||||
|
"PaperCount:long",
|
||||||
|
"PaperFamilyCount:long",
|
||||||
|
"CitationCount:long",
|
||||||
|
"CreatedDate:DateTime"
|
||||||
|
)
|
||||||
|
),
|
||||||
|
"Journals" -> Tuple2(
|
||||||
|
"mag/Journals.txt",
|
||||||
|
Seq(
|
||||||
|
"JournalId:long",
|
||||||
|
"Rank:uint",
|
||||||
|
"NormalizedName:string",
|
||||||
|
"DisplayName:string",
|
||||||
|
"Issn:string",
|
||||||
|
"Publisher:string",
|
||||||
|
"Webpage:string",
|
||||||
|
"PaperCount:long",
|
||||||
|
"PaperFamilyCount:long",
|
||||||
|
"CitationCount:long",
|
||||||
|
"CreatedDate:DateTime"
|
||||||
|
)
|
||||||
|
),
|
||||||
|
"PaperAbstractsInvertedIndex" -> Tuple2(
|
||||||
|
"nlp/PaperAbstractsInvertedIndex.txt.*",
|
||||||
|
Seq("PaperId:long", "IndexedAbstract:string")
|
||||||
|
),
|
||||||
|
"PaperAuthorAffiliations" -> Tuple2(
|
||||||
|
"mag/PaperAuthorAffiliations.txt",
|
||||||
|
Seq(
|
||||||
|
"PaperId:long",
|
||||||
|
"AuthorId:long",
|
||||||
|
"AffiliationId:long?",
|
||||||
|
"AuthorSequenceNumber:uint",
|
||||||
|
"OriginalAuthor:string",
|
||||||
|
"OriginalAffiliation:string"
|
||||||
|
)
|
||||||
|
),
|
||||||
|
"PaperCitationContexts" -> Tuple2(
|
||||||
|
"nlp/PaperCitationContexts.txt",
|
||||||
|
Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")
|
||||||
|
),
|
||||||
|
"PaperExtendedAttributes" -> Tuple2(
|
||||||
|
"mag/PaperExtendedAttributes.txt",
|
||||||
|
Seq("PaperId:long", "AttributeType:int", "AttributeValue:string")
|
||||||
|
),
|
||||||
|
"PaperFieldsOfStudy" -> Tuple2(
|
||||||
|
"advanced/PaperFieldsOfStudy.txt",
|
||||||
|
Seq("PaperId:long", "FieldOfStudyId:long", "Score:float")
|
||||||
|
),
|
||||||
|
"PaperMeSH" -> Tuple2(
|
||||||
|
"advanced/PaperMeSH.txt",
|
||||||
|
Seq(
|
||||||
|
"PaperId:long",
|
||||||
|
"DescriptorUI:string",
|
||||||
|
"DescriptorName:string",
|
||||||
|
"QualifierUI:string",
|
||||||
|
"QualifierName:string",
|
||||||
|
"IsMajorTopic:bool"
|
||||||
|
)
|
||||||
|
),
|
||||||
|
"PaperRecommendations" -> Tuple2(
|
||||||
|
"advanced/PaperRecommendations.txt",
|
||||||
|
Seq("PaperId:long", "RecommendedPaperId:long", "Score:float")
|
||||||
|
),
|
||||||
|
"PaperReferences" -> Tuple2(
|
||||||
|
"mag/PaperReferences.txt",
|
||||||
|
Seq("PaperId:long", "PaperReferenceId:long")
|
||||||
|
),
|
||||||
|
"PaperResources" -> Tuple2(
|
||||||
|
"mag/PaperResources.txt",
|
||||||
|
Seq(
|
||||||
|
"PaperId:long",
|
||||||
|
"ResourceType:int",
|
||||||
|
"ResourceUrl:string",
|
||||||
|
"SourceUrl:string",
|
||||||
|
"RelationshipType:int"
|
||||||
|
)
|
||||||
|
),
|
||||||
|
"PaperUrls" -> Tuple2(
|
||||||
|
"mag/PaperUrls.txt",
|
||||||
|
Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")
|
||||||
|
),
|
||||||
|
"Papers" -> Tuple2(
|
||||||
|
"mag/Papers.txt",
|
||||||
|
Seq(
|
||||||
|
"PaperId:long",
|
||||||
|
"Rank:uint",
|
||||||
|
"Doi:string",
|
||||||
|
"DocType:string",
|
||||||
|
"PaperTitle:string",
|
||||||
|
"OriginalTitle:string",
|
||||||
|
"BookTitle:string",
|
||||||
|
"Year:int?",
|
||||||
|
"Date:DateTime?",
|
||||||
|
"OnlineDate:DateTime?",
|
||||||
|
"Publisher:string",
|
||||||
|
"JournalId:long?",
|
||||||
|
"ConferenceSeriesId:long?",
|
||||||
|
"ConferenceInstanceId:long?",
|
||||||
|
"Volume:string",
|
||||||
|
"Issue:string",
|
||||||
|
"FirstPage:string",
|
||||||
|
"LastPage:string",
|
||||||
|
"ReferenceCount:long",
|
||||||
|
"CitationCount:long",
|
||||||
|
"EstimatedCitation:long",
|
||||||
|
"OriginalVenue:string",
|
||||||
|
"FamilyId:long?",
|
||||||
|
"FamilyRank:uint?",
|
||||||
|
"DocSubTypes:string",
|
||||||
|
"CreatedDate:DateTime"
|
||||||
|
)
|
||||||
|
),
|
||||||
|
"RelatedFieldOfStudy" -> Tuple2(
|
||||||
|
"advanced/RelatedFieldOfStudy.txt",
|
||||||
|
Seq(
|
||||||
|
"FieldOfStudyId1:long",
|
||||||
|
"Type1:string",
|
||||||
|
"FieldOfStudyId2:long",
|
||||||
|
"Type2:string",
|
||||||
|
"Rank:float"
|
||||||
|
)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def getSchema(streamName: String): StructType = {
|
def getSchema(streamName: String): StructType = {
|
||||||
var schema = new StructType()
|
var schema = new StructType()
|
||||||
|
@ -61,19 +262,22 @@ object SparkImportMagIntoDataset {
|
||||||
schema
|
schema
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/convert_mag_to_oaf_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/convert_mag_to_oaf_params.json")
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
stream.foreach { case (k, v) =>
|
stream.foreach { case (k, v) =>
|
||||||
val s: StructType = getSchema(k)
|
val s: StructType = getSchema(k)
|
||||||
|
|
|
@ -9,6 +9,7 @@ import org.apache.spark.sql.functions.{col, collect_list, struct}
|
||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql._
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
|
|
||||||
object SparkProcessMAG {
|
object SparkProcessMAG {
|
||||||
|
|
||||||
def getDistinctResults(d: Dataset[MagPapers]): Dataset[MagPapers] = {
|
def getDistinctResults(d: Dataset[MagPapers]): Dataset[MagPapers] = {
|
||||||
|
@ -17,13 +18,31 @@ object SparkProcessMAG {
|
||||||
.reduceGroups((p1: MagPapers, p2: MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1, p2))
|
.reduceGroups((p1: MagPapers, p2: MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1, p2))
|
||||||
.map(_._2)(Encoders.product[MagPapers])
|
.map(_._2)(Encoders.product[MagPapers])
|
||||||
.map(mp => {
|
.map(mp => {
|
||||||
MagPapers(mp.PaperId, mp.Rank, DoiBoostMappingUtil.normalizeDoi(mp.Doi),
|
MagPapers(
|
||||||
mp.DocType, mp.PaperTitle, mp.OriginalTitle,
|
mp.PaperId,
|
||||||
mp.BookTitle, mp.Year, mp.Date, mp.Publisher: String,
|
mp.Rank,
|
||||||
mp.JournalId, mp.ConferenceSeriesId, mp.ConferenceInstanceId,
|
DoiBoostMappingUtil.normalizeDoi(mp.Doi),
|
||||||
mp.Volume, mp.Issue, mp.FirstPage, mp.LastPage,
|
mp.DocType,
|
||||||
mp.ReferenceCount, mp.CitationCount, mp.EstimatedCitation,
|
mp.PaperTitle,
|
||||||
mp.OriginalVenue, mp.FamilyId, mp.CreatedDate)
|
mp.OriginalTitle,
|
||||||
|
mp.BookTitle,
|
||||||
|
mp.Year,
|
||||||
|
mp.Date,
|
||||||
|
mp.Publisher: String,
|
||||||
|
mp.JournalId,
|
||||||
|
mp.ConferenceSeriesId,
|
||||||
|
mp.ConferenceInstanceId,
|
||||||
|
mp.Volume,
|
||||||
|
mp.Issue,
|
||||||
|
mp.FirstPage,
|
||||||
|
mp.LastPage,
|
||||||
|
mp.ReferenceCount,
|
||||||
|
mp.CitationCount,
|
||||||
|
mp.EstimatedCitation,
|
||||||
|
mp.OriginalVenue,
|
||||||
|
mp.FamilyId,
|
||||||
|
mp.CreatedDate
|
||||||
|
)
|
||||||
})(Encoders.product[MagPapers])
|
})(Encoders.product[MagPapers])
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -31,22 +50,29 @@ object SparkProcessMAG {
|
||||||
|
|
||||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json")
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
val sourcePath = parser.get("sourcePath")
|
val sourcePath = parser.get("sourcePath")
|
||||||
val workingPath = parser.get("workingPath")
|
val workingPath = parser.get("workingPath")
|
||||||
val targetPath = parser.get("targetPath")
|
val targetPath = parser.get("targetPath")
|
||||||
|
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication]
|
implicit val mapEncoderPubs: Encoder[Publication] =
|
||||||
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
|
org.apache.spark.sql.Encoders.kryo[Publication]
|
||||||
|
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] =
|
||||||
|
Encoders.tuple(Encoders.STRING, mapEncoderPubs)
|
||||||
|
|
||||||
logger.info("Phase 1) make uninue DOI in Papers:")
|
logger.info("Phase 1) make uninue DOI in Papers:")
|
||||||
val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers]
|
val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers]
|
||||||
|
@ -58,16 +84,23 @@ object SparkProcessMAG {
|
||||||
|
|
||||||
logger.info("Phase 0) Enrich Publication with description")
|
logger.info("Phase 0) Enrich Publication with description")
|
||||||
val pa = spark.read.load(s"$sourcePath/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
|
val pa = spark.read.load(s"$sourcePath/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
|
||||||
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"$workingPath/PaperAbstract")
|
pa.map(ConversionUtil.transformPaperAbstract)
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingPath/PaperAbstract")
|
||||||
|
|
||||||
logger.info("Phase 3) Group Author by PaperId")
|
logger.info("Phase 3) Group Author by PaperId")
|
||||||
val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]
|
val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]
|
||||||
|
|
||||||
val affiliation = spark.read.load(s"$sourcePath/Affiliations").as[MagAffiliation]
|
val affiliation = spark.read.load(s"$sourcePath/Affiliations").as[MagAffiliation]
|
||||||
val paperAuthorAffiliation = spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation]
|
val paperAuthorAffiliation =
|
||||||
|
spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation]
|
||||||
|
|
||||||
paperAuthorAffiliation.joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId")))
|
paperAuthorAffiliation
|
||||||
.map { case (a: MagPaperAuthorAffiliation, b: MagAuthor) => (a.AffiliationId, MagPaperAuthorDenormalized(a.PaperId, b, null, a.AuthorSequenceNumber)) }
|
.joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId")))
|
||||||
|
.map { case (a: MagPaperAuthorAffiliation, b: MagAuthor) =>
|
||||||
|
(a.AffiliationId, MagPaperAuthorDenormalized(a.PaperId, b, null, a.AuthorSequenceNumber))
|
||||||
|
}
|
||||||
.joinWith(affiliation, affiliation("AffiliationId").equalTo(col("_1")), "left")
|
.joinWith(affiliation, affiliation("AffiliationId").equalTo(col("_1")), "left")
|
||||||
.map(s => {
|
.map(s => {
|
||||||
val mpa = s._1._2
|
val mpa = s._1._2
|
||||||
|
@ -76,79 +109,133 @@ object SparkProcessMAG {
|
||||||
MagPaperAuthorDenormalized(mpa.PaperId, mpa.author, af.DisplayName, mpa.sequenceNumber)
|
MagPaperAuthorDenormalized(mpa.PaperId, mpa.author, af.DisplayName, mpa.sequenceNumber)
|
||||||
} else
|
} else
|
||||||
mpa
|
mpa
|
||||||
}).groupBy("PaperId").agg(collect_list(struct($"author", $"affiliation", $"sequenceNumber")).as("authors"))
|
})
|
||||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_1_paper_authors")
|
.groupBy("PaperId")
|
||||||
|
.agg(collect_list(struct($"author", $"affiliation", $"sequenceNumber")).as("authors"))
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingPath/merge_step_1_paper_authors")
|
||||||
|
|
||||||
logger.info("Phase 4) create First Version of publication Entity with Paper Journal and Authors")
|
logger.info(
|
||||||
|
"Phase 4) create First Version of publication Entity with Paper Journal and Authors"
|
||||||
|
)
|
||||||
|
|
||||||
val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal]
|
val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal]
|
||||||
|
|
||||||
val papers = spark.read.load((s"$workingPath/Papers_distinct")).as[MagPapers]
|
val papers = spark.read.load(s"$workingPath/Papers_distinct").as[MagPapers]
|
||||||
|
|
||||||
val paperWithAuthors = spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
|
val paperWithAuthors =
|
||||||
|
spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
|
||||||
|
|
||||||
val firstJoin = papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
|
val firstJoin =
|
||||||
firstJoin.joinWith(paperWithAuthors, firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")), "left")
|
papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
|
||||||
|
firstJoin
|
||||||
|
.joinWith(
|
||||||
|
paperWithAuthors,
|
||||||
|
firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")),
|
||||||
|
"left"
|
||||||
|
)
|
||||||
.map { a => ConversionUtil.createOAFFromJournalAuthorPaper(a) }
|
.map { a => ConversionUtil.createOAFFromJournalAuthorPaper(a) }
|
||||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_2")
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingPath/merge_step_2")
|
||||||
|
|
||||||
var magPubs: Dataset[(String, Publication)] =
|
var magPubs: Dataset[(String, Publication)] =
|
||||||
spark.read.load(s"$workingPath/merge_step_2").as[Publication]
|
spark.read
|
||||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
|
.load(s"$workingPath/merge_step_2")
|
||||||
|
.as[Publication]
|
||||||
|
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
|
||||||
|
.as[(String, Publication)]
|
||||||
|
|
||||||
|
val conference = spark.read
|
||||||
|
.load(s"$sourcePath/ConferenceInstances")
|
||||||
|
.select(
|
||||||
|
$"ConferenceInstanceId".as("ci"),
|
||||||
|
$"DisplayName",
|
||||||
|
$"Location",
|
||||||
|
$"StartDate",
|
||||||
|
$"EndDate"
|
||||||
|
)
|
||||||
|
val conferenceInstance = conference
|
||||||
|
.joinWith(papers, papers("ConferenceInstanceId").equalTo(conference("ci")))
|
||||||
|
.select(
|
||||||
|
$"_1.ci",
|
||||||
|
$"_1.DisplayName",
|
||||||
|
$"_1.Location",
|
||||||
|
$"_1.StartDate",
|
||||||
|
$"_1.EndDate",
|
||||||
|
$"_2.PaperId"
|
||||||
|
)
|
||||||
|
.as[MagConferenceInstance]
|
||||||
|
|
||||||
val conference = spark.read.load(s"$sourcePath/ConferenceInstances")
|
magPubs
|
||||||
.select($"ConferenceInstanceId".as("ci"), $"DisplayName", $"Location", $"StartDate", $"EndDate")
|
.joinWith(conferenceInstance, col("_1").equalTo(conferenceInstance("PaperId")), "left")
|
||||||
val conferenceInstance = conference.joinWith(papers, papers("ConferenceInstanceId").equalTo(conference("ci")))
|
|
||||||
.select($"_1.ci", $"_1.DisplayName", $"_1.Location", $"_1.StartDate", $"_1.EndDate", $"_2.PaperId").as[MagConferenceInstance]
|
|
||||||
|
|
||||||
|
|
||||||
magPubs.joinWith(conferenceInstance, col("_1").equalTo(conferenceInstance("PaperId")), "left")
|
|
||||||
.map(item => ConversionUtil.updatePubsWithConferenceInfo(item))
|
.map(item => ConversionUtil.updatePubsWithConferenceInfo(item))
|
||||||
.write
|
.write
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.save(s"$workingPath/merge_step_3")
|
.save(s"$workingPath/merge_step_3")
|
||||||
|
|
||||||
|
val paperAbstract = spark.read.load(s"$workingPath/PaperAbstract").as[MagPaperAbstract]
|
||||||
|
|
||||||
val paperAbstract = spark.read.load((s"$workingPath/PaperAbstract")).as[MagPaperAbstract]
|
magPubs = spark.read
|
||||||
|
.load(s"$workingPath/merge_step_3")
|
||||||
|
.as[Publication]
|
||||||
magPubs = spark.read.load(s"$workingPath/merge_step_3").as[Publication]
|
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
|
||||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
|
.as[(String, Publication)]
|
||||||
|
|
||||||
magPubs.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
|
|
||||||
.map(item => ConversionUtil.updatePubsWithDescription(item)
|
|
||||||
).write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_4")
|
|
||||||
|
|
||||||
|
magPubs
|
||||||
|
.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
|
||||||
|
.map(item => ConversionUtil.updatePubsWithDescription(item))
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingPath/merge_step_4")
|
||||||
|
|
||||||
logger.info("Phase 7) Enrich Publication with FieldOfStudy")
|
logger.info("Phase 7) Enrich Publication with FieldOfStudy")
|
||||||
|
|
||||||
magPubs = spark.read.load(s"$workingPath/merge_step_4").as[Publication]
|
magPubs = spark.read
|
||||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
|
.load(s"$workingPath/merge_step_4")
|
||||||
|
.as[Publication]
|
||||||
|
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
|
||||||
|
.as[(String, Publication)]
|
||||||
|
|
||||||
val fos = spark.read.load(s"$sourcePath/FieldsOfStudy").select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")
|
val fos = spark.read
|
||||||
|
.load(s"$sourcePath/FieldsOfStudy")
|
||||||
|
.select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")
|
||||||
|
|
||||||
val pfos = spark.read.load(s"$sourcePath/PaperFieldsOfStudy")
|
val pfos = spark.read.load(s"$sourcePath/PaperFieldsOfStudy")
|
||||||
|
|
||||||
val paperField = pfos.joinWith(fos, fos("fos").equalTo(pfos("FieldOfStudyId")))
|
val paperField = pfos
|
||||||
|
.joinWith(fos, fos("fos").equalTo(pfos("FieldOfStudyId")))
|
||||||
.select($"_1.FieldOfStudyId", $"_2.DisplayName", $"_2.MainType", $"_1.PaperId", $"_1.Score")
|
.select($"_1.FieldOfStudyId", $"_2.DisplayName", $"_2.MainType", $"_1.PaperId", $"_1.Score")
|
||||||
.groupBy($"PaperId").agg(collect_list(struct($"FieldOfStudyId", $"DisplayName", $"MainType", $"Score")).as("subjects"))
|
.groupBy($"PaperId")
|
||||||
|
.agg(
|
||||||
|
collect_list(struct($"FieldOfStudyId", $"DisplayName", $"MainType", $"Score"))
|
||||||
|
.as("subjects")
|
||||||
|
)
|
||||||
.as[MagFieldOfStudy]
|
.as[MagFieldOfStudy]
|
||||||
|
|
||||||
magPubs.joinWith(paperField, col("_1")
|
magPubs
|
||||||
.equalTo(paperField("PaperId")), "left")
|
.joinWith(
|
||||||
|
paperField,
|
||||||
|
col("_1")
|
||||||
|
.equalTo(paperField("PaperId")),
|
||||||
|
"left"
|
||||||
|
)
|
||||||
.map(item => ConversionUtil.updatePubsWithSubject(item))
|
.map(item => ConversionUtil.updatePubsWithSubject(item))
|
||||||
.write.mode(SaveMode.Overwrite)
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
.save(s"$workingPath/mag_publication")
|
.save(s"$workingPath/mag_publication")
|
||||||
|
|
||||||
spark.read.load(s"$workingPath/mag_publication").as[Publication]
|
spark.read
|
||||||
|
.load(s"$workingPath/mag_publication")
|
||||||
|
.as[Publication]
|
||||||
.filter(p => p.getId != null)
|
.filter(p => p.getId != null)
|
||||||
.groupByKey(p => p.getId)
|
.groupByKey(p => p.getId)
|
||||||
.reduceGroups((a: Publication, b: Publication) => ConversionUtil.mergePublication(a, b))
|
.reduceGroups((a: Publication, b: Publication) => ConversionUtil.mergePublication(a, b))
|
||||||
.map(_._2)
|
.map(_._2)
|
||||||
.write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$targetPath/magPublication")
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,15 +15,20 @@ import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
|
|
||||||
|
|
||||||
case class ORCIDItem(doi: String, authors: List[OrcidAuthor]) {}
|
case class ORCIDItem(doi: String, authors: List[OrcidAuthor]) {}
|
||||||
case class OrcidAuthor(oid:String, name:Option[String], surname:Option[String], creditName:Option[String], otherNames:Option[List[String]], errorCode:Option[String]){}
|
|
||||||
|
case class OrcidAuthor(
|
||||||
|
oid: String,
|
||||||
|
name: Option[String],
|
||||||
|
surname: Option[String],
|
||||||
|
creditName: Option[String],
|
||||||
|
otherNames: Option[List[String]],
|
||||||
|
errorCode: Option[String]
|
||||||
|
) {}
|
||||||
case class OrcidWork(oid: String, doi: String)
|
case class OrcidWork(oid: String, doi: String)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
case class ORCIDElement(doi: String, authors: List[ORCIDItem]) {}
|
case class ORCIDElement(doi: String, authors: List[ORCIDItem]) {}
|
||||||
|
|
||||||
object ORCIDToOAF {
|
object ORCIDToOAF {
|
||||||
val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
|
val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
|
@ -51,7 +56,6 @@ object ORCIDToOAF {
|
||||||
} else null
|
} else null
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def strValid(s: Option[String]): Boolean = {
|
def strValid(s: Option[String]): Boolean = {
|
||||||
s.isDefined && s.get.nonEmpty
|
s.isDefined && s.get.nonEmpty
|
||||||
}
|
}
|
||||||
|
@ -70,7 +74,6 @@ object ORCIDToOAF {
|
||||||
false
|
false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def extractDOIWorks(input: String): List[OrcidWork] = {
|
def extractDOIWorks(input: String): List[OrcidWork] = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
lazy val json: json4s.JValue = parse(input)
|
lazy val json: json4s.JValue = parse(input)
|
||||||
|
@ -97,7 +100,6 @@ object ORCIDToOAF {
|
||||||
(json \ "authorData").extractOrElse[OrcidAuthor](null)
|
(json \ "authorData").extractOrElse[OrcidAuthor](null)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def convertTOOAF(input: ORCIDItem): Publication = {
|
def convertTOOAF(input: ORCIDItem): Publication = {
|
||||||
val doi = input.doi
|
val doi = input.doi
|
||||||
val pub: Publication = new Publication
|
val pub: Publication = new Publication
|
||||||
|
@ -145,10 +147,18 @@ object ORCIDToOAF {
|
||||||
else if (strValid(o.creditName))
|
else if (strValid(o.creditName))
|
||||||
a.setFullname(o.creditName.get)
|
a.setFullname(o.creditName.get)
|
||||||
if (StringUtils.isNotBlank(o.oid))
|
if (StringUtils.isNotBlank(o.oid))
|
||||||
a.setPid(List(createSP(o.oid, ModelConstants.ORCID, ModelConstants.DNET_PID_TYPES, generateOricPIDDatainfo())).asJava)
|
a.setPid(
|
||||||
|
List(
|
||||||
|
createSP(
|
||||||
|
o.oid,
|
||||||
|
ModelConstants.ORCID,
|
||||||
|
ModelConstants.DNET_PID_TYPES,
|
||||||
|
generateOricPIDDatainfo()
|
||||||
|
)
|
||||||
|
).asJava
|
||||||
|
)
|
||||||
|
|
||||||
a
|
a
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
|
@ -10,11 +10,11 @@ import org.slf4j.{Logger, LoggerFactory}
|
||||||
object SparkConvertORCIDToOAF {
|
object SparkConvertORCIDToOAF {
|
||||||
val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
|
val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
|
||||||
|
|
||||||
|
|
||||||
def run(spark: SparkSession, workingPath: String, targetPath: String): Unit = {
|
def run(spark: SparkSession, workingPath: String, targetPath: String): Unit = {
|
||||||
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
|
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
val dataset: Dataset[ORCIDItem] = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem]
|
val dataset: Dataset[ORCIDItem] =
|
||||||
|
spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem]
|
||||||
|
|
||||||
logger.info("Converting ORCID to OAF")
|
logger.info("Converting ORCID to OAF")
|
||||||
dataset.map(o => ORCIDToOAF.convertTOOAF(o)).write.mode(SaveMode.Overwrite).save(targetPath)
|
dataset.map(o => ORCIDToOAF.convertTOOAF(o)).write.mode(SaveMode.Overwrite).save(targetPath)
|
||||||
|
@ -22,15 +22,21 @@ object SparkConvertORCIDToOAF {
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_orcid_to_oaf_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
SparkConvertORCIDToOAF.getClass.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/doiboost/convert_orcid_to_oaf_params.json"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
val workingPath = parser.get("workingPath")
|
val workingPath = parser.get("workingPath")
|
||||||
val targetPath = parser.get("targetPath")
|
val targetPath = parser.get("targetPath")
|
||||||
|
|
|
@ -17,45 +17,72 @@ object SparkPreprocessORCID {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def run(spark: SparkSession, sourcePath: String, workingPath: String): Unit = {
|
def run(spark: SparkSession, sourcePath: String, workingPath: String): Unit = {
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
|
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
|
||||||
|
|
||||||
val inputRDD: RDD[OrcidAuthor] = spark.sparkContext.textFile(s"$sourcePath/authors").map(s => ORCIDToOAF.convertORCIDAuthor(s)).filter(s => s != null).filter(s => ORCIDToOAF.authorValid(s))
|
val inputRDD: RDD[OrcidAuthor] = spark.sparkContext
|
||||||
|
.textFile(s"$sourcePath/authors")
|
||||||
|
.map(s => ORCIDToOAF.convertORCIDAuthor(s))
|
||||||
|
.filter(s => s != null)
|
||||||
|
.filter(s => ORCIDToOAF.authorValid(s))
|
||||||
|
|
||||||
spark.createDataset(inputRDD).as[OrcidAuthor].write.mode(SaveMode.Overwrite).save(s"$workingPath/author")
|
spark
|
||||||
|
.createDataset(inputRDD)
|
||||||
|
.as[OrcidAuthor]
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingPath/author")
|
||||||
|
|
||||||
val res = spark.sparkContext.textFile(s"$sourcePath/works").flatMap(s => ORCIDToOAF.extractDOIWorks(s)).filter(s => s != null)
|
val res = spark.sparkContext
|
||||||
|
.textFile(s"$sourcePath/works")
|
||||||
|
.flatMap(s => ORCIDToOAF.extractDOIWorks(s))
|
||||||
|
.filter(s => s != null)
|
||||||
|
|
||||||
spark.createDataset(res).as[OrcidWork].write.mode(SaveMode.Overwrite).save(s"$workingPath/works")
|
spark
|
||||||
|
.createDataset(res)
|
||||||
|
.as[OrcidWork]
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingPath/works")
|
||||||
|
|
||||||
val authors: Dataset[OrcidAuthor] = spark.read.load(s"$workingPath/author").as[OrcidAuthor]
|
val authors: Dataset[OrcidAuthor] = spark.read.load(s"$workingPath/author").as[OrcidAuthor]
|
||||||
|
|
||||||
val works: Dataset[OrcidWork] = spark.read.load(s"$workingPath/works").as[OrcidWork]
|
val works: Dataset[OrcidWork] = spark.read.load(s"$workingPath/works").as[OrcidWork]
|
||||||
|
|
||||||
works.joinWith(authors, authors("oid").equalTo(works("oid")))
|
works
|
||||||
|
.joinWith(authors, authors("oid").equalTo(works("oid")))
|
||||||
.map(i => {
|
.map(i => {
|
||||||
val doi = i._1.doi
|
val doi = i._1.doi
|
||||||
val author = i._2
|
val author = i._2
|
||||||
(doi, author)
|
(doi, author)
|
||||||
}).groupBy(col("_1").alias("doi"))
|
})
|
||||||
.agg(collect_list(col("_2")).alias("authors")).as[ORCIDItem]
|
.groupBy(col("_1").alias("doi"))
|
||||||
|
.agg(collect_list(col("_2")).alias("authors"))
|
||||||
|
.as[ORCIDItem]
|
||||||
.map(s => fixORCIDItem(s))
|
.map(s => fixORCIDItem(s))
|
||||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/orcidworksWithAuthor")
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingPath/orcidworksWithAuthor")
|
||||||
}
|
}
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/preprocess_orcid_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
SparkConvertORCIDToOAF.getClass.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/doiboost/preprocess_orcid_params.json"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
val sourcePath = parser.get("sourcePath")
|
val sourcePath = parser.get("sourcePath")
|
||||||
val workingPath = parser.get("workingPath")
|
val workingPath = parser.get("workingPath")
|
||||||
|
|
|
@ -13,28 +13,35 @@ object SparkMapUnpayWallToOAF {
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
|
||||||
|
|
||||||
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
|
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkMapDumpIntoOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_uw_to_oaf_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
SparkMapDumpIntoOAF.getClass.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/doiboost/convert_uw_to_oaf_params.json"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
|
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
|
||||||
|
|
||||||
|
|
||||||
val sourcePath = parser.get("sourcePath")
|
val sourcePath = parser.get("sourcePath")
|
||||||
val targetPath = parser.get("targetPath")
|
val targetPath = parser.get("targetPath")
|
||||||
val inputRDD: RDD[String] = spark.sparkContext.textFile(s"$sourcePath")
|
val inputRDD: RDD[String] = spark.sparkContext.textFile(s"$sourcePath")
|
||||||
|
|
||||||
logger.info("Converting UnpayWall to OAF")
|
logger.info("Converting UnpayWall to OAF")
|
||||||
|
|
||||||
val d: Dataset[Publication] = spark.createDataset(inputRDD.map(UnpayWallToOAF.convertToOAF).filter(p => p != null)).as[Publication]
|
val d: Dataset[Publication] = spark
|
||||||
|
.createDataset(inputRDD.map(UnpayWallToOAF.convertToOAF).filter(p => p != null))
|
||||||
|
.as[Publication]
|
||||||
d.write.mode(SaveMode.Overwrite).save(targetPath)
|
d.write.mode(SaveMode.Overwrite).save(targetPath)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -12,18 +12,22 @@ import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
|
|
||||||
|
case class OALocation(
|
||||||
|
evidence: Option[String],
|
||||||
case class OALocation(evidence:Option[String], host_type:Option[String], is_best:Option[Boolean], license: Option[String], pmh_id:Option[String], updated:Option[String],
|
host_type: Option[String],
|
||||||
url:Option[String], url_for_landing_page:Option[String], url_for_pdf:Option[String], version:Option[String]) {}
|
is_best: Option[Boolean],
|
||||||
|
license: Option[String],
|
||||||
|
pmh_id: Option[String],
|
||||||
|
updated: Option[String],
|
||||||
|
url: Option[String],
|
||||||
|
url_for_landing_page: Option[String],
|
||||||
|
url_for_pdf: Option[String],
|
||||||
|
version: Option[String]
|
||||||
|
) {}
|
||||||
|
|
||||||
object UnpayWallToOAF {
|
object UnpayWallToOAF {
|
||||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||||
|
|
||||||
|
|
||||||
def get_unpaywall_color(input: String): Option[OpenAccessRoute] = {
|
def get_unpaywall_color(input: String): Option[OpenAccessRoute] = {
|
||||||
if (input == null || input.equalsIgnoreCase("close"))
|
if (input == null || input.equalsIgnoreCase("close"))
|
||||||
return None
|
return None
|
||||||
|
@ -38,7 +42,11 @@ object UnpayWallToOAF {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def get_color(is_oa:Boolean, location: OALocation, journal_is_oa:Boolean):Option[OpenAccessRoute] = {
|
def get_color(
|
||||||
|
is_oa: Boolean,
|
||||||
|
location: OALocation,
|
||||||
|
journal_is_oa: Boolean
|
||||||
|
): Option[OpenAccessRoute] = {
|
||||||
if (is_oa) {
|
if (is_oa) {
|
||||||
if (location.host_type.isDefined) {
|
if (location.host_type.isDefined) {
|
||||||
{
|
{
|
||||||
|
@ -62,7 +70,6 @@ object UnpayWallToOAF {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def convertToOAF(input: String): Publication = {
|
def convertToOAF(input: String): Publication = {
|
||||||
val pub = new Publication
|
val pub = new Publication
|
||||||
|
|
||||||
|
@ -122,7 +129,4 @@ object UnpayWallToOAF {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,12 +9,8 @@ class DoiBoostHostedByMapTest {
|
||||||
def idDSGeneration(): Unit = {
|
def idDSGeneration(): Unit = {
|
||||||
val s = "doajarticles::0066-782X"
|
val s = "doajarticles::0066-782X"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
println(DoiBoostMappingUtil.generateDSId(s))
|
println(DoiBoostMappingUtil.generateDSId(s))
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,7 +13,6 @@ class NormalizeDOITest {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def doiFiltered(): Unit = {
|
def doiFiltered(): Unit = {
|
||||||
val doi = "0.1042/BCJ20160876"
|
val doi = "0.1042/BCJ20160876"
|
||||||
|
@ -28,7 +27,6 @@ class NormalizeDOITest {
|
||||||
assert(DoiBoostMappingUtil.normalizeDoi(doi) == null)
|
assert(DoiBoostMappingUtil.normalizeDoi(doi) == null)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def doiCleaned(): Unit = {
|
def doiCleaned(): Unit = {
|
||||||
val doi = "https://doi.org/10.1042/BCJ20160876"
|
val doi = "https://doi.org/10.1042/BCJ20160876"
|
||||||
|
|
|
@ -12,20 +12,24 @@ import scala.collection.JavaConverters._
|
||||||
import scala.io.Source
|
import scala.io.Source
|
||||||
import scala.util.matching.Regex
|
import scala.util.matching.Regex
|
||||||
|
|
||||||
|
|
||||||
class CrossrefMappingTest {
|
class CrossrefMappingTest {
|
||||||
|
|
||||||
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
|
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testFunderRelationshipsMapping(): Unit = {
|
def testFunderRelationshipsMapping(): Unit = {
|
||||||
val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")).mkString
|
val template = Source
|
||||||
val funder_doi = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi")).mkString
|
.fromInputStream(
|
||||||
val funder_name = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi")).mkString
|
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
|
val funder_doi = Source
|
||||||
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi"))
|
||||||
|
.mkString
|
||||||
|
val funder_name = Source
|
||||||
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi"))
|
||||||
|
.mkString
|
||||||
|
|
||||||
for (line <- funder_doi.lines) {
|
for (line <- funder_doi.lines) {
|
||||||
val json = template.replace("%s", line)
|
val json = template.replace("%s", line)
|
||||||
|
@ -43,7 +47,8 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
def checkRelation(generatedOAF: List[Oaf]): Unit = {
|
def checkRelation(generatedOAF: List[Oaf]): Unit = {
|
||||||
|
|
||||||
val rels: List[Relation] = generatedOAF.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]]
|
val rels: List[Relation] =
|
||||||
|
generatedOAF.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]]
|
||||||
assertFalse(rels.isEmpty)
|
assertFalse(rels.isEmpty)
|
||||||
rels.foreach(relation => {
|
rels.foreach(relation => {
|
||||||
val relJson = mapper.writeValueAsString(relation)
|
val relJson = mapper.writeValueAsString(relation)
|
||||||
|
@ -59,22 +64,22 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testSum(): Unit = {
|
def testSum(): Unit = {
|
||||||
val from: Long = 1613135645000L
|
val from: Long = 1613135645000L
|
||||||
val delta: Long = 1000000L
|
val delta: Long = 1000000L
|
||||||
|
|
||||||
|
|
||||||
println(s"updating from value: $from -> ${from + delta}")
|
println(s"updating from value: $from -> ${from + delta}")
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testOrcidID(): Unit = {
|
def testOrcidID(): Unit = {
|
||||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/orcid_data.json")).mkString
|
val json = Source
|
||||||
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/orcid_data.json")
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
|
|
||||||
assertNotNull(json)
|
assertNotNull(json)
|
||||||
assertFalse(json.isEmpty);
|
assertFalse(json.isEmpty);
|
||||||
|
@ -85,17 +90,18 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
val items = resultList.filter(p => p.isInstanceOf[Result])
|
val items = resultList.filter(p => p.isInstanceOf[Result])
|
||||||
|
|
||||||
|
|
||||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||||
items.foreach(p => println(mapper.writeValueAsString(p)))
|
items.foreach(p => println(mapper.writeValueAsString(p)))
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testEmptyTitle(): Unit = {
|
def testEmptyTitle(): Unit = {
|
||||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/empty_title.json")).mkString
|
val json = Source
|
||||||
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/empty_title.json")
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
|
|
||||||
assertNotNull(json)
|
assertNotNull(json)
|
||||||
assertFalse(json.isEmpty);
|
assertFalse(json.isEmpty);
|
||||||
|
@ -106,17 +112,16 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
val items = resultList.filter(p => p.isInstanceOf[Result])
|
val items = resultList.filter(p => p.isInstanceOf[Result])
|
||||||
|
|
||||||
|
|
||||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||||
items.foreach(p => println(mapper.writeValueAsString(p)))
|
items.foreach(p => println(mapper.writeValueAsString(p)))
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testPeerReviewed(): Unit = {
|
def testPeerReviewed(): Unit = {
|
||||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/prwTest.json")).mkString
|
val json = Source
|
||||||
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/prwTest.json"))
|
||||||
|
.mkString
|
||||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||||
|
|
||||||
assertNotNull(json)
|
assertNotNull(json)
|
||||||
|
@ -128,12 +133,8 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
val items = resultList.filter(p => p.isInstanceOf[Result])
|
val items = resultList.filter(p => p.isInstanceOf[Result])
|
||||||
|
|
||||||
|
|
||||||
items.foreach(p => logger.info(mapper.writeValueAsString(p)))
|
items.foreach(p => logger.info(mapper.writeValueAsString(p)))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def extractECAward(award: String): String = {
|
def extractECAward(award: String): String = {
|
||||||
|
@ -143,7 +144,6 @@ class CrossrefMappingTest {
|
||||||
null
|
null
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def extractECTest(): Unit = {
|
def extractECTest(): Unit = {
|
||||||
val s = "FP7/2007-2013"
|
val s = "FP7/2007-2013"
|
||||||
|
@ -152,12 +152,13 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
println(DHPUtils.md5(awardExtracted))
|
println(DHPUtils.md5(awardExtracted))
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testJournalRelation(): Unit = {
|
def testJournalRelation(): Unit = {
|
||||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/awardTest.json")).mkString
|
val json = Source
|
||||||
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/awardTest.json"))
|
||||||
|
.mkString
|
||||||
assertNotNull(json)
|
assertNotNull(json)
|
||||||
|
|
||||||
assertFalse(json.isEmpty)
|
assertFalse(json.isEmpty)
|
||||||
|
@ -165,20 +166,19 @@ class CrossrefMappingTest {
|
||||||
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
|
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
|
||||||
|
|
||||||
assertTrue(resultList.nonEmpty)
|
assertTrue(resultList.nonEmpty)
|
||||||
val rels:List[Relation] = resultList.filter(p => p.isInstanceOf[Relation]).map(r=> r.asInstanceOf[Relation])
|
val rels: List[Relation] =
|
||||||
|
resultList.filter(p => p.isInstanceOf[Relation]).map(r => r.asInstanceOf[Relation])
|
||||||
|
|
||||||
|
|
||||||
rels.foreach(s => logger.info(s.getTarget))
|
rels.foreach(s => logger.info(s.getTarget))
|
||||||
assertEquals(rels.size, 6)
|
assertEquals(rels.size, 6)
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testConvertBookFromCrossRef2Oaf(): Unit = {
|
def testConvertBookFromCrossRef2Oaf(): Unit = {
|
||||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/book.json")).mkString
|
val json = Source
|
||||||
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/book.json"))
|
||||||
|
.mkString
|
||||||
assertNotNull(json)
|
assertNotNull(json)
|
||||||
|
|
||||||
assertFalse(json.isEmpty);
|
assertFalse(json.isEmpty);
|
||||||
|
@ -199,42 +199,62 @@ class CrossrefMappingTest {
|
||||||
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
|
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
|
||||||
assertNotNull(
|
assertNotNull(
|
||||||
result.getDataInfo.getProvenanceaction,
|
result.getDataInfo.getProvenanceaction,
|
||||||
"DataInfo/Provenance test not null Failed");
|
"DataInfo/Provenance test not null Failed"
|
||||||
|
);
|
||||||
assertFalse(
|
assertFalse(
|
||||||
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
|
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
|
||||||
"DataInfo/Provenance/classId test not null Failed");
|
"DataInfo/Provenance/classId test not null Failed"
|
||||||
|
);
|
||||||
assertFalse(
|
assertFalse(
|
||||||
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
|
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
|
||||||
"DataInfo/Provenance/className test not null Failed");
|
"DataInfo/Provenance/className test not null Failed"
|
||||||
|
);
|
||||||
assertFalse(
|
assertFalse(
|
||||||
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
|
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
|
||||||
"DataInfo/Provenance/SchemeId test not null Failed");
|
"DataInfo/Provenance/SchemeId test not null Failed"
|
||||||
|
);
|
||||||
assertFalse(
|
assertFalse(
|
||||||
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
|
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
|
||||||
"DataInfo/Provenance/SchemeName test not null Failed");
|
"DataInfo/Provenance/SchemeName test not null Failed"
|
||||||
|
);
|
||||||
|
|
||||||
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
|
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
|
||||||
assertFalse(result.getCollectedfrom.isEmpty);
|
assertFalse(result.getCollectedfrom.isEmpty);
|
||||||
|
|
||||||
val collectedFromList = result.getCollectedfrom.asScala
|
val collectedFromList = result.getCollectedfrom.asScala
|
||||||
assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from assertion")
|
assert(
|
||||||
|
collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")),
|
||||||
assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion")
|
"Wrong collected from assertion"
|
||||||
|
)
|
||||||
|
|
||||||
|
assert(
|
||||||
|
collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")),
|
||||||
|
"Wrong collected from assertion"
|
||||||
|
)
|
||||||
|
|
||||||
val relevantDates = result.getRelevantdate.asScala
|
val relevantDates = result.getRelevantdate.asScala
|
||||||
|
|
||||||
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created")
|
assert(
|
||||||
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")), "Missing relevant date of type published-online")
|
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")),
|
||||||
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")), "Missing relevant date of type published-print")
|
"Missing relevant date of type created"
|
||||||
|
)
|
||||||
|
assert(
|
||||||
|
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")),
|
||||||
|
"Missing relevant date of type published-online"
|
||||||
|
)
|
||||||
|
assert(
|
||||||
|
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")),
|
||||||
|
"Missing relevant date of type published-print"
|
||||||
|
)
|
||||||
val rels = resultList.filter(p => p.isInstanceOf[Relation])
|
val rels = resultList.filter(p => p.isInstanceOf[Relation])
|
||||||
assert(rels.isEmpty)
|
assert(rels.isEmpty)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testConvertPreprintFromCrossRef2Oaf(): Unit = {
|
def testConvertPreprintFromCrossRef2Oaf(): Unit = {
|
||||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/preprint.json")).mkString
|
val json = Source
|
||||||
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/preprint.json"))
|
||||||
|
.mkString
|
||||||
assertNotNull(json)
|
assertNotNull(json)
|
||||||
|
|
||||||
assertFalse(json.isEmpty);
|
assertFalse(json.isEmpty);
|
||||||
|
@ -255,44 +275,70 @@ class CrossrefMappingTest {
|
||||||
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
|
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
|
||||||
assertNotNull(
|
assertNotNull(
|
||||||
result.getDataInfo.getProvenanceaction,
|
result.getDataInfo.getProvenanceaction,
|
||||||
"DataInfo/Provenance test not null Failed");
|
"DataInfo/Provenance test not null Failed"
|
||||||
|
);
|
||||||
assertFalse(
|
assertFalse(
|
||||||
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
|
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
|
||||||
"DataInfo/Provenance/classId test not null Failed");
|
"DataInfo/Provenance/classId test not null Failed"
|
||||||
|
);
|
||||||
assertFalse(
|
assertFalse(
|
||||||
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
|
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
|
||||||
"DataInfo/Provenance/className test not null Failed");
|
"DataInfo/Provenance/className test not null Failed"
|
||||||
|
);
|
||||||
assertFalse(
|
assertFalse(
|
||||||
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
|
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
|
||||||
"DataInfo/Provenance/SchemeId test not null Failed");
|
"DataInfo/Provenance/SchemeId test not null Failed"
|
||||||
|
);
|
||||||
assertFalse(
|
assertFalse(
|
||||||
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
|
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
|
||||||
"DataInfo/Provenance/SchemeName test not null Failed");
|
"DataInfo/Provenance/SchemeName test not null Failed"
|
||||||
|
);
|
||||||
|
|
||||||
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
|
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
|
||||||
assertFalse(result.getCollectedfrom.isEmpty);
|
assertFalse(result.getCollectedfrom.isEmpty);
|
||||||
|
|
||||||
val collectedFromList = result.getCollectedfrom.asScala
|
val collectedFromList = result.getCollectedfrom.asScala
|
||||||
assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from assertion")
|
assert(
|
||||||
|
collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")),
|
||||||
assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion")
|
"Wrong collected from assertion"
|
||||||
|
)
|
||||||
|
|
||||||
|
assert(
|
||||||
|
collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")),
|
||||||
|
"Wrong collected from assertion"
|
||||||
|
)
|
||||||
|
|
||||||
val relevantDates = result.getRelevantdate.asScala
|
val relevantDates = result.getRelevantdate.asScala
|
||||||
|
|
||||||
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created")
|
assert(
|
||||||
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("available")), "Missing relevant date of type available")
|
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")),
|
||||||
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("accepted")), "Missing relevant date of type accepted")
|
"Missing relevant date of type created"
|
||||||
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")), "Missing relevant date of type published-online")
|
)
|
||||||
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")), "Missing relevant date of type published-print")
|
assert(
|
||||||
|
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("available")),
|
||||||
|
"Missing relevant date of type available"
|
||||||
|
)
|
||||||
|
assert(
|
||||||
|
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("accepted")),
|
||||||
|
"Missing relevant date of type accepted"
|
||||||
|
)
|
||||||
|
assert(
|
||||||
|
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")),
|
||||||
|
"Missing relevant date of type published-online"
|
||||||
|
)
|
||||||
|
assert(
|
||||||
|
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")),
|
||||||
|
"Missing relevant date of type published-print"
|
||||||
|
)
|
||||||
val rels = resultList.filter(p => p.isInstanceOf[Relation])
|
val rels = resultList.filter(p => p.isInstanceOf[Relation])
|
||||||
assert(rels.isEmpty)
|
assert(rels.isEmpty)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testConvertDatasetFromCrossRef2Oaf(): Unit = {
|
def testConvertDatasetFromCrossRef2Oaf(): Unit = {
|
||||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dataset.json")).mkString
|
val json = Source
|
||||||
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dataset.json"))
|
||||||
|
.mkString
|
||||||
assertNotNull(json)
|
assertNotNull(json)
|
||||||
|
|
||||||
assertFalse(json.isEmpty);
|
assertFalse(json.isEmpty);
|
||||||
|
@ -313,19 +359,24 @@ class CrossrefMappingTest {
|
||||||
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
|
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
|
||||||
assertNotNull(
|
assertNotNull(
|
||||||
result.getDataInfo.getProvenanceaction,
|
result.getDataInfo.getProvenanceaction,
|
||||||
"DataInfo/Provenance test not null Failed");
|
"DataInfo/Provenance test not null Failed"
|
||||||
|
);
|
||||||
assertFalse(
|
assertFalse(
|
||||||
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
|
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
|
||||||
"DataInfo/Provenance/classId test not null Failed");
|
"DataInfo/Provenance/classId test not null Failed"
|
||||||
|
);
|
||||||
assertFalse(
|
assertFalse(
|
||||||
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
|
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
|
||||||
"DataInfo/Provenance/className test not null Failed");
|
"DataInfo/Provenance/className test not null Failed"
|
||||||
|
);
|
||||||
assertFalse(
|
assertFalse(
|
||||||
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
|
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
|
||||||
"DataInfo/Provenance/SchemeId test not null Failed");
|
"DataInfo/Provenance/SchemeId test not null Failed"
|
||||||
|
);
|
||||||
assertFalse(
|
assertFalse(
|
||||||
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
|
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
|
||||||
"DataInfo/Provenance/SchemeName test not null Failed");
|
"DataInfo/Provenance/SchemeName test not null Failed"
|
||||||
|
);
|
||||||
|
|
||||||
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
|
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
|
||||||
assertFalse(result.getCollectedfrom.isEmpty);
|
assertFalse(result.getCollectedfrom.isEmpty);
|
||||||
|
@ -333,7 +384,9 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testConvertArticleFromCrossRef2Oaf(): Unit = {
|
def testConvertArticleFromCrossRef2Oaf(): Unit = {
|
||||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json")).mkString
|
val json = Source
|
||||||
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json"))
|
||||||
|
.mkString
|
||||||
assertNotNull(json)
|
assertNotNull(json)
|
||||||
|
|
||||||
assertFalse(json.isEmpty);
|
assertFalse(json.isEmpty);
|
||||||
|
@ -354,32 +407,45 @@ class CrossrefMappingTest {
|
||||||
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
|
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
|
||||||
assertNotNull(
|
assertNotNull(
|
||||||
result.getDataInfo.getProvenanceaction,
|
result.getDataInfo.getProvenanceaction,
|
||||||
"DataInfo/Provenance test not null Failed");
|
"DataInfo/Provenance test not null Failed"
|
||||||
|
);
|
||||||
assertFalse(
|
assertFalse(
|
||||||
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
|
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
|
||||||
"DataInfo/Provenance/classId test not null Failed");
|
"DataInfo/Provenance/classId test not null Failed"
|
||||||
|
);
|
||||||
assertFalse(
|
assertFalse(
|
||||||
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
|
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
|
||||||
"DataInfo/Provenance/className test not null Failed");
|
"DataInfo/Provenance/className test not null Failed"
|
||||||
|
);
|
||||||
assertFalse(
|
assertFalse(
|
||||||
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
|
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
|
||||||
"DataInfo/Provenance/SchemeId test not null Failed");
|
"DataInfo/Provenance/SchemeId test not null Failed"
|
||||||
|
);
|
||||||
assertFalse(
|
assertFalse(
|
||||||
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
|
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
|
||||||
"DataInfo/Provenance/SchemeName test not null Failed");
|
"DataInfo/Provenance/SchemeName test not null Failed"
|
||||||
|
);
|
||||||
|
|
||||||
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
|
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
|
||||||
assertFalse(result.getCollectedfrom.isEmpty);
|
assertFalse(result.getCollectedfrom.isEmpty);
|
||||||
|
|
||||||
val collectedFromList = result.getCollectedfrom.asScala
|
val collectedFromList = result.getCollectedfrom.asScala
|
||||||
assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from assertion")
|
assert(
|
||||||
|
collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")),
|
||||||
assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion")
|
"Wrong collected from assertion"
|
||||||
|
)
|
||||||
|
|
||||||
|
assert(
|
||||||
|
collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")),
|
||||||
|
"Wrong collected from assertion"
|
||||||
|
)
|
||||||
|
|
||||||
val relevantDates = result.getRelevantdate.asScala
|
val relevantDates = result.getRelevantdate.asScala
|
||||||
|
|
||||||
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created")
|
assert(
|
||||||
|
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")),
|
||||||
|
"Missing relevant date of type created"
|
||||||
|
)
|
||||||
|
|
||||||
val rels = resultList.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]]
|
val rels = resultList.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]]
|
||||||
assertFalse(rels.isEmpty)
|
assertFalse(rels.isEmpty)
|
||||||
|
@ -393,15 +459,14 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testSetDateOfAcceptanceCrossRef2Oaf(): Unit = {
|
def testSetDateOfAcceptanceCrossRef2Oaf(): Unit = {
|
||||||
|
|
||||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dump_file.json")).mkString
|
val json = Source
|
||||||
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dump_file.json"))
|
||||||
|
.mkString
|
||||||
assertNotNull(json)
|
assertNotNull(json)
|
||||||
|
|
||||||
assertFalse(json.isEmpty);
|
assertFalse(json.isEmpty);
|
||||||
|
@ -421,8 +486,13 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testNormalizeDOI(): Unit = {
|
def testNormalizeDOI(): Unit = {
|
||||||
val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")).mkString
|
val template = Source
|
||||||
val line :String = "\"funder\": [{\"name\": \"Wellcome Trust Masters Fellowship\",\"award\": [\"090633\"]}],"
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
|
val line: String =
|
||||||
|
"\"funder\": [{\"name\": \"Wellcome Trust Masters Fellowship\",\"award\": [\"090633\"]}],"
|
||||||
val json = template.replace("%s", line)
|
val json = template.replace("%s", line)
|
||||||
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
|
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
|
||||||
assertTrue(resultList.nonEmpty)
|
assertTrue(resultList.nonEmpty)
|
||||||
|
@ -431,13 +501,17 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
result.getPid.asScala.foreach(pid => assertTrue(pid.getQualifier.getClassid.equals("doi")))
|
result.getPid.asScala.foreach(pid => assertTrue(pid.getQualifier.getClassid.equals("doi")))
|
||||||
assertTrue(result.getPid.size() == 1)
|
assertTrue(result.getPid.size() == 1)
|
||||||
result.getPid.asScala.foreach(pid => assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase())))
|
result.getPid.asScala.foreach(pid =>
|
||||||
|
assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase()))
|
||||||
|
)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testNormalizeDOI2(): Unit = {
|
def testNormalizeDOI2(): Unit = {
|
||||||
val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json")).mkString
|
val template = Source
|
||||||
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json"))
|
||||||
|
.mkString
|
||||||
|
|
||||||
val resultList: List[Oaf] = Crossref2Oaf.convert(template)
|
val resultList: List[Oaf] = Crossref2Oaf.convert(template)
|
||||||
assertTrue(resultList.nonEmpty)
|
assertTrue(resultList.nonEmpty)
|
||||||
|
@ -446,14 +520,19 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
result.getPid.asScala.foreach(pid => assertTrue(pid.getQualifier.getClassid.equals("doi")))
|
result.getPid.asScala.foreach(pid => assertTrue(pid.getQualifier.getClassid.equals("doi")))
|
||||||
assertTrue(result.getPid.size() == 1)
|
assertTrue(result.getPid.size() == 1)
|
||||||
result.getPid.asScala.foreach(pid => assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase())))
|
result.getPid.asScala.foreach(pid =>
|
||||||
|
assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase()))
|
||||||
|
)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testLicenseVorClosed(): Unit = {
|
def testLicenseVorClosed(): Unit = {
|
||||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_vor.json")).mkString
|
val json = Source
|
||||||
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_vor.json")
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
|
|
||||||
assertNotNull(json)
|
assertNotNull(json)
|
||||||
assertFalse(json.isEmpty);
|
assertFalse(json.isEmpty);
|
||||||
|
@ -462,25 +541,28 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
assertTrue(resultList.nonEmpty)
|
assertTrue(resultList.nonEmpty)
|
||||||
|
|
||||||
|
|
||||||
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||||
|
|
||||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||||
println(mapper.writeValueAsString(item))
|
println(mapper.writeValueAsString(item))
|
||||||
|
|
||||||
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://www.springer.com/vor")))
|
assertTrue(
|
||||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("CLOSED")))
|
item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://www.springer.com/vor"))
|
||||||
|
)
|
||||||
|
assertTrue(
|
||||||
|
item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("CLOSED"))
|
||||||
|
)
|
||||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
|
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testLicenseOpen(): Unit = {
|
def testLicenseOpen(): Unit = {
|
||||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_open.json")).mkString
|
val json = Source
|
||||||
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_open.json")
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
|
|
||||||
assertNotNull(json)
|
assertNotNull(json)
|
||||||
assertFalse(json.isEmpty);
|
assertFalse(json.isEmpty);
|
||||||
|
@ -489,12 +571,19 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
assertTrue(resultList.nonEmpty)
|
assertTrue(resultList.nonEmpty)
|
||||||
|
|
||||||
|
|
||||||
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||||
|
|
||||||
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("http://pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html")))
|
assertTrue(
|
||||||
|
item.getInstance().asScala exists (i =>
|
||||||
|
i.getLicense.getValue.equals(
|
||||||
|
"http://pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN")))
|
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN")))
|
||||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid))
|
assertTrue(
|
||||||
|
item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid)
|
||||||
|
)
|
||||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||||
println(mapper.writeValueAsString(item))
|
println(mapper.writeValueAsString(item))
|
||||||
|
|
||||||
|
@ -502,8 +591,13 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testLicenseEmbargoOpen(): Unit = {
|
def testLicenseEmbargoOpen(): Unit = {
|
||||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo_open.json")).mkString
|
val json = Source
|
||||||
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/doiboost/crossref/publication_license_embargo_open.json"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
|
|
||||||
assertNotNull(json)
|
assertNotNull(json)
|
||||||
assertFalse(json.isEmpty);
|
assertFalse(json.isEmpty);
|
||||||
|
@ -512,12 +606,19 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
assertTrue(resultList.nonEmpty)
|
assertTrue(resultList.nonEmpty)
|
||||||
|
|
||||||
|
|
||||||
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||||
|
|
||||||
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")))
|
assertTrue(
|
||||||
|
item.getInstance().asScala exists (i =>
|
||||||
|
i.getLicense.getValue.equals(
|
||||||
|
"https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN")))
|
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN")))
|
||||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid))
|
assertTrue(
|
||||||
|
item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid)
|
||||||
|
)
|
||||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||||
println(mapper.writeValueAsString(item))
|
println(mapper.writeValueAsString(item))
|
||||||
|
|
||||||
|
@ -525,8 +626,13 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testLicenseEmbargo(): Unit = {
|
def testLicenseEmbargo(): Unit = {
|
||||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo.json")).mkString
|
val json = Source
|
||||||
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/doiboost/crossref/publication_license_embargo.json"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
|
|
||||||
assertNotNull(json)
|
assertNotNull(json)
|
||||||
assertFalse(json.isEmpty);
|
assertFalse(json.isEmpty);
|
||||||
|
@ -535,22 +641,33 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
assertTrue(resultList.nonEmpty)
|
assertTrue(resultList.nonEmpty)
|
||||||
|
|
||||||
|
|
||||||
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||||
|
|
||||||
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")))
|
assertTrue(
|
||||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO")))
|
item.getInstance().asScala exists (i =>
|
||||||
|
i.getLicense.getValue.equals(
|
||||||
|
"https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
assertTrue(
|
||||||
|
item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO"))
|
||||||
|
)
|
||||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
|
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
|
||||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||||
println(mapper.writeValueAsString(item))
|
println(mapper.writeValueAsString(item))
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testLicenseEmbargoDateTime(): Unit = {
|
def testLicenseEmbargoDateTime(): Unit = {
|
||||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo_datetime.json")).mkString
|
val json = Source
|
||||||
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/doiboost/crossref/publication_license_embargo_datetime.json"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
|
|
||||||
assertNotNull(json)
|
assertNotNull(json)
|
||||||
assertFalse(json.isEmpty);
|
assertFalse(json.isEmpty);
|
||||||
|
@ -559,11 +676,18 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
assertTrue(resultList.nonEmpty)
|
assertTrue(resultList.nonEmpty)
|
||||||
|
|
||||||
|
|
||||||
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||||
|
|
||||||
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")))
|
assertTrue(
|
||||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO")))
|
item.getInstance().asScala exists (i =>
|
||||||
|
i.getLicense.getValue.equals(
|
||||||
|
"https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
assertTrue(
|
||||||
|
item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO"))
|
||||||
|
)
|
||||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
|
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
|
||||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||||
println(mapper.writeValueAsString(item))
|
println(mapper.writeValueAsString(item))
|
||||||
|
@ -572,8 +696,11 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testMultipleURLs(): Unit = {
|
def testMultipleURLs(): Unit = {
|
||||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/multiple_urls.json")).mkString
|
val json = Source
|
||||||
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/multiple_urls.json")
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
|
|
||||||
assertNotNull(json)
|
assertNotNull(json)
|
||||||
assertFalse(json.isEmpty);
|
assertFalse(json.isEmpty);
|
||||||
|
@ -582,12 +709,14 @@ class CrossrefMappingTest {
|
||||||
|
|
||||||
assertTrue(resultList.nonEmpty)
|
assertTrue(resultList.nonEmpty)
|
||||||
|
|
||||||
|
|
||||||
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||||
|
|
||||||
assertEquals(1, item.getInstance().size())
|
assertEquals(1, item.getInstance().size())
|
||||||
assertEquals(1, item.getInstance().get(0).getUrl().size())
|
assertEquals(1, item.getInstance().get(0).getUrl().size())
|
||||||
assertEquals("https://doi.org/10.1016/j.jas.2019.105013", item.getInstance().get(0).getUrl().get(0))
|
assertEquals(
|
||||||
|
"https://doi.org/10.1016/j.jas.2019.105013",
|
||||||
|
item.getInstance().get(0).getUrl().get(0)
|
||||||
|
)
|
||||||
//println(mapper.writeValueAsString(item))
|
//println(mapper.writeValueAsString(item))
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -12,29 +12,21 @@ import org.slf4j.{Logger, LoggerFactory}
|
||||||
import java.sql.Timestamp
|
import java.sql.Timestamp
|
||||||
import scala.io.Source
|
import scala.io.Source
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class MAGMappingTest {
|
class MAGMappingTest {
|
||||||
|
|
||||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testSplitter(): Unit = {
|
def testSplitter(): Unit = {
|
||||||
val s = "sports.team"
|
val s = "sports.team"
|
||||||
|
|
||||||
|
|
||||||
if (s.contains(".")) {
|
if (s.contains(".")) {
|
||||||
println(s.split("\\.") head)
|
println(s.split("\\.") head)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testDate(): Unit = {
|
def testDate(): Unit = {
|
||||||
|
|
||||||
|
@ -44,11 +36,11 @@ class MAGMappingTest {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def buildInvertedIndexTest(): Unit = {
|
def buildInvertedIndexTest(): Unit = {
|
||||||
val json_input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/mag/invertedIndex.json")).mkString
|
val json_input = Source
|
||||||
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/mag/invertedIndex.json"))
|
||||||
|
.mkString
|
||||||
val description = ConversionUtil.convertInvertedIndexString(json_input)
|
val description = ConversionUtil.convertInvertedIndexString(json_input)
|
||||||
assertNotNull(description)
|
assertNotNull(description)
|
||||||
assertTrue(description.nonEmpty)
|
assertTrue(description.nonEmpty)
|
||||||
|
@ -56,11 +48,10 @@ class MAGMappingTest {
|
||||||
logger.debug(description)
|
logger.debug(description)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def normalizeDoiTest(): Unit = {
|
def normalizeDoiTest(): Unit = {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
implicit val formats = DefaultFormats
|
implicit val formats = DefaultFormats
|
||||||
|
|
||||||
val conf = new SparkConf()
|
val conf = new SparkConf()
|
||||||
|
@ -78,7 +69,8 @@ class MAGMappingTest {
|
||||||
val schema = Encoders.product[MagPapers].schema
|
val schema = Encoders.product[MagPapers].schema
|
||||||
|
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
val magPapers :Dataset[MagPapers] = spark.read.option("multiline",true).schema(schema).json(path).as[MagPapers]
|
val magPapers: Dataset[MagPapers] =
|
||||||
|
spark.read.option("multiline", true).schema(schema).json(path).as[MagPapers]
|
||||||
val ret: Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
|
val ret: Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
|
||||||
assertTrue(ret.count == 10)
|
assertTrue(ret.count == 10)
|
||||||
ret.take(10).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase())))
|
ret.take(10).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase())))
|
||||||
|
@ -108,7 +100,8 @@ class MAGMappingTest {
|
||||||
val schema = Encoders.product[MagPapers].schema
|
val schema = Encoders.product[MagPapers].schema
|
||||||
|
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
val magPapers :Dataset[MagPapers] = spark.read.option("multiline",true).schema(schema).json(path).as[MagPapers]
|
val magPapers: Dataset[MagPapers] =
|
||||||
|
spark.read.option("multiline", true).schema(schema).json(path).as[MagPapers]
|
||||||
val ret: Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
|
val ret: Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
|
||||||
assertTrue(ret.count == 8)
|
assertTrue(ret.count == 8)
|
||||||
ret.take(8).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase())))
|
ret.take(8).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase())))
|
||||||
|
@ -116,7 +109,4 @@ class MAGMappingTest {
|
||||||
//ret.take(8).foreach(mp => println(write(mp)))
|
//ret.take(8).foreach(mp => println(write(mp)))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -20,7 +20,9 @@ class MappingORCIDToOAFTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testExtractData(): Unit = {
|
def testExtractData(): Unit = {
|
||||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/orcid/dataOutput")).mkString
|
val json = Source
|
||||||
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/orcid/dataOutput"))
|
||||||
|
.mkString
|
||||||
assertNotNull(json)
|
assertNotNull(json)
|
||||||
assertFalse(json.isEmpty)
|
assertFalse(json.isEmpty)
|
||||||
json.lines.foreach(s => {
|
json.lines.foreach(s => {
|
||||||
|
@ -52,12 +54,8 @@ class MappingORCIDToOAFTest {
|
||||||
|
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
val oA = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem].count()
|
val oA = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem].count()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
val p: Dataset[Publication] = spark.read.load(targetPath).as[Publication]
|
val p: Dataset[Publication] = spark.read.load(targetPath).as[Publication]
|
||||||
|
|
||||||
assertTrue(oA == p.count())
|
assertTrue(oA == p.count())
|
||||||
|
@ -65,17 +63,16 @@ class MappingORCIDToOAFTest {
|
||||||
|
|
||||||
spark.close()
|
spark.close()
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testExtractDat1(): Unit = {
|
def testExtractDat1(): Unit = {
|
||||||
|
|
||||||
|
val aList: List[OrcidAuthor] = List(
|
||||||
|
OrcidAuthor("0000-0002-4335-5309", Some("Lucrecia"), Some("Curto"), null, null, null),
|
||||||
val aList: List[OrcidAuthor] = List(OrcidAuthor("0000-0002-4335-5309", Some("Lucrecia"), Some("Curto"), null, null, null ),
|
OrcidAuthor("0000-0001-7501-3330", Some("Emilio"), Some("Malchiodi"), null, null, null),
|
||||||
OrcidAuthor("0000-0001-7501-3330", Some("Emilio"), Some("Malchiodi"), null, null, null ), OrcidAuthor("0000-0002-5490-9186", Some("Sofia"), Some("Noli Truant"), null, null, null ))
|
OrcidAuthor("0000-0002-5490-9186", Some("Sofia"), Some("Noli Truant"), null, null, null)
|
||||||
|
)
|
||||||
|
|
||||||
val orcid: ORCIDItem = ORCIDItem("10.1042/BCJ20160876", aList)
|
val orcid: ORCIDItem = ORCIDItem("10.1042/BCJ20160876", aList)
|
||||||
|
|
||||||
|
@ -85,10 +82,6 @@ class MappingORCIDToOAFTest {
|
||||||
oaf.getPid.toList.foreach(pid => assert(pid.getValue.equals("10.1042/BCJ20160876")))
|
oaf.getPid.toList.foreach(pid => assert(pid.getValue.equals("10.1042/BCJ20160876")))
|
||||||
//println(mapper.writeValueAsString(ORCIDToOAF.convertTOOAF(orcid)))
|
//println(mapper.writeValueAsString(ORCIDToOAF.convertTOOAF(orcid)))
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,11 +14,12 @@ class UnpayWallMappingTest {
|
||||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testMappingToOAF(): Unit = {
|
def testMappingToOAF(): Unit = {
|
||||||
|
|
||||||
val Ilist = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/uw/input.json")).mkString
|
val Ilist = Source
|
||||||
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/uw/input.json"))
|
||||||
|
.mkString
|
||||||
|
|
||||||
var i: Int = 0
|
var i: Int = 0
|
||||||
for (line <- Ilist.lines) {
|
for (line <- Ilist.lines) {
|
||||||
|
@ -42,13 +43,14 @@ class UnpayWallMappingTest {
|
||||||
i = i + 1
|
i = i + 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
val l = Ilist.lines.next()
|
val l = Ilist.lines.next()
|
||||||
|
|
||||||
val item = UnpayWallToOAF.convertToOAF(l)
|
val item = UnpayWallToOAF.convertToOAF(l)
|
||||||
|
|
||||||
assertEquals(item.getInstance().get(0).getAccessright.getOpenAccessRoute, OpenAccessRoute.bronze)
|
assertEquals(
|
||||||
|
item.getInstance().get(0).getAccessright.getOpenAccessRoute,
|
||||||
|
OpenAccessRoute.bronze
|
||||||
|
)
|
||||||
|
|
||||||
logger.info(mapper.writeValueAsString(item))
|
logger.info(mapper.writeValueAsString(item))
|
||||||
|
|
||||||
|
|
|
@ -4,17 +4,29 @@ import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
|
||||||
import org.apache.spark.sql.expressions.Aggregator
|
import org.apache.spark.sql.expressions.Aggregator
|
||||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn}
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn}
|
||||||
|
|
||||||
|
case class HostedByItemType(
|
||||||
|
id: String,
|
||||||
|
officialname: String,
|
||||||
|
issn: String,
|
||||||
|
eissn: String,
|
||||||
|
lissn: String,
|
||||||
|
openAccess: Boolean
|
||||||
|
) {}
|
||||||
|
|
||||||
case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
|
case class HostedByInfo(
|
||||||
case class HostedByInfo(id: String, officialname: String, journal_id: String, provenance : String, id_type: String) {}
|
id: String,
|
||||||
|
officialname: String,
|
||||||
|
journal_id: String,
|
||||||
|
provenance: String,
|
||||||
|
id_type: String
|
||||||
|
) {}
|
||||||
|
|
||||||
object Aggregators {
|
object Aggregators {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def getId(s1: String, s2: String): String = {
|
def getId(s1: String, s2: String): String = {
|
||||||
if (s1.startsWith("10|")) {
|
if (s1.startsWith("10|")) {
|
||||||
return s1}
|
return s1
|
||||||
|
}
|
||||||
s2
|
s2
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -25,24 +37,40 @@ object Aggregators {
|
||||||
s2
|
s2
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def explodeHostedByItemType(
|
||||||
def explodeHostedByItemType(df: Dataset[(String, HostedByItemType)]): Dataset[(String, HostedByItemType)] = {
|
df: Dataset[(String, HostedByItemType)]
|
||||||
|
): Dataset[(String, HostedByItemType)] = {
|
||||||
val transformedData: Dataset[(String, HostedByItemType)] = df
|
val transformedData: Dataset[(String, HostedByItemType)] = df
|
||||||
.groupByKey(_._1)(Encoders.STRING)
|
.groupByKey(_._1)(Encoders.STRING)
|
||||||
.agg(Aggregators.hostedByAggregator)
|
.agg(Aggregators.hostedByAggregator)
|
||||||
.map{
|
.map { case (id: String, res: (String, HostedByItemType)) =>
|
||||||
case (id:String , res:(String, HostedByItemType)) => res
|
res
|
||||||
}(Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]))
|
}(Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]))
|
||||||
|
|
||||||
transformedData
|
transformedData
|
||||||
}
|
}
|
||||||
|
|
||||||
val hostedByAggregator: TypedColumn[(String, HostedByItemType), (String, HostedByItemType)] = new Aggregator[(String, HostedByItemType), (String, HostedByItemType), (String, HostedByItemType)] {
|
val hostedByAggregator: TypedColumn[(String, HostedByItemType), (String, HostedByItemType)] =
|
||||||
override def zero: (String, HostedByItemType) = ("", HostedByItemType("","","","","",false))
|
new Aggregator[
|
||||||
override def reduce(b: (String, HostedByItemType), a:(String,HostedByItemType)): (String, HostedByItemType) = {
|
(String, HostedByItemType),
|
||||||
|
(String, HostedByItemType),
|
||||||
|
(String, HostedByItemType)
|
||||||
|
] {
|
||||||
|
|
||||||
|
override def zero: (String, HostedByItemType) =
|
||||||
|
("", HostedByItemType("", "", "", "", "", false))
|
||||||
|
|
||||||
|
override def reduce(
|
||||||
|
b: (String, HostedByItemType),
|
||||||
|
a: (String, HostedByItemType)
|
||||||
|
): (String, HostedByItemType) = {
|
||||||
return merge(b, a)
|
return merge(b, a)
|
||||||
}
|
}
|
||||||
override def merge(b1: (String, HostedByItemType), b2: (String, HostedByItemType)): (String, HostedByItemType) = {
|
|
||||||
|
override def merge(
|
||||||
|
b1: (String, HostedByItemType),
|
||||||
|
b2: (String, HostedByItemType)
|
||||||
|
): (String, HostedByItemType) = {
|
||||||
if (b1 == null) {
|
if (b1 == null) {
|
||||||
return b2
|
return b2
|
||||||
}
|
}
|
||||||
|
@ -50,27 +78,51 @@ object Aggregators {
|
||||||
return b1
|
return b1
|
||||||
}
|
}
|
||||||
if (b1._2.id.startsWith("10|")) {
|
if (b1._2.id.startsWith("10|")) {
|
||||||
return (b1._1, HostedByItemType(b1._2.id, b1._2.officialname, b1._2.issn, b1._2.eissn, b1._2.lissn, b1._2.openAccess || b2._2.openAccess))
|
return (
|
||||||
|
b1._1,
|
||||||
|
HostedByItemType(
|
||||||
|
b1._2.id,
|
||||||
|
b1._2.officialname,
|
||||||
|
b1._2.issn,
|
||||||
|
b1._2.eissn,
|
||||||
|
b1._2.lissn,
|
||||||
|
b1._2.openAccess || b2._2.openAccess
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
}
|
}
|
||||||
return (b2._1, HostedByItemType(b2._2.id, b2._2.officialname, b2._2.issn, b2._2.eissn, b2._2.lissn, b1._2.openAccess || b2._2.openAccess))
|
return (
|
||||||
|
b2._1,
|
||||||
|
HostedByItemType(
|
||||||
|
b2._2.id,
|
||||||
|
b2._2.officialname,
|
||||||
|
b2._2.issn,
|
||||||
|
b2._2.eissn,
|
||||||
|
b2._2.lissn,
|
||||||
|
b1._2.openAccess || b2._2.openAccess
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
}
|
}
|
||||||
override def finish(reduction: (String,HostedByItemType)): (String, HostedByItemType) = reduction
|
|
||||||
override def bufferEncoder: Encoder[(String,HostedByItemType)] = Encoders.tuple(Encoders.STRING,Encoders.product[HostedByItemType])
|
|
||||||
|
|
||||||
override def outputEncoder: Encoder[(String,HostedByItemType)] = Encoders.tuple(Encoders.STRING,Encoders.product[HostedByItemType])
|
override def finish(reduction: (String, HostedByItemType)): (String, HostedByItemType) =
|
||||||
|
reduction
|
||||||
|
|
||||||
|
override def bufferEncoder: Encoder[(String, HostedByItemType)] =
|
||||||
|
Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
|
||||||
|
|
||||||
|
override def outputEncoder: Encoder[(String, HostedByItemType)] =
|
||||||
|
Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
|
||||||
}.toColumn
|
}.toColumn
|
||||||
|
|
||||||
|
def resultToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] =
|
||||||
|
new Aggregator[EntityInfo, EntityInfo, EntityInfo] {
|
||||||
|
|
||||||
def resultToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] = new Aggregator[EntityInfo, EntityInfo, EntityInfo]{
|
|
||||||
override def zero: EntityInfo = EntityInfo.newInstance("", "", "")
|
override def zero: EntityInfo = EntityInfo.newInstance("", "", "")
|
||||||
|
|
||||||
override def reduce(b: EntityInfo, a: EntityInfo): EntityInfo = {
|
override def reduce(b: EntityInfo, a: EntityInfo): EntityInfo = {
|
||||||
return merge(b, a)
|
return merge(b, a)
|
||||||
}
|
}
|
||||||
|
|
||||||
override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = {
|
override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = {
|
||||||
if (b1 == null) {
|
if (b1 == null) {
|
||||||
return b2
|
return b2
|
||||||
|
@ -96,19 +148,21 @@ object Aggregators {
|
||||||
val transformedData: Dataset[EntityInfo] = df
|
val transformedData: Dataset[EntityInfo] = df
|
||||||
.groupByKey(_.getId)(Encoders.STRING)
|
.groupByKey(_.getId)(Encoders.STRING)
|
||||||
.agg(Aggregators.resultToSingleIdAggregator)
|
.agg(Aggregators.resultToSingleIdAggregator)
|
||||||
.map{
|
.map { case (id: String, res: EntityInfo) =>
|
||||||
case (id:String , res: EntityInfo) => res
|
res
|
||||||
}(Encoders.bean(classOf[EntityInfo]))
|
}(Encoders.bean(classOf[EntityInfo]))
|
||||||
|
|
||||||
transformedData
|
transformedData
|
||||||
}
|
}
|
||||||
|
|
||||||
def datasourceToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] = new Aggregator[EntityInfo, EntityInfo, EntityInfo]{
|
def datasourceToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] =
|
||||||
|
new Aggregator[EntityInfo, EntityInfo, EntityInfo] {
|
||||||
override def zero: EntityInfo = EntityInfo.newInstance("", "", "")
|
override def zero: EntityInfo = EntityInfo.newInstance("", "", "")
|
||||||
|
|
||||||
override def reduce(b: EntityInfo, a: EntityInfo): EntityInfo = {
|
override def reduce(b: EntityInfo, a: EntityInfo): EntityInfo = {
|
||||||
return merge(b, a)
|
return merge(b, a)
|
||||||
}
|
}
|
||||||
|
|
||||||
override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = {
|
override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = {
|
||||||
if (b1 == null) {
|
if (b1 == null) {
|
||||||
return b2
|
return b2
|
||||||
|
@ -128,13 +182,12 @@ object Aggregators {
|
||||||
override def outputEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
override def outputEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||||
}.toColumn
|
}.toColumn
|
||||||
|
|
||||||
|
|
||||||
def datasourceToSingleId(df: Dataset[EntityInfo]): Dataset[EntityInfo] = {
|
def datasourceToSingleId(df: Dataset[EntityInfo]): Dataset[EntityInfo] = {
|
||||||
val transformedData: Dataset[EntityInfo] = df
|
val transformedData: Dataset[EntityInfo] = df
|
||||||
.groupByKey(_.getHostedById)(Encoders.STRING)
|
.groupByKey(_.getHostedById)(Encoders.STRING)
|
||||||
.agg(Aggregators.datasourceToSingleIdAggregator)
|
.agg(Aggregators.datasourceToSingleIdAggregator)
|
||||||
.map{
|
.map { case (id: String, res: EntityInfo) =>
|
||||||
case (id:String , res: EntityInfo) => res
|
res
|
||||||
}(Encoders.bean(classOf[EntityInfo]))
|
}(Encoders.bean(classOf[EntityInfo]))
|
||||||
|
|
||||||
transformedData
|
transformedData
|
||||||
|
|
|
@ -14,7 +14,8 @@ import org.slf4j.{Logger, LoggerFactory}
|
||||||
object SparkApplyHostedByMapToDatasource {
|
object SparkApplyHostedByMapToDatasource {
|
||||||
|
|
||||||
def applyHBtoDats(join: Dataset[EntityInfo], dats: Dataset[Datasource]): Dataset[Datasource] = {
|
def applyHBtoDats(join: Dataset[EntityInfo], dats: Dataset[Datasource]): Dataset[Datasource] = {
|
||||||
dats.joinWith(join, dats.col("id").equalTo(join.col("hostedById")), "left")
|
dats
|
||||||
|
.joinWith(join, dats.col("id").equalTo(join.col("hostedById")), "left")
|
||||||
.map(t2 => {
|
.map(t2 => {
|
||||||
val d: Datasource = t2._1
|
val d: Datasource = t2._1
|
||||||
if (t2._2 != null) {
|
if (t2._2 != null) {
|
||||||
|
@ -31,14 +32,21 @@ object SparkApplyHostedByMapToDatasource {
|
||||||
|
|
||||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
getClass.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
val graphPath = parser.get("graphPath")
|
val graphPath = parser.get("graphPath")
|
||||||
val outputPath = parser.get("outputPath")
|
val outputPath = parser.get("outputPath")
|
||||||
|
@ -51,20 +59,27 @@ object SparkApplyHostedByMapToDatasource {
|
||||||
|
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
val dats: Dataset[Datasource] = spark.read.textFile(graphPath + "/datasource")
|
val dats: Dataset[Datasource] = spark.read
|
||||||
|
.textFile(graphPath + "/datasource")
|
||||||
.map(r => mapper.readValue(r, classOf[Datasource]))
|
.map(r => mapper.readValue(r, classOf[Datasource]))
|
||||||
|
|
||||||
val pinfo: Dataset[EntityInfo] = Aggregators.datasourceToSingleId(spark.read.textFile(preparedInfoPath)
|
val pinfo: Dataset[EntityInfo] = Aggregators.datasourceToSingleId(
|
||||||
.map(ei => mapper.readValue(ei, classOf[EntityInfo])))
|
spark.read
|
||||||
|
.textFile(preparedInfoPath)
|
||||||
|
.map(ei => mapper.readValue(ei, classOf[EntityInfo]))
|
||||||
|
)
|
||||||
|
|
||||||
applyHBtoDats(pinfo, dats).write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath)
|
applyHBtoDats(pinfo, dats).write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(outputPath)
|
||||||
|
|
||||||
spark.read.textFile(outputPath)
|
spark.read
|
||||||
|
.textFile(outputPath)
|
||||||
.write
|
.write
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.text(graphPath + "/datasource")
|
.text(graphPath + "/datasource")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,7 +16,8 @@ import scala.collection.JavaConverters._
|
||||||
object SparkApplyHostedByMapToResult {
|
object SparkApplyHostedByMapToResult {
|
||||||
|
|
||||||
def applyHBtoPubs(join: Dataset[EntityInfo], pubs: Dataset[Publication]) = {
|
def applyHBtoPubs(join: Dataset[EntityInfo], pubs: Dataset[Publication]) = {
|
||||||
pubs.joinWith(join, pubs.col("id").equalTo(join.col("id")), "left")
|
pubs
|
||||||
|
.joinWith(join, pubs.col("id").equalTo(join.col("id")), "left")
|
||||||
.map(t2 => {
|
.map(t2 => {
|
||||||
val p: Publication = t2._1
|
val p: Publication = t2._1
|
||||||
if (t2._2 != null) {
|
if (t2._2 != null) {
|
||||||
|
@ -27,7 +28,14 @@ object SparkApplyHostedByMapToResult {
|
||||||
inst.getHostedby.setKey(ei.getHostedById)
|
inst.getHostedby.setKey(ei.getHostedById)
|
||||||
inst.getHostedby.setValue(ei.getName)
|
inst.getHostedby.setValue(ei.getName)
|
||||||
if (ei.getOpenAccess) {
|
if (ei.getOpenAccess) {
|
||||||
inst.setAccessright(OafMapperUtils.accessRight(ModelConstants.ACCESS_RIGHT_OPEN, "Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES))
|
inst.setAccessright(
|
||||||
|
OafMapperUtils.accessRight(
|
||||||
|
ModelConstants.ACCESS_RIGHT_OPEN,
|
||||||
|
"Open Access",
|
||||||
|
ModelConstants.DNET_ACCESS_MODES,
|
||||||
|
ModelConstants.DNET_ACCESS_MODES
|
||||||
|
)
|
||||||
|
)
|
||||||
inst.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
|
inst.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
|
||||||
p.setBestaccessright(OafMapperUtils.createBestAccessRights(p.getInstance()));
|
p.setBestaccessright(OafMapperUtils.createBestAccessRights(p.getInstance()));
|
||||||
}
|
}
|
||||||
|
@ -40,46 +48,54 @@ object SparkApplyHostedByMapToResult {
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
|
||||||
|
|
||||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
getClass.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
val graphPath = parser.get("graphPath")
|
val graphPath = parser.get("graphPath")
|
||||||
|
|
||||||
val outputPath = parser.get("outputPath")
|
val outputPath = parser.get("outputPath")
|
||||||
val preparedInfoPath = parser.get("preparedInfoPath")
|
val preparedInfoPath = parser.get("preparedInfoPath")
|
||||||
|
|
||||||
|
|
||||||
implicit val formats = DefaultFormats
|
implicit val formats = DefaultFormats
|
||||||
|
|
||||||
|
|
||||||
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.bean(classOf[Publication])
|
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.bean(classOf[Publication])
|
||||||
implicit val mapEncoderEinfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
implicit val mapEncoderEinfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
val pubs: Dataset[Publication] = spark.read.textFile(graphPath + "/publication")
|
val pubs: Dataset[Publication] = spark.read
|
||||||
|
.textFile(graphPath + "/publication")
|
||||||
.map(r => mapper.readValue(r, classOf[Publication]))
|
.map(r => mapper.readValue(r, classOf[Publication]))
|
||||||
|
|
||||||
val pinfo: Dataset[EntityInfo] = spark.read.textFile(preparedInfoPath)
|
val pinfo: Dataset[EntityInfo] = spark.read
|
||||||
|
.textFile(preparedInfoPath)
|
||||||
.map(ei => mapper.readValue(ei, classOf[EntityInfo]))
|
.map(ei => mapper.readValue(ei, classOf[EntityInfo]))
|
||||||
|
|
||||||
applyHBtoPubs(pinfo, pubs).write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath)
|
applyHBtoPubs(pinfo, pubs).write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(outputPath)
|
||||||
|
|
||||||
spark.read.textFile(outputPath)
|
spark.read
|
||||||
|
.textFile(outputPath)
|
||||||
.write
|
.write
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.text(graphPath + "/publication")
|
.text(graphPath + "/publication")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,7 +19,6 @@ object SparkPrepareHostedByInfoToApply {
|
||||||
def getList(id: String, j: Journal, name: String): List[EntityInfo] = {
|
def getList(id: String, j: Journal, name: String): List[EntityInfo] = {
|
||||||
var lst: List[EntityInfo] = List()
|
var lst: List[EntityInfo] = List()
|
||||||
|
|
||||||
|
|
||||||
if (j.getIssnLinking != null && !j.getIssnLinking.equals("")) {
|
if (j.getIssnLinking != null && !j.getIssnLinking.equals("")) {
|
||||||
lst = EntityInfo.newInstance(id, j.getIssnLinking, name) :: lst
|
lst = EntityInfo.newInstance(id, j.getIssnLinking, name) :: lst
|
||||||
}
|
}
|
||||||
|
@ -37,14 +36,14 @@ object SparkPrepareHostedByInfoToApply {
|
||||||
|
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
val dd: Dataset[Publication] = spark.read.textFile(publicationPath)
|
val dd: Dataset[Publication] = spark.read
|
||||||
|
.textFile(publicationPath)
|
||||||
.map(r => mapper.readValue(r, classOf[Publication]))
|
.map(r => mapper.readValue(r, classOf[Publication]))
|
||||||
|
|
||||||
dd.filter(p => p.getJournal != null).flatMap(p => getList(p.getId, p.getJournal, ""))
|
dd.filter(p => p.getJournal != null).flatMap(p => getList(p.getId, p.getJournal, ""))
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def toEntityInfo(input: String): EntityInfo = {
|
def toEntityInfo(input: String): EntityInfo = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
|
||||||
|
@ -53,7 +52,6 @@ object SparkPrepareHostedByInfoToApply {
|
||||||
toEntityItem(c.keys.head, c.values.head)
|
toEntityItem(c.keys.head, c.values.head)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def toEntityItem(journal_id: String, hbi: HostedByItemType): EntityInfo = {
|
def toEntityItem(journal_id: String, hbi: HostedByItemType): EntityInfo = {
|
||||||
|
|
||||||
EntityInfo.newInstance(hbi.id, journal_id, hbi.officialname, hbi.openAccess)
|
EntityInfo.newInstance(hbi.id, journal_id, hbi.officialname, hbi.openAccess)
|
||||||
|
@ -61,7 +59,9 @@ object SparkPrepareHostedByInfoToApply {
|
||||||
}
|
}
|
||||||
|
|
||||||
def joinResHBM(res: Dataset[EntityInfo], hbm: Dataset[EntityInfo]): Dataset[EntityInfo] = {
|
def joinResHBM(res: Dataset[EntityInfo], hbm: Dataset[EntityInfo]): Dataset[EntityInfo] = {
|
||||||
Aggregators.resultToSingleId(res.joinWith(hbm, res.col("journalId").equalTo(hbm.col("journalId")), "left")
|
Aggregators.resultToSingleId(
|
||||||
|
res
|
||||||
|
.joinWith(hbm, res.col("journalId").equalTo(hbm.col("journalId")), "left")
|
||||||
.map(t2 => {
|
.map(t2 => {
|
||||||
val res: EntityInfo = t2._1
|
val res: EntityInfo = t2._1
|
||||||
if (t2._2 != null) {
|
if (t2._2 != null) {
|
||||||
|
@ -71,52 +71,57 @@ object SparkPrepareHostedByInfoToApply {
|
||||||
res.setName(ds.getName)
|
res.setName(ds.getName)
|
||||||
}
|
}
|
||||||
res
|
res
|
||||||
}))
|
})
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
|
||||||
|
|
||||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_prepare_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
getClass.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_prepare_params.json"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
val graphPath = parser.get("graphPath")
|
val graphPath = parser.get("graphPath")
|
||||||
|
|
||||||
val outputPath = parser.get("preparedInfoPath")
|
val outputPath = parser.get("preparedInfoPath")
|
||||||
val hostedByMapPath = parser.get("hostedByMapPath")
|
val hostedByMapPath = parser.get("hostedByMapPath")
|
||||||
|
|
||||||
|
|
||||||
implicit val formats = DefaultFormats
|
implicit val formats = DefaultFormats
|
||||||
|
|
||||||
|
|
||||||
logger.info("Getting the Datasources")
|
logger.info("Getting the Datasources")
|
||||||
|
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
|
|
||||||
//STEP1: read the hostedbymap and transform it in EntityInfo
|
//STEP1: read the hostedbymap and transform it in EntityInfo
|
||||||
val hostedByInfo: Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo)
|
val hostedByInfo: Dataset[EntityInfo] =
|
||||||
|
spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo)
|
||||||
|
|
||||||
//STEP2: create association (publication, issn), (publication, eissn), (publication, lissn)
|
//STEP2: create association (publication, issn), (publication, eissn), (publication, lissn)
|
||||||
val resultInfoDataset: Dataset[EntityInfo] = prepareResultInfo(spark, graphPath + "/publication")
|
val resultInfoDataset: Dataset[EntityInfo] =
|
||||||
|
prepareResultInfo(spark, graphPath + "/publication")
|
||||||
|
|
||||||
//STEP3: left join resultInfo with hostedByInfo on journal_id. Reduction of all the results with the same id in just
|
//STEP3: left join resultInfo with hostedByInfo on journal_id. Reduction of all the results with the same id in just
|
||||||
//one entry (one result could be associated to issn and eissn and so possivly matching more than once against the map)
|
//one entry (one result could be associated to issn and eissn and so possivly matching more than once against the map)
|
||||||
//to this entry we add the id of the datasource for the next step
|
//to this entry we add the id of the datasource for the next step
|
||||||
joinResHBM(resultInfoDataset, hostedByInfo)
|
joinResHBM(resultInfoDataset, hostedByInfo).write
|
||||||
.write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath)
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(outputPath)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,9 +17,8 @@ import java.io.PrintWriter
|
||||||
|
|
||||||
object SparkProduceHostedByMap {
|
object SparkProduceHostedByMap {
|
||||||
|
|
||||||
|
implicit val tupleForJoinEncoder: Encoder[(String, HostedByItemType)] =
|
||||||
implicit val tupleForJoinEncoder: Encoder[(String, HostedByItemType)] = Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
|
Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
|
||||||
|
|
||||||
|
|
||||||
def toHostedByItemType(input: ((HostedByInfo, HostedByInfo), HostedByInfo)): HostedByItemType = {
|
def toHostedByItemType(input: ((HostedByInfo, HostedByInfo), HostedByInfo)): HostedByItemType = {
|
||||||
val openaire: HostedByInfo = input._1._1
|
val openaire: HostedByInfo = input._1._1
|
||||||
|
@ -28,9 +27,33 @@ object SparkProduceHostedByMap {
|
||||||
val isOpenAccess: Boolean = doaj == null && gold == null
|
val isOpenAccess: Boolean = doaj == null && gold == null
|
||||||
|
|
||||||
openaire.journal_id match {
|
openaire.journal_id match {
|
||||||
case Constants.ISSN => HostedByItemType(openaire.id, openaire.officialname, openaire.journal_id, "", "", isOpenAccess)
|
case Constants.ISSN =>
|
||||||
case Constants.EISSN => HostedByItemType(openaire.id, openaire.officialname, "", openaire.journal_id, "", isOpenAccess)
|
HostedByItemType(
|
||||||
case Constants.ISSNL => HostedByItemType(openaire.id, openaire.officialname, "", "", openaire.journal_id, isOpenAccess)
|
openaire.id,
|
||||||
|
openaire.officialname,
|
||||||
|
openaire.journal_id,
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
isOpenAccess
|
||||||
|
)
|
||||||
|
case Constants.EISSN =>
|
||||||
|
HostedByItemType(
|
||||||
|
openaire.id,
|
||||||
|
openaire.officialname,
|
||||||
|
"",
|
||||||
|
openaire.journal_id,
|
||||||
|
"",
|
||||||
|
isOpenAccess
|
||||||
|
)
|
||||||
|
case Constants.ISSNL =>
|
||||||
|
HostedByItemType(
|
||||||
|
openaire.id,
|
||||||
|
openaire.officialname,
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
openaire.journal_id,
|
||||||
|
isOpenAccess
|
||||||
|
)
|
||||||
|
|
||||||
// catch the default with a variable so you can print it
|
// catch the default with a variable so you can print it
|
||||||
case whoa => null
|
case whoa => null
|
||||||
|
@ -46,11 +69,16 @@ object SparkProduceHostedByMap {
|
||||||
|
|
||||||
Serialization.write(map)
|
Serialization.write(map)
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def getHostedByItemType(
|
||||||
def getHostedByItemType(id: String, officialname: String, issn: String, eissn: String, issnl: String, oa: Boolean): HostedByItemType = {
|
id: String,
|
||||||
|
officialname: String,
|
||||||
|
issn: String,
|
||||||
|
eissn: String,
|
||||||
|
issnl: String,
|
||||||
|
oa: Boolean
|
||||||
|
): HostedByItemType = {
|
||||||
if (issn != null) {
|
if (issn != null) {
|
||||||
if (eissn != null) {
|
if (eissn != null) {
|
||||||
if (issnl != null) {
|
if (issnl != null) {
|
||||||
|
@ -85,7 +113,14 @@ object SparkProduceHostedByMap {
|
||||||
def oaToHostedbyItemType(dats: Datasource): HostedByItemType = {
|
def oaToHostedbyItemType(dats: Datasource): HostedByItemType = {
|
||||||
if (dats.getJournal != null) {
|
if (dats.getJournal != null) {
|
||||||
|
|
||||||
return getHostedByItemType(dats.getId, dats.getOfficialname.getValue, dats.getJournal.getIssnPrinted, dats.getJournal.getIssnOnline, dats.getJournal.getIssnLinking, false)
|
return getHostedByItemType(
|
||||||
|
dats.getId,
|
||||||
|
dats.getOfficialname.getValue,
|
||||||
|
dats.getJournal.getIssnPrinted,
|
||||||
|
dats.getJournal.getIssnOnline,
|
||||||
|
dats.getJournal.getIssnLinking,
|
||||||
|
false
|
||||||
|
)
|
||||||
}
|
}
|
||||||
HostedByItemType("", "", "", "", "", false)
|
HostedByItemType("", "", "", "", "", false)
|
||||||
}
|
}
|
||||||
|
@ -94,32 +129,41 @@ object SparkProduceHostedByMap {
|
||||||
|
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
|
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
implicit var encoderD = Encoders.kryo[Datasource]
|
implicit var encoderD = Encoders.kryo[Datasource]
|
||||||
|
|
||||||
val dd: Dataset[Datasource] = spark.read.textFile(datasourcePath)
|
val dd: Dataset[Datasource] = spark.read
|
||||||
|
.textFile(datasourcePath)
|
||||||
.map(r => mapper.readValue(r, classOf[Datasource]))
|
.map(r => mapper.readValue(r, classOf[Datasource]))
|
||||||
|
|
||||||
dd.map { ddt => oaToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
|
dd.map { ddt => oaToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def goldToHostedbyItemType(gold: UnibiGoldModel): HostedByItemType = {
|
def goldToHostedbyItemType(gold: UnibiGoldModel): HostedByItemType = {
|
||||||
return getHostedByItemType(Constants.UNIBI, gold.getTitle, gold.getIssn, "", gold.getIssnL, true)
|
return getHostedByItemType(
|
||||||
|
Constants.UNIBI,
|
||||||
|
gold.getTitle,
|
||||||
|
gold.getIssn,
|
||||||
|
"",
|
||||||
|
gold.getIssnL,
|
||||||
|
true
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def goldHostedByDataset(
|
||||||
def goldHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = {
|
spark: SparkSession,
|
||||||
|
datasourcePath: String
|
||||||
|
): Dataset[HostedByItemType] = {
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
implicit val mapEncoderUnibi: Encoder[UnibiGoldModel] = Encoders.kryo[UnibiGoldModel]
|
implicit val mapEncoderUnibi: Encoder[UnibiGoldModel] = Encoders.kryo[UnibiGoldModel]
|
||||||
|
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
val dd: Dataset[UnibiGoldModel] = spark.read.textFile(datasourcePath)
|
val dd: Dataset[UnibiGoldModel] = spark.read
|
||||||
|
.textFile(datasourcePath)
|
||||||
.map(r => mapper.readValue(r, classOf[UnibiGoldModel]))
|
.map(r => mapper.readValue(r, classOf[UnibiGoldModel]))
|
||||||
|
|
||||||
dd.map { ddt => goldToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
|
dd.map { ddt => goldToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
|
||||||
|
@ -128,17 +172,28 @@ object SparkProduceHostedByMap {
|
||||||
|
|
||||||
def doajToHostedbyItemType(doaj: DOAJModel): HostedByItemType = {
|
def doajToHostedbyItemType(doaj: DOAJModel): HostedByItemType = {
|
||||||
|
|
||||||
return getHostedByItemType(Constants.DOAJ, doaj.getJournalTitle, doaj.getIssn, doaj.getEissn, "", true)
|
return getHostedByItemType(
|
||||||
|
Constants.DOAJ,
|
||||||
|
doaj.getJournalTitle,
|
||||||
|
doaj.getIssn,
|
||||||
|
doaj.getEissn,
|
||||||
|
"",
|
||||||
|
true
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
def doajHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = {
|
def doajHostedByDataset(
|
||||||
|
spark: SparkSession,
|
||||||
|
datasourcePath: String
|
||||||
|
): Dataset[HostedByItemType] = {
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
implicit val mapEncoderDOAJ: Encoder[DOAJModel] = Encoders.kryo[DOAJModel]
|
implicit val mapEncoderDOAJ: Encoder[DOAJModel] = Encoders.kryo[DOAJModel]
|
||||||
|
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
val dd: Dataset[DOAJModel] = spark.read.textFile(datasourcePath)
|
val dd: Dataset[DOAJModel] = spark.read
|
||||||
|
.textFile(datasourcePath)
|
||||||
.map(r => mapper.readValue(r, classOf[DOAJModel]))
|
.map(r => mapper.readValue(r, classOf[DOAJModel]))
|
||||||
|
|
||||||
dd.map { ddt => doajToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
|
dd.map { ddt => doajToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
|
||||||
|
@ -159,7 +214,6 @@ object SparkProduceHostedByMap {
|
||||||
lst
|
lst
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def writeToHDFS(input: Array[String], outputPath: String, hdfsNameNode: String): Unit = {
|
def writeToHDFS(input: Array[String], outputPath: String, hdfsNameNode: String): Unit = {
|
||||||
val conf = new Configuration()
|
val conf = new Configuration()
|
||||||
|
|
||||||
|
@ -169,49 +223,51 @@ object SparkProduceHostedByMap {
|
||||||
val writer = new PrintWriter(output)
|
val writer = new PrintWriter(output)
|
||||||
try {
|
try {
|
||||||
input.foreach(hbi => writer.println(hbi))
|
input.foreach(hbi => writer.println(hbi))
|
||||||
}
|
} finally {
|
||||||
finally {
|
|
||||||
writer.close()
|
writer.close()
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
|
||||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_params.json")
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
val datasourcePath = parser.get("datasourcePath")
|
val datasourcePath = parser.get("datasourcePath")
|
||||||
val workingDirPath = parser.get("workingPath")
|
val workingDirPath = parser.get("workingPath")
|
||||||
val outputPath = parser.get("outputPath")
|
val outputPath = parser.get("outputPath")
|
||||||
|
|
||||||
|
|
||||||
implicit val formats = DefaultFormats
|
implicit val formats = DefaultFormats
|
||||||
|
|
||||||
|
|
||||||
logger.info("Getting the Datasources")
|
logger.info("Getting the Datasources")
|
||||||
|
|
||||||
|
Aggregators
|
||||||
Aggregators.explodeHostedByItemType(oaHostedByDataset(spark, datasourcePath)
|
.explodeHostedByItemType(
|
||||||
|
oaHostedByDataset(spark, datasourcePath)
|
||||||
.union(goldHostedByDataset(spark, workingDirPath + "/unibi_gold.json"))
|
.union(goldHostedByDataset(spark, workingDirPath + "/unibi_gold.json"))
|
||||||
.union(doajHostedByDataset(spark, workingDirPath + "/doaj.json"))
|
.union(doajHostedByDataset(spark, workingDirPath + "/doaj.json"))
|
||||||
.flatMap(hbi => toList(hbi))).filter(hbi => hbi._2.id.startsWith("10|"))
|
.flatMap(hbi => toList(hbi))
|
||||||
|
)
|
||||||
|
.filter(hbi => hbi._2.id.startsWith("10|"))
|
||||||
.map(hbi => toHostedByMap(hbi))(Encoders.STRING)
|
.map(hbi => toHostedByMap(hbi))(Encoders.STRING)
|
||||||
.rdd.saveAsTextFile(outputPath, classOf[GzipCodec])
|
.rdd
|
||||||
|
.saveAsTextFile(outputPath, classOf[GzipCodec])
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,7 +20,13 @@ object CopyHdfsOafSparkApplication {
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val log = LoggerFactory.getLogger(getClass)
|
val log = LoggerFactory.getLogger(getClass)
|
||||||
val conf = new SparkConf()
|
val conf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/copy_hdfs_oaf_parameters.json")).mkString)
|
val parser = new ArgumentApplicationParser(
|
||||||
|
Source
|
||||||
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/copy_hdfs_oaf_parameters.json")
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
|
|
||||||
val spark =
|
val spark =
|
||||||
|
@ -28,7 +34,8 @@ object CopyHdfsOafSparkApplication {
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
val sc: SparkContext = spark.sparkContext
|
val sc: SparkContext = spark.sparkContext
|
||||||
|
|
||||||
|
@ -49,19 +56,22 @@ object CopyHdfsOafSparkApplication {
|
||||||
|
|
||||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||||
|
|
||||||
val paths = DHPUtils.mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation, true).asScala
|
val paths =
|
||||||
|
DHPUtils.mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation, true).asScala
|
||||||
|
|
||||||
val validPaths: List[String] = paths.filter(p => HdfsSupport.exists(p, sc.hadoopConfiguration)).toList
|
val validPaths: List[String] =
|
||||||
|
paths.filter(p => HdfsSupport.exists(p, sc.hadoopConfiguration)).toList
|
||||||
|
|
||||||
val types = ModelSupport.oafTypes.entrySet
|
val types = ModelSupport.oafTypes.entrySet.asScala
|
||||||
.asScala
|
|
||||||
.map(e => Tuple2(e.getKey, e.getValue))
|
.map(e => Tuple2(e.getKey, e.getValue))
|
||||||
|
|
||||||
if (validPaths.nonEmpty) {
|
if (validPaths.nonEmpty) {
|
||||||
val oaf = spark.read.textFile(validPaths: _*)
|
val oaf = spark.read.textFile(validPaths: _*)
|
||||||
val mapper = new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
|
val mapper =
|
||||||
|
new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
|
||||||
|
|
||||||
types.foreach(t => oaf
|
types.foreach(t =>
|
||||||
|
oaf
|
||||||
.filter(o => isOafType(o, t._1))
|
.filter(o => isOafType(o, t._1))
|
||||||
.map(j => mapper.readValue(j, t._2).asInstanceOf[Oaf])
|
.map(j => mapper.readValue(j, t._2).asInstanceOf[Oaf])
|
||||||
.map(s => mapper.writeValueAsString(s))(Encoders.STRING)
|
.map(s => mapper.writeValueAsString(s))(Encoders.STRING)
|
||||||
|
|
|
@ -13,20 +13,32 @@ import org.slf4j.{Logger, LoggerFactory}
|
||||||
object SparkResolveEntities {
|
object SparkResolveEntities {
|
||||||
|
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
val entities = List(EntityType.dataset, EntityType.publication, EntityType.software, EntityType.otherresearchproduct)
|
|
||||||
|
val entities = List(
|
||||||
|
EntityType.dataset,
|
||||||
|
EntityType.publication,
|
||||||
|
EntityType.software,
|
||||||
|
EntityType.otherresearchproduct
|
||||||
|
)
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/resolution/resolve_entities_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
getClass.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/resolution/resolve_entities_params.json"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
val graphBasePath = parser.get("graphBasePath")
|
val graphBasePath = parser.get("graphBasePath")
|
||||||
log.info(s"graphBasePath -> $graphBasePath")
|
log.info(s"graphBasePath -> $graphBasePath")
|
||||||
|
@ -38,7 +50,6 @@ object SparkResolveEntities {
|
||||||
val targetPath = parser.get("targetPath")
|
val targetPath = parser.get("targetPath")
|
||||||
log.info(s"targetPath -> $targetPath")
|
log.info(s"targetPath -> $targetPath")
|
||||||
|
|
||||||
|
|
||||||
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
|
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
|
||||||
fs.mkdirs(new Path(workingPath))
|
fs.mkdirs(new Path(workingPath))
|
||||||
|
|
||||||
|
@ -46,23 +57,30 @@ object SparkResolveEntities {
|
||||||
generateResolvedEntities(spark, workingPath, graphBasePath, targetPath)
|
generateResolvedEntities(spark, workingPath, graphBasePath, targetPath)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: String) = {
|
def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: String) = {
|
||||||
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
val rPid: Dataset[(String, String)] = spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]
|
val rPid: Dataset[(String, String)] =
|
||||||
val up: Dataset[(String, Result)] = spark.read.text(unresolvedPath).as[String].map(s => mapper.readValue(s, classOf[Result])).map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
|
spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]
|
||||||
|
val up: Dataset[(String, Result)] = spark.read
|
||||||
|
.text(unresolvedPath)
|
||||||
|
.as[String]
|
||||||
|
.map(s => mapper.readValue(s, classOf[Result]))
|
||||||
|
.map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
|
||||||
|
|
||||||
rPid.joinWith(up, rPid("_2").equalTo(up("_1")), "inner").map {
|
rPid
|
||||||
r =>
|
.joinWith(up, rPid("_2").equalTo(up("_1")), "inner")
|
||||||
|
.map { r =>
|
||||||
val result = r._2._2
|
val result = r._2._2
|
||||||
val dnetId = r._1._1
|
val dnetId = r._1._1
|
||||||
result.setId(dnetId)
|
result.setId(dnetId)
|
||||||
result
|
result
|
||||||
}.write.mode(SaveMode.Overwrite).save(s"$workingPath/resolvedEntities")
|
|
||||||
}
|
}
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingPath/resolvedEntities")
|
||||||
|
}
|
||||||
|
|
||||||
def deserializeObject(input: String, entity: EntityType): Result = {
|
def deserializeObject(input: String, entity: EntityType): Result = {
|
||||||
|
|
||||||
|
@ -74,18 +92,32 @@ object SparkResolveEntities {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
def generateResolvedEntities(spark: SparkSession, workingPath: String, graphBasePath: String, targetPath: String) = {
|
def generateResolvedEntities(
|
||||||
|
spark: SparkSession,
|
||||||
|
workingPath: String,
|
||||||
|
graphBasePath: String,
|
||||||
|
targetPath: String
|
||||||
|
) = {
|
||||||
|
|
||||||
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
val re: Dataset[(String, Result)] = spark.read.load(s"$workingPath/resolvedEntities").as[Result].map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
|
val re: Dataset[(String, Result)] = spark.read
|
||||||
entities.foreach {
|
.load(s"$workingPath/resolvedEntities")
|
||||||
e => {
|
.as[Result]
|
||||||
|
.map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
|
||||||
|
entities.foreach { e =>
|
||||||
|
{
|
||||||
|
|
||||||
val currentEntityDataset: Dataset[(String, Result)] = spark.read.text(s"$graphBasePath/$e").as[String].map(s => deserializeObject(s, e)).map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
|
val currentEntityDataset: Dataset[(String, Result)] = spark.read
|
||||||
|
.text(s"$graphBasePath/$e")
|
||||||
|
.as[String]
|
||||||
|
.map(s => deserializeObject(s, e))
|
||||||
|
.map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
|
||||||
|
|
||||||
currentEntityDataset.joinWith(re, currentEntityDataset("_1").equalTo(re("_1")), "left").map(k => {
|
currentEntityDataset
|
||||||
|
.joinWith(re, currentEntityDataset("_1").equalTo(re("_1")), "left")
|
||||||
|
.map(k => {
|
||||||
|
|
||||||
val a = k._1
|
val a = k._1
|
||||||
val b = k._2
|
val b = k._2
|
||||||
|
@ -95,11 +127,14 @@ object SparkResolveEntities {
|
||||||
a._2.mergeFrom(b._2)
|
a._2.mergeFrom(b._2)
|
||||||
a._2
|
a._2
|
||||||
}
|
}
|
||||||
}).map(r => mapper.writeValueAsString(r))(Encoders.STRING)
|
})
|
||||||
.write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$targetPath/$e")
|
.map(r => mapper.writeValueAsString(r))(Encoders.STRING)
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.text(s"$targetPath/$e")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,18 +17,25 @@ import org.json4s.jackson.JsonMethods.parse
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
object SparkResolveRelation {
|
object SparkResolveRelation {
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/resolution/resolve_relations_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
getClass.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/resolution/resolve_relations_params.json"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
val graphBasePath = parser.get("graphBasePath")
|
val graphBasePath = parser.get("graphBasePath")
|
||||||
log.info(s"graphBasePath -> $graphBasePath")
|
log.info(s"graphBasePath -> $graphBasePath")
|
||||||
|
@ -41,7 +48,6 @@ object SparkResolveRelation {
|
||||||
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
|
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
|
|
||||||
//CLEANING TEMPORARY FOLDER
|
//CLEANING TEMPORARY FOLDER
|
||||||
HdfsSupport.remove(workingPath, spark.sparkContext.hadoopConfiguration)
|
HdfsSupport.remove(workingPath, spark.sparkContext.hadoopConfiguration)
|
||||||
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
|
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
|
||||||
|
@ -51,28 +57,36 @@ object SparkResolveRelation {
|
||||||
|
|
||||||
val mapper: ObjectMapper = new ObjectMapper()
|
val mapper: ObjectMapper = new ObjectMapper()
|
||||||
|
|
||||||
val rPid: Dataset[(String, String)] = spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]
|
val rPid: Dataset[(String, String)] =
|
||||||
|
spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]
|
||||||
|
|
||||||
val relationDs: Dataset[(String, Relation)] = spark.read.text(s"$graphBasePath/relation").as[String]
|
val relationDs: Dataset[(String, Relation)] = spark.read
|
||||||
.map(s => mapper.readValue(s, classOf[Relation])).as[Relation]
|
.text(s"$graphBasePath/relation")
|
||||||
|
.as[String]
|
||||||
|
.map(s => mapper.readValue(s, classOf[Relation]))
|
||||||
|
.as[Relation]
|
||||||
.map(r => (r.getSource.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
|
.map(r => (r.getSource.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
|
||||||
|
|
||||||
relationDs.joinWith(rPid, relationDs("_1").equalTo(rPid("_2")), "left").map {
|
relationDs
|
||||||
m =>
|
.joinWith(rPid, relationDs("_1").equalTo(rPid("_2")), "left")
|
||||||
|
.map { m =>
|
||||||
val sourceResolved = m._2
|
val sourceResolved = m._2
|
||||||
val currentRelation = m._1._2
|
val currentRelation = m._1._2
|
||||||
if (sourceResolved != null && sourceResolved._1 != null && sourceResolved._1.nonEmpty)
|
if (sourceResolved != null && sourceResolved._1 != null && sourceResolved._1.nonEmpty)
|
||||||
currentRelation.setSource(sourceResolved._1)
|
currentRelation.setSource(sourceResolved._1)
|
||||||
currentRelation
|
currentRelation
|
||||||
}.write
|
}
|
||||||
|
.write
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.save(s"$workingPath/relationResolvedSource")
|
.save(s"$workingPath/relationResolvedSource")
|
||||||
|
|
||||||
|
val relationSourceResolved: Dataset[(String, Relation)] = spark.read
|
||||||
val relationSourceResolved: Dataset[(String, Relation)] = spark.read.load(s"$workingPath/relationResolvedSource").as[Relation]
|
.load(s"$workingPath/relationResolvedSource")
|
||||||
|
.as[Relation]
|
||||||
.map(r => (r.getTarget.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
|
.map(r => (r.getTarget.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
|
||||||
relationSourceResolved.joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_2")), "left").map {
|
relationSourceResolved
|
||||||
m =>
|
.joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_2")), "left")
|
||||||
|
.map { m =>
|
||||||
val targetResolved = m._2
|
val targetResolved = m._2
|
||||||
val currentRelation = m._1._2
|
val currentRelation = m._1._2
|
||||||
if (targetResolved != null && targetResolved._1.nonEmpty)
|
if (targetResolved != null && targetResolved._1.nonEmpty)
|
||||||
|
@ -83,7 +97,9 @@ object SparkResolveRelation {
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.save(s"$workingPath/relation_resolved")
|
.save(s"$workingPath/relation_resolved")
|
||||||
|
|
||||||
spark.read.load(s"$workingPath/relation_resolved").as[Relation]
|
spark.read
|
||||||
|
.load(s"$workingPath/relation_resolved")
|
||||||
|
.as[Relation]
|
||||||
.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved"))
|
.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved"))
|
||||||
.map(r => mapper.writeValueAsString(r))
|
.map(r => mapper.writeValueAsString(r))
|
||||||
.write
|
.write
|
||||||
|
@ -107,7 +123,6 @@ object SparkResolveRelation {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def extractPidsFromRecord(input: String): (String, List[(String, String)]) = {
|
def extractPidsFromRecord(input: String): (String, List[(String, String)]) = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
lazy val json: json4s.JValue = parse(input)
|
lazy val json: json4s.JValue = parse(input)
|
||||||
|
@ -122,7 +137,6 @@ object SparkResolveRelation {
|
||||||
(id, result)
|
(id, result)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private def isRelation(input: String): Boolean = {
|
private def isRelation(input: String): Boolean = {
|
||||||
|
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
@ -132,20 +146,25 @@ object SparkResolveRelation {
|
||||||
source != null
|
source != null
|
||||||
}
|
}
|
||||||
|
|
||||||
def extractPidResolvedTableFromJsonRDD(spark: SparkSession, graphPath: String, workingPath: String) = {
|
def extractPidResolvedTableFromJsonRDD(
|
||||||
|
spark: SparkSession,
|
||||||
|
graphPath: String,
|
||||||
|
workingPath: String
|
||||||
|
) = {
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
val d: RDD[(String, String)] = spark.sparkContext.textFile(s"$graphPath/*")
|
val d: RDD[(String, String)] = spark.sparkContext
|
||||||
|
.textFile(s"$graphPath/*")
|
||||||
.filter(i => !isRelation(i))
|
.filter(i => !isRelation(i))
|
||||||
.map(i => extractPidsFromRecord(i))
|
.map(i => extractPidsFromRecord(i))
|
||||||
.filter(s => s != null && s._1 != null && s._2 != null && s._2.nonEmpty)
|
.filter(s => s != null && s._1 != null && s._2 != null && s._2.nonEmpty)
|
||||||
.flatMap { p =>
|
.flatMap { p =>
|
||||||
p._2.map(pid =>
|
p._2.map(pid => (p._1, DHPUtils.generateUnresolvedIdentifier(pid._1, pid._2)))
|
||||||
(p._1, DHPUtils.generateUnresolvedIdentifier(pid._1, pid._2))
|
}
|
||||||
)
|
.filter(r => r._1 != null || r._2 != null)
|
||||||
}.filter(r => r._1 != null || r._2 != null)
|
|
||||||
|
|
||||||
spark.createDataset(d)
|
spark
|
||||||
|
.createDataset(d)
|
||||||
.groupByKey(_._2)
|
.groupByKey(_._2)
|
||||||
.reduceGroups((x, y) => if (x._1.startsWith("50|doi") || x._1.startsWith("50|pmid")) x else y)
|
.reduceGroups((x, y) => if (x._1.startsWith("50|doi") || x._1.startsWith("50|pmid")) x else y)
|
||||||
.map(s => s._2)
|
.map(s => s._2)
|
||||||
|
|
|
@ -7,24 +7,26 @@ import org.apache.spark.sql.SparkSession
|
||||||
|
|
||||||
object SparkDataciteToOAF {
|
object SparkDataciteToOAF {
|
||||||
|
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/datacite_to_df_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/datacite_to_df_params.json")
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
val sc = spark.sparkContext
|
val sc = spark.sparkContext
|
||||||
|
|
||||||
val inputPath = parser.get("inputPath")
|
val inputPath = parser.get("inputPath")
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,18 +11,22 @@ import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
object SparkConvertDatasetToJsonRDD {
|
object SparkConvertDatasetToJsonRDD {
|
||||||
|
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
val sourcePath = parser.get("sourcePath")
|
val sourcePath = parser.get("sourcePath")
|
||||||
log.info(s"sourcePath -> $sourcePath")
|
log.info(s"sourcePath -> $sourcePath")
|
||||||
|
@ -33,9 +37,13 @@ object SparkConvertDatasetToJsonRDD {
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
||||||
|
|
||||||
|
|
||||||
resultObject.foreach { item =>
|
resultObject.foreach { item =>
|
||||||
spark.read.load(s"$sourcePath/$item").as[Result].map(r => mapper.writeValueAsString(r))(Encoders.STRING).rdd.saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec])
|
spark.read
|
||||||
|
.load(s"$sourcePath/$item")
|
||||||
|
.as[Result]
|
||||||
|
.map(r => mapper.writeValueAsString(r))(Encoders.STRING)
|
||||||
|
.rdd
|
||||||
|
.saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -15,14 +15,19 @@ object SparkConvertObjectToJson {
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_object_json_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_object_json_params.json")
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
val sourcePath = parser.get("sourcePath")
|
val sourcePath = parser.get("sourcePath")
|
||||||
log.info(s"sourcePath -> $sourcePath")
|
log.info(s"sourcePath -> $sourcePath")
|
||||||
|
@ -33,12 +38,9 @@ object SparkConvertObjectToJson {
|
||||||
val scholixUpdatePath = parser.get("scholixUpdatePath")
|
val scholixUpdatePath = parser.get("scholixUpdatePath")
|
||||||
log.info(s"scholixUpdatePath -> $scholixUpdatePath")
|
log.info(s"scholixUpdatePath -> $scholixUpdatePath")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
||||||
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
|
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
|
||||||
|
|
||||||
|
|
||||||
val mapper = new ObjectMapper
|
val mapper = new ObjectMapper
|
||||||
|
|
||||||
objectType.toLowerCase match {
|
objectType.toLowerCase match {
|
||||||
|
@ -46,11 +48,18 @@ object SparkConvertObjectToJson {
|
||||||
log.info("Serialize Scholix")
|
log.info("Serialize Scholix")
|
||||||
val d: Dataset[Scholix] = spark.read.load(sourcePath).as[Scholix]
|
val d: Dataset[Scholix] = spark.read.load(sourcePath).as[Scholix]
|
||||||
val u: Dataset[Scholix] = spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix]
|
val u: Dataset[Scholix] = spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix]
|
||||||
d.union(u).repartition(8000).map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.saveAsTextFile(targetPath, classOf[GzipCodec])
|
d.union(u)
|
||||||
|
.repartition(8000)
|
||||||
|
.map(s => mapper.writeValueAsString(s))(Encoders.STRING)
|
||||||
|
.rdd
|
||||||
|
.saveAsTextFile(targetPath, classOf[GzipCodec])
|
||||||
case "summary" =>
|
case "summary" =>
|
||||||
log.info("Serialize Summary")
|
log.info("Serialize Summary")
|
||||||
val d: Dataset[ScholixSummary] = spark.read.load(sourcePath).as[ScholixSummary]
|
val d: Dataset[ScholixSummary] = spark.read.load(sourcePath).as[ScholixSummary]
|
||||||
d.map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.repartition(1000).saveAsTextFile(targetPath, classOf[GzipCodec])
|
d.map(s => mapper.writeValueAsString(s))(Encoders.STRING)
|
||||||
|
.rdd
|
||||||
|
.repartition(1000)
|
||||||
|
.saveAsTextFile(targetPath, classOf[GzipCodec])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -7,21 +7,26 @@ import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
object SparkConvertRDDtoDataset {
|
object SparkConvertRDDtoDataset {
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
|
||||||
|
|
||||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
val sourcePath = parser.get("sourcePath")
|
val sourcePath = parser.get("sourcePath")
|
||||||
log.info(s"sourcePath -> $sourcePath")
|
log.info(s"sourcePath -> $sourcePath")
|
||||||
|
@ -34,40 +39,76 @@ object SparkConvertRDDtoDataset {
|
||||||
implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
|
implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
|
||||||
implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
|
implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
|
||||||
implicit val relationEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
|
implicit val relationEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
|
||||||
implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct])
|
implicit val orpEncoder: Encoder[OtherResearchProduct] =
|
||||||
|
Encoders.kryo(classOf[OtherResearchProduct])
|
||||||
implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software])
|
implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software])
|
||||||
|
|
||||||
|
|
||||||
log.info("Converting dataset")
|
log.info("Converting dataset")
|
||||||
val rddDataset =spark.sparkContext.textFile(s"$sourcePath/dataset").map(s => mapper.readValue(s, classOf[OafDataset])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false)
|
val rddDataset = spark.sparkContext
|
||||||
spark.createDataset(rddDataset).as[OafDataset].write.mode(SaveMode.Overwrite).save(s"$entityPath/dataset")
|
.textFile(s"$sourcePath/dataset")
|
||||||
|
.map(s => mapper.readValue(s, classOf[OafDataset]))
|
||||||
|
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
|
||||||
|
spark
|
||||||
|
.createDataset(rddDataset)
|
||||||
|
.as[OafDataset]
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$entityPath/dataset")
|
||||||
|
|
||||||
log.info("Converting publication")
|
log.info("Converting publication")
|
||||||
val rddPublication =spark.sparkContext.textFile(s"$sourcePath/publication").map(s => mapper.readValue(s, classOf[Publication])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false)
|
val rddPublication = spark.sparkContext
|
||||||
spark.createDataset(rddPublication).as[Publication].write.mode(SaveMode.Overwrite).save(s"$entityPath/publication")
|
.textFile(s"$sourcePath/publication")
|
||||||
|
.map(s => mapper.readValue(s, classOf[Publication]))
|
||||||
|
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
|
||||||
|
spark
|
||||||
|
.createDataset(rddPublication)
|
||||||
|
.as[Publication]
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$entityPath/publication")
|
||||||
|
|
||||||
log.info("Converting software")
|
log.info("Converting software")
|
||||||
val rddSoftware =spark.sparkContext.textFile(s"$sourcePath/software").map(s => mapper.readValue(s, classOf[Software])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false)
|
val rddSoftware = spark.sparkContext
|
||||||
spark.createDataset(rddSoftware).as[Software].write.mode(SaveMode.Overwrite).save(s"$entityPath/software")
|
.textFile(s"$sourcePath/software")
|
||||||
|
.map(s => mapper.readValue(s, classOf[Software]))
|
||||||
|
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
|
||||||
|
spark
|
||||||
|
.createDataset(rddSoftware)
|
||||||
|
.as[Software]
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$entityPath/software")
|
||||||
|
|
||||||
log.info("Converting otherresearchproduct")
|
log.info("Converting otherresearchproduct")
|
||||||
val rddOtherResearchProduct =spark.sparkContext.textFile(s"$sourcePath/otherresearchproduct").map(s => mapper.readValue(s, classOf[OtherResearchProduct])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false)
|
val rddOtherResearchProduct = spark.sparkContext
|
||||||
spark.createDataset(rddOtherResearchProduct).as[OtherResearchProduct].write.mode(SaveMode.Overwrite).save(s"$entityPath/otherresearchproduct")
|
.textFile(s"$sourcePath/otherresearchproduct")
|
||||||
|
.map(s => mapper.readValue(s, classOf[OtherResearchProduct]))
|
||||||
|
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
|
||||||
|
spark
|
||||||
|
.createDataset(rddOtherResearchProduct)
|
||||||
|
.as[OtherResearchProduct]
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$entityPath/otherresearchproduct")
|
||||||
|
|
||||||
log.info("Converting Relation")
|
log.info("Converting Relation")
|
||||||
|
|
||||||
|
val relationSemanticFilter = List(
|
||||||
|
"cites",
|
||||||
|
"iscitedby",
|
||||||
|
"merges",
|
||||||
|
"ismergedin",
|
||||||
|
"HasAmongTopNSimilarDocuments",
|
||||||
|
"IsAmongTopNSimilarDocuments"
|
||||||
|
)
|
||||||
|
|
||||||
val relationSemanticFilter = List("cites", "iscitedby","merges", "ismergedin", "HasAmongTopNSimilarDocuments","IsAmongTopNSimilarDocuments" )
|
val rddRelation = spark.sparkContext
|
||||||
|
.textFile(s"$sourcePath/relation")
|
||||||
val rddRelation =spark.sparkContext.textFile(s"$sourcePath/relation")
|
|
||||||
.map(s => mapper.readValue(s, classOf[Relation]))
|
.map(s => mapper.readValue(s, classOf[Relation]))
|
||||||
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
|
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
|
||||||
.filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
|
.filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
|
||||||
.filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
|
.filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
|
||||||
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
|
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,82 +13,131 @@ object SparkCreateInputGraph {
|
||||||
|
|
||||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/extract_entities_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/extract_entities_params.json")
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
val resultObject = List(
|
val resultObject = List(
|
||||||
("publication", classOf[Publication]),
|
("publication", classOf[Publication]),
|
||||||
("dataset", classOf[OafDataset]),
|
("dataset", classOf[OafDataset]),
|
||||||
("software", classOf[Software]),
|
("software", classOf[Software]),
|
||||||
("otherResearchProduct", classOf[OtherResearchProduct])
|
("otherResearchProduct", classOf[OtherResearchProduct])
|
||||||
|
|
||||||
)
|
)
|
||||||
|
|
||||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||||
implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
|
implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
|
||||||
implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
|
implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
|
||||||
implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software])
|
implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software])
|
||||||
implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct])
|
implicit val orpEncoder: Encoder[OtherResearchProduct] =
|
||||||
|
Encoders.kryo(classOf[OtherResearchProduct])
|
||||||
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
|
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
|
||||||
|
|
||||||
|
|
||||||
val sourcePath = parser.get("sourcePath")
|
val sourcePath = parser.get("sourcePath")
|
||||||
log.info(s"sourcePath -> $sourcePath")
|
log.info(s"sourcePath -> $sourcePath")
|
||||||
val targetPath = parser.get("targetPath")
|
val targetPath = parser.get("targetPath")
|
||||||
log.info(s"targetPath -> $targetPath")
|
log.info(s"targetPath -> $targetPath")
|
||||||
|
|
||||||
|
|
||||||
val oafDs: Dataset[Oaf] = spark.read.load(s"$sourcePath/*").as[Oaf]
|
val oafDs: Dataset[Oaf] = spark.read.load(s"$sourcePath/*").as[Oaf]
|
||||||
|
|
||||||
|
|
||||||
log.info("Extract Publication")
|
log.info("Extract Publication")
|
||||||
oafDs.filter(o => o.isInstanceOf[Publication]).map(p => p.asInstanceOf[Publication]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/publication")
|
oafDs
|
||||||
|
.filter(o => o.isInstanceOf[Publication])
|
||||||
|
.map(p => p.asInstanceOf[Publication])
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$targetPath/extracted/publication")
|
||||||
|
|
||||||
log.info("Extract dataset")
|
log.info("Extract dataset")
|
||||||
oafDs.filter(o => o.isInstanceOf[OafDataset]).map(p => p.asInstanceOf[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/dataset")
|
oafDs
|
||||||
|
.filter(o => o.isInstanceOf[OafDataset])
|
||||||
|
.map(p => p.asInstanceOf[OafDataset])
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$targetPath/extracted/dataset")
|
||||||
|
|
||||||
log.info("Extract software")
|
log.info("Extract software")
|
||||||
oafDs.filter(o => o.isInstanceOf[Software]).map(p => p.asInstanceOf[Software]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/software")
|
oafDs
|
||||||
|
.filter(o => o.isInstanceOf[Software])
|
||||||
|
.map(p => p.asInstanceOf[Software])
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$targetPath/extracted/software")
|
||||||
|
|
||||||
log.info("Extract otherResearchProduct")
|
log.info("Extract otherResearchProduct")
|
||||||
oafDs.filter(o => o.isInstanceOf[OtherResearchProduct]).map(p => p.asInstanceOf[OtherResearchProduct]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/otherResearchProduct")
|
oafDs
|
||||||
|
.filter(o => o.isInstanceOf[OtherResearchProduct])
|
||||||
|
.map(p => p.asInstanceOf[OtherResearchProduct])
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$targetPath/extracted/otherResearchProduct")
|
||||||
|
|
||||||
log.info("Extract Relation")
|
log.info("Extract Relation")
|
||||||
oafDs.filter(o => o.isInstanceOf[Relation]).map(p => p.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/relation")
|
oafDs
|
||||||
|
.filter(o => o.isInstanceOf[Relation])
|
||||||
|
.map(p => p.asInstanceOf[Relation])
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$targetPath/extracted/relation")
|
||||||
|
|
||||||
resultObject.foreach { r =>
|
resultObject.foreach { r =>
|
||||||
log.info(s"Make ${r._1} unique")
|
log.info(s"Make ${r._1} unique")
|
||||||
makeDatasetUnique(s"$targetPath/extracted/${r._1}", s"$targetPath/preprocess/${r._1}", spark, r._2)
|
makeDatasetUnique(
|
||||||
|
s"$targetPath/extracted/${r._1}",
|
||||||
|
s"$targetPath/preprocess/${r._1}",
|
||||||
|
spark,
|
||||||
|
r._2
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def extractEntities[T <: Oaf](
|
||||||
def extractEntities[T <: Oaf](oafDs: Dataset[Oaf], targetPath: String, clazz: Class[T], log: Logger): Unit = {
|
oafDs: Dataset[Oaf],
|
||||||
|
targetPath: String,
|
||||||
|
clazz: Class[T],
|
||||||
|
log: Logger
|
||||||
|
): Unit = {
|
||||||
|
|
||||||
implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz)
|
implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz)
|
||||||
log.info(s"Extract ${clazz.getSimpleName}")
|
log.info(s"Extract ${clazz.getSimpleName}")
|
||||||
oafDs.filter(o => o.isInstanceOf[T]).map(p => p.asInstanceOf[T]).write.mode(SaveMode.Overwrite).save(targetPath)
|
oafDs
|
||||||
|
.filter(o => o.isInstanceOf[T])
|
||||||
|
.map(p => p.asInstanceOf[T])
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(targetPath)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def makeDatasetUnique[T <: Result](
|
||||||
def makeDatasetUnique[T <: Result](sourcePath: String, targetPath: String, spark: SparkSession, clazz: Class[T]): Unit = {
|
sourcePath: String,
|
||||||
|
targetPath: String,
|
||||||
|
spark: SparkSession,
|
||||||
|
clazz: Class[T]
|
||||||
|
): Unit = {
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz)
|
implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz)
|
||||||
|
|
||||||
val ds: Dataset[T] = spark.read.load(sourcePath).as[T]
|
val ds: Dataset[T] = spark.read.load(sourcePath).as[T]
|
||||||
|
|
||||||
ds.groupByKey(_.getId).reduceGroups { (x, y) =>
|
ds.groupByKey(_.getId)
|
||||||
|
.reduceGroups { (x, y) =>
|
||||||
x.mergeFrom(y)
|
x.mergeFrom(y)
|
||||||
x
|
x
|
||||||
}.map(_._2).write.mode(SaveMode.Overwrite).save(targetPath)
|
}
|
||||||
|
.map(_._2)
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(targetPath)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -17,14 +17,19 @@ object SparkCreateScholix {
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_scholix_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_scholix_params.json")
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
val relationPath = parser.get("relationPath")
|
val relationPath = parser.get("relationPath")
|
||||||
log.info(s"relationPath -> $relationPath")
|
log.info(s"relationPath -> $relationPath")
|
||||||
|
@ -33,37 +38,46 @@ object SparkCreateScholix {
|
||||||
val targetPath = parser.get("targetPath")
|
val targetPath = parser.get("targetPath")
|
||||||
log.info(s"targetPath -> $targetPath")
|
log.info(s"targetPath -> $targetPath")
|
||||||
|
|
||||||
|
|
||||||
implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
|
implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
|
||||||
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
|
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
|
||||||
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
||||||
|
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
|
val relationDS: Dataset[(String, Relation)] = spark.read
|
||||||
val relationDS: Dataset[(String, Relation)] = spark.read.load(relationPath).as[Relation]
|
.load(relationPath)
|
||||||
.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
|
.as[Relation]
|
||||||
|
.filter(r =>
|
||||||
|
(r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase
|
||||||
|
.contains("merge")
|
||||||
|
)
|
||||||
.map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder))
|
.map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder))
|
||||||
|
|
||||||
val summaryDS: Dataset[(String, ScholixSummary)] = spark.read.load(summaryPath).as[ScholixSummary]
|
val summaryDS: Dataset[(String, ScholixSummary)] = spark.read
|
||||||
|
.load(summaryPath)
|
||||||
|
.as[ScholixSummary]
|
||||||
.map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, summaryEncoder))
|
.map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, summaryEncoder))
|
||||||
|
|
||||||
|
relationDS
|
||||||
relationDS.joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left")
|
.joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left")
|
||||||
.map { input: ((String, Relation), (String, ScholixSummary)) =>
|
.map { input: ((String, Relation), (String, ScholixSummary)) =>
|
||||||
if (input._1 != null && input._2 != null) {
|
if (input._1 != null && input._2 != null) {
|
||||||
val rel: Relation = input._1._2
|
val rel: Relation = input._1._2
|
||||||
val source: ScholixSummary = input._2._2
|
val source: ScholixSummary = input._2._2
|
||||||
(rel.getTarget, ScholixUtils.scholixFromSource(rel, source))
|
(rel.getTarget, ScholixUtils.scholixFromSource(rel, source))
|
||||||
}
|
} else null
|
||||||
else null
|
|
||||||
}(Encoders.tuple(Encoders.STRING, scholixEncoder))
|
}(Encoders.tuple(Encoders.STRING, scholixEncoder))
|
||||||
.filter(r => r != null)
|
.filter(r => r != null)
|
||||||
.write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_from_source")
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$targetPath/scholix_from_source")
|
||||||
|
|
||||||
val scholixSource: Dataset[(String, Scholix)] = spark.read.load(s"$targetPath/scholix_from_source").as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder))
|
val scholixSource: Dataset[(String, Scholix)] = spark.read
|
||||||
|
.load(s"$targetPath/scholix_from_source")
|
||||||
|
.as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder))
|
||||||
|
|
||||||
scholixSource.joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left")
|
scholixSource
|
||||||
|
.joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left")
|
||||||
.map { input: ((String, Scholix), (String, ScholixSummary)) =>
|
.map { input: ((String, Scholix), (String, ScholixSummary)) =>
|
||||||
if (input._2 == null) {
|
if (input._2 == null) {
|
||||||
null
|
null
|
||||||
|
@ -72,40 +86,73 @@ object SparkCreateScholix {
|
||||||
val target: ScholixSummary = input._2._2
|
val target: ScholixSummary = input._2._2
|
||||||
ScholixUtils.generateCompleteScholix(s, target)
|
ScholixUtils.generateCompleteScholix(s, target)
|
||||||
}
|
}
|
||||||
}.filter(s => s != null).write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_one_verse")
|
}
|
||||||
|
.filter(s => s != null)
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$targetPath/scholix_one_verse")
|
||||||
|
|
||||||
|
val scholix_o_v: Dataset[Scholix] =
|
||||||
|
spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix]
|
||||||
|
|
||||||
val scholix_o_v: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix]
|
scholix_o_v
|
||||||
|
.flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s)))
|
||||||
scholix_o_v.flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s))).as[Scholix]
|
.as[Scholix]
|
||||||
.map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, scholixEncoder))
|
.map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, scholixEncoder))
|
||||||
.groupByKey(_._1)
|
.groupByKey(_._1)
|
||||||
.agg(ScholixUtils.scholixAggregator.toColumn)
|
.agg(ScholixUtils.scholixAggregator.toColumn)
|
||||||
.map(s => s._2)
|
.map(s => s._2)
|
||||||
.write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix")
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$targetPath/scholix")
|
||||||
|
|
||||||
val scholix_final: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix").as[Scholix]
|
val scholix_final: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix").as[Scholix]
|
||||||
|
|
||||||
val stats: Dataset[(String, String, Long)] = scholix_final.map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType)).groupBy("_1", "_2").agg(count("_1")).as[(String, String, Long)]
|
val stats: Dataset[(String, String, Long)] = scholix_final
|
||||||
|
.map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType))
|
||||||
|
.groupBy("_1", "_2")
|
||||||
|
.agg(count("_1"))
|
||||||
|
.as[(String, String, Long)]
|
||||||
|
|
||||||
stats
|
stats
|
||||||
.map(s => RelatedEntities(s._1, if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0, if ("publication".equalsIgnoreCase(s._2)) s._3 else 0))
|
.map(s =>
|
||||||
|
RelatedEntities(
|
||||||
|
s._1,
|
||||||
|
if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0,
|
||||||
|
if ("publication".equalsIgnoreCase(s._2)) s._3 else 0
|
||||||
|
)
|
||||||
|
)
|
||||||
.groupByKey(_.id)
|
.groupByKey(_.id)
|
||||||
.reduceGroups((a, b) => RelatedEntities(a.id, a.relatedDataset + b.relatedDataset, a.relatedPublication + b.relatedPublication))
|
.reduceGroups((a, b) =>
|
||||||
|
RelatedEntities(
|
||||||
|
a.id,
|
||||||
|
a.relatedDataset + b.relatedDataset,
|
||||||
|
a.relatedPublication + b.relatedPublication
|
||||||
|
)
|
||||||
|
)
|
||||||
.map(_._2)
|
.map(_._2)
|
||||||
.write.mode(SaveMode.Overwrite).save(s"$targetPath/related_entities")
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$targetPath/related_entities")
|
||||||
|
|
||||||
val relatedEntitiesDS: Dataset[RelatedEntities] = spark.read.load(s"$targetPath/related_entities").as[RelatedEntities].filter(r => r.relatedPublication > 0 || r.relatedDataset > 0)
|
val relatedEntitiesDS: Dataset[RelatedEntities] = spark.read
|
||||||
|
.load(s"$targetPath/related_entities")
|
||||||
|
.as[RelatedEntities]
|
||||||
|
.filter(r => r.relatedPublication > 0 || r.relatedDataset > 0)
|
||||||
|
|
||||||
relatedEntitiesDS.joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner").map { i =>
|
relatedEntitiesDS
|
||||||
|
.joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner")
|
||||||
|
.map { i =>
|
||||||
val re = i._1
|
val re = i._1
|
||||||
val sum = i._2._2
|
val sum = i._2._2
|
||||||
|
|
||||||
sum.setRelatedDatasets(re.relatedDataset)
|
sum.setRelatedDatasets(re.relatedDataset)
|
||||||
sum.setRelatedPublications(re.relatedPublication)
|
sum.setRelatedPublications(re.relatedPublication)
|
||||||
sum
|
sum
|
||||||
}.write.mode(SaveMode.Overwrite).save(s"${summaryPath}_filtered")
|
}
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"${summaryPath}_filtered")
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,14 +14,19 @@ object SparkCreateSummaryObject {
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_summaries_params.json")))
|
val parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_summaries_params.json")
|
||||||
|
)
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
val sourcePath = parser.get("sourcePath")
|
val sourcePath = parser.get("sourcePath")
|
||||||
log.info(s"sourcePath -> $sourcePath")
|
log.info(s"sourcePath -> $sourcePath")
|
||||||
|
@ -33,10 +38,17 @@ object SparkCreateSummaryObject {
|
||||||
|
|
||||||
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
|
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
|
||||||
|
|
||||||
|
val ds: Dataset[Result] = spark.read
|
||||||
|
.load(s"$sourcePath/*")
|
||||||
|
.as[Result]
|
||||||
|
.filter(r => r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false)
|
||||||
|
|
||||||
val ds: Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Result].filter(r => r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false)
|
ds.repartition(6000)
|
||||||
|
.map(r => ScholixUtils.resultToSummary(r))
|
||||||
ds.repartition(6000).map(r => ScholixUtils.resultToSummary(r)).filter(s => s != null).write.mode(SaveMode.Overwrite).save(targetPath)
|
.filter(s => s != null)
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(targetPath)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -10,13 +10,23 @@ import java.util.regex.Pattern
|
||||||
import scala.language.postfixOps
|
import scala.language.postfixOps
|
||||||
import scala.xml.{Elem, Node, XML}
|
import scala.xml.{Elem, Node, XML}
|
||||||
|
|
||||||
case class PangaeaDataModel(identifier:String, title:List[String], objectType:List[String], creator:List[String],
|
case class PangaeaDataModel(
|
||||||
publisher:List[String], dataCenter :List[String],subject :List[String], language:String,
|
identifier: String,
|
||||||
rights:String, parent:String,relation :List[String],linkage:List[(String,String)] ) {}
|
title: List[String],
|
||||||
|
objectType: List[String],
|
||||||
|
creator: List[String],
|
||||||
|
publisher: List[String],
|
||||||
|
dataCenter: List[String],
|
||||||
|
subject: List[String],
|
||||||
|
language: String,
|
||||||
|
rights: String,
|
||||||
|
parent: String,
|
||||||
|
relation: List[String],
|
||||||
|
linkage: List[(String, String)]
|
||||||
|
) {}
|
||||||
|
|
||||||
object PangaeaUtils {
|
object PangaeaUtils {
|
||||||
|
|
||||||
|
|
||||||
def toDataset(input: String): PangaeaDataModel = {
|
def toDataset(input: String): PangaeaDataModel = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
lazy val json: json4s.JValue = parse(input)
|
lazy val json: json4s.JValue = parse(input)
|
||||||
|
@ -26,20 +36,25 @@ object PangaeaUtils {
|
||||||
|
|
||||||
def findDOIInRelation(input: List[String]): List[String] = {
|
def findDOIInRelation(input: List[String]): List[String] = {
|
||||||
val pattern = Pattern.compile("\\b(10[.][0-9]{4,}(?:[.][0-9]+)*\\/(?:(?![\"&\\'<>])\\S)+)\\b")
|
val pattern = Pattern.compile("\\b(10[.][0-9]{4,}(?:[.][0-9]+)*\\/(?:(?![\"&\\'<>])\\S)+)\\b")
|
||||||
input.map(i => {
|
input
|
||||||
|
.map(i => {
|
||||||
val matcher = pattern.matcher(i)
|
val matcher = pattern.matcher(i)
|
||||||
if (matcher.find())
|
if (matcher.find())
|
||||||
matcher.group(0)
|
matcher.group(0)
|
||||||
else
|
else
|
||||||
null
|
null
|
||||||
}).filter(i => i!= null)
|
})
|
||||||
|
.filter(i => i != null)
|
||||||
}
|
}
|
||||||
|
|
||||||
def attributeOpt(attribute: String, node: Node): Option[String] =
|
def attributeOpt(attribute: String, node: Node): Option[String] =
|
||||||
node.attribute(attribute) flatMap (_.headOption) map (_.text)
|
node.attribute(attribute) flatMap (_.headOption) map (_.text)
|
||||||
|
|
||||||
def extractLinkage(node: Elem): List[(String, String)] = {
|
def extractLinkage(node: Elem): List[(String, String)] = {
|
||||||
(node \ "linkage").map(n =>(attributeOpt("type",n), n.text)).filter(t => t._1.isDefined).map(t=> (t._1.get, t._2))(collection.breakOut)
|
(node \ "linkage")
|
||||||
|
.map(n => (attributeOpt("type", n), n.text))
|
||||||
|
.filter(t => t._1.isDefined)
|
||||||
|
.map(t => (t._1.get, t._2))(collection.breakOut)
|
||||||
}
|
}
|
||||||
|
|
||||||
def parseXml(input: String): PangaeaDataModel = {
|
def parseXml(input: String): PangaeaDataModel = {
|
||||||
|
@ -59,12 +74,24 @@ object PangaeaUtils {
|
||||||
val relationFiltered = findDOIInRelation(relation)
|
val relationFiltered = findDOIInRelation(relation)
|
||||||
val linkage: List[(String, String)] = extractLinkage(xml)
|
val linkage: List[(String, String)] = extractLinkage(xml)
|
||||||
|
|
||||||
PangaeaDataModel(identifier,title, pType, creators,publisher, dataCenter, subject, language, rights, parentIdentifier, relationFiltered, linkage)
|
PangaeaDataModel(
|
||||||
|
identifier,
|
||||||
|
title,
|
||||||
|
pType,
|
||||||
|
creators,
|
||||||
|
publisher,
|
||||||
|
dataCenter,
|
||||||
|
subject,
|
||||||
|
language,
|
||||||
|
rights,
|
||||||
|
parentIdentifier,
|
||||||
|
relationFiltered,
|
||||||
|
linkage
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def getDatasetAggregator(): Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] =
|
||||||
def getDatasetAggregator(): Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] = new Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel]{
|
new Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] {
|
||||||
|
|
||||||
|
|
||||||
override def zero: PangaeaDataModel = null
|
override def zero: PangaeaDataModel = null
|
||||||
|
|
||||||
|
@ -106,7 +133,4 @@ object PangaeaUtils {
|
||||||
override def outputEncoder: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
|
override def outputEncoder: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
|
@ -11,20 +11,25 @@ import scala.io.Source
|
||||||
|
|
||||||
object SparkGeneratePanagaeaDataset {
|
object SparkGeneratePanagaeaDataset {
|
||||||
|
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/pangaea/pangaea_to_dataset.json")).mkString)
|
val parser = new ArgumentApplicationParser(
|
||||||
|
Source
|
||||||
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/pangaea/pangaea_to_dataset.json")
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
|
|
||||||
|
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(SparkGeneratePanagaeaDataset.getClass.getSimpleName)
|
.appName(SparkGeneratePanagaeaDataset.getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
parser.getObjectMap.asScala.foreach(s => logger.info(s"${s._1} -> ${s._2}"))
|
parser.getObjectMap.asScala.foreach(s => logger.info(s"${s._1} -> ${s._2}"))
|
||||||
logger.info("Converting sequential file into Dataset")
|
logger.info("Converting sequential file into Dataset")
|
||||||
|
@ -34,16 +39,20 @@ object SparkGeneratePanagaeaDataset {
|
||||||
|
|
||||||
implicit val pangaeaEncoders: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
|
implicit val pangaeaEncoders: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
|
||||||
|
|
||||||
val inputRDD: RDD[PangaeaDataModel] = sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s))
|
val inputRDD: RDD[PangaeaDataModel] =
|
||||||
|
sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s))
|
||||||
|
|
||||||
spark.createDataset(inputRDD).as[PangaeaDataModel]
|
spark
|
||||||
|
.createDataset(inputRDD)
|
||||||
|
.as[PangaeaDataModel]
|
||||||
.map(s => (s.identifier, s))(Encoders.tuple(Encoders.STRING, pangaeaEncoders))
|
.map(s => (s.identifier, s))(Encoders.tuple(Encoders.STRING, pangaeaEncoders))
|
||||||
.groupByKey(_._1)(Encoders.STRING)
|
.groupByKey(_._1)(Encoders.STRING)
|
||||||
.agg(PangaeaUtils.getDatasetAggregator().toColumn)
|
.agg(PangaeaUtils.getDatasetAggregator().toColumn)
|
||||||
.map(s => s._2)
|
.map(s => s._2)
|
||||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset")
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.save(s"$workingPath/dataset")
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -30,10 +30,10 @@ class TestApply extends java.io.Serializable{
|
||||||
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||||
implicit val mapEncoderPubInfo: Encoder[Publication] = Encoders.bean(classOf[Publication])
|
implicit val mapEncoderPubInfo: Encoder[Publication] = Encoders.bean(classOf[Publication])
|
||||||
|
|
||||||
|
val pub_ds: Dataset[Publication] =
|
||||||
val pub_ds :Dataset[Publication] = spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[Publication]))
|
spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[Publication]))
|
||||||
val hbm_ds :Dataset[EntityInfo] = spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
val hbm_ds: Dataset[EntityInfo] =
|
||||||
|
spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
||||||
|
|
||||||
assertEquals(13, pub_ds.count())
|
assertEquals(13, pub_ds.count())
|
||||||
|
|
||||||
|
@ -41,7 +41,8 @@ class TestApply extends java.io.Serializable{
|
||||||
|
|
||||||
assertEquals(13, ds.count)
|
assertEquals(13, ds.count)
|
||||||
|
|
||||||
val temp: Dataset[(Publication, Publication)] = pub_ds.joinWith(ds, pub_ds.col("id").equalTo(ds.col("id")), "left")
|
val temp: Dataset[(Publication, Publication)] =
|
||||||
|
pub_ds.joinWith(ds, pub_ds.col("id").equalTo(ds.col("id")), "left")
|
||||||
assertEquals(13, temp.count())
|
assertEquals(13, temp.count())
|
||||||
temp.foreach(t2 => {
|
temp.foreach(t2 => {
|
||||||
val pb: Publication = t2._1
|
val pb: Publication = t2._1
|
||||||
|
@ -50,17 +51,36 @@ class TestApply extends java.io.Serializable{
|
||||||
assertEquals(1, pb.getInstance().size())
|
assertEquals(1, pb.getInstance().size())
|
||||||
assertTrue(t2._1.getId.equals(t2._2.getId))
|
assertTrue(t2._1.getId.equals(t2._2.getId))
|
||||||
if (pb.getId.equals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9")) {
|
if (pb.getId.equals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9")) {
|
||||||
assertTrue(pa.getInstance().get(0).getHostedby.getKey.equals("10|issn___print::e4b6d6d978f67520f6f37679a98c5735"))
|
assertTrue(
|
||||||
|
pa.getInstance()
|
||||||
|
.get(0)
|
||||||
|
.getHostedby
|
||||||
|
.getKey
|
||||||
|
.equals("10|issn___print::e4b6d6d978f67520f6f37679a98c5735")
|
||||||
|
)
|
||||||
assertTrue(pa.getInstance().get(0).getHostedby.getValue.equals("Academic Therapy"))
|
assertTrue(pa.getInstance().get(0).getHostedby.getValue.equals("Academic Therapy"))
|
||||||
assertTrue(pa.getInstance().get(0).getAccessright.getClassid.equals("OPEN"))
|
assertTrue(pa.getInstance().get(0).getAccessright.getClassid.equals("OPEN"))
|
||||||
assertTrue(pa.getInstance().get(0).getAccessright.getClassname.equals("Open Access"))
|
assertTrue(pa.getInstance().get(0).getAccessright.getClassname.equals("Open Access"))
|
||||||
assertTrue(pa.getInstance().get(0).getAccessright.getOpenAccessRoute.equals(OpenAccessRoute.gold))
|
assertTrue(
|
||||||
|
pa.getInstance().get(0).getAccessright.getOpenAccessRoute.equals(OpenAccessRoute.gold)
|
||||||
|
)
|
||||||
assertTrue(pa.getBestaccessright.getClassid.equals("OPEN"))
|
assertTrue(pa.getBestaccessright.getClassid.equals("OPEN"))
|
||||||
assertTrue(pa.getBestaccessright.getClassname.equals("Open Access"))
|
assertTrue(pa.getBestaccessright.getClassname.equals("Open Access"))
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
assertTrue(pb.getInstance().get(0).getHostedby.getKey.equals("10|openaire____::0b74b6a356bbf23c245f9ae9a748745c"))
|
pb.getInstance()
|
||||||
assertTrue(pb.getInstance().get(0).getHostedby.getValue.equals("Revistas de investigación Universidad Nacional Mayor de San Marcos"))
|
.get(0)
|
||||||
|
.getHostedby
|
||||||
|
.getKey
|
||||||
|
.equals("10|openaire____::0b74b6a356bbf23c245f9ae9a748745c")
|
||||||
|
)
|
||||||
|
assertTrue(
|
||||||
|
pb.getInstance()
|
||||||
|
.get(0)
|
||||||
|
.getHostedby
|
||||||
|
.getValue
|
||||||
|
.equals("Revistas de investigación Universidad Nacional Mayor de San Marcos")
|
||||||
|
)
|
||||||
assertTrue(pb.getInstance().get(0).getAccessright.getClassname.equals("not available"))
|
assertTrue(pb.getInstance().get(0).getAccessright.getClassname.equals("not available"))
|
||||||
assertTrue(pb.getInstance().get(0).getAccessright.getClassid.equals("UNKNOWN"))
|
assertTrue(pb.getInstance().get(0).getAccessright.getClassid.equals("UNKNOWN"))
|
||||||
assertTrue(pb.getInstance().get(0).getAccessright.getOpenAccessRoute == null)
|
assertTrue(pb.getInstance().get(0).getAccessright.getOpenAccessRoute == null)
|
||||||
|
@ -68,11 +88,41 @@ class TestApply extends java.io.Serializable{
|
||||||
assertTrue(pb.getBestaccessright.getClassname.equals("not available"))
|
assertTrue(pb.getBestaccessright.getClassname.equals("not available"))
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
assertTrue(pa.getInstance().get(0).getHostedby.getKey.equals(pb.getInstance().get(0).getHostedby.getKey))
|
assertTrue(
|
||||||
assertTrue(pa.getInstance().get(0).getHostedby.getValue.equals(pb.getInstance().get(0).getHostedby.getValue))
|
pa.getInstance()
|
||||||
assertTrue(pa.getInstance().get(0).getAccessright.getClassid.equals(pb.getInstance().get(0).getAccessright.getClassid))
|
.get(0)
|
||||||
assertTrue(pa.getInstance().get(0).getAccessright.getClassname.equals(pb.getInstance().get(0).getAccessright.getClassname))
|
.getHostedby
|
||||||
assertTrue(pa.getInstance().get(0).getAccessright.getOpenAccessRoute == pb.getInstance().get(0).getAccessright.getOpenAccessRoute)
|
.getKey
|
||||||
|
.equals(pb.getInstance().get(0).getHostedby.getKey)
|
||||||
|
)
|
||||||
|
assertTrue(
|
||||||
|
pa.getInstance()
|
||||||
|
.get(0)
|
||||||
|
.getHostedby
|
||||||
|
.getValue
|
||||||
|
.equals(pb.getInstance().get(0).getHostedby.getValue)
|
||||||
|
)
|
||||||
|
assertTrue(
|
||||||
|
pa.getInstance()
|
||||||
|
.get(0)
|
||||||
|
.getAccessright
|
||||||
|
.getClassid
|
||||||
|
.equals(pb.getInstance().get(0).getAccessright.getClassid)
|
||||||
|
)
|
||||||
|
assertTrue(
|
||||||
|
pa.getInstance()
|
||||||
|
.get(0)
|
||||||
|
.getAccessright
|
||||||
|
.getClassname
|
||||||
|
.equals(pb.getInstance().get(0).getAccessright.getClassname)
|
||||||
|
)
|
||||||
|
assertTrue(
|
||||||
|
pa.getInstance().get(0).getAccessright.getOpenAccessRoute == pb
|
||||||
|
.getInstance()
|
||||||
|
.get(0)
|
||||||
|
.getAccessright
|
||||||
|
.getOpenAccessRoute
|
||||||
|
)
|
||||||
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
@ -80,7 +130,6 @@ class TestApply extends java.io.Serializable{
|
||||||
spark.close()
|
spark.close()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testApplyOnDatasource(): Unit = {
|
def testApplyOnDatasource(): Unit = {
|
||||||
val conf = new SparkConf()
|
val conf = new SparkConf()
|
||||||
|
@ -100,10 +149,11 @@ class TestApply extends java.io.Serializable{
|
||||||
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||||
implicit val mapEncoderPubInfo: Encoder[Datasource] = Encoders.bean(classOf[Datasource])
|
implicit val mapEncoderPubInfo: Encoder[Datasource] = Encoders.bean(classOf[Datasource])
|
||||||
|
|
||||||
|
val dats_ds: Dataset[Datasource] =
|
||||||
val dats_ds :Dataset[Datasource] = spark.read.textFile(dats).map(p => mapper.readValue(p, classOf[Datasource]))
|
spark.read.textFile(dats).map(p => mapper.readValue(p, classOf[Datasource]))
|
||||||
val hbm_ds :Dataset[EntityInfo] = Aggregators.datasourceToSingleId(spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo])))
|
val hbm_ds: Dataset[EntityInfo] = Aggregators.datasourceToSingleId(
|
||||||
|
spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
||||||
|
)
|
||||||
|
|
||||||
assertEquals(10, dats_ds.count())
|
assertEquals(10, dats_ds.count())
|
||||||
|
|
||||||
|
@ -111,7 +161,8 @@ class TestApply extends java.io.Serializable{
|
||||||
|
|
||||||
assertEquals(10, ds.count)
|
assertEquals(10, ds.count)
|
||||||
|
|
||||||
val temp: Dataset[(Datasource, Datasource)] = dats_ds.joinWith(ds, dats_ds.col("id").equalTo(ds.col("id")), "left")
|
val temp: Dataset[(Datasource, Datasource)] =
|
||||||
|
dats_ds.joinWith(ds, dats_ds.col("id").equalTo(ds.col("id")), "left")
|
||||||
assertEquals(10, temp.count())
|
assertEquals(10, temp.count())
|
||||||
temp.foreach(t2 => {
|
temp.foreach(t2 => {
|
||||||
val pb: Datasource = t2._1
|
val pb: Datasource = t2._1
|
||||||
|
@ -119,14 +170,23 @@ class TestApply extends java.io.Serializable{
|
||||||
assertTrue(t2._1.getId.equals(t2._2.getId))
|
assertTrue(t2._1.getId.equals(t2._2.getId))
|
||||||
if (pb.getId.equals("10|doajarticles::0ab37b7620eb9a73ac95d3ca4320c97d")) {
|
if (pb.getId.equals("10|doajarticles::0ab37b7620eb9a73ac95d3ca4320c97d")) {
|
||||||
assertTrue(pa.getOpenairecompatibility().getClassid.equals("hostedBy"))
|
assertTrue(pa.getOpenairecompatibility().getClassid.equals("hostedBy"))
|
||||||
assertTrue(pa.getOpenairecompatibility().getClassname.equals("collected from a compatible aggregator"))
|
assertTrue(
|
||||||
|
pa.getOpenairecompatibility()
|
||||||
|
.getClassname
|
||||||
|
.equals("collected from a compatible aggregator")
|
||||||
|
)
|
||||||
|
|
||||||
assertTrue(pb.getOpenairecompatibility().getClassid.equals(ModelConstants.UNKNOWN))
|
assertTrue(pb.getOpenairecompatibility().getClassid.equals(ModelConstants.UNKNOWN))
|
||||||
|
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
assertTrue(pa.getOpenairecompatibility().getClassid.equals(pb.getOpenairecompatibility.getClassid))
|
assertTrue(
|
||||||
assertTrue(pa.getOpenairecompatibility().getClassname.equals(pb.getOpenairecompatibility.getClassname))
|
pa.getOpenairecompatibility().getClassid.equals(pb.getOpenairecompatibility.getClassid)
|
||||||
|
)
|
||||||
|
assertTrue(
|
||||||
|
pa.getOpenairecompatibility()
|
||||||
|
.getClassname
|
||||||
|
.equals(pb.getOpenairecompatibility.getClassname)
|
||||||
|
)
|
||||||
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
|
@ -19,7 +19,6 @@ class TestPrepare extends java.io.Serializable{
|
||||||
write(input)
|
write(input)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testHostedByMaptoEntityInfo(): Unit = {
|
def testHostedByMaptoEntityInfo(): Unit = {
|
||||||
val conf = new SparkConf()
|
val conf = new SparkConf()
|
||||||
|
@ -33,14 +32,14 @@ class TestPrepare extends java.io.Serializable{
|
||||||
.getOrCreate()
|
.getOrCreate()
|
||||||
val hbm = getClass.getResource("hostedbymap.json").getPath
|
val hbm = getClass.getResource("hostedbymap.json").getPath
|
||||||
|
|
||||||
|
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
val mapper: ObjectMapper = new ObjectMapper()
|
val mapper: ObjectMapper = new ObjectMapper()
|
||||||
|
|
||||||
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||||
|
|
||||||
val ds :Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hbm)).map(toEntityInfo)
|
val ds: Dataset[EntityInfo] =
|
||||||
|
spark.createDataset(spark.sparkContext.textFile(hbm)).map(toEntityInfo)
|
||||||
|
|
||||||
ds.foreach(e => println(mapper.writeValueAsString(e)))
|
ds.foreach(e => println(mapper.writeValueAsString(e)))
|
||||||
|
|
||||||
|
@ -71,8 +70,14 @@ class TestPrepare extends java.io.Serializable{
|
||||||
|
|
||||||
assertEquals(2, ds.count)
|
assertEquals(2, ds.count)
|
||||||
|
|
||||||
assertEquals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9", ds.filter(ei => ei.getJournalId.equals("1728-5852")).first().getId)
|
assertEquals(
|
||||||
assertEquals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9", ds.filter(ei => ei.getJournalId.equals("0001-396X")).first().getId)
|
"50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9",
|
||||||
|
ds.filter(ei => ei.getJournalId.equals("1728-5852")).first().getId
|
||||||
|
)
|
||||||
|
assertEquals(
|
||||||
|
"50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9",
|
||||||
|
ds.filter(ei => ei.getJournalId.equals("0001-396X")).first().getId
|
||||||
|
)
|
||||||
|
|
||||||
spark.close()
|
spark.close()
|
||||||
}
|
}
|
||||||
|
@ -95,8 +100,10 @@ class TestPrepare extends java.io.Serializable{
|
||||||
|
|
||||||
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||||
|
|
||||||
val pub_ds :Dataset[EntityInfo] = spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
val pub_ds: Dataset[EntityInfo] =
|
||||||
val hbm_ds :Dataset[EntityInfo] = spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
||||||
|
val hbm_ds: Dataset[EntityInfo] =
|
||||||
|
spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
||||||
|
|
||||||
val ds: Dataset[EntityInfo] = joinResHBM(pub_ds, hbm_ds)
|
val ds: Dataset[EntityInfo] = joinResHBM(pub_ds, hbm_ds)
|
||||||
|
|
||||||
|
@ -131,8 +138,10 @@ class TestPrepare extends java.io.Serializable{
|
||||||
|
|
||||||
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||||
|
|
||||||
val pub_ds :Dataset[EntityInfo] = spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
val pub_ds: Dataset[EntityInfo] =
|
||||||
val hbm_ds :Dataset[EntityInfo] = spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
||||||
|
val hbm_ds: Dataset[EntityInfo] =
|
||||||
|
spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
||||||
|
|
||||||
val ds: Dataset[EntityInfo] = joinResHBM(pub_ds, hbm_ds)
|
val ds: Dataset[EntityInfo] = joinResHBM(pub_ds, hbm_ds)
|
||||||
|
|
||||||
|
@ -150,6 +159,4 @@ class TestPrepare extends java.io.Serializable{
|
||||||
spark.close()
|
spark.close()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,7 +13,6 @@ class TestPreprocess extends java.io.Serializable{
|
||||||
implicit val mapEncoderDats: Encoder[Datasource] = Encoders.kryo[Datasource]
|
implicit val mapEncoderDats: Encoder[Datasource] = Encoders.kryo[Datasource]
|
||||||
implicit val schema = Encoders.product[HostedByInfo]
|
implicit val schema = Encoders.product[HostedByInfo]
|
||||||
|
|
||||||
|
|
||||||
def toHBIString(hbi: HostedByItemType): String = {
|
def toHBIString(hbi: HostedByItemType): String = {
|
||||||
implicit val formats = DefaultFormats
|
implicit val formats = DefaultFormats
|
||||||
|
|
||||||
|
@ -41,19 +40,30 @@ class TestPreprocess extends java.io.Serializable{
|
||||||
assertEquals(5, ds.filter(hbi => !hbi.eissn.equals("")).count)
|
assertEquals(5, ds.filter(hbi => !hbi.eissn.equals("")).count)
|
||||||
assertEquals(0, ds.filter(hbi => !hbi.lissn.equals("")).count)
|
assertEquals(0, ds.filter(hbi => !hbi.lissn.equals("")).count)
|
||||||
|
|
||||||
assertEquals(0, ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count)
|
assertEquals(
|
||||||
|
0,
|
||||||
|
ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count
|
||||||
|
)
|
||||||
|
|
||||||
assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365")).count == 1)
|
assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365")).count == 1)
|
||||||
assertTrue(ds.filter(hbi => hbi.eissn.equals("2253-900X")).count == 1)
|
assertTrue(ds.filter(hbi => hbi.eissn.equals("2253-900X")).count == 1)
|
||||||
assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.eissn.equals("2253-900X")).count == 1)
|
assertTrue(
|
||||||
assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.officialname.equals("Thémata")).count == 1)
|
ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.eissn.equals("2253-900X")).count == 1
|
||||||
assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.id.equals("10|doajarticles::abbc9265bea9ff62776a1c39785af00c")).count == 1)
|
)
|
||||||
|
assertTrue(
|
||||||
|
ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.officialname.equals("Thémata")).count == 1
|
||||||
|
)
|
||||||
|
assertTrue(
|
||||||
|
ds.filter(hbi =>
|
||||||
|
hbi.issn.equals("0212-8365") && hbi.id
|
||||||
|
.equals("10|doajarticles::abbc9265bea9ff62776a1c39785af00c")
|
||||||
|
).count == 1
|
||||||
|
)
|
||||||
ds.foreach(hbi => assertTrue(hbi.id.startsWith("10|")))
|
ds.foreach(hbi => assertTrue(hbi.id.startsWith("10|")))
|
||||||
ds.foreach(hbi => println(toHBIString(hbi)))
|
ds.foreach(hbi => println(toHBIString(hbi)))
|
||||||
spark.close()
|
spark.close()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def readGold(): Unit = {
|
def readGold(): Unit = {
|
||||||
val conf = new SparkConf()
|
val conf = new SparkConf()
|
||||||
|
@ -67,7 +77,6 @@ class TestPreprocess extends java.io.Serializable{
|
||||||
.getOrCreate()
|
.getOrCreate()
|
||||||
val path = getClass.getResource("unibi_transformed.json").getPath
|
val path = getClass.getResource("unibi_transformed.json").getPath
|
||||||
|
|
||||||
|
|
||||||
val ds: Dataset[HostedByItemType] = SparkProduceHostedByMap.goldHostedByDataset(spark, path)
|
val ds: Dataset[HostedByItemType] = SparkProduceHostedByMap.goldHostedByDataset(spark, path)
|
||||||
|
|
||||||
assertEquals(29, ds.count)
|
assertEquals(29, ds.count)
|
||||||
|
@ -76,9 +85,17 @@ class TestPreprocess extends java.io.Serializable{
|
||||||
assertEquals(0, ds.filter(hbi => !hbi.eissn.equals("")).count)
|
assertEquals(0, ds.filter(hbi => !hbi.eissn.equals("")).count)
|
||||||
assertEquals(29, ds.filter(hbi => !hbi.lissn.equals("")).count)
|
assertEquals(29, ds.filter(hbi => !hbi.lissn.equals("")).count)
|
||||||
|
|
||||||
assertEquals(0, ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count)
|
assertEquals(
|
||||||
|
0,
|
||||||
|
ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count
|
||||||
|
)
|
||||||
|
|
||||||
assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).first().officialname.equals("European journal of sustainable development."))
|
assertTrue(
|
||||||
|
ds.filter(hbi => hbi.issn.equals("2239-6101"))
|
||||||
|
.first()
|
||||||
|
.officialname
|
||||||
|
.equals("European journal of sustainable development.")
|
||||||
|
)
|
||||||
assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).first().lissn.equals("2239-5938"))
|
assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).first().lissn.equals("2239-5938"))
|
||||||
assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).count == 1)
|
assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).count == 1)
|
||||||
ds.foreach(hbi => assertTrue(hbi.id.equals(Constants.UNIBI)))
|
ds.foreach(hbi => assertTrue(hbi.id.equals(Constants.UNIBI)))
|
||||||
|
@ -108,9 +125,17 @@ class TestPreprocess extends java.io.Serializable{
|
||||||
assertEquals(21, ds.filter(hbi => !hbi.eissn.equals("")).count)
|
assertEquals(21, ds.filter(hbi => !hbi.eissn.equals("")).count)
|
||||||
assertEquals(0, ds.filter(hbi => !hbi.lissn.equals("")).count)
|
assertEquals(0, ds.filter(hbi => !hbi.lissn.equals("")).count)
|
||||||
|
|
||||||
assertEquals(0, ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count)
|
assertEquals(
|
||||||
|
0,
|
||||||
|
ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count
|
||||||
|
)
|
||||||
|
|
||||||
assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).first().officialname.equals("Journal of Space Technology"))
|
assertTrue(
|
||||||
|
ds.filter(hbi => hbi.issn.equals("2077-3099"))
|
||||||
|
.first()
|
||||||
|
.officialname
|
||||||
|
.equals("Journal of Space Technology")
|
||||||
|
)
|
||||||
assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).first().eissn.equals("2411-5029"))
|
assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).first().eissn.equals("2411-5029"))
|
||||||
assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).count == 1)
|
assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).count == 1)
|
||||||
assertTrue(ds.filter(hbi => hbi.eissn.equals("2077-2955")).first().issn.equals(""))
|
assertTrue(ds.filter(hbi => hbi.eissn.equals("2077-2955")).first().issn.equals(""))
|
||||||
|
@ -133,20 +158,38 @@ class TestPreprocess extends java.io.Serializable{
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.getOrCreate()
|
.getOrCreate()
|
||||||
|
|
||||||
|
val tmp = SparkProduceHostedByMap
|
||||||
val tmp = SparkProduceHostedByMap.oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath)
|
.oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath)
|
||||||
.union(SparkProduceHostedByMap.goldHostedByDataset(spark,getClass.getResource("unibi_transformed.json").getPath))
|
.union(
|
||||||
.union(SparkProduceHostedByMap.doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath))
|
SparkProduceHostedByMap
|
||||||
.flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]))
|
.goldHostedByDataset(spark, getClass.getResource("unibi_transformed.json").getPath)
|
||||||
|
)
|
||||||
|
.union(
|
||||||
|
SparkProduceHostedByMap
|
||||||
|
.doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath)
|
||||||
|
)
|
||||||
|
.flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(
|
||||||
|
Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
|
||||||
|
)
|
||||||
|
|
||||||
assertEquals(106, tmp.count)
|
assertEquals(106, tmp.count)
|
||||||
assertEquals(82, tmp.map(i => i._1)(Encoders.STRING).distinct().count)
|
assertEquals(82, tmp.map(i => i._1)(Encoders.STRING).distinct().count)
|
||||||
|
|
||||||
|
val ds: Dataset[(String, HostedByItemType)] = Aggregators.explodeHostedByItemType(
|
||||||
val ds :Dataset[(String, HostedByItemType)] = Aggregators.explodeHostedByItemType(SparkProduceHostedByMap.oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath)
|
SparkProduceHostedByMap
|
||||||
.union(SparkProduceHostedByMap.goldHostedByDataset(spark,getClass.getResource("unibi_transformed.json").getPath))
|
.oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath)
|
||||||
.union(SparkProduceHostedByMap.doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath))
|
.union(
|
||||||
.flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])))
|
SparkProduceHostedByMap
|
||||||
|
.goldHostedByDataset(spark, getClass.getResource("unibi_transformed.json").getPath)
|
||||||
|
)
|
||||||
|
.union(
|
||||||
|
SparkProduceHostedByMap
|
||||||
|
.doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath)
|
||||||
|
)
|
||||||
|
.flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(
|
||||||
|
Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
assertEquals(82, ds.count)
|
assertEquals(82, ds.count)
|
||||||
|
|
||||||
|
@ -156,14 +199,13 @@ class TestPreprocess extends java.io.Serializable{
|
||||||
assertTrue(ds.filter(i => i._1.equals("2077-3757")).first()._2.openAccess)
|
assertTrue(ds.filter(i => i._1.equals("2077-3757")).first()._2.openAccess)
|
||||||
assertEquals(1, ds.filter(i => i._1.equals("2077-3757")).count)
|
assertEquals(1, ds.filter(i => i._1.equals("2077-3757")).count)
|
||||||
|
|
||||||
val hbmap : Dataset[String] = ds.filter(hbi => hbi._2.id.startsWith("10|")).map(SparkProduceHostedByMap.toHostedByMap)(Encoders.STRING)
|
val hbmap: Dataset[String] = ds
|
||||||
|
.filter(hbi => hbi._2.id.startsWith("10|"))
|
||||||
|
.map(SparkProduceHostedByMap.toHostedByMap)(Encoders.STRING)
|
||||||
|
|
||||||
hbmap.foreach(entry => println(entry))
|
hbmap.foreach(entry => println(entry))
|
||||||
spark.close()
|
spark.close()
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
package eu.dnetlib.dhp.oa.graph.resolution
|
package eu.dnetlib.dhp.oa.graph.resolution
|
||||||
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.schema.common.EntityType
|
import eu.dnetlib.dhp.schema.common.EntityType
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
|
||||||
|
@ -26,65 +25,86 @@ class ResolveEntitiesTest extends Serializable {
|
||||||
|
|
||||||
var sparkSession: Option[SparkSession] = None
|
var sparkSession: Option[SparkSession] = None
|
||||||
|
|
||||||
|
|
||||||
@BeforeAll
|
@BeforeAll
|
||||||
def setUp(): Unit = {
|
def setUp(): Unit = {
|
||||||
workingDir = Files.createTempDirectory(getClass.getSimpleName)
|
workingDir = Files.createTempDirectory(getClass.getSimpleName)
|
||||||
|
|
||||||
val conf = new SparkConf()
|
val conf = new SparkConf()
|
||||||
sparkSession = Some(SparkSession
|
sparkSession = Some(
|
||||||
|
SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master("local[*]").getOrCreate())
|
.master("local[*]")
|
||||||
|
.getOrCreate()
|
||||||
|
)
|
||||||
populateDatasets(sparkSession.get)
|
populateDatasets(sparkSession.get)
|
||||||
generateUpdates(sparkSession.get)
|
generateUpdates(sparkSession.get)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@AfterAll
|
@AfterAll
|
||||||
def tearDown(): Unit = {
|
def tearDown(): Unit = {
|
||||||
FileUtils.deleteDirectory(workingDir.toFile)
|
FileUtils.deleteDirectory(workingDir.toFile)
|
||||||
sparkSession.get.stop()
|
sparkSession.get.stop()
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def generateUpdates(spark: SparkSession): Unit = {
|
def generateUpdates(spark: SparkSession): Unit = {
|
||||||
val template = Source.fromInputStream(this.getClass.getResourceAsStream("updates")).mkString
|
val template = Source.fromInputStream(this.getClass.getResourceAsStream("updates")).mkString
|
||||||
|
|
||||||
|
val pids: List[String] = template.lines
|
||||||
val pids:List[String] = template.lines.map{id =>
|
.map { id =>
|
||||||
val r = new Result
|
val r = new Result
|
||||||
r.setId(id.toLowerCase.trim)
|
r.setId(id.toLowerCase.trim)
|
||||||
r.setSubject(List(OafMapperUtils.structuredProperty(FAKE_SUBJECT, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", "fossiFIgo"), null)).asJava)
|
r.setSubject(
|
||||||
r.setTitle(List(OafMapperUtils.structuredProperty(FAKE_TITLE, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", "fossiFIgo"), null)).asJava)
|
List(
|
||||||
|
OafMapperUtils.structuredProperty(
|
||||||
|
FAKE_SUBJECT,
|
||||||
|
OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
|
||||||
|
null
|
||||||
|
)
|
||||||
|
).asJava
|
||||||
|
)
|
||||||
|
r.setTitle(
|
||||||
|
List(
|
||||||
|
OafMapperUtils.structuredProperty(
|
||||||
|
FAKE_TITLE,
|
||||||
|
OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
|
||||||
|
null
|
||||||
|
)
|
||||||
|
).asJava
|
||||||
|
)
|
||||||
r
|
r
|
||||||
}.map{r =>
|
}
|
||||||
|
.map { r =>
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
mapper.writeValueAsString(r)}.toList
|
mapper.writeValueAsString(r)
|
||||||
|
}
|
||||||
|
.toList
|
||||||
|
|
||||||
val sc = spark.sparkContext
|
val sc = spark.sparkContext
|
||||||
|
|
||||||
println(sc.parallelize(pids).count())
|
println(sc.parallelize(pids).count())
|
||||||
|
|
||||||
spark.createDataset(sc.parallelize(pids))(Encoders.STRING).write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$workingDir/updates")
|
spark
|
||||||
|
.createDataset(sc.parallelize(pids))(Encoders.STRING)
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.text(s"$workingDir/updates")
|
||||||
|
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
implicit val resEncoder: Encoder[Result] = Encoders.bean(classOf[Result])
|
implicit val resEncoder: Encoder[Result] = Encoders.bean(classOf[Result])
|
||||||
val ds = spark.read.text(s"$workingDir/updates").as[String].map{s => val mapper = new ObjectMapper()
|
val ds = spark.read
|
||||||
mapper.readValue(s, classOf[Result])}.collect()
|
.text(s"$workingDir/updates")
|
||||||
|
.as[String]
|
||||||
|
.map { s =>
|
||||||
|
val mapper = new ObjectMapper()
|
||||||
|
mapper.readValue(s, classOf[Result])
|
||||||
|
}
|
||||||
|
.collect()
|
||||||
|
|
||||||
assertEquals(4, ds.length)
|
assertEquals(4, ds.length)
|
||||||
ds.foreach { r => assertNotNull(r.getSubject) }
|
ds.foreach { r => assertNotNull(r.getSubject) }
|
||||||
|
@ -92,30 +112,36 @@ class ResolveEntitiesTest extends Serializable {
|
||||||
ds.foreach { r => assertNotNull(r.getTitle) }
|
ds.foreach { r => assertNotNull(r.getTitle) }
|
||||||
ds.foreach { r => assertEquals(1, r.getTitle.size()) }
|
ds.foreach { r => assertEquals(1, r.getTitle.size()) }
|
||||||
|
|
||||||
|
ds.flatMap(r => r.getTitle.asScala.map(t => t.getValue))
|
||||||
|
.foreach(t => assertEquals(FAKE_TITLE, t))
|
||||||
ds.flatMap(r => r.getTitle.asScala.map(t => t.getValue)).foreach(t => assertEquals(FAKE_TITLE,t))
|
ds.flatMap(r => r.getSubject.asScala.map(t => t.getValue))
|
||||||
ds.flatMap(r => r.getSubject.asScala.map(t => t.getValue)).foreach(t => assertEquals(FAKE_SUBJECT,t))
|
.foreach(t => assertEquals(FAKE_SUBJECT, t))
|
||||||
|
|
||||||
println("generated Updates")
|
println("generated Updates")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def populateDatasets(spark: SparkSession): Unit = {
|
def populateDatasets(spark: SparkSession): Unit = {
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
val entities = SparkResolveEntities.entities
|
val entities = SparkResolveEntities.entities
|
||||||
|
|
||||||
entities.foreach{
|
entities.foreach { e =>
|
||||||
e =>
|
|
||||||
val template = Source.fromInputStream(this.getClass.getResourceAsStream(s"$e")).mkString
|
val template = Source.fromInputStream(this.getClass.getResourceAsStream(s"$e")).mkString
|
||||||
spark.createDataset(spark.sparkContext.parallelize(template.lines.toList)).as[String].write.option("compression", "gzip").text(s"$workingDir/graph/$e")
|
spark
|
||||||
|
.createDataset(spark.sparkContext.parallelize(template.lines.toList))
|
||||||
|
.as[String]
|
||||||
|
.write
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.text(s"$workingDir/graph/$e")
|
||||||
println(s"Created Dataset $e")
|
println(s"Created Dataset $e")
|
||||||
}
|
}
|
||||||
SparkResolveRelation.extractPidResolvedTableFromJsonRDD(spark, s"$workingDir/graph", s"$workingDir/work")
|
SparkResolveRelation.extractPidResolvedTableFromJsonRDD(
|
||||||
|
spark,
|
||||||
|
s"$workingDir/graph",
|
||||||
|
s"$workingDir/work"
|
||||||
|
)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testResolution(): Unit = {
|
def testResolution(): Unit = {
|
||||||
val spark: SparkSession = sparkSession.get
|
val spark: SparkSession = sparkSession.get
|
||||||
|
@ -126,16 +152,15 @@ class ResolveEntitiesTest extends Serializable {
|
||||||
|
|
||||||
assertEquals(3, ds.count())
|
assertEquals(3, ds.count())
|
||||||
|
|
||||||
ds.collect().foreach{
|
ds.collect().foreach { r =>
|
||||||
r =>
|
|
||||||
assertTrue(r.getId.startsWith("50"))
|
assertTrue(r.getId.startsWith("50"))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private def structuredPContainsValue(
|
||||||
|
l: java.util.List[StructuredProperty],
|
||||||
|
exptectedValue: String
|
||||||
private def structuredPContainsValue(l:java.util.List[StructuredProperty], exptectedValue:String):Boolean = {
|
): Boolean = {
|
||||||
l.asScala.exists(p => p.getValue != null && p.getValue.equalsIgnoreCase(exptectedValue))
|
l.asScala.exists(p => p.getValue != null && p.getValue.equalsIgnoreCase(exptectedValue))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -146,47 +171,72 @@ class ResolveEntitiesTest extends Serializable {
|
||||||
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
||||||
val m = new ObjectMapper()
|
val m = new ObjectMapper()
|
||||||
SparkResolveEntities.resolveEntities(spark, s"$workingDir/work", s"$workingDir/updates")
|
SparkResolveEntities.resolveEntities(spark, s"$workingDir/work", s"$workingDir/updates")
|
||||||
SparkResolveEntities.generateResolvedEntities(spark,s"$workingDir/work",s"$workingDir/graph", s"$workingDir/target" )
|
SparkResolveEntities.generateResolvedEntities(
|
||||||
|
spark,
|
||||||
|
s"$workingDir/work",
|
||||||
|
s"$workingDir/graph",
|
||||||
val pubDS:Dataset[Result] = spark.read.text(s"$workingDir/target/publication").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.publication))
|
s"$workingDir/target"
|
||||||
val t = pubDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
|
)
|
||||||
|
|
||||||
|
val pubDS: Dataset[Result] = spark.read
|
||||||
|
.text(s"$workingDir/target/publication")
|
||||||
|
.as[String]
|
||||||
|
.map(s => SparkResolveEntities.deserializeObject(s, EntityType.publication))
|
||||||
|
val t = pubDS
|
||||||
|
.filter(p => p.getTitle != null && p.getSubject != null)
|
||||||
|
.filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE")))
|
||||||
|
.count()
|
||||||
|
|
||||||
var ct = pubDS.count()
|
var ct = pubDS.count()
|
||||||
var et = pubDS.filter(p => p.getTitle!= null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)).count()
|
var et = pubDS
|
||||||
|
.filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty))
|
||||||
|
.count()
|
||||||
|
|
||||||
assertEquals(ct, et)
|
assertEquals(ct, et)
|
||||||
|
|
||||||
|
val datDS: Dataset[Result] = spark.read
|
||||||
|
.text(s"$workingDir/target/dataset")
|
||||||
val datDS:Dataset[Result] = spark.read.text(s"$workingDir/target/dataset").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.dataset))
|
.as[String]
|
||||||
val td = datDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
|
.map(s => SparkResolveEntities.deserializeObject(s, EntityType.dataset))
|
||||||
|
val td = datDS
|
||||||
|
.filter(p => p.getTitle != null && p.getSubject != null)
|
||||||
|
.filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE")))
|
||||||
|
.count()
|
||||||
ct = datDS.count()
|
ct = datDS.count()
|
||||||
et = datDS.filter(p => p.getTitle!= null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)).count()
|
et = datDS
|
||||||
|
.filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty))
|
||||||
|
.count()
|
||||||
assertEquals(ct, et)
|
assertEquals(ct, et)
|
||||||
|
|
||||||
|
val softDS: Dataset[Result] = spark.read
|
||||||
val softDS:Dataset[Result] = spark.read.text(s"$workingDir/target/software").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.software))
|
.text(s"$workingDir/target/software")
|
||||||
val ts = softDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
|
.as[String]
|
||||||
|
.map(s => SparkResolveEntities.deserializeObject(s, EntityType.software))
|
||||||
|
val ts = softDS
|
||||||
|
.filter(p => p.getTitle != null && p.getSubject != null)
|
||||||
|
.filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE")))
|
||||||
|
.count()
|
||||||
ct = softDS.count()
|
ct = softDS.count()
|
||||||
et = softDS.filter(p => p.getTitle!= null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)).count()
|
et = softDS
|
||||||
|
.filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty))
|
||||||
|
.count()
|
||||||
assertEquals(ct, et)
|
assertEquals(ct, et)
|
||||||
|
|
||||||
|
val orpDS: Dataset[Result] = spark.read
|
||||||
val orpDS:Dataset[Result] = spark.read.text(s"$workingDir/target/otherresearchproduct").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.otherresearchproduct))
|
.text(s"$workingDir/target/otherresearchproduct")
|
||||||
val to = orpDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
|
.as[String]
|
||||||
|
.map(s => SparkResolveEntities.deserializeObject(s, EntityType.otherresearchproduct))
|
||||||
|
val to = orpDS
|
||||||
|
.filter(p => p.getTitle != null && p.getSubject != null)
|
||||||
|
.filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE")))
|
||||||
|
.count()
|
||||||
|
|
||||||
ct = orpDS.count()
|
ct = orpDS.count()
|
||||||
et = orpDS.filter(p => p.getTitle!= null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)).count()
|
et = orpDS
|
||||||
|
.filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty))
|
||||||
|
.count()
|
||||||
assertEquals(ct, et)
|
assertEquals(ct, et)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
assertEquals(0, t)
|
assertEquals(0, t)
|
||||||
assertEquals(2, td)
|
assertEquals(2, td)
|
||||||
assertEquals(1, ts)
|
assertEquals(1, ts)
|
||||||
|
@ -194,40 +244,35 @@ class ResolveEntitiesTest extends Serializable {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testMerge(): Unit = {
|
def testMerge(): Unit = {
|
||||||
|
|
||||||
val r = new Result
|
val r = new Result
|
||||||
r.setSubject(List(OafMapperUtils.structuredProperty(FAKE_SUBJECT, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", "fossiFIgo"), null)).asJava)
|
r.setSubject(
|
||||||
|
List(
|
||||||
|
OafMapperUtils.structuredProperty(
|
||||||
|
FAKE_SUBJECT,
|
||||||
|
OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
|
||||||
|
null
|
||||||
|
)
|
||||||
|
).asJava
|
||||||
|
)
|
||||||
|
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
val p = mapper.readValue(Source.fromInputStream(this.getClass.getResourceAsStream(s"publication")).mkString.lines.next(), classOf[Publication])
|
val p = mapper.readValue(
|
||||||
|
Source
|
||||||
|
.fromInputStream(this.getClass.getResourceAsStream(s"publication"))
|
||||||
|
.mkString
|
||||||
|
.lines
|
||||||
|
.next(),
|
||||||
|
classOf[Publication]
|
||||||
|
)
|
||||||
|
|
||||||
r.mergeFrom(p)
|
r.mergeFrom(p)
|
||||||
|
|
||||||
|
|
||||||
println(mapper.writeValueAsString(r))
|
println(mapper.writeValueAsString(r))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,26 +1,20 @@
|
||||||
package eu.dnetlib.dhp.sx.graph
|
package eu.dnetlib.dhp.sx.graph
|
||||||
|
|
||||||
import org.junit.jupiter.api.Test
|
import org.junit.jupiter.api.Test
|
||||||
|
|
||||||
import java.text.SimpleDateFormat
|
import java.text.SimpleDateFormat
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class RetrieveDataciteDeltaTest {
|
class RetrieveDataciteDeltaTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testParsingDate(): Unit = {
|
def testParsingDate(): Unit = {
|
||||||
|
|
||||||
|
|
||||||
val inputDate = "2021-12-02T11:17:36+0000"
|
val inputDate = "2021-12-02T11:17:36+0000"
|
||||||
|
|
||||||
val t = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ").parse(inputDate).getTime
|
val t = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ").parse(inputDate).getTime
|
||||||
|
|
||||||
|
|
||||||
println(t)
|
println(t)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,7 +20,6 @@ import scala.io.Source
|
||||||
@ExtendWith(Array(classOf[MockitoExtension]))
|
@ExtendWith(Array(classOf[MockitoExtension]))
|
||||||
class ScholixGraphTest extends AbstractVocabularyTest {
|
class ScholixGraphTest extends AbstractVocabularyTest {
|
||||||
|
|
||||||
|
|
||||||
val mapper: ObjectMapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
|
val mapper: ObjectMapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
|
||||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
|
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
|
||||||
|
|
||||||
|
@ -30,11 +29,12 @@ class ScholixGraphTest extends AbstractVocabularyTest{
|
||||||
super.setUpVocabulary()
|
super.setUpVocabulary()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testExtractPids(): Unit = {
|
def testExtractPids(): Unit = {
|
||||||
|
|
||||||
val input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/scholix/result.json")).mkString
|
val input = Source
|
||||||
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/scholix/result.json"))
|
||||||
|
.mkString
|
||||||
val res = SparkResolveRelation.extractPidsFromRecord(input)
|
val res = SparkResolveRelation.extractPidsFromRecord(input)
|
||||||
assertNotNull(res)
|
assertNotNull(res)
|
||||||
|
|
||||||
|
@ -44,11 +44,14 @@ class ScholixGraphTest extends AbstractVocabularyTest{
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testOAFToSummary(): Unit = {
|
def testOAFToSummary(): Unit = {
|
||||||
val inputRelations = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/oaf_to_summary")).mkString
|
val inputRelations = Source
|
||||||
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/oaf_to_summary"))
|
||||||
|
.mkString
|
||||||
val items = inputRelations.lines.toList
|
val items = inputRelations.lines.toList
|
||||||
assertNotNull(items)
|
assertNotNull(items)
|
||||||
items.foreach(i => assertTrue(i.nonEmpty))
|
items.foreach(i => assertTrue(i.nonEmpty))
|
||||||
val result = items.map(r => mapper.readValue(r, classOf[Result])).map(i => ScholixUtils.resultToSummary(i))
|
val result =
|
||||||
|
items.map(r => mapper.readValue(r, classOf[Result])).map(i => ScholixUtils.resultToSummary(i))
|
||||||
|
|
||||||
assertNotNull(result)
|
assertNotNull(result)
|
||||||
|
|
||||||
|
@ -59,12 +62,18 @@ class ScholixGraphTest extends AbstractVocabularyTest{
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testScholixMergeOnSource(): Unit = {
|
def testScholixMergeOnSource(): Unit = {
|
||||||
val inputRelations = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/merge_result_scholix")).mkString
|
val inputRelations = Source
|
||||||
val result:List[(Relation,ScholixSummary)] =inputRelations.lines.sliding(2).map(s => (s.head, s(1))).map(p => (mapper.readValue(p._1, classOf[Relation]),mapper.readValue(p._2, classOf[ScholixSummary]) )).toList
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/merge_result_scholix")
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
|
val result: List[(Relation, ScholixSummary)] = inputRelations.lines
|
||||||
|
.sliding(2)
|
||||||
|
.map(s => (s.head, s(1)))
|
||||||
|
.map(p => (mapper.readValue(p._1, classOf[Relation]), mapper.readValue(p._2, classOf[ScholixSummary])))
|
||||||
|
.toList
|
||||||
assertNotNull(result)
|
assertNotNull(result)
|
||||||
assertTrue(result.nonEmpty)
|
assertTrue(result.nonEmpty)
|
||||||
result.foreach(r => assertEquals(r._1.getSource, r._2.getId))
|
result.foreach(r => assertEquals(r._1.getSource, r._2.getId))
|
||||||
|
@ -72,12 +81,13 @@ class ScholixGraphTest extends AbstractVocabularyTest{
|
||||||
println(mapper.writeValueAsString(scholix.head))
|
println(mapper.writeValueAsString(scholix.head))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testScholixRelationshipsClean(): Unit = {
|
def testScholixRelationshipsClean(): Unit = {
|
||||||
val inputRelations = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/relation_transform.json")).mkString
|
val inputRelations = Source
|
||||||
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/relation_transform.json")
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
|
||||||
lazy val json: json4s.JValue = parse(inputRelations)
|
lazy val json: json4s.JValue = parse(inputRelations)
|
||||||
|
@ -89,7 +99,4 @@ class ScholixGraphTest extends AbstractVocabularyTest{
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
39
pom.xml
39
pom.xml
|
@ -620,6 +620,18 @@
|
||||||
</dependency>
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
</plugin>
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.antipathy</groupId>
|
||||||
|
<artifactId>mvn-scalafmt_2.11</artifactId>
|
||||||
|
<version>1.0.1640073709.733712b</version>
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-code-style</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
</plugin>
|
||||||
</plugins>
|
</plugins>
|
||||||
</pluginManagement>
|
</pluginManagement>
|
||||||
<plugins>
|
<plugins>
|
||||||
|
@ -665,6 +677,33 @@
|
||||||
</execution>
|
</execution>
|
||||||
</executions>
|
</executions>
|
||||||
</plugin>
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.antipathy</groupId>
|
||||||
|
<artifactId>mvn-scalafmt_2.11</artifactId>
|
||||||
|
<configuration>
|
||||||
|
<configLocation>dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf</configLocation>
|
||||||
|
<skipTestSources>false</skipTestSources>
|
||||||
|
<skipSources>false</skipSources>
|
||||||
|
<sourceDirectories>
|
||||||
|
<param>${project.basedir}/src/main/scala</param>
|
||||||
|
</sourceDirectories>
|
||||||
|
<testSourceDirectories>
|
||||||
|
<param>${project.basedir}/src/test/scala</param>
|
||||||
|
</testSourceDirectories>
|
||||||
|
<validateOnly>false</validateOnly>
|
||||||
|
<onlyChangedFiles>false</onlyChangedFiles>
|
||||||
|
<branch>: git rev-parse --abbrev-ref HEAD</branch>
|
||||||
|
<useSpecifiedRepositories>false</useSpecifiedRepositories>
|
||||||
|
</configuration>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>validate</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>format</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
<plugin>
|
<plugin>
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
<artifactId>maven-release-plugin</artifactId>
|
<artifactId>maven-release-plugin</artifactId>
|
||||||
|
|
Loading…
Reference in New Issue