forked from D-Net/dnet-hadoop
scalafmt: code formatting
This commit is contained in:
parent
908294d86e
commit
4f212652ca
|
@ -0,0 +1,15 @@
|
|||
style = defaultWithAlign
|
||||
|
||||
align.openParenCallSite = false
|
||||
align.openParenDefnSite = false
|
||||
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
|
||||
continuationIndent.callSite = 2
|
||||
continuationIndent.defnSite = 2
|
||||
danglingParentheses = true
|
||||
indentOperator = spray
|
||||
maxColumn = 100
|
||||
newlines.alwaysBeforeTopLevelStatements = true
|
||||
#project.excludeFilters = ["\.*\.sbt"]
|
||||
rewrite.rules = [RedundantParens, SortImports]
|
||||
spaces.inImportCurlyBraces = false
|
||||
unindentTopLevelOperators = true
|
|
@ -2,68 +2,69 @@ package eu.dnetlib.dhp.application
|
|||
|
||||
import scala.io.Source
|
||||
|
||||
/**
|
||||
* This is the main Interface SparkApplication
|
||||
* from which all Spark Scala classes should inherit
|
||||
*
|
||||
*/
|
||||
/** This is the main Interface SparkApplication
|
||||
* from which all Spark Scala classes should inherit
|
||||
*/
|
||||
trait SparkScalaApplication {
|
||||
/**
|
||||
* This is the path in the classpath of the json
|
||||
* that describes all the arguments needed to run
|
||||
*/
|
||||
|
||||
/** This is the path in the classpath of the json
|
||||
* that describes all the arguments needed to run
|
||||
*/
|
||||
val propertyPath: String
|
||||
|
||||
/**
|
||||
* Utility to parse the arguments using the
|
||||
* property json in the classpath identified from
|
||||
* the variable propertyPath
|
||||
*
|
||||
* @param args the list of arguments
|
||||
*/
|
||||
/** Utility to parse the arguments using the
|
||||
* property json in the classpath identified from
|
||||
* the variable propertyPath
|
||||
*
|
||||
* @param args the list of arguments
|
||||
*/
|
||||
def parseArguments(args: Array[String]): ArgumentApplicationParser = {
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString)
|
||||
val parser = new ArgumentApplicationParser(
|
||||
Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
parser
|
||||
}
|
||||
|
||||
/**
|
||||
* All Spark applications run this method,
|
||||
* where the whole logic of the spark node is defined
|
||||
*/
|
||||
/** All Spark applications run this method,
|
||||
* where the whole logic of the spark node is defined
|
||||
*/
|
||||
def run(): Unit
|
||||
}
|
||||
|
||||
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.slf4j.Logger
|
||||
|
||||
abstract class AbstractScalaApplication (val propertyPath:String, val args:Array[String], log:Logger) extends SparkScalaApplication {
|
||||
abstract class AbstractScalaApplication(
|
||||
val propertyPath: String,
|
||||
val args: Array[String],
|
||||
log: Logger
|
||||
) extends SparkScalaApplication {
|
||||
|
||||
var parser: ArgumentApplicationParser = null
|
||||
|
||||
var spark:SparkSession = null
|
||||
var spark: SparkSession = null
|
||||
|
||||
|
||||
def initialize():SparkScalaApplication = {
|
||||
def initialize(): SparkScalaApplication = {
|
||||
parser = parseArguments(args)
|
||||
spark = createSparkSession()
|
||||
this
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility for creating a spark session starting from parser
|
||||
*
|
||||
* @return a spark Session
|
||||
*/
|
||||
private def createSparkSession():SparkSession = {
|
||||
require(parser!= null)
|
||||
/** Utility for creating a spark session starting from parser
|
||||
*
|
||||
* @return a spark Session
|
||||
*/
|
||||
private def createSparkSession(): SparkSession = {
|
||||
require(parser != null)
|
||||
|
||||
val conf:SparkConf = new SparkConf()
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val master = parser.get("master")
|
||||
log.info(s"Creating Spark session: Master: $master")
|
||||
SparkSession.builder().config(conf)
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(master)
|
||||
.getOrCreate()
|
||||
|
|
|
@ -14,7 +14,6 @@ import scala.io.Source
|
|||
|
||||
object ScholixUtils extends Serializable {
|
||||
|
||||
|
||||
val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier"
|
||||
|
||||
val DATE_RELATION_KEY: String = "RelationDate"
|
||||
|
@ -24,7 +23,11 @@ object ScholixUtils extends Serializable {
|
|||
case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {}
|
||||
|
||||
val relations: Map[String, RelationVocabulary] = {
|
||||
val input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")).mkString
|
||||
val input = Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")
|
||||
)
|
||||
.mkString
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
|
||||
lazy val json: json4s.JValue = parse(input)
|
||||
|
@ -32,13 +35,14 @@ object ScholixUtils extends Serializable {
|
|||
json.extract[Map[String, RelationVocabulary]]
|
||||
}
|
||||
|
||||
|
||||
def extractRelationDate(relation: Relation): String = {
|
||||
|
||||
if (relation.getProperties == null || !relation.getProperties.isEmpty)
|
||||
null
|
||||
else {
|
||||
val date = relation.getProperties.asScala.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey)).map(p => p.getValue)
|
||||
val date = relation.getProperties.asScala
|
||||
.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey))
|
||||
.map(p => p.getValue)
|
||||
if (date.isDefined)
|
||||
date.get
|
||||
else
|
||||
|
@ -58,78 +62,80 @@ object ScholixUtils extends Serializable {
|
|||
def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = {
|
||||
new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName)
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
def generateScholixResourceFromResult(r:Result) :ScholixResource = {
|
||||
def generateScholixResourceFromResult(r: Result): ScholixResource = {
|
||||
generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
|
||||
}
|
||||
|
||||
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
|
||||
new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable {
|
||||
override def zero: RelatedEntities = null
|
||||
|
||||
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable {
|
||||
override def zero: RelatedEntities = null
|
||||
override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = {
|
||||
val relatedDataset = if ("dataset".equalsIgnoreCase(a._2)) a._3 else 0
|
||||
val relatedPublication = if ("publication".equalsIgnoreCase(a._2)) a._3 else 0
|
||||
|
||||
override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = {
|
||||
val relatedDataset = if ("dataset".equalsIgnoreCase(a._2)) a._3 else 0
|
||||
val relatedPublication = if ("publication".equalsIgnoreCase(a._2)) a._3 else 0
|
||||
|
||||
if (b == null)
|
||||
RelatedEntities(a._1, relatedDataset, relatedPublication)
|
||||
else
|
||||
RelatedEntities(a._1, b.relatedDataset + relatedDataset, b.relatedPublication + relatedPublication)
|
||||
}
|
||||
|
||||
override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = {
|
||||
if (b1 != null && b2 != null)
|
||||
RelatedEntities(b1.id, b1.relatedDataset + b2.relatedDataset, b1.relatedPublication + b2.relatedPublication)
|
||||
|
||||
else if (b1 != null)
|
||||
b1
|
||||
else
|
||||
b2
|
||||
}
|
||||
|
||||
override def finish(reduction: RelatedEntities): RelatedEntities = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
|
||||
|
||||
override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
|
||||
}
|
||||
|
||||
|
||||
val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] = new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable {
|
||||
override def zero: Scholix = null
|
||||
|
||||
|
||||
def scholix_complete(s: Scholix): Boolean = {
|
||||
if (s == null || s.getIdentifier == null) {
|
||||
false
|
||||
} else if (s.getSource == null || s.getTarget == null) {
|
||||
false
|
||||
if (b == null)
|
||||
RelatedEntities(a._1, relatedDataset, relatedPublication)
|
||||
else
|
||||
RelatedEntities(
|
||||
a._1,
|
||||
b.relatedDataset + relatedDataset,
|
||||
b.relatedPublication + relatedPublication
|
||||
)
|
||||
}
|
||||
else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
|
||||
false
|
||||
else
|
||||
true
|
||||
|
||||
override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = {
|
||||
if (b1 != null && b2 != null)
|
||||
RelatedEntities(
|
||||
b1.id,
|
||||
b1.relatedDataset + b2.relatedDataset,
|
||||
b1.relatedPublication + b2.relatedPublication
|
||||
)
|
||||
else if (b1 != null)
|
||||
b1
|
||||
else
|
||||
b2
|
||||
}
|
||||
|
||||
override def finish(reduction: RelatedEntities): RelatedEntities = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
|
||||
|
||||
override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
|
||||
}
|
||||
|
||||
override def reduce(b: Scholix, a: (String, Scholix)): Scholix = {
|
||||
if (scholix_complete(b)) b else a._2
|
||||
val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] =
|
||||
new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable {
|
||||
override def zero: Scholix = null
|
||||
|
||||
def scholix_complete(s: Scholix): Boolean = {
|
||||
if (s == null || s.getIdentifier == null) {
|
||||
false
|
||||
} else if (s.getSource == null || s.getTarget == null) {
|
||||
false
|
||||
} else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
|
||||
false
|
||||
else
|
||||
true
|
||||
}
|
||||
|
||||
override def reduce(b: Scholix, a: (String, Scholix)): Scholix = {
|
||||
if (scholix_complete(b)) b else a._2
|
||||
}
|
||||
|
||||
override def merge(b1: Scholix, b2: Scholix): Scholix = {
|
||||
if (scholix_complete(b1)) b1 else b2
|
||||
}
|
||||
|
||||
override def finish(reduction: Scholix): Scholix = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
||||
|
||||
override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
||||
}
|
||||
|
||||
override def merge(b1: Scholix, b2: Scholix): Scholix = {
|
||||
if (scholix_complete(b1)) b1 else b2
|
||||
}
|
||||
|
||||
override def finish(reduction: Scholix): Scholix = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
||||
|
||||
override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
||||
}
|
||||
|
||||
|
||||
def createInverseScholixRelation(scholix: Scholix): Scholix = {
|
||||
val s = new Scholix
|
||||
s.setPublicationDate(scholix.getPublicationDate)
|
||||
|
@ -138,16 +144,19 @@ object ScholixUtils extends Serializable {
|
|||
s.setRelationship(inverseRelationShip(scholix.getRelationship))
|
||||
s.setSource(scholix.getTarget)
|
||||
s.setTarget(scholix.getSource)
|
||||
s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
|
||||
s.setIdentifier(
|
||||
DHPUtils.md5(
|
||||
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
|
||||
)
|
||||
)
|
||||
s
|
||||
|
||||
|
||||
}
|
||||
|
||||
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
|
||||
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
|
||||
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map {
|
||||
d => new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers)
|
||||
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
|
||||
new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers)
|
||||
}(collection.breakOut)
|
||||
l
|
||||
} else List()
|
||||
|
@ -155,8 +164,11 @@ object ScholixUtils extends Serializable {
|
|||
|
||||
def extractCollectedFrom(summary: ScholixSummary): List[ScholixEntityId] = {
|
||||
if (summary.getDatasources != null && !summary.getDatasources.isEmpty) {
|
||||
val l: List[ScholixEntityId] = summary.getDatasources.asScala.map {
|
||||
d => new ScholixEntityId(d.getDatasourceName, List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava)
|
||||
val l: List[ScholixEntityId] = summary.getDatasources.asScala.map { d =>
|
||||
new ScholixEntityId(
|
||||
d.getDatasourceName,
|
||||
List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava
|
||||
)
|
||||
}(collection.breakOut)
|
||||
l
|
||||
} else List()
|
||||
|
@ -165,17 +177,16 @@ object ScholixUtils extends Serializable {
|
|||
def extractCollectedFrom(relation: Relation): List[ScholixEntityId] = {
|
||||
if (relation.getCollectedfrom != null && !relation.getCollectedfrom.isEmpty) {
|
||||
|
||||
|
||||
val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map {
|
||||
c =>
|
||||
|
||||
new ScholixEntityId(c.getValue, List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava)
|
||||
val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map { c =>
|
||||
new ScholixEntityId(
|
||||
c.getValue,
|
||||
List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava
|
||||
)
|
||||
}.toList
|
||||
l
|
||||
} else List()
|
||||
}
|
||||
|
||||
|
||||
def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = {
|
||||
val s = new Scholix
|
||||
s.setPublicationDate(scholix.getPublicationDate)
|
||||
|
@ -184,11 +195,14 @@ object ScholixUtils extends Serializable {
|
|||
s.setRelationship(scholix.getRelationship)
|
||||
s.setSource(scholix.getSource)
|
||||
s.setTarget(generateScholixResourceFromSummary(target))
|
||||
s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
|
||||
s.setIdentifier(
|
||||
DHPUtils.md5(
|
||||
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
|
||||
)
|
||||
)
|
||||
s
|
||||
}
|
||||
|
||||
|
||||
def generateCompleteScholix(scholix: Scholix, target: ScholixResource): Scholix = {
|
||||
val s = new Scholix
|
||||
s.setPublicationDate(scholix.getPublicationDate)
|
||||
|
@ -197,11 +211,14 @@ object ScholixUtils extends Serializable {
|
|||
s.setRelationship(scholix.getRelationship)
|
||||
s.setSource(scholix.getSource)
|
||||
s.setTarget(target)
|
||||
s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
|
||||
s.setIdentifier(
|
||||
DHPUtils.md5(
|
||||
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
|
||||
)
|
||||
)
|
||||
s
|
||||
}
|
||||
|
||||
|
||||
def generateScholixResourceFromSummary(summaryObject: ScholixSummary): ScholixResource = {
|
||||
val r = new ScholixResource
|
||||
r.setIdentifier(summaryObject.getLocalIdentifier)
|
||||
|
@ -214,7 +231,8 @@ object ScholixUtils extends Serializable {
|
|||
r.setTitle(summaryObject.getTitle.get(0))
|
||||
|
||||
if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) {
|
||||
val l: List[ScholixEntityId] = summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
|
||||
val l: List[ScholixEntityId] =
|
||||
summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
|
||||
if (l.nonEmpty)
|
||||
r.setCreator(l.asJava)
|
||||
}
|
||||
|
@ -222,20 +240,27 @@ object ScholixUtils extends Serializable {
|
|||
if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty)
|
||||
r.setPublicationDate(summaryObject.getDate.get(0))
|
||||
if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) {
|
||||
val plist: List[ScholixEntityId] = summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
|
||||
val plist: List[ScholixEntityId] =
|
||||
summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
|
||||
|
||||
if (plist.nonEmpty)
|
||||
r.setPublisher(plist.asJava)
|
||||
}
|
||||
|
||||
|
||||
if (summaryObject.getDatasources != null && !summaryObject.getDatasources.isEmpty) {
|
||||
|
||||
val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala.map(c => new ScholixCollectedFrom(
|
||||
new ScholixEntityId(c.getDatasourceName, List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava)
|
||||
, "collected", "complete"
|
||||
|
||||
)).toList
|
||||
val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala
|
||||
.map(c =>
|
||||
new ScholixCollectedFrom(
|
||||
new ScholixEntityId(
|
||||
c.getDatasourceName,
|
||||
List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava
|
||||
),
|
||||
"collected",
|
||||
"complete"
|
||||
)
|
||||
)
|
||||
.toList
|
||||
|
||||
if (l.nonEmpty)
|
||||
r.setCollectedFrom(l.asJava)
|
||||
|
@ -244,9 +269,7 @@ object ScholixUtils extends Serializable {
|
|||
r
|
||||
}
|
||||
|
||||
|
||||
|
||||
def scholixFromSource(relation: Relation, source: ScholixResource):Scholix = {
|
||||
def scholixFromSource(relation: Relation, source: ScholixResource): Scholix = {
|
||||
if (relation == null || source == null)
|
||||
return null
|
||||
val s = new Scholix
|
||||
|
@ -262,7 +285,6 @@ object ScholixUtils extends Serializable {
|
|||
|
||||
s.setPublicationDate(d)
|
||||
|
||||
|
||||
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
|
||||
s.setPublisher(source.getPublisher)
|
||||
}
|
||||
|
@ -270,13 +292,14 @@ object ScholixUtils extends Serializable {
|
|||
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
|
||||
if (semanticRelation == null)
|
||||
return null
|
||||
s.setRelationship(new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse))
|
||||
s.setRelationship(
|
||||
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
|
||||
)
|
||||
s.setSource(source)
|
||||
|
||||
s
|
||||
}
|
||||
|
||||
|
||||
def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = {
|
||||
|
||||
if (relation == null || source == null)
|
||||
|
@ -298,12 +321,10 @@ object ScholixUtils extends Serializable {
|
|||
|
||||
s.setPublicationDate(d)
|
||||
|
||||
|
||||
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
|
||||
val l: List[ScholixEntityId] = source.getPublisher.asScala
|
||||
.map {
|
||||
p =>
|
||||
new ScholixEntityId(p, null)
|
||||
.map { p =>
|
||||
new ScholixEntityId(p, null)
|
||||
}(collection.breakOut)
|
||||
|
||||
if (l.nonEmpty)
|
||||
|
@ -313,31 +334,37 @@ object ScholixUtils extends Serializable {
|
|||
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
|
||||
if (semanticRelation == null)
|
||||
return null
|
||||
s.setRelationship(new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse))
|
||||
s.setRelationship(
|
||||
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
|
||||
)
|
||||
s.setSource(generateScholixResourceFromSummary(source))
|
||||
|
||||
s
|
||||
}
|
||||
|
||||
def findURLForPID(
|
||||
pidValue: List[StructuredProperty],
|
||||
urls: List[String]
|
||||
): List[(StructuredProperty, String)] = {
|
||||
pidValue.map { p =>
|
||||
val pv = p.getValue
|
||||
|
||||
def findURLForPID(pidValue: List[StructuredProperty], urls: List[String]): List[(StructuredProperty, String)] = {
|
||||
pidValue.map {
|
||||
p =>
|
||||
val pv = p.getValue
|
||||
|
||||
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
|
||||
(p, r.orNull)
|
||||
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
|
||||
(p, r.orNull)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
|
||||
if (r.getInstance() == null || r.getInstance().isEmpty)
|
||||
return List()
|
||||
r.getInstance().asScala.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
|
||||
r.getInstance()
|
||||
.asScala
|
||||
.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
|
||||
.filter(i => i.getPid != null && i.getUrl != null)
|
||||
.flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
|
||||
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2)).distinct.toList
|
||||
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2))
|
||||
.distinct
|
||||
.toList
|
||||
}
|
||||
|
||||
def resultToSummary(r: Result): ScholixSummary = {
|
||||
|
@ -371,7 +398,12 @@ object ScholixUtils extends Serializable {
|
|||
s.setAuthor(authors.asJava)
|
||||
}
|
||||
if (r.getInstance() != null) {
|
||||
val dt: List[String] = r.getInstance().asScala.filter(i => i.getDateofacceptance != null).map(i => i.getDateofacceptance.getValue).toList
|
||||
val dt: List[String] = r
|
||||
.getInstance()
|
||||
.asScala
|
||||
.filter(i => i.getDateofacceptance != null)
|
||||
.map(i => i.getDateofacceptance.getValue)
|
||||
.toList
|
||||
if (dt.nonEmpty)
|
||||
s.setDate(dt.distinct.asJava)
|
||||
}
|
||||
|
@ -382,7 +414,9 @@ object ScholixUtils extends Serializable {
|
|||
}
|
||||
|
||||
if (r.getSubject != null && !r.getSubject.isEmpty) {
|
||||
val subjects: List[SchemeValue] = r.getSubject.asScala.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue)).toList
|
||||
val subjects: List[SchemeValue] = r.getSubject.asScala
|
||||
.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))
|
||||
.toList
|
||||
if (subjects.nonEmpty)
|
||||
s.setSubject(subjects.asJava)
|
||||
}
|
||||
|
@ -391,7 +425,9 @@ object ScholixUtils extends Serializable {
|
|||
s.setPublisher(List(r.getPublisher.getValue).asJava)
|
||||
|
||||
if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) {
|
||||
val cf: List[CollectedFromType] = r.getCollectedfrom.asScala.map(c => new CollectedFromType(c.getValue, c.getKey, "complete")).toList
|
||||
val cf: List[CollectedFromType] = r.getCollectedfrom.asScala
|
||||
.map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))
|
||||
.toList
|
||||
if (cf.nonEmpty)
|
||||
s.setDatasources(cf.distinct.asJava)
|
||||
}
|
||||
|
|
|
@ -7,16 +7,14 @@ import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode}
|
|||
|
||||
object CollectionUtils {
|
||||
|
||||
/**
|
||||
* This method in pipeline to the transformation phase,
|
||||
* generates relations in both directions; typically it should be used as a flatMap step
|
||||
*
|
||||
* @param i input OAF
|
||||
* @return
|
||||
* If the input OAF is an entity -> List(i)
|
||||
* If the input OAF is a relation -> List(relation, inverseRelation)
|
||||
*
|
||||
*/
|
||||
/** This method in pipeline to the transformation phase,
|
||||
* generates relations in both directions; typically it should be used as a flatMap step
|
||||
*
|
||||
* @param i input OAF
|
||||
* @return
|
||||
* If the input OAF is an entity -> List(i)
|
||||
* If the input OAF is a relation -> List(relation, inverseRelation)
|
||||
*/
|
||||
|
||||
def fixRelations(i: Oaf): List[Oaf] = {
|
||||
if (i.isInstanceOf[OafEntity])
|
||||
|
|
|
@ -6,7 +6,6 @@ import org.apache.http.client.methods.{HttpGet, HttpPost, HttpUriRequest}
|
|||
import org.apache.http.entity.StringEntity
|
||||
import org.apache.http.impl.client.HttpClientBuilder
|
||||
|
||||
|
||||
abstract class AbstractRestClient extends Iterator[String] {
|
||||
|
||||
var buffer: List[String] = List()
|
||||
|
@ -16,12 +15,10 @@ abstract class AbstractRestClient extends Iterator[String] {
|
|||
|
||||
var complete: Boolean = false
|
||||
|
||||
|
||||
def extractInfo(input: String): Unit
|
||||
|
||||
protected def getBufferData(): Unit
|
||||
|
||||
|
||||
def doHTTPGETRequest(url: String): String = {
|
||||
val httpGet = new HttpGet(url)
|
||||
doHTTPRequest(httpGet)
|
||||
|
@ -43,7 +40,6 @@ abstract class AbstractRestClient extends Iterator[String] {
|
|||
buffer.nonEmpty && current_index < buffer.size
|
||||
}
|
||||
|
||||
|
||||
override def next(): String = {
|
||||
val next_item: String = buffer(current_index)
|
||||
current_index = current_index + 1
|
||||
|
@ -52,13 +48,14 @@ abstract class AbstractRestClient extends Iterator[String] {
|
|||
next_item
|
||||
}
|
||||
|
||||
|
||||
private def doHTTPRequest[A <: HttpUriRequest](r: A): String = {
|
||||
val timeout = 60; // seconds
|
||||
val config = RequestConfig.custom()
|
||||
val config = RequestConfig
|
||||
.custom()
|
||||
.setConnectTimeout(timeout * 1000)
|
||||
.setConnectionRequestTimeout(timeout * 1000)
|
||||
.setSocketTimeout(timeout * 1000).build()
|
||||
.setSocketTimeout(timeout * 1000)
|
||||
.build()
|
||||
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||
try {
|
||||
var tries = 4
|
||||
|
@ -69,8 +66,7 @@ abstract class AbstractRestClient extends Iterator[String] {
|
|||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||
if (response.getStatusLine.getStatusCode > 400) {
|
||||
tries -= 1
|
||||
}
|
||||
else
|
||||
} else
|
||||
return IOUtils.toString(response.getEntity.getContent)
|
||||
} catch {
|
||||
case e: Throwable =>
|
||||
|
|
|
@ -3,7 +3,8 @@ package eu.dnetlib.dhp.datacite
|
|||
import org.json4s.jackson.JsonMethods.{compact, parse, render}
|
||||
import org.json4s.{DefaultFormats, JValue}
|
||||
|
||||
class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -1) extends AbstractRestClient {
|
||||
class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until: Long = -1)
|
||||
extends AbstractRestClient {
|
||||
|
||||
override def extractInfo(input: String): Unit = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
|
@ -16,15 +17,17 @@ class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -
|
|||
current_index = 0
|
||||
}
|
||||
|
||||
def get_url():String ={
|
||||
val to = if (until> 0) s"$until" else "*"
|
||||
def get_url(): String = {
|
||||
val to = if (until > 0) s"$until" else "*"
|
||||
s"https://api.datacite.org/dois?page[cursor]=1&page[size]=$blocks&query=updated:[$timestamp%20TO%20$to]"
|
||||
|
||||
}
|
||||
|
||||
override def getBufferData(): Unit = {
|
||||
if (!complete) {
|
||||
val response = if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get) else doHTTPGETRequest(get_url())
|
||||
val response =
|
||||
if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get)
|
||||
else doHTTPGETRequest(get_url())
|
||||
extractInfo(response)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -10,24 +10,38 @@ import java.util.Locale
|
|||
import java.util.regex.Pattern
|
||||
import scala.io.Source
|
||||
|
||||
/**
|
||||
* This class represents the data model of the input Dataset of Datacite
|
||||
* @param doi THE DOI
|
||||
* @param timestamp timestamp of last update date
|
||||
* @param isActive the record is active or deleted
|
||||
* @param json the json native records
|
||||
*/
|
||||
/** This class represents the data model of the input Dataset of Datacite
|
||||
* @param doi THE DOI
|
||||
* @param timestamp timestamp of last update date
|
||||
* @param isActive the record is active or deleted
|
||||
* @param json the json native records
|
||||
*/
|
||||
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
|
||||
|
||||
/*
|
||||
The following classes are utility classes used for the mapping from
|
||||
Datacite JSON to the OAF schema
|
||||
*/
|
||||
case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {}
|
||||
case class RelatedIdentifierType(
|
||||
relationType: String,
|
||||
relatedIdentifier: String,
|
||||
relatedIdentifierType: String
|
||||
) {}
|
||||
|
||||
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
|
||||
case class NameIdentifiersType(
|
||||
nameIdentifierScheme: Option[String],
|
||||
schemeUri: Option[String],
|
||||
nameIdentifier: Option[String]
|
||||
) {}
|
||||
|
||||
case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
|
||||
case class CreatorType(
|
||||
nameType: Option[String],
|
||||
nameIdentifiers: Option[List[NameIdentifiersType]],
|
||||
name: Option[String],
|
||||
familyName: Option[String],
|
||||
givenName: Option[String],
|
||||
affiliation: Option[List[String]]
|
||||
) {}
|
||||
|
||||
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
|
||||
|
||||
|
@ -35,100 +49,230 @@ case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {
|
|||
|
||||
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
|
||||
|
||||
case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
|
||||
case class FundingReferenceType(
|
||||
funderIdentifierType: Option[String],
|
||||
awardTitle: Option[String],
|
||||
awardUri: Option[String],
|
||||
funderName: Option[String],
|
||||
funderIdentifier: Option[String],
|
||||
awardNumber: Option[String]
|
||||
) {}
|
||||
|
||||
case class DateType(date: Option[String], dateType: Option[String]) {}
|
||||
|
||||
case class OAFRelations(relation:String, inverse:String, relType:String)
|
||||
case class OAFRelations(relation: String, inverse: String, relType: String)
|
||||
|
||||
|
||||
/** Serializable class with no members of its own; the constants are declared in the
  * `DataciteModelConstants` object.
  */
class DataciteModelConstants extends Serializable
|
||||
|
||||
/** Constants, lookup tables, date formatters and regular expressions shared by the Datacite
  * mapping code.
  */
object DataciteModelConstants {

  val REL_TYPE_VALUE: String = "resultResult"
  val DATE_RELATION_KEY = "RelationDate"

  /** Classpath location of the text file loaded into `datacite_filter`. */
  val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter"
  val DOI_CLASS = "doi"
  val SUBJ_CLASS = "keywords"
  val DATACITE_NAME = "Datacite"

  /** Default provenance information, built by `dataciteDataInfo` with trust "0.9". */
  val dataInfo: DataInfo = dataciteDataInfo("0.9")

  val DATACITE_COLLECTED_FROM: KeyValue =
    OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)

  /** Builds one `subRelTypeMapping` entry, keyed by the relation name itself so that the key
    * and the `relation` field of the value can never diverge.
    */
  private def rel(relation: String, inverse: String, relType: String): (String, OAFRelations) =
    relation -> OAFRelations(relation, inverse, relType)

  /** Maps a relation name to its description (relation, inverse, sub relation type).
    *
    * Every key is listed exactly once: the original table repeated the `DOCUMENTS` and
    * `IS_DOCUMENTED_BY` entries with identical values, which a `Map` silently collapses.
    *
    * NOTE(review): `IS_VARIANT_FORM_OF` and `IS_OBSOLETED_BY` are mapped to inverses
    * (`IS_DERIVED_FROM`, `IS_NEW_VERSION_OF`) whose own entries point elsewhere, so these two
    * pairs are not symmetric. Kept as found - confirm that this is intended.
    */
  val subRelTypeMapping: Map[String, OAFRelations] = Map(
    rel(ModelConstants.REFERENCES, ModelConstants.IS_REFERENCED_BY, ModelConstants.RELATIONSHIP),
    rel(ModelConstants.IS_REFERENCED_BY, ModelConstants.REFERENCES, ModelConstants.RELATIONSHIP),
    rel(
      ModelConstants.IS_SUPPLEMENTED_BY,
      ModelConstants.IS_SUPPLEMENT_TO,
      ModelConstants.SUPPLEMENT
    ),
    rel(
      ModelConstants.IS_SUPPLEMENT_TO,
      ModelConstants.IS_SUPPLEMENTED_BY,
      ModelConstants.SUPPLEMENT
    ),
    rel(ModelConstants.HAS_PART, ModelConstants.IS_PART_OF, ModelConstants.PART),
    rel(ModelConstants.IS_PART_OF, ModelConstants.HAS_PART, ModelConstants.PART),
    rel(ModelConstants.IS_VERSION_OF, ModelConstants.HAS_VERSION, ModelConstants.VERSION),
    rel(ModelConstants.HAS_VERSION, ModelConstants.IS_VERSION_OF, ModelConstants.VERSION),
    rel(
      ModelConstants.IS_IDENTICAL_TO,
      ModelConstants.IS_IDENTICAL_TO,
      ModelConstants.RELATIONSHIP
    ),
    rel(ModelConstants.IS_CONTINUED_BY, ModelConstants.CONTINUES, ModelConstants.RELATIONSHIP),
    rel(ModelConstants.CONTINUES, ModelConstants.IS_CONTINUED_BY, ModelConstants.RELATIONSHIP),
    rel(
      ModelConstants.IS_NEW_VERSION_OF,
      ModelConstants.IS_PREVIOUS_VERSION_OF,
      ModelConstants.VERSION
    ),
    rel(
      ModelConstants.IS_PREVIOUS_VERSION_OF,
      ModelConstants.IS_NEW_VERSION_OF,
      ModelConstants.VERSION
    ),
    rel(ModelConstants.IS_DOCUMENTED_BY, ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP),
    rel(ModelConstants.DOCUMENTS, ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP),
    rel(ModelConstants.IS_SOURCE_OF, ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
    rel(ModelConstants.IS_DERIVED_FROM, ModelConstants.IS_SOURCE_OF, ModelConstants.VERSION),
    rel(ModelConstants.CITES, ModelConstants.IS_CITED_BY, ModelConstants.CITATION),
    rel(ModelConstants.IS_CITED_BY, ModelConstants.CITES, ModelConstants.CITATION),
    rel(ModelConstants.IS_VARIANT_FORM_OF, ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
    rel(ModelConstants.IS_OBSOLETED_BY, ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
    rel(ModelConstants.REVIEWS, ModelConstants.IS_REVIEWED_BY, ModelConstants.REVIEW),
    rel(ModelConstants.IS_REVIEWED_BY, ModelConstants.REVIEWS, ModelConstants.REVIEW),
    rel(ModelConstants.COMPILES, ModelConstants.IS_COMPILED_BY, ModelConstants.RELATIONSHIP),
    rel(ModelConstants.IS_COMPILED_BY, ModelConstants.COMPILES, ModelConstants.RELATIONSHIP)
  )

  /** Lines of the classpath resource `DATACITE_FILTER_PATH`, read once at initialisation.
    *
    * Fails with `IllegalArgumentException` when the resource is missing. The source (and the
    * underlying stream) is closed as soon as the lines have been materialised.
    */
  val datacite_filter: List[String] = {
    val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
    require(stream != null, s"Resource not found on classpath: $DATACITE_FILTER_PATH")
    val source = Source.fromInputStream(stream)
    // toList forces the lazy line iterator before the source is closed
    try source.getLines().toList
    finally source.close()
  }

  /** Builds the provenance `DataInfo` used for Datacite records.
    *
    * @param trust trust value forwarded as last argument to `OafMapperUtils.dataInfo`
    */
  def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(
    false,
    null,
    false,
    false,
    ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
    trust
  )

  /** Formatter accepting several optional date layouts, using the English locale. */
  val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern(
    "[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
    Locale.ENGLISH
  )

  /** Formatter accepting day-month-year layouts, using the Italian locale. */
  val df_it: DateTimeFormatter =
    DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)

  /** Pairs of (grant agreement pattern, identifier prefix paired with that pattern). */
  val funder_regex: List[(Pattern, String)] = List(
    (
      Pattern.compile(
        "(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)",
        Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
      ),
      "40|corda__h2020::"
    ),
    (
      Pattern.compile(
        "(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)",
        Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
      ),
      "40|corda_______::"
    )
  )

  /** Date patterns, listed in the order in which they are meant to be tried. */
  val Date_regex: List[Pattern] = List(
    //Y-M-D
    Pattern.compile(
      "(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])",
      Pattern.MULTILINE
    ),
    //M-D-Y
    Pattern.compile(
      "((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
      Pattern.MULTILINE
    ),
    //D-M-Y
    Pattern.compile(
      "(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
      Pattern.MULTILINE
    ),
    //Y
    Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
  )

}
|
||||
|
|
|
@ -20,19 +20,16 @@ import java.time.format.DateTimeFormatter
|
|||
import java.util.{Date, Locale}
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
|
||||
object DataciteToOAFTransformation {
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
|
||||
/** Tells whether a record has to be discarded: a record is skipped as soon as its raw
  * text contains one of the strings listed in the file `datacite_filter`.
  *
  * @param json the raw json text of the record
  * @return true if the record should be skipped
  */
def skip_record(json: String): Boolean =
  datacite_filter.exists(forbidden => json.contains(forbidden))
|
||||
|
@ -74,35 +71,35 @@ object DataciteToOAFTransformation {
|
|||
|
||||
}
|
||||
|
||||
|
||||
/** Checks whether an embargo end date already lies in the past.
  *
  * @param embargo_end_date date text in `yyyy-MM-dd` form; `LocalDate.parse` throws when the
  *                         text cannot be parsed
  * @return true when today's date is strictly after the given date
  */
def embargo_end(embargo_end_date: String): Boolean = {
  val dayFormat = DateTimeFormatter.ofPattern("[yyyy-MM-dd]")
  val embargoEnd = LocalDate.parse(embargo_end_date, dayFormat)
  LocalDate.now().isAfter(embargoEnd)
}
|
||||
|
||||
|
||||
def extract_date(input: String): Option[String] = {
|
||||
val d = Date_regex.map(pattern => {
|
||||
val matcher = pattern.matcher(input)
|
||||
if (matcher.find())
|
||||
matcher.group(0)
|
||||
else
|
||||
null
|
||||
}
|
||||
).find(s => s != null)
|
||||
val d = Date_regex
|
||||
.map(pattern => {
|
||||
val matcher = pattern.matcher(input)
|
||||
if (matcher.find())
|
||||
matcher.group(0)
|
||||
else
|
||||
null
|
||||
})
|
||||
.find(s => s != null)
|
||||
|
||||
if (d.isDefined) {
|
||||
val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
|
||||
try {
|
||||
return Some(LocalDate.parse(a_date, df_en).toString)
|
||||
} catch {
|
||||
case _: Throwable => try {
|
||||
return Some(LocalDate.parse(a_date, df_it).toString)
|
||||
} catch {
|
||||
case _: Throwable =>
|
||||
return None
|
||||
}
|
||||
case _: Throwable =>
|
||||
try {
|
||||
return Some(LocalDate.parse(a_date, df_it).toString)
|
||||
} catch {
|
||||
case _: Throwable =>
|
||||
return None
|
||||
}
|
||||
}
|
||||
}
|
||||
d
|
||||
|
@ -118,31 +115,63 @@ object DataciteToOAFTransformation {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
/** Resolves the pair of qualifiers describing the type of a Datacite record.
  *
  * The candidate type strings are tried in a fixed order: `resourceType`, then `schemaOrg`,
  * then `resourceTypeGeneral`. Null or empty candidates are ignored. The first candidate that
  * has a synonym in the `ModelConstants.DNET_PUBLICATION_RESOURCE` vocabulary wins; its class
  * id is then looked up in `ModelConstants.DNET_RESULT_TYPOLOGIES` to obtain the second
  * qualifier. Candidates are evaluated lazily, so no vocabulary lookup is performed for the
  * candidates that follow the winning one.
  *
  * @param resourceType        first candidate type string (may be null)
  * @param resourceTypeGeneral last candidate type string (may be null)
  * @param schemaOrg           second candidate type string (may be null)
  * @param vocabularies        vocabulary group used for the synonym lookups
  * @return the pair (publication resource qualifier, result typology qualifier), or `null`
  *         when no candidate can be resolved - callers are expected to null-check the result
  */
def getTypeQualifier(
  resourceType: String,
  resourceTypeGeneral: String,
  schemaOrg: String,
  vocabularies: VocabularyGroup
): (Qualifier, Qualifier) = {
  // an Iterator keeps the pipeline lazy: later candidates are only looked up when needed
  Iterator(resourceType, schemaOrg, resourceTypeGeneral)
    .filter(candidate => candidate != null && candidate.nonEmpty)
    .map(candidate =>
      vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, candidate)
    )
    .find(typeQualifier => typeQualifier != null)
    .map(typeQualifier =>
      (
        typeQualifier,
        vocabularies.getSynonymAsQualifier(
          ModelConstants.DNET_RESULT_TYPOLOGIES,
          typeQualifier.getClassid
        )
      )
    )
    .orNull
}
|
||||
|
||||
|
||||
def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): Result = {
|
||||
val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
|
||||
def getResult(
|
||||
resourceType: String,
|
||||
resourceTypeGeneral: String,
|
||||
schemaOrg: String,
|
||||
vocabularies: VocabularyGroup
|
||||
): Result = {
|
||||
val typeQualifiers: (Qualifier, Qualifier) =
|
||||
getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
|
||||
if (typeQualifiers == null)
|
||||
return null
|
||||
val i = new Instance
|
||||
|
@ -168,13 +197,12 @@ object DataciteToOAFTransformation {
|
|||
null
|
||||
}
|
||||
|
||||
|
||||
def available_date(input: String): Boolean = {
|
||||
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: org.json4s.JValue = parse(input)
|
||||
val l: List[String] = for {
|
||||
JObject(dates) <- json \\ "dates"
|
||||
JObject(dates) <- json \\ "dates"
|
||||
JField("dateType", JString(dateTypes)) <- dates
|
||||
} yield dateTypes
|
||||
|
||||
|
@ -182,18 +210,19 @@ object DataciteToOAFTransformation {
|
|||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* As described in ticket #6377,
|
||||
* when the result comes from figshare we need to remove the subject
|
||||
* and set Access rights OPEN.
|
||||
*
|
||||
* @param r
|
||||
*/
|
||||
/** As described in ticket #6377,
|
||||
* when the result comes from figshare we need to remove the subject
|
||||
* and set Access rights OPEN.
|
||||
*
|
||||
* @param r
|
||||
*/
|
||||
def fix_figshare(r: Result): Unit = {
|
||||
|
||||
if (r.getInstance() != null) {
|
||||
val hosted_by_figshare = r.getInstance().asScala.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
|
||||
val hosted_by_figshare = r
|
||||
.getInstance()
|
||||
.asScala
|
||||
.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
|
||||
if (hosted_by_figshare) {
|
||||
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
|
||||
val l: List[StructuredProperty] = List()
|
||||
|
@ -201,10 +230,8 @@ object DataciteToOAFTransformation {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
|
||||
val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
|
||||
s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
|
||||
|
@ -214,7 +241,13 @@ object DataciteToOAFTransformation {
|
|||
OafMapperUtils.structuredProperty(dt, q, null)
|
||||
}
|
||||
|
||||
def generateRelation(sourceId: String, targetId: String, relClass: String, cf: KeyValue, di: DataInfo): Relation = {
|
||||
def generateRelation(
|
||||
sourceId: String,
|
||||
targetId: String,
|
||||
relClass: String,
|
||||
cf: KeyValue,
|
||||
di: DataInfo
|
||||
): Relation = {
|
||||
|
||||
val r = new Relation
|
||||
r.setSource(sourceId)
|
||||
|
@ -226,7 +259,6 @@ object DataciteToOAFTransformation {
|
|||
r.setDataInfo(di)
|
||||
r
|
||||
|
||||
|
||||
}
|
||||
|
||||
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
|
||||
|
@ -238,14 +270,18 @@ object DataciteToOAFTransformation {
|
|||
val grantId = m.matcher(awardUri).replaceAll("$2")
|
||||
val targetId = s"$p${DHPUtils.md5(grantId)}"
|
||||
List(generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo))
|
||||
}
|
||||
else
|
||||
} else
|
||||
List()
|
||||
|
||||
}
|
||||
|
||||
|
||||
def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = {
|
||||
def generateOAF(
|
||||
input: String,
|
||||
ts: Long,
|
||||
dateOfCollection: Long,
|
||||
vocabularies: VocabularyGroup,
|
||||
exportLinks: Boolean
|
||||
): List[Oaf] = {
|
||||
if (skip_record(input))
|
||||
return List()
|
||||
|
||||
|
@ -253,7 +289,8 @@ object DataciteToOAFTransformation {
|
|||
lazy val json = parse(input)
|
||||
|
||||
val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
|
||||
val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
|
||||
val resourceTypeGeneral =
|
||||
(json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
|
||||
val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)
|
||||
|
||||
val doi = (json \ "attributes" \ "doi").extract[String]
|
||||
|
@ -265,8 +302,12 @@ object DataciteToOAFTransformation {
|
|||
if (result == null)
|
||||
return List()
|
||||
|
||||
|
||||
val doi_q = OafMapperUtils.qualifier("doi", "doi", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES)
|
||||
val doi_q = OafMapperUtils.qualifier(
|
||||
"doi",
|
||||
"doi",
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES
|
||||
)
|
||||
val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
|
||||
result.setPid(List(pid).asJava)
|
||||
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
|
||||
|
@ -275,48 +316,72 @@ object DataciteToOAFTransformation {
|
|||
val d = new Date(dateOfCollection * 1000)
|
||||
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
|
||||
|
||||
|
||||
result.setDateofcollection(ISO8601FORMAT.format(d))
|
||||
result.setDateoftransformation(ISO8601FORMAT.format(d))
|
||||
result.setDataInfo(dataInfo)
|
||||
|
||||
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
|
||||
|
||||
|
||||
val authors = creators.zipWithIndex.map { case (c, idx) =>
|
||||
val a = new Author
|
||||
a.setFullname(c.name.orNull)
|
||||
a.setName(c.givenName.orNull)
|
||||
a.setSurname(c.familyName.orNull)
|
||||
if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
|
||||
a.setPid(c.nameIdentifiers.get.map(ni => {
|
||||
val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(ModelConstants.DNET_PID_TYPES, ni.nameIdentifierScheme.get.toLowerCase()) else null
|
||||
if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
|
||||
OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
|
||||
}
|
||||
else
|
||||
null
|
||||
if (
|
||||
c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null
|
||||
) {
|
||||
a.setPid(
|
||||
c.nameIdentifiers.get
|
||||
.map(ni => {
|
||||
val q =
|
||||
if (ni.nameIdentifierScheme.isDefined)
|
||||
vocabularies.getTermAsQualifier(
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ni.nameIdentifierScheme.get.toLowerCase()
|
||||
)
|
||||
else null
|
||||
if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
|
||||
OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
|
||||
} else
|
||||
null
|
||||
|
||||
}
|
||||
})
|
||||
.asJava
|
||||
)
|
||||
.asJava)
|
||||
}
|
||||
if (c.affiliation.isDefined)
|
||||
a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava)
|
||||
a.setAffiliation(
|
||||
c.affiliation.get
|
||||
.filter(af => af.nonEmpty)
|
||||
.map(af => OafMapperUtils.field(af, dataInfo))
|
||||
.asJava
|
||||
)
|
||||
a.setRank(idx + 1)
|
||||
a
|
||||
}
|
||||
|
||||
|
||||
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
|
||||
|
||||
result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => {
|
||||
if (t.titleType.isEmpty) {
|
||||
OafMapperUtils.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
|
||||
} else {
|
||||
OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, ModelConstants.DNET_DATACITE_TITLE, ModelConstants.DNET_DATACITE_TITLE, null)
|
||||
}
|
||||
}).asJava)
|
||||
result.setTitle(
|
||||
titles
|
||||
.filter(t => t.title.nonEmpty)
|
||||
.map(t => {
|
||||
if (t.titleType.isEmpty) {
|
||||
OafMapperUtils
|
||||
.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
|
||||
} else {
|
||||
OafMapperUtils.structuredProperty(
|
||||
t.title.get,
|
||||
t.titleType.get,
|
||||
t.titleType.get,
|
||||
ModelConstants.DNET_DATACITE_TITLE,
|
||||
ModelConstants.DNET_DATACITE_TITLE,
|
||||
null
|
||||
)
|
||||
}
|
||||
})
|
||||
.asJava
|
||||
)
|
||||
|
||||
if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
|
||||
return List()
|
||||
|
@ -330,53 +395,90 @@ object DataciteToOAFTransformation {
|
|||
.find(d => d.dateType.get.equalsIgnoreCase("issued"))
|
||||
.map(d => extract_date(d.date.get))
|
||||
val a_date: Option[String] = dates
|
||||
.filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
|
||||
.filter(d =>
|
||||
d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available")
|
||||
)
|
||||
.map(d => extract_date(d.date.get))
|
||||
.find(d => d != null && d.isDefined)
|
||||
.map(d => d.get)
|
||||
|
||||
if (a_date.isDefined) {
|
||||
if (doi.startsWith("10.14457"))
|
||||
result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null))
|
||||
result.setEmbargoenddate(
|
||||
OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null)
|
||||
)
|
||||
else
|
||||
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
|
||||
}
|
||||
if (i_date.isDefined && i_date.get.isDefined) {
|
||||
if (doi.startsWith("10.14457")) {
|
||||
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null))
|
||||
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null))
|
||||
}
|
||||
else {
|
||||
result.setDateofacceptance(
|
||||
OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)
|
||||
)
|
||||
result
|
||||
.getInstance()
|
||||
.get(0)
|
||||
.setDateofacceptance(
|
||||
OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)
|
||||
)
|
||||
} else {
|
||||
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
|
||||
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
|
||||
}
|
||||
}
|
||||
else if (publication_year != null) {
|
||||
} else if (publication_year != null) {
|
||||
if (doi.startsWith("10.14457")) {
|
||||
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null))
|
||||
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null))
|
||||
result.setDateofacceptance(
|
||||
OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)
|
||||
)
|
||||
result
|
||||
.getInstance()
|
||||
.get(0)
|
||||
.setDateofacceptance(
|
||||
OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)
|
||||
)
|
||||
|
||||
} else {
|
||||
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
|
||||
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
|
||||
result
|
||||
.getInstance()
|
||||
.get(0)
|
||||
.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
|
||||
.map(d => (extract_date(d.date.get), d.dateType.get))
|
||||
.filter(d => d._1.isDefined)
|
||||
.map(d => (d._1.get, vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())))
|
||||
.filter(d => d._2 != null)
|
||||
.map(d => generateOAFDate(d._1, d._2)).asJava)
|
||||
result.setRelevantdate(
|
||||
dates
|
||||
.filter(d => d.date.isDefined && d.dateType.isDefined)
|
||||
.map(d => (extract_date(d.date.get), d.dateType.get))
|
||||
.filter(d => d._1.isDefined)
|
||||
.map(d =>
|
||||
(
|
||||
d._1.get,
|
||||
vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())
|
||||
)
|
||||
)
|
||||
.filter(d => d._2 != null)
|
||||
.map(d => generateOAFDate(d._1, d._2))
|
||||
.asJava
|
||||
)
|
||||
|
||||
val subjects = (json \\ "subjects").extract[List[SubjectType]]
|
||||
|
||||
result.setSubject(subjects.filter(s => s.subject.nonEmpty)
|
||||
.map(s =>
|
||||
OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
|
||||
).asJava)
|
||||
|
||||
result.setSubject(
|
||||
subjects
|
||||
.filter(s => s.subject.nonEmpty)
|
||||
.map(s =>
|
||||
OafMapperUtils.structuredProperty(
|
||||
s.subject.get,
|
||||
SUBJ_CLASS,
|
||||
SUBJ_CLASS,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
null
|
||||
)
|
||||
)
|
||||
.asJava
|
||||
)
|
||||
|
||||
result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
|
||||
|
||||
|
@ -384,66 +486,86 @@ object DataciteToOAFTransformation {
|
|||
|
||||
result.setDescription(
|
||||
descriptions
|
||||
.filter(d => d.description.isDefined).
|
||||
map(d =>
|
||||
OafMapperUtils.field(d.description.get, null)
|
||||
).filter(s => s != null).asJava)
|
||||
|
||||
.filter(d => d.description.isDefined)
|
||||
.map(d => OafMapperUtils.field(d.description.get, null))
|
||||
.filter(s => s != null)
|
||||
.asJava
|
||||
)
|
||||
|
||||
val publisher = (json \\ "publisher").extractOrElse[String](null)
|
||||
if (publisher != null)
|
||||
result.setPublisher(OafMapperUtils.field(publisher, null))
|
||||
|
||||
|
||||
val language: String = (json \\ "language").extractOrElse[String](null)
|
||||
|
||||
if (language != null)
|
||||
result.setLanguage(vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language))
|
||||
|
||||
result.setLanguage(
|
||||
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language)
|
||||
)
|
||||
|
||||
val instance = result.getInstance().get(0)
|
||||
|
||||
val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String]
|
||||
|
||||
val accessRights: List[String] = for {
|
||||
JObject(rightsList) <- json \\ "rightsList"
|
||||
JObject(rightsList) <- json \\ "rightsList"
|
||||
JField("rightsUri", JString(rightsUri)) <- rightsList
|
||||
} yield rightsUri
|
||||
|
||||
val aRights: Option[AccessRight] = accessRights.map(r => {
|
||||
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
|
||||
}).find(q => q != null).map(q => {
|
||||
val a = new AccessRight
|
||||
a.setClassid(q.getClassid)
|
||||
a.setClassname(q.getClassname)
|
||||
a.setSchemeid(q.getSchemeid)
|
||||
a.setSchemename(q.getSchemename)
|
||||
a
|
||||
})
|
||||
val aRights: Option[AccessRight] = accessRights
|
||||
.map(r => {
|
||||
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
|
||||
})
|
||||
.find(q => q != null)
|
||||
.map(q => {
|
||||
val a = new AccessRight
|
||||
a.setClassid(q.getClassid)
|
||||
a.setClassname(q.getClassname)
|
||||
a.setSchemeid(q.getSchemeid)
|
||||
a.setSchemename(q.getSchemename)
|
||||
a
|
||||
})
|
||||
|
||||
|
||||
val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
|
||||
val access_rights_qualifier =
|
||||
if (aRights.isDefined) aRights.get
|
||||
else
|
||||
OafMapperUtils.accessRight(
|
||||
ModelConstants.UNKNOWN,
|
||||
ModelConstants.NOT_AVAILABLE,
|
||||
ModelConstants.DNET_ACCESS_MODES,
|
||||
ModelConstants.DNET_ACCESS_MODES
|
||||
)
|
||||
|
||||
if (client.isDefined) {
|
||||
|
||||
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID), ModelConstants.UNKNOWN_REPOSITORY.getValue))
|
||||
instance.setHostedby(
|
||||
OafMapperUtils.keyValue(
|
||||
generateDSId(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID),
|
||||
ModelConstants.UNKNOWN_REPOSITORY.getValue
|
||||
)
|
||||
)
|
||||
instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
|
||||
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
|
||||
instance.setAccessright(access_rights_qualifier)
|
||||
instance.setPid(result.getPid)
|
||||
val license = accessRights
|
||||
.find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"))
|
||||
.find(r =>
|
||||
r.startsWith("http") && r.matches(
|
||||
".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"
|
||||
)
|
||||
)
|
||||
if (license.isDefined)
|
||||
instance.setLicense(OafMapperUtils.field(license.get, null))
|
||||
}
|
||||
|
||||
val awardUris: List[String] = for {
|
||||
JObject(fundingReferences) <- json \\ "fundingReferences"
|
||||
JObject(fundingReferences) <- json \\ "fundingReferences"
|
||||
JField("awardUri", JString(awardUri)) <- fundingReferences
|
||||
} yield awardUri
|
||||
|
||||
result.setId(IdentifierFactory.createIdentifier(result))
|
||||
var relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
|
||||
var relations: List[Relation] =
|
||||
awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
|
||||
|
||||
fix_figshare(result)
|
||||
|
||||
|
@ -452,28 +574,35 @@ object DataciteToOAFTransformation {
|
|||
|
||||
if (exportLinks) {
|
||||
val rels: List[RelatedIdentifierType] = for {
|
||||
JObject(relIdentifier) <- json \\ "relatedIdentifiers"
|
||||
JField("relationType", JString(relationType)) <- relIdentifier
|
||||
JObject(relIdentifier) <- json \\ "relatedIdentifiers"
|
||||
JField("relationType", JString(relationType)) <- relIdentifier
|
||||
JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
|
||||
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
|
||||
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
|
||||
} yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
|
||||
|
||||
relations = relations ::: generateRelations(rels, result.getId, if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null)
|
||||
relations = relations ::: generateRelations(
|
||||
rels,
|
||||
result.getId,
|
||||
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null
|
||||
)
|
||||
}
|
||||
if (relations != null && relations.nonEmpty) {
|
||||
List(result) ::: relations
|
||||
}
|
||||
else
|
||||
} else
|
||||
List(result)
|
||||
}
|
||||
|
||||
private def generateRelations(rels: List[RelatedIdentifierType], id: String, date: String): List[Relation] = {
|
||||
private def generateRelations(
|
||||
rels: List[RelatedIdentifierType],
|
||||
id: String,
|
||||
date: String
|
||||
): List[Relation] = {
|
||||
rels
|
||||
.filter(r =>
|
||||
subRelTypeMapping.contains(r.relationType) && (
|
||||
r.relatedIdentifierType.equalsIgnoreCase("doi") ||
|
||||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
|
||||
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
|
||||
subRelTypeMapping
|
||||
.contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
|
||||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
|
||||
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
|
||||
)
|
||||
.map(r => {
|
||||
val rel = new Relation
|
||||
|
@ -490,19 +619,19 @@ object DataciteToOAFTransformation {
|
|||
rel.setProperties(List(dateProps).asJava)
|
||||
|
||||
rel.setSource(id)
|
||||
rel.setTarget(DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType))
|
||||
rel.setTarget(
|
||||
DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
|
||||
)
|
||||
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
|
||||
rel.getCollectedfrom.asScala.map(c => c.getValue).toList
|
||||
rel
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/** Builds a datasource identifier from an original identifier that contains "::".
  *
  * The text before "::" is kept unchanged, the text after it is replaced by the result of
  * `DHPUtils.md5` on that text, and the whole value is prefixed with "10|".
  *
  * NOTE(review): `StringUtils` is presumably Apache Commons Lang, whose
  * substringBefore/substringAfter split on the first occurrence of the separator;
  * confirm against the file's imports, and check the behaviour when "::" is absent.
  *
  * @param input original identifier, expected in the form `<before>::<after>`
  * @return the string `10|<before>::<md5 of after>`
  */
def generateDSId(input: String): String = {
|
||||
val b = StringUtils.substringBefore(input, "::")
|
||||
val a = StringUtils.substringAfter(input, "::")
|
||||
s"10|$b::${DHPUtils.md5(a)}"
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -12,12 +12,12 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
|||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
class GenerateDataciteDatasetSpark(propertyPath: String, args: Array[String], log: Logger)
|
||||
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
|
||||
|
||||
class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log:Logger) extends AbstractScalaApplication(propertyPath, args, log:Logger) {
|
||||
/**
|
||||
* Here all the spark applications runs this method
|
||||
* where the whole logic of the spark node is defined
|
||||
*/
|
||||
/** Every Spark application runs this method,
|
||||
* where the whole logic of the Spark node is defined
|
||||
*/
|
||||
override def run(): Unit = {
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
|
@ -46,49 +46,65 @@ class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log
|
|||
reportTotalSize(targetPath, outputBasePath)
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* For working with MDStore we need to store in a file on hdfs the size of
|
||||
* the current dataset
|
||||
* @param targetPath
|
||||
* @param outputBasePath
|
||||
*/
|
||||
def reportTotalSize( targetPath: String, outputBasePath: String ):Unit = {
|
||||
/** To work with the MDStore, the size of the current dataset has to be stored in a file
  * on HDFS.
  *
  * Counts the rows obtained from `spark.read.text(targetPath)` and writes that count, as a
  * string, through `writeHdfsFile` to the path `outputBasePath + MDSTORE_SIZE_PATH`, using
  * the Hadoop configuration of the current Spark context.
  *
  * @param targetPath path of the dataset whose rows are counted
  * @param outputBasePath base path to which `MDSTORE_SIZE_PATH` is appended to obtain the
  *   file that receives the count
  */
|
||||
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
|
||||
val total_items = spark.read.text(targetPath).count()
|
||||
// NOTE(review): in this listing the single-line call below and the multi-line call after it
// are the same statement before and after reformatting; only one belongs in the source.
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$total_items", outputBasePath + MDSTORE_SIZE_PATH)
|
||||
writeHdfsFile(
|
||||
spark.sparkContext.hadoopConfiguration,
|
||||
s"$total_items",
|
||||
outputBasePath + MDSTORE_SIZE_PATH
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate the transformed and cleaned OAF Dataset from the native one
|
||||
|
||||
* @param sourcePath sourcePath of the native Dataset in format JSON/Datacite
|
||||
* @param exportLinks If true it generates unresolved links
|
||||
* @param vocabularies vocabularies for cleaning
|
||||
* @param targetPath the targetPath of the result Dataset
|
||||
*/
|
||||
def generateDataciteDataset(sourcePath: String, exportLinks: Boolean, vocabularies: VocabularyGroup, targetPath: String, spark:SparkSession):Unit = {
|
||||
require(spark!= null)
|
||||
/** Generates the transformed and cleaned OAF Dataset from the native one.
  *
  * Loads the dataset at `sourcePath` as `DataciteType`, keeps only the records whose
  * `isActive` flag is set, expands each record with
  * `DataciteToOAFTransformation.generateOAF` (the record timestamp is passed for both
  * timestamp arguments), drops null results and hands the resulting dataset to
  * `CollectionUtils.saveDataset` together with `targetPath`.
  *
  * @param sourcePath path of the native Dataset in JSON/Datacite format
  * @param exportLinks if true, unresolved links are generated
  * @param vocabularies vocabularies used for cleaning
  * @param targetPath the target path of the resulting Dataset
  * @param spark the Spark session used to read the source; must not be null (checked with
  *   `require`)
  */
|
||||
def generateDataciteDataset(
|
||||
sourcePath: String,
|
||||
exportLinks: Boolean,
|
||||
vocabularies: VocabularyGroup,
|
||||
targetPath: String,
|
||||
spark: SparkSession
|
||||
): Unit = {
|
||||
require(spark != null)
|
||||
import spark.implicits._
|
||||
|
||||
// Kryo-based encoders made implicitly available for MetadataRecord and Oaf values
implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
|
||||
|
||||
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
CollectionUtils.saveDataset(
|
||||
// NOTE(review): in this listing each single-line expression that is immediately followed by
// an equivalent multi-line form is the same code before and after reformatting.
spark.read.load(sourcePath).as[DataciteType]
|
||||
spark.read
|
||||
.load(sourcePath)
|
||||
.as[DataciteType]
|
||||
.filter(d => d.isActive)
|
||||
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks))
|
||||
.flatMap(d =>
|
||||
DataciteToOAFTransformation
|
||||
.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks)
|
||||
)
|
||||
.filter(d => d != null),
|
||||
targetPath)
|
||||
targetPath
|
||||
)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/** Command-line entry point for [[GenerateDataciteDatasetSpark]].
  *
  * `main` creates the application with the resource path
  * "/eu/dnetlib/dhp/datacite/generate_dataset_params.json" (presumably the classpath JSON
  * describing the accepted arguments; confirm in AbstractScalaApplication), the raw
  * command-line arguments and the logger below, then calls `initialize()` followed by `run()`.
  */
object GenerateDataciteDatasetSpark {
|
||||
|
||||
// SLF4J logger obtained for this object's class and passed to the application instance
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
// NOTE(review): in this listing the single-line call below and the multi-line call after it
// are the same statement before and after reformatting; only one belongs in the source.
new GenerateDataciteDatasetSpark("/eu/dnetlib/dhp/datacite/generate_dataset_params.json", args, log).initialize().run()
|
||||
new GenerateDataciteDatasetSpark(
|
||||
"/eu/dnetlib/dhp/datacite/generate_dataset_params.json",
|
||||
args,
|
||||
log
|
||||
).initialize().run()
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,7 +22,6 @@ object ImportDatacite {
|
|||
|
||||
val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass)
|
||||
|
||||
|
||||
def convertAPIStringToDataciteItem(input: String): DataciteType = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: org.json4s.JValue = parse(input)
|
||||
|
@ -32,14 +31,26 @@ object ImportDatacite {
|
|||
|
||||
val timestamp_string = (json \ "attributes" \ "updated").extract[String]
|
||||
val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME)
|
||||
DataciteType(doi = doi, timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000, isActive = isActive, json = input)
|
||||
DataciteType(
|
||||
doi = doi,
|
||||
timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000,
|
||||
isActive = isActive,
|
||||
json = input
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json")).mkString)
|
||||
val parser = new ArgumentApplicationParser(
|
||||
Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json"
|
||||
)
|
||||
)
|
||||
.mkString
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val master = parser.get("master")
|
||||
|
||||
|
@ -60,7 +71,8 @@ object ImportDatacite {
|
|||
val spkipImport = parser.get("skipImport")
|
||||
log.info(s"skipImport is $spkipImport")
|
||||
|
||||
val spark: SparkSession = SparkSession.builder()
|
||||
val spark: SparkSession = SparkSession
|
||||
.builder()
|
||||
.appName(ImportDatacite.getClass.getSimpleName)
|
||||
.master(master)
|
||||
.getOrCreate()
|
||||
|
@ -78,45 +90,48 @@ object ImportDatacite {
|
|||
|
||||
import spark.implicits._
|
||||
|
||||
val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] =
|
||||
new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
|
||||
|
||||
val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] = new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
|
||||
override def zero: DataciteType = null
|
||||
|
||||
override def zero: DataciteType = null
|
||||
|
||||
override def reduce(a: DataciteType, b: DataciteType): DataciteType = {
|
||||
if (b == null)
|
||||
return a
|
||||
if (a == null)
|
||||
return b
|
||||
if (a.timestamp > b.timestamp) {
|
||||
return a
|
||||
override def reduce(a: DataciteType, b: DataciteType): DataciteType = {
|
||||
if (b == null)
|
||||
return a
|
||||
if (a == null)
|
||||
return b
|
||||
if (a.timestamp > b.timestamp) {
|
||||
return a
|
||||
}
|
||||
b
|
||||
}
|
||||
b
|
||||
|
||||
override def merge(a: DataciteType, b: DataciteType): DataciteType = {
|
||||
reduce(a, b)
|
||||
}
|
||||
|
||||
override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
|
||||
|
||||
override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
|
||||
|
||||
override def finish(reduction: DataciteType): DataciteType = reduction
|
||||
}
|
||||
|
||||
override def merge(a: DataciteType, b: DataciteType): DataciteType = {
|
||||
reduce(a, b)
|
||||
}
|
||||
|
||||
override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
|
||||
|
||||
override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
|
||||
|
||||
override def finish(reduction: DataciteType): DataciteType = reduction
|
||||
}
|
||||
|
||||
val dump: Dataset[DataciteType] = spark.read.load(dataciteDump).as[DataciteType]
|
||||
val ts = dump.select(max("timestamp")).first().getLong(0)
|
||||
|
||||
println(s"last Timestamp is $ts")
|
||||
|
||||
val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
|
||||
val cnt =
|
||||
if ("true".equalsIgnoreCase(spkipImport)) 1
|
||||
else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
|
||||
|
||||
println(s"Imported from Datacite API $cnt documents")
|
||||
|
||||
if (cnt > 0) {
|
||||
|
||||
val inputRdd: RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[Int], classOf[Text])
|
||||
val inputRdd: RDD[DataciteType] = sc
|
||||
.sequenceFile(targetPath, classOf[Int], classOf[Text])
|
||||
.map(s => s._2.toString)
|
||||
.map(s => convertAPIStringToDataciteItem(s))
|
||||
spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset")
|
||||
|
@ -129,7 +144,9 @@ object ImportDatacite {
|
|||
.agg(dataciteAggregator.toColumn)
|
||||
.map(s => s._2)
|
||||
.repartition(4000)
|
||||
.write.mode(SaveMode.Overwrite).save(s"${dataciteDump}_updated")
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"${dataciteDump}_updated")
|
||||
|
||||
val fs = FileSystem.get(sc.hadoopConfiguration)
|
||||
fs.delete(new Path(s"$dataciteDump"), true)
|
||||
|
@ -137,14 +154,24 @@ object ImportDatacite {
|
|||
}
|
||||
}
|
||||
|
||||
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs: Int): Long = {
|
||||
private def writeSequenceFile(
|
||||
hdfsTargetPath: Path,
|
||||
timestamp: Long,
|
||||
conf: Configuration,
|
||||
bs: Int
|
||||
): Long = {
|
||||
var from: Long = timestamp * 1000
|
||||
val delta: Long = 100000000L
|
||||
var client: DataciteAPIImporter = null
|
||||
val now: Long = System.currentTimeMillis()
|
||||
var i = 0
|
||||
try {
|
||||
val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text]))
|
||||
val writer = SequenceFile.createWriter(
|
||||
conf,
|
||||
SequenceFile.Writer.file(hdfsTargetPath),
|
||||
SequenceFile.Writer.keyClass(classOf[IntWritable]),
|
||||
SequenceFile.Writer.valueClass(classOf[Text])
|
||||
)
|
||||
try {
|
||||
var start: Long = System.currentTimeMillis
|
||||
while (from < now) {
|
||||
|
@ -153,16 +180,16 @@ object ImportDatacite {
|
|||
val key: IntWritable = new IntWritable(i)
|
||||
val value: Text = new Text
|
||||
while (client.hasNext) {
|
||||
key.set({
|
||||
key.set {
|
||||
i += 1;
|
||||
i - 1
|
||||
})
|
||||
}
|
||||
value.set(client.next())
|
||||
writer.append(key, value)
|
||||
writer.hflush()
|
||||
if (i % 1000 == 0) {
|
||||
end = System.currentTimeMillis
|
||||
val time = (end - start) / 1000.0F
|
||||
val time = (end - start) / 1000.0f
|
||||
println(s"Imported $i in $time seconds")
|
||||
start = System.currentTimeMillis
|
||||
}
|
||||
|
@ -174,8 +201,7 @@ object ImportDatacite {
|
|||
case e: Throwable =>
|
||||
println("Error", e)
|
||||
} finally if (writer != null) writer.close()
|
||||
}
|
||||
catch {
|
||||
} catch {
|
||||
case e: Throwable =>
|
||||
log.error("Error", e)
|
||||
}
|
||||
|
|
|
@ -17,7 +17,13 @@ object SparkDownloadUpdateDatacite {
|
|||
def main(args: Array[String]): Unit = {
|
||||
|
||||
val conf = new SparkConf
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")).mkString)
|
||||
val parser = new ArgumentApplicationParser(
|
||||
Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")
|
||||
)
|
||||
.mkString
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val master = parser.get("master")
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
|
@ -26,8 +32,9 @@ object SparkDownloadUpdateDatacite {
|
|||
val hdfsuri = parser.get("namenode")
|
||||
log.info(s"namenode is $hdfsuri")
|
||||
|
||||
|
||||
val spark: SparkSession = SparkSession.builder().config(conf)
|
||||
val spark: SparkSession = SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(master)
|
||||
.getOrCreate()
|
||||
|
@ -37,13 +44,18 @@ object SparkDownloadUpdateDatacite {
|
|||
|
||||
import spark.implicits._
|
||||
|
||||
|
||||
val maxDate: String = spark.read.load(workingPath).as[Oaf].filter(s => s.isInstanceOf[Result]).map(r => r.asInstanceOf[Result].getDateofcollection).select(max("value")).first().getString(0)
|
||||
val maxDate: String = spark.read
|
||||
.load(workingPath)
|
||||
.as[Oaf]
|
||||
.filter(s => s.isInstanceOf[Result])
|
||||
.map(r => r.asInstanceOf[Result].getDateofcollection)
|
||||
.select(max("value"))
|
||||
.first()
|
||||
.getString(0)
|
||||
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
|
||||
val string_to_date = ISO8601FORMAT.parse(maxDate)
|
||||
val ts = string_to_date.getTime
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -12,39 +12,81 @@ object BioDBToOAF {
|
|||
|
||||
case class EBILinkItem(id: Long, links: String) {}
|
||||
|
||||
case class EBILinks(relType: String, date: String, title: String, pmid: String, targetPid: String, targetPidType: String, targetUrl: String) {}
|
||||
case class EBILinks(
|
||||
relType: String,
|
||||
date: String,
|
||||
title: String,
|
||||
pmid: String,
|
||||
targetPid: String,
|
||||
targetPidType: String,
|
||||
targetUrl: String
|
||||
) {}
|
||||
|
||||
case class UniprotDate(date: String, date_info: String) {}
|
||||
|
||||
case class ScholixResolved(pid: String, pidType: String, typology: String, tilte: List[String], datasource: List[String], date: List[String], authors: List[String]) {}
|
||||
case class ScholixResolved(
|
||||
pid: String,
|
||||
pidType: String,
|
||||
typology: String,
|
||||
tilte: List[String],
|
||||
datasource: List[String],
|
||||
date: List[String],
|
||||
authors: List[String]
|
||||
) {}
|
||||
|
||||
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
|
||||
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(
|
||||
false,
|
||||
null,
|
||||
false,
|
||||
false,
|
||||
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
|
||||
"0.9"
|
||||
)
|
||||
val SUBJ_CLASS = "Keywords"
|
||||
|
||||
val DATE_RELATION_KEY = "RelationDate"
|
||||
|
||||
val resolvedURL: Map[String, String] = Map(
|
||||
"genbank" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
|
||||
"ncbi-n" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
|
||||
"ncbi-wgs" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
|
||||
"ncbi-p" -> "https://www.ncbi.nlm.nih.gov/protein/",
|
||||
"ena" -> "https://www.ebi.ac.uk/ena/browser/view/",
|
||||
"genbank" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
|
||||
"ncbi-n" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
|
||||
"ncbi-wgs" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
|
||||
"ncbi-p" -> "https://www.ncbi.nlm.nih.gov/protein/",
|
||||
"ena" -> "https://www.ebi.ac.uk/ena/browser/view/",
|
||||
"clinicaltrials.gov" -> "https://clinicaltrials.gov/ct2/show/",
|
||||
"onim" -> "https://omim.org/entry/",
|
||||
"refseq" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
|
||||
"geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
|
||||
"onim" -> "https://omim.org/entry/",
|
||||
"refseq" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
|
||||
"geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
|
||||
)
|
||||
|
||||
|
||||
val collectedFromMap: Map[String, KeyValue] = {
|
||||
val PDBCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2", "Protein Data Bank")
|
||||
val enaCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::c2a591f440598b63d854556beaf01591", "European Nucleotide Archive")
|
||||
val ncbiCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6", "NCBI Nucleotide")
|
||||
val UNIPROTCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::296e1abaf1302897a6838d3588cd0310", "UniProtKB/Swiss-Prot")
|
||||
val ElsevierCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
|
||||
val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature")
|
||||
val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::83e60e09c222f206c725385f53d7e567c", "EMBL-EBIs Protein Data Bank in Europe (PDBe)")
|
||||
val pubmedCollectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
|
||||
val PDBCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||
"10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2",
|
||||
"Protein Data Bank"
|
||||
)
|
||||
val enaCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||
"10|re3data_____::c2a591f440598b63d854556beaf01591",
|
||||
"European Nucleotide Archive"
|
||||
)
|
||||
val ncbiCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||
"10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6",
|
||||
"NCBI Nucleotide"
|
||||
)
|
||||
val UNIPROTCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||
"10|re3data_____::296e1abaf1302897a6838d3588cd0310",
|
||||
"UniProtKB/Swiss-Prot"
|
||||
)
|
||||
val ElsevierCollectedFrom: KeyValue =
|
||||
OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
|
||||
val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||
"10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e",
|
||||
"Springer Nature"
|
||||
)
|
||||
val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||
"10|opendoar____::83e60e09c222f206c725385f53d7e567c",
|
||||
"EMBL-EBIs Protein Data Bank in Europe (PDBe)"
|
||||
)
|
||||
val pubmedCollectedFrom: KeyValue =
|
||||
OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
|
||||
|
||||
UNIPROTCollectedFrom.setDataInfo(DATA_INFO)
|
||||
PDBCollectedFrom.setDataInfo(DATA_INFO)
|
||||
|
@ -56,14 +98,14 @@ object BioDBToOAF {
|
|||
springerNatureCollectedFrom.setDataInfo(DATA_INFO)
|
||||
|
||||
Map(
|
||||
"uniprot" -> UNIPROTCollectedFrom,
|
||||
"pdb" -> PDBCollectedFrom,
|
||||
"elsevier" -> ElsevierCollectedFrom,
|
||||
"ebi" -> EBICollectedFrom,
|
||||
"Springer Nature" -> springerNatureCollectedFrom,
|
||||
"NCBI Nucleotide" -> ncbiCollectedFrom,
|
||||
"uniprot" -> UNIPROTCollectedFrom,
|
||||
"pdb" -> PDBCollectedFrom,
|
||||
"elsevier" -> ElsevierCollectedFrom,
|
||||
"ebi" -> EBICollectedFrom,
|
||||
"Springer Nature" -> springerNatureCollectedFrom,
|
||||
"NCBI Nucleotide" -> ncbiCollectedFrom,
|
||||
"European Nucleotide Archive" -> enaCollectedFrom,
|
||||
"Europe PMC" -> pubmedCollectedFrom
|
||||
"Europe PMC" -> pubmedCollectedFrom
|
||||
)
|
||||
}
|
||||
|
||||
|
@ -80,18 +122,32 @@ object BioDBToOAF {
|
|||
|
||||
val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String])
|
||||
|
||||
createRelation(target_pid, target_pid_type, generate_unresolved_id(source_pid, source_pid_type), collectedFromMap("elsevier"), "relationship", relation_semantic, date)
|
||||
createRelation(
|
||||
target_pid,
|
||||
target_pid_type,
|
||||
generate_unresolved_id(source_pid, source_pid_type),
|
||||
collectedFromMap("elsevier"),
|
||||
"relationship",
|
||||
relation_semantic,
|
||||
date
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
||||
def scholixResolvedToOAF(input: ScholixResolved): Oaf = {
|
||||
|
||||
val d = new Dataset
|
||||
|
||||
d.setPid(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(input.pid.toLowerCase, input.pidType.toLowerCase, input.pidType.toLowerCase, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
|
||||
OafMapperUtils.structuredProperty(
|
||||
input.pid.toLowerCase,
|
||||
input.pidType.toLowerCase,
|
||||
input.pidType.toLowerCase,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
DATA_INFO
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
|
@ -101,7 +157,15 @@ object BioDBToOAF {
|
|||
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.pid.toLowerCase}", true))
|
||||
|
||||
if (input.tilte != null && input.tilte.nonEmpty)
|
||||
d.setTitle(List(OafMapperUtils.structuredProperty(input.tilte.head, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
|
||||
d.setTitle(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
input.tilte.head,
|
||||
ModelConstants.MAIN_TITLE_QUALIFIER,
|
||||
DATA_INFO
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
d.setOriginalId(List(input.pid).asJava)
|
||||
val i = new Instance
|
||||
|
@ -113,9 +177,23 @@ object BioDBToOAF {
|
|||
}
|
||||
|
||||
if (input.pidType.equalsIgnoreCase("clinicaltrials.gov"))
|
||||
i.setInstancetype(OafMapperUtils.qualifier("0037", "Clinical Trial", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
||||
i.setInstancetype(
|
||||
OafMapperUtils.qualifier(
|
||||
"0037",
|
||||
"Clinical Trial",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
)
|
||||
else
|
||||
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
||||
i.setInstancetype(
|
||||
OafMapperUtils.qualifier(
|
||||
"0046",
|
||||
"Bioentity",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
)
|
||||
|
||||
if (input.datasource == null || input.datasource.isEmpty)
|
||||
return null
|
||||
|
@ -141,7 +219,6 @@ object BioDBToOAF {
|
|||
d
|
||||
}
|
||||
|
||||
|
||||
def uniprotToOAF(input: String): List[Oaf] = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
|
@ -151,7 +228,14 @@ object BioDBToOAF {
|
|||
|
||||
d.setPid(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(pid, "uniprot", "uniprot", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
|
||||
OafMapperUtils.structuredProperty(
|
||||
pid,
|
||||
"uniprot",
|
||||
"uniprot",
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
DATA_INFO
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
|
@ -162,32 +246,52 @@ object BioDBToOAF {
|
|||
val title: String = (json \ "title").extractOrElse[String](null)
|
||||
|
||||
if (title != null)
|
||||
d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
|
||||
d.setTitle(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)
|
||||
).asJava
|
||||
)
|
||||
|
||||
d.setOriginalId(List(pid).asJava)
|
||||
val i = new Instance
|
||||
|
||||
i.setPid(d.getPid)
|
||||
i.setUrl(List(s"https://www.uniprot.org/uniprot/$pid").asJava)
|
||||
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
||||
i.setInstancetype(
|
||||
OafMapperUtils.qualifier(
|
||||
"0046",
|
||||
"Bioentity",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
)
|
||||
|
||||
i.setCollectedfrom(collectedFromMap("uniprot"))
|
||||
d.setInstance(List(i).asJava)
|
||||
|
||||
val dates: List[UniprotDate] = for {
|
||||
JObject(dateOBJ) <- json \ "dates"
|
||||
JField("date", JString(date)) <- dateOBJ
|
||||
JObject(dateOBJ) <- json \ "dates"
|
||||
JField("date", JString(date)) <- dateOBJ
|
||||
JField("date_info", JString(date_info)) <- dateOBJ
|
||||
} yield UniprotDate(GraphCleaningFunctions.cleanDate(date), date_info)
|
||||
|
||||
val subjects: List[String] = (json \\ "subjects").extractOrElse[List[String]](null)
|
||||
|
||||
|
||||
if (subjects != null) {
|
||||
d.setSubject(
|
||||
subjects.map(s =>
|
||||
OafMapperUtils.structuredProperty(s, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
|
||||
).asJava)
|
||||
subjects
|
||||
.map(s =>
|
||||
OafMapperUtils.structuredProperty(
|
||||
s,
|
||||
SUBJ_CLASS,
|
||||
SUBJ_CLASS,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
null
|
||||
)
|
||||
)
|
||||
.asJava
|
||||
)
|
||||
}
|
||||
var i_date: Option[UniprotDate] = None
|
||||
|
||||
|
@ -197,45 +301,73 @@ object BioDBToOAF {
|
|||
i.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
|
||||
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
|
||||
}
|
||||
val relevant_dates: List[StructuredProperty] = dates.filter(d => !d.date_info.contains("entry version"))
|
||||
.map(date => OafMapperUtils.structuredProperty(date.date, ModelConstants.UNKNOWN, ModelConstants.UNKNOWN, ModelConstants.DNET_DATACITE_DATE, ModelConstants.DNET_DATACITE_DATE, DATA_INFO))
|
||||
val relevant_dates: List[StructuredProperty] = dates
|
||||
.filter(d => !d.date_info.contains("entry version"))
|
||||
.map(date =>
|
||||
OafMapperUtils.structuredProperty(
|
||||
date.date,
|
||||
ModelConstants.UNKNOWN,
|
||||
ModelConstants.UNKNOWN,
|
||||
ModelConstants.DNET_DATACITE_DATE,
|
||||
ModelConstants.DNET_DATACITE_DATE,
|
||||
DATA_INFO
|
||||
)
|
||||
)
|
||||
if (relevant_dates != null && relevant_dates.nonEmpty)
|
||||
d.setRelevantdate(relevant_dates.asJava)
|
||||
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
|
||||
}
|
||||
|
||||
|
||||
val references_pmid: List[String] = for {
|
||||
JObject(reference) <- json \ "references"
|
||||
JObject(reference) <- json \ "references"
|
||||
JField("PubMed", JString(pid)) <- reference
|
||||
} yield pid
|
||||
|
||||
val references_doi: List[String] = for {
|
||||
JObject(reference) <- json \ "references"
|
||||
JObject(reference) <- json \ "references"
|
||||
JField(" DOI", JString(pid)) <- reference
|
||||
} yield pid
|
||||
|
||||
|
||||
if (references_pmid != null && references_pmid.nonEmpty) {
|
||||
val rel = createRelation(references_pmid.head, "pmid", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
|
||||
val rel = createRelation(
|
||||
references_pmid.head,
|
||||
"pmid",
|
||||
d.getId,
|
||||
collectedFromMap("uniprot"),
|
||||
ModelConstants.RELATIONSHIP,
|
||||
ModelConstants.IS_RELATED_TO,
|
||||
if (i_date.isDefined) i_date.get.date else null
|
||||
)
|
||||
rel.getCollectedfrom
|
||||
List(d, rel)
|
||||
}
|
||||
else if (references_doi != null && references_doi.nonEmpty) {
|
||||
val rel = createRelation(references_doi.head, "doi", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
|
||||
} else if (references_doi != null && references_doi.nonEmpty) {
|
||||
val rel = createRelation(
|
||||
references_doi.head,
|
||||
"doi",
|
||||
d.getId,
|
||||
collectedFromMap("uniprot"),
|
||||
ModelConstants.RELATIONSHIP,
|
||||
ModelConstants.IS_RELATED_TO,
|
||||
if (i_date.isDefined) i_date.get.date else null
|
||||
)
|
||||
List(d, rel)
|
||||
}
|
||||
else
|
||||
} else
|
||||
List(d)
|
||||
}
|
||||
|
||||
|
||||
/** Builds the placeholder identifier used for an entity that is known only by its pid.
  *
  * @param pid     the persistent identifier value
  * @param pidType the type of the persistent identifier
  * @return the string `unresolved::<pid>::<pidType>`
  */
def generate_unresolved_id(pid: String, pidType: String): String =
  Seq("unresolved", pid, pidType).mkString("::")
|
||||
|
||||
|
||||
def createRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, subRelType: String, relClass: String, date: String): Relation = {
|
||||
def createRelation(
|
||||
pid: String,
|
||||
pidType: String,
|
||||
sourceId: String,
|
||||
collectedFrom: KeyValue,
|
||||
subRelType: String,
|
||||
relClass: String,
|
||||
date: String
|
||||
): Relation = {
|
||||
|
||||
val rel = new Relation
|
||||
rel.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
|
||||
|
@ -248,7 +380,6 @@ object BioDBToOAF {
|
|||
rel.setSource(sourceId)
|
||||
rel.setTarget(s"unresolved::$pid::$pidType")
|
||||
|
||||
|
||||
val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
|
||||
|
||||
rel.setProperties(List(dateProps).asJava)
|
||||
|
@ -259,12 +390,24 @@ object BioDBToOAF {
|
|||
|
||||
}
|
||||
|
||||
|
||||
/** Creates a relation of sub-type `ModelConstants.SUPPLEMENT` and class
  * `ModelConstants.IS_SUPPLEMENT_TO` by delegating to `createRelation`.
  *
  * @param pid           the persistent identifier of the relation target
  * @param pidType       the type of the persistent identifier
  * @param sourceId      the identifier of the relation source
  * @param collectedFrom the datasource reference, forwarded unchanged
  * @param date          the date, forwarded unchanged
  * @return the relation produced by `createRelation`
  */
def createSupplementaryRelation(
  pid: String,
  pidType: String,
  sourceId: String,
  collectedFrom: KeyValue,
  date: String
): Relation =
  createRelation(
    pid = pid,
    pidType = pidType,
    sourceId = sourceId,
    collectedFrom = collectedFrom,
    subRelType = ModelConstants.SUPPLEMENT,
    relClass = ModelConstants.IS_SUPPLEMENT_TO,
    date = date
  )
|
||||
|
||||
|
||||
def pdbTOOaf(input: String): List[Oaf] = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
|
@ -277,7 +420,14 @@ object BioDBToOAF {
|
|||
|
||||
d.setPid(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(pdb, "pdb", "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
|
||||
OafMapperUtils.structuredProperty(
|
||||
pdb,
|
||||
"pdb",
|
||||
"Protein Data Bank Identifier",
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
DATA_INFO
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
|
@ -290,13 +440,16 @@ object BioDBToOAF {
|
|||
|
||||
if (title == null)
|
||||
return List()
|
||||
d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
|
||||
d.setTitle(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)
|
||||
).asJava
|
||||
)
|
||||
|
||||
val authors: List[String] = (json \ "authors").extractOrElse[List[String]](null)
|
||||
|
||||
if (authors != null) {
|
||||
val convertedAuthors = authors.zipWithIndex.map { a =>
|
||||
|
||||
val res = new Author
|
||||
res.setFullname(a._1)
|
||||
res.setRank(a._2 + 1)
|
||||
|
@ -310,7 +463,14 @@ object BioDBToOAF {
|
|||
|
||||
i.setPid(d.getPid)
|
||||
i.setUrl(List(s"https://www.rcsb.org/structure/$pdb").asJava)
|
||||
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
||||
i.setInstancetype(
|
||||
OafMapperUtils.qualifier(
|
||||
"0046",
|
||||
"Bioentity",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
)
|
||||
|
||||
i.setCollectedfrom(collectedFromMap("pdb"))
|
||||
d.setInstance(List(i).asJava)
|
||||
|
@ -323,7 +483,6 @@ object BioDBToOAF {
|
|||
List(d)
|
||||
}
|
||||
|
||||
|
||||
def extractEBILinksFromDump(input: String): EBILinkItem = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
|
@ -333,49 +492,70 @@ object BioDBToOAF {
|
|||
EBILinkItem(pmid.toLong, compact(render(links)))
|
||||
}
|
||||
|
||||
|
||||
/** Tells whether a link points to one of the accepted target pid types.
  *
  * The comparison is case-insensitive and accepts "ena", "pdb" and "uniprot".
  *
  * @param input the link to check
  * @return true when `input.targetPidType` is one of the accepted types
  */
def EBITargetLinksFilter(input: EBILinks): Boolean = {
  val pidType = input.targetPidType
  Seq("ena", "pdb", "uniprot").exists(accepted => pidType.equalsIgnoreCase(accepted))
}
|
||||
|
||||
|
||||
/** Parses an EBI links JSON document and flattens every link into an `EBILinks` record.
  *
  * The pmid is read from `request.id`. Every object found under a "Link" key is visited;
  * a record is produced only when the link carries "Target", "RelationshipType.Name",
  * "PublicationDate", and the target carries "Title" and an "Identifier" object with
  * "IDScheme", "IDURL" and "ID" of the JSON types matched below. Links that do not match
  * are dropped by the pattern-matching generators. The publication date is passed
  * through `GraphCleaningFunctions.cleanDate`.
  *
  * @param input the JSON document as a string
  * @return one `EBILinks` per matching link
  */
def parse_ebi_links(input: String): List[EBILinks] = {
  implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
  lazy val root = parse(input)
  val requestedPmid = (root \ "request" \ "id").extract[String]

  for {
    JObject(linkFields)                                 <- root \\ "Link"
    JField("Target", JObject(targetFields))             <- linkFields
    JField("RelationshipType", JObject(relTypeFields))  <- linkFields
    JField("Name", JString(relationName))               <- relTypeFields
    JField("PublicationDate", JString(publishedOn))     <- linkFields
    JField("Title", JString(targetTitle))               <- targetFields
    JField("Identifier", JObject(identifierFields))     <- targetFields
    JField("IDScheme", JString(targetScheme))           <- identifierFields
    JField("IDURL", JString(targetUrl))                 <- identifierFields
    JField("ID", JString(targetId))                     <- identifierFields
  } yield {
    val cleanedDate = GraphCleaningFunctions.cleanDate(publishedOn)
    EBILinks(
      relationName,
      cleanedDate,
      targetTitle,
      requestedPmid,
      targetId,
      targetScheme,
      targetUrl
    )
  }
}
|
||||
|
||||
|
||||
def convertEBILinksToOaf(input: EBILinks): List[Oaf] = {
|
||||
val d = new Dataset
|
||||
d.setCollectedfrom(List(collectedFromMap("ebi")).asJava)
|
||||
d.setDataInfo(DATA_INFO)
|
||||
d.setTitle(List(OafMapperUtils.structuredProperty(input.title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
|
||||
d.setTitle(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
input.title,
|
||||
ModelConstants.MAIN_TITLE_QUALIFIER,
|
||||
DATA_INFO
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
val nsPrefix = input.targetPidType.toLowerCase.padTo(12, '_')
|
||||
|
||||
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.targetPid.toLowerCase}", true))
|
||||
d.setOriginalId(List(input.targetPid.toLowerCase).asJava)
|
||||
|
||||
|
||||
d.setPid(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(input.targetPid.toLowerCase, input.targetPidType.toLowerCase, "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
|
||||
OafMapperUtils.structuredProperty(
|
||||
input.targetPid.toLowerCase,
|
||||
input.targetPidType.toLowerCase,
|
||||
"Protein Data Bank Identifier",
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
DATA_INFO
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
|
@ -383,13 +563,35 @@ object BioDBToOAF {
|
|||
|
||||
i.setPid(d.getPid)
|
||||
i.setUrl(List(input.targetUrl).asJava)
|
||||
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
||||
i.setInstancetype(
|
||||
OafMapperUtils.qualifier(
|
||||
"0046",
|
||||
"Bioentity",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
)
|
||||
|
||||
i.setCollectedfrom(collectedFromMap("ebi"))
|
||||
d.setInstance(List(i).asJava)
|
||||
i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO))
|
||||
d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO))
|
||||
i.setDateofacceptance(
|
||||
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO)
|
||||
)
|
||||
d.setDateofacceptance(
|
||||
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO)
|
||||
)
|
||||
|
||||
List(d, createRelation(input.pmid, "pmid", d.getId, collectedFromMap("ebi"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, GraphCleaningFunctions.cleanDate(input.date)))
|
||||
List(
|
||||
d,
|
||||
createRelation(
|
||||
input.pmid,
|
||||
"pmid",
|
||||
d.getId,
|
||||
collectedFromMap("ebi"),
|
||||
ModelConstants.RELATIONSHIP,
|
||||
ModelConstants.IS_RELATED_TO,
|
||||
GraphCleaningFunctions.cleanDate(input.date)
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -14,7 +14,11 @@ object SparkTransformBioDatabaseToOAF {
|
|||
def main(args: Array[String]): Unit = {
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json")
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val database: String = parser.get("database")
|
||||
log.info("database: {}", database)
|
||||
|
@ -29,20 +33,33 @@ object SparkTransformBioDatabaseToOAF {
|
|||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
val sc = spark.sparkContext
|
||||
|
||||
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||
import spark.implicits._
|
||||
database.toUpperCase() match {
|
||||
case "UNIPROT" =>
|
||||
CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))), targetPath)
|
||||
CollectionUtils.saveDataset(
|
||||
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))),
|
||||
targetPath
|
||||
)
|
||||
case "PDB" =>
|
||||
CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))), targetPath)
|
||||
CollectionUtils.saveDataset(
|
||||
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))),
|
||||
targetPath
|
||||
)
|
||||
case "SCHOLIX" =>
|
||||
CollectionUtils.saveDataset(spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)), targetPath)
|
||||
CollectionUtils.saveDataset(
|
||||
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)),
|
||||
targetPath
|
||||
)
|
||||
case "CROSSREF_LINKS" =>
|
||||
CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))), targetPath)
|
||||
CollectionUtils.saveDataset(
|
||||
spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))),
|
||||
targetPath
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -24,31 +24,37 @@ import scala.xml.pull.XMLEventReader
|
|||
|
||||
object SparkCreateBaselineDataFrame {
|
||||
|
||||
|
||||
/** Lists the PubMed update files that still have to be downloaded.
  *
  * Fetches the update-files index page through `requestPage`, extracts the value of
  * the `href` attribute from every line starting with `<a href=`, and keeps the
  * entries ending in ".gz" whose name compares greater than `maxFile`
  * (plain string comparison).
  *
  * @param maxFile the name of the last file already processed
  * @return pairs of (file name, absolute download URL)
  */
def requestBaseLineUpdatePage(maxFile: String): List[(String, String)] = {
  val baseUrl = "https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/"
  val hrefPrefix = "<a href=\""
  val data = requestPage(baseUrl)

  data.lines
    .filter(l => l.startsWith("<a href="))
    .map { l =>
      val start = l.indexOf(hrefPrefix)
      val end = l.lastIndexOf("\">")
      val valueStart = start + hrefPrefix.length
      // String.substring takes an END INDEX, not a length: the href value spans
      // [valueStart, end). The guard also rejects lines where the closing `">`
      // precedes the value start, which would otherwise throw.
      if (start >= 0 && end >= valueStart)
        l.substring(valueStart, end)
      else
        ""
    }
    .filter(s => s.endsWith(".gz"))
    .filter(s => s > maxFile)
    .map(s => (s, s"$baseUrl$s"))
    .toList
}
|
||||
|
||||
|
||||
def downloadBaselinePart(url: String): InputStream = {
|
||||
val r = new HttpGet(url)
|
||||
val timeout = 60; // seconds
|
||||
val config = RequestConfig.custom()
|
||||
val config = RequestConfig
|
||||
.custom()
|
||||
.setConnectTimeout(timeout * 1000)
|
||||
.setConnectionRequestTimeout(timeout * 1000)
|
||||
.setSocketTimeout(timeout * 1000).build()
|
||||
.setSocketTimeout(timeout * 1000)
|
||||
.build()
|
||||
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||
val response = client.execute(r)
|
||||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||
|
@ -59,10 +65,12 @@ object SparkCreateBaselineDataFrame {
|
|||
def requestPage(url: String): String = {
|
||||
val r = new HttpGet(url)
|
||||
val timeout = 60; // seconds
|
||||
val config = RequestConfig.custom()
|
||||
val config = RequestConfig
|
||||
.custom()
|
||||
.setConnectTimeout(timeout * 1000)
|
||||
.setConnectionRequestTimeout(timeout * 1000)
|
||||
.setSocketTimeout(timeout * 1000).build()
|
||||
.setSocketTimeout(timeout * 1000)
|
||||
.build()
|
||||
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||
try {
|
||||
var tries = 4
|
||||
|
@ -73,8 +81,7 @@ object SparkCreateBaselineDataFrame {
|
|||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||
if (response.getStatusLine.getStatusCode > 400) {
|
||||
tries -= 1
|
||||
}
|
||||
else
|
||||
} else
|
||||
return IOUtils.toString(response.getEntity.getContent)
|
||||
} catch {
|
||||
case e: Throwable =>
|
||||
|
@ -90,10 +97,8 @@ object SparkCreateBaselineDataFrame {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
def downloadBaseLineUpdate(baselinePath: String, hdfsServerUri: String): Unit = {
|
||||
|
||||
|
||||
val conf = new Configuration
|
||||
conf.set("fs.defaultFS", hdfsServerUri)
|
||||
val fs = FileSystem.get(conf)
|
||||
|
@ -122,31 +127,36 @@ object SparkCreateBaselineDataFrame {
|
|||
|
||||
}
|
||||
|
||||
/** Aggregator that collapses all the `PMArticle` grouped under the same key into one.
  *
  * The buffer starts as an empty `PMArticle`; at every step the current buffer is
  * kept when it is non-null and has a pmid, otherwise the incoming article replaces it.
  * Buffer and output are both encoded with kryo.
  */
val pmArticleAggregator: Aggregator[(String, PMArticle), PMArticle, PMArticle] =
  new Aggregator[(String, PMArticle), PMArticle, PMArticle] with Serializable {

    /** True when the article can be kept as the aggregation result. */
    private def hasPmid(article: PMArticle): Boolean =
      article != null && article.getPmid != null

    override def zero: PMArticle = new PMArticle

    override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle = {
      val (_, incoming) = a
      if (hasPmid(b)) b else incoming
    }

    override def merge(b1: PMArticle, b2: PMArticle): PMArticle =
      if (hasPmid(b1)) b1 else b2

    override def finish(reduction: PMArticle): PMArticle = reduction

    override def bufferEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]

    override def outputEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
  }
|
||||
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEBILinksToOaf.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
SparkEBILinksToOaf.getClass.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
|
||||
)
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val isLookupUrl: String = parser.get("isLookupUrl")
|
||||
log.info("isLookupUrl: {}", isLookupUrl)
|
||||
|
@ -162,7 +172,6 @@ object SparkCreateBaselineDataFrame {
|
|||
val skipUpdate = parser.get("skipUpdate")
|
||||
log.info("skipUpdate: {}", skipUpdate)
|
||||
|
||||
|
||||
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
|
||||
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
|
||||
val spark: SparkSession =
|
||||
|
@ -170,7 +179,8 @@ object SparkCreateBaselineDataFrame {
|
|||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
val sc = spark.sparkContext
|
||||
import spark.implicits._
|
||||
|
@ -183,20 +193,30 @@ object SparkCreateBaselineDataFrame {
|
|||
if (!"true".equalsIgnoreCase(skipUpdate)) {
|
||||
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
|
||||
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
|
||||
val ds: Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i => {
|
||||
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
|
||||
new PMParser(xml)
|
||||
}))
|
||||
ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder)).groupByKey(_._1)
|
||||
val ds: Dataset[PMArticle] = spark.createDataset(
|
||||
k.filter(i => i._1.endsWith(".gz"))
|
||||
.flatMap(i => {
|
||||
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
|
||||
new PMParser(xml)
|
||||
})
|
||||
)
|
||||
ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder))
|
||||
.groupByKey(_._1)
|
||||
.agg(pmArticleAggregator.toColumn)
|
||||
.map(p => p._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
|
||||
.map(p => p._2)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/baseline_dataset")
|
||||
}
|
||||
|
||||
val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle]
|
||||
CollectionUtils.saveDataset(exported_dataset
|
||||
.map(a => PubMedToOaf.convert(a, vocabularies)).as[Oaf]
|
||||
.filter(p => p != null),
|
||||
targetPath)
|
||||
CollectionUtils.saveDataset(
|
||||
exported_dataset
|
||||
.map(a => PubMedToOaf.convert(a, vocabularies))
|
||||
.as[Oaf]
|
||||
.filter(p => p != null),
|
||||
targetPath
|
||||
)
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,10 +25,12 @@ object SparkDownloadEBILinks {
|
|||
def requestPage(url: String): String = {
|
||||
val r = new HttpGet(url)
|
||||
val timeout = 60; // seconds
|
||||
val config = RequestConfig.custom()
|
||||
val config = RequestConfig
|
||||
.custom()
|
||||
.setConnectTimeout(timeout * 1000)
|
||||
.setConnectionRequestTimeout(timeout * 1000)
|
||||
.setSocketTimeout(timeout * 1000).build()
|
||||
.setSocketTimeout(timeout * 1000)
|
||||
.build()
|
||||
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||
try {
|
||||
var tries = 4
|
||||
|
@ -39,8 +41,7 @@ object SparkDownloadEBILinks {
|
|||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||
if (response.getStatusLine.getStatusCode > 400) {
|
||||
tries -= 1
|
||||
}
|
||||
else
|
||||
} else
|
||||
return IOUtils.toString(response.getEntity.getContent)
|
||||
} catch {
|
||||
case e: Throwable =>
|
||||
|
@ -66,14 +67,19 @@ object SparkDownloadEBILinks {
|
|||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val MAX_ITEM_PER_PARTITION = 20000
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json")
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
import spark.implicits._
|
||||
|
||||
|
@ -87,22 +93,40 @@ object SparkDownloadEBILinks {
|
|||
log.info(s"workingPath -> $workingPath")
|
||||
|
||||
log.info("Getting max pubmedId where the links have already requested")
|
||||
val links: Dataset[EBILinkItem] = spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
|
||||
val links: Dataset[EBILinkItem] =
|
||||
spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
|
||||
val lastPMIDRequested = links.map(l => l.id).select(max("value")).first.getLong(0)
|
||||
|
||||
log.info("Retrieving PMID to request links")
|
||||
val pubmed = spark.read.load(s"$sourcePath/baseline_dataset").as[PMArticle]
|
||||
pubmed.map(p => p.getPmid.toLong).where(s"value > $lastPMIDRequested").write.mode(SaveMode.Overwrite).save(s"$workingPath/id_to_request")
|
||||
pubmed
|
||||
.map(p => p.getPmid.toLong)
|
||||
.where(s"value > $lastPMIDRequested")
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/id_to_request")
|
||||
|
||||
val pmidToReq: Dataset[Long] = spark.read.load(s"$workingPath/id_to_request").as[Long]
|
||||
|
||||
val total = pmidToReq.count()
|
||||
|
||||
spark.createDataset(pmidToReq.rdd.repartition((total / MAX_ITEM_PER_PARTITION).toInt).map(pmid => createEBILinks(pmid)).filter(l => l != null)).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_update")
|
||||
spark
|
||||
.createDataset(
|
||||
pmidToReq.rdd
|
||||
.repartition((total / MAX_ITEM_PER_PARTITION).toInt)
|
||||
.map(pmid => createEBILinks(pmid))
|
||||
.filter(l => l != null)
|
||||
)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/links_update")
|
||||
|
||||
val updates: Dataset[EBILinkItem] = spark.read.load(s"$workingPath/links_update").as[EBILinkItem]
|
||||
val updates: Dataset[EBILinkItem] =
|
||||
spark.read.load(s"$workingPath/links_update").as[EBILinkItem]
|
||||
|
||||
links.union(updates).groupByKey(_.id)
|
||||
links
|
||||
.union(updates)
|
||||
.groupByKey(_.id)
|
||||
.reduceGroups { (x, y) =>
|
||||
if (x == null || x.links == null)
|
||||
y
|
||||
|
@ -112,6 +136,10 @@ object SparkDownloadEBILinks {
|
|||
x
|
||||
else
|
||||
y
|
||||
}.map(_._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_final")
|
||||
}
|
||||
.map(_._2)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/links_final")
|
||||
}
|
||||
}
|
||||
|
|
|
@ -15,15 +15,19 @@ object SparkEBILinksToOaf {
|
|||
def main(args: Array[String]): Unit = {
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json")
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
import spark.implicits._
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
|
@ -32,11 +36,17 @@ object SparkEBILinksToOaf {
|
|||
log.info(s"targetPath -> $targetPath")
|
||||
implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||
|
||||
val ebLinks: Dataset[EBILinkItem] = spark.read.load(sourcePath).as[EBILinkItem].filter(l => l.links != null && l.links.startsWith("{"))
|
||||
val ebLinks: Dataset[EBILinkItem] = spark.read
|
||||
.load(sourcePath)
|
||||
.as[EBILinkItem]
|
||||
.filter(l => l.links != null && l.links.startsWith("{"))
|
||||
|
||||
CollectionUtils.saveDataset(ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
|
||||
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
|
||||
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)),
|
||||
targetPath)
|
||||
CollectionUtils.saveDataset(
|
||||
ebLinks
|
||||
.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
|
||||
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
|
||||
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)),
|
||||
targetPath
|
||||
)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3,16 +3,13 @@ package eu.dnetlib.dhp.sx.bio.pubmed
|
|||
import scala.xml.MetaData
|
||||
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
|
||||
|
||||
/** @param xml
|
||||
*/
|
||||
class PMParser(xml: XMLEventReader) extends Iterator[PMArticle] {
|
||||
|
||||
/**
|
||||
*
|
||||
* @param xml
|
||||
*/
|
||||
class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
|
||||
var currentArticle: PMArticle = generateNextArticle()
|
||||
|
||||
var currentArticle:PMArticle = generateNextArticle()
|
||||
|
||||
/** An article is available as long as the look-ahead slot `currentArticle` is not null. */
override def hasNext: Boolean = Option(currentArticle).isDefined
|
||||
|
||||
override def next(): PMArticle = {
|
||||
val tmp = currentArticle
|
||||
|
@ -20,33 +17,30 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
|
|||
tmp
|
||||
}
|
||||
|
||||
/** Reads the text of the first value of an XML attribute.
  *
  * @param attrs the attributes of the current element
  * @param key   the attribute name to look up
  * @return the text of the first node of the attribute value, or null when the
  *         attribute is absent or its value is null or empty
  */
def extractAttributes(attrs: MetaData, key: String): String =
  attrs.get(key) match {
    case Some(nodes) if nodes != null && nodes.nonEmpty => nodes.head.text
    case _                                              => null
  }
|
||||
|
||||
|
||||
/** Formats a date from its textual parts.
  *
  * Each part is parsed as an integer; month and day are zero-padded to two digits,
  * the year is printed as parsed.
  *
  * @param year  the year, as a string
  * @param month the month, as a string
  * @param day   the day, as a string
  * @return the date as `year-MM-dd`, or null when any part cannot be parsed as an integer
  */
def validate_Date(year: String, month: String, day: String): String = {
  try {
    f"${year.toInt}-${month.toInt}%02d-${day.toInt}%02d"
  } catch {
    // Only recoverable failures (e.g. NumberFormatException on a malformed or null
    // part) map to null; fatal errors such as OutOfMemoryError must propagate.
    case scala.util.control.NonFatal(_) => null
  }
}
|
||||
|
||||
def generateNextArticle():PMArticle = {
|
||||
def generateNextArticle(): PMArticle = {
|
||||
|
||||
|
||||
var currentSubject:PMSubject = null
|
||||
var currentSubject: PMSubject = null
|
||||
var currentAuthor: PMAuthor = null
|
||||
var currentJournal: PMJournal = null
|
||||
var currentGrant: PMGrant = null
|
||||
|
@ -54,12 +48,7 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
|
|||
var currentYear = "0"
|
||||
var currentMonth = "01"
|
||||
var currentDay = "01"
|
||||
var currentArticleType:String = null
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
var currentArticleType: String = null
|
||||
|
||||
while (xml.hasNext) {
|
||||
xml.next match {
|
||||
|
@ -68,64 +57,67 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
|
|||
|
||||
label match {
|
||||
case "PubmedArticle" => currentArticle = new PMArticle
|
||||
case "Author" => currentAuthor = new PMAuthor
|
||||
case "Journal" => currentJournal = new PMJournal
|
||||
case "Grant" => currentGrant = new PMGrant
|
||||
case "Author" => currentAuthor = new PMAuthor
|
||||
case "Journal" => currentJournal = new PMJournal
|
||||
case "Grant" => currentGrant = new PMGrant
|
||||
case "PublicationType" | "DescriptorName" =>
|
||||
currentSubject = new PMSubject
|
||||
currentSubject.setMeshId(extractAttributes(attrs, "UI"))
|
||||
case "ArticleId" => currentArticleType = extractAttributes(attrs,"IdType")
|
||||
case _ =>
|
||||
case "ArticleId" => currentArticleType = extractAttributes(attrs, "IdType")
|
||||
case _ =>
|
||||
}
|
||||
case EvElemEnd(_, label) =>
|
||||
label match {
|
||||
case "PubmedArticle" => return currentArticle
|
||||
case "Author" => currentArticle.getAuthors.add(currentAuthor)
|
||||
case "Journal" => currentArticle.setJournal(currentJournal)
|
||||
case "Grant" => currentArticle.getGrants.add(currentGrant)
|
||||
case "PubMedPubDate" => if (currentArticle.getDate== null)
|
||||
currentArticle.setDate(validate_Date(currentYear,currentMonth,currentDay))
|
||||
case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
|
||||
case "DescriptorName" => currentArticle.getSubjects.add(currentSubject)
|
||||
case "PublicationType" =>currentArticle.getPublicationTypes.add(currentSubject)
|
||||
case _ =>
|
||||
case "Author" => currentArticle.getAuthors.add(currentAuthor)
|
||||
case "Journal" => currentArticle.setJournal(currentJournal)
|
||||
case "Grant" => currentArticle.getGrants.add(currentGrant)
|
||||
case "PubMedPubDate" =>
|
||||
if (currentArticle.getDate == null)
|
||||
currentArticle.setDate(validate_Date(currentYear, currentMonth, currentDay))
|
||||
case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
|
||||
case "DescriptorName" => currentArticle.getSubjects.add(currentSubject)
|
||||
case "PublicationType" => currentArticle.getPublicationTypes.add(currentSubject)
|
||||
case _ =>
|
||||
}
|
||||
case EvText(text) =>
|
||||
if (currNode!= null && text.trim.nonEmpty)
|
||||
if (currNode != null && text.trim.nonEmpty)
|
||||
currNode match {
|
||||
case "ArticleTitle" => {
|
||||
if (currentArticle.getTitle==null)
|
||||
if (currentArticle.getTitle == null)
|
||||
currentArticle.setTitle(text.trim)
|
||||
else
|
||||
currentArticle.setTitle(currentArticle.getTitle + text.trim)
|
||||
}
|
||||
case "AbstractText" => {
|
||||
if (currentArticle.getDescription==null)
|
||||
if (currentArticle.getDescription == null)
|
||||
currentArticle.setDescription(text.trim)
|
||||
else
|
||||
currentArticle.setDescription(currentArticle.getDescription + text.trim)
|
||||
}
|
||||
case "PMID" => currentArticle.setPmid(text.trim)
|
||||
case "ArticleId" => if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim)
|
||||
case "Language" => currentArticle.setLanguage(text.trim)
|
||||
case "ISSN" => currentJournal.setIssn(text.trim)
|
||||
case "GrantID" => currentGrant.setGrantID(text.trim)
|
||||
case "Agency" => currentGrant.setAgency(text.trim)
|
||||
case "Country" => if (currentGrant != null) currentGrant.setCountry(text.trim)
|
||||
case "Year" => currentYear = text.trim
|
||||
case "Month" => currentMonth = text.trim
|
||||
case "Day" => currentDay = text.trim
|
||||
case "Volume" => currentJournal.setVolume( text.trim)
|
||||
case "Issue" => currentJournal.setIssue (text.trim)
|
||||
case "ArticleId" =>
|
||||
if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim)
|
||||
case "Language" => currentArticle.setLanguage(text.trim)
|
||||
case "ISSN" => currentJournal.setIssn(text.trim)
|
||||
case "GrantID" => currentGrant.setGrantID(text.trim)
|
||||
case "Agency" => currentGrant.setAgency(text.trim)
|
||||
case "Country" => if (currentGrant != null) currentGrant.setCountry(text.trim)
|
||||
case "Year" => currentYear = text.trim
|
||||
case "Month" => currentMonth = text.trim
|
||||
case "Day" => currentDay = text.trim
|
||||
case "Volume" => currentJournal.setVolume(text.trim)
|
||||
case "Issue" => currentJournal.setIssue(text.trim)
|
||||
case "PublicationType" | "DescriptorName" => currentSubject.setValue(text.trim)
|
||||
case "LastName" => {
|
||||
if (currentAuthor != null)
|
||||
currentAuthor.setLastName(text.trim)
|
||||
}
|
||||
case "ForeName" => if (currentAuthor != null)
|
||||
currentAuthor.setForeName(text.trim)
|
||||
case "ForeName" =>
|
||||
if (currentAuthor != null)
|
||||
currentAuthor.setForeName(text.trim)
|
||||
case "Title" =>
|
||||
if (currentJournal.getTitle==null)
|
||||
if (currentJournal.getTitle == null)
|
||||
currentJournal.setTitle(text.trim)
|
||||
else
|
||||
currentJournal.setTitle(currentJournal.getTitle + text.trim)
|
||||
|
@ -139,8 +131,3 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
|
|||
null
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -2,38 +2,50 @@ package eu.dnetlib.dhp.sx.bio.pubmed
|
|||
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.{
|
||||
GraphCleaningFunctions,
|
||||
IdentifierFactory,
|
||||
OafMapperUtils,
|
||||
PidType
|
||||
}
|
||||
import eu.dnetlib.dhp.schema.oaf._
|
||||
import collection.JavaConverters._
|
||||
|
||||
import java.util.regex.Pattern
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
*/
|
||||
object PubMedToOaf {
|
||||
|
||||
val SUBJ_CLASS = "keywords"
|
||||
|
||||
val urlMap = Map(
|
||||
"pmid" -> "https://pubmed.ncbi.nlm.nih.gov/",
|
||||
"doi" -> "https://dx.doi.org/"
|
||||
"doi" -> "https://dx.doi.org/"
|
||||
)
|
||||
val dataInfo: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
|
||||
val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
|
||||
|
||||
val dataInfo: DataInfo = OafMapperUtils.dataInfo(
|
||||
false,
|
||||
null,
|
||||
false,
|
||||
false,
|
||||
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
|
||||
"0.9"
|
||||
)
|
||||
|
||||
/**
|
||||
* Cleaning the DOI Applying regex in order to
|
||||
* remove doi starting with URL
|
||||
*
|
||||
* @param doi input DOI
|
||||
* @return cleaned DOI
|
||||
*/
|
||||
val collectedFrom: KeyValue =
|
||||
OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
|
||||
|
||||
/** Cleans the DOI by applying a regex, in order to
|
||||
* remove DOIs starting with a URL
|
||||
*
|
||||
* @param doi input DOI
|
||||
* @return cleaned DOI
|
||||
*/
|
||||
def cleanDoi(doi: String): String = {
|
||||
|
||||
val regex = "^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$"
|
||||
|
||||
|
||||
val pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE)
|
||||
val matcher = pattern.matcher(doi)
|
||||
|
||||
|
@ -43,33 +55,34 @@ object PubMedToOaf {
|
|||
null
|
||||
}
|
||||
|
||||
/** Creates the concrete Result subclass matching an OAF instance type.
  *
  * The class id of the instance type is resolved against the DNET_RESULT_TYPOLOGIES
  * vocabulary through getVocabularyTerm; the class id of the resolved term selects
  * the subclass to instantiate.
  *
  * @param cobjQualifier OAF instance type
  * @param vocabularies  all dnet vocabularies
  * @return a new Dataset, Publication, OtherResearchProduct or Software; null when no
  *         vocabulary term is found or its class id is none of the four typologies
  */
def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = {
  val typology = getVocabularyTerm(
    ModelConstants.DNET_RESULT_TYPOLOGIES,
    vocabularies,
    cobjQualifier.getClassid
  )
  // The vocabulary lookup may come back null (other callers in this object filter null
  // terms): handle it like an unknown typology rather than throwing a NullPointerException.
  Option(typology).map(_.getClassid) match {
    case Some("dataset")     => new Dataset
    case Some("publication") => new Publication
    case Some("other")       => new OtherResearchProduct
    case Some("software")    => new Software
    case _                   => null
  }
}
|
||||
|
||||
/**
|
||||
* Mapping the Pubmedjournal info into the OAF Journale
|
||||
*
|
||||
* @param j the pubmedJournal
|
||||
* @return the OAF Journal
|
||||
*/
|
||||
/** Maps the PubMed journal info into the OAF Journal
|
||||
*
|
||||
* @param j the pubmedJournal
|
||||
* @return the OAF Journal
|
||||
*/
|
||||
def mapJournal(j: PMJournal): Journal = {
|
||||
if (j == null)
|
||||
return null
|
||||
|
@ -83,40 +96,47 @@ object PubMedToOaf {
|
|||
journal.setIss(j.getIssue)
|
||||
journal
|
||||
|
||||
|
||||
}
|
||||
|
||||
/** Looks a term up in a vocabulary, first among the synonyms and then among the terms.
  *
  * Both lookups are always executed; the synonym match wins when it is not null.
  *
  * @param vocabularyName the input vocabulary name
  * @param vocabularies   all the vocabularies
  * @param term           the term to find
  * @return the qualifier found through the synonyms, otherwise the result of the
  *         plain term lookup
  */
def getVocabularyTerm(
  vocabularyName: String,
  vocabularies: VocabularyGroup,
  term: String
): Qualifier = {
  val bySynonym = vocabularies.getSynonymAsQualifier(vocabularyName, term)
  val byTerm = vocabularies.getTermAsQualifier(vocabularyName, term)
  Option(bySynonym).getOrElse(byTerm)
}
|
||||
|
||||
|
||||
/**
|
||||
* Map the Pubmed Article into the OAF instance
|
||||
*
|
||||
* @param article the pubmed articles
|
||||
* @param vocabularies the vocabularies
|
||||
* @return The OAF instance if the mapping did not fail
|
||||
*/
|
||||
/** Map the Pubmed Article into the OAF instance
|
||||
*
|
||||
* @param article the pubmed articles
|
||||
* @param vocabularies the vocabularies
|
||||
* @return The OAF instance if the mapping did not fail
|
||||
*/
|
||||
def convert(article: PMArticle, vocabularies: VocabularyGroup): Oaf = {
|
||||
|
||||
if (article.getPublicationTypes == null)
|
||||
return null
|
||||
|
||||
|
||||
// MAP PMID into pid with classid = classname = pmid
|
||||
val pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
|
||||
val pidList: List[StructuredProperty] = List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
article.getPmid,
|
||||
PidType.pmid.toString,
|
||||
PidType.pmid.toString,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
dataInfo
|
||||
)
|
||||
)
|
||||
if (pidList == null)
|
||||
return null
|
||||
|
||||
|
@ -125,7 +145,14 @@ object PubMedToOaf {
|
|||
if (article.getDoi != null) {
|
||||
val normalizedPid = cleanDoi(article.getDoi)
|
||||
if (normalizedPid != null)
|
||||
alternateIdentifier = OafMapperUtils.structuredProperty(normalizedPid, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)
|
||||
alternateIdentifier = OafMapperUtils.structuredProperty(
|
||||
normalizedPid,
|
||||
PidType.doi.toString,
|
||||
PidType.doi.toString,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
dataInfo
|
||||
)
|
||||
}
|
||||
|
||||
// INSTANCE MAPPING
|
||||
|
@ -133,14 +160,18 @@ object PubMedToOaf {
|
|||
|
||||
// If the article contains the typology Journal Article then we apply this type
|
||||
//else We have to find a terms that match the vocabulary otherwise we discard it
|
||||
val ja = article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue))
|
||||
val ja =
|
||||
article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue))
|
||||
val pubmedInstance = new Instance
|
||||
if (ja.isDefined) {
|
||||
val cojbCategory = getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue)
|
||||
val cojbCategory =
|
||||
getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue)
|
||||
pubmedInstance.setInstancetype(cojbCategory)
|
||||
} else {
|
||||
val i_type = article.getPublicationTypes.asScala
|
||||
.map(s => getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, s.getValue))
|
||||
.map(s =>
|
||||
getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, s.getValue)
|
||||
)
|
||||
.find(q => q != null)
|
||||
if (i_type.isDefined)
|
||||
pubmedInstance.setInstancetype(i_type.get)
|
||||
|
@ -155,7 +186,9 @@ object PubMedToOaf {
|
|||
if (alternateIdentifier != null)
|
||||
pubmedInstance.setAlternateIdentifier(List(alternateIdentifier).asJava)
|
||||
result.setInstance(List(pubmedInstance).asJava)
|
||||
pubmedInstance.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection.breakOut)
|
||||
pubmedInstance.getPid.asScala
|
||||
.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid))
|
||||
.map(p => p.getValue)(collection.breakOut)
|
||||
//CREATE URL From pmid
|
||||
val urlLists: List[String] = pidList
|
||||
.map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue))
|
||||
|
@ -165,7 +198,9 @@ object PubMedToOaf {
|
|||
pubmedInstance.setUrl(urlLists.asJava)
|
||||
|
||||
//ASSIGN DateofAcceptance
|
||||
pubmedInstance.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
|
||||
pubmedInstance.setDateofacceptance(
|
||||
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)
|
||||
)
|
||||
//ASSIGN COLLECTEDFROM
|
||||
pubmedInstance.setCollectedfrom(collectedFrom)
|
||||
result.setPid(pidList.asJava)
|
||||
|
@ -173,7 +208,6 @@ object PubMedToOaf {
|
|||
//END INSTANCE MAPPING
|
||||
//--------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
// JOURNAL MAPPING
|
||||
//--------------------------------------------------------------------------------------
|
||||
if (article.getJournal != null && result.isInstanceOf[Publication])
|
||||
|
@ -182,31 +216,48 @@ object PubMedToOaf {
|
|||
//END JOURNAL MAPPING
|
||||
//--------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
// RESULT MAPPING
|
||||
//--------------------------------------------------------------------------------------
|
||||
result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
|
||||
result.setDateofacceptance(
|
||||
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)
|
||||
)
|
||||
|
||||
if (article.getTitle == null || article.getTitle.isEmpty)
|
||||
return null
|
||||
result.setTitle(List(OafMapperUtils.structuredProperty(article.getTitle, ModelConstants.MAIN_TITLE_QUALIFIER, dataInfo)).asJava)
|
||||
result.setTitle(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
article.getTitle,
|
||||
ModelConstants.MAIN_TITLE_QUALIFIER,
|
||||
dataInfo
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
if (article.getDescription != null && article.getDescription.nonEmpty)
|
||||
result.setDescription(List(OafMapperUtils.field(article.getDescription, dataInfo)).asJava)
|
||||
|
||||
if (article.getLanguage != null) {
|
||||
|
||||
val term = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, article.getLanguage)
|
||||
val term =
|
||||
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, article.getLanguage)
|
||||
if (term != null)
|
||||
result.setLanguage(term)
|
||||
}
|
||||
|
||||
|
||||
val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s => OafMapperUtils.structuredProperty(s.getValue, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, dataInfo))(collection.breakOut)
|
||||
val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s =>
|
||||
OafMapperUtils.structuredProperty(
|
||||
s.getValue,
|
||||
SUBJ_CLASS,
|
||||
SUBJ_CLASS,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
dataInfo
|
||||
)
|
||||
)(collection.breakOut)
|
||||
if (subjects != null)
|
||||
result.setSubject(subjects.asJava)
|
||||
|
||||
|
||||
val authors: List[Author] = article.getAuthors.asScala.zipWithIndex.map { case (a, index) =>
|
||||
val author = new Author()
|
||||
author.setName(a.getForeName)
|
||||
|
@ -216,15 +267,12 @@ object PubMedToOaf {
|
|||
author
|
||||
}(collection.breakOut)
|
||||
|
||||
|
||||
if (authors != null && authors.nonEmpty)
|
||||
result.setAuthor(authors.asJava)
|
||||
result.setOriginalId(pidList.map(s => s.getValue).asJava)
|
||||
|
||||
|
||||
result.setId(article.getPmid)
|
||||
|
||||
|
||||
// END RESULT MAPPING
|
||||
//--------------------------------------------------------------------------------------
|
||||
val id = IdentifierFactory.createIdentifier(result)
|
||||
|
@ -234,5 +282,4 @@ object PubMedToOaf {
|
|||
result
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -17,7 +17,8 @@ import org.slf4j.{Logger, LoggerFactory}
|
|||
import scala.collection.JavaConverters._
|
||||
import java.text.SimpleDateFormat
|
||||
|
||||
class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:Logger) extends AbstractScalaApplication(propertyPath, args, log:Logger) {
|
||||
class SparkRetrieveDataciteDelta(propertyPath: String, args: Array[String], log: Logger)
|
||||
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
|
||||
|
||||
/** Date pattern understood by simpleFormatter: date, time and an RFC 822 zone offset. */
val ISO_DATE_PATTERN: String = "yyyy-MM-dd'T'HH:mm:ssZ"

/** Formatter built on ISO_DATE_PATTERN.
  * NOTE(review): SimpleDateFormat instances are not thread-safe; keep usage on one thread.
  */
val simpleFormatter: SimpleDateFormat = new SimpleDateFormat(ISO_DATE_PATTERN)
|
||||
|
@ -25,162 +26,190 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
|
|||
// Names of the sub-folders created under the working path.
val SCHOLIX_RESOURCE_PATH_NAME: String = "scholixResource"
val DATACITE_OAF_PATH_NAME: String = "dataciteOAFUpdate"
val PID_MAP_PATH_NAME: String = "pidMap"
val RESOLVED_REL_PATH_NAME: String = "resolvedRelation"
val SCHOLIX_PATH_NAME: String = "scholix"

// Each helper joins the working path and one of the folder names above with a "/".

/** @return the scholix resource folder under `workingPath` */
def scholixResourcePath(workingPath: String): String =
  s"${workingPath}/${SCHOLIX_RESOURCE_PATH_NAME}"

/** @return the Datacite OAF update folder under `workingPath` */
def dataciteOAFPath(workingPath: String): String =
  s"${workingPath}/${DATACITE_OAF_PATH_NAME}"

/** @return the pid map folder under `workingPath` */
def pidMapPath(workingPath: String): String =
  s"${workingPath}/${PID_MAP_PATH_NAME}"

/** @return the resolved relation folder under `workingPath` */
def resolvedRelationPath(workingPath: String): String =
  s"${workingPath}/${RESOLVED_REL_PATH_NAME}"

/** @return the scholix folder under `workingPath` */
def scholixPath(workingPath: String): String =
  s"${workingPath}/${SCHOLIX_PATH_NAME}"
|
||||
|
||||
|
||||
/** Parses a date written according to ISO_DATE_PATTERN and converts it to epoch millis.
  *
  * @param inputDate the textual date to parse with simpleFormatter
  * @return the milliseconds since the epoch of the parsed date
  * @throws java.text.ParseException when `inputDate` does not match the pattern
  */
def ISO8601toEpochMillis(inputDate: String): Long = {
  val parsed = simpleFormatter.parse(inputDate)
  parsed.getTime
}
|
||||
|
||||
|
||||
/** Finds the most recent collection date among the Result entities stored in HDFS.
  *
  * Meant to be called before indexing Scholexplorer, to know from which date the delta
  * of Datacite records has to be downloaded (a long time can pass between the generation
  * of the raw graph and the generation of Scholexplorer).
  *
  * The maximum is computed on the textual `dateofcollection` values; the winner is then
  * parsed with ISO8601toEpochMillis and divided by 1000.
  *
  * @param spark        the spark session
  * @param entitiesPath the folder whose children are loaded as Oaf entities
  * @return the latest collection date, expressed in seconds since the epoch
  */
def retrieveLastCollectedFrom(spark: SparkSession, entitiesPath: String): Long = {
  log.info("Retrieve last entities collected From")

  implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
  implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
  import spark.implicits._

  // keep only the entities that are Results
  val allEntities = spark.read.load(s"$entitiesPath/*").as[Oaf]
  val results = allEntities
    .filter(o => o.isInstanceOf[Result])
    .map(r => r.asInstanceOf[Result])

  // single string column ("value") holding the non-null collection dates
  val collectionDates = results
    .filter(r => r.getDateofcollection != null)
    .map(_.getDateofcollection)

  val latest = collectionDates.select(max("value")).first.getString(0)

  ISO8601toEpochMillis(latest) / 1000
}
|
||||
|
||||
|
||||
/** Converts every Scholix summary into a ScholixResource and stores the result.
  *
  * The update of the Datacite relationships on Scholexplorer needs the nodes of the
  * Scholix graph in ScholixResource format: this method writes them, overwriting any
  * previous content, to the scholix resource path suffixed with `_native`. Resources
  * without identifiers are dropped.
  *
  * @param summaryPath the path of the summary in Scholix
  * @param workingPath the working path
  * @param spark       the spark session
  */
def generateScholixResource(
  summaryPath: String,
  workingPath: String,
  spark: SparkSession
): Unit = {
  implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
  implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]

  log.info("Convert All summary to ScholixResource")

  val summaries = spark.read.load(summaryPath).as[ScholixSummary]

  val resources = summaries
    .map(ScholixUtils.generateScholixResourceFromSummary)(scholixResourceEncoder)
    .filter(r => r.getIdentifier != null && r.getIdentifier.size > 0)

  resources.write
    .mode(SaveMode.Overwrite)
    .save(s"${scholixResourcePath(workingPath)}_native")
}
|
||||
|
||||
/** Converts the new Datacite results into ScholixResources and merges them with the
  * native ones, so that source and type of the Scholix relationships can be filled.
  *
  * Three datasets are involved, all under the scholix resource path:
  *  - `_update`: written here from the Results found in the Datacite OAF folder;
  *  - `_native`: read here, expected to be already present;
  *  - `_graph`: written here, one resource per dnet identifier taken from the union
  *    of the two datasets above.
  *
  * @param workingPath the working path
  * @param spark       the spark session
  */
def addMissingScholixResource(workingPath: String, spark: SparkSession): Unit = {
  implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
  implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
  implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
  import spark.implicits._

  val resourceBase = scholixResourcePath(workingPath)

  // 1. Datacite results -> ScholixResource, discarding those without identifiers
  val dataciteResults = spark.read
    .load(dataciteOAFPath(workingPath))
    .as[Oaf]
    .filter(_.isInstanceOf[Result])
    .map(_.asInstanceOf[Result])

  dataciteResults
    .map(ScholixUtils.generateScholixResourceFromResult)
    .filter(r => r.getIdentifier != null && r.getIdentifier.size > 0)
    .write
    .mode(SaveMode.Overwrite)
    .save(s"${resourceBase}_update")

  // 2. merge with the native resources, keeping one resource per dnet identifier
  val updatedResources = spark.read.load(s"${resourceBase}_update").as[ScholixResource]
  val nativeResources = spark.read.load(s"${resourceBase}_native").as[ScholixResource]

  val merged = updatedResources
    .union(nativeResources)
    .groupByKey(_.getDnetIdentifier)
    .reduceGroups((a, b) => if (a != null && a.getDnetIdentifier != null) a else b)
    .map(_._2)

  merged.write.mode(SaveMode.Overwrite).save(s"${resourceBase}_graph")
}
|
||||
|
||||
/** Selects the Datacite records whose timestamp is greater than or equal to `timestamp`
  * and, when there is at least one, transforms them into OAF entities saved (overwriting)
  * in the Datacite OAF folder of the working path.
  *
  * @param datacitePath the datacite input path
  * @param timestamp    the lower bound (inclusive) for the record timestamp
  * @param workingPath  the working path where the generated Dataset is saved
  * @param spark        the spark session
  * @param vocabularies the vocabularies needed by the transformation
  * @return the number of selected Datacite records
  */
def getDataciteUpdate(
  datacitePath: String,
  timestamp: Long,
  workingPath: String,
  spark: SparkSession,
  vocabularies: VocabularyGroup
): Long = {
  import spark.implicits._
  val ds = spark.read.load(datacitePath).as[DataciteType]
  implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]

  val recentRecords = ds.filter(_.timestamp >= timestamp)
  val total = recentRecords.count()

  if (total > 0) {
    // NOTE(review): d.timestamp is passed as both the second and the third argument
    // of generateOAF — confirm this is intended.
    val generated = recentRecords.flatMap(d =>
      DataciteToOAFTransformation
        .generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks = true)
    )

    generated
      .flatMap(i => fixRelations(i))
      .filter(i => i != null)
      .write
      .mode(SaveMode.Overwrite)
      .save(dataciteOAFPath(workingPath))
  }

  total
}
|
||||
|
||||
/** Rebuilds the Scholix pid map once the new ScholixResources have been added, so that
  * it can be intersected with the new Datacite relations.
  *
  * Every identifier of every resource in the `_graph` dataset is turned into an
  * unresolved identifier and paired with the dnet identifier of its resource; a single
  * pair per unresolved identifier is kept and saved (overwriting) in the pid map folder.
  *
  * @param workingPath the working path under which the new map is saved
  * @param spark       the spark session
  */
def generatePidMap(workingPath: String, spark: SparkSession): Unit = {
  implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
  import spark.implicits._

  // encoder of the (unresolved identifier, dnet identifier) pairs
  val pairEncoder: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)

  val resources = spark.read
    .load(s"${scholixResourcePath(workingPath)}_graph")
    .as[ScholixResource]

  val pidToDnetId = resources.flatMap(r =>
    r.getIdentifier.asScala
      .map(i => DHPUtils.generateUnresolvedIdentifier(i.getIdentifier, i.getSchema))
      .map(t => (t, r.getDnetIdentifier))
  )(pairEncoder)

  pidToDnetId
    .groupByKey(_._1)
    .reduceGroups((a, b) => if (a != null && a._2 != null) a else b)
    .map(_._2)(pairEncoder)
    .write
    .mode(SaveMode.Overwrite)
    .save(pidMapPath(workingPath))
}
|
||||
|
||||
/**
|
||||
* This method resolve the datacite relation and filter the resolved
|
||||
* relation
|
||||
* @param workingPath the working path
|
||||
* @param spark the spark session
|
||||
*/
|
||||
/** This method resolves the Datacite relations and keeps only the resolved
|
||||
* relations
|
||||
* @param workingPath the working path
|
||||
* @param spark the spark session
|
||||
*/
|
||||
|
||||
def resolveUpdateRelation(workingPath:String, spark:SparkSession) :Unit = {
|
||||
implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
implicit val relationEncoder:Encoder[Relation] = Encoders.kryo[Relation]
|
||||
def resolveUpdateRelation(workingPath: String, spark: SparkSession): Unit = {
|
||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
implicit val relationEncoder: Encoder[Relation] = Encoders.kryo[Relation]
|
||||
import spark.implicits._
|
||||
|
||||
val pidMap = spark.read.load(pidMapPath(workingPath)).as[(String,String)]
|
||||
val pidMap = spark.read.load(pidMapPath(workingPath)).as[(String, String)]
|
||||
|
||||
val unresolvedRelations:Dataset[(String,Relation)] = spark.read.load(dataciteOAFPath(workingPath)).as[Oaf]
|
||||
val unresolvedRelations: Dataset[(String, Relation)] = spark.read
|
||||
.load(dataciteOAFPath(workingPath))
|
||||
.as[Oaf]
|
||||
.filter(_.isInstanceOf[Relation])
|
||||
.map(_.asInstanceOf[Relation])
|
||||
.map { r =>
|
||||
|
@ -193,7 +222,7 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
|
|||
unresolvedRelations
|
||||
.joinWith(pidMap, unresolvedRelations("_1").equalTo(pidMap("_1")))
|
||||
.map(t => {
|
||||
val r =t._1._2
|
||||
val r = t._1._2
|
||||
val resolvedIdentifier = t._2._2
|
||||
if (r.getSource.startsWith("unresolved"))
|
||||
r.setSource(resolvedIdentifier)
|
||||
|
@ -201,56 +230,62 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
|
|||
r.setTarget(resolvedIdentifier)
|
||||
r
|
||||
})(relationEncoder)
|
||||
.filter(r => !(r.getSource.startsWith("unresolved") || r.getTarget.startsWith("unresolved") ))
|
||||
.write.mode(SaveMode.Overwrite)
|
||||
.filter(r => !(r.getSource.startsWith("unresolved") || r.getTarget.startsWith("unresolved")))
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(resolvedRelationPath(workingPath))
|
||||
}
|
||||
|
||||
/** Generates the Scholix records starting from the resolved relations.
  *
  * The work is done in two passes over the `_graph` ScholixResource dataset:
  *  1. relations are joined with the resources on the relation source, and the
  *     result of `ScholixUtils.scholixFromSource`, keyed by the relation
  *     target, is saved under `$workingPath/scholix_one_verse`;
  *  1. that intermediate dataset is read back, joined with the resources on
  *     the target key, completed with `ScholixUtils.generateCompleteScholix`
  *     and saved under `$workingPath/scholix`.
  *
  * Both outputs are written with `SaveMode.Overwrite`.
  *
  * @param workingPath the working path holding the resolved relations and receiving the output
  * @param spark the spark session
  */
def generateScholixUpdate(workingPath: String, spark: SparkSession): Unit = {
  implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
  implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
  implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
  implicit val relationEncoder: Encoder[Relation] = Encoders.kryo[Relation]
  implicit val intermediateEncoder: Encoder[(String, Scholix)] =
    Encoders.tuple(Encoders.STRING, scholixEncoder)

  // Relations keyed by their source identifier.
  val relations: Dataset[(String, Relation)] = spark.read
    .load(resolvedRelationPath(workingPath))
    .as[Relation]
    .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relationEncoder))

  // Resources keyed by their dnet identifier; cached because both joins use them.
  val id_summary: Dataset[(String, ScholixResource)] = spark.read
    .load(s"${scholixResourcePath(workingPath)}_graph")
    .as[ScholixResource]
    .map(r => (r.getDnetIdentifier, r))(Encoders.tuple(Encoders.STRING, scholixResourceEncoder))

  id_summary.cache()

  // First pass: attach the source resource, re-key by the relation target.
  relations
    .joinWith(id_summary, relations("_1").equalTo(id_summary("_1")), "inner")
    .map(t => (t._1._2.getTarget, ScholixUtils.scholixFromSource(t._1._2, t._2._2)))
    .write
    .mode(SaveMode.Overwrite)
    .save(s"$workingPath/scholix_one_verse")

  val source_scholix: Dataset[(String, Scholix)] =
    spark.read.load(s"$workingPath/scholix_one_verse").as[(String, Scholix)]

  // Second pass: attach the target resource and emit the complete Scholix.
  source_scholix
    .joinWith(id_summary, source_scholix("_1").equalTo(id_summary("_1")), "inner")
    .map(t => {
      val target: ScholixResource = t._2._2
      val scholix: Scholix = t._1._2
      ScholixUtils.generateCompleteScholix(scholix, target)
    })(scholixEncoder)
    .write
    .mode(SaveMode.Overwrite)
    .save(s"$workingPath/scholix")
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Here all the spark applications runs this method
|
||||
* where the whole logic of the spark node is defined
|
||||
*/
|
||||
/** Here all the spark applications runs this method
|
||||
* where the whole logic of the spark node is defined
|
||||
*/
|
||||
override def run(): Unit = {
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info(s"SourcePath is '$sourcePath'")
|
||||
|
@ -258,7 +293,7 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
|
|||
val datacitePath = parser.get("datacitePath")
|
||||
log.info(s"DatacitePath is '$datacitePath'")
|
||||
|
||||
val workingPath = parser.get("workingSupportPath")
|
||||
val workingPath = parser.get("workingSupportPath")
|
||||
log.info(s"workingPath is '$workingPath'")
|
||||
|
||||
val isLookupUrl: String = parser.get("isLookupUrl")
|
||||
|
@ -268,38 +303,43 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
|
|||
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
|
||||
require(vocabularies != null)
|
||||
|
||||
|
||||
val updateDS:Boolean = "true".equalsIgnoreCase(parser.get("updateDS"))
|
||||
val updateDS: Boolean = "true".equalsIgnoreCase(parser.get("updateDS"))
|
||||
log.info(s"updateDS is '$updateDS'")
|
||||
|
||||
var lastCollectionDate = 0L
|
||||
if (updateDS) {
|
||||
generateScholixResource(s"$sourcePath/provision/summaries", workingPath, spark)
|
||||
log.info("Retrieve last entities collected From starting from scholix Graph")
|
||||
lastCollectionDate = retrieveLastCollectedFrom(spark, s"$sourcePath/entities")
|
||||
}
|
||||
else {
|
||||
lastCollectionDate = retrieveLastCollectedFrom(spark, s"$sourcePath/entities")
|
||||
} else {
|
||||
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
|
||||
fs.delete(new Path(s"${scholixResourcePath(workingPath)}_native"), true)
|
||||
fs.rename(new Path(s"${scholixResourcePath(workingPath)}_graph"), new Path(s"${scholixResourcePath(workingPath)}_native"))
|
||||
lastCollectionDate = retrieveLastCollectedFrom(spark, dataciteOAFPath(workingPath))
|
||||
fs.rename(
|
||||
new Path(s"${scholixResourcePath(workingPath)}_graph"),
|
||||
new Path(s"${scholixResourcePath(workingPath)}_native")
|
||||
)
|
||||
lastCollectionDate = retrieveLastCollectedFrom(spark, dataciteOAFPath(workingPath))
|
||||
}
|
||||
|
||||
val numRecords = getDataciteUpdate(datacitePath, lastCollectionDate, workingPath, spark, vocabularies)
|
||||
if (numRecords>0) {
|
||||
addMissingScholixResource(workingPath,spark)
|
||||
val numRecords =
|
||||
getDataciteUpdate(datacitePath, lastCollectionDate, workingPath, spark, vocabularies)
|
||||
if (numRecords > 0) {
|
||||
addMissingScholixResource(workingPath, spark)
|
||||
generatePidMap(workingPath, spark)
|
||||
resolveUpdateRelation(workingPath,spark)
|
||||
resolveUpdateRelation(workingPath, spark)
|
||||
generateScholixUpdate(workingPath, spark)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** Companion object holding the logger and the command-line entry point. */
object SparkRetrieveDataciteDelta {
  val log: Logger = LoggerFactory.getLogger(SparkRetrieveDataciteDelta.getClass)

  /** Builds the application with its parameter-definition resource, initializes
    * it and runs it exactly once.
    *
    * @param args the command-line arguments forwarded to the application
    */
  def main(args: Array[String]): Unit = {
    new SparkRetrieveDataciteDelta(
      "/eu/dnetlib/dhp/sx/graph/retrieve_datacite_delta_params.json",
      args,
      log
    ).initialize().run()
  }
}
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
package eu.dnetlib.dhp.datacite
|
||||
|
||||
|
||||
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
|
||||
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||
|
@ -20,95 +19,90 @@ import java.util.Locale
|
|||
import scala.io.Source
|
||||
|
||||
/** Tests for the Datacite to OAF conversion.
  *
  * A temporary working directory is created before each test and deleted
  * afterwards; vocabularies are prepared through `setUpVocabulary()` of the
  * parent class.
  */
@ExtendWith(Array(classOf[MockitoExtension]))
class DataciteToOAFTest extends AbstractVocabularyTest {

  // Assigned in setUp, removed in tearDown.
  private var workingDir: Path = null
  val log: Logger = LoggerFactory.getLogger(getClass)

  /** Creates the temporary working directory and loads the vocabularies. */
  @BeforeEach
  def setUp(): Unit = {
    workingDir = Files.createTempDirectory(getClass.getSimpleName)
    super.setUpVocabulary()
  }

  /** Deletes the temporary working directory. */
  @AfterEach
  def tearDown(): Unit = {
    FileUtils.deleteDirectory(workingDir.toFile)
  }

  /** Parses an ISO-8601 timestamp with a numeric zone offset and checks the epoch value. */
  @Test
  def testDateMapping(): Unit = {
    val inputDate = "2021-07-14T11:52:54+0000"
    val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
    val dt = ISO8601FORMAT.parse(inputDate)
    println(dt.getTime)
    // 2021-07-14T11:52:54Z expressed in milliseconds since the epoch.
    assertEquals(1626263574000L, dt.getTime)
  }

  /** Runs the dataset generation on the bundled sample and checks that the
    * input holds 100 records and that the output is not empty.
    */
  @Test
  def testConvert(): Unit = {

    val path = getClass.getResource("/eu/dnetlib/dhp/actionmanager/datacite/dataset").getPath

    val conf = new SparkConf()
    val spark: SparkSession = SparkSession
      .builder()
      .config(conf)
      .appName(getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    // Stop the session even when an assertion fails, so it is not left running.
    try {
      implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
      val instance = new GenerateDataciteDatasetSpark(null, null, log)
      val targetPath = s"$workingDir/result"

      instance.generateDataciteDataset(path, exportLinks = true, vocabularies, targetPath, spark)

      import spark.implicits._

      val nativeSize = spark.read.load(path).count()

      assertEquals(100, nativeSize)

      val result: Dataset[Oaf] = spark.read.load(targetPath).as[Oaf]

      // Print how many records of each concrete class were produced.
      result
        .map(s => s.getClass.getSimpleName)
        .groupBy(col("value").alias("class"))
        .agg(count("value").alias("Total"))
        .show(false)

      val t = spark.read.load(targetPath).count()

      assertTrue(t > 0)
    } finally {
      spark.stop()
    }
  }

  /** Converts a single bundled record and prints every generated entity as JSON. */
  @Test
  def testMapping(): Unit = {
    val record = Source
      .fromInputStream(
        getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/record.json")
      )
      .mkString

    val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
    val res: List[Oaf] = DataciteToOAFTransformation.generateOAF(record, 0L, 0L, vocabularies, true)

    res.foreach(r => {
      println(mapper.writeValueAsString(r))
      println("----------------------------")
    })
  }

}
|
|
@ -20,14 +20,13 @@ import scala.io.Source
|
|||
import scala.xml.pull.XMLEventReader
|
||||
|
||||
@ExtendWith(Array(classOf[MockitoExtension]))
|
||||
class BioScholixTest extends AbstractVocabularyTest{
|
||||
|
||||
class BioScholixTest extends AbstractVocabularyTest {
|
||||
|
||||
// Jackson mapper shared by the tests: indented output, and unknown JSON
// properties do not make deserialization fail.
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
|
||||
|
||||
/** Prepares the vocabularies before each test by delegating to the parent class. */
@BeforeEach
def setUp(): Unit = {
  super.setUpVocabulary()
}
|
||||
|
@ -38,52 +37,54 @@ class BioScholixTest extends AbstractVocabularyTest{
|
|||
}
|
||||
|
||||
/** Factory for iterators over the text content of a gzip-compressed stream. */
object GzFileIterator {

  /** Wraps `is` in a gzip decoder and a buffered character reader and hands
    * the reader to a `BufferedReaderIterator`.
    *
    * @param is the gzip-compressed input stream
    * @param encoding the charset name used to decode the decompressed bytes
    */
  def apply(is: InputStream, encoding: String) = {
    new BufferedReaderIterator(
      new BufferedReader(new InputStreamReader(new GZIPInputStream(is), encoding))
    )
  }
}
|
||||
|
||||
|
||||
|
||||
|
||||
/** Parses the bundled PubMed XML sample and prints every parsed item as JSON. */
@Test
def testEBIData(): Unit = {
  val inputXML = Source
    .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
    .mkString
  val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
  new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
}
|
||||
|
||||
|
||||
/** Converts the bundled PubMed dump (one JSON article per line) to OAF and
  * checks that 10 entities are produced and that at least one instance has
  * the instance-type class id "0037".
  */
@Test
def testPubmedToOaf(): Unit = {
  assertNotNull(vocabularies)
  assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
  val records: String = Source
    .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump"))
    .mkString
  val r: List[Oaf] = records.lines.toList
    .map(s => mapper.readValue(s, classOf[PMArticle]))
    .map(a => PubMedToOaf.convert(a, vocabularies))
  assertEquals(10, r.size)
  assertTrue(
    r.map(p => p.asInstanceOf[Result])
      .flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid))
      .exists(p => "0037".equalsIgnoreCase(p))
  )
  println(mapper.writeValueAsString(r.head))

}
|
||||
|
||||
|
||||
@Test
|
||||
def testPDBToOAF():Unit = {
|
||||
def testPDBToOAF(): Unit = {
|
||||
|
||||
assertNotNull(vocabularies)
|
||||
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
|
||||
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pdb_dump")).mkString
|
||||
val records: String = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pdb_dump"))
|
||||
.mkString
|
||||
records.lines.foreach(s => assertTrue(s.nonEmpty))
|
||||
|
||||
val result:List[Oaf]= records.lines.toList.flatMap(o => BioDBToOAF.pdbTOOaf(o))
|
||||
|
||||
|
||||
val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.pdbTOOaf(o))
|
||||
|
||||
assertTrue(result.nonEmpty)
|
||||
result.foreach(r => assertNotNull(r))
|
||||
|
@ -93,19 +94,18 @@ class BioScholixTest extends AbstractVocabularyTest{
|
|||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def testUNIprotToOAF():Unit = {
|
||||
def testUNIprotToOAF(): Unit = {
|
||||
|
||||
assertNotNull(vocabularies)
|
||||
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
|
||||
|
||||
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump")).mkString
|
||||
val records: String = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump"))
|
||||
.mkString
|
||||
records.lines.foreach(s => assertTrue(s.nonEmpty))
|
||||
|
||||
val result:List[Oaf]= records.lines.toList.flatMap(o => BioDBToOAF.uniprotToOAF(o))
|
||||
|
||||
|
||||
val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.uniprotToOAF(o))
|
||||
|
||||
assertTrue(result.nonEmpty)
|
||||
result.foreach(r => assertNotNull(r))
|
||||
|
@ -115,35 +115,42 @@ class BioScholixTest extends AbstractVocabularyTest{
|
|||
|
||||
}
|
||||
|
||||
/** One link extracted from an EBI links document by `parse_ebi_links`.
  *
  * @param relType the relationship name
  * @param date the publication date of the link
  * @param title the title of the link target
  * @param pmid the pmid of the publication the document refers to
  * @param targetPid the identifier of the link target
  * @param targetPidType the identifier scheme of the link target
  */
case class EBILinks(
  relType: String,
  date: String,
  title: String,
  pmid: String,
  targetPid: String,
  targetPidType: String
) {}
|
||||
|
||||
/** Extracts every link described in an EBI links JSON document.
  *
  * The pmid is read once from `publication.pmid`; then, for each object found
  * under a "Link" key, one [[EBILinks]] is produced per matching combination
  * of relationship name, publication date, target title and target identifier.
  *
  * @param input the JSON document as a string
  * @return the extracted links
  */
def parse_ebi_links(input: String): List[EBILinks] = {
  implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
  lazy val json = parse(input)
  val pmid = (json \ "publication" \ "pmid").extract[String]
  for {
    JObject(link)                                       <- json \\ "Link"
    JField("Target", JObject(target))                   <- link
    JField("RelationshipType", JObject(relType))        <- link
    JField("Name", JString(relation))                   <- relType
    JField("PublicationDate", JString(publicationDate)) <- link
    JField("Title", JString(title))                     <- target
    JField("Identifier", JObject(identifier))           <- target
    JField("IDScheme", JString(idScheme))               <- identifier
    JField("ID", JString(id))                           <- identifier

  } yield EBILinks(relation, publicationDate, title, pmid, id, idScheme)
}
|
||||
|
||||
|
||||
@Test
|
||||
def testCrossrefLinksToOAF():Unit = {
|
||||
def testCrossrefLinksToOAF(): Unit = {
|
||||
|
||||
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/crossref_links")).mkString
|
||||
val records: String = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/crossref_links"))
|
||||
.mkString
|
||||
records.lines.foreach(s => assertTrue(s.nonEmpty))
|
||||
|
||||
|
||||
val result:List[Oaf] =records.lines.map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList
|
||||
val result: List[Oaf] = records.lines.map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList
|
||||
|
||||
assertNotNull(result)
|
||||
assertTrue(result.nonEmpty)
|
||||
|
@ -153,36 +160,41 @@ class BioScholixTest extends AbstractVocabularyTest{
|
|||
}
|
||||
|
||||
/** Reads the first entry of the bundled gzip EBI links dump, converts its
  * links to OAF and prints the number of results and the first one as JSON.
  */
@Test
def testEBILinksToOAF(): Unit = {
  val iterator = GzFileIterator(
    getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/ebi_links.gz"),
    "UTF-8"
  )
  val data = iterator.next()

  val res = BioDBToOAF
    .parse_ebi_links(BioDBToOAF.extractEBILinksFromDump(data).links)
    .filter(BioDBToOAF.EBITargetLinksFilter)
    .flatMap(BioDBToOAF.convertEBILinksToOaf)
  print(res.length)

  println(mapper.writeValueAsString(res.head))

}
|
||||
|
||||
|
||||
|
||||
|
||||
/** Parses each line of the bundled resolved-scholix sample into a
  * `ScholixResolved`, converts it to OAF and checks the result is not empty.
  */
@Test
def scholixResolvedToOAF(): Unit = {

  val records: String = Source
    .fromInputStream(
      getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved")
    )
    .mkString
  // Every line of the sample is expected to carry a record.
  records.lines.foreach(s => assertTrue(s.nonEmpty))

  implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats

  val l: List[ScholixResolved] = records.lines.map { input =>
    lazy val json = parse(input)
    json.extract[ScholixResolved]
  }.toList

  val result: List[Oaf] = l.map(s => BioDBToOAF.scholixResolvedToOAF(s))

  assertTrue(result.nonEmpty)
}
|
||||
|
|
|
@ -16,10 +16,22 @@ import java.time.LocalDate
|
|||
import java.time.format.DateTimeFormatter
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/** Hosted-by entry, as extracted from JSON by `toHostedByItem`.
  *
  * @param id the identifier of the entry
  * @param officialname the official name of the entry
  * @param issn the issn value
  * @param eissn the eissn value
  * @param lissn the lissn value
  * @param openAccess the open-access flag of the entry
  */
case class HostedByItemType(
  id: String,
  officialname: String,
  issn: String,
  eissn: String,
  lissn: String,
  openAccess: Boolean
) {}
|
||||
|
||||
/** Affiliation record linking a paper to an affiliation.
  *
  * @param PaperId the paper identifier
  * @param AffiliationId the affiliation identifier
  * @param GridId the optional grid identifier
  * @param OfficialPage the optional official page
  * @param DisplayName the optional display name
  */
case class DoiBoostAffiliation(
  PaperId: Long,
  AffiliationId: Long,
  GridId: Option[String],
  OfficialPage: Option[String],
  DisplayName: Option[String]
) {}
|
||||
|
||||
object DoiBoostMappingUtil {
|
||||
|
||||
|
@ -43,9 +55,19 @@ object DoiBoostMappingUtil {
|
|||
val DOI_PREFIX_REGEX = "(^10\\.|\\/10.)"
|
||||
val DOI_PREFIX = "10."
|
||||
|
||||
// Literal name values treated as invalid.
val invalidName = List(
  ",",
  "none none",
  "none, none",
  "none &na;",
  "(:null)",
  "test test test",
  "test test",
  "test",
  "&na; &na;"
)
|
||||
|
||||
def toActionSet(item:Oaf) :(String, String) = {
|
||||
def toActionSet(item: Oaf): (String, String) = {
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
item match {
|
||||
|
@ -75,59 +97,56 @@ object DoiBoostMappingUtil {
|
|||
|
||||
}
|
||||
|
||||
|
||||
/** Parses a JSON object whose fields map a key to a [[HostedByItemType]] and
  * returns its first entry.
  *
  * An empty object fails with a `NoSuchElementException`, because the first
  * entry is taken unconditionally.
  *
  * @param input the JSON document as a string
  * @return the first (key, item) pair of the parsed map
  */
def toHostedByItem(input: String): (String, HostedByItemType) = {
  implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats

  lazy val json: json4s.JValue = parse(input)
  val c: Map[String, HostedByItemType] = json.extract[Map[String, HostedByItemType]]
  // First key and first value of the same map are exactly its first entry.
  c.head
}
|
||||
|
||||
|
||||
/** Pairs a publication with the key used to look it up by ISSN.
  *
  * The key is the first non-null, non-empty value among the journal's printed,
  * online and linking ISSN, in that order; when the publication has no journal
  * or none of the three is set, the publication id is used instead.
  *
  * @param publication the publication to key
  * @return the (key, publication) pair
  */
def toISSNPair(publication: Publication): (String, Publication) = {
  // No journal means no ISSN candidates at all.
  val candidates = Option(publication.getJournal).toList
    .flatMap(j => List(j.getIssnPrinted, j.getIssnOnline, j.getIssnLinking))

  val key = candidates
    .find(issn => issn != null && issn.nonEmpty)
    .getOrElse(publication.getId)

  (key, publication)
}
|
||||
|
||||
|
||||
|
||||
|
||||
/** Builds an affiliation identifier from a grid id: the fixed prefix
  * `20|grid________::` followed by `DHPUtils.md5` of the lower-cased, trimmed
  * grid id.
  *
  * @param gridId the grid identifier
  * @return the generated affiliation identifier
  */
def generateGridAffiliationId(gridId: String): String = {
  s"20|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}"
}
|
||||
|
||||
|
||||
/** Normalizes the instances of a dataset in place and returns it.
  *
  * When `extractInstance` finds an instance carrying an instance type, that
  * type is copied onto every instance; in all cases every instance gets
  * `ModelConstants.UNKNOWN_REPOSITORY` as hosted-by.
  *
  * @param result the dataset to fix (mutated)
  * @return the same dataset instance
  */
def fixResult(result: Dataset): Dataset = {
  extractInstance(result).foreach { typed =>
    result.getInstance().asScala.foreach(i => i.setInstancetype(typed.getInstancetype))
  }
  result
    .getInstance()
    .asScala
    .foreach(i => i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY))
  result
}
|
||||
|
||||
|
||||
def decideAccessRight(lic : Field[String], date:String) : AccessRight = {
|
||||
if(lic == null){
|
||||
def decideAccessRight(lic: Field[String], date: String): AccessRight = {
|
||||
if (lic == null) {
|
||||
//Default value Unknown
|
||||
return getUnknownQualifier()
|
||||
}
|
||||
val license : String = lic.getValue
|
||||
val license: String = lic.getValue
|
||||
//CC licenses
|
||||
if(license.startsWith("cc") ||
|
||||
if (
|
||||
license.startsWith("cc") ||
|
||||
license.startsWith("http://creativecommons.org/licenses") ||
|
||||
license.startsWith("https://creativecommons.org/licenses") ||
|
||||
|
||||
|
@ -137,40 +156,44 @@ object DoiBoostMappingUtil {
|
|||
license.equals("http://pubs.acs.org/page/policy/authorchoice_ccbyncnd_termsofuse.html") ||
|
||||
|
||||
//APA (considered OPEN also by Unpaywall)
|
||||
license.equals("http://www.apa.org/pubs/journals/resources/open-access.aspx")){
|
||||
license.equals("http://www.apa.org/pubs/journals/resources/open-access.aspx")
|
||||
) {
|
||||
|
||||
val oaq : AccessRight = getOpenAccessQualifier()
|
||||
val oaq: AccessRight = getOpenAccessQualifier()
|
||||
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
|
||||
return oaq
|
||||
}
|
||||
|
||||
//OUP (BUT ONLY AFTER 12 MONTHS FROM THE PUBLICATION DATE, OTHERWISE THEY ARE EMBARGOED)
|
||||
if(license.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")){
|
||||
if (
|
||||
license.equals(
|
||||
"https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"
|
||||
)
|
||||
) {
|
||||
val now = java.time.LocalDate.now
|
||||
|
||||
try{
|
||||
try {
|
||||
val pub_date = LocalDate.parse(date, DateTimeFormatter.ofPattern("yyyy-MM-dd"))
|
||||
if (((now.toEpochDay - pub_date.toEpochDay)/365.0) > 1){
|
||||
val oaq : AccessRight = getOpenAccessQualifier()
|
||||
if (((now.toEpochDay - pub_date.toEpochDay) / 365.0) > 1) {
|
||||
val oaq: AccessRight = getOpenAccessQualifier()
|
||||
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
|
||||
return oaq
|
||||
}
|
||||
else{
|
||||
} else {
|
||||
return getEmbargoedAccessQualifier()
|
||||
}
|
||||
}catch {
|
||||
} catch {
|
||||
case e: Exception => {
|
||||
try{
|
||||
val pub_date = LocalDate.parse(date, DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'"))
|
||||
if (((now.toEpochDay - pub_date.toEpochDay)/365.0) > 1){
|
||||
val oaq : AccessRight = getOpenAccessQualifier()
|
||||
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
|
||||
return oaq
|
||||
}
|
||||
else{
|
||||
return getEmbargoedAccessQualifier()
|
||||
}
|
||||
}catch{
|
||||
try {
|
||||
val pub_date =
|
||||
LocalDate.parse(date, DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'"))
|
||||
if (((now.toEpochDay - pub_date.toEpochDay) / 365.0) > 1) {
|
||||
val oaq: AccessRight = getOpenAccessQualifier()
|
||||
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
|
||||
return oaq
|
||||
} else {
|
||||
return getEmbargoedAccessQualifier()
|
||||
}
|
||||
} catch {
|
||||
case ex: Exception => return getClosedAccessQualifier()
|
||||
}
|
||||
}
|
||||
|
@ -183,64 +206,91 @@ object DoiBoostMappingUtil {
|
|||
|
||||
}
|
||||
|
||||
/** Builds the access right for open access: `ModelConstants.ACCESS_RIGHT_OPEN`
  * with the label "Open Access", passing `ModelConstants.DNET_ACCESS_MODES`
  * for both remaining arguments of `OafMapperUtils.accessRight`
  * (presumably scheme id and scheme name; confirm against OafMapperUtils).
  */
def getOpenAccessQualifier(): AccessRight = {
  OafMapperUtils.accessRight(
    ModelConstants.ACCESS_RIGHT_OPEN,
    "Open Access",
    ModelConstants.DNET_ACCESS_MODES,
    ModelConstants.DNET_ACCESS_MODES
  )
}
|
||||
|
||||
/** Builds the access right for restricted access: "RESTRICTED" with the label
  * "Restricted", passing `ModelConstants.DNET_ACCESS_MODES` for both remaining
  * arguments of `OafMapperUtils.accessRight`.
  */
def getRestrictedQualifier(): AccessRight = {
  OafMapperUtils.accessRight(
    "RESTRICTED",
    "Restricted",
    ModelConstants.DNET_ACCESS_MODES,
    ModelConstants.DNET_ACCESS_MODES
  )
}
|
||||
|
||||
|
||||
/** Builds the access right used when nothing is known:
  * `ModelConstants.UNKNOWN` with the label `ModelConstants.NOT_AVAILABLE`,
  * passing `ModelConstants.DNET_ACCESS_MODES` for both remaining arguments of
  * `OafMapperUtils.accessRight`.
  */
def getUnknownQualifier(): AccessRight = {
  OafMapperUtils.accessRight(
    ModelConstants.UNKNOWN,
    ModelConstants.NOT_AVAILABLE,
    ModelConstants.DNET_ACCESS_MODES,
    ModelConstants.DNET_ACCESS_MODES
  )
}
|
||||
|
||||
|
||||
/** Builds the access right for embargoed content: "EMBARGO" with the label
  * "Embargo", passing `ModelConstants.DNET_ACCESS_MODES` for both remaining
  * arguments of `OafMapperUtils.accessRight`.
  */
def getEmbargoedAccessQualifier(): AccessRight = {
  OafMapperUtils.accessRight(
    "EMBARGO",
    "Embargo",
    ModelConstants.DNET_ACCESS_MODES,
    ModelConstants.DNET_ACCESS_MODES
  )
}
|
||||
|
||||
/** Builds the access right for closed access: "CLOSED" with the label
  * "Closed Access", passing `ModelConstants.DNET_ACCESS_MODES` for both
  * remaining arguments of `OafMapperUtils.accessRight`.
  */
def getClosedAccessQualifier(): AccessRight = {
  OafMapperUtils.accessRight(
    "CLOSED",
    "Closed Access",
    ModelConstants.DNET_ACCESS_MODES,
    ModelConstants.DNET_ACCESS_MODES
  )
}
|
||||
|
||||
|
||||
def extractInstance(r:Result):Option[Instance] = {
|
||||
r.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
|
||||
def extractInstance(r: Result): Option[Instance] = {
|
||||
r.getInstance()
|
||||
.asScala
|
||||
.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
|
||||
}
|
||||
|
||||
def fixPublication(input:((String,Publication), (String,HostedByItemType))): Publication = {
|
||||
def fixPublication(input: ((String, Publication), (String, HostedByItemType))): Publication = {
|
||||
|
||||
val publication = input._1._2
|
||||
|
||||
val item = if (input._2 != null) input._2._2 else null
|
||||
|
||||
val instanceType:Option[Instance] = extractInstance(publication)
|
||||
val instanceType: Option[Instance] = extractInstance(publication)
|
||||
|
||||
if (instanceType.isDefined) {
|
||||
publication.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
|
||||
publication
|
||||
.getInstance()
|
||||
.asScala
|
||||
.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
|
||||
}
|
||||
|
||||
publication.getInstance().asScala.foreach(i => {
|
||||
var hb = new KeyValue
|
||||
if (item != null) {
|
||||
hb.setValue(item.officialname)
|
||||
hb.setKey(item.id)
|
||||
if (item.openAccess) {
|
||||
i.setAccessright(getOpenAccessQualifier())
|
||||
i.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
|
||||
}
|
||||
publication
|
||||
.getInstance()
|
||||
.asScala
|
||||
.foreach(i => {
|
||||
var hb = new KeyValue
|
||||
if (item != null) {
|
||||
hb.setValue(item.officialname)
|
||||
hb.setKey(item.id)
|
||||
if (item.openAccess) {
|
||||
i.setAccessright(getOpenAccessQualifier())
|
||||
i.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
|
||||
}
|
||||
|
||||
}
|
||||
else {
|
||||
hb = ModelConstants.UNKNOWN_REPOSITORY
|
||||
}
|
||||
i.setHostedby(hb)
|
||||
})
|
||||
} else {
|
||||
hb = ModelConstants.UNKNOWN_REPOSITORY
|
||||
}
|
||||
i.setHostedby(hb)
|
||||
})
|
||||
|
||||
publication.setBestaccessright(OafMapperUtils.createBestAccessRights(publication.getInstance()))
|
||||
|
||||
|
@ -270,17 +320,22 @@ object DoiBoostMappingUtil {
|
|||
if (publication.getTitle == null || publication.getTitle.size == 0)
|
||||
return false
|
||||
|
||||
|
||||
val s = publication.getTitle.asScala.count(p => p.getValue != null
|
||||
&& p.getValue.nonEmpty && !p.getValue.equalsIgnoreCase("[NO TITLE AVAILABLE]"))
|
||||
val s = publication.getTitle.asScala.count(p =>
|
||||
p.getValue != null
|
||||
&& p.getValue.nonEmpty && !p.getValue.equalsIgnoreCase("[NO TITLE AVAILABLE]")
|
||||
)
|
||||
|
||||
if (s == 0)
|
||||
return false
|
||||
|
||||
// fixes #4360 (test publisher)
|
||||
val publisher = if (publication.getPublisher != null) publication.getPublisher.getValue else null
|
||||
val publisher =
|
||||
if (publication.getPublisher != null) publication.getPublisher.getValue else null
|
||||
|
||||
if (publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher.equalsIgnoreCase("CrossRef Test Account"))) {
|
||||
if (
|
||||
publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher
|
||||
.equalsIgnoreCase("CrossRef Test Account"))
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -288,18 +343,12 @@ object DoiBoostMappingUtil {
|
|||
if (publication.getAuthor == null || publication.getAuthor.size() == 0)
|
||||
return false
|
||||
|
||||
|
||||
//filter invalid author
|
||||
val authors = publication.getAuthor.asScala.map(s => {
|
||||
if (s.getFullname.nonEmpty) {
|
||||
s.getFullname
|
||||
}
|
||||
else
|
||||
s"${
|
||||
s.getName
|
||||
} ${
|
||||
s.getSurname
|
||||
}"
|
||||
} else
|
||||
s"${s.getName} ${s.getSurname}"
|
||||
})
|
||||
|
||||
val c = authors.count(isValidAuthorName)
|
||||
|
@ -307,13 +356,16 @@ object DoiBoostMappingUtil {
|
|||
return false
|
||||
|
||||
// fixes #4368
|
||||
if (authors.count(s => s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase(publication.getPublisher.getValue))
|
||||
if (
|
||||
authors.count(s => s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase(
|
||||
publication.getPublisher.getValue
|
||||
)
|
||||
)
|
||||
return false
|
||||
|
||||
true
|
||||
}
|
||||
|
||||
|
||||
def isValidAuthorName(fullName: String): Boolean = {
|
||||
if (fullName == null || fullName.isEmpty)
|
||||
return false
|
||||
|
@ -322,32 +374,47 @@ object DoiBoostMappingUtil {
|
|||
true
|
||||
}
|
||||
|
||||
|
||||
def generateDataInfo(trust: String): DataInfo = {
|
||||
val di = new DataInfo
|
||||
di.setDeletedbyinference(false)
|
||||
di.setInferred(false)
|
||||
di.setInvisible(false)
|
||||
di.setTrust(trust)
|
||||
di.setProvenanceaction(OafMapperUtils.qualifier(ModelConstants.SYSIMPORT_ACTIONSET,ModelConstants.SYSIMPORT_ACTIONSET, ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS))
|
||||
di.setProvenanceaction(
|
||||
OafMapperUtils.qualifier(
|
||||
ModelConstants.SYSIMPORT_ACTIONSET,
|
||||
ModelConstants.SYSIMPORT_ACTIONSET,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS
|
||||
)
|
||||
)
|
||||
di
|
||||
}
|
||||
|
||||
|
||||
|
||||
def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String): StructuredProperty = {
|
||||
def createSP(
|
||||
value: String,
|
||||
classId: String,
|
||||
className: String,
|
||||
schemeId: String,
|
||||
schemeName: String
|
||||
): StructuredProperty = {
|
||||
val sp = new StructuredProperty
|
||||
sp.setQualifier(OafMapperUtils.qualifier(classId,className, schemeId, schemeName))
|
||||
sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName))
|
||||
sp.setValue(value)
|
||||
sp
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String, dataInfo: DataInfo): StructuredProperty = {
|
||||
def createSP(
|
||||
value: String,
|
||||
classId: String,
|
||||
className: String,
|
||||
schemeId: String,
|
||||
schemeName: String,
|
||||
dataInfo: DataInfo
|
||||
): StructuredProperty = {
|
||||
val sp = new StructuredProperty
|
||||
sp.setQualifier(OafMapperUtils.qualifier(classId,className, schemeId, schemeName))
|
||||
sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName))
|
||||
sp.setValue(value)
|
||||
sp.setDataInfo(dataInfo)
|
||||
sp
|
||||
|
@ -356,17 +423,20 @@ object DoiBoostMappingUtil {
|
|||
|
||||
def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
|
||||
val sp = new StructuredProperty
|
||||
sp.setQualifier(OafMapperUtils.qualifier(classId,classId, schemeId, schemeId))
|
||||
sp.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId))
|
||||
sp.setValue(value)
|
||||
sp
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
|
||||
def createSP(
|
||||
value: String,
|
||||
classId: String,
|
||||
schemeId: String,
|
||||
dataInfo: DataInfo
|
||||
): StructuredProperty = {
|
||||
val sp = new StructuredProperty
|
||||
sp.setQualifier(OafMapperUtils.qualifier(classId,classId, schemeId, schemeId))
|
||||
sp.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId))
|
||||
sp.setValue(value)
|
||||
sp.setDataInfo(dataInfo)
|
||||
sp
|
||||
|
@ -382,7 +452,6 @@ object DoiBoostMappingUtil {
|
|||
|
||||
}
|
||||
|
||||
|
||||
def createUnpayWallCollectedFrom(): KeyValue = {
|
||||
|
||||
val cf = new KeyValue
|
||||
|
@ -401,15 +470,11 @@ object DoiBoostMappingUtil {
|
|||
|
||||
}
|
||||
|
||||
|
||||
def generateIdentifier (oaf: Result, doi: String): String = {
|
||||
val id = DHPUtils.md5 (doi.toLowerCase)
|
||||
def generateIdentifier(oaf: Result, doi: String): String = {
|
||||
val id = DHPUtils.md5(doi.toLowerCase)
|
||||
s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}"
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
def createMAGCollectedFrom(): KeyValue = {
|
||||
|
||||
val cf = new KeyValue
|
||||
|
@ -424,19 +489,21 @@ object DoiBoostMappingUtil {
|
|||
tmp.setValue(value)
|
||||
tmp
|
||||
|
||||
|
||||
}
|
||||
|
||||
/** Tells whether a string carries no usable content.
  *
  * @param x the string to test, may be null
  * @return true when `x` is null or is empty once trimmed
  */
def isEmpty(x: String): Boolean = x == null || x.trim.isEmpty
|
||||
|
||||
def normalizeDoi(input : String) :String ={
|
||||
if(input == null)
|
||||
def normalizeDoi(input: String): String = {
|
||||
if (input == null)
|
||||
return null
|
||||
val replaced = input.replaceAll("(?:\\n|\\r|\\t|\\s)", "").toLowerCase.replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX)
|
||||
if (isEmpty(replaced))
|
||||
val replaced = input
|
||||
.replaceAll("(?:\\n|\\r|\\t|\\s)", "")
|
||||
.toLowerCase
|
||||
.replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX)
|
||||
if (isEmpty(replaced))
|
||||
return null
|
||||
|
||||
if(replaced.indexOf("10.") < 0)
|
||||
if (replaced.indexOf("10.") < 0)
|
||||
return null
|
||||
|
||||
val ret = replaced.substring(replaced.indexOf("10."))
|
||||
|
@ -446,9 +513,6 @@ object DoiBoostMappingUtil {
|
|||
|
||||
return ret
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -17,22 +17,29 @@ object SparkGenerateDOIBoostActionSet {
|
|||
def main(args: Array[String]): Unit = {
|
||||
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json")
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
|
||||
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
|
||||
implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
|
||||
implicit val mapEncoderAS: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)
|
||||
implicit val mapEncoderAS: Encoder[(String, String)] =
|
||||
Encoders.tuple(Encoders.STRING, Encoders.STRING)
|
||||
|
||||
implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] = Encoders.kryo[AtomicAction[OafDataset]]
|
||||
implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] =
|
||||
Encoders.kryo[AtomicAction[OafDataset]]
|
||||
|
||||
val dbPublicationPath = parser.get("dbPublicationPath")
|
||||
val dbDatasetPath = parser.get("dbDatasetPath")
|
||||
|
@ -41,35 +48,61 @@ object SparkGenerateDOIBoostActionSet {
|
|||
val dbOrganizationPath = parser.get("dbOrganizationPath")
|
||||
val sequenceFilePath = parser.get("sFilePath")
|
||||
|
||||
val asDataset = spark.read.load(dbDatasetPath).as[OafDataset]
|
||||
val asDataset = spark.read
|
||||
.load(dbDatasetPath)
|
||||
.as[OafDataset]
|
||||
// Keep only non-null records that carry an id. The conjunction matters: with `||` a null
// record would be dereferenced (NullPointerException) and a null id would never be dropped.
.filter(p => p != null && p.getId != null)
|
||||
.map(d => DoiBoostMappingUtil.fixResult(d))
|
||||
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||
.map(d => DoiBoostMappingUtil.toActionSet(d))(
|
||||
Encoders.tuple(Encoders.STRING, Encoders.STRING)
|
||||
)
|
||||
|
||||
|
||||
val asPublication = spark.read.load(dbPublicationPath).as[Publication]
|
||||
val asPublication = spark.read
|
||||
.load(dbPublicationPath)
|
||||
.as[Publication]
|
||||
// Keep only non-null records that carry an id. The conjunction matters: with `||` a null
// record would be dereferenced (NullPointerException) and a null id would never be dropped.
.filter(p => p != null && p.getId != null)
|
||||
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||
.map(d => DoiBoostMappingUtil.toActionSet(d))(
|
||||
Encoders.tuple(Encoders.STRING, Encoders.STRING)
|
||||
)
|
||||
|
||||
val asOrganization = spark.read
|
||||
.load(dbOrganizationPath)
|
||||
.as[Organization]
|
||||
.map(d => DoiBoostMappingUtil.toActionSet(d))(
|
||||
Encoders.tuple(Encoders.STRING, Encoders.STRING)
|
||||
)
|
||||
|
||||
val asOrganization = spark.read.load(dbOrganizationPath).as[Organization]
|
||||
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||
|
||||
|
||||
val asCRelation = spark.read.load(crossRefRelation).as[Relation]
|
||||
val asCRelation = spark.read
|
||||
.load(crossRefRelation)
|
||||
.as[Relation]
|
||||
.filter(r => r != null && r.getSource != null && r.getTarget != null)
|
||||
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||
.map(d => DoiBoostMappingUtil.toActionSet(d))(
|
||||
Encoders.tuple(Encoders.STRING, Encoders.STRING)
|
||||
)
|
||||
|
||||
val asRelAffiliation = spark.read
|
||||
.load(dbaffiliationRelationPath)
|
||||
.as[Relation]
|
||||
.map(d => DoiBoostMappingUtil.toActionSet(d))(
|
||||
Encoders.tuple(Encoders.STRING, Encoders.STRING)
|
||||
)
|
||||
|
||||
val asRelAffiliation = spark.read.load(dbaffiliationRelationPath).as[Relation]
|
||||
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||
|
||||
|
||||
val d: Dataset[(String, String)] = asDataset.union(asPublication).union(asOrganization).union(asCRelation).union(asRelAffiliation)
|
||||
|
||||
|
||||
d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])
|
||||
val d: Dataset[(String, String)] = asDataset
|
||||
.union(asPublication)
|
||||
.union(asOrganization)
|
||||
.union(asCRelation)
|
||||
.union(asRelAffiliation)
|
||||
|
||||
d.rdd
|
||||
.repartition(6000)
|
||||
.map(s => (new Text(s._1), new Text(s._2)))
|
||||
.saveAsHadoopFile(
|
||||
s"$sequenceFilePath",
|
||||
classOf[Text],
|
||||
classOf[Text],
|
||||
classOf[SequenceFileOutputFormat[Text, Text]],
|
||||
classOf[GzipCodec]
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -15,8 +15,8 @@ import org.json4s.JsonAST.{JField, JObject, JString}
|
|||
import org.json4s.jackson.JsonMethods.parse
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
import scala.collection.JavaConverters._
|
||||
object SparkGenerateDoiBoost {
|
||||
|
||||
object SparkGenerateDoiBoost {
|
||||
|
||||
def extractIdGRID(input: String): List[(String, String)] = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
|
@ -26,28 +26,32 @@ object SparkGenerateDoiBoost {
|
|||
|
||||
val grids: List[String] = for {
|
||||
|
||||
JObject(pid) <- json \ "pid"
|
||||
JObject(pid) <- json \ "pid"
|
||||
JField("qualifier", JObject(qualifier)) <- pid
|
||||
JField("classid", JString(classid)) <- qualifier
|
||||
JField("value", JString(vl)) <- pid
|
||||
JField("classid", JString(classid)) <- qualifier
|
||||
JField("value", JString(vl)) <- pid
|
||||
if classid == "GRID"
|
||||
} yield vl
|
||||
grids.map(g => (id, s"unresolved::grid::${g.toLowerCase}"))(collection.breakOut)
|
||||
}
|
||||
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json")
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
import spark.implicits._
|
||||
|
||||
|
@ -55,7 +59,8 @@ object SparkGenerateDoiBoost {
|
|||
val workingDirPath = parser.get("workingPath")
|
||||
val openaireOrganizationPath = parser.get("openaireOrganizationPath")
|
||||
|
||||
val crossrefAggregator = new Aggregator[(String, Publication), Publication, Publication] with Serializable {
|
||||
val crossrefAggregator = new Aggregator[(String, Publication), Publication, Publication]
|
||||
with Serializable {
|
||||
override def zero: Publication = new Publication
|
||||
|
||||
override def reduce(b: Publication, a: (String, Publication)): Publication = {
|
||||
|
@ -65,8 +70,7 @@ object SparkGenerateDoiBoost {
|
|||
a._2.setId(a._1)
|
||||
return a._2
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
if (a != null && a._2 != null) {
|
||||
b.mergeFrom(a._2)
|
||||
b.setId(a._1)
|
||||
|
@ -82,8 +86,7 @@ object SparkGenerateDoiBoost {
|
|||
if (b1 == null) {
|
||||
if (b2 != null)
|
||||
return b2
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
if (b2 != null) {
|
||||
b1.mergeFrom(b2)
|
||||
val authors = AuthorMerger.mergeAuthor(b1.getAuthor, b2.getAuthor)
|
||||
|
@ -103,17 +106,19 @@ object SparkGenerateDoiBoost {
|
|||
override def outputEncoder: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
}
|
||||
|
||||
|
||||
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
|
||||
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
|
||||
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
|
||||
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] =
|
||||
Encoders.tuple(Encoders.STRING, mapEncoderPub)
|
||||
implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
|
||||
|
||||
logger.info("Phase 2) Join Crossref with UnpayWall")
|
||||
|
||||
val crossrefPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p => (p.getId, p))
|
||||
val uwPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/uwPublication").as[Publication].map(p => (p.getId, p))
|
||||
val crossrefPublication: Dataset[(String, Publication)] =
|
||||
spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p => (p.getId, p))
|
||||
val uwPublication: Dataset[(String, Publication)] =
|
||||
spark.read.load(s"$workingDirPath/uwPublication").as[Publication].map(p => (p.getId, p))
|
||||
|
||||
def applyMerge(item: ((String, Publication), (String, Publication))): Publication = {
|
||||
val crossrefPub = item._1._2
|
||||
|
@ -127,86 +132,140 @@ object SparkGenerateDoiBoost {
|
|||
crossrefPub
|
||||
}
|
||||
|
||||
crossrefPublication.joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/firstJoin")
|
||||
crossrefPublication
|
||||
.joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")), "left")
|
||||
.map(applyMerge)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingDirPath/firstJoin")
|
||||
logger.info("Phase 3) Join Result with ORCID")
|
||||
val fj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p))
|
||||
val orcidPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p))
|
||||
fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/secondJoin")
|
||||
val fj: Dataset[(String, Publication)] =
|
||||
spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p))
|
||||
val orcidPublication: Dataset[(String, Publication)] =
|
||||
spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p))
|
||||
fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left")
|
||||
.map(applyMerge)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingDirPath/secondJoin")
|
||||
|
||||
logger.info("Phase 4) Join Result with MAG")
|
||||
val sj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p))
|
||||
val sj: Dataset[(String, Publication)] =
|
||||
spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p))
|
||||
|
||||
val magPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p))
|
||||
sj.joinWith(magPublication, sj("_1").equalTo(magPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublication")
|
||||
val magPublication: Dataset[(String, Publication)] =
|
||||
spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p))
|
||||
sj.joinWith(magPublication, sj("_1").equalTo(magPublication("_1")), "left")
|
||||
.map(applyMerge)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingDirPath/doiBoostPublication")
|
||||
|
||||
val doiBoostPublication: Dataset[(String, Publication)] = spark.read
|
||||
.load(s"$workingDirPath/doiBoostPublication")
|
||||
.as[Publication]
|
||||
.filter(p => DoiBoostMappingUtil.filterPublication(p))
|
||||
.map(DoiBoostMappingUtil.toISSNPair)(tupleForJoinEncoder)
|
||||
|
||||
val doiBoostPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublication").as[Publication].filter(p => DoiBoostMappingUtil.filterPublication(p)).map(DoiBoostMappingUtil.toISSNPair)(tupleForJoinEncoder)
|
||||
val hostedByDataset: Dataset[(String, HostedByItemType)] = spark.createDataset(
|
||||
spark.sparkContext.textFile(hostedByMapPath).map(DoiBoostMappingUtil.toHostedByItem)
|
||||
)
|
||||
|
||||
val hostedByDataset: Dataset[(String, HostedByItemType)] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath).map(DoiBoostMappingUtil.toHostedByItem))
|
||||
|
||||
|
||||
doiBoostPublication.joinWith(hostedByDataset, doiBoostPublication("_1").equalTo(hostedByDataset("_1")), "left")
|
||||
doiBoostPublication
|
||||
.joinWith(hostedByDataset, doiBoostPublication("_1").equalTo(hostedByDataset("_1")), "left")
|
||||
.map(DoiBoostMappingUtil.fixPublication)
|
||||
.map(p => (p.getId, p))
|
||||
.groupByKey(_._1)
|
||||
.agg(crossrefAggregator.toColumn)
|
||||
.map(p => p._2)
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationFiltered")
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingDirPath/doiBoostPublicationFiltered")
|
||||
|
||||
val affiliationPath = parser.get("affiliationPath")
|
||||
val paperAffiliationPath = parser.get("paperAffiliationPath")
|
||||
|
||||
val affiliation = spark.read.load(affiliationPath).select(col("AffiliationId"), col("GridId"), col("OfficialPage"), col("DisplayName"))
|
||||
|
||||
val paperAffiliation = spark.read.load(paperAffiliationPath).select(col("AffiliationId").alias("affId"), col("PaperId"))
|
||||
val affiliation = spark.read
|
||||
.load(affiliationPath)
|
||||
.select(col("AffiliationId"), col("GridId"), col("OfficialPage"), col("DisplayName"))
|
||||
|
||||
val paperAffiliation = spark.read
|
||||
.load(paperAffiliationPath)
|
||||
.select(col("AffiliationId").alias("affId"), col("PaperId"))
|
||||
|
||||
val a: Dataset[DoiBoostAffiliation] = paperAffiliation
|
||||
.joinWith(affiliation, paperAffiliation("affId").equalTo(affiliation("AffiliationId")))
|
||||
.select(col("_1.PaperId"), col("_2.AffiliationId"), col("_2.GridId"), col("_2.OfficialPage"), col("_2.DisplayName")).as[DoiBoostAffiliation]
|
||||
.select(
|
||||
col("_1.PaperId"),
|
||||
col("_2.AffiliationId"),
|
||||
col("_2.GridId"),
|
||||
col("_2.OfficialPage"),
|
||||
col("_2.DisplayName")
|
||||
)
|
||||
.as[DoiBoostAffiliation]
|
||||
|
||||
val magPubs: Dataset[(String, Publication)] = spark.read
|
||||
.load(s"$workingDirPath/doiBoostPublicationFiltered")
|
||||
.as[Publication]
|
||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(
|
||||
tupleForJoinEncoder
|
||||
)
|
||||
.filter(s => s._1 != null)
|
||||
|
||||
val magPubs: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublicationFiltered").as[Publication]
|
||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(tupleForJoinEncoder).filter(s => s._1 != null)
|
||||
magPubs
|
||||
.joinWith(a, magPubs("_1").equalTo(a("PaperId")))
|
||||
.flatMap(item => {
|
||||
val pub: Publication = item._1._2
|
||||
val affiliation = item._2
|
||||
val affId: String =
|
||||
if (affiliation.GridId.isDefined)
|
||||
s"unresolved::grid::${affiliation.GridId.get.toLowerCase}"
|
||||
else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString)
|
||||
val r: Relation = new Relation
|
||||
r.setSource(pub.getId)
|
||||
r.setTarget(affId)
|
||||
r.setRelType(ModelConstants.RESULT_ORGANIZATION)
|
||||
r.setRelClass(ModelConstants.HAS_AUTHOR_INSTITUTION)
|
||||
r.setSubRelType(ModelConstants.AFFILIATION)
|
||||
r.setDataInfo(pub.getDataInfo)
|
||||
r.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
|
||||
val r1: Relation = new Relation
|
||||
r1.setTarget(pub.getId)
|
||||
r1.setSource(affId)
|
||||
r1.setRelType(ModelConstants.RESULT_ORGANIZATION)
|
||||
r1.setRelClass(ModelConstants.IS_AUTHOR_INSTITUTION_OF)
|
||||
r1.setSubRelType(ModelConstants.AFFILIATION)
|
||||
r1.setDataInfo(pub.getDataInfo)
|
||||
r1.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
|
||||
List(r, r1)
|
||||
})(mapEncoderRel)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved")
|
||||
|
||||
val unresolvedRels: Dataset[(String, Relation)] = spark.read
|
||||
.load(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved")
|
||||
.as[Relation]
|
||||
.map(r => {
|
||||
|
||||
magPubs.joinWith(a, magPubs("_1").equalTo(a("PaperId"))).flatMap(item => {
|
||||
val pub: Publication = item._1._2
|
||||
val affiliation = item._2
|
||||
val affId: String = if (affiliation.GridId.isDefined) s"unresolved::grid::${affiliation.GridId.get.toLowerCase}" else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString)
|
||||
val r: Relation = new Relation
|
||||
r.setSource(pub.getId)
|
||||
r.setTarget(affId)
|
||||
r.setRelType(ModelConstants.RESULT_ORGANIZATION)
|
||||
r.setRelClass(ModelConstants.HAS_AUTHOR_INSTITUTION)
|
||||
r.setSubRelType(ModelConstants.AFFILIATION)
|
||||
r.setDataInfo(pub.getDataInfo)
|
||||
r.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
|
||||
val r1: Relation = new Relation
|
||||
r1.setTarget(pub.getId)
|
||||
r1.setSource(affId)
|
||||
r1.setRelType(ModelConstants.RESULT_ORGANIZATION)
|
||||
r1.setRelClass(ModelConstants.IS_AUTHOR_INSTITUTION_OF)
|
||||
r1.setSubRelType(ModelConstants.AFFILIATION)
|
||||
r1.setDataInfo(pub.getDataInfo)
|
||||
r1.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
|
||||
List(r, r1)
|
||||
})(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved")
|
||||
if (r.getSource.startsWith("unresolved"))
|
||||
(r.getSource, r)
|
||||
else if (r.getTarget.startsWith("unresolved"))
|
||||
(r.getTarget, r)
|
||||
else
|
||||
("resolved", r)
|
||||
})(Encoders.tuple(Encoders.STRING, mapEncoderRel))
|
||||
|
||||
val openaireOrganization: Dataset[(String, String)] = spark.read
|
||||
.text(openaireOrganizationPath)
|
||||
.as[String]
|
||||
.flatMap(s => extractIdGRID(s))
|
||||
.groupByKey(_._2)
|
||||
.reduceGroups((x, y) => if (x != null) x else y)
|
||||
.map(_._2)
|
||||
|
||||
val unresolvedRels: Dataset[(String, Relation)] = spark.read.load(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved").as[Relation].map(r => {
|
||||
|
||||
if (r.getSource.startsWith("unresolved"))
|
||||
(r.getSource, r)
|
||||
else if (r.getTarget.startsWith("unresolved"))
|
||||
(r.getTarget, r)
|
||||
else
|
||||
("resolved", r)
|
||||
})(Encoders.tuple(Encoders.STRING, mapEncoderRel))
|
||||
|
||||
val openaireOrganization: Dataset[(String, String)] = spark.read.text(openaireOrganizationPath).as[String].flatMap(s => extractIdGRID(s)).groupByKey(_._2).reduceGroups((x, y) => if (x != null) x else y).map(_._2)
|
||||
|
||||
unresolvedRels.joinWith(openaireOrganization, unresolvedRels("_1").equalTo(openaireOrganization("_2")))
|
||||
unresolvedRels
|
||||
.joinWith(openaireOrganization, unresolvedRels("_1").equalTo(openaireOrganization("_2")))
|
||||
.map { x =>
|
||||
val currentRels = x._1._2
|
||||
val currentOrgs = x._2
|
||||
|
@ -216,26 +275,35 @@ object SparkGenerateDoiBoost {
|
|||
else
|
||||
currentRels.setTarget(currentOrgs._1)
|
||||
currentRels
|
||||
}.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved")).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation")
|
||||
|
||||
magPubs.joinWith(a, magPubs("_1").equalTo(a("PaperId"))).map(item => {
|
||||
val affiliation = item._2
|
||||
if (affiliation.GridId.isEmpty) {
|
||||
val o = new Organization
|
||||
o.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
|
||||
o.setDataInfo(DoiBoostMappingUtil.generateDataInfo())
|
||||
o.setId(DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString))
|
||||
o.setOriginalId(List(affiliation.AffiliationId.toString).asJava)
|
||||
if (affiliation.DisplayName.nonEmpty)
|
||||
o.setLegalname(DoiBoostMappingUtil.asField(affiliation.DisplayName.get))
|
||||
if (affiliation.OfficialPage.isDefined)
|
||||
o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get))
|
||||
o.setCountry(ModelConstants.UNKNOWN_COUNTRY)
|
||||
o
|
||||
}
|
||||
else
|
||||
null
|
||||
}).filter(o => o != null).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostOrganization")
|
||||
.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved"))
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingDirPath/doiBoostPublicationAffiliation")
|
||||
|
||||
magPubs
|
||||
.joinWith(a, magPubs("_1").equalTo(a("PaperId")))
|
||||
.map(item => {
|
||||
val affiliation = item._2
|
||||
if (affiliation.GridId.isEmpty) {
|
||||
val o = new Organization
|
||||
o.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
|
||||
o.setDataInfo(DoiBoostMappingUtil.generateDataInfo())
|
||||
o.setId(DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString))
|
||||
o.setOriginalId(List(affiliation.AffiliationId.toString).asJava)
|
||||
if (affiliation.DisplayName.nonEmpty)
|
||||
o.setLegalname(DoiBoostMappingUtil.asField(affiliation.DisplayName.get))
|
||||
if (affiliation.OfficialPage.isDefined)
|
||||
o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get))
|
||||
o.setCountry(ModelConstants.UNKNOWN_COUNTRY)
|
||||
o
|
||||
} else
|
||||
null
|
||||
})
|
||||
.filter(o => o != null)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingDirPath/doiBoostOrganization")
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -18,70 +18,74 @@ import scala.collection.JavaConverters._
|
|||
import scala.collection.mutable
|
||||
import scala.util.matching.Regex
|
||||
|
||||
case class CrossrefDT(doi: String, json:String, timestamp: Long) {}
|
||||
case class CrossrefDT(doi: String, json: String, timestamp: Long) {}
|
||||
|
||||
case class mappingAffiliation(name: String) {}
|
||||
|
||||
case class mappingAuthor(given: Option[String], family: String, sequence:Option[String], ORCID: Option[String], affiliation: Option[mappingAffiliation]) {}
|
||||
case class mappingAuthor(
|
||||
given: Option[String],
|
||||
family: String,
|
||||
sequence: Option[String],
|
||||
ORCID: Option[String],
|
||||
affiliation: Option[mappingAffiliation]
|
||||
) {}
|
||||
|
||||
case class mappingFunder(name: String, DOI: Option[String], award: Option[List[String]]) {}
|
||||
|
||||
|
||||
case object Crossref2Oaf {
|
||||
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
|
||||
|
||||
val mappingCrossrefType = Map(
|
||||
"book-section" -> "publication",
|
||||
"book" -> "publication",
|
||||
"book-chapter" -> "publication",
|
||||
"book-part" -> "publication",
|
||||
"book-series" -> "publication",
|
||||
"book-set" -> "publication",
|
||||
"book-track" -> "publication",
|
||||
"edited-book" -> "publication",
|
||||
"reference-book" -> "publication",
|
||||
"monograph" -> "publication",
|
||||
"journal-article" -> "publication",
|
||||
"dissertation" -> "publication",
|
||||
"other" -> "publication",
|
||||
"peer-review" -> "publication",
|
||||
"proceedings" -> "publication",
|
||||
"book-section" -> "publication",
|
||||
"book" -> "publication",
|
||||
"book-chapter" -> "publication",
|
||||
"book-part" -> "publication",
|
||||
"book-series" -> "publication",
|
||||
"book-set" -> "publication",
|
||||
"book-track" -> "publication",
|
||||
"edited-book" -> "publication",
|
||||
"reference-book" -> "publication",
|
||||
"monograph" -> "publication",
|
||||
"journal-article" -> "publication",
|
||||
"dissertation" -> "publication",
|
||||
"other" -> "publication",
|
||||
"peer-review" -> "publication",
|
||||
"proceedings" -> "publication",
|
||||
"proceedings-article" -> "publication",
|
||||
"reference-entry" -> "publication",
|
||||
"report" -> "publication",
|
||||
"report-series" -> "publication",
|
||||
"standard" -> "publication",
|
||||
"standard-series" -> "publication",
|
||||
"posted-content" -> "publication",
|
||||
"dataset" -> "dataset"
|
||||
"reference-entry" -> "publication",
|
||||
"report" -> "publication",
|
||||
"report-series" -> "publication",
|
||||
"standard" -> "publication",
|
||||
"standard-series" -> "publication",
|
||||
"posted-content" -> "publication",
|
||||
"dataset" -> "dataset"
|
||||
)
|
||||
|
||||
|
||||
val mappingCrossrefSubType = Map(
|
||||
"book-section" -> "0013 Part of book or chapter of book",
|
||||
"book" -> "0002 Book",
|
||||
"book-chapter" -> "0013 Part of book or chapter of book",
|
||||
"book-part" -> "0013 Part of book or chapter of book",
|
||||
"book-series" -> "0002 Book",
|
||||
"book-set" -> "0002 Book",
|
||||
"book-track" -> "0002 Book",
|
||||
"edited-book" -> "0002 Book",
|
||||
"reference-book" -> "0002 Book",
|
||||
"monograph" -> "0002 Book",
|
||||
"journal-article" -> "0001 Article",
|
||||
"dissertation" -> "0044 Thesis",
|
||||
"other" -> "0038 Other literature type",
|
||||
"peer-review" -> "0015 Review",
|
||||
"proceedings" -> "0004 Conference object",
|
||||
"book-section" -> "0013 Part of book or chapter of book",
|
||||
"book" -> "0002 Book",
|
||||
"book-chapter" -> "0013 Part of book or chapter of book",
|
||||
"book-part" -> "0013 Part of book or chapter of book",
|
||||
"book-series" -> "0002 Book",
|
||||
"book-set" -> "0002 Book",
|
||||
"book-track" -> "0002 Book",
|
||||
"edited-book" -> "0002 Book",
|
||||
"reference-book" -> "0002 Book",
|
||||
"monograph" -> "0002 Book",
|
||||
"journal-article" -> "0001 Article",
|
||||
"dissertation" -> "0044 Thesis",
|
||||
"other" -> "0038 Other literature type",
|
||||
"peer-review" -> "0015 Review",
|
||||
"proceedings" -> "0004 Conference object",
|
||||
"proceedings-article" -> "0004 Conference object",
|
||||
"reference-entry" -> "0013 Part of book or chapter of book",
|
||||
"report" -> "0017 Report",
|
||||
"report-series" -> "0017 Report",
|
||||
"standard" -> "0038 Other literature type",
|
||||
"standard-series" -> "0038 Other literature type",
|
||||
"dataset" -> "0021 Dataset",
|
||||
"preprint" -> "0016 Preprint",
|
||||
"report" -> "0017 Report"
|
||||
"reference-entry" -> "0013 Part of book or chapter of book",
|
||||
"report" -> "0017 Report",
|
||||
"report-series" -> "0017 Report",
|
||||
"standard" -> "0038 Other literature type",
|
||||
"standard-series" -> "0038 Other literature type",
|
||||
"dataset" -> "0021 Dataset",
|
||||
"preprint" -> "0016 Preprint",
|
||||
"report" -> "0017 Report"
|
||||
)
|
||||
|
||||
def mappingResult(result: Result, json: JValue, cobjCategory: String): Result = {
|
||||
|
@ -100,7 +104,6 @@ case object Crossref2Oaf {
|
|||
val originalIds = new util.ArrayList(tmp.filter(id => id != null).asJava)
|
||||
result.setOriginalId(originalIds)
|
||||
|
||||
|
||||
// Add DataInfo
|
||||
result.setDataInfo(generateDataInfo())
|
||||
|
||||
|
@ -111,98 +114,169 @@ case object Crossref2Oaf {
|
|||
|
||||
// Publisher ( Name of work's publisher mapped into Result/Publisher)
|
||||
val publisher = (json \ "publisher").extractOrElse[String](null)
|
||||
if (publisher!= null && publisher.nonEmpty)
|
||||
if (publisher != null && publisher.nonEmpty)
|
||||
result.setPublisher(asField(publisher))
|
||||
|
||||
|
||||
// TITLE
|
||||
val mainTitles = for {JString(title) <- json \ "title" if title.nonEmpty} yield createSP(title, "main title", ModelConstants.DNET_DATACITE_TITLE)
|
||||
val originalTitles = for {JString(title) <- json \ "original-title" if title.nonEmpty} yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
|
||||
val shortTitles = for {JString(title) <- json \ "short-title" if title.nonEmpty} yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
|
||||
val subtitles = for {JString(title) <- json \ "subtitle" if title.nonEmpty} yield createSP(title, "subtitle", ModelConstants.DNET_DATACITE_TITLE)
|
||||
val mainTitles =
|
||||
for { JString(title) <- json \ "title" if title.nonEmpty } yield createSP(
|
||||
title,
|
||||
"main title",
|
||||
ModelConstants.DNET_DATACITE_TITLE
|
||||
)
|
||||
val originalTitles = for {
|
||||
JString(title) <- json \ "original-title" if title.nonEmpty
|
||||
} yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
|
||||
val shortTitles = for {
|
||||
JString(title) <- json \ "short-title" if title.nonEmpty
|
||||
} yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
|
||||
val subtitles =
|
||||
for { JString(title) <- json \ "subtitle" if title.nonEmpty } yield createSP(
|
||||
title,
|
||||
"subtitle",
|
||||
ModelConstants.DNET_DATACITE_TITLE
|
||||
)
|
||||
result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava)
|
||||
|
||||
// DESCRIPTION
|
||||
val descriptionList = for {JString(description) <- json \ "abstract"} yield asField(description)
|
||||
val descriptionList =
|
||||
for { JString(description) <- json \ "abstract" } yield asField(description)
|
||||
result.setDescription(descriptionList.asJava)
|
||||
|
||||
// Source
|
||||
val sourceList = for {JString(source) <- json \ "source" if source!= null && source.nonEmpty} yield asField(source)
|
||||
val sourceList = for {
|
||||
JString(source) <- json \ "source" if source != null && source.nonEmpty
|
||||
} yield asField(source)
|
||||
result.setSource(sourceList.asJava)
|
||||
|
||||
//RELEVANT DATE Mapping
|
||||
val createdDate = generateDate((json \ "created" \ "date-time").extract[String], (json \ "created" \ "date-parts").extract[List[List[Int]]], "created", ModelConstants.DNET_DATACITE_DATE)
|
||||
val postedDate = generateDate((json \ "posted" \ "date-time").extractOrElse[String](null), (json \ "posted" \ "date-parts").extract[List[List[Int]]], "available", ModelConstants.DNET_DATACITE_DATE)
|
||||
val acceptedDate = generateDate((json \ "accepted" \ "date-time").extractOrElse[String](null), (json \ "accepted" \ "date-parts").extract[List[List[Int]]], "accepted", ModelConstants.DNET_DATACITE_DATE)
|
||||
val publishedPrintDate = generateDate((json \ "published-print" \ "date-time").extractOrElse[String](null), (json \ "published-print" \ "date-parts").extract[List[List[Int]]], "published-print", ModelConstants.DNET_DATACITE_DATE)
|
||||
val publishedOnlineDate = generateDate((json \ "published-online" \ "date-time").extractOrElse[String](null), (json \ "published-online" \ "date-parts").extract[List[List[Int]]], "published-online", ModelConstants.DNET_DATACITE_DATE)
|
||||
val createdDate = generateDate(
|
||||
(json \ "created" \ "date-time").extract[String],
|
||||
(json \ "created" \ "date-parts").extract[List[List[Int]]],
|
||||
"created",
|
||||
ModelConstants.DNET_DATACITE_DATE
|
||||
)
|
||||
val postedDate = generateDate(
|
||||
(json \ "posted" \ "date-time").extractOrElse[String](null),
|
||||
(json \ "posted" \ "date-parts").extract[List[List[Int]]],
|
||||
"available",
|
||||
ModelConstants.DNET_DATACITE_DATE
|
||||
)
|
||||
val acceptedDate = generateDate(
|
||||
(json \ "accepted" \ "date-time").extractOrElse[String](null),
|
||||
(json \ "accepted" \ "date-parts").extract[List[List[Int]]],
|
||||
"accepted",
|
||||
ModelConstants.DNET_DATACITE_DATE
|
||||
)
|
||||
val publishedPrintDate = generateDate(
|
||||
(json \ "published-print" \ "date-time").extractOrElse[String](null),
|
||||
(json \ "published-print" \ "date-parts").extract[List[List[Int]]],
|
||||
"published-print",
|
||||
ModelConstants.DNET_DATACITE_DATE
|
||||
)
|
||||
val publishedOnlineDate = generateDate(
|
||||
(json \ "published-online" \ "date-time").extractOrElse[String](null),
|
||||
(json \ "published-online" \ "date-parts").extract[List[List[Int]]],
|
||||
"published-online",
|
||||
ModelConstants.DNET_DATACITE_DATE
|
||||
)
|
||||
|
||||
val issuedDate = extractDate((json \ "issued" \ "date-time").extractOrElse[String](null), (json \ "issued" \ "date-parts").extract[List[List[Int]]])
|
||||
val issuedDate = extractDate(
|
||||
(json \ "issued" \ "date-time").extractOrElse[String](null),
|
||||
(json \ "issued" \ "date-parts").extract[List[List[Int]]]
|
||||
)
|
||||
if (StringUtils.isNotBlank(issuedDate)) {
|
||||
result.setDateofacceptance(asField(issuedDate))
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
result.setDateofacceptance(asField(createdDate.getValue))
|
||||
}
|
||||
result.setRelevantdate(List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate).filter(p => p != null).asJava)
|
||||
result.setRelevantdate(
|
||||
List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate)
|
||||
.filter(p => p != null)
|
||||
.asJava
|
||||
)
|
||||
|
||||
//Mapping Subject
|
||||
val subjectList:List[String] = (json \ "subject").extractOrElse[List[String]](List())
|
||||
val subjectList: List[String] = (json \ "subject").extractOrElse[List[String]](List())
|
||||
|
||||
if (subjectList.nonEmpty) {
|
||||
result.setSubject(subjectList.map(s=> createSP(s, "keywords", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava)
|
||||
result.setSubject(
|
||||
subjectList.map(s => createSP(s, "keywords", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
|
||||
//Mapping Author
|
||||
val authorList: List[mappingAuthor] = (json \ "author").extractOrElse[List[mappingAuthor]](List())
|
||||
val authorList: List[mappingAuthor] =
|
||||
(json \ "author").extractOrElse[List[mappingAuthor]](List())
|
||||
|
||||
val sorted_list = authorList.sortWith((a: mappingAuthor, b: mappingAuthor) =>
|
||||
a.sequence.isDefined && a.sequence.get.equalsIgnoreCase("first")
|
||||
)
|
||||
|
||||
|
||||
val sorted_list = authorList.sortWith((a:mappingAuthor, b:mappingAuthor) => a.sequence.isDefined && a.sequence.get.equalsIgnoreCase("first"))
|
||||
|
||||
result.setAuthor(sorted_list.zipWithIndex.map{case (a, index) => generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull, index)}.asJava)
|
||||
result.setAuthor(sorted_list.zipWithIndex.map { case (a, index) =>
|
||||
generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull, index)
|
||||
}.asJava)
|
||||
|
||||
// Mapping instance
|
||||
val instance = new Instance()
|
||||
val license = for {
|
||||
JObject(license) <- json \ "license"
|
||||
JField("URL", JString(lic)) <- license
|
||||
JObject(license) <- json \ "license"
|
||||
JField("URL", JString(lic)) <- license
|
||||
JField("content-version", JString(content_version)) <- license
|
||||
} yield (asField(lic), content_version)
|
||||
val l = license.filter(d => StringUtils.isNotBlank(d._1.getValue))
|
||||
if (l.nonEmpty){
|
||||
if (l exists (d => d._2.equals("vor"))){
|
||||
for(d <- l){
|
||||
if (d._2.equals("vor")){
|
||||
if (l.nonEmpty) {
|
||||
if (l exists (d => d._2.equals("vor"))) {
|
||||
for (d <- l) {
|
||||
if (d._2.equals("vor")) {
|
||||
instance.setLicense(d._1)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
instance.setLicense(l.head._1)
|
||||
}
|
||||
else{
|
||||
instance.setLicense(l.head._1)}
|
||||
}
|
||||
|
||||
// Ticket #6281 added pid to Instance
|
||||
instance.setPid(result.getPid)
|
||||
|
||||
val has_review = json \ "relation" \"has-review" \ "id"
|
||||
val has_review = json \ "relation" \ "has-review" \ "id"
|
||||
|
||||
if(has_review != JNothing) {
|
||||
if (has_review != JNothing) {
|
||||
instance.setRefereed(
|
||||
OafMapperUtils.qualifier("0001", "peerReviewed", ModelConstants.DNET_REVIEW_LEVELS, ModelConstants.DNET_REVIEW_LEVELS))
|
||||
OafMapperUtils.qualifier(
|
||||
"0001",
|
||||
"peerReviewed",
|
||||
ModelConstants.DNET_REVIEW_LEVELS,
|
||||
ModelConstants.DNET_REVIEW_LEVELS
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
instance.setAccessright(decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue))
|
||||
instance.setInstancetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
||||
result.setResourcetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
||||
instance.setAccessright(
|
||||
decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue)
|
||||
)
|
||||
instance.setInstancetype(
|
||||
OafMapperUtils.qualifier(
|
||||
cobjCategory.substring(0, 4),
|
||||
cobjCategory.substring(5),
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
)
|
||||
result.setResourcetype(
|
||||
OafMapperUtils.qualifier(
|
||||
cobjCategory.substring(0, 4),
|
||||
cobjCategory.substring(5),
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
)
|
||||
|
||||
instance.setCollectedfrom(createCrossrefCollectedFrom())
|
||||
if (StringUtils.isNotBlank(issuedDate)) {
|
||||
instance.setDateofacceptance(asField(issuedDate))
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
instance.setDateofacceptance(asField(createdDate.getValue))
|
||||
}
|
||||
val s: List[String] = List("https://doi.org/" + doi)
|
||||
|
@ -210,10 +284,9 @@ case object Crossref2Oaf {
|
|||
// if (links.nonEmpty) {
|
||||
// instance.setUrl(links.asJava)
|
||||
// }
|
||||
if(s.nonEmpty)
|
||||
{
|
||||
instance.setUrl(s.asJava)
|
||||
}
|
||||
if (s.nonEmpty) {
|
||||
instance.setUrl(s.asJava)
|
||||
}
|
||||
|
||||
result.setInstance(List(instance).asJava)
|
||||
|
||||
|
@ -236,15 +309,23 @@ case object Crossref2Oaf {
|
|||
result
|
||||
}
|
||||
|
||||
|
||||
def generateAuhtor(given: String, family: String, orcid: String, index:Int): Author = {
|
||||
def generateAuhtor(given: String, family: String, orcid: String, index: Int): Author = {
|
||||
val a = new Author
|
||||
a.setName(given)
|
||||
a.setSurname(family)
|
||||
a.setFullname(s"$given $family")
|
||||
a.setRank(index+1)
|
||||
a.setRank(index + 1)
|
||||
if (StringUtils.isNotBlank(orcid))
|
||||
a.setPid(List(createSP(orcid, ModelConstants.ORCID_PENDING, ModelConstants.DNET_PID_TYPES, generateDataInfo())).asJava)
|
||||
a.setPid(
|
||||
List(
|
||||
createSP(
|
||||
orcid,
|
||||
ModelConstants.ORCID_PENDING,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
generateDataInfo()
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
a
|
||||
}
|
||||
|
@ -255,54 +336,62 @@ case object Crossref2Oaf {
|
|||
|
||||
var resultList: List[Oaf] = List()
|
||||
|
||||
|
||||
val objectType = (json \ "type").extractOrElse[String](null)
|
||||
val objectSubType = (json \ "subtype").extractOrElse[String](null)
|
||||
if (objectType == null)
|
||||
return resultList
|
||||
|
||||
|
||||
val result = generateItemFromType(objectType, objectSubType)
|
||||
if (result == null)
|
||||
return List()
|
||||
val cOBJCategory = mappingCrossrefSubType.getOrElse(objectType, mappingCrossrefSubType.getOrElse(objectSubType, "0038 Other literature type"))
|
||||
val cOBJCategory = mappingCrossrefSubType.getOrElse(
|
||||
objectType,
|
||||
mappingCrossrefSubType.getOrElse(objectSubType, "0038 Other literature type")
|
||||
)
|
||||
mappingResult(result, json, cOBJCategory)
|
||||
if (result == null || result.getId == null)
|
||||
return List()
|
||||
|
||||
|
||||
val funderList: List[mappingFunder] = (json \ "funder").extractOrElse[List[mappingFunder]](List())
|
||||
val funderList: List[mappingFunder] =
|
||||
(json \ "funder").extractOrElse[List[mappingFunder]](List())
|
||||
|
||||
if (funderList.nonEmpty) {
|
||||
resultList = resultList ::: mappingFunderToRelations(funderList, result.getId, createCrossrefCollectedFrom(), result.getDataInfo, result.getLastupdatetimestamp)
|
||||
resultList = resultList ::: mappingFunderToRelations(
|
||||
funderList,
|
||||
result.getId,
|
||||
createCrossrefCollectedFrom(),
|
||||
result.getDataInfo,
|
||||
result.getLastupdatetimestamp
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
result match {
|
||||
case publication: Publication => convertPublication(publication, json, cOBJCategory)
|
||||
case dataset: Dataset => convertDataset(dataset)
|
||||
case dataset: Dataset => convertDataset(dataset)
|
||||
}
|
||||
|
||||
resultList = resultList ::: List(result)
|
||||
resultList
|
||||
}
|
||||
|
||||
|
||||
def mappingFunderToRelations(funders: List[mappingFunder], sourceId: String, cf: KeyValue, di: DataInfo, ts: Long): List[Relation] = {
|
||||
def mappingFunderToRelations(
|
||||
funders: List[mappingFunder],
|
||||
sourceId: String,
|
||||
cf: KeyValue,
|
||||
di: DataInfo,
|
||||
ts: Long
|
||||
): List[Relation] = {
|
||||
|
||||
val queue = new mutable.Queue[Relation]
|
||||
|
||||
|
||||
def snsfRule(award:String): String = {
|
||||
val tmp1 = StringUtils.substringAfter(award,"_")
|
||||
val tmp2 = StringUtils.substringBefore(tmp1,"/")
|
||||
def snsfRule(award: String): String = {
|
||||
val tmp1 = StringUtils.substringAfter(award, "_")
|
||||
val tmp2 = StringUtils.substringBefore(tmp1, "/")
|
||||
logger.debug(s"From $award to $tmp2")
|
||||
tmp2
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
def extractECAward(award: String): String = {
|
||||
val awardECRegex: Regex = "[0-9]{4,9}".r
|
||||
if (awardECRegex.findAllIn(award).hasNext)
|
||||
|
@ -310,8 +399,7 @@ case object Crossref2Oaf {
|
|||
null
|
||||
}
|
||||
|
||||
|
||||
def generateRelation(sourceId:String, targetId:String, relClass:String) :Relation = {
|
||||
def generateRelation(sourceId: String, targetId: String, relClass: String): Relation = {
|
||||
|
||||
val r = new Relation
|
||||
r.setSource(sourceId)
|
||||
|
@ -324,98 +412,121 @@ case object Crossref2Oaf {
|
|||
r.setLastupdatetimestamp(ts)
|
||||
r
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
def generateSimpleRelationFromAward(funder: mappingFunder, nsPrefix: String, extractField: String => String): Unit = {
|
||||
def generateSimpleRelationFromAward(
|
||||
funder: mappingFunder,
|
||||
nsPrefix: String,
|
||||
extractField: String => String
|
||||
): Unit = {
|
||||
if (funder.award.isDefined && funder.award.get.nonEmpty)
|
||||
funder.award.get.map(extractField).filter(a => a!= null && a.nonEmpty).foreach(
|
||||
award => {
|
||||
funder.award.get
|
||||
.map(extractField)
|
||||
.filter(a => a != null && a.nonEmpty)
|
||||
.foreach(award => {
|
||||
val targetId = getProjectId(nsPrefix, DHPUtils.md5(award))
|
||||
queue += generateRelation(sourceId, targetId , ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId , sourceId, ModelConstants.PRODUCES)
|
||||
}
|
||||
)
|
||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||
})
|
||||
}
|
||||
|
||||
def getProjectId (nsPrefix:String, targetId:String):String = {
|
||||
def getProjectId(nsPrefix: String, targetId: String): String = {
|
||||
s"40|$nsPrefix::$targetId"
|
||||
}
|
||||
|
||||
|
||||
if (funders != null)
|
||||
funders.foreach(funder => {
|
||||
if (funder.DOI.isDefined && funder.DOI.get.nonEmpty) {
|
||||
funder.DOI.get match {
|
||||
case "10.13039/100010663" |
|
||||
"10.13039/100010661" |
|
||||
"10.13039/501100007601" |
|
||||
"10.13039/501100000780" |
|
||||
"10.13039/100010665" => generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
||||
case "10.13039/100011199" |
|
||||
"10.13039/100004431" |
|
||||
"10.13039/501100004963" |
|
||||
"10.13039/501100000780" => generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
|
||||
case "10.13039/501100000781" => generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
|
||||
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
||||
case "10.13039/100000001" => generateSimpleRelationFromAward(funder, "nsf_________", a => a)
|
||||
case "10.13039/501100001665" => generateSimpleRelationFromAward(funder, "anr_________", a => a)
|
||||
case "10.13039/501100002341" => generateSimpleRelationFromAward(funder, "aka_________", a => a)
|
||||
case "10.13039/501100001602" => generateSimpleRelationFromAward(funder, "aka_________", a => a.replace("SFI", ""))
|
||||
case "10.13039/501100000923" => generateSimpleRelationFromAward(funder, "arc_________", a => a)
|
||||
case "10.13039/501100000038"=> val targetId = getProjectId("nserc_______" , "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||
case "10.13039/501100000155"=> val targetId = getProjectId("sshrc_______" , "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId,targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId,sourceId, ModelConstants.PRODUCES)
|
||||
case "10.13039/501100000024"=> val targetId = getProjectId("cihr________" , "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId,targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId,sourceId, ModelConstants.PRODUCES)
|
||||
case "10.13039/501100002848" => generateSimpleRelationFromAward(funder, "conicytf____", a => a)
|
||||
case "10.13039/501100003448" => generateSimpleRelationFromAward(funder, "gsrt________", extractECAward)
|
||||
case "10.13039/501100010198" => generateSimpleRelationFromAward(funder, "sgov________", a=>a)
|
||||
case "10.13039/501100004564" => generateSimpleRelationFromAward(funder, "mestd_______", extractECAward)
|
||||
case "10.13039/501100003407" => generateSimpleRelationFromAward(funder, "miur________", a=>a)
|
||||
val targetId = getProjectId("miur________" , "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId,targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId,sourceId, ModelConstants.PRODUCES)
|
||||
case "10.13039/501100006588" |
|
||||
"10.13039/501100004488" => generateSimpleRelationFromAward(funder, "irb_hr______", a=>a.replaceAll("Project No.", "").replaceAll("HRZZ-","") )
|
||||
case "10.13039/501100006769"=> generateSimpleRelationFromAward(funder, "rsf_________", a=>a)
|
||||
case "10.13039/501100001711"=> generateSimpleRelationFromAward(funder, "snsf________", snsfRule)
|
||||
case "10.13039/501100004410"=> generateSimpleRelationFromAward(funder, "tubitakf____", a =>a)
|
||||
case "10.10.13039/100004440"=> generateSimpleRelationFromAward(funder, "wt__________", a =>a)
|
||||
case "10.13039/100004440"=> val targetId = getProjectId("wt__________" , "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId,targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId,sourceId, ModelConstants.PRODUCES)
|
||||
funders.foreach(funder => {
|
||||
if (funder.DOI.isDefined && funder.DOI.get.nonEmpty) {
|
||||
funder.DOI.get match {
|
||||
case "10.13039/100010663" | "10.13039/100010661" |
|
||||
"10.13039/501100007601" | "10.13039/501100000780" | "10.13039/100010665" =>
|
||||
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
||||
case "10.13039/100011199" | "10.13039/100004431" | "10.13039/501100004963" |
|
||||
"10.13039/501100000780" =>
|
||||
generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
|
||||
case "10.13039/501100000781" =>
|
||||
generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
|
||||
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
||||
case "10.13039/100000001" =>
|
||||
generateSimpleRelationFromAward(funder, "nsf_________", a => a)
|
||||
case "10.13039/501100001665" =>
|
||||
generateSimpleRelationFromAward(funder, "anr_________", a => a)
|
||||
case "10.13039/501100002341" =>
|
||||
generateSimpleRelationFromAward(funder, "aka_________", a => a)
|
||||
case "10.13039/501100001602" =>
|
||||
generateSimpleRelationFromAward(funder, "aka_________", a => a.replace("SFI", ""))
|
||||
case "10.13039/501100000923" =>
|
||||
generateSimpleRelationFromAward(funder, "arc_________", a => a)
|
||||
case "10.13039/501100000038" =>
|
||||
val targetId = getProjectId("nserc_______", "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||
case "10.13039/501100000155" =>
|
||||
val targetId = getProjectId("sshrc_______", "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||
case "10.13039/501100000024" =>
|
||||
val targetId = getProjectId("cihr________", "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||
case "10.13039/501100002848" =>
|
||||
generateSimpleRelationFromAward(funder, "conicytf____", a => a)
|
||||
case "10.13039/501100003448" =>
|
||||
generateSimpleRelationFromAward(funder, "gsrt________", extractECAward)
|
||||
case "10.13039/501100010198" =>
|
||||
generateSimpleRelationFromAward(funder, "sgov________", a => a)
|
||||
case "10.13039/501100004564" =>
|
||||
generateSimpleRelationFromAward(funder, "mestd_______", extractECAward)
|
||||
case "10.13039/501100003407" =>
|
||||
generateSimpleRelationFromAward(funder, "miur________", a => a)
|
||||
val targetId = getProjectId("miur________", "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||
case "10.13039/501100006588" | "10.13039/501100004488" =>
|
||||
generateSimpleRelationFromAward(
|
||||
funder,
|
||||
"irb_hr______",
|
||||
a => a.replaceAll("Project No.", "").replaceAll("HRZZ-", "")
|
||||
)
|
||||
case "10.13039/501100006769" =>
|
||||
generateSimpleRelationFromAward(funder, "rsf_________", a => a)
|
||||
case "10.13039/501100001711" =>
|
||||
generateSimpleRelationFromAward(funder, "snsf________", snsfRule)
|
||||
case "10.13039/501100004410" =>
|
||||
generateSimpleRelationFromAward(funder, "tubitakf____", a => a)
|
||||
case "10.10.13039/100004440" =>
|
||||
generateSimpleRelationFromAward(funder, "wt__________", a => a)
|
||||
case "10.13039/100004440" =>
|
||||
val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||
|
||||
case _ => logger.debug("no match for "+funder.DOI.get )
|
||||
case _ => logger.debug("no match for " + funder.DOI.get)
|
||||
|
||||
}
|
||||
|
||||
} else {
|
||||
funder.name match {
|
||||
case "European Union’s Horizon 2020 research and innovation program" =>
|
||||
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
||||
case "European Union's" =>
|
||||
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
||||
generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
|
||||
case "The French National Research Agency (ANR)" |
|
||||
"The French National Research Agency" =>
|
||||
generateSimpleRelationFromAward(funder, "anr_________", a => a)
|
||||
case "CONICYT, Programa de Formación de Capital Humano Avanzado" =>
|
||||
generateSimpleRelationFromAward(funder, "conicytf____", extractECAward)
|
||||
case "Wellcome Trust Masters Fellowship" =>
|
||||
val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||
case _ => logger.debug("no match for " + funder.name)
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} else {
|
||||
funder.name match {
|
||||
case "European Union’s Horizon 2020 research and innovation program" => generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
||||
case "European Union's" =>
|
||||
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
||||
generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
|
||||
case "The French National Research Agency (ANR)" |
|
||||
"The French National Research Agency" => generateSimpleRelationFromAward(funder, "anr_________", a => a)
|
||||
case "CONICYT, Programa de Formación de Capital Humano Avanzado" => generateSimpleRelationFromAward(funder, "conicytf____", extractECAward)
|
||||
case "Wellcome Trust Masters Fellowship" => val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY )
|
||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES )
|
||||
case _ => logger.debug("no match for "+funder.name )
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
)
|
||||
})
|
||||
queue.toList
|
||||
}
|
||||
|
||||
|
@ -423,33 +534,31 @@ case object Crossref2Oaf {
|
|||
// TODO check if there are other info to map into the Dataset
|
||||
}
|
||||
|
||||
|
||||
def convertPublication(publication: Publication, json: JValue, cobjCategory: String): Unit = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
val containerTitles = for {JString(ct) <- json \ "container-title"} yield ct
|
||||
|
||||
val containerTitles = for { JString(ct) <- json \ "container-title" } yield ct
|
||||
|
||||
//Mapping book
|
||||
if (cobjCategory.toLowerCase.contains("book")) {
|
||||
val ISBN = for {JString(isbn) <- json \ "ISBN"} yield isbn
|
||||
val ISBN = for { JString(isbn) <- json \ "ISBN" } yield isbn
|
||||
if (ISBN.nonEmpty && containerTitles.nonEmpty) {
|
||||
val source = s"${containerTitles.head} ISBN: ${ISBN.head}"
|
||||
if (publication.getSource != null) {
|
||||
val l: List[Field[String]] = publication.getSource.asScala.toList
|
||||
val ll: List[Field[String]] = l ::: List(asField(source))
|
||||
publication.setSource(ll.asJava)
|
||||
}
|
||||
else
|
||||
} else
|
||||
publication.setSource(List(asField(source)).asJava)
|
||||
}
|
||||
} else {
|
||||
// Mapping Journal
|
||||
|
||||
val issnInfos = for {JArray(issn_types) <- json \ "issn-type"
|
||||
JObject(issn_type) <- issn_types
|
||||
JField("type", JString(tp)) <- issn_type
|
||||
JField("value", JString(vl)) <- issn_type
|
||||
} yield Tuple2(tp, vl)
|
||||
val issnInfos = for {
|
||||
JArray(issn_types) <- json \ "issn-type"
|
||||
JObject(issn_type) <- issn_types
|
||||
JField("type", JString(tp)) <- issn_type
|
||||
JField("value", JString(vl)) <- issn_type
|
||||
} yield Tuple2(tp, vl)
|
||||
|
||||
val volume = (json \ "volume").extractOrElse[String](null)
|
||||
if (containerTitles.nonEmpty) {
|
||||
|
@ -460,7 +569,7 @@ case object Crossref2Oaf {
|
|||
issnInfos.foreach(tp => {
|
||||
tp._1 match {
|
||||
case "electronic" => journal.setIssnOnline(tp._2)
|
||||
case "print" => journal.setIssnPrinted(tp._2)
|
||||
case "print" => journal.setIssnPrinted(tp._2)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
@ -494,7 +603,12 @@ case object Crossref2Oaf {
|
|||
|
||||
}
|
||||
|
||||
def generateDate(dt: String, datePart: List[List[Int]], classId: String, schemeId: String): StructuredProperty = {
|
||||
def generateDate(
|
||||
dt: String,
|
||||
datePart: List[List[Int]],
|
||||
classId: String,
|
||||
schemeId: String
|
||||
): StructuredProperty = {
|
||||
val dp = extractDate(dt, datePart)
|
||||
if (StringUtils.isNotBlank(dp))
|
||||
return createSP(dp, classId, schemeId)
|
||||
|
|
|
@ -16,7 +16,6 @@ object CrossrefDataset {
|
|||
|
||||
// Logger named after this object (CrossrefDataset) so that its messages are
// attributed to this job rather than to another one.
val logger: Logger = LoggerFactory.getLogger(CrossrefDataset.getClass)
|
||||
|
||||
|
||||
def to_item(input: String): CrossrefDT = {
|
||||
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
|
@ -29,19 +28,24 @@ object CrossrefDataset {
|
|||
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
||||
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
CrossrefDataset.getClass.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json"
|
||||
)
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkMapDumpIntoOAF.getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
import spark.implicits._
|
||||
|
||||
|
||||
val crossrefAggregator = new Aggregator[CrossrefDT, CrossrefDT, CrossrefDT] with Serializable {
|
||||
|
||||
override def zero: CrossrefDT = null
|
||||
|
@ -52,7 +56,6 @@ object CrossrefDataset {
|
|||
if (a == null)
|
||||
return b
|
||||
|
||||
|
||||
if (a.timestamp > b.timestamp) {
|
||||
return a
|
||||
}
|
||||
|
@ -80,19 +83,24 @@ object CrossrefDataset {
|
|||
|
||||
val workingPath: String = parser.get("workingPath")
|
||||
|
||||
|
||||
val main_ds: Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT]
|
||||
|
||||
|
||||
val update =
|
||||
spark.createDataset(spark.sparkContext.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text])
|
||||
.map(i => CrossrefImporter.decompressBlob(i._2.toString))
|
||||
.map(i => to_item(i)))
|
||||
spark.createDataset(
|
||||
spark.sparkContext
|
||||
.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text])
|
||||
.map(i => CrossrefImporter.decompressBlob(i._2.toString))
|
||||
.map(i => to_item(i))
|
||||
)
|
||||
|
||||
main_ds.union(update).groupByKey(_.doi)
|
||||
main_ds
|
||||
.union(update)
|
||||
.groupByKey(_.doi)
|
||||
.agg(crossrefAggregator.toColumn)
|
||||
.map(s => s._2)
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/crossref_ds_updated")
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/crossref_ds_updated")
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -18,7 +18,6 @@ object GenerateCrossrefDataset {
|
|||
|
||||
implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT]
|
||||
|
||||
|
||||
def crossrefElement(meta: String): CrossrefDT = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: json4s.JValue = parse(meta)
|
||||
|
@ -30,13 +29,23 @@ object GenerateCrossrefDataset {
|
|||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val conf = new SparkConf
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json")).mkString)
|
||||
val parser = new ArgumentApplicationParser(
|
||||
Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json"
|
||||
)
|
||||
)
|
||||
.mkString
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val master = parser.get("master")
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
val targetPath = parser.get("targetPath")
|
||||
|
||||
val spark: SparkSession = SparkSession.builder().config(conf)
|
||||
val spark: SparkSession = SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(UnpackCrtossrefEntries.getClass.getSimpleName)
|
||||
.master(master)
|
||||
.getOrCreate()
|
||||
|
@ -44,12 +53,14 @@ object GenerateCrossrefDataset {
|
|||
|
||||
import spark.implicits._
|
||||
|
||||
|
||||
val tmp: RDD[String] = sc.textFile(sourcePath, 6000)
|
||||
|
||||
spark.createDataset(tmp)
|
||||
spark
|
||||
.createDataset(tmp)
|
||||
.map(entry => crossrefElement(entry))
|
||||
.write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(targetPath)
|
||||
// .map(meta => crossrefElement(meta))
|
||||
// .toDS.as[CrossrefDT]
|
||||
// .write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
|
|
|
@ -8,7 +8,6 @@ import org.apache.spark.SparkConf
|
|||
import org.apache.spark.sql._
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
|
||||
/** Simple value holder for a bibliographic reference.
 *
 * @param author    author of the referenced work
 * @param firstPage first page of the referenced work, kept as text
 */
case class Reference(
  author: String,
  firstPage: String
)
|
||||
|
||||
object SparkMapDumpIntoOAF {
|
||||
|
@ -19,14 +18,21 @@ object SparkMapDumpIntoOAF {
|
|||
|
||||
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkMapDumpIntoOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_crossref_dump_to_oaf_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
SparkMapDumpIntoOAF.getClass.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/convert_crossref_dump_to_oaf_params.json"
|
||||
)
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkMapDumpIntoOAF.getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
|
@ -35,19 +41,34 @@ object SparkMapDumpIntoOAF {
|
|||
|
||||
val targetPath = parser.get("targetPath")
|
||||
|
||||
spark.read.load(parser.get("sourcePath")).as[CrossrefDT]
|
||||
spark.read
|
||||
.load(parser.get("sourcePath"))
|
||||
.as[CrossrefDT]
|
||||
.flatMap(k => Crossref2Oaf.convert(k.json))
|
||||
.filter(o => o != null)
|
||||
.write.mode(SaveMode.Overwrite).save(s"$targetPath/mixObject")
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$targetPath/mixObject")
|
||||
|
||||
val ds:Dataset[Oaf] = spark.read.load(s"$targetPath/mixObject").as[Oaf]
|
||||
val ds: Dataset[Oaf] = spark.read.load(s"$targetPath/mixObject").as[Oaf]
|
||||
|
||||
ds.filter(o => o.isInstanceOf[Publication]).map(o => o.asInstanceOf[Publication]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefPublication")
|
||||
ds.filter(o => o.isInstanceOf[Publication])
|
||||
.map(o => o.asInstanceOf[Publication])
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$targetPath/crossrefPublication")
|
||||
|
||||
ds.filter(o => o.isInstanceOf[Relation]).map(o => o.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefRelation")
|
||||
ds.filter(o => o.isInstanceOf[Relation])
|
||||
.map(o => o.asInstanceOf[Relation])
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$targetPath/crossrefRelation")
|
||||
|
||||
ds.filter(o => o.isInstanceOf[OafDataset]).map(o => o.asInstanceOf[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefDataset")
|
||||
ds.filter(o => o.isInstanceOf[OafDataset])
|
||||
.map(o => o.asInstanceOf[OafDataset])
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$targetPath/crossrefDataset")
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -16,7 +16,6 @@ object UnpackCrtossrefEntries {
|
|||
|
||||
val log: Logger = LoggerFactory.getLogger(UnpackCrtossrefEntries.getClass)
|
||||
|
||||
|
||||
def extractDump(input: String): List[String] = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: json4s.JValue = parse(input)
|
||||
|
@ -24,28 +23,36 @@ object UnpackCrtossrefEntries {
|
|||
val a = (json \ "items").extract[JArray]
|
||||
a.arr.map(s => compact(render(s)))
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
/** Spark entry point: reads whole dump files from `sourcePath`, splits each of them into its
 * items through `extractDump`, and writes the items as gzip-compressed text to `targetPath`.
 *
 * The accepted arguments are described by the classpath resource
 * `/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json`; the values
 * read here are `master`, `sourcePath` and `targetPath`.
 *
 * @param args command-line arguments handed to the ArgumentApplicationParser
 */
def main(args: Array[String]): Unit = {
  val conf = new SparkConf

  // Load the argument specification and close the classpath stream as soon as it has been
  // consumed, also when building the parser fails.
  val paramsSource = Source.fromInputStream(
    getClass.getResourceAsStream(
      "/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json"
    )
  )
  val parser =
    try new ArgumentApplicationParser(paramsSource.mkString)
    finally paramsSource.close()
  parser.parseArgument(args)

  val master = parser.get("master")
  val sourcePath = parser.get("sourcePath")
  val targetPath = parser.get("targetPath")

  val spark: SparkSession = SparkSession
    .builder()
    .config(conf)
    .appName(UnpackCrtossrefEntries.getClass.getSimpleName)
    .master(master)
    .getOrCreate()
  val sc: SparkContext = spark.sparkContext

  // wholeTextFiles yields (path, content) pairs; only the content is unpacked.
  sc.wholeTextFiles(sourcePath, 6000)
    .flatMap(d => extractDump(d._2))
    .saveAsTextFile(targetPath, classOf[GzipCodec])
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
package eu.dnetlib.doiboost.mag
|
||||
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
|
||||
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty}
|
||||
|
@ -14,59 +13,134 @@ import scala.collection.JavaConverters._
|
|||
import scala.collection.mutable
|
||||
import scala.util.matching.Regex
|
||||
|
||||
|
||||
/** Description of a single MAG paper.
 *
 * Optional values are modelled as `Option`; the remaining reference-typed fields may
 * still be null (callers in this package null-check `CreatedDate`, for instance).
 * NOTE(review): the field names presumably mirror the columns of the MAG "Papers"
 * table - the column list is not visible here, confirm against the importer.
 */
case class MagPapers(
  PaperId: Long,
  Rank: Integer,
  Doi: String,
  DocType: String,
  PaperTitle: String,
  OriginalTitle: String,
  BookTitle: String,
  Year: Option[Integer],
  Date: Option[java.sql.Timestamp],
  Publisher: String,
  JournalId: Option[Long],
  ConferenceSeriesId: Option[Long],
  ConferenceInstanceId: Option[Long],
  Volume: String,
  Issue: String,
  FirstPage: String,
  LastPage: String,
  ReferenceCount: Option[Long],
  CitationCount: Option[Long],
  EstimatedCitation: Option[Long],
  OriginalVenue: String,
  FamilyId: Option[Long],
  CreatedDate: java.sql.Timestamp
)
|
||||
|
||||
/** Abstract attached to a MAG paper.
 *
 * @param PaperId         identifier of the paper the abstract belongs to
 * @param IndexedAbstract the abstract text; `ConversionUtil.transformPaperAbstract` passes it
 *                        through `convertInvertedIndexString` and stores the result back here
 */
case class MagPaperAbstract(
  PaperId: Long,
  IndexedAbstract: String
)
|
||||
|
||||
/** A MAG author.
 *
 * Everything except the identifier is optional. Within this package `DisplayName` is used
 * as the author's full name and `AuthorId` to build the author's MAG URL.
 */
case class MagAuthor(
  AuthorId: Long,
  Rank: Option[Int],
  NormalizedName: Option[String],
  DisplayName: Option[String],
  LastKnownAffiliationId: Option[Long],
  PaperCount: Option[Long],
  CitationCount: Option[Long],
  CreatedDate: Option[java.sql.Timestamp]
)
|
||||
|
||||
/** A MAG affiliation (institution).
 *
 * Only the geographic coordinates are optional.
 */
case class MagAffiliation(
  AffiliationId: Long,
  Rank: Int,
  NormalizedName: String,
  DisplayName: String,
  GridId: String,
  OfficialPage: String,
  WikiPage: String,
  PaperCount: Long,
  CitationCount: Long,
  Latitude: Option[Float],
  Longitude: Option[Float],
  CreatedDate: java.sql.Timestamp
)
|
||||
|
||||
/** Link between a paper, one of its authors and (optionally) an affiliation.
 *
 * @param PaperId              identifier of the paper
 * @param AuthorId             identifier of the author
 * @param AffiliationId        identifier of the affiliation, when present
 * @param AuthorSequenceNumber number of the author within the paper's author list
 * @param OriginalAuthor       author name as a string
 * @param OriginalAffiliation  affiliation as a string
 */
case class MagPaperAuthorAffiliation(
  PaperId: Long,
  AuthorId: Long,
  AffiliationId: Option[Long],
  AuthorSequenceNumber: Int,
  OriginalAuthor: String,
  OriginalAffiliation: String
)
|
||||
|
||||
|
||||
/** An author together with the affiliation and position used for one paper.
 *
 * @param author         the author
 * @param affiliation    affiliation name; may be null (consumers in this package null-check it)
 * @param sequenceNumber position of the author, used as the author's rank
 */
case class MagAuthorAffiliation(
  author: MagAuthor,
  affiliation: String,
  sequenceNumber: Int
)
|
||||
|
||||
/** The authors (with affiliation and position) collected for one paper.
 *
 * @param PaperId identifier of the paper
 * @param authors authors of the paper
 */
case class MagPaperWithAuthorList(
  PaperId: Long,
  authors: List[MagAuthorAffiliation]
)
|
||||
|
||||
/** Flat (paper, author) row carrying the full author record instead of only its id.
 *
 * @param PaperId        identifier of the paper
 * @param author         the author
 * @param affiliation    affiliation name
 * @param sequenceNumber position of the author for this paper
 */
case class MagPaperAuthorDenormalized(
  PaperId: Long,
  author: MagAuthor,
  affiliation: String,
  sequenceNumber: Int
)
|
||||
|
||||
/** A URL associated with a MAG paper.
 *
 * @param PaperId      identifier of the paper
 * @param SourceType   numeric source type, when present
 * @param SourceUrl    the URL, when present
 * @param LanguageCode language code, when present
 */
case class MagPaperUrl(
  PaperId: Long,
  SourceType: Option[Int],
  SourceUrl: Option[String],
  LanguageCode: Option[String]
)
|
||||
|
||||
/** Wrapper around one URL of a paper.
 *
 * @param SourceUrl the URL; `ConversionUtil.addInstances` skips instances where it is empty
 */
case class MagUrlInstance(
  SourceUrl: String
)
|
||||
|
||||
/** All URL instances collected for one paper.
 *
 * @param PaperId   identifier of the paper
 * @param instances URLs of the paper
 */
case class MagUrl(
  PaperId: Long,
  instances: List[MagUrlInstance]
)
|
||||
|
||||
/** A field of study assigned to a paper.
 *
 * @param FieldOfStudyId identifier of the field of study
 * @param DisplayName    name of the field; becomes a subject of the publication
 * @param MainType       optional main type; when present it becomes a further subject and,
 *                       if it contains a '.', so does the part before the first '.'
 * @param Score          score of the assignment; its string form is passed to
 *                       `DoiBoostMappingUtil.generateDataInfo`
 */
case class MagSubject(
  FieldOfStudyId: Long,
  DisplayName: String,
  MainType: Option[String],
  Score: Float
)
|
||||
|
||||
/** The fields of study collected for one paper.
 *
 * @param PaperId  identifier of the paper
 * @param subjects fields of study of the paper
 */
case class MagFieldOfStudy(
  PaperId: Long,
  subjects: List[MagSubject]
)
|
||||
|
||||
/** A MAG journal.
 *
 * Everything except the identifier is optional.
 */
case class MagJournal(
  JournalId: Long,
  Rank: Option[Int],
  NormalizedName: Option[String],
  DisplayName: Option[String],
  Issn: Option[String],
  Publisher: Option[String],
  Webpage: Option[String],
  PaperCount: Option[Long],
  CitationCount: Option[Long],
  CreatedDate: Option[java.sql.Timestamp]
)
|
||||
|
||||
|
||||
/** Conference instance associated with a paper.
 *
 * `ConversionUtil.updatePubsWithConferenceInfo` copies these values into the publication's
 * Journal: `Location` as conference place, `DisplayName` as name and the first 10 characters
 * of both dates as conference date.
 *
 * @param ci          identifier of the conference instance
 * @param DisplayName name of the conference instance, when present
 * @param Location    place of the conference, when present
 * @param StartDate   start of the conference, when present
 * @param EndDate     end of the conference, when present
 * @param PaperId     identifier of the paper
 */
case class MagConferenceInstance(
  ci: Long,
  DisplayName: Option[String],
  Location: Option[String],
  StartDate: Option[java.sql.Timestamp],
  EndDate: Option[java.sql.Timestamp],
  PaperId: Long
)
|
||||
|
||||
case object ConversionUtil {
|
||||
|
||||
/** Picks the MAG identifier out of a list of candidate identifiers.
 *
 * A candidate qualifies when it matches the regular expression `^[0-9]+$`.
 *
 * @param pids candidate identifiers; may be empty or null
 * @return the first qualifying candidate in iteration order, or null when there is none
 */
def extractMagIdentifier(pids: mutable.Buffer[String]): String = {
  val magIDRegex: Regex = "^[0-9]+$".r

  // find stops at the first hit instead of filtering the whole buffer; Option(pids) turns a
  // null buffer into "no identifier" rather than a NullPointerException.
  Option(pids)
    .flatMap(_.find(pid => magIDRegex.findFirstIn(pid).isDefined))
    .orNull
}
|
||||
|
||||
|
||||
def mergePublication(a: Publication, b:Publication) : Publication = {
|
||||
def mergePublication(a: Publication, b: Publication): Publication = {
|
||||
if ((a != null) && (b != null)) {
|
||||
a.mergeFrom(b)
|
||||
a
|
||||
|
@ -74,10 +148,9 @@ case object ConversionUtil {
|
|||
if (a == null) b else a
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
def choiceLatestMagArtitcle(p1: MagPapers, p2:MagPapers) :MagPapers = {
|
||||
def choiceLatestMagArtitcle(p1: MagPapers, p2: MagPapers): MagPapers = {
|
||||
var r = if (p1 == null) p2 else p1
|
||||
if (p1 != null && p2 != null) {
|
||||
if (p1.CreatedDate != null && p2.CreatedDate != null) {
|
||||
|
@ -93,8 +166,9 @@ case object ConversionUtil {
|
|||
|
||||
}
|
||||
|
||||
|
||||
def updatePubsWithDescription(inputItem:((String, Publication), MagPaperAbstract)) : Publication = {
|
||||
def updatePubsWithDescription(
|
||||
inputItem: ((String, Publication), MagPaperAbstract)
|
||||
): Publication = {
|
||||
val pub = inputItem._1._2
|
||||
val abst = inputItem._2
|
||||
if (abst != null) {
|
||||
|
@ -104,20 +178,22 @@ case object ConversionUtil {
|
|||
|
||||
}
|
||||
|
||||
def updatePubsWithConferenceInfo(
|
||||
inputItem: ((String, Publication), MagConferenceInstance)
|
||||
): Publication = {
|
||||
val publication: Publication = inputItem._1._2
|
||||
val ci: MagConferenceInstance = inputItem._2
|
||||
|
||||
def updatePubsWithConferenceInfo(inputItem:((String, Publication), MagConferenceInstance)) : Publication = {
|
||||
val publication:Publication= inputItem._1._2
|
||||
val ci:MagConferenceInstance = inputItem._2
|
||||
if (ci != null) {
|
||||
|
||||
if (ci!= null){
|
||||
|
||||
val j:Journal = new Journal
|
||||
val j: Journal = new Journal
|
||||
if (ci.Location.isDefined)
|
||||
j.setConferenceplace(ci.Location.get)
|
||||
j.setName(ci.DisplayName.get)
|
||||
if (ci.StartDate.isDefined && ci.EndDate.isDefined)
|
||||
{
|
||||
j.setConferencedate(s"${ci.StartDate.get.toString.substring(0,10)} - ${ci.EndDate.get.toString.substring(0,10)}")
|
||||
if (ci.StartDate.isDefined && ci.EndDate.isDefined) {
|
||||
j.setConferencedate(
|
||||
s"${ci.StartDate.get.toString.substring(0, 10)} - ${ci.EndDate.get.toString.substring(0, 10)}"
|
||||
)
|
||||
}
|
||||
|
||||
publication.setJournal(j)
|
||||
|
@ -125,7 +201,7 @@ case object ConversionUtil {
|
|||
publication
|
||||
}
|
||||
|
||||
def updatePubsWithSubject(item:((String, Publication), MagFieldOfStudy)) : Publication = {
|
||||
def updatePubsWithSubject(item: ((String, Publication), MagFieldOfStudy)): Publication = {
|
||||
|
||||
val publication = item._1._2
|
||||
val fieldOfStudy = item._2
|
||||
|
@ -135,16 +211,34 @@ case object ConversionUtil {
|
|||
val classid = "MAG"
|
||||
|
||||
val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap(s => {
|
||||
val s1 = createSP(s.DisplayName, classid,className, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES)
|
||||
val s1 = createSP(
|
||||
s.DisplayName,
|
||||
classid,
|
||||
className,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES
|
||||
)
|
||||
val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString)
|
||||
var resList: List[StructuredProperty] = List(s1)
|
||||
if (s.MainType.isDefined) {
|
||||
val maintp = s.MainType.get
|
||||
val s2 = createSP(s.MainType.get, classid,className, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES)
|
||||
val s2 = createSP(
|
||||
s.MainType.get,
|
||||
classid,
|
||||
className,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES
|
||||
)
|
||||
s2.setDataInfo(di)
|
||||
resList = resList ::: List(s2)
|
||||
if (maintp.contains(".")) {
|
||||
val s3 = createSP(maintp.split("\\.").head, classid,className, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES)
|
||||
val s3 = createSP(
|
||||
maintp.split("\\.").head,
|
||||
classid,
|
||||
className,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES
|
||||
)
|
||||
s3.setDataInfo(di)
|
||||
resList = resList ::: List(s3)
|
||||
}
|
||||
|
@ -156,25 +250,27 @@ case object ConversionUtil {
|
|||
publication
|
||||
}
|
||||
|
||||
|
||||
|
||||
def addInstances(a: (Publication, MagUrl)): Publication = {
|
||||
val pub = a._1
|
||||
val urls = a._2
|
||||
|
||||
|
||||
|
||||
val i = new Instance
|
||||
|
||||
if (urls != null) {
|
||||
|
||||
if (urls!= null) {
|
||||
|
||||
val l:List[String] = urls.instances.filter(k=>k.SourceUrl.nonEmpty).map(k=>k.SourceUrl):::List(s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}")
|
||||
val l: List[String] = urls.instances
|
||||
.filter(k => k.SourceUrl.nonEmpty)
|
||||
.map(k => k.SourceUrl) ::: List(
|
||||
s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}"
|
||||
)
|
||||
|
||||
i.setUrl(l.asJava)
|
||||
}
|
||||
else
|
||||
i.setUrl(List(s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}").asJava)
|
||||
} else
|
||||
i.setUrl(
|
||||
List(
|
||||
s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}"
|
||||
).asJava
|
||||
)
|
||||
|
||||
// Ticket #6281 added pid to Instance
|
||||
i.setPid(pub.getPid)
|
||||
|
@ -184,13 +280,13 @@ case object ConversionUtil {
|
|||
pub
|
||||
}
|
||||
|
||||
|
||||
/** Returns a copy of the given abstract whose `IndexedAbstract` has been replaced by the
 * result of `convertInvertedIndexString`; the paper id is carried over unchanged.
 *
 * @param input the abstract to convert
 */
def transformPaperAbstract(input: MagPaperAbstract): MagPaperAbstract = {
  val convertedAbstract = convertInvertedIndexString(input.IndexedAbstract)
  input.copy(IndexedAbstract = convertedAbstract)
}
|
||||
|
||||
|
||||
def createOAFFromJournalAuthorPaper(inputParams: ((MagPapers, MagJournal), MagPaperWithAuthorList)): Publication = {
|
||||
def createOAFFromJournalAuthorPaper(
|
||||
inputParams: ((MagPapers, MagJournal), MagPaperWithAuthorList)
|
||||
): Publication = {
|
||||
val paper = inputParams._1._1
|
||||
val journal = inputParams._1._2
|
||||
val authors = inputParams._2
|
||||
|
@ -206,31 +302,37 @@ case object ConversionUtil {
|
|||
pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub))
|
||||
|
||||
val mainTitles = createSP(paper.PaperTitle, "main title", ModelConstants.DNET_DATACITE_TITLE)
|
||||
val originalTitles = createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
|
||||
val originalTitles =
|
||||
createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
|
||||
pub.setTitle(List(mainTitles, originalTitles).asJava)
|
||||
|
||||
pub.setSource(List(asField(paper.BookTitle)).asJava)
|
||||
|
||||
val authorsOAF = authors.authors.map { f: MagAuthorAffiliation =>
|
||||
|
||||
val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author
|
||||
a.setRank(f.sequenceNumber)
|
||||
if (f.author.DisplayName.isDefined)
|
||||
a.setFullname(f.author.DisplayName.get)
|
||||
if(f.affiliation!= null)
|
||||
if (f.affiliation != null)
|
||||
a.setAffiliation(List(asField(f.affiliation)).asJava)
|
||||
a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", ModelConstants.DNET_PID_TYPES)).asJava)
|
||||
a.setPid(
|
||||
List(
|
||||
createSP(
|
||||
s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}",
|
||||
"URL",
|
||||
ModelConstants.DNET_PID_TYPES
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
a
|
||||
}
|
||||
pub.setAuthor(authorsOAF.asJava)
|
||||
|
||||
|
||||
if (paper.Date != null && paper.Date.isDefined) {
|
||||
pub.setDateofacceptance(asField(paper.Date.get.toString.substring(0,10)))
|
||||
pub.setDateofacceptance(asField(paper.Date.get.toString.substring(0, 10)))
|
||||
}
|
||||
pub.setPublisher(asField(paper.Publisher))
|
||||
|
||||
|
||||
if (journal != null && journal.DisplayName.isDefined) {
|
||||
val j = new Journal
|
||||
|
||||
|
@ -250,8 +352,9 @@ case object ConversionUtil {
|
|||
pub
|
||||
}
|
||||
|
||||
|
||||
def createOAF(inputParams: ((MagPapers, MagPaperWithAuthorList), MagPaperAbstract)): Publication = {
|
||||
def createOAF(
|
||||
inputParams: ((MagPapers, MagPaperWithAuthorList), MagPaperAbstract)
|
||||
): Publication = {
|
||||
|
||||
val paper = inputParams._1._1
|
||||
val authors = inputParams._1._2
|
||||
|
@ -268,46 +371,48 @@ case object ConversionUtil {
|
|||
pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub))
|
||||
|
||||
val mainTitles = createSP(paper.PaperTitle, "main title", ModelConstants.DNET_DATACITE_TITLE)
|
||||
val originalTitles = createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
|
||||
val originalTitles =
|
||||
createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
|
||||
pub.setTitle(List(mainTitles, originalTitles).asJava)
|
||||
|
||||
pub.setSource(List(asField(paper.BookTitle)).asJava)
|
||||
|
||||
|
||||
if (description != null) {
|
||||
pub.setDescription(List(asField(description.IndexedAbstract)).asJava)
|
||||
}
|
||||
|
||||
|
||||
val authorsOAF = authors.authors.map { f: MagAuthorAffiliation =>
|
||||
|
||||
val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author
|
||||
|
||||
a.setFullname(f.author.DisplayName.get)
|
||||
|
||||
if(f.affiliation!= null)
|
||||
if (f.affiliation != null)
|
||||
a.setAffiliation(List(asField(f.affiliation)).asJava)
|
||||
|
||||
|
||||
a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", ModelConstants.DNET_PID_TYPES)).asJava)
|
||||
a.setPid(
|
||||
List(
|
||||
createSP(
|
||||
s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}",
|
||||
"URL",
|
||||
ModelConstants.DNET_PID_TYPES
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
a
|
||||
|
||||
}
|
||||
|
||||
|
||||
if (paper.Date != null) {
|
||||
pub.setDateofacceptance(asField(paper.Date.toString.substring(0,10)))
|
||||
pub.setDateofacceptance(asField(paper.Date.toString.substring(0, 10)))
|
||||
}
|
||||
|
||||
pub.setAuthor(authorsOAF.asJava)
|
||||
|
||||
|
||||
pub
|
||||
|
||||
}
|
||||
|
||||
|
||||
def convertInvertedIndexString(json_input: String): String = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: json4s.JValue = parse(json_input)
|
||||
|
@ -317,13 +422,13 @@ case object ConversionUtil {
|
|||
|
||||
val iid = (json \ "InvertedIndex").extract[Map[String, List[Int]]]
|
||||
|
||||
for {(k: String, v: List[Int]) <- iid} {
|
||||
for { (k: String, v: List[Int]) <- iid } {
|
||||
v.foreach(item => res(item) = k)
|
||||
}
|
||||
(0 until idl).foreach(i => {
|
||||
if (res(i) == null)
|
||||
res(i) = ""
|
||||
})
|
||||
(0 until idl).foreach(i => {
|
||||
if (res(i) == null)
|
||||
res(i) = ""
|
||||
})
|
||||
return res.mkString(" ")
|
||||
}
|
||||
""
|
||||
|
|
|
@ -8,44 +8,244 @@ import org.apache.spark.sql.{SaveMode, SparkSession}
|
|||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
object SparkImportMagIntoDataset {
|
||||
|
||||
val datatypedict = Map(
|
||||
"bool" -> BooleanType,
|
||||
"int" -> IntegerType,
|
||||
"uint" -> IntegerType,
|
||||
"long" -> LongType,
|
||||
"ulong" -> LongType,
|
||||
"float" -> FloatType,
|
||||
"string" -> StringType,
|
||||
"bool" -> BooleanType,
|
||||
"int" -> IntegerType,
|
||||
"uint" -> IntegerType,
|
||||
"long" -> LongType,
|
||||
"ulong" -> LongType,
|
||||
"float" -> FloatType,
|
||||
"string" -> StringType,
|
||||
"DateTime" -> DateType
|
||||
)
|
||||
|
||||
|
||||
val stream = Map(
|
||||
"Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Iso3166Code:string", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
|
||||
"AuthorExtendedAttributes" -> Tuple2("mag/AuthorExtendedAttributes.txt", Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")),
|
||||
"Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
||||
"ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
|
||||
"ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
||||
"EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")),
|
||||
"FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")),
|
||||
"FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")),
|
||||
"FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
||||
"Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
||||
"PaperAbstractsInvertedIndex" -> Tuple2("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", "IndexedAbstract:string")),
|
||||
"PaperAuthorAffiliations" -> Tuple2("mag/PaperAuthorAffiliations.txt", Seq("PaperId:long", "AuthorId:long", "AffiliationId:long?", "AuthorSequenceNumber:uint", "OriginalAuthor:string", "OriginalAffiliation:string")),
|
||||
"PaperCitationContexts" -> Tuple2("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")),
|
||||
"PaperExtendedAttributes" -> Tuple2("mag/PaperExtendedAttributes.txt", Seq("PaperId:long", "AttributeType:int", "AttributeValue:string")),
|
||||
"PaperFieldsOfStudy" -> Tuple2("advanced/PaperFieldsOfStudy.txt", Seq("PaperId:long", "FieldOfStudyId:long", "Score:float")),
|
||||
"PaperMeSH" -> Tuple2("advanced/PaperMeSH.txt", Seq("PaperId:long", "DescriptorUI:string", "DescriptorName:string", "QualifierUI:string", "QualifierName:string", "IsMajorTopic:bool")),
|
||||
"PaperRecommendations" -> Tuple2("advanced/PaperRecommendations.txt", Seq("PaperId:long", "RecommendedPaperId:long", "Score:float")),
|
||||
"PaperReferences" -> Tuple2("mag/PaperReferences.txt", Seq("PaperId:long", "PaperReferenceId:long")),
|
||||
"PaperResources" -> Tuple2("mag/PaperResources.txt", Seq("PaperId:long", "ResourceType:int", "ResourceUrl:string", "SourceUrl:string", "RelationshipType:int")),
|
||||
"PaperUrls" -> Tuple2("mag/PaperUrls.txt", Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")),
|
||||
"Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "OnlineDate:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "FamilyRank:uint?", "CreatedDate:DateTime")),
|
||||
"RelatedFieldOfStudy" -> Tuple2("advanced/RelatedFieldOfStudy.txt", Seq("FieldOfStudyId1:long", "Type1:string", "FieldOfStudyId2:long", "Type2:string", "Rank:float"))
|
||||
"Affiliations" -> Tuple2(
|
||||
"mag/Affiliations.txt",
|
||||
Seq(
|
||||
"AffiliationId:long",
|
||||
"Rank:uint",
|
||||
"NormalizedName:string",
|
||||
"DisplayName:string",
|
||||
"GridId:string",
|
||||
"OfficialPage:string",
|
||||
"WikiPage:string",
|
||||
"PaperCount:long",
|
||||
"PaperFamilyCount:long",
|
||||
"CitationCount:long",
|
||||
"Iso3166Code:string",
|
||||
"Latitude:float?",
|
||||
"Longitude:float?",
|
||||
"CreatedDate:DateTime"
|
||||
)
|
||||
),
|
||||
"AuthorExtendedAttributes" -> Tuple2(
|
||||
"mag/AuthorExtendedAttributes.txt",
|
||||
Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")
|
||||
),
|
||||
"Authors" -> Tuple2(
|
||||
"mag/Authors.txt",
|
||||
Seq(
|
||||
"AuthorId:long",
|
||||
"Rank:uint",
|
||||
"NormalizedName:string",
|
||||
"DisplayName:string",
|
||||
"LastKnownAffiliationId:long?",
|
||||
"PaperCount:long",
|
||||
"PaperFamilyCount:long",
|
||||
"CitationCount:long",
|
||||
"CreatedDate:DateTime"
|
||||
)
|
||||
),
|
||||
"ConferenceInstances" -> Tuple2(
|
||||
"mag/ConferenceInstances.txt",
|
||||
Seq(
|
||||
"ConferenceInstanceId:long",
|
||||
"NormalizedName:string",
|
||||
"DisplayName:string",
|
||||
"ConferenceSeriesId:long",
|
||||
"Location:string",
|
||||
"OfficialUrl:string",
|
||||
"StartDate:DateTime?",
|
||||
"EndDate:DateTime?",
|
||||
"AbstractRegistrationDate:DateTime?",
|
||||
"SubmissionDeadlineDate:DateTime?",
|
||||
"NotificationDueDate:DateTime?",
|
||||
"FinalVersionDueDate:DateTime?",
|
||||
"PaperCount:long",
|
||||
"PaperFamilyCount:long",
|
||||
"CitationCount:long",
|
||||
"Latitude:float?",
|
||||
"Longitude:float?",
|
||||
"CreatedDate:DateTime"
|
||||
)
|
||||
),
|
||||
"ConferenceSeries" -> Tuple2(
|
||||
"mag/ConferenceSeries.txt",
|
||||
Seq(
|
||||
"ConferenceSeriesId:long",
|
||||
"Rank:uint",
|
||||
"NormalizedName:string",
|
||||
"DisplayName:string",
|
||||
"PaperCount:long",
|
||||
"PaperFamilyCount:long",
|
||||
"CitationCount:long",
|
||||
"CreatedDate:DateTime"
|
||||
)
|
||||
),
|
||||
"EntityRelatedEntities" -> Tuple2(
|
||||
"advanced/EntityRelatedEntities.txt",
|
||||
Seq(
|
||||
"EntityId:long",
|
||||
"EntityType:string",
|
||||
"RelatedEntityId:long",
|
||||
"RelatedEntityType:string",
|
||||
"RelatedType:int",
|
||||
"Score:float"
|
||||
)
|
||||
),
|
||||
"FieldOfStudyChildren" -> Tuple2(
|
||||
"advanced/FieldOfStudyChildren.txt",
|
||||
Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")
|
||||
),
|
||||
"FieldOfStudyExtendedAttributes" -> Tuple2(
|
||||
"advanced/FieldOfStudyExtendedAttributes.txt",
|
||||
Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")
|
||||
),
|
||||
"FieldsOfStudy" -> Tuple2(
|
||||
"advanced/FieldsOfStudy.txt",
|
||||
Seq(
|
||||
"FieldOfStudyId:long",
|
||||
"Rank:uint",
|
||||
"NormalizedName:string",
|
||||
"DisplayName:string",
|
||||
"MainType:string",
|
||||
"Level:int",
|
||||
"PaperCount:long",
|
||||
"PaperFamilyCount:long",
|
||||
"CitationCount:long",
|
||||
"CreatedDate:DateTime"
|
||||
)
|
||||
),
|
||||
"Journals" -> Tuple2(
|
||||
"mag/Journals.txt",
|
||||
Seq(
|
||||
"JournalId:long",
|
||||
"Rank:uint",
|
||||
"NormalizedName:string",
|
||||
"DisplayName:string",
|
||||
"Issn:string",
|
||||
"Publisher:string",
|
||||
"Webpage:string",
|
||||
"PaperCount:long",
|
||||
"PaperFamilyCount:long",
|
||||
"CitationCount:long",
|
||||
"CreatedDate:DateTime"
|
||||
)
|
||||
),
|
||||
"PaperAbstractsInvertedIndex" -> Tuple2(
|
||||
"nlp/PaperAbstractsInvertedIndex.txt.*",
|
||||
Seq("PaperId:long", "IndexedAbstract:string")
|
||||
),
|
||||
"PaperAuthorAffiliations" -> Tuple2(
|
||||
"mag/PaperAuthorAffiliations.txt",
|
||||
Seq(
|
||||
"PaperId:long",
|
||||
"AuthorId:long",
|
||||
"AffiliationId:long?",
|
||||
"AuthorSequenceNumber:uint",
|
||||
"OriginalAuthor:string",
|
||||
"OriginalAffiliation:string"
|
||||
)
|
||||
),
|
||||
"PaperCitationContexts" -> Tuple2(
|
||||
"nlp/PaperCitationContexts.txt",
|
||||
Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")
|
||||
),
|
||||
"PaperExtendedAttributes" -> Tuple2(
|
||||
"mag/PaperExtendedAttributes.txt",
|
||||
Seq("PaperId:long", "AttributeType:int", "AttributeValue:string")
|
||||
),
|
||||
"PaperFieldsOfStudy" -> Tuple2(
|
||||
"advanced/PaperFieldsOfStudy.txt",
|
||||
Seq("PaperId:long", "FieldOfStudyId:long", "Score:float")
|
||||
),
|
||||
"PaperMeSH" -> Tuple2(
|
||||
"advanced/PaperMeSH.txt",
|
||||
Seq(
|
||||
"PaperId:long",
|
||||
"DescriptorUI:string",
|
||||
"DescriptorName:string",
|
||||
"QualifierUI:string",
|
||||
"QualifierName:string",
|
||||
"IsMajorTopic:bool"
|
||||
)
|
||||
),
|
||||
"PaperRecommendations" -> Tuple2(
|
||||
"advanced/PaperRecommendations.txt",
|
||||
Seq("PaperId:long", "RecommendedPaperId:long", "Score:float")
|
||||
),
|
||||
"PaperReferences" -> Tuple2(
|
||||
"mag/PaperReferences.txt",
|
||||
Seq("PaperId:long", "PaperReferenceId:long")
|
||||
),
|
||||
"PaperResources" -> Tuple2(
|
||||
"mag/PaperResources.txt",
|
||||
Seq(
|
||||
"PaperId:long",
|
||||
"ResourceType:int",
|
||||
"ResourceUrl:string",
|
||||
"SourceUrl:string",
|
||||
"RelationshipType:int"
|
||||
)
|
||||
),
|
||||
"PaperUrls" -> Tuple2(
|
||||
"mag/PaperUrls.txt",
|
||||
Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")
|
||||
),
|
||||
"Papers" -> Tuple2(
|
||||
"mag/Papers.txt",
|
||||
Seq(
|
||||
"PaperId:long",
|
||||
"Rank:uint",
|
||||
"Doi:string",
|
||||
"DocType:string",
|
||||
"PaperTitle:string",
|
||||
"OriginalTitle:string",
|
||||
"BookTitle:string",
|
||||
"Year:int?",
|
||||
"Date:DateTime?",
|
||||
"OnlineDate:DateTime?",
|
||||
"Publisher:string",
|
||||
"JournalId:long?",
|
||||
"ConferenceSeriesId:long?",
|
||||
"ConferenceInstanceId:long?",
|
||||
"Volume:string",
|
||||
"Issue:string",
|
||||
"FirstPage:string",
|
||||
"LastPage:string",
|
||||
"ReferenceCount:long",
|
||||
"CitationCount:long",
|
||||
"EstimatedCitation:long",
|
||||
"OriginalVenue:string",
|
||||
"FamilyId:long?",
|
||||
"FamilyRank:uint?",
|
||||
"CreatedDate:DateTime"
|
||||
)
|
||||
),
|
||||
"RelatedFieldOfStudy" -> Tuple2(
|
||||
"advanced/RelatedFieldOfStudy.txt",
|
||||
Seq(
|
||||
"FieldOfStudyId1:long",
|
||||
"Type1:string",
|
||||
"FieldOfStudyId2:long",
|
||||
"Type2:string",
|
||||
"Rank:float"
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def getSchema(streamName: String): StructType = {
|
||||
var schema = new StructType()
|
||||
val d: Seq[String] = stream(streamName)._2
|
||||
|
@ -61,19 +261,22 @@ object SparkImportMagIntoDataset {
|
|||
schema
|
||||
}
|
||||
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/convert_mag_to_oaf_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/convert_mag_to_oaf_params.json")
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
stream.foreach { case (k, v) =>
|
||||
val s: StructType = getSchema(k)
|
||||
|
|
|
@ -9,21 +9,42 @@ import org.apache.spark.sql.functions.{col, collect_list, struct}
|
|||
import org.apache.spark.sql._
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
object SparkProcessMAG {
|
||||
|
||||
def getDistinctResults(d: Dataset[MagPapers]): Dataset[MagPapers] = {
|
||||
d.where(col("Doi").isNotNull)
|
||||
.groupByKey(mp => DoiBoostMappingUtil.normalizeDoi(mp.Doi))(Encoders.STRING)
|
||||
.reduceGroups((p1: MagPapers, p2: MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1, p2))
|
||||
.reduceGroups((p1: MagPapers, p2: MagPapers) =>
|
||||
ConversionUtil.choiceLatestMagArtitcle(p1, p2)
|
||||
)
|
||||
.map(_._2)(Encoders.product[MagPapers])
|
||||
.map(mp => {
|
||||
MagPapers(mp.PaperId, mp.Rank, DoiBoostMappingUtil.normalizeDoi(mp.Doi),
|
||||
mp.DocType, mp.PaperTitle, mp.OriginalTitle,
|
||||
mp.BookTitle, mp.Year, mp.Date, mp.Publisher: String,
|
||||
mp.JournalId, mp.ConferenceSeriesId, mp.ConferenceInstanceId,
|
||||
mp.Volume, mp.Issue, mp.FirstPage, mp.LastPage,
|
||||
mp.ReferenceCount, mp.CitationCount, mp.EstimatedCitation,
|
||||
mp.OriginalVenue, mp.FamilyId, mp.CreatedDate)
|
||||
MagPapers(
|
||||
mp.PaperId,
|
||||
mp.Rank,
|
||||
DoiBoostMappingUtil.normalizeDoi(mp.Doi),
|
||||
mp.DocType,
|
||||
mp.PaperTitle,
|
||||
mp.OriginalTitle,
|
||||
mp.BookTitle,
|
||||
mp.Year,
|
||||
mp.Date,
|
||||
mp.Publisher: String,
|
||||
mp.JournalId,
|
||||
mp.ConferenceSeriesId,
|
||||
mp.ConferenceInstanceId,
|
||||
mp.Volume,
|
||||
mp.Issue,
|
||||
mp.FirstPage,
|
||||
mp.LastPage,
|
||||
mp.ReferenceCount,
|
||||
mp.CitationCount,
|
||||
mp.EstimatedCitation,
|
||||
mp.OriginalVenue,
|
||||
mp.FamilyId,
|
||||
mp.CreatedDate
|
||||
)
|
||||
})(Encoders.product[MagPapers])
|
||||
}
|
||||
|
||||
|
@ -31,22 +52,29 @@ object SparkProcessMAG {
|
|||
|
||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json")
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
val workingPath = parser.get("workingPath")
|
||||
val targetPath = parser.get("targetPath")
|
||||
|
||||
import spark.implicits._
|
||||
implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication]
|
||||
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
|
||||
implicit val mapEncoderPubs: Encoder[Publication] =
|
||||
org.apache.spark.sql.Encoders.kryo[Publication]
|
||||
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] =
|
||||
Encoders.tuple(Encoders.STRING, mapEncoderPubs)
|
||||
|
||||
logger.info("Phase 1) make uninue DOI in Papers:")
|
||||
val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers]
|
||||
|
@ -58,16 +86,23 @@ object SparkProcessMAG {
|
|||
|
||||
logger.info("Phase 0) Enrich Publication with description")
|
||||
val pa = spark.read.load(s"$sourcePath/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
|
||||
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"$workingPath/PaperAbstract")
|
||||
pa.map(ConversionUtil.transformPaperAbstract)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/PaperAbstract")
|
||||
|
||||
logger.info("Phase 3) Group Author by PaperId")
|
||||
val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]
|
||||
|
||||
val affiliation = spark.read.load(s"$sourcePath/Affiliations").as[MagAffiliation]
|
||||
val paperAuthorAffiliation = spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation]
|
||||
val paperAuthorAffiliation =
|
||||
spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation]
|
||||
|
||||
paperAuthorAffiliation.joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId")))
|
||||
.map { case (a: MagPaperAuthorAffiliation, b: MagAuthor) => (a.AffiliationId, MagPaperAuthorDenormalized(a.PaperId, b, null, a.AuthorSequenceNumber)) }
|
||||
paperAuthorAffiliation
|
||||
.joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId")))
|
||||
.map { case (a: MagPaperAuthorAffiliation, b: MagAuthor) =>
|
||||
(a.AffiliationId, MagPaperAuthorDenormalized(a.PaperId, b, null, a.AuthorSequenceNumber))
|
||||
}
|
||||
.joinWith(affiliation, affiliation("AffiliationId").equalTo(col("_1")), "left")
|
||||
.map(s => {
|
||||
val mpa = s._1._2
|
||||
|
@ -76,79 +111,133 @@ object SparkProcessMAG {
|
|||
MagPaperAuthorDenormalized(mpa.PaperId, mpa.author, af.DisplayName, mpa.sequenceNumber)
|
||||
} else
|
||||
mpa
|
||||
}).groupBy("PaperId").agg(collect_list(struct($"author", $"affiliation", $"sequenceNumber")).as("authors"))
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_1_paper_authors")
|
||||
})
|
||||
.groupBy("PaperId")
|
||||
.agg(collect_list(struct($"author", $"affiliation", $"sequenceNumber")).as("authors"))
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/merge_step_1_paper_authors")
|
||||
|
||||
logger.info("Phase 4) create First Version of publication Entity with Paper Journal and Authors")
|
||||
logger.info(
|
||||
"Phase 4) create First Version of publication Entity with Paper Journal and Authors"
|
||||
)
|
||||
|
||||
val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal]
|
||||
|
||||
val papers = spark.read.load((s"$workingPath/Papers_distinct")).as[MagPapers]
|
||||
val papers = spark.read.load(s"$workingPath/Papers_distinct").as[MagPapers]
|
||||
|
||||
val paperWithAuthors = spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
|
||||
val paperWithAuthors =
|
||||
spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
|
||||
|
||||
val firstJoin = papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
|
||||
firstJoin.joinWith(paperWithAuthors, firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")), "left")
|
||||
val firstJoin =
|
||||
papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
|
||||
firstJoin
|
||||
.joinWith(
|
||||
paperWithAuthors,
|
||||
firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")),
|
||||
"left"
|
||||
)
|
||||
.map { a => ConversionUtil.createOAFFromJournalAuthorPaper(a) }
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_2")
|
||||
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/merge_step_2")
|
||||
|
||||
var magPubs: Dataset[(String, Publication)] =
|
||||
spark.read.load(s"$workingPath/merge_step_2").as[Publication]
|
||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
|
||||
spark.read
|
||||
.load(s"$workingPath/merge_step_2")
|
||||
.as[Publication]
|
||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
|
||||
.as[(String, Publication)]
|
||||
|
||||
val conference = spark.read
|
||||
.load(s"$sourcePath/ConferenceInstances")
|
||||
.select(
|
||||
$"ConferenceInstanceId".as("ci"),
|
||||
$"DisplayName",
|
||||
$"Location",
|
||||
$"StartDate",
|
||||
$"EndDate"
|
||||
)
|
||||
val conferenceInstance = conference
|
||||
.joinWith(papers, papers("ConferenceInstanceId").equalTo(conference("ci")))
|
||||
.select(
|
||||
$"_1.ci",
|
||||
$"_1.DisplayName",
|
||||
$"_1.Location",
|
||||
$"_1.StartDate",
|
||||
$"_1.EndDate",
|
||||
$"_2.PaperId"
|
||||
)
|
||||
.as[MagConferenceInstance]
|
||||
|
||||
val conference = spark.read.load(s"$sourcePath/ConferenceInstances")
|
||||
.select($"ConferenceInstanceId".as("ci"), $"DisplayName", $"Location", $"StartDate", $"EndDate")
|
||||
val conferenceInstance = conference.joinWith(papers, papers("ConferenceInstanceId").equalTo(conference("ci")))
|
||||
.select($"_1.ci", $"_1.DisplayName", $"_1.Location", $"_1.StartDate", $"_1.EndDate", $"_2.PaperId").as[MagConferenceInstance]
|
||||
|
||||
|
||||
magPubs.joinWith(conferenceInstance, col("_1").equalTo(conferenceInstance("PaperId")), "left")
|
||||
magPubs
|
||||
.joinWith(conferenceInstance, col("_1").equalTo(conferenceInstance("PaperId")), "left")
|
||||
.map(item => ConversionUtil.updatePubsWithConferenceInfo(item))
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/merge_step_3")
|
||||
|
||||
val paperAbstract = spark.read.load(s"$workingPath/PaperAbstract").as[MagPaperAbstract]
|
||||
|
||||
val paperAbstract = spark.read.load((s"$workingPath/PaperAbstract")).as[MagPaperAbstract]
|
||||
|
||||
|
||||
magPubs = spark.read.load(s"$workingPath/merge_step_3").as[Publication]
|
||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
|
||||
|
||||
magPubs.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
|
||||
.map(item => ConversionUtil.updatePubsWithDescription(item)
|
||||
).write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_4")
|
||||
magPubs = spark.read
|
||||
.load(s"$workingPath/merge_step_3")
|
||||
.as[Publication]
|
||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
|
||||
.as[(String, Publication)]
|
||||
|
||||
magPubs
|
||||
.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
|
||||
.map(item => ConversionUtil.updatePubsWithDescription(item))
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/merge_step_4")
|
||||
|
||||
logger.info("Phase 7) Enrich Publication with FieldOfStudy")
|
||||
|
||||
magPubs = spark.read.load(s"$workingPath/merge_step_4").as[Publication]
|
||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
|
||||
magPubs = spark.read
|
||||
.load(s"$workingPath/merge_step_4")
|
||||
.as[Publication]
|
||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
|
||||
.as[(String, Publication)]
|
||||
|
||||
val fos = spark.read.load(s"$sourcePath/FieldsOfStudy").select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")
|
||||
val fos = spark.read
|
||||
.load(s"$sourcePath/FieldsOfStudy")
|
||||
.select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")
|
||||
|
||||
val pfos = spark.read.load(s"$sourcePath/PaperFieldsOfStudy")
|
||||
|
||||
val paperField = pfos.joinWith(fos, fos("fos").equalTo(pfos("FieldOfStudyId")))
|
||||
val paperField = pfos
|
||||
.joinWith(fos, fos("fos").equalTo(pfos("FieldOfStudyId")))
|
||||
.select($"_1.FieldOfStudyId", $"_2.DisplayName", $"_2.MainType", $"_1.PaperId", $"_1.Score")
|
||||
.groupBy($"PaperId").agg(collect_list(struct($"FieldOfStudyId", $"DisplayName", $"MainType", $"Score")).as("subjects"))
|
||||
.groupBy($"PaperId")
|
||||
.agg(
|
||||
collect_list(struct($"FieldOfStudyId", $"DisplayName", $"MainType", $"Score"))
|
||||
.as("subjects")
|
||||
)
|
||||
.as[MagFieldOfStudy]
|
||||
|
||||
magPubs.joinWith(paperField, col("_1")
|
||||
.equalTo(paperField("PaperId")), "left")
|
||||
magPubs
|
||||
.joinWith(
|
||||
paperField,
|
||||
col("_1")
|
||||
.equalTo(paperField("PaperId")),
|
||||
"left"
|
||||
)
|
||||
.map(item => ConversionUtil.updatePubsWithSubject(item))
|
||||
.write.mode(SaveMode.Overwrite)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/mag_publication")
|
||||
|
||||
spark.read.load(s"$workingPath/mag_publication").as[Publication]
|
||||
spark.read
|
||||
.load(s"$workingPath/mag_publication")
|
||||
.as[Publication]
|
||||
.filter(p => p.getId != null)
|
||||
.groupByKey(p => p.getId)
|
||||
.reduceGroups((a: Publication, b: Publication) => ConversionUtil.mergePublication(a, b))
|
||||
.map(_._2)
|
||||
.write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
|
||||
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$targetPath/magPublication")
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -15,15 +15,20 @@ import org.slf4j.{Logger, LoggerFactory}
|
|||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
case class ORCIDItem(doi: String, authors: List[OrcidAuthor]) {}
|
||||
|
||||
case class ORCIDItem(doi:String, authors:List[OrcidAuthor]){}
|
||||
case class OrcidAuthor(oid:String, name:Option[String], surname:Option[String], creditName:Option[String], otherNames:Option[List[String]], errorCode:Option[String]){}
|
||||
case class OrcidWork(oid:String, doi:String)
|
||||
case class OrcidAuthor(
|
||||
oid: String,
|
||||
name: Option[String],
|
||||
surname: Option[String],
|
||||
creditName: Option[String],
|
||||
otherNames: Option[List[String]],
|
||||
errorCode: Option[String]
|
||||
) {}
|
||||
case class OrcidWork(oid: String, doi: String)
|
||||
|
||||
case class ORCIDElement(doi: String, authors: List[ORCIDItem]) {}
|
||||
|
||||
|
||||
|
||||
case class ORCIDElement(doi:String, authors:List[ORCIDItem]) {}
|
||||
object ORCIDToOAF {
|
||||
val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
|
||||
val mapper = new ObjectMapper()
|
||||
|
@ -41,7 +46,7 @@ object ORCIDToOAF {
|
|||
|
||||
def extractValueFromInputString(input: String): (String, String) = {
|
||||
val i = input.indexOf('[')
|
||||
if (i <5) {
|
||||
if (i < 5) {
|
||||
return null
|
||||
}
|
||||
val orcidList = input.substring(i, input.length - 1)
|
||||
|
@ -51,17 +56,16 @@ object ORCIDToOAF {
|
|||
} else null
|
||||
}
|
||||
|
||||
|
||||
def strValid(s:Option[String]) : Boolean = {
|
||||
def strValid(s: Option[String]): Boolean = {
|
||||
s.isDefined && s.get.nonEmpty
|
||||
}
|
||||
|
||||
def authorValid(author:OrcidAuthor): Boolean ={
|
||||
def authorValid(author: OrcidAuthor): Boolean = {
|
||||
if (strValid(author.name) && strValid(author.surname)) {
|
||||
return true
|
||||
}
|
||||
if (strValid(author.surname)) {
|
||||
return true
|
||||
return true
|
||||
}
|
||||
if (strValid(author.creditName)) {
|
||||
return true
|
||||
|
@ -70,37 +74,35 @@ object ORCIDToOAF {
|
|||
false
|
||||
}
|
||||
|
||||
|
||||
def extractDOIWorks(input:String): List[OrcidWork] = {
|
||||
def extractDOIWorks(input: String): List[OrcidWork] = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: json4s.JValue = parse(input)
|
||||
|
||||
val oid = (json \ "workDetail" \"oid").extractOrElse[String](null)
|
||||
val oid = (json \ "workDetail" \ "oid").extractOrElse[String](null)
|
||||
if (oid == null)
|
||||
return List()
|
||||
val doi:List[(String, String)] = for {
|
||||
JObject(extIds) <- json \ "workDetail" \"extIds"
|
||||
val doi: List[(String, String)] = for {
|
||||
JObject(extIds) <- json \ "workDetail" \ "extIds"
|
||||
JField("type", JString(typeValue)) <- extIds
|
||||
JField("value", JString(value)) <- extIds
|
||||
JField("value", JString(value)) <- extIds
|
||||
if "doi".equalsIgnoreCase(typeValue)
|
||||
} yield (typeValue, DoiBoostMappingUtil.normalizeDoi(value))
|
||||
if (doi.nonEmpty) {
|
||||
return doi.map(l =>OrcidWork(oid, l._2))
|
||||
return doi.map(l => OrcidWork(oid, l._2))
|
||||
}
|
||||
List()
|
||||
}
|
||||
|
||||
def convertORCIDAuthor(input:String): OrcidAuthor = {
|
||||
def convertORCIDAuthor(input: String): OrcidAuthor = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: json4s.JValue = parse(input)
|
||||
|
||||
(json \"authorData" ).extractOrElse[OrcidAuthor](null)
|
||||
}
|
||||
(json \ "authorData").extractOrElse[OrcidAuthor](null)
|
||||
}
|
||||
|
||||
|
||||
def convertTOOAF(input:ORCIDItem) :Publication = {
|
||||
def convertTOOAF(input: ORCIDItem): Publication = {
|
||||
val doi = input.doi
|
||||
val pub:Publication = new Publication
|
||||
val pub: Publication = new Publication
|
||||
pub.setPid(List(createSP(doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
|
||||
pub.setDataInfo(generateDataInfo())
|
||||
|
||||
|
@ -108,9 +110,9 @@ object ORCIDToOAF {
|
|||
if (pub.getId == null)
|
||||
return null
|
||||
|
||||
try{
|
||||
try {
|
||||
|
||||
val l:List[Author]= input.authors.map(a=> {
|
||||
val l: List[Author] = input.authors.map(a => {
|
||||
generateAuthor(a)
|
||||
})(collection.breakOut)
|
||||
|
||||
|
@ -125,30 +127,38 @@ object ORCIDToOAF {
|
|||
}
|
||||
}
|
||||
|
||||
def generateOricPIDDatainfo():DataInfo = {
|
||||
val di =DoiBoostMappingUtil.generateDataInfo("0.91")
|
||||
def generateOricPIDDatainfo(): DataInfo = {
|
||||
val di = DoiBoostMappingUtil.generateDataInfo("0.91")
|
||||
di.getProvenanceaction.setClassid(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)
|
||||
di.getProvenanceaction.setClassname(ModelConstants.HARVESTED)
|
||||
di
|
||||
}
|
||||
|
||||
def generateAuthor(o : OrcidAuthor): Author = {
|
||||
def generateAuthor(o: OrcidAuthor): Author = {
|
||||
val a = new Author
|
||||
if (strValid(o.name)) {
|
||||
a.setName(o.name.get.capitalize)
|
||||
a.setName(o.name.get.capitalize)
|
||||
}
|
||||
if (strValid(o.surname)) {
|
||||
a.setSurname(o.surname.get.capitalize)
|
||||
}
|
||||
if(strValid(o.name) && strValid(o.surname))
|
||||
if (strValid(o.name) && strValid(o.surname))
|
||||
a.setFullname(s"${o.name.get.capitalize} ${o.surname.get.capitalize}")
|
||||
else if (strValid(o.creditName))
|
||||
a.setFullname(o.creditName.get)
|
||||
if (StringUtils.isNotBlank(o.oid))
|
||||
a.setPid(List(createSP(o.oid, ModelConstants.ORCID, ModelConstants.DNET_PID_TYPES, generateOricPIDDatainfo())).asJava)
|
||||
a.setPid(
|
||||
List(
|
||||
createSP(
|
||||
o.oid,
|
||||
ModelConstants.ORCID,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
generateOricPIDDatainfo()
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
a
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -10,11 +10,11 @@ import org.slf4j.{Logger, LoggerFactory}
|
|||
object SparkConvertORCIDToOAF {
|
||||
val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
|
||||
|
||||
|
||||
def run(spark: SparkSession, workingPath: String, targetPath: String): Unit = {
|
||||
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
import spark.implicits._
|
||||
val dataset: Dataset[ORCIDItem] = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem]
|
||||
val dataset: Dataset[ORCIDItem] =
|
||||
spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem]
|
||||
|
||||
logger.info("Converting ORCID to OAF")
|
||||
dataset.map(o => ORCIDToOAF.convertTOOAF(o)).write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
|
@ -22,15 +22,21 @@ object SparkConvertORCIDToOAF {
|
|||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_orcid_to_oaf_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
SparkConvertORCIDToOAF.getClass.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/convert_orcid_to_oaf_params.json"
|
||||
)
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
val workingPath = parser.get("workingPath")
|
||||
val targetPath = parser.get("targetPath")
|
||||
|
|
|
@ -17,45 +17,72 @@ object SparkPreprocessORCID {
|
|||
|
||||
}
|
||||
|
||||
|
||||
/** Pre-processes the ORCID input into DOI -> authors items.
  *
  * Reads `sourcePath/authors` and `sourcePath/works` as text, converts
  * them through `ORCIDToOAF`, stores the intermediate datasets under
  * `workingPath/author` and `workingPath/works`, then joins works with
  * authors on `oid`, groups the authors by DOI and writes the result to
  * `workingPath/orcidworksWithAuthor`. Every write overwrites its target.
  *
  * @param spark       the active Spark session
  * @param sourcePath  folder holding the `authors` and `works` inputs
  * @param workingPath folder receiving intermediate and final datasets
  */
def run(spark: SparkSession, sourcePath: String, workingPath: String): Unit = {
  import spark.implicits._
  implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]

  val authorPath = s"$workingPath/author"
  val worksPath = s"$workingPath/works"

  // Authors: parse each line, drop unparsable (null) and invalid records.
  val inputRDD: RDD[OrcidAuthor] = spark.sparkContext
    .textFile(s"$sourcePath/authors")
    .map(line => ORCIDToOAF.convertORCIDAuthor(line))
    .filter(author => author != null && ORCIDToOAF.authorValid(author))

  spark
    .createDataset(inputRDD)
    .as[OrcidAuthor]
    .write
    .mode(SaveMode.Overwrite)
    .save(authorPath)

  // Works: one input line may yield several DOI works.
  val res = spark.sparkContext
    .textFile(s"$sourcePath/works")
    .flatMap(line => ORCIDToOAF.extractDOIWorks(line))
    .filter(work => work != null)

  spark
    .createDataset(res)
    .as[OrcidWork]
    .write
    .mode(SaveMode.Overwrite)
    .save(worksPath)

  // Reload the stored datasets before joining them.
  val authors: Dataset[OrcidAuthor] = spark.read.load(authorPath).as[OrcidAuthor]
  val works: Dataset[OrcidWork] = spark.read.load(worksPath).as[OrcidWork]

  works
    .joinWith(authors, authors("oid").equalTo(works("oid")))
    .map { case (work, author) => (work.doi, author) }
    .groupBy(col("_1").alias("doi"))
    .agg(collect_list(col("_2")).alias("authors"))
    .as[ORCIDItem]
    .map(item => fixORCIDItem(item))
    .write
    .mode(SaveMode.Overwrite)
    .save(s"$workingPath/orcidworksWithAuthor")
}
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/preprocess_orcid_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
SparkConvertORCIDToOAF.getClass.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/preprocess_orcid_params.json"
|
||||
)
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
val workingPath = parser.get("workingPath")
|
||||
|
|
|
@ -13,28 +13,35 @@ object SparkMapUnpayWallToOAF {
|
|||
|
||||
/** Entry point of the Unpaywall -> OAF conversion job.
  *
  * Parses the arguments described by `convert_uw_to_oaf_params.json`,
  * reads the text input at `sourcePath`, converts every line with
  * `UnpayWallToOAF.convertToOAF`, discards the records converted to
  * null and writes the publications to `targetPath`, overwriting it.
  *
  * @param args command-line arguments (`master`, `sourcePath`, `targetPath`)
  */
def main(args: Array[String]): Unit = {
  // Use this job's own class for the logger category and the resource
  // lookup; the previous code referenced SparkMapDumpIntoOAF, so log
  // lines were attributed to a different job.
  val logger: Logger = LoggerFactory.getLogger(SparkMapUnpayWallToOAF.getClass)
  val conf: SparkConf = new SparkConf()
  val parser = new ArgumentApplicationParser(
    IOUtils.toString(
      SparkMapUnpayWallToOAF.getClass.getResourceAsStream(
        "/eu/dnetlib/dhp/doiboost/convert_uw_to_oaf_params.json"
      )
    )
  )
  parser.parseArgument(args)
  val spark: SparkSession =
    SparkSession
      .builder()
      .config(conf)
      .appName(getClass.getSimpleName)
      .master(parser.get("master"))
      .getOrCreate()

  // Publication is encoded with Kryo.
  implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]

  val sourcePath = parser.get("sourcePath")
  val targetPath = parser.get("targetPath")
  val inputRDD: RDD[String] = spark.sparkContext.textFile(sourcePath)

  logger.info("Converting UnpayWall to OAF")

  // createDataset already yields a Dataset[Publication]; no extra cast needed.
  val d: Dataset[Publication] =
    spark.createDataset(inputRDD.map(UnpayWallToOAF.convertToOAF).filter(p => p != null))
  d.write.mode(SaveMode.Overwrite).save(targetPath)
}
|
||||
|
||||
|
|
|
@ -12,33 +12,41 @@ import org.slf4j.{Logger, LoggerFactory}
|
|||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
|
||||
|
||||
/** One open-access location of an Unpaywall record, as extracted from
  * its `best_oa_location` JSON object. Field names follow the JSON keys
  * and every field is optional.
  */
case class OALocation(
  evidence: Option[String],
  host_type: Option[String],
  is_best: Option[Boolean],
  license: Option[String],
  pmh_id: Option[String],
  updated: Option[String],
  url: Option[String],
  url_for_landing_page: Option[String],
  url_for_pdf: Option[String],
  version: Option[String]
)
|
||||
|
||||
object UnpayWallToOAF {
|
||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||
|
||||
|
||||
/** Translates the Unpaywall `oa_status` value into an open-access route.
  *
  * The comparison is case-insensitive. A null input and the closed
  * status yield `None`. Unpaywall spells that status "closed"; the
  * legacy spelling "close" is still accepted. "green", "bronze" and
  * "hybrid" map to the matching route, and any other value maps to gold.
  *
  * @param input the `oa_status` string, possibly null
  * @return the matching route, or `None` for null / closed
  */
def get_unpaywall_color(input: String): Option[OpenAccessRoute] =
  if (
    input == null ||
    input.equalsIgnoreCase("close") ||
    input.equalsIgnoreCase("closed")
  )
    None
  else if (input.equalsIgnoreCase("green"))
    Some(OpenAccessRoute.green)
  else if (input.equalsIgnoreCase("bronze"))
    Some(OpenAccessRoute.bronze)
  else if (input.equalsIgnoreCase("hybrid"))
    Some(OpenAccessRoute.hybrid)
  else
    Some(OpenAccessRoute.gold)
|
||||
|
||||
def get_color(is_oa:Boolean, location: OALocation, journal_is_oa:Boolean):Option[OpenAccessRoute] = {
|
||||
def get_color(
|
||||
is_oa: Boolean,
|
||||
location: OALocation,
|
||||
journal_is_oa: Boolean
|
||||
): Option[OpenAccessRoute] = {
|
||||
if (is_oa) {
|
||||
if (location.host_type.isDefined) {
|
||||
{
|
||||
|
@ -62,23 +70,22 @@ object UnpayWallToOAF {
|
|||
None
|
||||
}
|
||||
|
||||
|
||||
def convertToOAF(input:String):Publication = {
|
||||
def convertToOAF(input: String): Publication = {
|
||||
val pub = new Publication
|
||||
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: json4s.JValue = parse(input)
|
||||
|
||||
val doi = DoiBoostMappingUtil.normalizeDoi((json \"doi").extract[String])
|
||||
val doi = DoiBoostMappingUtil.normalizeDoi((json \ "doi").extract[String])
|
||||
|
||||
if(doi == null)
|
||||
if (doi == null)
|
||||
return null
|
||||
|
||||
val is_oa = (json\ "is_oa").extract[Boolean]
|
||||
val is_oa = (json \ "is_oa").extract[Boolean]
|
||||
|
||||
val journal_is_oa= (json\ "journal_is_oa").extract[Boolean]
|
||||
val journal_is_oa = (json \ "journal_is_oa").extract[Boolean]
|
||||
|
||||
val oaLocation:OALocation = (json \ "best_oa_location").extractOrElse[OALocation](null)
|
||||
val oaLocation: OALocation = (json \ "best_oa_location").extractOrElse[OALocation](null)
|
||||
|
||||
val colour = get_unpaywall_color((json \ "oa_status").extractOrElse[String](null))
|
||||
|
||||
|
@ -88,9 +95,9 @@ object UnpayWallToOAF {
|
|||
if (!is_oa)
|
||||
return null
|
||||
|
||||
if(oaLocation== null || oaLocation.url.isEmpty)
|
||||
return null
|
||||
val i :Instance= new Instance()
|
||||
if (oaLocation == null || oaLocation.url.isEmpty)
|
||||
return null
|
||||
val i: Instance = new Instance()
|
||||
|
||||
i.setCollectedfrom(createUnpayWallCollectedFrom())
|
||||
// i.setAccessright(getOpenAccessQualifier())
|
||||
|
@ -122,7 +129,4 @@ object UnpayWallToOAF {
|
|||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -6,15 +6,11 @@ import org.junit.jupiter.api.Test
|
|||
/** Smoke test for the datasource identifier generation of `DoiBoostMappingUtil`. */
class DoiBoostHostedByMapTest {

  /** Prints the identifier generated for a sample datasource key; makes no assertion. */
  @Test
  def idDSGeneration(): Unit = {
    val datasourceKey = "doajarticles::0066-782X"
    val generatedId = DoiBoostMappingUtil.generateDSId(datasourceKey)

    println(generatedId)
  }

}
|
||||
|
|
|
@ -6,38 +6,36 @@ import org.junit.jupiter.api.Test
|
|||
class NormalizeDOITest {
|
||||
|
||||
/** A well-formed DOI must be normalized to its lower-case form. */
@Test
def doiDSLowerCase(): Unit = {
  val doi = "10.1042/BCJ20160876"

  // `==` is null-safe: a null result fails the assertion instead of
  // raising a NullPointerException from `.equals`.
  assert(DoiBoostMappingUtil.normalizeDoi(doi) == doi.toLowerCase())
}
|
||||
|
||||
|
||||
/** A string whose prefix does not start with "10." must be rejected (null result). */
@Test
def doiFiltered(): Unit = {
  val invalidDoi = "0.1042/BCJ20160876"
  val normalized = DoiBoostMappingUtil.normalizeDoi(invalidDoi)

  assert(normalized == null)
}
|
||||
|
||||
/** An invalid DOI wrapped in a doi.org URL must be rejected as well (null result). */
@Test
def doiFiltered2(): Unit = {
  val invalidDoiUrl = "https://doi.org/0.1042/BCJ20160876"
  val normalized = DoiBoostMappingUtil.normalizeDoi(invalidDoiUrl)

  assert(normalized == null)
}
|
||||
|
||||
|
||||
/** A DOI given as a doi.org URL must be reduced to the bare, lower-case DOI. */
@Test
def doiCleaned(): Unit = {
  val doi = "https://doi.org/10.1042/BCJ20160876"

  // `==` is null-safe: a null result fails the assertion instead of
  // raising a NullPointerException from `.equals`.
  assert(DoiBoostMappingUtil.normalizeDoi(doi) == "10.1042/BCJ20160876".toLowerCase())
}
|
||||
|
||||
@Test
|
||||
def doiCleaned1():Unit = {
|
||||
def doiCleaned1(): Unit = {
|
||||
val doi = "https://doi.org/10.1042/ BCJ20160876"
|
||||
|
||||
assert(DoiBoostMappingUtil.normalizeDoi(doi).equals("10.1042/BCJ20160876".toLowerCase()))
|
||||
|
|
|
@ -12,20 +12,24 @@ import scala.collection.JavaConverters._
|
|||
import scala.io.Source
|
||||
import scala.util.matching.Regex
|
||||
|
||||
|
||||
class CrossrefMappingTest {
|
||||
|
||||
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
def testFunderRelationshipsMapping(): Unit = {
|
||||
val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")).mkString
|
||||
val funder_doi = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi")).mkString
|
||||
val funder_name = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi")).mkString
|
||||
|
||||
val template = Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")
|
||||
)
|
||||
.mkString
|
||||
val funder_doi = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi"))
|
||||
.mkString
|
||||
val funder_name = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi"))
|
||||
.mkString
|
||||
|
||||
for (line <- funder_doi.lines) {
|
||||
val json = template.replace("%s", line)
|
||||
|
@ -43,7 +47,8 @@ class CrossrefMappingTest {
|
|||
|
||||
def checkRelation(generatedOAF: List[Oaf]): Unit = {
|
||||
|
||||
val rels: List[Relation] = generatedOAF.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]]
|
||||
val rels: List[Relation] =
|
||||
generatedOAF.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]]
|
||||
assertFalse(rels.isEmpty)
|
||||
rels.foreach(relation => {
|
||||
val relJson = mapper.writeValueAsString(relation)
|
||||
|
@ -59,22 +64,22 @@ class CrossrefMappingTest {
|
|||
|
||||
}
|
||||
|
||||
|
||||
/** Prints a timestamp and the same timestamp shifted by a fixed delta; makes no assertion. */
@Test
def testSum(): Unit = {
  val from: Long = 1613135645000L
  val delta: Long = 1000000L
  val updated: Long = from + delta

  println(s"updating from value: $from -> $updated")
}
|
||||
|
||||
@Test
|
||||
def testOrcidID() :Unit = {
|
||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/orcid_data.json")).mkString
|
||||
|
||||
def testOrcidID(): Unit = {
|
||||
val json = Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/orcid_data.json")
|
||||
)
|
||||
.mkString
|
||||
|
||||
assertNotNull(json)
|
||||
assertFalse(json.isEmpty);
|
||||
|
@ -85,17 +90,18 @@ class CrossrefMappingTest {
|
|||
|
||||
val items = resultList.filter(p => p.isInstanceOf[Result])
|
||||
|
||||
|
||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||
items.foreach(p => println(mapper.writeValueAsString(p)))
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
def testEmptyTitle() :Unit = {
|
||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/empty_title.json")).mkString
|
||||
|
||||
def testEmptyTitle(): Unit = {
|
||||
val json = Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/empty_title.json")
|
||||
)
|
||||
.mkString
|
||||
|
||||
assertNotNull(json)
|
||||
assertFalse(json.isEmpty);
|
||||
|
@ -106,17 +112,16 @@ class CrossrefMappingTest {
|
|||
|
||||
val items = resultList.filter(p => p.isInstanceOf[Result])
|
||||
|
||||
|
||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||
items.foreach(p => println(mapper.writeValueAsString(p)))
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def testPeerReviewed(): Unit = {
|
||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/prwTest.json")).mkString
|
||||
val json = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/prwTest.json"))
|
||||
.mkString
|
||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||
|
||||
assertNotNull(json)
|
||||
|
@ -128,12 +133,8 @@ class CrossrefMappingTest {
|
|||
|
||||
val items = resultList.filter(p => p.isInstanceOf[Result])
|
||||
|
||||
|
||||
items.foreach(p => logger.info(mapper.writeValueAsString(p)))
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
def extractECAward(award: String): String = {
|
||||
|
@ -143,21 +144,21 @@ class CrossrefMappingTest {
|
|||
null
|
||||
}
|
||||
|
||||
|
||||
/** Prints the award extracted from a sample funding string and its MD5; makes no assertion. */
@Test
def extractECTest(): Unit = {
  val awardExtracted = extractECAward("FP7/2007-2013")

  println(awardExtracted)
  println(DHPUtils.md5(awardExtracted))
}
|
||||
|
||||
@Test
|
||||
def testJournalRelation(): Unit = {
|
||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/awardTest.json")).mkString
|
||||
val json = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/awardTest.json"))
|
||||
.mkString
|
||||
assertNotNull(json)
|
||||
|
||||
assertFalse(json.isEmpty)
|
||||
|
@ -165,20 +166,19 @@ class CrossrefMappingTest {
|
|||
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
|
||||
|
||||
assertTrue(resultList.nonEmpty)
|
||||
val rels:List[Relation] = resultList.filter(p => p.isInstanceOf[Relation]).map(r=> r.asInstanceOf[Relation])
|
||||
|
||||
|
||||
val rels: List[Relation] =
|
||||
resultList.filter(p => p.isInstanceOf[Relation]).map(r => r.asInstanceOf[Relation])
|
||||
|
||||
rels.foreach(s => logger.info(s.getTarget))
|
||||
assertEquals(rels.size, 6 )
|
||||
|
||||
assertEquals(rels.size, 6)
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def testConvertBookFromCrossRef2Oaf(): Unit = {
|
||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/book.json")).mkString
|
||||
val json = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/book.json"))
|
||||
.mkString
|
||||
assertNotNull(json)
|
||||
|
||||
assertFalse(json.isEmpty);
|
||||
|
@ -199,42 +199,64 @@ class CrossrefMappingTest {
|
|||
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
|
||||
assertNotNull(
|
||||
result.getDataInfo.getProvenanceaction,
|
||||
"DataInfo/Provenance test not null Failed");
|
||||
"DataInfo/Provenance test not null Failed"
|
||||
);
|
||||
assertFalse(
|
||||
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
|
||||
"DataInfo/Provenance/classId test not null Failed");
|
||||
"DataInfo/Provenance/classId test not null Failed"
|
||||
);
|
||||
assertFalse(
|
||||
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
|
||||
"DataInfo/Provenance/className test not null Failed");
|
||||
"DataInfo/Provenance/className test not null Failed"
|
||||
);
|
||||
assertFalse(
|
||||
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
|
||||
"DataInfo/Provenance/SchemeId test not null Failed");
|
||||
"DataInfo/Provenance/SchemeId test not null Failed"
|
||||
);
|
||||
assertFalse(
|
||||
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
|
||||
"DataInfo/Provenance/SchemeName test not null Failed");
|
||||
"DataInfo/Provenance/SchemeName test not null Failed"
|
||||
);
|
||||
|
||||
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
|
||||
assertFalse(result.getCollectedfrom.isEmpty);
|
||||
|
||||
val collectedFromList = result.getCollectedfrom.asScala
|
||||
assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from assertion")
|
||||
|
||||
assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion")
|
||||
assert(
|
||||
collectedFromList.exists(c =>
|
||||
c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")
|
||||
),
|
||||
"Wrong collected from assertion"
|
||||
)
|
||||
|
||||
assert(
|
||||
collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")),
|
||||
"Wrong collected from assertion"
|
||||
)
|
||||
|
||||
val relevantDates = result.getRelevantdate.asScala
|
||||
|
||||
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created")
|
||||
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")), "Missing relevant date of type published-online")
|
||||
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")), "Missing relevant date of type published-print")
|
||||
assert(
|
||||
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")),
|
||||
"Missing relevant date of type created"
|
||||
)
|
||||
assert(
|
||||
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")),
|
||||
"Missing relevant date of type published-online"
|
||||
)
|
||||
assert(
|
||||
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")),
|
||||
"Missing relevant date of type published-print"
|
||||
)
|
||||
val rels = resultList.filter(p => p.isInstanceOf[Relation])
|
||||
assert(rels.isEmpty)
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def testConvertPreprintFromCrossRef2Oaf(): Unit = {
|
||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/preprint.json")).mkString
|
||||
val json = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/preprint.json"))
|
||||
.mkString
|
||||
assertNotNull(json)
|
||||
|
||||
assertFalse(json.isEmpty);
|
||||
|
@ -255,44 +277,72 @@ class CrossrefMappingTest {
|
|||
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
|
||||
assertNotNull(
|
||||
result.getDataInfo.getProvenanceaction,
|
||||
"DataInfo/Provenance test not null Failed");
|
||||
"DataInfo/Provenance test not null Failed"
|
||||
);
|
||||
assertFalse(
|
||||
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
|
||||
"DataInfo/Provenance/classId test not null Failed");
|
||||
"DataInfo/Provenance/classId test not null Failed"
|
||||
);
|
||||
assertFalse(
|
||||
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
|
||||
"DataInfo/Provenance/className test not null Failed");
|
||||
"DataInfo/Provenance/className test not null Failed"
|
||||
);
|
||||
assertFalse(
|
||||
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
|
||||
"DataInfo/Provenance/SchemeId test not null Failed");
|
||||
"DataInfo/Provenance/SchemeId test not null Failed"
|
||||
);
|
||||
assertFalse(
|
||||
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
|
||||
"DataInfo/Provenance/SchemeName test not null Failed");
|
||||
"DataInfo/Provenance/SchemeName test not null Failed"
|
||||
);
|
||||
|
||||
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
|
||||
assertFalse(result.getCollectedfrom.isEmpty);
|
||||
|
||||
val collectedFromList = result.getCollectedfrom.asScala
|
||||
assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from assertion")
|
||||
|
||||
assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion")
|
||||
assert(
|
||||
collectedFromList.exists(c =>
|
||||
c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")
|
||||
),
|
||||
"Wrong collected from assertion"
|
||||
)
|
||||
|
||||
assert(
|
||||
collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")),
|
||||
"Wrong collected from assertion"
|
||||
)
|
||||
|
||||
val relevantDates = result.getRelevantdate.asScala
|
||||
|
||||
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created")
|
||||
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("available")), "Missing relevant date of type available")
|
||||
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("accepted")), "Missing relevant date of type accepted")
|
||||
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")), "Missing relevant date of type published-online")
|
||||
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")), "Missing relevant date of type published-print")
|
||||
assert(
|
||||
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")),
|
||||
"Missing relevant date of type created"
|
||||
)
|
||||
assert(
|
||||
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("available")),
|
||||
"Missing relevant date of type available"
|
||||
)
|
||||
assert(
|
||||
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("accepted")),
|
||||
"Missing relevant date of type accepted"
|
||||
)
|
||||
assert(
|
||||
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")),
|
||||
"Missing relevant date of type published-online"
|
||||
)
|
||||
assert(
|
||||
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")),
|
||||
"Missing relevant date of type published-print"
|
||||
)
|
||||
val rels = resultList.filter(p => p.isInstanceOf[Relation])
|
||||
assert(rels.isEmpty)
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def testConvertDatasetFromCrossRef2Oaf(): Unit = {
|
||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dataset.json")).mkString
|
||||
val json = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dataset.json"))
|
||||
.mkString
|
||||
assertNotNull(json)
|
||||
|
||||
assertFalse(json.isEmpty);
|
||||
|
@ -313,19 +363,24 @@ class CrossrefMappingTest {
|
|||
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
|
||||
assertNotNull(
|
||||
result.getDataInfo.getProvenanceaction,
|
||||
"DataInfo/Provenance test not null Failed");
|
||||
"DataInfo/Provenance test not null Failed"
|
||||
);
|
||||
assertFalse(
|
||||
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
|
||||
"DataInfo/Provenance/classId test not null Failed");
|
||||
"DataInfo/Provenance/classId test not null Failed"
|
||||
);
|
||||
assertFalse(
|
||||
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
|
||||
"DataInfo/Provenance/className test not null Failed");
|
||||
"DataInfo/Provenance/className test not null Failed"
|
||||
);
|
||||
assertFalse(
|
||||
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
|
||||
"DataInfo/Provenance/SchemeId test not null Failed");
|
||||
"DataInfo/Provenance/SchemeId test not null Failed"
|
||||
);
|
||||
assertFalse(
|
||||
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
|
||||
"DataInfo/Provenance/SchemeName test not null Failed");
|
||||
"DataInfo/Provenance/SchemeName test not null Failed"
|
||||
);
|
||||
|
||||
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
|
||||
assertFalse(result.getCollectedfrom.isEmpty);
|
||||
|
@ -333,7 +388,9 @@ class CrossrefMappingTest {
|
|||
|
||||
@Test
|
||||
def testConvertArticleFromCrossRef2Oaf(): Unit = {
|
||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json")).mkString
|
||||
val json = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json"))
|
||||
.mkString
|
||||
assertNotNull(json)
|
||||
|
||||
assertFalse(json.isEmpty);
|
||||
|
@ -354,32 +411,47 @@ class CrossrefMappingTest {
|
|||
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
|
||||
assertNotNull(
|
||||
result.getDataInfo.getProvenanceaction,
|
||||
"DataInfo/Provenance test not null Failed");
|
||||
"DataInfo/Provenance test not null Failed"
|
||||
);
|
||||
assertFalse(
|
||||
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
|
||||
"DataInfo/Provenance/classId test not null Failed");
|
||||
"DataInfo/Provenance/classId test not null Failed"
|
||||
);
|
||||
assertFalse(
|
||||
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
|
||||
"DataInfo/Provenance/className test not null Failed");
|
||||
"DataInfo/Provenance/className test not null Failed"
|
||||
);
|
||||
assertFalse(
|
||||
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
|
||||
"DataInfo/Provenance/SchemeId test not null Failed");
|
||||
"DataInfo/Provenance/SchemeId test not null Failed"
|
||||
);
|
||||
assertFalse(
|
||||
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
|
||||
"DataInfo/Provenance/SchemeName test not null Failed");
|
||||
"DataInfo/Provenance/SchemeName test not null Failed"
|
||||
);
|
||||
|
||||
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
|
||||
assertFalse(result.getCollectedfrom.isEmpty);
|
||||
|
||||
val collectedFromList = result.getCollectedfrom.asScala
|
||||
assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from assertion")
|
||||
|
||||
assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion")
|
||||
assert(
|
||||
collectedFromList.exists(c =>
|
||||
c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")
|
||||
),
|
||||
"Wrong collected from assertion"
|
||||
)
|
||||
|
||||
assert(
|
||||
collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")),
|
||||
"Wrong collected from assertion"
|
||||
)
|
||||
|
||||
val relevantDates = result.getRelevantdate.asScala
|
||||
|
||||
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created")
|
||||
assert(
|
||||
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")),
|
||||
"Missing relevant date of type created"
|
||||
)
|
||||
|
||||
val rels = resultList.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]]
|
||||
assertFalse(rels.isEmpty)
|
||||
|
@ -393,15 +465,14 @@ class CrossrefMappingTest {
|
|||
|
||||
})
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
def testSetDateOfAcceptanceCrossRef2Oaf(): Unit = {
|
||||
|
||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dump_file.json")).mkString
|
||||
val json = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dump_file.json"))
|
||||
.mkString
|
||||
assertNotNull(json)
|
||||
|
||||
assertFalse(json.isEmpty);
|
||||
|
@ -421,8 +492,13 @@ class CrossrefMappingTest {
|
|||
|
||||
@Test
|
||||
def testNormalizeDOI(): Unit = {
|
||||
val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")).mkString
|
||||
val line :String = "\"funder\": [{\"name\": \"Wellcome Trust Masters Fellowship\",\"award\": [\"090633\"]}],"
|
||||
val template = Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")
|
||||
)
|
||||
.mkString
|
||||
val line: String =
|
||||
"\"funder\": [{\"name\": \"Wellcome Trust Masters Fellowship\",\"award\": [\"090633\"]}],"
|
||||
val json = template.replace("%s", line)
|
||||
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
|
||||
assertTrue(resultList.nonEmpty)
|
||||
|
@ -431,13 +507,17 @@ class CrossrefMappingTest {
|
|||
|
||||
result.getPid.asScala.foreach(pid => assertTrue(pid.getQualifier.getClassid.equals("doi")))
|
||||
assertTrue(result.getPid.size() == 1)
|
||||
result.getPid.asScala.foreach(pid => assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase())))
|
||||
result.getPid.asScala.foreach(pid =>
|
||||
assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase()))
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
def testNormalizeDOI2(): Unit = {
|
||||
val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json")).mkString
|
||||
val template = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json"))
|
||||
.mkString
|
||||
|
||||
val resultList: List[Oaf] = Crossref2Oaf.convert(template)
|
||||
assertTrue(resultList.nonEmpty)
|
||||
|
@ -446,14 +526,19 @@ class CrossrefMappingTest {
|
|||
|
||||
result.getPid.asScala.foreach(pid => assertTrue(pid.getQualifier.getClassid.equals("doi")))
|
||||
assertTrue(result.getPid.size() == 1)
|
||||
result.getPid.asScala.foreach(pid => assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase())))
|
||||
result.getPid.asScala.foreach(pid =>
|
||||
assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase()))
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
def testLicenseVorClosed() :Unit = {
|
||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_vor.json")).mkString
|
||||
|
||||
def testLicenseVorClosed(): Unit = {
|
||||
val json = Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_vor.json")
|
||||
)
|
||||
.mkString
|
||||
|
||||
assertNotNull(json)
|
||||
assertFalse(json.isEmpty);
|
||||
|
@ -462,25 +547,30 @@ class CrossrefMappingTest {
|
|||
|
||||
assertTrue(resultList.nonEmpty)
|
||||
|
||||
|
||||
val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||
|
||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||
println(mapper.writeValueAsString(item))
|
||||
|
||||
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://www.springer.com/vor")))
|
||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("CLOSED")))
|
||||
assertTrue(
|
||||
item.getInstance().asScala exists (i =>
|
||||
i.getLicense.getValue.equals("https://www.springer.com/vor")
|
||||
)
|
||||
)
|
||||
assertTrue(
|
||||
item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("CLOSED"))
|
||||
)
|
||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
def testLicenseOpen() :Unit = {
|
||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_open.json")).mkString
|
||||
|
||||
def testLicenseOpen(): Unit = {
|
||||
val json = Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_open.json")
|
||||
)
|
||||
.mkString
|
||||
|
||||
assertNotNull(json)
|
||||
assertFalse(json.isEmpty);
|
||||
|
@ -489,21 +579,35 @@ class CrossrefMappingTest {
|
|||
|
||||
assertTrue(resultList.nonEmpty)
|
||||
|
||||
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||
|
||||
val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||
|
||||
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("http://pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html")))
|
||||
assertTrue(
|
||||
item.getInstance().asScala exists (i =>
|
||||
i.getLicense.getValue.equals(
|
||||
"http://pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html"
|
||||
)
|
||||
)
|
||||
)
|
||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN")))
|
||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid))
|
||||
assertTrue(
|
||||
item.getInstance().asScala exists (i =>
|
||||
i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid
|
||||
)
|
||||
)
|
||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||
println(mapper.writeValueAsString(item))
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
def testLicenseEmbargoOpen() :Unit = {
|
||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo_open.json")).mkString
|
||||
|
||||
def testLicenseEmbargoOpen(): Unit = {
|
||||
val json = Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream(
|
||||
"/eu/dnetlib/doiboost/crossref/publication_license_embargo_open.json"
|
||||
)
|
||||
)
|
||||
.mkString
|
||||
|
||||
assertNotNull(json)
|
||||
assertFalse(json.isEmpty);
|
||||
|
@ -512,21 +616,35 @@ class CrossrefMappingTest {
|
|||
|
||||
assertTrue(resultList.nonEmpty)
|
||||
|
||||
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||
|
||||
val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||
|
||||
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")))
|
||||
assertTrue(
|
||||
item.getInstance().asScala exists (i =>
|
||||
i.getLicense.getValue.equals(
|
||||
"https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"
|
||||
)
|
||||
)
|
||||
)
|
||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN")))
|
||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid))
|
||||
assertTrue(
|
||||
item.getInstance().asScala exists (i =>
|
||||
i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid
|
||||
)
|
||||
)
|
||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||
println(mapper.writeValueAsString(item))
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
def testLicenseEmbargo() :Unit = {
|
||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo.json")).mkString
|
||||
|
||||
def testLicenseEmbargo(): Unit = {
|
||||
val json = Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream(
|
||||
"/eu/dnetlib/doiboost/crossref/publication_license_embargo.json"
|
||||
)
|
||||
)
|
||||
.mkString
|
||||
|
||||
assertNotNull(json)
|
||||
assertFalse(json.isEmpty);
|
||||
|
@ -535,35 +653,18 @@ class CrossrefMappingTest {
|
|||
|
||||
assertTrue(resultList.nonEmpty)
|
||||
|
||||
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||
|
||||
val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||
|
||||
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")))
|
||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO")))
|
||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
|
||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||
println(mapper.writeValueAsString(item))
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def testLicenseEmbargoDateTime() :Unit = {
|
||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo_datetime.json")).mkString
|
||||
|
||||
|
||||
assertNotNull(json)
|
||||
assertFalse(json.isEmpty);
|
||||
|
||||
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
|
||||
|
||||
assertTrue(resultList.nonEmpty)
|
||||
|
||||
|
||||
val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||
|
||||
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")))
|
||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO")))
|
||||
assertTrue(
|
||||
item.getInstance().asScala exists (i =>
|
||||
i.getLicense.getValue.equals(
|
||||
"https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"
|
||||
)
|
||||
)
|
||||
)
|
||||
assertTrue(
|
||||
item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO"))
|
||||
)
|
||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
|
||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||
println(mapper.writeValueAsString(item))
|
||||
|
@ -571,9 +672,14 @@ class CrossrefMappingTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
def testMultipleURLs() :Unit = {
|
||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/multiple_urls.json")).mkString
|
||||
|
||||
def testLicenseEmbargoDateTime(): Unit = {
|
||||
val json = Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream(
|
||||
"/eu/dnetlib/doiboost/crossref/publication_license_embargo_datetime.json"
|
||||
)
|
||||
)
|
||||
.mkString
|
||||
|
||||
assertNotNull(json)
|
||||
assertFalse(json.isEmpty);
|
||||
|
@ -582,12 +688,47 @@ class CrossrefMappingTest {
|
|||
|
||||
assertTrue(resultList.nonEmpty)
|
||||
|
||||
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||
|
||||
val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||
assertTrue(
|
||||
item.getInstance().asScala exists (i =>
|
||||
i.getLicense.getValue.equals(
|
||||
"https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"
|
||||
)
|
||||
)
|
||||
)
|
||||
assertTrue(
|
||||
item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO"))
|
||||
)
|
||||
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
|
||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||
println(mapper.writeValueAsString(item))
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
def testMultipleURLs(): Unit = {
|
||||
val json = Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/multiple_urls.json")
|
||||
)
|
||||
.mkString
|
||||
|
||||
assertNotNull(json)
|
||||
assertFalse(json.isEmpty);
|
||||
|
||||
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
|
||||
|
||||
assertTrue(resultList.nonEmpty)
|
||||
|
||||
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
|
||||
|
||||
assertEquals(1, item.getInstance().size())
|
||||
assertEquals(1, item.getInstance().get(0).getUrl().size())
|
||||
assertEquals("https://doi.org/10.1016/j.jas.2019.105013", item.getInstance().get(0).getUrl().get(0))
|
||||
assertEquals(
|
||||
"https://doi.org/10.1016/j.jas.2019.105013",
|
||||
item.getInstance().get(0).getUrl().get(0)
|
||||
)
|
||||
//println(mapper.writeValueAsString(item))
|
||||
|
||||
}
|
||||
|
|
|
@ -12,43 +12,35 @@ import org.slf4j.{Logger, LoggerFactory}
|
|||
import java.sql.Timestamp
|
||||
import scala.io.Source
|
||||
|
||||
|
||||
|
||||
class MAGMappingTest {
|
||||
|
||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
def testSplitter():Unit = {
|
||||
def testSplitter(): Unit = {
|
||||
val s = "sports.team"
|
||||
|
||||
|
||||
if (s.contains(".")) {
|
||||
println(s.split("\\.")head)
|
||||
println(s.split("\\.") head)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
def testDate() :Unit = {
|
||||
def testDate(): Unit = {
|
||||
|
||||
val p:Timestamp = Timestamp.valueOf("2011-10-02 00:00:00")
|
||||
val p: Timestamp = Timestamp.valueOf("2011-10-02 00:00:00")
|
||||
|
||||
println(p.toString.substring(0,10))
|
||||
println(p.toString.substring(0, 10))
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
def buildInvertedIndexTest(): Unit = {
|
||||
val json_input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/mag/invertedIndex.json")).mkString
|
||||
val json_input = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/mag/invertedIndex.json"))
|
||||
.mkString
|
||||
val description = ConversionUtil.convertInvertedIndexString(json_input)
|
||||
assertNotNull(description)
|
||||
assertTrue(description.nonEmpty)
|
||||
|
@ -56,10 +48,9 @@ class MAGMappingTest {
|
|||
logger.debug(description)
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
def normalizeDoiTest():Unit = {
|
||||
|
||||
|
||||
def normalizeDoiTest(): Unit = {
|
||||
|
||||
implicit val formats = DefaultFormats
|
||||
|
||||
|
@ -78,8 +69,9 @@ class MAGMappingTest {
|
|||
val schema = Encoders.product[MagPapers].schema
|
||||
|
||||
import spark.implicits._
|
||||
val magPapers :Dataset[MagPapers] = spark.read.option("multiline",true).schema(schema).json(path).as[MagPapers]
|
||||
val ret :Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
|
||||
val magPapers: Dataset[MagPapers] =
|
||||
spark.read.option("multiline", true).schema(schema).json(path).as[MagPapers]
|
||||
val ret: Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
|
||||
assertTrue(ret.count == 10)
|
||||
ret.take(10).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase())))
|
||||
|
||||
|
@ -87,7 +79,7 @@ class MAGMappingTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
def normalizeDoiTest2():Unit = {
|
||||
def normalizeDoiTest2(): Unit = {
|
||||
|
||||
import org.json4s.DefaultFormats
|
||||
|
||||
|
@ -108,15 +100,13 @@ class MAGMappingTest {
|
|||
val schema = Encoders.product[MagPapers].schema
|
||||
|
||||
import spark.implicits._
|
||||
val magPapers :Dataset[MagPapers] = spark.read.option("multiline",true).schema(schema).json(path).as[MagPapers]
|
||||
val ret :Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
|
||||
val magPapers: Dataset[MagPapers] =
|
||||
spark.read.option("multiline", true).schema(schema).json(path).as[MagPapers]
|
||||
val ret: Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
|
||||
assertTrue(ret.count == 8)
|
||||
ret.take(8).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase())))
|
||||
spark.close()
|
||||
//ret.take(8).foreach(mp => println(write(mp)))
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -19,8 +19,10 @@ class MappingORCIDToOAFTest {
|
|||
val mapper = new ObjectMapper()
|
||||
|
||||
@Test
|
||||
def testExtractData():Unit ={
|
||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/orcid/dataOutput")).mkString
|
||||
def testExtractData(): Unit = {
|
||||
val json = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/orcid/dataOutput"))
|
||||
.mkString
|
||||
assertNotNull(json)
|
||||
assertFalse(json.isEmpty)
|
||||
json.lines.foreach(s => {
|
||||
|
@ -29,10 +31,10 @@ class MappingORCIDToOAFTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
def testOAFConvert(@TempDir testDir: Path):Unit ={
|
||||
val sourcePath:String = getClass.getResource("/eu/dnetlib/doiboost/orcid/datasets").getPath
|
||||
val targetPath: String =s"${testDir.toString}/output/orcidPublication"
|
||||
val workingPath =s"${testDir.toString}/wp/"
|
||||
def testOAFConvert(@TempDir testDir: Path): Unit = {
|
||||
val sourcePath: String = getClass.getResource("/eu/dnetlib/doiboost/orcid/datasets").getPath
|
||||
val targetPath: String = s"${testDir.toString}/output/orcidPublication"
|
||||
val workingPath = s"${testDir.toString}/wp/"
|
||||
|
||||
val conf = new SparkConf()
|
||||
conf.setMaster("local[*]")
|
||||
|
@ -46,18 +48,14 @@ class MappingORCIDToOAFTest {
|
|||
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
import spark.implicits._
|
||||
|
||||
SparkPreprocessORCID.run( spark,sourcePath, workingPath)
|
||||
SparkPreprocessORCID.run(spark, sourcePath, workingPath)
|
||||
|
||||
SparkConvertORCIDToOAF.run(spark, workingPath,targetPath)
|
||||
SparkConvertORCIDToOAF.run(spark, workingPath, targetPath)
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
|
||||
|
||||
val oA = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem].count()
|
||||
|
||||
|
||||
|
||||
val p: Dataset[Publication] = spark.read.load(targetPath).as[Publication]
|
||||
|
||||
assertTrue(oA == p.count())
|
||||
|
@ -65,19 +63,18 @@ class MappingORCIDToOAFTest {
|
|||
|
||||
spark.close()
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def testExtractDat1():Unit ={
|
||||
def testExtractDat1(): Unit = {
|
||||
|
||||
val aList: List[OrcidAuthor] = List(
|
||||
OrcidAuthor("0000-0002-4335-5309", Some("Lucrecia"), Some("Curto"), null, null, null),
|
||||
OrcidAuthor("0000-0001-7501-3330", Some("Emilio"), Some("Malchiodi"), null, null, null),
|
||||
OrcidAuthor("0000-0002-5490-9186", Some("Sofia"), Some("Noli Truant"), null, null, null)
|
||||
)
|
||||
|
||||
|
||||
val aList: List[OrcidAuthor] = List(OrcidAuthor("0000-0002-4335-5309", Some("Lucrecia"), Some("Curto"), null, null, null ),
|
||||
OrcidAuthor("0000-0001-7501-3330", Some("Emilio"), Some("Malchiodi"), null, null, null ), OrcidAuthor("0000-0002-5490-9186", Some("Sofia"), Some("Noli Truant"), null, null, null ))
|
||||
|
||||
val orcid:ORCIDItem = ORCIDItem("10.1042/BCJ20160876", aList)
|
||||
val orcid: ORCIDItem = ORCIDItem("10.1042/BCJ20160876", aList)
|
||||
|
||||
val oaf = ORCIDToOAF.convertTOOAF(orcid)
|
||||
assert(oaf.getPid.size() == 1)
|
||||
|
@ -85,10 +82,6 @@ class MappingORCIDToOAFTest {
|
|||
oaf.getPid.toList.foreach(pid => assert(pid.getValue.equals("10.1042/BCJ20160876")))
|
||||
//println(mapper.writeValueAsString(ORCIDToOAF.convertTOOAF(orcid)))
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -14,41 +14,43 @@ class UnpayWallMappingTest {
|
|||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
|
||||
@Test
|
||||
def testMappingToOAF():Unit ={
|
||||
def testMappingToOAF(): Unit = {
|
||||
|
||||
val Ilist = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/uw/input.json")).mkString
|
||||
val Ilist = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/uw/input.json"))
|
||||
.mkString
|
||||
|
||||
var i:Int = 0
|
||||
for (line <-Ilist.lines) {
|
||||
var i: Int = 0
|
||||
for (line <- Ilist.lines) {
|
||||
val p = UnpayWallToOAF.convertToOAF(line)
|
||||
|
||||
if(p!= null) {
|
||||
assertTrue(p.getInstance().size()==1)
|
||||
if (i== 0){
|
||||
if (p != null) {
|
||||
assertTrue(p.getInstance().size() == 1)
|
||||
if (i == 0) {
|
||||
assertTrue(p.getPid.get(0).getValue.equals("10.1038/2211089b0"))
|
||||
}
|
||||
if (i== 1){
|
||||
if (i == 1) {
|
||||
assertTrue(p.getPid.get(0).getValue.equals("10.1021/acs.bioconjchem.8b00058.s001"))
|
||||
}
|
||||
if (i== 2){
|
||||
if (i == 2) {
|
||||
assertTrue(p.getPid.get(0).getValue.equals("10.1021/acs.bioconjchem.8b00086.s001"))
|
||||
}
|
||||
logger.info(s"ID : ${p.getId}")
|
||||
}
|
||||
assertNotNull(line)
|
||||
assertTrue(line.nonEmpty)
|
||||
i = i+1
|
||||
i = i + 1
|
||||
}
|
||||
|
||||
|
||||
|
||||
val l = Ilist.lines.next()
|
||||
val l = Ilist.lines.next()
|
||||
|
||||
val item = UnpayWallToOAF.convertToOAF(l)
|
||||
|
||||
assertEquals(item.getInstance().get(0).getAccessright.getOpenAccessRoute, OpenAccessRoute.bronze)
|
||||
assertEquals(
|
||||
item.getInstance().get(0).getAccessright.getOpenAccessRoute,
|
||||
OpenAccessRoute.bronze
|
||||
)
|
||||
|
||||
logger.info(mapper.writeValueAsString(item))
|
||||
|
||||
|
|
|
@ -4,137 +4,190 @@ import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
|
|||
import org.apache.spark.sql.expressions.Aggregator
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn}
|
||||
|
||||
case class HostedByItemType(
|
||||
id: String,
|
||||
officialname: String,
|
||||
issn: String,
|
||||
eissn: String,
|
||||
lissn: String,
|
||||
openAccess: Boolean
|
||||
) {}
|
||||
|
||||
case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
|
||||
case class HostedByInfo(id: String, officialname: String, journal_id: String, provenance : String, id_type: String) {}
|
||||
case class HostedByInfo(
|
||||
id: String,
|
||||
officialname: String,
|
||||
journal_id: String,
|
||||
provenance: String,
|
||||
id_type: String
|
||||
) {}
|
||||
|
||||
object Aggregators {
|
||||
|
||||
|
||||
|
||||
def getId(s1:String, s2:String) : String = {
|
||||
if (s1.startsWith("10|")){
|
||||
return s1}
|
||||
s2
|
||||
}
|
||||
|
||||
def getValue(s1:String, s2:String) : String = {
|
||||
if(!s1.equals("")){
|
||||
def getId(s1: String, s2: String): String = {
|
||||
if (s1.startsWith("10|")) {
|
||||
return s1
|
||||
}
|
||||
s2
|
||||
}
|
||||
|
||||
def getValue(s1: String, s2: String): String = {
|
||||
if (!s1.equals("")) {
|
||||
return s1
|
||||
}
|
||||
s2
|
||||
}
|
||||
|
||||
def explodeHostedByItemType(df: Dataset[(String, HostedByItemType)]): Dataset[(String, HostedByItemType)] = {
|
||||
val transformedData : Dataset[(String, HostedByItemType)] = df
|
||||
def explodeHostedByItemType(
|
||||
df: Dataset[(String, HostedByItemType)]
|
||||
): Dataset[(String, HostedByItemType)] = {
|
||||
val transformedData: Dataset[(String, HostedByItemType)] = df
|
||||
.groupByKey(_._1)(Encoders.STRING)
|
||||
.agg(Aggregators.hostedByAggregator)
|
||||
.map{
|
||||
case (id:String , res:(String, HostedByItemType)) => res
|
||||
.map { case (id: String, res: (String, HostedByItemType)) =>
|
||||
res
|
||||
}(Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]))
|
||||
|
||||
transformedData
|
||||
}
|
||||
|
||||
val hostedByAggregator: TypedColumn[(String, HostedByItemType), (String, HostedByItemType)] = new Aggregator[(String, HostedByItemType), (String, HostedByItemType), (String, HostedByItemType)] {
|
||||
override def zero: (String, HostedByItemType) = ("", HostedByItemType("","","","","",false))
|
||||
override def reduce(b: (String, HostedByItemType), a:(String,HostedByItemType)): (String, HostedByItemType) = {
|
||||
return merge(b, a)
|
||||
}
|
||||
override def merge(b1: (String, HostedByItemType), b2: (String, HostedByItemType)): (String, HostedByItemType) = {
|
||||
if (b1 == null){
|
||||
return b2
|
||||
val hostedByAggregator: TypedColumn[(String, HostedByItemType), (String, HostedByItemType)] =
|
||||
new Aggregator[
|
||||
(String, HostedByItemType),
|
||||
(String, HostedByItemType),
|
||||
(String, HostedByItemType)
|
||||
] {
|
||||
|
||||
override def zero: (String, HostedByItemType) =
|
||||
("", HostedByItemType("", "", "", "", "", false))
|
||||
|
||||
override def reduce(
|
||||
b: (String, HostedByItemType),
|
||||
a: (String, HostedByItemType)
|
||||
): (String, HostedByItemType) = {
|
||||
return merge(b, a)
|
||||
}
|
||||
if(b2 == null){
|
||||
return b1
|
||||
}
|
||||
if(b1._2.id.startsWith("10|")){
|
||||
return (b1._1, HostedByItemType(b1._2.id, b1._2.officialname, b1._2.issn, b1._2.eissn, b1._2.lissn, b1._2.openAccess || b2._2.openAccess))
|
||||
|
||||
override def merge(
|
||||
b1: (String, HostedByItemType),
|
||||
b2: (String, HostedByItemType)
|
||||
): (String, HostedByItemType) = {
|
||||
if (b1 == null) {
|
||||
return b2
|
||||
}
|
||||
if (b2 == null) {
|
||||
return b1
|
||||
}
|
||||
if (b1._2.id.startsWith("10|")) {
|
||||
return (
|
||||
b1._1,
|
||||
HostedByItemType(
|
||||
b1._2.id,
|
||||
b1._2.officialname,
|
||||
b1._2.issn,
|
||||
b1._2.eissn,
|
||||
b1._2.lissn,
|
||||
b1._2.openAccess || b2._2.openAccess
|
||||
)
|
||||
)
|
||||
|
||||
}
|
||||
return (
|
||||
b2._1,
|
||||
HostedByItemType(
|
||||
b2._2.id,
|
||||
b2._2.officialname,
|
||||
b2._2.issn,
|
||||
b2._2.eissn,
|
||||
b2._2.lissn,
|
||||
b1._2.openAccess || b2._2.openAccess
|
||||
)
|
||||
)
|
||||
|
||||
}
|
||||
return (b2._1, HostedByItemType(b2._2.id, b2._2.officialname, b2._2.issn, b2._2.eissn, b2._2.lissn, b1._2.openAccess || b2._2.openAccess))
|
||||
|
||||
}
|
||||
override def finish(reduction: (String,HostedByItemType)): (String, HostedByItemType) = reduction
|
||||
override def bufferEncoder: Encoder[(String,HostedByItemType)] = Encoders.tuple(Encoders.STRING,Encoders.product[HostedByItemType])
|
||||
override def finish(reduction: (String, HostedByItemType)): (String, HostedByItemType) =
|
||||
reduction
|
||||
|
||||
override def outputEncoder: Encoder[(String,HostedByItemType)] = Encoders.tuple(Encoders.STRING,Encoders.product[HostedByItemType])
|
||||
}.toColumn
|
||||
override def bufferEncoder: Encoder[(String, HostedByItemType)] =
|
||||
Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
|
||||
|
||||
override def outputEncoder: Encoder[(String, HostedByItemType)] =
|
||||
Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
|
||||
}.toColumn
|
||||
|
||||
def resultToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] =
|
||||
new Aggregator[EntityInfo, EntityInfo, EntityInfo] {
|
||||
override def zero: EntityInfo = EntityInfo.newInstance("", "", "")
|
||||
|
||||
|
||||
def resultToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] = new Aggregator[EntityInfo, EntityInfo, EntityInfo]{
|
||||
override def zero: EntityInfo = EntityInfo.newInstance("","","")
|
||||
|
||||
override def reduce(b: EntityInfo, a:EntityInfo): EntityInfo = {
|
||||
return merge(b, a)
|
||||
}
|
||||
override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = {
|
||||
if (b1 == null){
|
||||
return b2
|
||||
override def reduce(b: EntityInfo, a: EntityInfo): EntityInfo = {
|
||||
return merge(b, a)
|
||||
}
|
||||
if(b2 == null){
|
||||
return b1
|
||||
|
||||
override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = {
|
||||
if (b1 == null) {
|
||||
return b2
|
||||
}
|
||||
if (b2 == null) {
|
||||
return b1
|
||||
}
|
||||
if (!b1.getHostedById.equals("")) {
|
||||
b1.setOpenAccess(b1.getOpenAccess || b2.getOpenAccess)
|
||||
return b1
|
||||
}
|
||||
b2.setOpenAccess(b1.getOpenAccess || b2.getOpenAccess)
|
||||
b2
|
||||
|
||||
}
|
||||
if(!b1.getHostedById.equals("")){
|
||||
b1.setOpenAccess(b1.getOpenAccess || b2.getOpenAccess)
|
||||
return b1
|
||||
}
|
||||
b2.setOpenAccess(b1.getOpenAccess || b2.getOpenAccess)
|
||||
b2
|
||||
override def finish(reduction: EntityInfo): EntityInfo = reduction
|
||||
override def bufferEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||
|
||||
}
|
||||
override def finish(reduction: EntityInfo): EntityInfo = reduction
|
||||
override def bufferEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||
override def outputEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||
}.toColumn
|
||||
|
||||
override def outputEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||
}.toColumn
|
||||
|
||||
def resultToSingleId(df:Dataset[EntityInfo]): Dataset[EntityInfo] = {
|
||||
val transformedData : Dataset[EntityInfo] = df
|
||||
def resultToSingleId(df: Dataset[EntityInfo]): Dataset[EntityInfo] = {
|
||||
val transformedData: Dataset[EntityInfo] = df
|
||||
.groupByKey(_.getId)(Encoders.STRING)
|
||||
.agg(Aggregators.resultToSingleIdAggregator)
|
||||
.map{
|
||||
case (id:String , res: EntityInfo) => res
|
||||
.map { case (id: String, res: EntityInfo) =>
|
||||
res
|
||||
}(Encoders.bean(classOf[EntityInfo]))
|
||||
|
||||
transformedData
|
||||
}
|
||||
|
||||
def datasourceToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] = new Aggregator[EntityInfo, EntityInfo, EntityInfo]{
|
||||
override def zero: EntityInfo = EntityInfo.newInstance("","","")
|
||||
def datasourceToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] =
|
||||
new Aggregator[EntityInfo, EntityInfo, EntityInfo] {
|
||||
override def zero: EntityInfo = EntityInfo.newInstance("", "", "")
|
||||
|
||||
override def reduce(b: EntityInfo, a:EntityInfo): EntityInfo = {
|
||||
return merge(b, a)
|
||||
}
|
||||
override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = {
|
||||
if (b1 == null){
|
||||
return b2
|
||||
override def reduce(b: EntityInfo, a: EntityInfo): EntityInfo = {
|
||||
return merge(b, a)
|
||||
}
|
||||
if(b2 == null){
|
||||
return b1
|
||||
|
||||
override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = {
|
||||
if (b1 == null) {
|
||||
return b2
|
||||
}
|
||||
if (b2 == null) {
|
||||
return b1
|
||||
}
|
||||
if (!b1.getHostedById.equals("")) {
|
||||
return b1
|
||||
}
|
||||
b2
|
||||
|
||||
}
|
||||
if(!b1.getHostedById.equals("")){
|
||||
return b1
|
||||
}
|
||||
b2
|
||||
override def finish(reduction: EntityInfo): EntityInfo = reduction
|
||||
override def bufferEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||
|
||||
}
|
||||
override def finish(reduction: EntityInfo): EntityInfo = reduction
|
||||
override def bufferEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||
override def outputEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||
}.toColumn
|
||||
|
||||
override def outputEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||
}.toColumn
|
||||
|
||||
|
||||
def datasourceToSingleId(df:Dataset[EntityInfo]): Dataset[EntityInfo] = {
|
||||
val transformedData : Dataset[EntityInfo] = df
|
||||
def datasourceToSingleId(df: Dataset[EntityInfo]): Dataset[EntityInfo] = {
|
||||
val transformedData: Dataset[EntityInfo] = df
|
||||
.groupByKey(_.getHostedById)(Encoders.STRING)
|
||||
.agg(Aggregators.datasourceToSingleIdAggregator)
|
||||
.map{
|
||||
case (id:String , res: EntityInfo) => res
|
||||
.map { case (id: String, res: EntityInfo) =>
|
||||
res
|
||||
}(Encoders.bean(classOf[EntityInfo]))
|
||||
|
||||
transformedData
|
||||
|
|
|
@ -14,7 +14,8 @@ import org.slf4j.{Logger, LoggerFactory}
|
|||
object SparkApplyHostedByMapToDatasource {
|
||||
|
||||
def applyHBtoDats(join: Dataset[EntityInfo], dats: Dataset[Datasource]): Dataset[Datasource] = {
|
||||
dats.joinWith(join, dats.col("id").equalTo(join.col("hostedById")), "left")
|
||||
dats
|
||||
.joinWith(join, dats.col("id").equalTo(join.col("hostedById")), "left")
|
||||
.map(t2 => {
|
||||
val d: Datasource = t2._1
|
||||
if (t2._2 != null) {
|
||||
|
@ -31,14 +32,21 @@ object SparkApplyHostedByMapToDatasource {
|
|||
|
||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json"
|
||||
)
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
val graphPath = parser.get("graphPath")
|
||||
val outputPath = parser.get("outputPath")
|
||||
|
@ -51,20 +59,27 @@ object SparkApplyHostedByMapToDatasource {
|
|||
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
val dats: Dataset[Datasource] = spark.read.textFile(graphPath + "/datasource")
|
||||
val dats: Dataset[Datasource] = spark.read
|
||||
.textFile(graphPath + "/datasource")
|
||||
.map(r => mapper.readValue(r, classOf[Datasource]))
|
||||
|
||||
val pinfo: Dataset[EntityInfo] = Aggregators.datasourceToSingleId(spark.read.textFile(preparedInfoPath)
|
||||
.map(ei => mapper.readValue(ei, classOf[EntityInfo])))
|
||||
val pinfo: Dataset[EntityInfo] = Aggregators.datasourceToSingleId(
|
||||
spark.read
|
||||
.textFile(preparedInfoPath)
|
||||
.map(ei => mapper.readValue(ei, classOf[EntityInfo]))
|
||||
)
|
||||
|
||||
applyHBtoDats(pinfo, dats).write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath)
|
||||
applyHBtoDats(pinfo, dats).write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath)
|
||||
|
||||
spark.read.textFile(outputPath)
|
||||
spark.read
|
||||
.textFile(outputPath)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.text(graphPath + "/datasource")
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -16,7 +16,8 @@ import scala.collection.JavaConverters._
|
|||
object SparkApplyHostedByMapToResult {
|
||||
|
||||
def applyHBtoPubs(join: Dataset[EntityInfo], pubs: Dataset[Publication]) = {
|
||||
pubs.joinWith(join, pubs.col("id").equalTo(join.col("id")), "left")
|
||||
pubs
|
||||
.joinWith(join, pubs.col("id").equalTo(join.col("id")), "left")
|
||||
.map(t2 => {
|
||||
val p: Publication = t2._1
|
||||
if (t2._2 != null) {
|
||||
|
@ -27,7 +28,14 @@ object SparkApplyHostedByMapToResult {
|
|||
inst.getHostedby.setKey(ei.getHostedById)
|
||||
inst.getHostedby.setValue(ei.getName)
|
||||
if (ei.getOpenAccess) {
|
||||
inst.setAccessright(OafMapperUtils.accessRight(ModelConstants.ACCESS_RIGHT_OPEN, "Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES))
|
||||
inst.setAccessright(
|
||||
OafMapperUtils.accessRight(
|
||||
ModelConstants.ACCESS_RIGHT_OPEN,
|
||||
"Open Access",
|
||||
ModelConstants.DNET_ACCESS_MODES,
|
||||
ModelConstants.DNET_ACCESS_MODES
|
||||
)
|
||||
)
|
||||
inst.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
|
||||
p.setBestaccessright(OafMapperUtils.createBestAccessRights(p.getInstance()));
|
||||
}
|
||||
|
@ -40,46 +48,54 @@ object SparkApplyHostedByMapToResult {
|
|||
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
||||
|
||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json"
|
||||
)
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
val graphPath = parser.get("graphPath")
|
||||
|
||||
val outputPath = parser.get("outputPath")
|
||||
val preparedInfoPath = parser.get("preparedInfoPath")
|
||||
|
||||
|
||||
implicit val formats = DefaultFormats
|
||||
|
||||
|
||||
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.bean(classOf[Publication])
|
||||
implicit val mapEncoderEinfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
val pubs: Dataset[Publication] = spark.read.textFile(graphPath + "/publication")
|
||||
val pubs: Dataset[Publication] = spark.read
|
||||
.textFile(graphPath + "/publication")
|
||||
.map(r => mapper.readValue(r, classOf[Publication]))
|
||||
|
||||
val pinfo: Dataset[EntityInfo] = spark.read.textFile(preparedInfoPath)
|
||||
val pinfo: Dataset[EntityInfo] = spark.read
|
||||
.textFile(preparedInfoPath)
|
||||
.map(ei => mapper.readValue(ei, classOf[EntityInfo]))
|
||||
|
||||
applyHBtoPubs(pinfo, pubs).write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath)
|
||||
applyHBtoPubs(pinfo, pubs).write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath)
|
||||
|
||||
spark.read.textFile(outputPath)
|
||||
spark.read
|
||||
.textFile(outputPath)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.text(graphPath + "/publication")
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -19,7 +19,6 @@ object SparkPrepareHostedByInfoToApply {
|
|||
def getList(id: String, j: Journal, name: String): List[EntityInfo] = {
|
||||
var lst: List[EntityInfo] = List()
|
||||
|
||||
|
||||
if (j.getIssnLinking != null && !j.getIssnLinking.equals("")) {
|
||||
lst = EntityInfo.newInstance(id, j.getIssnLinking, name) :: lst
|
||||
}
|
||||
|
@ -37,14 +36,14 @@ object SparkPrepareHostedByInfoToApply {
|
|||
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
val dd: Dataset[Publication] = spark.read.textFile(publicationPath)
|
||||
val dd: Dataset[Publication] = spark.read
|
||||
.textFile(publicationPath)
|
||||
.map(r => mapper.readValue(r, classOf[Publication]))
|
||||
|
||||
dd.filter(p => p.getJournal != null).flatMap(p => getList(p.getId, p.getJournal, ""))
|
||||
|
||||
}
|
||||
|
||||
|
||||
def toEntityInfo(input: String): EntityInfo = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
|
||||
|
@ -53,7 +52,6 @@ object SparkPrepareHostedByInfoToApply {
|
|||
toEntityItem(c.keys.head, c.values.head)
|
||||
}
|
||||
|
||||
|
||||
def toEntityItem(journal_id: String, hbi: HostedByItemType): EntityInfo = {
|
||||
|
||||
EntityInfo.newInstance(hbi.id, journal_id, hbi.officialname, hbi.openAccess)
|
||||
|
@ -61,62 +59,69 @@ object SparkPrepareHostedByInfoToApply {
|
|||
}
|
||||
|
||||
def joinResHBM(res: Dataset[EntityInfo], hbm: Dataset[EntityInfo]): Dataset[EntityInfo] = {
|
||||
Aggregators.resultToSingleId(res.joinWith(hbm, res.col("journalId").equalTo(hbm.col("journalId")), "left")
|
||||
.map(t2 => {
|
||||
val res: EntityInfo = t2._1
|
||||
if (t2._2 != null) {
|
||||
val ds = t2._2
|
||||
res.setHostedById(ds.getId)
|
||||
res.setOpenAccess(ds.getOpenAccess)
|
||||
res.setName(ds.getName)
|
||||
}
|
||||
res
|
||||
}))
|
||||
Aggregators.resultToSingleId(
|
||||
res
|
||||
.joinWith(hbm, res.col("journalId").equalTo(hbm.col("journalId")), "left")
|
||||
.map(t2 => {
|
||||
val res: EntityInfo = t2._1
|
||||
if (t2._2 != null) {
|
||||
val ds = t2._2
|
||||
res.setHostedById(ds.getId)
|
||||
res.setOpenAccess(ds.getOpenAccess)
|
||||
res.setName(ds.getName)
|
||||
}
|
||||
res
|
||||
})
|
||||
)
|
||||
}
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
||||
|
||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_prepare_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_prepare_params.json"
|
||||
)
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
val graphPath = parser.get("graphPath")
|
||||
|
||||
val outputPath = parser.get("preparedInfoPath")
|
||||
val hostedByMapPath = parser.get("hostedByMapPath")
|
||||
|
||||
|
||||
implicit val formats = DefaultFormats
|
||||
|
||||
|
||||
logger.info("Getting the Datasources")
|
||||
|
||||
import spark.implicits._
|
||||
|
||||
|
||||
//STEP1: read the hostedbymap and transform it into EntityInfo
|
||||
val hostedByInfo: Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo)
|
||||
val hostedByInfo: Dataset[EntityInfo] =
|
||||
spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo)
|
||||
|
||||
//STEP2: create association (publication, issn), (publication, eissn), (publication, lissn)
|
||||
val resultInfoDataset: Dataset[EntityInfo] = prepareResultInfo(spark, graphPath + "/publication")
|
||||
val resultInfoDataset: Dataset[EntityInfo] =
|
||||
prepareResultInfo(spark, graphPath + "/publication")
|
||||
|
||||
//STEP3: left join resultInfo with hostedByInfo on journal_id. Reduction of all the results with the same id in just
|
||||
//one entry (one result could be associated to issn and eissn and so possibly match more than once against the map)
|
||||
//to this entry we add the id of the datasource for the next step
|
||||
joinResHBM(resultInfoDataset, hostedByInfo)
|
||||
.write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath)
|
||||
|
||||
joinResHBM(resultInfoDataset, hostedByInfo).write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath)
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -17,9 +17,8 @@ import java.io.PrintWriter
|
|||
|
||||
object SparkProduceHostedByMap {
|
||||
|
||||
|
||||
implicit val tupleForJoinEncoder: Encoder[(String, HostedByItemType)] = Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
|
||||
|
||||
implicit val tupleForJoinEncoder: Encoder[(String, HostedByItemType)] =
|
||||
Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
|
||||
|
||||
def toHostedByItemType(input: ((HostedByInfo, HostedByInfo), HostedByInfo)): HostedByItemType = {
|
||||
val openaire: HostedByInfo = input._1._1
|
||||
|
@ -28,9 +27,33 @@ object SparkProduceHostedByMap {
|
|||
val isOpenAccess: Boolean = doaj == null && gold == null
|
||||
|
||||
openaire.journal_id match {
|
||||
case Constants.ISSN => HostedByItemType(openaire.id, openaire.officialname, openaire.journal_id, "", "", isOpenAccess)
|
||||
case Constants.EISSN => HostedByItemType(openaire.id, openaire.officialname, "", openaire.journal_id, "", isOpenAccess)
|
||||
case Constants.ISSNL => HostedByItemType(openaire.id, openaire.officialname, "", "", openaire.journal_id, isOpenAccess)
|
||||
case Constants.ISSN =>
|
||||
HostedByItemType(
|
||||
openaire.id,
|
||||
openaire.officialname,
|
||||
openaire.journal_id,
|
||||
"",
|
||||
"",
|
||||
isOpenAccess
|
||||
)
|
||||
case Constants.EISSN =>
|
||||
HostedByItemType(
|
||||
openaire.id,
|
||||
openaire.officialname,
|
||||
"",
|
||||
openaire.journal_id,
|
||||
"",
|
||||
isOpenAccess
|
||||
)
|
||||
case Constants.ISSNL =>
|
||||
HostedByItemType(
|
||||
openaire.id,
|
||||
openaire.officialname,
|
||||
"",
|
||||
"",
|
||||
openaire.journal_id,
|
||||
isOpenAccess
|
||||
)
|
||||
|
||||
// catch the default with a variable so you can print it
|
||||
case whoa => null
|
||||
|
@ -46,11 +69,16 @@ object SparkProduceHostedByMap {
|
|||
|
||||
Serialization.write(map)
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
def getHostedByItemType(id: String, officialname: String, issn: String, eissn: String, issnl: String, oa: Boolean): HostedByItemType = {
|
||||
def getHostedByItemType(
|
||||
id: String,
|
||||
officialname: String,
|
||||
issn: String,
|
||||
eissn: String,
|
||||
issnl: String,
|
||||
oa: Boolean
|
||||
): HostedByItemType = {
|
||||
if (issn != null) {
|
||||
if (eissn != null) {
|
||||
if (issnl != null) {
|
||||
|
@ -85,7 +113,14 @@ object SparkProduceHostedByMap {
|
|||
def oaToHostedbyItemType(dats: Datasource): HostedByItemType = {
|
||||
if (dats.getJournal != null) {
|
||||
|
||||
return getHostedByItemType(dats.getId, dats.getOfficialname.getValue, dats.getJournal.getIssnPrinted, dats.getJournal.getIssnOnline, dats.getJournal.getIssnLinking, false)
|
||||
return getHostedByItemType(
|
||||
dats.getId,
|
||||
dats.getOfficialname.getValue,
|
||||
dats.getJournal.getIssnPrinted,
|
||||
dats.getJournal.getIssnOnline,
|
||||
dats.getJournal.getIssnLinking,
|
||||
false
|
||||
)
|
||||
}
|
||||
HostedByItemType("", "", "", "", "", false)
|
||||
}
|
||||
|
@ -94,32 +129,41 @@ object SparkProduceHostedByMap {
|
|||
|
||||
import spark.implicits._
|
||||
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
implicit var encoderD = Encoders.kryo[Datasource]
|
||||
|
||||
val dd: Dataset[Datasource] = spark.read.textFile(datasourcePath)
|
||||
val dd: Dataset[Datasource] = spark.read
|
||||
.textFile(datasourcePath)
|
||||
.map(r => mapper.readValue(r, classOf[Datasource]))
|
||||
|
||||
dd.map { ddt => oaToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
|
||||
|
||||
}
|
||||
|
||||
|
||||
def goldToHostedbyItemType(gold: UnibiGoldModel): HostedByItemType = {
|
||||
return getHostedByItemType(Constants.UNIBI, gold.getTitle, gold.getIssn, "", gold.getIssnL, true)
|
||||
return getHostedByItemType(
|
||||
Constants.UNIBI,
|
||||
gold.getTitle,
|
||||
gold.getIssn,
|
||||
"",
|
||||
gold.getIssnL,
|
||||
true
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
def goldHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = {
|
||||
def goldHostedByDataset(
|
||||
spark: SparkSession,
|
||||
datasourcePath: String
|
||||
): Dataset[HostedByItemType] = {
|
||||
import spark.implicits._
|
||||
|
||||
implicit val mapEncoderUnibi: Encoder[UnibiGoldModel] = Encoders.kryo[UnibiGoldModel]
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
val dd: Dataset[UnibiGoldModel] = spark.read.textFile(datasourcePath)
|
||||
val dd: Dataset[UnibiGoldModel] = spark.read
|
||||
.textFile(datasourcePath)
|
||||
.map(r => mapper.readValue(r, classOf[UnibiGoldModel]))
|
||||
|
||||
dd.map { ddt => goldToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
|
||||
|
@ -128,17 +172,28 @@ object SparkProduceHostedByMap {
|
|||
|
||||
def doajToHostedbyItemType(doaj: DOAJModel): HostedByItemType = {
|
||||
|
||||
return getHostedByItemType(Constants.DOAJ, doaj.getJournalTitle, doaj.getIssn, doaj.getEissn, "", true)
|
||||
return getHostedByItemType(
|
||||
Constants.DOAJ,
|
||||
doaj.getJournalTitle,
|
||||
doaj.getIssn,
|
||||
doaj.getEissn,
|
||||
"",
|
||||
true
|
||||
)
|
||||
}
|
||||
|
||||
def doajHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = {
|
||||
def doajHostedByDataset(
|
||||
spark: SparkSession,
|
||||
datasourcePath: String
|
||||
): Dataset[HostedByItemType] = {
|
||||
import spark.implicits._
|
||||
|
||||
implicit val mapEncoderDOAJ: Encoder[DOAJModel] = Encoders.kryo[DOAJModel]
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
val dd: Dataset[DOAJModel] = spark.read.textFile(datasourcePath)
|
||||
val dd: Dataset[DOAJModel] = spark.read
|
||||
.textFile(datasourcePath)
|
||||
.map(r => mapper.readValue(r, classOf[DOAJModel]))
|
||||
|
||||
dd.map { ddt => doajToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
|
||||
|
@ -159,7 +214,6 @@ object SparkProduceHostedByMap {
|
|||
lst
|
||||
}
|
||||
|
||||
|
||||
def writeToHDFS(input: Array[String], outputPath: String, hdfsNameNode: String): Unit = {
|
||||
val conf = new Configuration()
|
||||
|
||||
|
@ -169,49 +223,51 @@ object SparkProduceHostedByMap {
|
|||
val writer = new PrintWriter(output)
|
||||
try {
|
||||
input.foreach(hbi => writer.println(hbi))
|
||||
}
|
||||
finally {
|
||||
} finally {
|
||||
writer.close()
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_params.json")
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
val datasourcePath = parser.get("datasourcePath")
|
||||
val workingDirPath = parser.get("workingPath")
|
||||
val outputPath = parser.get("outputPath")
|
||||
|
||||
|
||||
implicit val formats = DefaultFormats
|
||||
|
||||
|
||||
logger.info("Getting the Datasources")
|
||||
|
||||
|
||||
Aggregators.explodeHostedByItemType(oaHostedByDataset(spark, datasourcePath)
|
||||
.union(goldHostedByDataset(spark, workingDirPath + "/unibi_gold.json"))
|
||||
.union(doajHostedByDataset(spark, workingDirPath + "/doaj.json"))
|
||||
.flatMap(hbi => toList(hbi))).filter(hbi => hbi._2.id.startsWith("10|"))
|
||||
Aggregators
|
||||
.explodeHostedByItemType(
|
||||
oaHostedByDataset(spark, datasourcePath)
|
||||
.union(goldHostedByDataset(spark, workingDirPath + "/unibi_gold.json"))
|
||||
.union(doajHostedByDataset(spark, workingDirPath + "/doaj.json"))
|
||||
.flatMap(hbi => toList(hbi))
|
||||
)
|
||||
.filter(hbi => hbi._2.id.startsWith("10|"))
|
||||
.map(hbi => toHostedByMap(hbi))(Encoders.STRING)
|
||||
.rdd.saveAsTextFile(outputPath, classOf[GzipCodec])
|
||||
|
||||
.rdd
|
||||
.saveAsTextFile(outputPath, classOf[GzipCodec])
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -20,7 +20,13 @@ object CopyHdfsOafSparkApplication {
|
|||
def main(args: Array[String]): Unit = {
|
||||
val log = LoggerFactory.getLogger(getClass)
|
||||
val conf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/copy_hdfs_oaf_parameters.json")).mkString)
|
||||
val parser = new ArgumentApplicationParser(
|
||||
Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/copy_hdfs_oaf_parameters.json")
|
||||
)
|
||||
.mkString
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
|
||||
val spark =
|
||||
|
@ -28,7 +34,8 @@ object CopyHdfsOafSparkApplication {
|
|||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
val sc: SparkContext = spark.sparkContext
|
||||
|
||||
|
@ -49,19 +56,22 @@ object CopyHdfsOafSparkApplication {
|
|||
|
||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
|
||||
val paths = DHPUtils.mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation, true).asScala
|
||||
val paths =
|
||||
DHPUtils.mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation, true).asScala
|
||||
|
||||
val validPaths: List[String] = paths.filter(p => HdfsSupport.exists(p, sc.hadoopConfiguration)).toList
|
||||
val validPaths: List[String] =
|
||||
paths.filter(p => HdfsSupport.exists(p, sc.hadoopConfiguration)).toList
|
||||
|
||||
val types = ModelSupport.oafTypes.entrySet
|
||||
.asScala
|
||||
val types = ModelSupport.oafTypes.entrySet.asScala
|
||||
.map(e => Tuple2(e.getKey, e.getValue))
|
||||
|
||||
if (validPaths.nonEmpty) {
|
||||
val oaf = spark.read.textFile(validPaths: _*)
|
||||
val mapper = new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
|
||||
val mapper =
|
||||
new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
|
||||
|
||||
types.foreach(t => oaf
|
||||
types.foreach(t =>
|
||||
oaf
|
||||
.filter(o => isOafType(o, t._1))
|
||||
.map(j => mapper.readValue(j, t._2).asInstanceOf[Oaf])
|
||||
.map(s => mapper.writeValueAsString(s))(Encoders.STRING)
|
||||
|
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.dhp.oa.graph.resolution
|
|||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.common.EntityType
|
||||
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset,_}
|
||||
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||
import org.apache.spark.SparkConf
|
||||
|
@ -13,20 +13,32 @@ import org.slf4j.{Logger, LoggerFactory}
|
|||
object SparkResolveEntities {
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
val entities = List(EntityType.dataset, EntityType.publication, EntityType.software, EntityType.otherresearchproduct)
|
||||
|
||||
val entities = List(
|
||||
EntityType.dataset,
|
||||
EntityType.publication,
|
||||
EntityType.software,
|
||||
EntityType.otherresearchproduct
|
||||
)
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/resolution/resolve_entities_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/resolution/resolve_entities_params.json"
|
||||
)
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
val graphBasePath = parser.get("graphBasePath")
|
||||
log.info(s"graphBasePath -> $graphBasePath")
|
||||
|
@ -38,7 +50,6 @@ object SparkResolveEntities {
|
|||
val targetPath = parser.get("targetPath")
|
||||
log.info(s"targetPath -> $targetPath")
|
||||
|
||||
|
||||
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
|
||||
fs.mkdirs(new Path(workingPath))
|
||||
|
||||
|
@ -46,60 +57,84 @@ object SparkResolveEntities {
|
|||
generateResolvedEntities(spark, workingPath, graphBasePath, targetPath)
|
||||
}
|
||||
|
||||
|
||||
def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: String) = {
|
||||
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
||||
import spark.implicits._
|
||||
|
||||
val rPid: Dataset[(String, String)] = spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]
|
||||
val up: Dataset[(String, Result)] = spark.read.text(unresolvedPath).as[String].map(s => mapper.readValue(s, classOf[Result])).map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
|
||||
val rPid: Dataset[(String, String)] =
|
||||
spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]
|
||||
val up: Dataset[(String, Result)] = spark.read
|
||||
.text(unresolvedPath)
|
||||
.as[String]
|
||||
.map(s => mapper.readValue(s, classOf[Result]))
|
||||
.map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
|
||||
|
||||
rPid.joinWith(up, rPid("_2").equalTo(up("_1")), "inner").map {
|
||||
r =>
|
||||
rPid
|
||||
.joinWith(up, rPid("_2").equalTo(up("_1")), "inner")
|
||||
.map { r =>
|
||||
val result = r._2._2
|
||||
val dnetId = r._1._1
|
||||
result.setId(dnetId)
|
||||
result
|
||||
}.write.mode(SaveMode.Overwrite).save(s"$workingPath/resolvedEntities")
|
||||
}
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/resolvedEntities")
|
||||
}
|
||||
|
||||
|
||||
def deserializeObject(input: String, entity: EntityType): Result = {
|
||||
|
||||
entity match {
|
||||
case EntityType.publication => mapper.readValue(input, classOf[Publication])
|
||||
case EntityType.dataset => mapper.readValue(input, classOf[OafDataset])
|
||||
case EntityType.software => mapper.readValue(input, classOf[Software])
|
||||
case EntityType.publication => mapper.readValue(input, classOf[Publication])
|
||||
case EntityType.dataset => mapper.readValue(input, classOf[OafDataset])
|
||||
case EntityType.software => mapper.readValue(input, classOf[Software])
|
||||
case EntityType.otherresearchproduct => mapper.readValue(input, classOf[OtherResearchProduct])
|
||||
}
|
||||
}
|
||||
|
||||
def generateResolvedEntities(spark: SparkSession, workingPath: String, graphBasePath: String, targetPath: String) = {
|
||||
def generateResolvedEntities(
|
||||
spark: SparkSession,
|
||||
workingPath: String,
|
||||
graphBasePath: String,
|
||||
targetPath: String
|
||||
) = {
|
||||
|
||||
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
||||
import spark.implicits._
|
||||
|
||||
val re: Dataset[(String, Result)] = spark.read.load(s"$workingPath/resolvedEntities").as[Result].map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
|
||||
entities.foreach {
|
||||
e => {
|
||||
val re: Dataset[(String, Result)] = spark.read
|
||||
.load(s"$workingPath/resolvedEntities")
|
||||
.as[Result]
|
||||
.map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
|
||||
entities.foreach { e =>
|
||||
{
|
||||
|
||||
val currentEntityDataset: Dataset[(String, Result)] = spark.read.text(s"$graphBasePath/$e").as[String].map(s => deserializeObject(s, e)).map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
|
||||
val currentEntityDataset: Dataset[(String, Result)] = spark.read
|
||||
.text(s"$graphBasePath/$e")
|
||||
.as[String]
|
||||
.map(s => deserializeObject(s, e))
|
||||
.map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
|
||||
|
||||
currentEntityDataset.joinWith(re, currentEntityDataset("_1").equalTo(re("_1")), "left").map(k => {
|
||||
currentEntityDataset
|
||||
.joinWith(re, currentEntityDataset("_1").equalTo(re("_1")), "left")
|
||||
.map(k => {
|
||||
|
||||
val a = k._1
|
||||
val b = k._2
|
||||
if (b == null)
|
||||
a._2
|
||||
else {
|
||||
a._2.mergeFrom(b._2)
|
||||
a._2
|
||||
}
|
||||
}).map(r => mapper.writeValueAsString(r))(Encoders.STRING)
|
||||
.write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$targetPath/$e")
|
||||
val a = k._1
|
||||
val b = k._2
|
||||
if (b == null)
|
||||
a._2
|
||||
else {
|
||||
a._2.mergeFrom(b._2)
|
||||
a._2
|
||||
}
|
||||
})
|
||||
.map(r => mapper.writeValueAsString(r))(Encoders.STRING)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.text(s"$targetPath/$e")
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,18 +17,25 @@ import org.json4s.jackson.JsonMethods.parse
|
|||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
object SparkResolveRelation {
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/resolution/resolve_relations_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/resolution/resolve_relations_params.json"
|
||||
)
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
val graphBasePath = parser.get("graphBasePath")
|
||||
log.info(s"graphBasePath -> $graphBasePath")
|
||||
|
@ -41,7 +48,6 @@ object SparkResolveRelation {
|
|||
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
|
||||
import spark.implicits._
|
||||
|
||||
|
||||
//CLEANING TEMPORARY FOLDER
|
||||
HdfsSupport.remove(workingPath, spark.sparkContext.hadoopConfiguration)
|
||||
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
|
||||
|
@ -51,39 +57,49 @@ object SparkResolveRelation {
|
|||
|
||||
val mapper: ObjectMapper = new ObjectMapper()
|
||||
|
||||
val rPid: Dataset[(String, String)] = spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]
|
||||
val rPid: Dataset[(String, String)] =
|
||||
spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]
|
||||
|
||||
val relationDs: Dataset[(String, Relation)] = spark.read.text(s"$graphBasePath/relation").as[String]
|
||||
.map(s => mapper.readValue(s, classOf[Relation])).as[Relation]
|
||||
val relationDs: Dataset[(String, Relation)] = spark.read
|
||||
.text(s"$graphBasePath/relation")
|
||||
.as[String]
|
||||
.map(s => mapper.readValue(s, classOf[Relation]))
|
||||
.as[Relation]
|
||||
.map(r => (r.getSource.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
|
||||
|
||||
relationDs.joinWith(rPid, relationDs("_1").equalTo(rPid("_2")), "left").map {
|
||||
m =>
|
||||
relationDs
|
||||
.joinWith(rPid, relationDs("_1").equalTo(rPid("_2")), "left")
|
||||
.map { m =>
|
||||
val sourceResolved = m._2
|
||||
val currentRelation = m._1._2
|
||||
if (sourceResolved != null && sourceResolved._1 != null && sourceResolved._1.nonEmpty)
|
||||
currentRelation.setSource(sourceResolved._1)
|
||||
currentRelation
|
||||
}.write
|
||||
}
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/relationResolvedSource")
|
||||
|
||||
|
||||
val relationSourceResolved: Dataset[(String, Relation)] = spark.read.load(s"$workingPath/relationResolvedSource").as[Relation]
|
||||
val relationSourceResolved: Dataset[(String, Relation)] = spark.read
|
||||
.load(s"$workingPath/relationResolvedSource")
|
||||
.as[Relation]
|
||||
.map(r => (r.getTarget.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
|
||||
relationSourceResolved.joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_2")), "left").map {
|
||||
m =>
|
||||
relationSourceResolved
|
||||
.joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_2")), "left")
|
||||
.map { m =>
|
||||
val targetResolved = m._2
|
||||
val currentRelation = m._1._2
|
||||
if (targetResolved != null && targetResolved._1.nonEmpty)
|
||||
currentRelation.setTarget(targetResolved._1)
|
||||
currentRelation
|
||||
}
|
||||
}
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/relation_resolved")
|
||||
|
||||
spark.read.load(s"$workingPath/relation_resolved").as[Relation]
|
||||
spark.read
|
||||
.load(s"$workingPath/relation_resolved")
|
||||
.as[Relation]
|
||||
.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved"))
|
||||
.map(r => mapper.writeValueAsString(r))
|
||||
.write
|
||||
|
@ -96,33 +112,31 @@ object SparkResolveRelation {
|
|||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: json4s.JValue = parse(input)
|
||||
val result: List[(String, String)] = for {
|
||||
JObject(iObj) <- json \ "instance"
|
||||
JField("collectedfrom", JObject(cf)) <- iObj
|
||||
JObject(iObj) <- json \ "instance"
|
||||
JField("collectedfrom", JObject(cf)) <- iObj
|
||||
JField("instancetype", JObject(instancetype)) <- iObj
|
||||
JField("value", JString(collectedFrom)) <- cf
|
||||
JField("classname", JString(classname)) <- instancetype
|
||||
JField("value", JString(collectedFrom)) <- cf
|
||||
JField("classname", JString(classname)) <- instancetype
|
||||
} yield (classname, collectedFrom)
|
||||
|
||||
result
|
||||
|
||||
}
|
||||
|
||||
|
||||
/** Parses a JSON record and collects the persistent identifiers it declares.
  *
  * The record id is read from the top-level `id` field. Every JSON object found
  * under `instance` / `pid` contributes one tuple for each combination of its
  * `value` string field and the `classid` string field of its `qualifier` object.
  *
  * @param input the JSON serialization of a record
  * @return the record id paired with the list of (pid value, pid type) tuples
  */
def extractPidsFromRecord(input: String): (String, List[(String, String)]) = {
  implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
  val parsed: json4s.JValue = parse(input)
  val recordId: String = (parsed \ "id").extract[String]

  val pidList: List[(String, String)] = for {
    JObject(pidFields)                           <- parsed \\ "instance" \ "pid"
    JField("value", JString(value))              <- pidFields
    JField("qualifier", JObject(qualifierFields)) <- pidFields
    JField("classid", JString(kind))             <- qualifierFields
  } yield (value, kind)

  (recordId, pidList)
}
|
||||
|
||||
|
||||
private def isRelation(input: String): Boolean = {
|
||||
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
|
@ -132,20 +146,25 @@ object SparkResolveRelation {
|
|||
source != null
|
||||
}
|
||||
|
||||
def extractPidResolvedTableFromJsonRDD(spark: SparkSession, graphPath: String, workingPath: String) = {
|
||||
def extractPidResolvedTableFromJsonRDD(
|
||||
spark: SparkSession,
|
||||
graphPath: String,
|
||||
workingPath: String
|
||||
) = {
|
||||
import spark.implicits._
|
||||
|
||||
val d: RDD[(String, String)] = spark.sparkContext.textFile(s"$graphPath/*")
|
||||
val d: RDD[(String, String)] = spark.sparkContext
|
||||
.textFile(s"$graphPath/*")
|
||||
.filter(i => !isRelation(i))
|
||||
.map(i => extractPidsFromRecord(i))
|
||||
.filter(s => s != null && s._1 != null && s._2 != null && s._2.nonEmpty)
|
||||
.flatMap { p =>
|
||||
p._2.map(pid =>
|
||||
(p._1, DHPUtils.generateUnresolvedIdentifier(pid._1, pid._2))
|
||||
)
|
||||
}.filter(r => r._1 != null || r._2 != null)
|
||||
p._2.map(pid => (p._1, DHPUtils.generateUnresolvedIdentifier(pid._1, pid._2)))
|
||||
}
|
||||
.filter(r => r._1 != null || r._2 != null)
|
||||
|
||||
spark.createDataset(d)
|
||||
spark
|
||||
.createDataset(d)
|
||||
.groupByKey(_._2)
|
||||
.reduceGroups((x, y) => if (x._1.startsWith("50|doi") || x._1.startsWith("50|pmid")) x else y)
|
||||
.map(s => s._2)
|
||||
|
|
|
@ -7,24 +7,26 @@ import org.apache.spark.sql.SparkSession
|
|||
|
||||
/** Spark entry point for the Datacite-to-OAF job.
  *
  * The visible body only parses the command-line arguments (described by the
  * JSON resource on the classpath), obtains a SparkSession and reads the
  * `inputPath` argument; no further processing is performed here.
  */
object SparkDataciteToOAF {

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()

    // Argument definitions are loaded from a JSON resource shipped on the classpath.
    val argumentSpec =
      IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/datacite_to_df_params.json"))
    val parser = new ArgumentApplicationParser(argumentSpec)
    parser.parseArgument(args)

    val sessionBuilder = SparkSession
      .builder()
      .config(conf)
      .appName(getClass.getSimpleName)
      .master(parser.get("master"))
    val spark: SparkSession = sessionBuilder.getOrCreate()

    val sc = spark.sparkContext

    val inputPath = parser.get("inputPath")

  }

}
|
||||
|
|
|
@ -11,18 +11,22 @@ import org.slf4j.{Logger, LoggerFactory}
|
|||
|
||||
object SparkConvertDatasetToJsonRDD {
|
||||
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info(s"sourcePath -> $sourcePath")
|
||||
|
@ -33,9 +37,13 @@ object SparkConvertDatasetToJsonRDD {
|
|||
val mapper = new ObjectMapper()
|
||||
implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
||||
|
||||
|
||||
resultObject.foreach { item =>
|
||||
spark.read.load(s"$sourcePath/$item").as[Result].map(r => mapper.writeValueAsString(r))(Encoders.STRING).rdd.saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec])
|
||||
spark.read
|
||||
.load(s"$sourcePath/$item")
|
||||
.as[Result]
|
||||
.map(r => mapper.writeValueAsString(r))(Encoders.STRING)
|
||||
.rdd
|
||||
.saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec])
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -15,14 +15,19 @@ object SparkConvertObjectToJson {
|
|||
def main(args: Array[String]): Unit = {
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_object_json_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_object_json_params.json")
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info(s"sourcePath -> $sourcePath")
|
||||
|
@ -33,24 +38,28 @@ object SparkConvertObjectToJson {
|
|||
val scholixUpdatePath = parser.get("scholixUpdatePath")
|
||||
log.info(s"scholixUpdatePath -> $scholixUpdatePath")
|
||||
|
||||
|
||||
|
||||
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
||||
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
|
||||
|
||||
|
||||
val mapper = new ObjectMapper
|
||||
|
||||
objectType.toLowerCase match {
|
||||
case "scholix" =>
|
||||
log.info("Serialize Scholix")
|
||||
val d: Dataset[Scholix] = spark.read.load(sourcePath).as[Scholix]
|
||||
val u :Dataset[Scholix]= spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix]
|
||||
d.union(u).repartition(8000).map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.saveAsTextFile(targetPath, classOf[GzipCodec])
|
||||
val u: Dataset[Scholix] = spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix]
|
||||
d.union(u)
|
||||
.repartition(8000)
|
||||
.map(s => mapper.writeValueAsString(s))(Encoders.STRING)
|
||||
.rdd
|
||||
.saveAsTextFile(targetPath, classOf[GzipCodec])
|
||||
case "summary" =>
|
||||
log.info("Serialize Summary")
|
||||
val d: Dataset[ScholixSummary] = spark.read.load(sourcePath).as[ScholixSummary]
|
||||
d.map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.repartition(1000).saveAsTextFile(targetPath, classOf[GzipCodec])
|
||||
d.map(s => mapper.writeValueAsString(s))(Encoders.STRING)
|
||||
.rdd
|
||||
.repartition(1000)
|
||||
.saveAsTextFile(targetPath, classOf[GzipCodec])
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -2,26 +2,38 @@ package eu.dnetlib.dhp.sx.graph
|
|||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset}
|
||||
import eu.dnetlib.dhp.schema.oaf.{
|
||||
OtherResearchProduct,
|
||||
Publication,
|
||||
Relation,
|
||||
Result,
|
||||
Software,
|
||||
Dataset => OafDataset
|
||||
}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
object SparkConvertRDDtoDataset {
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
||||
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info(s"sourcePath -> $sourcePath")
|
||||
|
@ -31,43 +43,79 @@ object SparkConvertRDDtoDataset {
|
|||
val entityPath = s"$t/entities"
|
||||
val relPath = s"$t/relation"
|
||||
val mapper = new ObjectMapper()
|
||||
implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
|
||||
implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
|
||||
implicit val relationEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
|
||||
implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct])
|
||||
implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software])
|
||||
|
||||
implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
|
||||
implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
|
||||
implicit val relationEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
|
||||
implicit val orpEncoder: Encoder[OtherResearchProduct] =
|
||||
Encoders.kryo(classOf[OtherResearchProduct])
|
||||
implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software])
|
||||
|
||||
log.info("Converting dataset")
|
||||
val rddDataset =spark.sparkContext.textFile(s"$sourcePath/dataset").map(s => mapper.readValue(s, classOf[OafDataset])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false)
|
||||
spark.createDataset(rddDataset).as[OafDataset].write.mode(SaveMode.Overwrite).save(s"$entityPath/dataset")
|
||||
|
||||
val rddDataset = spark.sparkContext
|
||||
.textFile(s"$sourcePath/dataset")
|
||||
.map(s => mapper.readValue(s, classOf[OafDataset]))
|
||||
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
|
||||
spark
|
||||
.createDataset(rddDataset)
|
||||
.as[OafDataset]
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$entityPath/dataset")
|
||||
|
||||
log.info("Converting publication")
|
||||
val rddPublication =spark.sparkContext.textFile(s"$sourcePath/publication").map(s => mapper.readValue(s, classOf[Publication])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false)
|
||||
spark.createDataset(rddPublication).as[Publication].write.mode(SaveMode.Overwrite).save(s"$entityPath/publication")
|
||||
val rddPublication = spark.sparkContext
|
||||
.textFile(s"$sourcePath/publication")
|
||||
.map(s => mapper.readValue(s, classOf[Publication]))
|
||||
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
|
||||
spark
|
||||
.createDataset(rddPublication)
|
||||
.as[Publication]
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$entityPath/publication")
|
||||
|
||||
log.info("Converting software")
|
||||
val rddSoftware =spark.sparkContext.textFile(s"$sourcePath/software").map(s => mapper.readValue(s, classOf[Software])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false)
|
||||
spark.createDataset(rddSoftware).as[Software].write.mode(SaveMode.Overwrite).save(s"$entityPath/software")
|
||||
val rddSoftware = spark.sparkContext
|
||||
.textFile(s"$sourcePath/software")
|
||||
.map(s => mapper.readValue(s, classOf[Software]))
|
||||
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
|
||||
spark
|
||||
.createDataset(rddSoftware)
|
||||
.as[Software]
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$entityPath/software")
|
||||
|
||||
log.info("Converting otherresearchproduct")
|
||||
val rddOtherResearchProduct =spark.sparkContext.textFile(s"$sourcePath/otherresearchproduct").map(s => mapper.readValue(s, classOf[OtherResearchProduct])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false)
|
||||
spark.createDataset(rddOtherResearchProduct).as[OtherResearchProduct].write.mode(SaveMode.Overwrite).save(s"$entityPath/otherresearchproduct")
|
||||
|
||||
val rddOtherResearchProduct = spark.sparkContext
|
||||
.textFile(s"$sourcePath/otherresearchproduct")
|
||||
.map(s => mapper.readValue(s, classOf[OtherResearchProduct]))
|
||||
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
|
||||
spark
|
||||
.createDataset(rddOtherResearchProduct)
|
||||
.as[OtherResearchProduct]
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$entityPath/otherresearchproduct")
|
||||
|
||||
log.info("Converting Relation")
|
||||
|
||||
val relationSemanticFilter = List(
|
||||
"cites",
|
||||
"iscitedby",
|
||||
"merges",
|
||||
"ismergedin",
|
||||
"HasAmongTopNSimilarDocuments",
|
||||
"IsAmongTopNSimilarDocuments"
|
||||
)
|
||||
|
||||
val relationSemanticFilter = List("cites", "iscitedby","merges", "ismergedin", "HasAmongTopNSimilarDocuments","IsAmongTopNSimilarDocuments" )
|
||||
|
||||
val rddRelation =spark.sparkContext.textFile(s"$sourcePath/relation")
|
||||
val rddRelation = spark.sparkContext
|
||||
.textFile(s"$sourcePath/relation")
|
||||
.map(s => mapper.readValue(s, classOf[Relation]))
|
||||
.filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false)
|
||||
.filter(r=> r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
|
||||
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
|
||||
.filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
|
||||
.filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
|
||||
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
package eu.dnetlib.dhp.sx.graph
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset,_}
|
||||
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql._
|
||||
|
@ -13,82 +13,131 @@ object SparkCreateInputGraph {
|
|||
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/extract_entities_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/extract_entities_params.json")
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
val resultObject = List(
|
||||
("publication", classOf[Publication]),
|
||||
("dataset", classOf[OafDataset]),
|
||||
("software", classOf[Software]),
|
||||
("otherResearchProduct", classOf[OtherResearchProduct])
|
||||
|
||||
)
|
||||
|
||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||
implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
|
||||
implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
|
||||
implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software])
|
||||
implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct])
|
||||
implicit val orpEncoder: Encoder[OtherResearchProduct] =
|
||||
Encoders.kryo(classOf[OtherResearchProduct])
|
||||
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
|
||||
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info(s"sourcePath -> $sourcePath")
|
||||
val targetPath = parser.get("targetPath")
|
||||
log.info(s"targetPath -> $targetPath")
|
||||
|
||||
|
||||
val oafDs: Dataset[Oaf] = spark.read.load(s"$sourcePath/*").as[Oaf]
|
||||
|
||||
|
||||
log.info("Extract Publication")
|
||||
oafDs.filter(o => o.isInstanceOf[Publication]).map(p => p.asInstanceOf[Publication]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/publication")
|
||||
oafDs
|
||||
.filter(o => o.isInstanceOf[Publication])
|
||||
.map(p => p.asInstanceOf[Publication])
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$targetPath/extracted/publication")
|
||||
|
||||
log.info("Extract dataset")
|
||||
oafDs.filter(o => o.isInstanceOf[OafDataset]).map(p => p.asInstanceOf[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/dataset")
|
||||
oafDs
|
||||
.filter(o => o.isInstanceOf[OafDataset])
|
||||
.map(p => p.asInstanceOf[OafDataset])
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$targetPath/extracted/dataset")
|
||||
|
||||
log.info("Extract software")
|
||||
oafDs.filter(o => o.isInstanceOf[Software]).map(p => p.asInstanceOf[Software]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/software")
|
||||
oafDs
|
||||
.filter(o => o.isInstanceOf[Software])
|
||||
.map(p => p.asInstanceOf[Software])
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$targetPath/extracted/software")
|
||||
|
||||
log.info("Extract otherResearchProduct")
|
||||
oafDs.filter(o => o.isInstanceOf[OtherResearchProduct]).map(p => p.asInstanceOf[OtherResearchProduct]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/otherResearchProduct")
|
||||
oafDs
|
||||
.filter(o => o.isInstanceOf[OtherResearchProduct])
|
||||
.map(p => p.asInstanceOf[OtherResearchProduct])
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$targetPath/extracted/otherResearchProduct")
|
||||
|
||||
log.info("Extract Relation")
|
||||
oafDs.filter(o => o.isInstanceOf[Relation]).map(p => p.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/relation")
|
||||
oafDs
|
||||
.filter(o => o.isInstanceOf[Relation])
|
||||
.map(p => p.asInstanceOf[Relation])
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$targetPath/extracted/relation")
|
||||
|
||||
resultObject.foreach { r =>
|
||||
log.info(s"Make ${r._1} unique")
|
||||
makeDatasetUnique(s"$targetPath/extracted/${r._1}", s"$targetPath/preprocess/${r._1}", spark, r._2)
|
||||
makeDatasetUnique(
|
||||
s"$targetPath/extracted/${r._1}",
|
||||
s"$targetPath/preprocess/${r._1}",
|
||||
spark,
|
||||
r._2
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** Selects from `oafDs` the records that are instances of `clazz` and saves
  * them to `targetPath`, overwriting any existing output.
  *
  * The type test is done with the runtime class (`clazz.isInstance`) rather than
  * `isInstanceOf[T]`: `T` is erased at runtime, so `isInstanceOf[T]` would only
  * check the upper bound `Oaf` and let every record through.
  *
  * @param oafDs      the dataset of generic Oaf records to filter
  * @param targetPath the path where the selected records are saved
  * @param clazz      the runtime class of the entities to keep; also used to build the kryo encoder
  * @param log        the logger used to report which entity type is being extracted
  * @tparam T the entity type to extract
  */
def extractEntities[T <: Oaf](
  oafDs: Dataset[Oaf],
  targetPath: String,
  clazz: Class[T],
  log: Logger
): Unit = {

  implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz)
  log.info(s"Extract ${clazz.getSimpleName}")
  oafDs
    .filter(o => clazz.isInstance(o)) // runtime check; isInstanceOf[T] is unchecked due to erasure
    .map(p => clazz.cast(p))
    .write
    .mode(SaveMode.Overwrite)
    .save(targetPath)
}
|
||||
|
||||
|
||||
/** Collapses the records stored at `sourcePath` so that each id appears once,
  * and saves the result to `targetPath`, overwriting any existing output.
  *
  * Records are grouped by `getId`; within a group the first record absorbs the
  * others through `mergeFrom` and is the one that is kept.
  *
  * @param sourcePath the path of the dataset to load
  * @param targetPath the path where the de-duplicated dataset is saved
  * @param spark      the active Spark session
  * @param clazz      the runtime class of the records, used to build the kryo encoder
  * @tparam T the record type
  */
def makeDatasetUnique[T <: Result](
  sourcePath: String,
  targetPath: String,
  spark: SparkSession,
  clazz: Class[T]
): Unit = {
  import spark.implicits._

  implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz)

  val records: Dataset[T] = spark.read.load(sourcePath).as[T]

  // One (id, merged record) pair per distinct id.
  val mergedById = records
    .groupByKey(_.getId)
    .reduceGroups { (kept, other) =>
      kept.mergeFrom(other)
      kept
    }

  val uniqueRecords = mergedById.map(_._2)
  uniqueRecords.write.mode(SaveMode.Overwrite).save(targetPath)

}
|
||||
|
||||
|
|
|
@ -17,14 +17,19 @@ object SparkCreateScholix {
|
|||
def main(args: Array[String]): Unit = {
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_scholix_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_scholix_params.json")
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
val relationPath = parser.get("relationPath")
|
||||
log.info(s"relationPath -> $relationPath")
|
||||
|
@ -33,37 +38,46 @@ object SparkCreateScholix {
|
|||
val targetPath = parser.get("targetPath")
|
||||
log.info(s"targetPath -> $targetPath")
|
||||
|
||||
|
||||
implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
|
||||
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
|
||||
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
||||
|
||||
import spark.implicits._
|
||||
|
||||
|
||||
val relationDS: Dataset[(String, Relation)] = spark.read.load(relationPath).as[Relation]
|
||||
.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
|
||||
val relationDS: Dataset[(String, Relation)] = spark.read
|
||||
.load(relationPath)
|
||||
.as[Relation]
|
||||
.filter(r =>
|
||||
(r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase
|
||||
.contains("merge")
|
||||
)
|
||||
.map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder))
|
||||
|
||||
val summaryDS: Dataset[(String, ScholixSummary)] = spark.read.load(summaryPath).as[ScholixSummary]
|
||||
val summaryDS: Dataset[(String, ScholixSummary)] = spark.read
|
||||
.load(summaryPath)
|
||||
.as[ScholixSummary]
|
||||
.map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, summaryEncoder))
|
||||
|
||||
|
||||
relationDS.joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left")
|
||||
relationDS
|
||||
.joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left")
|
||||
.map { input: ((String, Relation), (String, ScholixSummary)) =>
|
||||
if (input._1 != null && input._2 != null) {
|
||||
val rel: Relation = input._1._2
|
||||
val source: ScholixSummary = input._2._2
|
||||
(rel.getTarget, ScholixUtils.scholixFromSource(rel, source))
|
||||
}
|
||||
else null
|
||||
} else null
|
||||
}(Encoders.tuple(Encoders.STRING, scholixEncoder))
|
||||
.filter(r => r != null)
|
||||
.write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_from_source")
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$targetPath/scholix_from_source")
|
||||
|
||||
val scholixSource: Dataset[(String, Scholix)] = spark.read.load(s"$targetPath/scholix_from_source").as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder))
|
||||
val scholixSource: Dataset[(String, Scholix)] = spark.read
|
||||
.load(s"$targetPath/scholix_from_source")
|
||||
.as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder))
|
||||
|
||||
scholixSource.joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left")
|
||||
scholixSource
|
||||
.joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left")
|
||||
.map { input: ((String, Scholix), (String, ScholixSummary)) =>
|
||||
if (input._2 == null) {
|
||||
null
|
||||
|
@ -72,40 +86,73 @@ object SparkCreateScholix {
|
|||
val target: ScholixSummary = input._2._2
|
||||
ScholixUtils.generateCompleteScholix(s, target)
|
||||
}
|
||||
}.filter(s => s != null).write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_one_verse")
|
||||
}
|
||||
.filter(s => s != null)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$targetPath/scholix_one_verse")
|
||||
|
||||
val scholix_o_v: Dataset[Scholix] =
|
||||
spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix]
|
||||
|
||||
val scholix_o_v: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix]
|
||||
|
||||
scholix_o_v.flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s))).as[Scholix]
|
||||
scholix_o_v
|
||||
.flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s)))
|
||||
.as[Scholix]
|
||||
.map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, scholixEncoder))
|
||||
.groupByKey(_._1)
|
||||
.agg(ScholixUtils.scholixAggregator.toColumn)
|
||||
.map(s => s._2)
|
||||
.write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix")
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$targetPath/scholix")
|
||||
|
||||
val scholix_final: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix").as[Scholix]
|
||||
|
||||
val stats: Dataset[(String, String, Long)] = scholix_final.map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType)).groupBy("_1", "_2").agg(count("_1")).as[(String, String, Long)]
|
||||
|
||||
val stats: Dataset[(String, String, Long)] = scholix_final
|
||||
.map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType))
|
||||
.groupBy("_1", "_2")
|
||||
.agg(count("_1"))
|
||||
.as[(String, String, Long)]
|
||||
|
||||
stats
|
||||
.map(s => RelatedEntities(s._1, if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0, if ("publication".equalsIgnoreCase(s._2)) s._3 else 0))
|
||||
.map(s =>
|
||||
RelatedEntities(
|
||||
s._1,
|
||||
if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0,
|
||||
if ("publication".equalsIgnoreCase(s._2)) s._3 else 0
|
||||
)
|
||||
)
|
||||
.groupByKey(_.id)
|
||||
.reduceGroups((a, b) => RelatedEntities(a.id, a.relatedDataset + b.relatedDataset, a.relatedPublication + b.relatedPublication))
|
||||
.reduceGroups((a, b) =>
|
||||
RelatedEntities(
|
||||
a.id,
|
||||
a.relatedDataset + b.relatedDataset,
|
||||
a.relatedPublication + b.relatedPublication
|
||||
)
|
||||
)
|
||||
.map(_._2)
|
||||
.write.mode(SaveMode.Overwrite).save(s"$targetPath/related_entities")
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$targetPath/related_entities")
|
||||
|
||||
val relatedEntitiesDS: Dataset[RelatedEntities] = spark.read.load(s"$targetPath/related_entities").as[RelatedEntities].filter(r => r.relatedPublication > 0 || r.relatedDataset > 0)
|
||||
val relatedEntitiesDS: Dataset[RelatedEntities] = spark.read
|
||||
.load(s"$targetPath/related_entities")
|
||||
.as[RelatedEntities]
|
||||
.filter(r => r.relatedPublication > 0 || r.relatedDataset > 0)
|
||||
|
||||
relatedEntitiesDS.joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner").map { i =>
|
||||
val re = i._1
|
||||
val sum = i._2._2
|
||||
relatedEntitiesDS
|
||||
.joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner")
|
||||
.map { i =>
|
||||
val re = i._1
|
||||
val sum = i._2._2
|
||||
|
||||
sum.setRelatedDatasets(re.relatedDataset)
|
||||
sum.setRelatedPublications(re.relatedPublication)
|
||||
sum
|
||||
}.write.mode(SaveMode.Overwrite).save(s"${summaryPath}_filtered")
|
||||
sum.setRelatedDatasets(re.relatedDataset)
|
||||
sum.setRelatedPublications(re.relatedPublication)
|
||||
sum
|
||||
}
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"${summaryPath}_filtered")
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -14,14 +14,19 @@ object SparkCreateSummaryObject {
|
|||
def main(args: Array[String]): Unit = {
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_summaries_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_summaries_params.json")
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info(s"sourcePath -> $sourcePath")
|
||||
|
@ -33,10 +38,17 @@ object SparkCreateSummaryObject {
|
|||
|
||||
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
|
||||
|
||||
val ds: Dataset[Result] = spark.read
|
||||
.load(s"$sourcePath/*")
|
||||
.as[Result]
|
||||
.filter(r => r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false)
|
||||
|
||||
val ds: Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Result].filter(r => r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false)
|
||||
|
||||
ds.repartition(6000).map(r => ScholixUtils.resultToSummary(r)).filter(s => s != null).write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
ds.repartition(6000)
|
||||
.map(r => ScholixUtils.resultToSummary(r))
|
||||
.filter(s => s != null)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(targetPath)
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -10,61 +10,89 @@ import java.util.regex.Pattern
|
|||
import scala.language.postfixOps
|
||||
import scala.xml.{Elem, Node, XML}
|
||||
|
||||
/** Flat representation of a Pangaea record.
  *
  * Multi-valued fields are kept as lists; `linkage` holds pairs of strings.
  */
case class PangaeaDataModel(
  identifier: String,
  title: List[String],
  objectType: List[String],
  creator: List[String],
  publisher: List[String],
  dataCenter: List[String],
  subject: List[String],
  language: String,
  rights: String,
  parent: String,
  relation: List[String],
  linkage: List[(String, String)]
)
|
||||
|
||||
object PangaeaUtils {
|
||||
|
||||
|
||||
/** Builds a [[PangaeaDataModel]] from a JSON record.
  *
  * The string stored in the top-level `xml` field of the JSON is extracted and
  * handed to `parseXml`.
  *
  * @param input the JSON record
  * @return the model produced by `parseXml` for the embedded XML
  */
def toDataset(input: String): PangaeaDataModel = {
  implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
  val parsed: json4s.JValue = parse(input)
  val xmlPayload = (parsed \ "xml").extract[String]
  parseXml(xmlPayload)
}
|
||||
|
||||
/** Pattern matching a DOI-like token; compiled once and reused across calls. */
private val DoiPattern: Pattern =
  Pattern.compile("\\b(10[.][0-9]{4,}(?:[.][0-9]+)*\\/(?:(?![\"&\\'<>])\\S)+)\\b")

/** Extracts the first DOI-like token from each input string.
  *
  * Strings in which the pattern is not found contribute nothing to the result,
  * so the returned list can be shorter than the input. Order is preserved.
  *
  * @param input the strings to scan
  * @return the first match of each string that contains one
  */
def findDOIInRelation(input: List[String]): List[String] =
  input.flatMap { relation =>
    val matcher = DoiPattern.matcher(relation)
    // Option instead of a null sentinel that has to be filtered out afterwards.
    if (matcher.find()) Some(matcher.group(0)) else None
  }
|
||||
|
||||
def attributeOpt(attribute: String, node:Node): Option[String] =
|
||||
def attributeOpt(attribute: String, node: Node): Option[String] =
|
||||
node.attribute(attribute) flatMap (_.headOption) map (_.text)
|
||||
|
||||
def extractLinkage(node:Elem):List[(String, String)] = {
|
||||
(node \ "linkage").map(n =>(attributeOpt("type",n), n.text)).filter(t => t._1.isDefined).map(t=> (t._1.get, t._2))(collection.breakOut)
|
||||
def extractLinkage(node: Elem): List[(String, String)] = {
|
||||
(node \ "linkage")
|
||||
.map(n => (attributeOpt("type", n), n.text))
|
||||
.filter(t => t._1.isDefined)
|
||||
.map(t => (t._1.get, t._2))(collection.breakOut)
|
||||
}
|
||||
|
||||
def parseXml(input:String):PangaeaDataModel = {
|
||||
def parseXml(input: String): PangaeaDataModel = {
|
||||
val xml = XML.loadString(input)
|
||||
|
||||
val identifier = (xml \ "identifier").text
|
||||
val title :List[String] = (xml \ "title").map(n => n.text)(collection.breakOut)
|
||||
val pType :List[String] = (xml \ "type").map(n => n.text)(collection.breakOut)
|
||||
val creators:List[String] = (xml \ "creator").map(n => n.text)(collection.breakOut)
|
||||
val publisher :List[String] = (xml \ "publisher").map(n => n.text)(collection.breakOut)
|
||||
val dataCenter :List[String] = (xml \ "dataCenter").map(n => n.text)(collection.breakOut)
|
||||
val subject :List[String] = (xml \ "subject").map(n => n.text)(collection.breakOut)
|
||||
val language= (xml \ "language").text
|
||||
val rights= (xml \ "rights").text
|
||||
val parentIdentifier= (xml \ "parentIdentifier").text
|
||||
val relation :List[String] = (xml \ "relation").map(n => n.text)(collection.breakOut)
|
||||
val title: List[String] = (xml \ "title").map(n => n.text)(collection.breakOut)
|
||||
val pType: List[String] = (xml \ "type").map(n => n.text)(collection.breakOut)
|
||||
val creators: List[String] = (xml \ "creator").map(n => n.text)(collection.breakOut)
|
||||
val publisher: List[String] = (xml \ "publisher").map(n => n.text)(collection.breakOut)
|
||||
val dataCenter: List[String] = (xml \ "dataCenter").map(n => n.text)(collection.breakOut)
|
||||
val subject: List[String] = (xml \ "subject").map(n => n.text)(collection.breakOut)
|
||||
val language = (xml \ "language").text
|
||||
val rights = (xml \ "rights").text
|
||||
val parentIdentifier = (xml \ "parentIdentifier").text
|
||||
val relation: List[String] = (xml \ "relation").map(n => n.text)(collection.breakOut)
|
||||
val relationFiltered = findDOIInRelation(relation)
|
||||
val linkage:List[(String,String)] = extractLinkage(xml)
|
||||
val linkage: List[(String, String)] = extractLinkage(xml)
|
||||
|
||||
PangaeaDataModel(identifier,title, pType, creators,publisher, dataCenter, subject, language, rights, parentIdentifier, relationFiltered, linkage)
|
||||
PangaeaDataModel(
|
||||
identifier,
|
||||
title,
|
||||
pType,
|
||||
creators,
|
||||
publisher,
|
||||
dataCenter,
|
||||
subject,
|
||||
language,
|
||||
rights,
|
||||
parentIdentifier,
|
||||
relationFiltered,
|
||||
linkage
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
def getDatasetAggregator(): Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] = new Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel]{
|
||||
|
||||
def getDatasetAggregator()
|
||||
: Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] =
|
||||
new Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] {
|
||||
|
||||
override def zero: PangaeaDataModel = null
|
||||
|
||||
|
@ -77,7 +105,7 @@ object PangaeaUtils {
|
|||
else {
|
||||
if (b.title != null && b.title.nonEmpty)
|
||||
b
|
||||
else
|
||||
else
|
||||
a._2
|
||||
|
||||
}
|
||||
|
@ -106,7 +134,4 @@ object PangaeaUtils {
|
|||
override def outputEncoder: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
|
@ -11,20 +11,25 @@ import scala.io.Source
|
|||
|
||||
object SparkGeneratePanagaeaDataset {
|
||||
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/pangaea/pangaea_to_dataset.json")).mkString)
|
||||
val parser = new ArgumentApplicationParser(
|
||||
Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/pangaea/pangaea_to_dataset.json")
|
||||
)
|
||||
.mkString
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
|
||||
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkGeneratePanagaeaDataset.getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
parser.getObjectMap.asScala.foreach(s => logger.info(s"${s._1} -> ${s._2}"))
|
||||
logger.info("Converting sequential file into Dataset")
|
||||
|
@ -34,16 +39,20 @@ object SparkGeneratePanagaeaDataset {
|
|||
|
||||
implicit val pangaeaEncoders: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
|
||||
|
||||
val inputRDD: RDD[PangaeaDataModel] = sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s))
|
||||
val inputRDD: RDD[PangaeaDataModel] =
|
||||
sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s))
|
||||
|
||||
spark.createDataset(inputRDD).as[PangaeaDataModel]
|
||||
spark
|
||||
.createDataset(inputRDD)
|
||||
.as[PangaeaDataModel]
|
||||
.map(s => (s.identifier, s))(Encoders.tuple(Encoders.STRING, pangaeaEncoders))
|
||||
.groupByKey(_._1)(Encoders.STRING)
|
||||
.agg(PangaeaUtils.getDatasetAggregator().toColumn)
|
||||
.map(s => s._2)
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset")
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/dataset")
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -9,10 +9,10 @@ import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
|
|||
import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue}
|
||||
import org.junit.jupiter.api.Test
|
||||
|
||||
class TestApply extends java.io.Serializable{
|
||||
class TestApply extends java.io.Serializable {
|
||||
|
||||
@Test
|
||||
def testApplyOnResult (): Unit = {
|
||||
def testApplyOnResult(): Unit = {
|
||||
val conf = new SparkConf()
|
||||
conf.setMaster("local[*]")
|
||||
conf.set("spark.driver.host", "localhost")
|
||||
|
@ -25,54 +25,104 @@ class TestApply extends java.io.Serializable{
|
|||
val pub = getClass.getResource("publication.json").getPath
|
||||
val hbm = getClass.getResource("preparedInfo.json").getPath
|
||||
|
||||
val mapper:ObjectMapper = new ObjectMapper()
|
||||
val mapper: ObjectMapper = new ObjectMapper()
|
||||
|
||||
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||
implicit val mapEncoderPubInfo: Encoder[Publication] = Encoders.bean(classOf[Publication])
|
||||
|
||||
|
||||
val pub_ds :Dataset[Publication] = spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[Publication]))
|
||||
val hbm_ds :Dataset[EntityInfo] = spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
||||
|
||||
val pub_ds: Dataset[Publication] =
|
||||
spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[Publication]))
|
||||
val hbm_ds: Dataset[EntityInfo] =
|
||||
spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
||||
|
||||
assertEquals(13, pub_ds.count())
|
||||
|
||||
val ds:Dataset[Publication] = SparkApplyHostedByMapToResult.applyHBtoPubs(hbm_ds, pub_ds)
|
||||
val ds: Dataset[Publication] = SparkApplyHostedByMapToResult.applyHBtoPubs(hbm_ds, pub_ds)
|
||||
|
||||
assertEquals(13, ds.count)
|
||||
assertEquals(13, ds.count)
|
||||
|
||||
val temp: Dataset[(Publication, Publication)] = pub_ds.joinWith(ds, pub_ds.col("id").equalTo(ds.col("id")), "left")
|
||||
val temp: Dataset[(Publication, Publication)] =
|
||||
pub_ds.joinWith(ds, pub_ds.col("id").equalTo(ds.col("id")), "left")
|
||||
assertEquals(13, temp.count())
|
||||
temp.foreach(t2 => {
|
||||
val pb : Publication = t2._1
|
||||
val pa : Publication = t2._2
|
||||
val pb: Publication = t2._1
|
||||
val pa: Publication = t2._2
|
||||
assertEquals(1, pa.getInstance().size())
|
||||
assertEquals(1, pb.getInstance().size())
|
||||
assertTrue(t2._1.getId.equals(t2._2.getId))
|
||||
if(pb.getId.equals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9")){
|
||||
assertTrue(pa.getInstance().get(0).getHostedby.getKey.equals("10|issn___print::e4b6d6d978f67520f6f37679a98c5735"))
|
||||
if (pb.getId.equals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9")) {
|
||||
assertTrue(
|
||||
pa.getInstance()
|
||||
.get(0)
|
||||
.getHostedby
|
||||
.getKey
|
||||
.equals("10|issn___print::e4b6d6d978f67520f6f37679a98c5735")
|
||||
)
|
||||
assertTrue(pa.getInstance().get(0).getHostedby.getValue.equals("Academic Therapy"))
|
||||
assertTrue(pa.getInstance().get(0).getAccessright.getClassid.equals("OPEN"))
|
||||
assertTrue(pa.getInstance().get(0).getAccessright.getClassname.equals("Open Access"))
|
||||
assertTrue(pa.getInstance().get(0).getAccessright.getOpenAccessRoute.equals(OpenAccessRoute.gold))
|
||||
assertTrue(
|
||||
pa.getInstance().get(0).getAccessright.getOpenAccessRoute.equals(OpenAccessRoute.gold)
|
||||
)
|
||||
assertTrue(pa.getBestaccessright.getClassid.equals("OPEN"))
|
||||
assertTrue(pa.getBestaccessright.getClassname.equals("Open Access"))
|
||||
|
||||
|
||||
assertTrue(pb.getInstance().get(0).getHostedby.getKey.equals("10|openaire____::0b74b6a356bbf23c245f9ae9a748745c"))
|
||||
assertTrue(pb.getInstance().get(0).getHostedby.getValue.equals("Revistas de investigación Universidad Nacional Mayor de San Marcos"))
|
||||
assertTrue(
|
||||
pb.getInstance()
|
||||
.get(0)
|
||||
.getHostedby
|
||||
.getKey
|
||||
.equals("10|openaire____::0b74b6a356bbf23c245f9ae9a748745c")
|
||||
)
|
||||
assertTrue(
|
||||
pb.getInstance()
|
||||
.get(0)
|
||||
.getHostedby
|
||||
.getValue
|
||||
.equals("Revistas de investigación Universidad Nacional Mayor de San Marcos")
|
||||
)
|
||||
assertTrue(pb.getInstance().get(0).getAccessright.getClassname.equals("not available"))
|
||||
assertTrue(pb.getInstance().get(0).getAccessright.getClassid.equals("UNKNOWN"))
|
||||
assertTrue(pb.getInstance().get(0).getAccessright.getOpenAccessRoute == null)
|
||||
assertTrue(pb.getBestaccessright.getClassid.equals("UNKNOWN"))
|
||||
assertTrue(pb.getBestaccessright.getClassname.equals("not available"))
|
||||
|
||||
}else{
|
||||
assertTrue(pa.getInstance().get(0).getHostedby.getKey.equals(pb.getInstance().get(0).getHostedby.getKey))
|
||||
assertTrue(pa.getInstance().get(0).getHostedby.getValue.equals(pb.getInstance().get(0).getHostedby.getValue))
|
||||
assertTrue(pa.getInstance().get(0).getAccessright.getClassid.equals(pb.getInstance().get(0).getAccessright.getClassid))
|
||||
assertTrue(pa.getInstance().get(0).getAccessright.getClassname.equals(pb.getInstance().get(0).getAccessright.getClassname))
|
||||
assertTrue(pa.getInstance().get(0).getAccessright.getOpenAccessRoute == pb.getInstance().get(0).getAccessright.getOpenAccessRoute)
|
||||
} else {
|
||||
assertTrue(
|
||||
pa.getInstance()
|
||||
.get(0)
|
||||
.getHostedby
|
||||
.getKey
|
||||
.equals(pb.getInstance().get(0).getHostedby.getKey)
|
||||
)
|
||||
assertTrue(
|
||||
pa.getInstance()
|
||||
.get(0)
|
||||
.getHostedby
|
||||
.getValue
|
||||
.equals(pb.getInstance().get(0).getHostedby.getValue)
|
||||
)
|
||||
assertTrue(
|
||||
pa.getInstance()
|
||||
.get(0)
|
||||
.getAccessright
|
||||
.getClassid
|
||||
.equals(pb.getInstance().get(0).getAccessright.getClassid)
|
||||
)
|
||||
assertTrue(
|
||||
pa.getInstance()
|
||||
.get(0)
|
||||
.getAccessright
|
||||
.getClassname
|
||||
.equals(pb.getInstance().get(0).getAccessright.getClassname)
|
||||
)
|
||||
assertTrue(
|
||||
pa.getInstance().get(0).getAccessright.getOpenAccessRoute == pb
|
||||
.getInstance()
|
||||
.get(0)
|
||||
.getAccessright
|
||||
.getOpenAccessRoute
|
||||
)
|
||||
|
||||
}
|
||||
})
|
||||
|
@ -80,9 +130,8 @@ class TestApply extends java.io.Serializable{
|
|||
spark.close()
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def testApplyOnDatasource():Unit = {
|
||||
def testApplyOnDatasource(): Unit = {
|
||||
val conf = new SparkConf()
|
||||
conf.setMaster("local[*]")
|
||||
conf.set("spark.driver.host", "localhost")
|
||||
|
@ -95,38 +144,49 @@ class TestApply extends java.io.Serializable{
|
|||
val dats = getClass.getResource("datasource.json").getPath
|
||||
val hbm = getClass.getResource("preparedInfo2.json").getPath
|
||||
|
||||
val mapper:ObjectMapper = new ObjectMapper()
|
||||
val mapper: ObjectMapper = new ObjectMapper()
|
||||
|
||||
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||
implicit val mapEncoderPubInfo: Encoder[Datasource] = Encoders.bean(classOf[Datasource])
|
||||
|
||||
|
||||
val dats_ds :Dataset[Datasource] = spark.read.textFile(dats).map(p => mapper.readValue(p, classOf[Datasource]))
|
||||
val hbm_ds :Dataset[EntityInfo] = Aggregators.datasourceToSingleId(spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo])))
|
||||
|
||||
val dats_ds: Dataset[Datasource] =
|
||||
spark.read.textFile(dats).map(p => mapper.readValue(p, classOf[Datasource]))
|
||||
val hbm_ds: Dataset[EntityInfo] = Aggregators.datasourceToSingleId(
|
||||
spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
||||
)
|
||||
|
||||
assertEquals(10, dats_ds.count())
|
||||
|
||||
val ds:Dataset[Datasource] = SparkApplyHostedByMapToDatasource.applyHBtoDats(hbm_ds, dats_ds)
|
||||
val ds: Dataset[Datasource] = SparkApplyHostedByMapToDatasource.applyHBtoDats(hbm_ds, dats_ds)
|
||||
|
||||
assertEquals(10, ds.count)
|
||||
assertEquals(10, ds.count)
|
||||
|
||||
val temp: Dataset[(Datasource, Datasource)] = dats_ds.joinWith(ds, dats_ds.col("id").equalTo(ds.col("id")), "left")
|
||||
val temp: Dataset[(Datasource, Datasource)] =
|
||||
dats_ds.joinWith(ds, dats_ds.col("id").equalTo(ds.col("id")), "left")
|
||||
assertEquals(10, temp.count())
|
||||
temp.foreach(t2 => {
|
||||
val pb : Datasource = t2._1
|
||||
val pa : Datasource = t2._2
|
||||
val pb: Datasource = t2._1
|
||||
val pa: Datasource = t2._2
|
||||
assertTrue(t2._1.getId.equals(t2._2.getId))
|
||||
if(pb.getId.equals("10|doajarticles::0ab37b7620eb9a73ac95d3ca4320c97d")) {
|
||||
if (pb.getId.equals("10|doajarticles::0ab37b7620eb9a73ac95d3ca4320c97d")) {
|
||||
assertTrue(pa.getOpenairecompatibility().getClassid.equals("hostedBy"))
|
||||
assertTrue(pa.getOpenairecompatibility().getClassname.equals("collected from a compatible aggregator"))
|
||||
assertTrue(
|
||||
pa.getOpenairecompatibility()
|
||||
.getClassname
|
||||
.equals("collected from a compatible aggregator")
|
||||
)
|
||||
|
||||
assertTrue(pb.getOpenairecompatibility().getClassid.equals(ModelConstants.UNKNOWN))
|
||||
|
||||
|
||||
} else {
|
||||
assertTrue(pa.getOpenairecompatibility().getClassid.equals(pb.getOpenairecompatibility.getClassid))
|
||||
assertTrue(pa.getOpenairecompatibility().getClassname.equals(pb.getOpenairecompatibility.getClassname))
|
||||
assertTrue(
|
||||
pa.getOpenairecompatibility().getClassid.equals(pb.getOpenairecompatibility.getClassid)
|
||||
)
|
||||
assertTrue(
|
||||
pa.getOpenairecompatibility()
|
||||
.getClassname
|
||||
.equals(pb.getOpenairecompatibility.getClassname)
|
||||
)
|
||||
|
||||
}
|
||||
})
|
||||
|
|
|
@ -1,7 +1,11 @@
|
|||
package eu.dnetlib.dhp.oa.graph.hostedbymap
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.oa.graph.hostedbymap.SparkPrepareHostedByInfoToApply.{joinResHBM, prepareResultInfo, toEntityInfo}
|
||||
import eu.dnetlib.dhp.oa.graph.hostedbymap.SparkPrepareHostedByInfoToApply.{
|
||||
joinResHBM,
|
||||
prepareResultInfo,
|
||||
toEntityInfo
|
||||
}
|
||||
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
|
||||
|
@ -9,9 +13,9 @@ import org.json4s.DefaultFormats
|
|||
import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue}
|
||||
import org.junit.jupiter.api.Test
|
||||
|
||||
class TestPrepare extends java.io.Serializable{
|
||||
class TestPrepare extends java.io.Serializable {
|
||||
|
||||
def getString(input:HostedByItemType):String = {
|
||||
def getString(input: HostedByItemType): String = {
|
||||
|
||||
import org.json4s.jackson.Serialization.write
|
||||
implicit val formats = DefaultFormats
|
||||
|
@ -19,9 +23,8 @@ class TestPrepare extends java.io.Serializable{
|
|||
write(input)
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def testHostedByMaptoEntityInfo() : Unit = {
|
||||
def testHostedByMaptoEntityInfo(): Unit = {
|
||||
val conf = new SparkConf()
|
||||
conf.setMaster("local[*]")
|
||||
conf.set("spark.driver.host", "localhost")
|
||||
|
@ -33,23 +36,23 @@ class TestPrepare extends java.io.Serializable{
|
|||
.getOrCreate()
|
||||
val hbm = getClass.getResource("hostedbymap.json").getPath
|
||||
|
||||
|
||||
import spark.implicits._
|
||||
|
||||
val mapper:ObjectMapper = new ObjectMapper()
|
||||
val mapper: ObjectMapper = new ObjectMapper()
|
||||
|
||||
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||
|
||||
val ds :Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hbm)).map(toEntityInfo)
|
||||
val ds: Dataset[EntityInfo] =
|
||||
spark.createDataset(spark.sparkContext.textFile(hbm)).map(toEntityInfo)
|
||||
|
||||
ds.foreach(e => println(mapper.writeValueAsString(e)))
|
||||
|
||||
assertEquals(20, ds.count)
|
||||
assertEquals(20, ds.count)
|
||||
spark.close()
|
||||
}
|
||||
|
||||
@Test
|
||||
def testPublicationtoEntityInfo() : Unit = {
|
||||
def testPublicationtoEntityInfo(): Unit = {
|
||||
val conf = new SparkConf()
|
||||
conf.setMaster("local[*]")
|
||||
conf.set("spark.driver.host", "localhost")
|
||||
|
@ -61,24 +64,30 @@ class TestPrepare extends java.io.Serializable{
|
|||
.getOrCreate()
|
||||
val path = getClass.getResource("publication.json").getPath
|
||||
|
||||
val mapper:ObjectMapper = new ObjectMapper()
|
||||
val mapper: ObjectMapper = new ObjectMapper()
|
||||
|
||||
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||
|
||||
val ds :Dataset[EntityInfo] = prepareResultInfo(spark, path)
|
||||
val ds: Dataset[EntityInfo] = prepareResultInfo(spark, path)
|
||||
|
||||
ds.foreach(e => println(mapper.writeValueAsString(e)))
|
||||
|
||||
assertEquals(2, ds.count)
|
||||
assertEquals(2, ds.count)
|
||||
|
||||
assertEquals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9", ds.filter(ei => ei.getJournalId.equals("1728-5852")).first().getId)
|
||||
assertEquals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9", ds.filter(ei => ei.getJournalId.equals("0001-396X")).first().getId)
|
||||
assertEquals(
|
||||
"50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9",
|
||||
ds.filter(ei => ei.getJournalId.equals("1728-5852")).first().getId
|
||||
)
|
||||
assertEquals(
|
||||
"50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9",
|
||||
ds.filter(ei => ei.getJournalId.equals("0001-396X")).first().getId
|
||||
)
|
||||
|
||||
spark.close()
|
||||
}
|
||||
|
||||
@Test
|
||||
def testJoinResHBM (): Unit = {
|
||||
def testJoinResHBM(): Unit = {
|
||||
val conf = new SparkConf()
|
||||
conf.setMaster("local[*]")
|
||||
conf.set("spark.driver.host", "localhost")
|
||||
|
@ -91,18 +100,20 @@ class TestPrepare extends java.io.Serializable{
|
|||
val pub = getClass.getResource("iteminfofrompublication").getPath
|
||||
val hbm = getClass.getResource("iteminfofromhostedbymap.json").getPath
|
||||
|
||||
val mapper:ObjectMapper = new ObjectMapper()
|
||||
val mapper: ObjectMapper = new ObjectMapper()
|
||||
|
||||
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||
|
||||
val pub_ds :Dataset[EntityInfo] = spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
||||
val hbm_ds :Dataset[EntityInfo] = spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
||||
val pub_ds: Dataset[EntityInfo] =
|
||||
spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
||||
val hbm_ds: Dataset[EntityInfo] =
|
||||
spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
||||
|
||||
val ds: Dataset[EntityInfo] = joinResHBM(pub_ds, hbm_ds)
|
||||
|
||||
assertEquals(1, ds.count)
|
||||
assertEquals(1, ds.count)
|
||||
|
||||
val ei:EntityInfo = ds.first()
|
||||
val ei: EntityInfo = ds.first()
|
||||
|
||||
assertEquals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9", ei.getId)
|
||||
assertEquals("10|issn___print::e4b6d6d978f67520f6f37679a98c5735", ei.getHostedById)
|
||||
|
@ -114,7 +125,7 @@ class TestPrepare extends java.io.Serializable{
|
|||
}
|
||||
|
||||
@Test
|
||||
def testJoinResHBM2 (): Unit = {
|
||||
def testJoinResHBM2(): Unit = {
|
||||
val conf = new SparkConf()
|
||||
conf.setMaster("local[*]")
|
||||
conf.set("spark.driver.host", "localhost")
|
||||
|
@ -127,18 +138,20 @@ class TestPrepare extends java.io.Serializable{
|
|||
val pub = getClass.getResource("iteminfofrompublication2").getPath
|
||||
val hbm = getClass.getResource("iteminfofromhostedbymap2.json").getPath
|
||||
|
||||
val mapper:ObjectMapper = new ObjectMapper()
|
||||
val mapper: ObjectMapper = new ObjectMapper()
|
||||
|
||||
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||
|
||||
val pub_ds :Dataset[EntityInfo] = spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
||||
val hbm_ds :Dataset[EntityInfo] = spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
||||
val pub_ds: Dataset[EntityInfo] =
|
||||
spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
||||
val hbm_ds: Dataset[EntityInfo] =
|
||||
spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
|
||||
|
||||
val ds: Dataset[EntityInfo] = joinResHBM(pub_ds, hbm_ds)
|
||||
|
||||
assertEquals(1, ds.count)
|
||||
assertEquals(1, ds.count)
|
||||
|
||||
val ei:EntityInfo = ds.first()
|
||||
val ei: EntityInfo = ds.first()
|
||||
|
||||
assertEquals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9", ei.getId)
|
||||
assertEquals("10|issn___print::e4b6d6d978f67520f6f37679a98c5735", ei.getHostedById)
|
||||
|
@ -150,6 +163,4 @@ class TestPrepare extends java.io.Serializable{
|
|||
spark.close()
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -8,20 +8,19 @@ import org.json4s.jackson.Serialization.write
|
|||
import org.junit.jupiter.api.Assertions._
|
||||
import org.junit.jupiter.api.Test
|
||||
|
||||
class TestPreprocess extends java.io.Serializable{
|
||||
class TestPreprocess extends java.io.Serializable {
|
||||
|
||||
implicit val mapEncoderDats: Encoder[Datasource] = Encoders.kryo[Datasource]
|
||||
implicit val schema = Encoders.product[HostedByInfo]
|
||||
|
||||
|
||||
def toHBIString (hbi:HostedByItemType): String = {
|
||||
def toHBIString(hbi: HostedByItemType): String = {
|
||||
implicit val formats = DefaultFormats
|
||||
|
||||
write(hbi)
|
||||
}
|
||||
|
||||
@Test
|
||||
def readDatasource():Unit = {
|
||||
def readDatasource(): Unit = {
|
||||
val conf = new SparkConf()
|
||||
conf.setMaster("local[*]")
|
||||
conf.set("spark.driver.host", "localhost")
|
||||
|
@ -33,29 +32,41 @@ class TestPreprocess extends java.io.Serializable{
|
|||
.getOrCreate()
|
||||
val path = getClass.getResource("datasource.json").getPath
|
||||
|
||||
val ds :Dataset[HostedByItemType]= SparkProduceHostedByMap.oaHostedByDataset(spark, path)
|
||||
val ds: Dataset[HostedByItemType] = SparkProduceHostedByMap.oaHostedByDataset(spark, path)
|
||||
|
||||
assertEquals(9, ds.count)
|
||||
assertEquals(9, ds.count)
|
||||
|
||||
assertEquals(8, ds.filter(hbi => !hbi.issn.equals("")).count)
|
||||
assertEquals(5, ds.filter(hbi => !hbi.eissn.equals("")).count)
|
||||
assertEquals(0, ds.filter(hbi => !hbi.lissn.equals("")).count)
|
||||
|
||||
assertEquals(0, ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count)
|
||||
assertEquals(
|
||||
0,
|
||||
ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count
|
||||
)
|
||||
|
||||
assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365")).count == 1)
|
||||
assertTrue(ds.filter(hbi => hbi.eissn.equals("2253-900X")).count == 1)
|
||||
assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.eissn.equals("2253-900X")).count == 1)
|
||||
assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.officialname.equals("Thémata")).count == 1)
|
||||
assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.id.equals("10|doajarticles::abbc9265bea9ff62776a1c39785af00c")).count == 1)
|
||||
assertTrue(
|
||||
ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.eissn.equals("2253-900X")).count == 1
|
||||
)
|
||||
assertTrue(
|
||||
ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.officialname.equals("Thémata"))
|
||||
.count == 1
|
||||
)
|
||||
assertTrue(
|
||||
ds.filter(hbi =>
|
||||
hbi.issn.equals("0212-8365") && hbi.id
|
||||
.equals("10|doajarticles::abbc9265bea9ff62776a1c39785af00c")
|
||||
).count == 1
|
||||
)
|
||||
ds.foreach(hbi => assertTrue(hbi.id.startsWith("10|")))
|
||||
ds.foreach(hbi => println(toHBIString(hbi)))
|
||||
spark.close()
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def readGold():Unit = {
|
||||
def readGold(): Unit = {
|
||||
val conf = new SparkConf()
|
||||
conf.setMaster("local[*]")
|
||||
conf.set("spark.driver.host", "localhost")
|
||||
|
@ -67,8 +78,7 @@ class TestPreprocess extends java.io.Serializable{
|
|||
.getOrCreate()
|
||||
val path = getClass.getResource("unibi_transformed.json").getPath
|
||||
|
||||
|
||||
val ds :Dataset[HostedByItemType]= SparkProduceHostedByMap.goldHostedByDataset(spark, path)
|
||||
val ds: Dataset[HostedByItemType] = SparkProduceHostedByMap.goldHostedByDataset(spark, path)
|
||||
|
||||
assertEquals(29, ds.count)
|
||||
|
||||
|
@ -76,9 +86,17 @@ class TestPreprocess extends java.io.Serializable{
|
|||
assertEquals(0, ds.filter(hbi => !hbi.eissn.equals("")).count)
|
||||
assertEquals(29, ds.filter(hbi => !hbi.lissn.equals("")).count)
|
||||
|
||||
assertEquals(0, ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count)
|
||||
assertEquals(
|
||||
0,
|
||||
ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count
|
||||
)
|
||||
|
||||
assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).first().officialname.equals("European journal of sustainable development."))
|
||||
assertTrue(
|
||||
ds.filter(hbi => hbi.issn.equals("2239-6101"))
|
||||
.first()
|
||||
.officialname
|
||||
.equals("European journal of sustainable development.")
|
||||
)
|
||||
assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).first().lissn.equals("2239-5938"))
|
||||
assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).count == 1)
|
||||
ds.foreach(hbi => assertTrue(hbi.id.equals(Constants.UNIBI)))
|
||||
|
@ -88,7 +106,7 @@ class TestPreprocess extends java.io.Serializable{
|
|||
}
|
||||
|
||||
@Test
|
||||
def readDoaj():Unit = {
|
||||
def readDoaj(): Unit = {
|
||||
val conf = new SparkConf()
|
||||
conf.setMaster("local[*]")
|
||||
conf.set("spark.driver.host", "localhost")
|
||||
|
@ -100,7 +118,7 @@ class TestPreprocess extends java.io.Serializable{
|
|||
.getOrCreate()
|
||||
val path = getClass.getResource("doaj_transformed.json").getPath
|
||||
|
||||
val ds :Dataset[HostedByItemType]= SparkProduceHostedByMap.doajHostedByDataset(spark, path)
|
||||
val ds: Dataset[HostedByItemType] = SparkProduceHostedByMap.doajHostedByDataset(spark, path)
|
||||
|
||||
assertEquals(25, ds.count)
|
||||
|
||||
|
@ -108,9 +126,17 @@ class TestPreprocess extends java.io.Serializable{
|
|||
assertEquals(21, ds.filter(hbi => !hbi.eissn.equals("")).count)
|
||||
assertEquals(0, ds.filter(hbi => !hbi.lissn.equals("")).count)
|
||||
|
||||
assertEquals(0, ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count)
|
||||
assertEquals(
|
||||
0,
|
||||
ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count
|
||||
)
|
||||
|
||||
assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).first().officialname.equals("Journal of Space Technology"))
|
||||
assertTrue(
|
||||
ds.filter(hbi => hbi.issn.equals("2077-3099"))
|
||||
.first()
|
||||
.officialname
|
||||
.equals("Journal of Space Technology")
|
||||
)
|
||||
assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).first().eissn.equals("2411-5029"))
|
||||
assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).count == 1)
|
||||
assertTrue(ds.filter(hbi => hbi.eissn.equals("2077-2955")).first().issn.equals(""))
|
||||
|
@ -121,7 +147,7 @@ class TestPreprocess extends java.io.Serializable{
|
|||
}
|
||||
|
||||
@Test
|
||||
def testAggregator() : Unit = {
|
||||
def testAggregator(): Unit = {
|
||||
|
||||
val conf = new SparkConf()
|
||||
conf.setMaster("local[*]")
|
||||
|
@ -133,22 +159,40 @@ class TestPreprocess extends java.io.Serializable{
|
|||
.config(conf)
|
||||
.getOrCreate()
|
||||
|
||||
|
||||
val tmp = SparkProduceHostedByMap.oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath)
|
||||
.union(SparkProduceHostedByMap.goldHostedByDataset(spark,getClass.getResource("unibi_transformed.json").getPath))
|
||||
.union(SparkProduceHostedByMap.doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath))
|
||||
.flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]))
|
||||
val tmp = SparkProduceHostedByMap
|
||||
.oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath)
|
||||
.union(
|
||||
SparkProduceHostedByMap
|
||||
.goldHostedByDataset(spark, getClass.getResource("unibi_transformed.json").getPath)
|
||||
)
|
||||
.union(
|
||||
SparkProduceHostedByMap
|
||||
.doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath)
|
||||
)
|
||||
.flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(
|
||||
Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
|
||||
)
|
||||
|
||||
assertEquals(106, tmp.count)
|
||||
assertEquals(82, tmp.map(i => i._1)(Encoders.STRING).distinct().count)
|
||||
|
||||
val ds: Dataset[(String, HostedByItemType)] = Aggregators.explodeHostedByItemType(
|
||||
SparkProduceHostedByMap
|
||||
.oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath)
|
||||
.union(
|
||||
SparkProduceHostedByMap
|
||||
.goldHostedByDataset(spark, getClass.getResource("unibi_transformed.json").getPath)
|
||||
)
|
||||
.union(
|
||||
SparkProduceHostedByMap
|
||||
.doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath)
|
||||
)
|
||||
.flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(
|
||||
Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
|
||||
)
|
||||
)
|
||||
|
||||
val ds :Dataset[(String, HostedByItemType)] = Aggregators.explodeHostedByItemType(SparkProduceHostedByMap.oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath)
|
||||
.union(SparkProduceHostedByMap.goldHostedByDataset(spark,getClass.getResource("unibi_transformed.json").getPath))
|
||||
.union(SparkProduceHostedByMap.doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath))
|
||||
.flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])))
|
||||
|
||||
assertEquals(82, ds.count)
|
||||
assertEquals(82, ds.count)
|
||||
|
||||
assertEquals(13, ds.filter(i => i._2.id.startsWith("10|")).count)
|
||||
|
||||
|
@ -156,14 +200,13 @@ class TestPreprocess extends java.io.Serializable{
|
|||
assertTrue(ds.filter(i => i._1.equals("2077-3757")).first()._2.openAccess)
|
||||
assertEquals(1, ds.filter(i => i._1.equals("2077-3757")).count)
|
||||
|
||||
val hbmap : Dataset[String] = ds.filter(hbi => hbi._2.id.startsWith("10|")).map(SparkProduceHostedByMap.toHostedByMap)(Encoders.STRING)
|
||||
val hbmap: Dataset[String] = ds
|
||||
.filter(hbi => hbi._2.id.startsWith("10|"))
|
||||
.map(SparkProduceHostedByMap.toHostedByMap)(Encoders.STRING)
|
||||
|
||||
hbmap.foreach(entry => println(entry))
|
||||
spark.close()
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
package eu.dnetlib.dhp.oa.graph.resolution
|
||||
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.schema.common.EntityType
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
|
||||
|
@ -19,174 +18,241 @@ import scala.io.Source
|
|||
@TestInstance(Lifecycle.PER_CLASS)
|
||||
class ResolveEntitiesTest extends Serializable {
|
||||
|
||||
var workingDir:Path = null
|
||||
var workingDir: Path = null
|
||||
|
||||
val FAKE_TITLE = "FAKETITLE"
|
||||
val FAKE_SUBJECT = "FAKESUBJECT"
|
||||
|
||||
var sparkSession:Option[SparkSession] = None
|
||||
|
||||
var sparkSession: Option[SparkSession] = None
|
||||
|
||||
@BeforeAll
|
||||
def setUp() :Unit = {
|
||||
def setUp(): Unit = {
|
||||
workingDir = Files.createTempDirectory(getClass.getSimpleName)
|
||||
|
||||
val conf = new SparkConf()
|
||||
sparkSession = Some(SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master("local[*]").getOrCreate())
|
||||
sparkSession = Some(
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master("local[*]")
|
||||
.getOrCreate()
|
||||
)
|
||||
populateDatasets(sparkSession.get)
|
||||
generateUpdates(sparkSession.get)
|
||||
|
||||
}
|
||||
|
||||
|
||||
@AfterAll
|
||||
def tearDown():Unit = {
|
||||
def tearDown(): Unit = {
|
||||
FileUtils.deleteDirectory(workingDir.toFile)
|
||||
sparkSession.get.stop()
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
def generateUpdates(spark:SparkSession):Unit = {
|
||||
def generateUpdates(spark: SparkSession): Unit = {
|
||||
val template = Source.fromInputStream(this.getClass.getResourceAsStream("updates")).mkString
|
||||
|
||||
val pids: List[String] = template.lines
|
||||
.map { id =>
|
||||
val r = new Result
|
||||
r.setId(id.toLowerCase.trim)
|
||||
r.setSubject(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
FAKE_SUBJECT,
|
||||
OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
|
||||
null
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
r.setTitle(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
FAKE_TITLE,
|
||||
OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
|
||||
null
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
r
|
||||
}
|
||||
.map { r =>
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
val pids:List[String] = template.lines.map{id =>
|
||||
val r = new Result
|
||||
r.setId(id.toLowerCase.trim)
|
||||
r.setSubject(List(OafMapperUtils.structuredProperty(FAKE_SUBJECT, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", "fossiFIgo"), null)).asJava)
|
||||
r.setTitle(List(OafMapperUtils.structuredProperty(FAKE_TITLE, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", "fossiFIgo"), null)).asJava)
|
||||
r
|
||||
}.map{r =>
|
||||
val mapper = new ObjectMapper()
|
||||
mapper.writeValueAsString(r)
|
||||
}
|
||||
.toList
|
||||
|
||||
mapper.writeValueAsString(r)}.toList
|
||||
|
||||
|
||||
val sc =spark.sparkContext
|
||||
val sc = spark.sparkContext
|
||||
|
||||
println(sc.parallelize(pids).count())
|
||||
|
||||
spark.createDataset(sc.parallelize(pids))(Encoders.STRING).write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$workingDir/updates")
|
||||
|
||||
|
||||
|
||||
|
||||
spark
|
||||
.createDataset(sc.parallelize(pids))(Encoders.STRING)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.text(s"$workingDir/updates")
|
||||
|
||||
import spark.implicits._
|
||||
implicit val resEncoder: Encoder[Result] = Encoders.bean(classOf[Result])
|
||||
val ds = spark.read.text(s"$workingDir/updates").as[String].map{s => val mapper = new ObjectMapper()
|
||||
mapper.readValue(s, classOf[Result])}.collect()
|
||||
|
||||
|
||||
|
||||
val ds = spark.read
|
||||
.text(s"$workingDir/updates")
|
||||
.as[String]
|
||||
.map { s =>
|
||||
val mapper = new ObjectMapper()
|
||||
mapper.readValue(s, classOf[Result])
|
||||
}
|
||||
.collect()
|
||||
|
||||
assertEquals(4, ds.length)
|
||||
ds.foreach{r => assertNotNull(r.getSubject)}
|
||||
ds.foreach{r => assertEquals(1,r.getSubject.size())}
|
||||
ds.foreach{r => assertNotNull(r.getTitle)}
|
||||
ds.foreach{r => assertEquals(1,r.getTitle.size())}
|
||||
ds.foreach { r => assertNotNull(r.getSubject) }
|
||||
ds.foreach { r => assertEquals(1, r.getSubject.size()) }
|
||||
ds.foreach { r => assertNotNull(r.getTitle) }
|
||||
ds.foreach { r => assertEquals(1, r.getTitle.size()) }
|
||||
|
||||
|
||||
|
||||
ds.flatMap(r => r.getTitle.asScala.map(t => t.getValue)).foreach(t => assertEquals(FAKE_TITLE,t))
|
||||
ds.flatMap(r => r.getSubject.asScala.map(t => t.getValue)).foreach(t => assertEquals(FAKE_SUBJECT,t))
|
||||
ds.flatMap(r => r.getTitle.asScala.map(t => t.getValue))
|
||||
.foreach(t => assertEquals(FAKE_TITLE, t))
|
||||
ds.flatMap(r => r.getSubject.asScala.map(t => t.getValue))
|
||||
.foreach(t => assertEquals(FAKE_SUBJECT, t))
|
||||
|
||||
println("generated Updates")
|
||||
}
|
||||
|
||||
|
||||
def populateDatasets(spark:SparkSession):Unit = {
|
||||
def populateDatasets(spark: SparkSession): Unit = {
|
||||
import spark.implicits._
|
||||
val entities =SparkResolveEntities.entities
|
||||
val entities = SparkResolveEntities.entities
|
||||
|
||||
entities.foreach{
|
||||
e =>
|
||||
val template = Source.fromInputStream(this.getClass.getResourceAsStream(s"$e")).mkString
|
||||
spark.createDataset(spark.sparkContext.parallelize(template.lines.toList)).as[String].write.option("compression", "gzip").text(s"$workingDir/graph/$e")
|
||||
println(s"Created Dataset $e")
|
||||
entities.foreach { e =>
|
||||
val template = Source.fromInputStream(this.getClass.getResourceAsStream(s"$e")).mkString
|
||||
spark
|
||||
.createDataset(spark.sparkContext.parallelize(template.lines.toList))
|
||||
.as[String]
|
||||
.write
|
||||
.option("compression", "gzip")
|
||||
.text(s"$workingDir/graph/$e")
|
||||
println(s"Created Dataset $e")
|
||||
}
|
||||
SparkResolveRelation.extractPidResolvedTableFromJsonRDD(spark, s"$workingDir/graph", s"$workingDir/work")
|
||||
SparkResolveRelation.extractPidResolvedTableFromJsonRDD(
|
||||
spark,
|
||||
s"$workingDir/graph",
|
||||
s"$workingDir/work"
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def testResolution():Unit = {
|
||||
val spark:SparkSession = sparkSession.get
|
||||
def testResolution(): Unit = {
|
||||
val spark: SparkSession = sparkSession.get
|
||||
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
||||
SparkResolveEntities.resolveEntities(spark,s"$workingDir/work", s"$workingDir/updates" )
|
||||
SparkResolveEntities.resolveEntities(spark, s"$workingDir/work", s"$workingDir/updates")
|
||||
|
||||
val ds = spark.read.load(s"$workingDir/work/resolvedEntities").as[Result]
|
||||
|
||||
assertEquals(3, ds.count())
|
||||
|
||||
ds.collect().foreach{
|
||||
r =>
|
||||
ds.collect().foreach { r =>
|
||||
assertTrue(r.getId.startsWith("50"))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
private def structuredPContainsValue(l:java.util.List[StructuredProperty], exptectedValue:String):Boolean = {
|
||||
l.asScala.exists(p =>p.getValue!= null && p.getValue.equalsIgnoreCase(exptectedValue))
|
||||
private def structuredPContainsValue(
|
||||
l: java.util.List[StructuredProperty],
|
||||
exptectedValue: String
|
||||
): Boolean = {
|
||||
l.asScala.exists(p => p.getValue != null && p.getValue.equalsIgnoreCase(exptectedValue))
|
||||
}
|
||||
|
||||
@Test
|
||||
def testUpdate():Unit = {
|
||||
val spark:SparkSession = sparkSession.get
|
||||
def testUpdate(): Unit = {
|
||||
val spark: SparkSession = sparkSession.get
|
||||
import spark.implicits._
|
||||
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
||||
val m = new ObjectMapper()
|
||||
SparkResolveEntities.resolveEntities(spark,s"$workingDir/work", s"$workingDir/updates" )
|
||||
SparkResolveEntities.generateResolvedEntities(spark,s"$workingDir/work",s"$workingDir/graph", s"$workingDir/target" )
|
||||
|
||||
|
||||
|
||||
val pubDS:Dataset[Result] = spark.read.text(s"$workingDir/target/publication").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.publication))
|
||||
val t = pubDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
|
||||
SparkResolveEntities.resolveEntities(spark, s"$workingDir/work", s"$workingDir/updates")
|
||||
SparkResolveEntities.generateResolvedEntities(
|
||||
spark,
|
||||
s"$workingDir/work",
|
||||
s"$workingDir/graph",
|
||||
s"$workingDir/target"
|
||||
)
|
||||
|
||||
val pubDS: Dataset[Result] = spark.read
|
||||
.text(s"$workingDir/target/publication")
|
||||
.as[String]
|
||||
.map(s => SparkResolveEntities.deserializeObject(s, EntityType.publication))
|
||||
val t = pubDS
|
||||
.filter(p => p.getTitle != null && p.getSubject != null)
|
||||
.filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE")))
|
||||
.count()
|
||||
|
||||
var ct = pubDS.count()
|
||||
var et = pubDS.filter(p => p.getTitle!= null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)).count()
|
||||
var et = pubDS
|
||||
.filter(p =>
|
||||
p.getTitle != null && p.getTitle.asScala.forall(t =>
|
||||
t.getValue != null && t.getValue.nonEmpty
|
||||
)
|
||||
)
|
||||
.count()
|
||||
|
||||
assertEquals(ct, et)
|
||||
|
||||
|
||||
|
||||
val datDS:Dataset[Result] = spark.read.text(s"$workingDir/target/dataset").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.dataset))
|
||||
val td = datDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
|
||||
val datDS: Dataset[Result] = spark.read
|
||||
.text(s"$workingDir/target/dataset")
|
||||
.as[String]
|
||||
.map(s => SparkResolveEntities.deserializeObject(s, EntityType.dataset))
|
||||
val td = datDS
|
||||
.filter(p => p.getTitle != null && p.getSubject != null)
|
||||
.filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE")))
|
||||
.count()
|
||||
ct = datDS.count()
|
||||
et = datDS.filter(p => p.getTitle!= null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)).count()
|
||||
et = datDS
|
||||
.filter(p =>
|
||||
p.getTitle != null && p.getTitle.asScala.forall(t =>
|
||||
t.getValue != null && t.getValue.nonEmpty
|
||||
)
|
||||
)
|
||||
.count()
|
||||
assertEquals(ct, et)
|
||||
|
||||
|
||||
val softDS:Dataset[Result] = spark.read.text(s"$workingDir/target/software").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.software))
|
||||
val ts = softDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
|
||||
val softDS: Dataset[Result] = spark.read
|
||||
.text(s"$workingDir/target/software")
|
||||
.as[String]
|
||||
.map(s => SparkResolveEntities.deserializeObject(s, EntityType.software))
|
||||
val ts = softDS
|
||||
.filter(p => p.getTitle != null && p.getSubject != null)
|
||||
.filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE")))
|
||||
.count()
|
||||
ct = softDS.count()
|
||||
et = softDS.filter(p => p.getTitle!= null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)).count()
|
||||
et = softDS
|
||||
.filter(p =>
|
||||
p.getTitle != null && p.getTitle.asScala.forall(t =>
|
||||
t.getValue != null && t.getValue.nonEmpty
|
||||
)
|
||||
)
|
||||
.count()
|
||||
assertEquals(ct, et)
|
||||
|
||||
|
||||
val orpDS:Dataset[Result] = spark.read.text(s"$workingDir/target/otherresearchproduct").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.otherresearchproduct))
|
||||
val to = orpDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
|
||||
|
||||
val orpDS: Dataset[Result] = spark.read
|
||||
.text(s"$workingDir/target/otherresearchproduct")
|
||||
.as[String]
|
||||
.map(s => SparkResolveEntities.deserializeObject(s, EntityType.otherresearchproduct))
|
||||
val to = orpDS
|
||||
.filter(p => p.getTitle != null && p.getSubject != null)
|
||||
.filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE")))
|
||||
.count()
|
||||
|
||||
ct = orpDS.count()
|
||||
et = orpDS.filter(p => p.getTitle!= null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)).count()
|
||||
et = orpDS
|
||||
.filter(p =>
|
||||
p.getTitle != null && p.getTitle.asScala.forall(t =>
|
||||
t.getValue != null && t.getValue.nonEmpty
|
||||
)
|
||||
)
|
||||
.count()
|
||||
assertEquals(ct, et)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
assertEquals(0, t)
|
||||
assertEquals(2, td)
|
||||
assertEquals(1, ts)
|
||||
|
@ -194,40 +260,35 @@ class ResolveEntitiesTest extends Serializable {
|
|||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
def testMerge():Unit = {
|
||||
def testMerge(): Unit = {
|
||||
|
||||
val r = new Result
|
||||
r.setSubject(List(OafMapperUtils.structuredProperty(FAKE_SUBJECT, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", "fossiFIgo"), null)).asJava)
|
||||
r.setSubject(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
FAKE_SUBJECT,
|
||||
OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
|
||||
null
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
val p = mapper.readValue(Source.fromInputStream(this.getClass.getResourceAsStream(s"publication")).mkString.lines.next(), classOf[Publication])
|
||||
|
||||
val p = mapper.readValue(
|
||||
Source
|
||||
.fromInputStream(this.getClass.getResourceAsStream(s"publication"))
|
||||
.mkString
|
||||
.lines
|
||||
.next(),
|
||||
classOf[Publication]
|
||||
)
|
||||
|
||||
r.mergeFrom(p)
|
||||
|
||||
|
||||
println(mapper.writeValueAsString(r))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -1,26 +1,20 @@
|
|||
package eu.dnetlib.dhp.sx.graph
|
||||
|
||||
import org.junit.jupiter.api.Test
|
||||
|
||||
import java.text.SimpleDateFormat
|
||||
|
||||
|
||||
|
||||
class RetrieveDataciteDeltaTest {
|
||||
|
||||
@Test
|
||||
def testParsingDate(): Unit = {
|
||||
|
||||
|
||||
val inputDate = "2021-12-02T11:17:36+0000"
|
||||
|
||||
val t = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ").parse(inputDate).getTime
|
||||
|
||||
|
||||
println(t)
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -18,78 +18,89 @@ import scala.collection.JavaConverters._
|
|||
import scala.io.Source
|
||||
|
||||
@ExtendWith(Array(classOf[MockitoExtension]))
|
||||
class ScholixGraphTest extends AbstractVocabularyTest{
|
||||
|
||||
class ScholixGraphTest extends AbstractVocabularyTest {
|
||||
|
||||
val mapper: ObjectMapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES,false)
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
|
||||
|
||||
@BeforeEach
|
||||
def setUp() :Unit = {
|
||||
def setUp(): Unit = {
|
||||
|
||||
super.setUpVocabulary()
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def testExtractPids():Unit = {
|
||||
def testExtractPids(): Unit = {
|
||||
|
||||
val input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/scholix/result.json")).mkString
|
||||
val res =SparkResolveRelation.extractPidsFromRecord(input)
|
||||
val input = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/scholix/result.json"))
|
||||
.mkString
|
||||
val res = SparkResolveRelation.extractPidsFromRecord(input)
|
||||
assertNotNull(res)
|
||||
|
||||
assertEquals(1,res._2.size)
|
||||
assertEquals(1, res._2.size)
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
def testOAFToSummary():Unit= {
|
||||
val inputRelations = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/oaf_to_summary")).mkString
|
||||
def testOAFToSummary(): Unit = {
|
||||
val inputRelations = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/oaf_to_summary"))
|
||||
.mkString
|
||||
val items = inputRelations.lines.toList
|
||||
assertNotNull(items)
|
||||
items.foreach(i =>assertTrue(i.nonEmpty))
|
||||
val result = items.map(r => mapper.readValue(r, classOf[Result])).map(i => ScholixUtils.resultToSummary(i))
|
||||
items.foreach(i => assertTrue(i.nonEmpty))
|
||||
val result =
|
||||
items.map(r => mapper.readValue(r, classOf[Result])).map(i => ScholixUtils.resultToSummary(i))
|
||||
|
||||
assertNotNull(result)
|
||||
|
||||
assertEquals(result.size, items.size)
|
||||
val d = result.find(s => s.getLocalIdentifier.asScala.exists(i => i.getUrl == null || i.getUrl.isEmpty))
|
||||
val d = result.find(s =>
|
||||
s.getLocalIdentifier.asScala.exists(i => i.getUrl == null || i.getUrl.isEmpty)
|
||||
)
|
||||
assertFalse(d.isDefined)
|
||||
println(mapper.writeValueAsString(result.head))
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
def testScholixMergeOnSource():Unit = {
|
||||
val inputRelations = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/merge_result_scholix")).mkString
|
||||
val result:List[(Relation,ScholixSummary)] =inputRelations.lines.sliding(2).map(s => (s.head, s(1))).map(p => (mapper.readValue(p._1, classOf[Relation]),mapper.readValue(p._2, classOf[ScholixSummary]) )).toList
|
||||
def testScholixMergeOnSource(): Unit = {
|
||||
val inputRelations = Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/merge_result_scholix")
|
||||
)
|
||||
.mkString
|
||||
val result: List[(Relation, ScholixSummary)] = inputRelations.lines
|
||||
.sliding(2)
|
||||
.map(s => (s.head, s(1)))
|
||||
.map(p =>
|
||||
(mapper.readValue(p._1, classOf[Relation]), mapper.readValue(p._2, classOf[ScholixSummary]))
|
||||
)
|
||||
.toList
|
||||
assertNotNull(result)
|
||||
assertTrue(result.nonEmpty)
|
||||
result.foreach(r => assertEquals(r._1.getSource, r._2.getId))
|
||||
val scholix:List[Scholix] = result.map(r => ScholixUtils.scholixFromSource(r._1, r._2))
|
||||
val scholix: List[Scholix] = result.map(r => ScholixUtils.scholixFromSource(r._1, r._2))
|
||||
println(mapper.writeValueAsString(scholix.head))
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
def testScholixRelationshipsClean(): Unit = {
|
||||
val inputRelations = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/relation_transform.json")).mkString
|
||||
val inputRelations = Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/relation_transform.json")
|
||||
)
|
||||
.mkString
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
|
||||
lazy val json: json4s.JValue = parse(inputRelations)
|
||||
val l:List[String] =json.extract[List[String]]
|
||||
val l: List[String] = json.extract[List[String]]
|
||||
assertNotNull(l)
|
||||
assertTrue(l.nonEmpty)
|
||||
val relVocbaulary =ScholixUtils.relations
|
||||
l.foreach(r => assertTrue(relVocbaulary.contains(r.toLowerCase)))
|
||||
val relVocbaulary = ScholixUtils.relations
|
||||
l.foreach(r => assertTrue(relVocbaulary.contains(r.toLowerCase)))
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
39
pom.xml
39
pom.xml
|
@ -620,6 +620,18 @@
|
|||
</dependency>
|
||||
</dependencies>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.antipathy</groupId>
|
||||
<artifactId>mvn-scalafmt_2.11</artifactId>
|
||||
<version>1.0.1640073709.733712b</version>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-code-style</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</pluginManagement>
|
||||
<plugins>
|
||||
|
@ -665,6 +677,33 @@
|
|||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.antipathy</groupId>
|
||||
<artifactId>mvn-scalafmt_2.11</artifactId>
|
||||
<configuration>
|
||||
<configLocation>dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf</configLocation>
|
||||
<skipTestSources>false</skipTestSources>
|
||||
<skipSources>false</skipSources>
|
||||
<sourceDirectories>
|
||||
<param>${project.basedir}/src/main/scala</param>
|
||||
</sourceDirectories>
|
||||
<testSourceDirectories>
|
||||
<param>${project.basedir}/src/test/scala</param>
|
||||
</testSourceDirectories>
|
||||
<validateOnly>false</validateOnly>
|
||||
<onlyChangedFiles>false</onlyChangedFiles>
|
||||
<branch>: git rev-parse --abbrev-ref HEAD</branch>
|
||||
<useSpecifiedRepositories>false</useSpecifiedRepositories>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>validate</phase>
|
||||
<goals>
|
||||
<goal>format</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-release-plugin</artifactId>
|
||||
|
|
Loading…
Reference in New Issue