diff --git a/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf b/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf
new file mode 100644
index 000000000..0b5dbe0b4
--- /dev/null
+++ b/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf
@@ -0,0 +1,15 @@
+style = defaultWithAlign
+
+align.openParenCallSite = false
+align.openParenDefnSite = false
+align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
+continuationIndent.callSite = 2
+continuationIndent.defnSite = 2
+danglingParentheses = true
+indentOperator = spray
+maxColumn = 120
+newlines.alwaysBeforeTopLevelStatements = true
+project.excludeFilters = [".*\\.sbt"]
+rewrite.rules = [AvoidInfix, ExpandImportSelectors, RedundantBraces, RedundantParens, SortImports, SortModifiers, PreferCurlyFors]
+spaces.inImportCurlyBraces = false
+unindentTopLevelOperators = true
\ No newline at end of file
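
As a rough illustration (not part of the patch), the sketch below shows approximately what the rewrite rules above do to a small Scala snippet; the exact output depends on the scalafmt version in use, and all names are placeholders.

// before: import scala.collection.{mutable, immutable}   (ExpandImportSelectors + SortImports)
// before: xs map (x => { x + 1 })                         (AvoidInfix + RedundantBraces)
import scala.collection.immutable
import scala.collection.mutable

object ScalafmtRewriteSketch {

  val xs: immutable.Seq[Int] = immutable.Seq(1, 2, 3)
  val ys: Seq[Int] = xs.map(x => x + 1)
  val buffer: mutable.ListBuffer[Int] = mutable.ListBuffer(0)
}
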
diff --git a/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala
index 6541746b2..f8afe9af4 100644
--- a/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala
+++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala
@@ -2,71 +2,72 @@ package eu.dnetlib.dhp.application
import scala.io.Source
-/**
- * This is the main Interface SparkApplication
- * where all the Spark Scala class should inherit
- *
- */
+/** This is the main interface, SparkApplication,
+  * from which all Spark Scala classes should inherit.
+  */
trait SparkScalaApplication {
- /**
- * This is the path in the classpath of the json
- * describes all the argument needed to run
- */
+
+  /** This is the classpath location of the JSON file
+    * describing all the arguments needed to run.
+    */
val propertyPath: String
- /**
- * Utility to parse the arguments using the
- * property json in the classpath identified from
- * the variable propertyPath
- *
- * @param args the list of arguments
- */
+  /** Utility to parse the arguments using the
+    * property JSON in the classpath identified by
+    * the variable propertyPath.
+    *
+    * @param args the list of arguments
+    */
def parseArguments(args: Array[String]): ArgumentApplicationParser = {
- val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString)
+ val parser = new ArgumentApplicationParser(
+ Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString
+ )
parser.parseArgument(args)
parser
}
- /**
- * Here all the spark applications runs this method
- * where the whole logic of the spark node is defined
- */
+  /** All Spark applications run this method,
+    * which defines the whole logic of the Spark node.
+    */
def run(): Unit
}
-
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.slf4j.Logger
-abstract class AbstractScalaApplication (val propertyPath:String, val args:Array[String], log:Logger) extends SparkScalaApplication {
+abstract class AbstractScalaApplication(
+ val propertyPath: String,
+ val args: Array[String],
+ log: Logger
+) extends SparkScalaApplication {
var parser: ArgumentApplicationParser = null
- var spark:SparkSession = null
+ var spark: SparkSession = null
-
- def initialize():SparkScalaApplication = {
+ def initialize(): SparkScalaApplication = {
parser = parseArguments(args)
spark = createSparkSession()
this
}
- /**
- * Utility for creating a spark session starting from parser
- *
- * @return a spark Session
- */
- private def createSparkSession():SparkSession = {
- require(parser!= null)
+  /** Utility for creating a Spark session starting from the parser.
+    *
+    * @return a Spark session
+    */
+ private def createSparkSession(): SparkSession = {
+ require(parser != null)
- val conf:SparkConf = new SparkConf()
+ val conf: SparkConf = new SparkConf()
val master = parser.get("master")
log.info(s"Creating Spark session: Master: $master")
- SparkSession.builder().config(conf)
+ SparkSession
+ .builder()
+ .config(conf)
.appName(getClass.getSimpleName)
.master(master)
.getOrCreate()
}
-}
\ No newline at end of file
+}
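
A hedged usage sketch (not part of the patch): a hypothetical job built on the AbstractScalaApplication base class above; the class name, the parameter file path and the sourcePath argument are illustrative placeholders.

import eu.dnetlib.dhp.application.AbstractScalaApplication
import org.slf4j.{Logger, LoggerFactory}

class SparkDummyJob(propertyPath: String, args: Array[String], log: Logger)
    extends AbstractScalaApplication(propertyPath, args, log) {

  /** The whole logic of this hypothetical Spark node lives here. */
  override def run(): Unit = {
    val sourcePath = parser.get("sourcePath") // parser and spark are populated by initialize()
    log.info(s"reading from $sourcePath in application ${spark.sparkContext.appName}")
  }
}

object SparkDummyJob {

  def main(args: Array[String]): Unit = {
    val log: Logger = LoggerFactory.getLogger(getClass)
    new SparkDummyJob("/eu/dnetlib/dhp/dummy_params.json", args, log).initialize().run()
  }
}
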
diff --git a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
index f35af0905..a995016a8 100644
--- a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
+++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
@@ -14,7 +14,6 @@ import scala.io.Source
object ScholixUtils extends Serializable {
-
val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier"
val DATE_RELATION_KEY: String = "RelationDate"
@@ -24,7 +23,11 @@ object ScholixUtils extends Serializable {
case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {}
val relations: Map[String, RelationVocabulary] = {
- val input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")).mkString
+ val input = Source
+ .fromInputStream(
+ getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")
+ )
+ .mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
@@ -32,13 +35,14 @@ object ScholixUtils extends Serializable {
json.extract[Map[String, RelationVocabulary]]
}
-
def extractRelationDate(relation: Relation): String = {
if (relation.getProperties == null || !relation.getProperties.isEmpty)
null
else {
- val date = relation.getProperties.asScala.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey)).map(p => p.getValue)
+ val date = relation.getProperties.asScala
+ .find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey))
+ .map(p => p.getValue)
if (date.isDefined)
date.get
else
@@ -58,78 +62,80 @@ object ScholixUtils extends Serializable {
def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = {
new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName)
-
}
-
- def generateScholixResourceFromResult(r:Result) :ScholixResource = {
+ def generateScholixResourceFromResult(r: Result): ScholixResource = {
generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
}
+ val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
+ new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable {
+ override def zero: RelatedEntities = null
- val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable {
- override def zero: RelatedEntities = null
+ override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = {
+ val relatedDataset = if ("dataset".equalsIgnoreCase(a._2)) a._3 else 0
+ val relatedPublication = if ("publication".equalsIgnoreCase(a._2)) a._3 else 0
- override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = {
- val relatedDataset = if ("dataset".equalsIgnoreCase(a._2)) a._3 else 0
- val relatedPublication = if ("publication".equalsIgnoreCase(a._2)) a._3 else 0
-
- if (b == null)
- RelatedEntities(a._1, relatedDataset, relatedPublication)
- else
- RelatedEntities(a._1, b.relatedDataset + relatedDataset, b.relatedPublication + relatedPublication)
- }
-
- override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = {
- if (b1 != null && b2 != null)
- RelatedEntities(b1.id, b1.relatedDataset + b2.relatedDataset, b1.relatedPublication + b2.relatedPublication)
-
- else if (b1 != null)
- b1
- else
- b2
- }
-
- override def finish(reduction: RelatedEntities): RelatedEntities = reduction
-
- override def bufferEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
-
- override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
- }
-
-
- val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] = new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable {
- override def zero: Scholix = null
-
-
- def scholix_complete(s: Scholix): Boolean = {
- if (s == null || s.getIdentifier == null) {
- false
- } else if (s.getSource == null || s.getTarget == null) {
- false
+ if (b == null)
+ RelatedEntities(a._1, relatedDataset, relatedPublication)
+ else
+ RelatedEntities(
+ a._1,
+ b.relatedDataset + relatedDataset,
+ b.relatedPublication + relatedPublication
+ )
}
- else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
- false
- else
- true
+
+ override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = {
+ if (b1 != null && b2 != null)
+ RelatedEntities(
+ b1.id,
+ b1.relatedDataset + b2.relatedDataset,
+ b1.relatedPublication + b2.relatedPublication
+ )
+ else if (b1 != null)
+ b1
+ else
+ b2
+ }
+
+ override def finish(reduction: RelatedEntities): RelatedEntities = reduction
+
+ override def bufferEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
+
+ override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
}
- override def reduce(b: Scholix, a: (String, Scholix)): Scholix = {
- if (scholix_complete(b)) b else a._2
+ val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] =
+ new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable {
+ override def zero: Scholix = null
+
+ def scholix_complete(s: Scholix): Boolean = {
+ if (s == null || s.getIdentifier == null) {
+ false
+ } else if (s.getSource == null || s.getTarget == null) {
+ false
+ } else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
+ false
+ else
+ true
+ }
+
+ override def reduce(b: Scholix, a: (String, Scholix)): Scholix = {
+ if (scholix_complete(b)) b else a._2
+ }
+
+ override def merge(b1: Scholix, b2: Scholix): Scholix = {
+ if (scholix_complete(b1)) b1 else b2
+ }
+
+ override def finish(reduction: Scholix): Scholix = reduction
+
+ override def bufferEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
+
+ override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
}
- override def merge(b1: Scholix, b2: Scholix): Scholix = {
- if (scholix_complete(b1)) b1 else b2
- }
-
- override def finish(reduction: Scholix): Scholix = reduction
-
- override def bufferEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
-
- override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
- }
-
-
def createInverseScholixRelation(scholix: Scholix): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
@@ -138,16 +144,19 @@ object ScholixUtils extends Serializable {
s.setRelationship(inverseRelationShip(scholix.getRelationship))
s.setSource(scholix.getTarget)
s.setTarget(scholix.getSource)
- s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
+ s.setIdentifier(
+ DHPUtils.md5(
+ s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
+ )
+ )
s
-
}
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
- val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map {
- d => new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers)
+ val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
+ new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers)
}(collection.breakOut)
l
} else List()
@@ -155,8 +164,11 @@ object ScholixUtils extends Serializable {
def extractCollectedFrom(summary: ScholixSummary): List[ScholixEntityId] = {
if (summary.getDatasources != null && !summary.getDatasources.isEmpty) {
- val l: List[ScholixEntityId] = summary.getDatasources.asScala.map {
- d => new ScholixEntityId(d.getDatasourceName, List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava)
+ val l: List[ScholixEntityId] = summary.getDatasources.asScala.map { d =>
+ new ScholixEntityId(
+ d.getDatasourceName,
+ List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava
+ )
}(collection.breakOut)
l
} else List()
@@ -165,17 +177,16 @@ object ScholixUtils extends Serializable {
def extractCollectedFrom(relation: Relation): List[ScholixEntityId] = {
if (relation.getCollectedfrom != null && !relation.getCollectedfrom.isEmpty) {
-
- val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map {
- c =>
-
- new ScholixEntityId(c.getValue, List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava)
+ val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map { c =>
+ new ScholixEntityId(
+ c.getValue,
+ List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava
+ )
}.toList
l
} else List()
}
-
def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
@@ -184,11 +195,14 @@ object ScholixUtils extends Serializable {
s.setRelationship(scholix.getRelationship)
s.setSource(scholix.getSource)
s.setTarget(generateScholixResourceFromSummary(target))
- s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
+ s.setIdentifier(
+ DHPUtils.md5(
+ s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
+ )
+ )
s
}
-
def generateCompleteScholix(scholix: Scholix, target: ScholixResource): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
@@ -197,11 +211,14 @@ object ScholixUtils extends Serializable {
s.setRelationship(scholix.getRelationship)
s.setSource(scholix.getSource)
s.setTarget(target)
- s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
+ s.setIdentifier(
+ DHPUtils.md5(
+ s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
+ )
+ )
s
}
-
def generateScholixResourceFromSummary(summaryObject: ScholixSummary): ScholixResource = {
val r = new ScholixResource
r.setIdentifier(summaryObject.getLocalIdentifier)
@@ -214,7 +231,8 @@ object ScholixUtils extends Serializable {
r.setTitle(summaryObject.getTitle.get(0))
if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) {
- val l: List[ScholixEntityId] = summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
+ val l: List[ScholixEntityId] =
+ summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
if (l.nonEmpty)
r.setCreator(l.asJava)
}
@@ -222,20 +240,27 @@ object ScholixUtils extends Serializable {
if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty)
r.setPublicationDate(summaryObject.getDate.get(0))
if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) {
- val plist: List[ScholixEntityId] = summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
+ val plist: List[ScholixEntityId] =
+ summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
if (plist.nonEmpty)
r.setPublisher(plist.asJava)
}
-
if (summaryObject.getDatasources != null && !summaryObject.getDatasources.isEmpty) {
- val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala.map(c => new ScholixCollectedFrom(
- new ScholixEntityId(c.getDatasourceName, List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava)
- , "collected", "complete"
-
- )).toList
+ val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala
+ .map(c =>
+ new ScholixCollectedFrom(
+ new ScholixEntityId(
+ c.getDatasourceName,
+ List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava
+ ),
+ "collected",
+ "complete"
+ )
+ )
+ .toList
if (l.nonEmpty)
r.setCollectedFrom(l.asJava)
@@ -244,9 +269,7 @@ object ScholixUtils extends Serializable {
r
}
-
-
- def scholixFromSource(relation: Relation, source: ScholixResource):Scholix = {
+ def scholixFromSource(relation: Relation, source: ScholixResource): Scholix = {
if (relation == null || source == null)
return null
val s = new Scholix
@@ -262,7 +285,6 @@ object ScholixUtils extends Serializable {
s.setPublicationDate(d)
-
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
s.setPublisher(source.getPublisher)
}
@@ -270,13 +292,14 @@ object ScholixUtils extends Serializable {
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
if (semanticRelation == null)
return null
- s.setRelationship(new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse))
+ s.setRelationship(
+ new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
+ )
s.setSource(source)
s
}
-
def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = {
if (relation == null || source == null)
@@ -298,12 +321,10 @@ object ScholixUtils extends Serializable {
s.setPublicationDate(d)
-
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
val l: List[ScholixEntityId] = source.getPublisher.asScala
- .map {
- p =>
- new ScholixEntityId(p, null)
+ .map { p =>
+ new ScholixEntityId(p, null)
}(collection.breakOut)
if (l.nonEmpty)
@@ -313,31 +334,37 @@ object ScholixUtils extends Serializable {
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
if (semanticRelation == null)
return null
- s.setRelationship(new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse))
+ s.setRelationship(
+ new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
+ )
s.setSource(generateScholixResourceFromSummary(source))
s
}
+ def findURLForPID(
+ pidValue: List[StructuredProperty],
+ urls: List[String]
+ ): List[(StructuredProperty, String)] = {
+ pidValue.map { p =>
+ val pv = p.getValue
- def findURLForPID(pidValue: List[StructuredProperty], urls: List[String]): List[(StructuredProperty, String)] = {
- pidValue.map {
- p =>
- val pv = p.getValue
-
- val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
- (p, r.orNull)
+ val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
+ (p, r.orNull)
}
}
-
def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
if (r.getInstance() == null || r.getInstance().isEmpty)
return List()
- r.getInstance().asScala.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
+ r.getInstance()
+ .asScala
+ .filter(i => i.getUrl != null && !i.getUrl.isEmpty)
.filter(i => i.getPid != null && i.getUrl != null)
.flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
- .map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2)).distinct.toList
+ .map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2))
+ .distinct
+ .toList
}
def resultToSummary(r: Result): ScholixSummary = {
@@ -371,7 +398,12 @@ object ScholixUtils extends Serializable {
s.setAuthor(authors.asJava)
}
if (r.getInstance() != null) {
- val dt: List[String] = r.getInstance().asScala.filter(i => i.getDateofacceptance != null).map(i => i.getDateofacceptance.getValue).toList
+ val dt: List[String] = r
+ .getInstance()
+ .asScala
+ .filter(i => i.getDateofacceptance != null)
+ .map(i => i.getDateofacceptance.getValue)
+ .toList
if (dt.nonEmpty)
s.setDate(dt.distinct.asJava)
}
@@ -382,7 +414,9 @@ object ScholixUtils extends Serializable {
}
if (r.getSubject != null && !r.getSubject.isEmpty) {
- val subjects: List[SchemeValue] = r.getSubject.asScala.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue)).toList
+ val subjects: List[SchemeValue] = r.getSubject.asScala
+ .map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))
+ .toList
if (subjects.nonEmpty)
s.setSubject(subjects.asJava)
}
@@ -391,7 +425,9 @@ object ScholixUtils extends Serializable {
s.setPublisher(List(r.getPublisher.getValue).asJava)
if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) {
- val cf: List[CollectedFromType] = r.getCollectedfrom.asScala.map(c => new CollectedFromType(c.getValue, c.getKey, "complete")).toList
+ val cf: List[CollectedFromType] = r.getCollectedfrom.asScala
+ .map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))
+ .toList
if (cf.nonEmpty)
s.setDatasources(cf.distinct.asJava)
}
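
A hedged usage sketch (not part of the patch): how statsAggregator above can be applied to a typed Dataset of (sourceId, targetType, count) tuples; the SparkSession and dataset are placeholders.

import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils.RelatedEntities
import org.apache.spark.sql.{Dataset, SparkSession}

object RelatedEntitiesSketch {

  def aggregate(
    spark: SparkSession,
    triples: Dataset[(String, String, Long)]
  ): Dataset[(String, RelatedEntities)] = {
    import spark.implicits._ // encoder for the String grouping key
    triples
      .groupByKey(_._1)                           // group by the source identifier
      .agg(ScholixUtils.statsAggregator.toColumn) // fold per-type counts into RelatedEntities
  }
}
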
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala
index 86a28ac10..85f5a3082 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala
@@ -7,16 +7,14 @@ import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode}
object CollectionUtils {
- /**
- * This method in pipeline to the transformation phase,
- * generates relations in both verse, typically it should be a phase of flatMap
- *
- * @param i input OAF
- * @return
- * If the input OAF is an entity -> List(i)
- * If the input OAF is a relation -> List(relation, inverseRelation)
- *
- */
+  /** This method, used in the pipeline after the transformation phase,
+    * generates relations in both directions; typically it is applied in a flatMap phase.
+    *
+    * @param i input OAF
+    * @return
+    *   If the input OAF is an entity -> List(i)
+    *   If the input OAF is a relation -> List(relation, inverseRelation)
+    */
def fixRelations(i: Oaf): List[Oaf] = {
if (i.isInstanceOf[OafEntity])
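
A hedged usage sketch (not part of the patch): as the scaladoc above says, fixRelations is intended as a flatMap over the transformed OAF records, so every relation also emits its inverse; the dataset below is a placeholder.

import eu.dnetlib.dhp.collection.CollectionUtils
import eu.dnetlib.dhp.schema.oaf.Oaf
import org.apache.spark.sql.{Dataset, Encoder, Encoders}

object FixRelationsSketch {

  def withInverseRelations(transformed: Dataset[Oaf]): Dataset[Oaf] = {
    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
    transformed.flatMap(i => CollectionUtils.fixRelations(i)) // entities pass through, relations are doubled
  }
}
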
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/AbstractRestClient.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/AbstractRestClient.scala
index 6a9b8e3e5..471149b25 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/AbstractRestClient.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/AbstractRestClient.scala
@@ -6,7 +6,6 @@ import org.apache.http.client.methods.{HttpGet, HttpPost, HttpUriRequest}
import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.HttpClientBuilder
-
abstract class AbstractRestClient extends Iterator[String] {
var buffer: List[String] = List()
@@ -16,12 +15,10 @@ abstract class AbstractRestClient extends Iterator[String] {
var complete: Boolean = false
-
def extractInfo(input: String): Unit
protected def getBufferData(): Unit
-
def doHTTPGETRequest(url: String): String = {
val httpGet = new HttpGet(url)
doHTTPRequest(httpGet)
@@ -43,7 +40,6 @@ abstract class AbstractRestClient extends Iterator[String] {
buffer.nonEmpty && current_index < buffer.size
}
-
override def next(): String = {
val next_item: String = buffer(current_index)
current_index = current_index + 1
@@ -52,13 +48,14 @@ abstract class AbstractRestClient extends Iterator[String] {
next_item
}
-
private def doHTTPRequest[A <: HttpUriRequest](r: A): String = {
val timeout = 60; // seconds
- val config = RequestConfig.custom()
+ val config = RequestConfig
+ .custom()
.setConnectTimeout(timeout * 1000)
.setConnectionRequestTimeout(timeout * 1000)
- .setSocketTimeout(timeout * 1000).build()
+ .setSocketTimeout(timeout * 1000)
+ .build()
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
try {
var tries = 4
@@ -69,8 +66,7 @@ abstract class AbstractRestClient extends Iterator[String] {
println(s"get response with status${response.getStatusLine.getStatusCode}")
if (response.getStatusLine.getStatusCode > 400) {
tries -= 1
- }
- else
+ } else
return IOUtils.toString(response.getEntity.getContent)
} catch {
case e: Throwable =>
@@ -87,4 +83,4 @@ abstract class AbstractRestClient extends Iterator[String] {
}
getBufferData()
-}
\ No newline at end of file
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala
index 7ec44a6ff..d2fd709aa 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala
@@ -3,7 +3,7 @@ package eu.dnetlib.dhp.datacite
import org.json4s.jackson.JsonMethods.{compact, parse, render}
import org.json4s.{DefaultFormats, JValue}
-class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -1) extends AbstractRestClient {
+class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until: Long = -1) extends AbstractRestClient {
override def extractInfo(input: String): Unit = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@@ -16,16 +16,18 @@ class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -
current_index = 0
}
- def get_url():String ={
- val to = if (until> 0) s"$until" else "*"
+ def get_url(): String = {
+ val to = if (until > 0) s"$until" else "*"
s"https://api.datacite.org/dois?page[cursor]=1&page[size]=$blocks&query=updated:[$timestamp%20TO%20$to]"
}
override def getBufferData(): Unit = {
if (!complete) {
- val response = if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get) else doHTTPGETRequest(get_url())
+ val response =
+ if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get)
+ else doHTTPGETRequest(get_url())
extractInfo(response)
}
}
-}
\ No newline at end of file
+}
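
A hedged usage sketch (not part of the patch): DataciteAPIImporter is an Iterator[String] over the JSON records returned by the Datacite API, so it can be consumed directly; the timestamp and page size are placeholder values.

import eu.dnetlib.dhp.datacite.DataciteAPIImporter

object DataciteImportSketch {

  def main(args: Array[String]): Unit = {
    val importer = new DataciteAPIImporter(timestamp = 1609459200L, blocks = 100)
    importer.take(2).foreach(record => println(record.take(120))) // peek at the first records
  }
}
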
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala
index 6c5dc8cce..a59779387 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala
@@ -10,24 +10,38 @@ import java.util.Locale
import java.util.regex.Pattern
import scala.io.Source
-/**
- * This class represent the dataModel of the input Dataset of Datacite
- * @param doi THE DOI
- * @param timestamp timestamp of last update date
- * @param isActive the record is active or deleted
- * @param json the json native records
- */
+/** This class represents the data model of the input Datacite dataset.
+  * @param doi the DOI
+  * @param timestamp timestamp of the last update
+  * @param isActive whether the record is active or deleted
+  * @param json the native JSON record
+  */
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
/*
The following classes are utility classes used for the mapping from
Datacite JSON to the OAF schema
*/
-case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {}
+case class RelatedIdentifierType(
+ relationType: String,
+ relatedIdentifier: String,
+ relatedIdentifierType: String
+) {}
-case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
+case class NameIdentifiersType(
+ nameIdentifierScheme: Option[String],
+ schemeUri: Option[String],
+ nameIdentifier: Option[String]
+) {}
-case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
+case class CreatorType(
+ nameType: Option[String],
+ nameIdentifiers: Option[List[NameIdentifiersType]],
+ name: Option[String],
+ familyName: Option[String],
+ givenName: Option[String],
+ affiliation: Option[List[String]]
+) {}
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
@@ -35,100 +49,230 @@ case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
-case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
+case class FundingReferenceType(
+ funderIdentifierType: Option[String],
+ awardTitle: Option[String],
+ awardUri: Option[String],
+ funderName: Option[String],
+ funderIdentifier: Option[String],
+ awardNumber: Option[String]
+) {}
case class DateType(date: Option[String], dateType: Option[String]) {}
-case class OAFRelations(relation:String, inverse:String, relType:String)
+case class OAFRelations(relation: String, inverse: String, relType: String)
-
-class DataciteModelConstants extends Serializable {
-
-}
+class DataciteModelConstants extends Serializable {}
object DataciteModelConstants {
- val REL_TYPE_VALUE:String = "resultResult"
+ val REL_TYPE_VALUE: String = "resultResult"
val DATE_RELATION_KEY = "RelationDate"
val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter"
val DOI_CLASS = "doi"
val SUBJ_CLASS = "keywords"
val DATACITE_NAME = "Datacite"
val dataInfo: DataInfo = dataciteDataInfo("0.9")
- val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)
- val subRelTypeMapping: Map[String,OAFRelations] = Map(
- ModelConstants.REFERENCES -> OAFRelations(ModelConstants.REFERENCES, ModelConstants.IS_REFERENCED_BY, ModelConstants.RELATIONSHIP),
- ModelConstants.IS_REFERENCED_BY -> OAFRelations(ModelConstants.IS_REFERENCED_BY,ModelConstants.REFERENCES, ModelConstants.RELATIONSHIP),
+ val DATACITE_COLLECTED_FROM: KeyValue =
+ OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)
- ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(ModelConstants.IS_SUPPLEMENTED_BY,ModelConstants.IS_SUPPLEMENT_TO,ModelConstants.SUPPLEMENT),
- ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(ModelConstants.IS_SUPPLEMENT_TO,ModelConstants.IS_SUPPLEMENTED_BY,ModelConstants.SUPPLEMENT),
-
- ModelConstants.HAS_PART -> OAFRelations(ModelConstants.HAS_PART,ModelConstants.IS_PART_OF, ModelConstants.PART),
- ModelConstants.IS_PART_OF -> OAFRelations(ModelConstants.IS_PART_OF,ModelConstants.HAS_PART, ModelConstants.PART),
-
- ModelConstants.IS_VERSION_OF-> OAFRelations(ModelConstants.IS_VERSION_OF,ModelConstants.HAS_VERSION,ModelConstants.VERSION),
- ModelConstants.HAS_VERSION-> OAFRelations(ModelConstants.HAS_VERSION,ModelConstants.IS_VERSION_OF,ModelConstants.VERSION),
-
- ModelConstants.IS_IDENTICAL_TO -> OAFRelations(ModelConstants.IS_IDENTICAL_TO,ModelConstants.IS_IDENTICAL_TO, ModelConstants.RELATIONSHIP),
-
- ModelConstants.IS_CONTINUED_BY -> OAFRelations(ModelConstants.IS_CONTINUED_BY,ModelConstants.CONTINUES, ModelConstants.RELATIONSHIP),
- ModelConstants.CONTINUES -> OAFRelations(ModelConstants.CONTINUES,ModelConstants.IS_CONTINUED_BY, ModelConstants.RELATIONSHIP),
-
- ModelConstants.IS_NEW_VERSION_OF-> OAFRelations(ModelConstants.IS_NEW_VERSION_OF,ModelConstants.IS_PREVIOUS_VERSION_OF, ModelConstants.VERSION),
- ModelConstants.IS_PREVIOUS_VERSION_OF ->OAFRelations(ModelConstants.IS_PREVIOUS_VERSION_OF,ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
-
- ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(ModelConstants.IS_DOCUMENTED_BY,ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP),
- ModelConstants.DOCUMENTS -> OAFRelations(ModelConstants.DOCUMENTS,ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP),
-
- ModelConstants.IS_SOURCE_OF -> OAFRelations(ModelConstants.IS_SOURCE_OF,ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
- ModelConstants.IS_DERIVED_FROM -> OAFRelations(ModelConstants.IS_DERIVED_FROM,ModelConstants.IS_SOURCE_OF, ModelConstants.VERSION),
-
- ModelConstants.CITES -> OAFRelations(ModelConstants.CITES,ModelConstants.IS_CITED_BY, ModelConstants.CITATION),
- ModelConstants.IS_CITED_BY -> OAFRelations(ModelConstants.IS_CITED_BY,ModelConstants.CITES, ModelConstants.CITATION),
-
- ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(ModelConstants.IS_VARIANT_FORM_OF,ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
- ModelConstants.IS_OBSOLETED_BY -> OAFRelations(ModelConstants.IS_OBSOLETED_BY,ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
-
- ModelConstants.REVIEWS -> OAFRelations(ModelConstants.REVIEWS,ModelConstants.IS_REVIEWED_BY, ModelConstants.REVIEW),
- ModelConstants.IS_REVIEWED_BY -> OAFRelations(ModelConstants.IS_REVIEWED_BY,ModelConstants.REVIEWS, ModelConstants.REVIEW),
-
- ModelConstants.DOCUMENTS -> OAFRelations(ModelConstants.DOCUMENTS,ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP),
- ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(ModelConstants.IS_DOCUMENTED_BY,ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP),
-
- ModelConstants.COMPILES -> OAFRelations(ModelConstants.COMPILES,ModelConstants.IS_COMPILED_BY, ModelConstants.RELATIONSHIP),
- ModelConstants.IS_COMPILED_BY -> OAFRelations(ModelConstants.IS_COMPILED_BY,ModelConstants.COMPILES, ModelConstants.RELATIONSHIP)
+ val subRelTypeMapping: Map[String, OAFRelations] = Map(
+ ModelConstants.REFERENCES -> OAFRelations(
+ ModelConstants.REFERENCES,
+ ModelConstants.IS_REFERENCED_BY,
+ ModelConstants.RELATIONSHIP
+ ),
+ ModelConstants.IS_REFERENCED_BY -> OAFRelations(
+ ModelConstants.IS_REFERENCED_BY,
+ ModelConstants.REFERENCES,
+ ModelConstants.RELATIONSHIP
+ ),
+ ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(
+ ModelConstants.IS_SUPPLEMENTED_BY,
+ ModelConstants.IS_SUPPLEMENT_TO,
+ ModelConstants.SUPPLEMENT
+ ),
+ ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(
+ ModelConstants.IS_SUPPLEMENT_TO,
+ ModelConstants.IS_SUPPLEMENTED_BY,
+ ModelConstants.SUPPLEMENT
+ ),
+ ModelConstants.HAS_PART -> OAFRelations(
+ ModelConstants.HAS_PART,
+ ModelConstants.IS_PART_OF,
+ ModelConstants.PART
+ ),
+ ModelConstants.IS_PART_OF -> OAFRelations(
+ ModelConstants.IS_PART_OF,
+ ModelConstants.HAS_PART,
+ ModelConstants.PART
+ ),
+ ModelConstants.IS_VERSION_OF -> OAFRelations(
+ ModelConstants.IS_VERSION_OF,
+ ModelConstants.HAS_VERSION,
+ ModelConstants.VERSION
+ ),
+ ModelConstants.HAS_VERSION -> OAFRelations(
+ ModelConstants.HAS_VERSION,
+ ModelConstants.IS_VERSION_OF,
+ ModelConstants.VERSION
+ ),
+ ModelConstants.IS_IDENTICAL_TO -> OAFRelations(
+ ModelConstants.IS_IDENTICAL_TO,
+ ModelConstants.IS_IDENTICAL_TO,
+ ModelConstants.RELATIONSHIP
+ ),
+ ModelConstants.IS_CONTINUED_BY -> OAFRelations(
+ ModelConstants.IS_CONTINUED_BY,
+ ModelConstants.CONTINUES,
+ ModelConstants.RELATIONSHIP
+ ),
+ ModelConstants.CONTINUES -> OAFRelations(
+ ModelConstants.CONTINUES,
+ ModelConstants.IS_CONTINUED_BY,
+ ModelConstants.RELATIONSHIP
+ ),
+ ModelConstants.IS_NEW_VERSION_OF -> OAFRelations(
+ ModelConstants.IS_NEW_VERSION_OF,
+ ModelConstants.IS_PREVIOUS_VERSION_OF,
+ ModelConstants.VERSION
+ ),
+ ModelConstants.IS_PREVIOUS_VERSION_OF -> OAFRelations(
+ ModelConstants.IS_PREVIOUS_VERSION_OF,
+ ModelConstants.IS_NEW_VERSION_OF,
+ ModelConstants.VERSION
+ ),
+ ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
+ ModelConstants.IS_DOCUMENTED_BY,
+ ModelConstants.DOCUMENTS,
+ ModelConstants.RELATIONSHIP
+ ),
+ ModelConstants.DOCUMENTS -> OAFRelations(
+ ModelConstants.DOCUMENTS,
+ ModelConstants.IS_DOCUMENTED_BY,
+ ModelConstants.RELATIONSHIP
+ ),
+ ModelConstants.IS_SOURCE_OF -> OAFRelations(
+ ModelConstants.IS_SOURCE_OF,
+ ModelConstants.IS_DERIVED_FROM,
+ ModelConstants.VERSION
+ ),
+ ModelConstants.IS_DERIVED_FROM -> OAFRelations(
+ ModelConstants.IS_DERIVED_FROM,
+ ModelConstants.IS_SOURCE_OF,
+ ModelConstants.VERSION
+ ),
+ ModelConstants.CITES -> OAFRelations(
+ ModelConstants.CITES,
+ ModelConstants.IS_CITED_BY,
+ ModelConstants.CITATION
+ ),
+ ModelConstants.IS_CITED_BY -> OAFRelations(
+ ModelConstants.IS_CITED_BY,
+ ModelConstants.CITES,
+ ModelConstants.CITATION
+ ),
+ ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(
+ ModelConstants.IS_VARIANT_FORM_OF,
+ ModelConstants.IS_DERIVED_FROM,
+ ModelConstants.VERSION
+ ),
+ ModelConstants.IS_OBSOLETED_BY -> OAFRelations(
+ ModelConstants.IS_OBSOLETED_BY,
+ ModelConstants.IS_NEW_VERSION_OF,
+ ModelConstants.VERSION
+ ),
+ ModelConstants.REVIEWS -> OAFRelations(
+ ModelConstants.REVIEWS,
+ ModelConstants.IS_REVIEWED_BY,
+ ModelConstants.REVIEW
+ ),
+ ModelConstants.IS_REVIEWED_BY -> OAFRelations(
+ ModelConstants.IS_REVIEWED_BY,
+ ModelConstants.REVIEWS,
+ ModelConstants.REVIEW
+ ),
+ ModelConstants.DOCUMENTS -> OAFRelations(
+ ModelConstants.DOCUMENTS,
+ ModelConstants.IS_DOCUMENTED_BY,
+ ModelConstants.RELATIONSHIP
+ ),
+ ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
+ ModelConstants.IS_DOCUMENTED_BY,
+ ModelConstants.DOCUMENTS,
+ ModelConstants.RELATIONSHIP
+ ),
+ ModelConstants.COMPILES -> OAFRelations(
+ ModelConstants.COMPILES,
+ ModelConstants.IS_COMPILED_BY,
+ ModelConstants.RELATIONSHIP
+ ),
+ ModelConstants.IS_COMPILED_BY -> OAFRelations(
+ ModelConstants.IS_COMPILED_BY,
+ ModelConstants.COMPILES,
+ ModelConstants.RELATIONSHIP
+ )
)
-
val datacite_filter: List[String] = {
val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
- require(stream!= null)
+ require(stream != null)
Source.fromInputStream(stream).getLines().toList
}
+ def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(
+ false,
+ null,
+ false,
+ false,
+ ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
+ trust
+ )
- def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(false,null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, trust)
+ val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern(
+ "[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
+ Locale.ENGLISH
+ )
- val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
- val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
+ val df_it: DateTimeFormatter =
+ DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
val funder_regex: List[(Pattern, String)] = List(
- (Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
- (Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
-
+ (
+ Pattern.compile(
+ "(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)",
+ Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
+ ),
+ "40|corda__h2020::"
+ ),
+ (
+ Pattern.compile(
+ "(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)",
+ Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
+ ),
+ "40|corda_______::"
+ )
)
val Date_regex: List[Pattern] = List(
//Y-M-D
- Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
+ Pattern.compile(
+ "(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])",
+ Pattern.MULTILINE
+ ),
//M-D-Y
- Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
+ Pattern.compile(
+ "((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
+ Pattern.MULTILINE
+ ),
//D-M-Y
- Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
+ Pattern.compile(
+ "(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
+ Pattern.MULTILINE
+ ),
//Y
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
)
-
}
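
A hedged usage sketch (not part of the patch): resolving a Datacite relation type to its OAF semantics and inverse through subRelTypeMapping, and parsing a date with df_en; import paths are assumed from the surrounding codebase.

import java.time.LocalDate

import eu.dnetlib.dhp.datacite.DataciteModelConstants._
import eu.dnetlib.dhp.schema.common.ModelConstants

object DataciteConstantsSketch {

  def main(args: Array[String]): Unit = {
    // look up the OAF relation, its inverse and the relation type for "Cites"
    subRelTypeMapping.get(ModelConstants.CITES).foreach { rel =>
      println(s"relation=${rel.relation} inverse=${rel.inverse} relType=${rel.relType}")
    }
    // df_en accepts several layouts, including ISO yyyy-MM-dd
    println(LocalDate.parse("2019-03-12", df_en))
  }
}
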
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala
index a662cf99d..a0b7cd95e 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala
@@ -20,19 +20,16 @@ import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}
import scala.collection.JavaConverters._
-
object DataciteToOAFTransformation {
val mapper = new ObjectMapper()
-
- /**
- * This method should skip record if json contains invalid text
- * defined in gile datacite_filter
- *
- * @param json
- * @return True if the record should be skipped
- */
+  /** This method should skip the record if the JSON contains invalid text
+    * defined in the file datacite_filter.
+    *
+    * @param json
+    * @return true if the record should be skipped
+    */
def skip_record(json: String): Boolean = {
datacite_filter.exists(f => json.contains(f))
}
@@ -74,35 +71,35 @@ object DataciteToOAFTransformation {
}
-
def embargo_end(embargo_end_date: String): Boolean = {
val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
val td = LocalDate.now()
td.isAfter(dt)
}
-
def extract_date(input: String): Option[String] = {
- val d = Date_regex.map(pattern => {
- val matcher = pattern.matcher(input)
- if (matcher.find())
- matcher.group(0)
- else
- null
- }
- ).find(s => s != null)
+ val d = Date_regex
+ .map(pattern => {
+ val matcher = pattern.matcher(input)
+ if (matcher.find())
+ matcher.group(0)
+ else
+ null
+ })
+ .find(s => s != null)
if (d.isDefined) {
val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
try {
return Some(LocalDate.parse(a_date, df_en).toString)
} catch {
- case _: Throwable => try {
- return Some(LocalDate.parse(a_date, df_it).toString)
- } catch {
- case _: Throwable =>
- return None
- }
+ case _: Throwable =>
+ try {
+ return Some(LocalDate.parse(a_date, df_it).toString)
+ } catch {
+ case _: Throwable =>
+ return None
+ }
}
}
d
@@ -118,31 +115,63 @@ object DataciteToOAFTransformation {
}
}
-
- def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
+ def getTypeQualifier(
+ resourceType: String,
+ resourceTypeGeneral: String,
+ schemaOrg: String,
+ vocabularies: VocabularyGroup
+ ): (Qualifier, Qualifier) = {
if (resourceType != null && resourceType.nonEmpty) {
- val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
+ val typeQualifier =
+ vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
if (typeQualifier != null)
- return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
+ return (
+ typeQualifier,
+ vocabularies.getSynonymAsQualifier(
+ ModelConstants.DNET_RESULT_TYPOLOGIES,
+ typeQualifier.getClassid
+ )
+ )
}
if (schemaOrg != null && schemaOrg.nonEmpty) {
- val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
+ val typeQualifier =
+ vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
if (typeQualifier != null)
- return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
+ return (
+ typeQualifier,
+ vocabularies.getSynonymAsQualifier(
+ ModelConstants.DNET_RESULT_TYPOLOGIES,
+ typeQualifier.getClassid
+ )
+ )
}
if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
- val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceTypeGeneral)
+ val typeQualifier = vocabularies.getSynonymAsQualifier(
+ ModelConstants.DNET_PUBLICATION_RESOURCE,
+ resourceTypeGeneral
+ )
if (typeQualifier != null)
- return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
+ return (
+ typeQualifier,
+ vocabularies.getSynonymAsQualifier(
+ ModelConstants.DNET_RESULT_TYPOLOGIES,
+ typeQualifier.getClassid
+ )
+ )
}
null
}
-
- def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): Result = {
- val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
+ def getResult(
+ resourceType: String,
+ resourceTypeGeneral: String,
+ schemaOrg: String,
+ vocabularies: VocabularyGroup
+ ): Result = {
+ val typeQualifiers: (Qualifier, Qualifier) =
+ getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
if (typeQualifiers == null)
return null
val i = new Instance
@@ -168,13 +197,12 @@ object DataciteToOAFTransformation {
null
}
-
def available_date(input: String): Boolean = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
val l: List[String] = for {
- JObject(dates) <- json \\ "dates"
+ JObject(dates) <- json \\ "dates"
JField("dateType", JString(dateTypes)) <- dates
} yield dateTypes
@@ -182,18 +210,19 @@ object DataciteToOAFTransformation {
}
-
- /**
- * As describe in ticket #6377
- * when the result come from figshare we need to remove subject
- * and set Access rights OPEN.
- *
- * @param r
- */
+  /** As described in ticket #6377,
+    * when the result comes from figshare we need to remove the subjects
+    * and set the access rights to OPEN.
+    *
+    * @param r
+    */
def fix_figshare(r: Result): Unit = {
if (r.getInstance() != null) {
- val hosted_by_figshare = r.getInstance().asScala.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
+ val hosted_by_figshare = r
+ .getInstance()
+ .asScala
+ .exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
if (hosted_by_figshare) {
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
val l: List[StructuredProperty] = List()
@@ -201,10 +230,8 @@ object DataciteToOAFTransformation {
}
}
-
}
-
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
@@ -214,7 +241,13 @@ object DataciteToOAFTransformation {
OafMapperUtils.structuredProperty(dt, q, null)
}
- def generateRelation(sourceId: String, targetId: String, relClass: String, cf: KeyValue, di: DataInfo): Relation = {
+ def generateRelation(
+ sourceId: String,
+ targetId: String,
+ relClass: String,
+ cf: KeyValue,
+ di: DataInfo
+ ): Relation = {
val r = new Relation
r.setSource(sourceId)
@@ -226,7 +259,6 @@ object DataciteToOAFTransformation {
r.setDataInfo(di)
r
-
}
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
@@ -238,14 +270,18 @@ object DataciteToOAFTransformation {
val grantId = m.matcher(awardUri).replaceAll("$2")
val targetId = s"$p${DHPUtils.md5(grantId)}"
List(generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo))
- }
- else
+ } else
List()
}
-
- def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = {
+ def generateOAF(
+ input: String,
+ ts: Long,
+ dateOfCollection: Long,
+ vocabularies: VocabularyGroup,
+ exportLinks: Boolean
+ ): List[Oaf] = {
if (skip_record(input))
return List()
@@ -253,7 +289,8 @@ object DataciteToOAFTransformation {
lazy val json = parse(input)
val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
- val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
+ val resourceTypeGeneral =
+ (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)
val doi = (json \ "attributes" \ "doi").extract[String]
@@ -265,8 +302,12 @@ object DataciteToOAFTransformation {
if (result == null)
return List()
-
- val doi_q = OafMapperUtils.qualifier("doi", "doi", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES)
+ val doi_q = OafMapperUtils.qualifier(
+ "doi",
+ "doi",
+ ModelConstants.DNET_PID_TYPES,
+ ModelConstants.DNET_PID_TYPES
+ )
val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
result.setPid(List(pid).asJava)
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
@@ -275,48 +316,70 @@ object DataciteToOAFTransformation {
val d = new Date(dateOfCollection * 1000)
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
-
result.setDateofcollection(ISO8601FORMAT.format(d))
result.setDateoftransformation(ISO8601FORMAT.format(d))
result.setDataInfo(dataInfo)
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
-
val authors = creators.zipWithIndex.map { case (c, idx) =>
val a = new Author
a.setFullname(c.name.orNull)
a.setName(c.givenName.orNull)
a.setSurname(c.familyName.orNull)
if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
- a.setPid(c.nameIdentifiers.get.map(ni => {
- val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(ModelConstants.DNET_PID_TYPES, ni.nameIdentifierScheme.get.toLowerCase()) else null
- if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
- OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
- }
- else
- null
+ a.setPid(
+ c.nameIdentifiers.get
+ .map(ni => {
+ val q =
+ if (ni.nameIdentifierScheme.isDefined)
+ vocabularies.getTermAsQualifier(
+ ModelConstants.DNET_PID_TYPES,
+ ni.nameIdentifierScheme.get.toLowerCase()
+ )
+ else null
+ if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
+ OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
+ } else
+ null
- }
+ })
+ .asJava
)
- .asJava)
}
if (c.affiliation.isDefined)
- a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava)
+ a.setAffiliation(
+ c.affiliation.get
+ .filter(af => af.nonEmpty)
+ .map(af => OafMapperUtils.field(af, dataInfo))
+ .asJava
+ )
a.setRank(idx + 1)
a
}
-
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
- result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => {
- if (t.titleType.isEmpty) {
- OafMapperUtils.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
- } else {
- OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, ModelConstants.DNET_DATACITE_TITLE, ModelConstants.DNET_DATACITE_TITLE, null)
- }
- }).asJava)
+ result.setTitle(
+ titles
+ .filter(t => t.title.nonEmpty)
+ .map(t => {
+ if (t.titleType.isEmpty) {
+ OafMapperUtils
+ .structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
+ } else {
+ OafMapperUtils.structuredProperty(
+ t.title.get,
+ t.titleType.get,
+ t.titleType.get,
+ ModelConstants.DNET_DATACITE_TITLE,
+ ModelConstants.DNET_DATACITE_TITLE,
+ null
+ )
+ }
+ })
+ .asJava
+ )
if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
return List()
@@ -337,46 +400,81 @@ object DataciteToOAFTransformation {
if (a_date.isDefined) {
if (doi.startsWith("10.14457"))
- result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null))
+ result.setEmbargoenddate(
+ OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null)
+ )
else
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
}
if (i_date.isDefined && i_date.get.isDefined) {
if (doi.startsWith("10.14457")) {
- result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null))
- result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null))
- }
- else {
+ result.setDateofacceptance(
+ OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)
+ )
+ result
+ .getInstance()
+ .get(0)
+ .setDateofacceptance(
+ OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)
+ )
+ } else {
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
}
- }
- else if (publication_year != null) {
+ } else if (publication_year != null) {
if (doi.startsWith("10.14457")) {
- result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null))
- result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null))
+ result.setDateofacceptance(
+ OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)
+ )
+ result
+ .getInstance()
+ .get(0)
+ .setDateofacceptance(
+ OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)
+ )
} else {
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
- result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
+ result
+ .getInstance()
+ .get(0)
+ .setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
}
}
-
- result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
- .map(d => (extract_date(d.date.get), d.dateType.get))
- .filter(d => d._1.isDefined)
- .map(d => (d._1.get, vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())))
- .filter(d => d._2 != null)
- .map(d => generateOAFDate(d._1, d._2)).asJava)
+ result.setRelevantdate(
+ dates
+ .filter(d => d.date.isDefined && d.dateType.isDefined)
+ .map(d => (extract_date(d.date.get), d.dateType.get))
+ .filter(d => d._1.isDefined)
+ .map(d =>
+ (
+ d._1.get,
+ vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())
+ )
+ )
+ .filter(d => d._2 != null)
+ .map(d => generateOAFDate(d._1, d._2))
+ .asJava
+ )
val subjects = (json \\ "subjects").extract[List[SubjectType]]
- result.setSubject(subjects.filter(s => s.subject.nonEmpty)
- .map(s =>
- OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
- ).asJava)
-
+ result.setSubject(
+ subjects
+ .filter(s => s.subject.nonEmpty)
+ .map(s =>
+ OafMapperUtils.structuredProperty(
+ s.subject.get,
+ SUBJ_CLASS,
+ SUBJ_CLASS,
+ ModelConstants.DNET_SUBJECT_TYPOLOGIES,
+ ModelConstants.DNET_SUBJECT_TYPOLOGIES,
+ null
+ )
+ )
+ .asJava
+ )
result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
@@ -384,66 +482,86 @@ object DataciteToOAFTransformation {
result.setDescription(
descriptions
- .filter(d => d.description.isDefined).
- map(d =>
- OafMapperUtils.field(d.description.get, null)
- ).filter(s => s != null).asJava)
-
+ .filter(d => d.description.isDefined)
+ .map(d => OafMapperUtils.field(d.description.get, null))
+ .filter(s => s != null)
+ .asJava
+ )
val publisher = (json \\ "publisher").extractOrElse[String](null)
if (publisher != null)
result.setPublisher(OafMapperUtils.field(publisher, null))
-
val language: String = (json \\ "language").extractOrElse[String](null)
if (language != null)
- result.setLanguage(vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language))
-
+ result.setLanguage(
+ vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language)
+ )
val instance = result.getInstance().get(0)
val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String]
val accessRights: List[String] = for {
- JObject(rightsList) <- json \\ "rightsList"
+ JObject(rightsList) <- json \\ "rightsList"
JField("rightsUri", JString(rightsUri)) <- rightsList
} yield rightsUri
- val aRights: Option[AccessRight] = accessRights.map(r => {
- vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
- }).find(q => q != null).map(q => {
- val a = new AccessRight
- a.setClassid(q.getClassid)
- a.setClassname(q.getClassname)
- a.setSchemeid(q.getSchemeid)
- a.setSchemename(q.getSchemename)
- a
- })
+ val aRights: Option[AccessRight] = accessRights
+ .map(r => {
+ vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
+ })
+ .find(q => q != null)
+ .map(q => {
+ val a = new AccessRight
+ a.setClassid(q.getClassid)
+ a.setClassname(q.getClassname)
+ a.setSchemeid(q.getSchemeid)
+ a.setSchemename(q.getSchemename)
+ a
+ })
-
- val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
+ val access_rights_qualifier =
+ if (aRights.isDefined) aRights.get
+ else
+ OafMapperUtils.accessRight(
+ ModelConstants.UNKNOWN,
+ ModelConstants.NOT_AVAILABLE,
+ ModelConstants.DNET_ACCESS_MODES,
+ ModelConstants.DNET_ACCESS_MODES
+ )
if (client.isDefined) {
- instance.setHostedby(OafMapperUtils.keyValue(generateDSId(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID), ModelConstants.UNKNOWN_REPOSITORY.getValue))
+ instance.setHostedby(
+ OafMapperUtils.keyValue(
+ generateDSId(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID),
+ ModelConstants.UNKNOWN_REPOSITORY.getValue
+ )
+ )
instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
instance.setAccessright(access_rights_qualifier)
instance.setPid(result.getPid)
val license = accessRights
- .find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"))
+ .find(r =>
+ r.startsWith("http") && r.matches(
+ ".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"
+ )
+ )
if (license.isDefined)
instance.setLicense(OafMapperUtils.field(license.get, null))
}
val awardUris: List[String] = for {
- JObject(fundingReferences) <- json \\ "fundingReferences"
+ JObject(fundingReferences) <- json \\ "fundingReferences"
JField("awardUri", JString(awardUri)) <- fundingReferences
} yield awardUri
result.setId(IdentifierFactory.createIdentifier(result))
- var relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
+ var relations: List[Relation] =
+ awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
fix_figshare(result)
@@ -452,28 +570,35 @@ object DataciteToOAFTransformation {
if (exportLinks) {
val rels: List[RelatedIdentifierType] = for {
- JObject(relIdentifier) <- json \\ "relatedIdentifiers"
- JField("relationType", JString(relationType)) <- relIdentifier
+ JObject(relIdentifier) <- json \\ "relatedIdentifiers"
+ JField("relationType", JString(relationType)) <- relIdentifier
JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
- JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
+ JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
} yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
- relations = relations ::: generateRelations(rels, result.getId, if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null)
+ relations = relations ::: generateRelations(
+ rels,
+ result.getId,
+ if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null
+ )
}
if (relations != null && relations.nonEmpty) {
List(result) ::: relations
- }
- else
+ } else
List(result)
}
- private def generateRelations(rels: List[RelatedIdentifierType], id: String, date: String): List[Relation] = {
+ private def generateRelations(
+ rels: List[RelatedIdentifierType],
+ id: String,
+ date: String
+ ): List[Relation] = {
rels
.filter(r =>
- subRelTypeMapping.contains(r.relationType) && (
- r.relatedIdentifierType.equalsIgnoreCase("doi") ||
- r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
- r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
+ subRelTypeMapping
+ .contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
+ r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
+ r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
)
.map(r => {
val rel = new Relation
@@ -490,19 +615,19 @@ object DataciteToOAFTransformation {
rel.setProperties(List(dateProps).asJava)
rel.setSource(id)
- rel.setTarget(DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType))
+ rel.setTarget(
+ DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
+ )
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
rel.getCollectedfrom.asScala.map(c => c.getValue).toList
rel
})
}
-
def generateDSId(input: String): String = {
val b = StringUtils.substringBefore(input, "::")
val a = StringUtils.substringAfter(input, "::")
s"10|$b::${DHPUtils.md5(a)}"
}
-
}
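
The generateDSId helper above normalises a datasource identifier: it keeps the namespace prefix before the "::" separator and replaces the local part with its MD5 digest. A quick illustrative call (the input identifier is hypothetical and the digest is left symbolic):

  // Sketch only: the prefix survives unchanged, the local part is hashed.
  val dsId = generateDSId("re3data_____::r3d100010468")
  // dsId has the shape "10|re3data_____::<md5 of r3d100010468>"
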
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala
index a205edcf2..046290969 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala
@@ -12,12 +12,12 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
+class GenerateDataciteDatasetSpark(propertyPath: String, args: Array[String], log: Logger)
+ extends AbstractScalaApplication(propertyPath, args, log: Logger) {
-class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log:Logger) extends AbstractScalaApplication(propertyPath, args, log:Logger) {
- /**
- * Here all the spark applications runs this method
- * where the whole logic of the spark node is defined
- */
+  /** Entry point run by every Spark application:
+    * the whole logic of the Spark node is defined here.
+    */
override def run(): Unit = {
val sourcePath = parser.get("sourcePath")
@@ -46,49 +46,65 @@ class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log
reportTotalSize(targetPath, outputBasePath)
}
-
- /**
- * For working with MDStore we need to store in a file on hdfs the size of
- * the current dataset
- * @param targetPath
- * @param outputBasePath
- */
- def reportTotalSize( targetPath: String, outputBasePath: String ):Unit = {
+  /** When working with an MDStore, the size of the current dataset must be
+    * written to a file on HDFS.
+    * @param targetPath     path of the generated dataset whose records are counted
+    * @param outputBasePath MDStore base path under which the size file is written
+    */
+ def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
val total_items = spark.read.text(targetPath).count()
- writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$total_items", outputBasePath + MDSTORE_SIZE_PATH)
+ writeHdfsFile(
+ spark.sparkContext.hadoopConfiguration,
+ s"$total_items",
+ outputBasePath + MDSTORE_SIZE_PATH
+ )
}
- /**
- * Generate the transformed and cleaned OAF Dataset from the native one
-
- * @param sourcePath sourcePath of the native Dataset in format JSON/Datacite
- * @param exportLinks If true it generates unresolved links
- * @param vocabularies vocabularies for cleaning
- * @param targetPath the targetPath of the result Dataset
- */
- def generateDataciteDataset(sourcePath: String, exportLinks: Boolean, vocabularies: VocabularyGroup, targetPath: String, spark:SparkSession):Unit = {
- require(spark!= null)
+ /** Generate the transformed and cleaned OAF Dataset from the native one
+ *
+ * @param sourcePath sourcePath of the native Dataset in format JSON/Datacite
+ * @param exportLinks If true it generates unresolved links
+ * @param vocabularies vocabularies for cleaning
+ * @param targetPath the targetPath of the result Dataset
+ */
+ def generateDataciteDataset(
+ sourcePath: String,
+ exportLinks: Boolean,
+ vocabularies: VocabularyGroup,
+ targetPath: String,
+ spark: SparkSession
+ ): Unit = {
+ require(spark != null)
import spark.implicits._
implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
CollectionUtils.saveDataset(
- spark.read.load(sourcePath).as[DataciteType]
+ spark.read
+ .load(sourcePath)
+ .as[DataciteType]
.filter(d => d.isActive)
- .flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks))
+ .flatMap(d =>
+ DataciteToOAFTransformation
+ .generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks)
+ )
.filter(d => d != null),
- targetPath)
+ targetPath
+ )
}
}
-
object GenerateDataciteDatasetSpark {
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
def main(args: Array[String]): Unit = {
- new GenerateDataciteDatasetSpark("/eu/dnetlib/dhp/datacite/generate_dataset_params.json", args, log).initialize().run()
+ new GenerateDataciteDatasetSpark(
+ "/eu/dnetlib/dhp/datacite/generate_dataset_params.json",
+ args,
+ log
+ ).initialize().run()
}
}
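
Oaf and MetadataRecord are not Scala case classes, so the transformation above has to bring Kryo-based encoders into implicit scope before any Dataset operation on them compiles. A minimal sketch of the same pattern, with a placeholder bean standing in for the real model classes:

  import org.apache.spark.sql.{Encoder, Encoders, SparkSession}

  // Placeholder for a Java-style model class such as Oaf.
  class MyBean(var id: String) extends Serializable

  val spark = SparkSession.builder().master("local[*]").appName("kryo-encoder-sketch").getOrCreate()

  // Without this implicit encoder the createDataset call below would not compile.
  implicit val beanEncoder: Encoder[MyBean] = Encoders.kryo[MyBean]

  val ds = spark.createDataset(Seq(new MyBean("a"), new MyBean("b")))
  println(ds.count()) // 2
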
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/ImportDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/ImportDatacite.scala
index 018b4958a..cb021925a 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/ImportDatacite.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/ImportDatacite.scala
@@ -22,7 +22,6 @@ object ImportDatacite {
val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass)
-
def convertAPIStringToDataciteItem(input: String): DataciteType = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
@@ -32,14 +31,26 @@ object ImportDatacite {
val timestamp_string = (json \ "attributes" \ "updated").extract[String]
val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME)
- DataciteType(doi = doi, timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000, isActive = isActive, json = input)
+ DataciteType(
+ doi = doi,
+ timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000,
+ isActive = isActive,
+ json = input
+ )
}
-
def main(args: Array[String]): Unit = {
- val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json")).mkString)
+ val parser = new ArgumentApplicationParser(
+ Source
+ .fromInputStream(
+ getClass.getResourceAsStream(
+ "/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json"
+ )
+ )
+ .mkString
+ )
parser.parseArgument(args)
val master = parser.get("master")
@@ -60,7 +71,8 @@ object ImportDatacite {
val spkipImport = parser.get("skipImport")
log.info(s"skipImport is $spkipImport")
- val spark: SparkSession = SparkSession.builder()
+ val spark: SparkSession = SparkSession
+ .builder()
.appName(ImportDatacite.getClass.getSimpleName)
.master(master)
.getOrCreate()
@@ -78,45 +90,48 @@ object ImportDatacite {
import spark.implicits._
+ val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] =
+ new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
- val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] = new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
+ override def zero: DataciteType = null
- override def zero: DataciteType = null
-
- override def reduce(a: DataciteType, b: DataciteType): DataciteType = {
- if (b == null)
- return a
- if (a == null)
- return b
- if (a.timestamp > b.timestamp) {
- return a
+ override def reduce(a: DataciteType, b: DataciteType): DataciteType = {
+ if (b == null)
+ return a
+ if (a == null)
+ return b
+ if (a.timestamp > b.timestamp) {
+ return a
+ }
+ b
}
- b
+
+ override def merge(a: DataciteType, b: DataciteType): DataciteType = {
+ reduce(a, b)
+ }
+
+ override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
+
+ override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
+
+ override def finish(reduction: DataciteType): DataciteType = reduction
}
- override def merge(a: DataciteType, b: DataciteType): DataciteType = {
- reduce(a, b)
- }
-
- override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
-
- override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
-
- override def finish(reduction: DataciteType): DataciteType = reduction
- }
-
val dump: Dataset[DataciteType] = spark.read.load(dataciteDump).as[DataciteType]
val ts = dump.select(max("timestamp")).first().getLong(0)
println(s"last Timestamp is $ts")
- val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
+ val cnt =
+ if ("true".equalsIgnoreCase(spkipImport)) 1
+ else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
println(s"Imported from Datacite API $cnt documents")
if (cnt > 0) {
- val inputRdd: RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[Int], classOf[Text])
+ val inputRdd: RDD[DataciteType] = sc
+ .sequenceFile(targetPath, classOf[Int], classOf[Text])
.map(s => s._2.toString)
.map(s => convertAPIStringToDataciteItem(s))
spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset")
@@ -129,7 +144,9 @@ object ImportDatacite {
.agg(dataciteAggregator.toColumn)
.map(s => s._2)
.repartition(4000)
- .write.mode(SaveMode.Overwrite).save(s"${dataciteDump}_updated")
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"${dataciteDump}_updated")
val fs = FileSystem.get(sc.hadoopConfiguration)
fs.delete(new Path(s"$dataciteDump"), true)
@@ -137,14 +154,24 @@ object ImportDatacite {
}
}
- private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs: Int): Long = {
+ private def writeSequenceFile(
+ hdfsTargetPath: Path,
+ timestamp: Long,
+ conf: Configuration,
+ bs: Int
+ ): Long = {
var from: Long = timestamp * 1000
val delta: Long = 100000000L
var client: DataciteAPIImporter = null
val now: Long = System.currentTimeMillis()
var i = 0
try {
- val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text]))
+ val writer = SequenceFile.createWriter(
+ conf,
+ SequenceFile.Writer.file(hdfsTargetPath),
+ SequenceFile.Writer.keyClass(classOf[IntWritable]),
+ SequenceFile.Writer.valueClass(classOf[Text])
+ )
try {
var start: Long = System.currentTimeMillis
while (from < now) {
@@ -153,16 +180,16 @@ object ImportDatacite {
val key: IntWritable = new IntWritable(i)
val value: Text = new Text
while (client.hasNext) {
- key.set({
+ key.set {
i += 1;
i - 1
- })
+ }
value.set(client.next())
writer.append(key, value)
writer.hflush()
if (i % 1000 == 0) {
end = System.currentTimeMillis
- val time = (end - start) / 1000.0F
+ val time = (end - start) / 1000.0f
println(s"Imported $i in $time seconds")
start = System.currentTimeMillis
}
@@ -174,8 +201,7 @@ object ImportDatacite {
case e: Throwable =>
println("Error", e)
} finally if (writer != null) writer.close()
- }
- catch {
+ } catch {
case e: Throwable =>
log.error("Error", e)
}
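
The incremental import above merges the existing dump with the freshly downloaded records through a typed Aggregator, keeping for every key only the record with the larger timestamp. The same pattern in isolation, with a simplified record type (names and values are illustrative, not part of the codebase):

  import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
  import org.apache.spark.sql.expressions.Aggregator

  case class Rec(doi: String, timestamp: Long)

  // Keeps, for every key, the record with the highest timestamp.
  val latest = new Aggregator[Rec, Rec, Rec] with Serializable {
    override def zero: Rec = null
    override def reduce(a: Rec, b: Rec): Rec =
      if (a == null) b else if (b == null) a else if (a.timestamp >= b.timestamp) a else b
    override def merge(a: Rec, b: Rec): Rec = reduce(a, b)
    override def finish(r: Rec): Rec = r
    override def bufferEncoder: Encoder[Rec] = Encoders.product[Rec]
    override def outputEncoder: Encoder[Rec] = Encoders.product[Rec]
  }

  val spark = SparkSession.builder().master("local[*]").appName("aggregator-sketch").getOrCreate()
  import spark.implicits._

  val merged = Seq(Rec("10.1/a", 1L), Rec("10.1/a", 5L), Rec("10.1/b", 2L))
    .toDS()
    .groupByKey(_.doi)
    .agg(latest.toColumn)
    .map(_._2)

  merged.show() // one row per DOI, carrying the newest timestamp
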
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala
index d46e5423d..3e61edf02 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala
@@ -17,7 +17,13 @@ object SparkDownloadUpdateDatacite {
def main(args: Array[String]): Unit = {
val conf = new SparkConf
- val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")).mkString)
+ val parser = new ArgumentApplicationParser(
+ Source
+ .fromInputStream(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")
+ )
+ .mkString
+ )
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
@@ -26,8 +32,9 @@ object SparkDownloadUpdateDatacite {
val hdfsuri = parser.get("namenode")
log.info(s"namenode is $hdfsuri")
-
- val spark: SparkSession = SparkSession.builder().config(conf)
+ val spark: SparkSession = SparkSession
+ .builder()
+ .config(conf)
.appName(getClass.getSimpleName)
.master(master)
.getOrCreate()
@@ -37,13 +44,18 @@ object SparkDownloadUpdateDatacite {
import spark.implicits._
-
- val maxDate: String = spark.read.load(workingPath).as[Oaf].filter(s => s.isInstanceOf[Result]).map(r => r.asInstanceOf[Result].getDateofcollection).select(max("value")).first().getString(0)
+ val maxDate: String = spark.read
+ .load(workingPath)
+ .as[Oaf]
+ .filter(s => s.isInstanceOf[Result])
+ .map(r => r.asInstanceOf[Result].getDateofcollection)
+ .select(max("value"))
+ .first()
+ .getString(0)
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
val string_to_date = ISO8601FORMAT.parse(maxDate)
val ts = string_to_date.getTime
-
}
}
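
The lower bound for the incremental download is the latest dateofcollection found in the existing graph, parsed with the ISO-8601 pattern shown above. A standalone check of that parsing step (the timestamp literal is only an example):

  import java.text.SimpleDateFormat
  import java.util.Locale

  val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
  // Example value; in the job this string comes from the graph itself.
  val ts = ISO8601FORMAT.parse("2021-10-01T12:30:00+0000").getTime
  println(ts) // epoch milliseconds used as the starting point of the update
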
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala
index 853b24862..ffdab1799 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala
@@ -12,39 +12,81 @@ object BioDBToOAF {
case class EBILinkItem(id: Long, links: String) {}
- case class EBILinks(relType: String, date: String, title: String, pmid: String, targetPid: String, targetPidType: String, targetUrl: String) {}
+ case class EBILinks(
+ relType: String,
+ date: String,
+ title: String,
+ pmid: String,
+ targetPid: String,
+ targetPidType: String,
+ targetUrl: String
+ ) {}
case class UniprotDate(date: String, date_info: String) {}
- case class ScholixResolved(pid: String, pidType: String, typology: String, tilte: List[String], datasource: List[String], date: List[String], authors: List[String]) {}
+ case class ScholixResolved(
+ pid: String,
+ pidType: String,
+ typology: String,
+ tilte: List[String],
+ datasource: List[String],
+ date: List[String],
+ authors: List[String]
+ ) {}
- val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
+ val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(
+ false,
+ null,
+ false,
+ false,
+ ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
+ "0.9"
+ )
val SUBJ_CLASS = "Keywords"
val DATE_RELATION_KEY = "RelationDate"
val resolvedURL: Map[String, String] = Map(
- "genbank" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
- "ncbi-n" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
- "ncbi-wgs" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
- "ncbi-p" -> "https://www.ncbi.nlm.nih.gov/protein/",
- "ena" -> "https://www.ebi.ac.uk/ena/browser/view/",
+ "genbank" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
+ "ncbi-n" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
+ "ncbi-wgs" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
+ "ncbi-p" -> "https://www.ncbi.nlm.nih.gov/protein/",
+ "ena" -> "https://www.ebi.ac.uk/ena/browser/view/",
"clinicaltrials.gov" -> "https://clinicaltrials.gov/ct2/show/",
- "onim" -> "https://omim.org/entry/",
- "refseq" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
- "geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
+ "onim" -> "https://omim.org/entry/",
+ "refseq" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
+ "geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
)
-
val collectedFromMap: Map[String, KeyValue] = {
- val PDBCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2", "Protein Data Bank")
- val enaCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::c2a591f440598b63d854556beaf01591", "European Nucleotide Archive")
- val ncbiCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6", "NCBI Nucleotide")
- val UNIPROTCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::296e1abaf1302897a6838d3588cd0310", "UniProtKB/Swiss-Prot")
- val ElsevierCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
- val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature")
- val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::83e60e09c222f206c725385f53d7e567c", "EMBL-EBIs Protein Data Bank in Europe (PDBe)")
- val pubmedCollectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
+ val PDBCollectedFrom: KeyValue = OafMapperUtils.keyValue(
+ "10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2",
+ "Protein Data Bank"
+ )
+ val enaCollectedFrom: KeyValue = OafMapperUtils.keyValue(
+ "10|re3data_____::c2a591f440598b63d854556beaf01591",
+ "European Nucleotide Archive"
+ )
+ val ncbiCollectedFrom: KeyValue = OafMapperUtils.keyValue(
+ "10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6",
+ "NCBI Nucleotide"
+ )
+ val UNIPROTCollectedFrom: KeyValue = OafMapperUtils.keyValue(
+ "10|re3data_____::296e1abaf1302897a6838d3588cd0310",
+ "UniProtKB/Swiss-Prot"
+ )
+ val ElsevierCollectedFrom: KeyValue =
+ OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
+ val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue(
+ "10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e",
+ "Springer Nature"
+ )
+ val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue(
+ "10|opendoar____::83e60e09c222f206c725385f53d7e567c",
+ "EMBL-EBIs Protein Data Bank in Europe (PDBe)"
+ )
+ val pubmedCollectedFrom: KeyValue =
+ OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
UNIPROTCollectedFrom.setDataInfo(DATA_INFO)
PDBCollectedFrom.setDataInfo(DATA_INFO)
@@ -56,14 +98,14 @@ object BioDBToOAF {
springerNatureCollectedFrom.setDataInfo(DATA_INFO)
Map(
- "uniprot" -> UNIPROTCollectedFrom,
- "pdb" -> PDBCollectedFrom,
- "elsevier" -> ElsevierCollectedFrom,
- "ebi" -> EBICollectedFrom,
- "Springer Nature" -> springerNatureCollectedFrom,
- "NCBI Nucleotide" -> ncbiCollectedFrom,
+ "uniprot" -> UNIPROTCollectedFrom,
+ "pdb" -> PDBCollectedFrom,
+ "elsevier" -> ElsevierCollectedFrom,
+ "ebi" -> EBICollectedFrom,
+ "Springer Nature" -> springerNatureCollectedFrom,
+ "NCBI Nucleotide" -> ncbiCollectedFrom,
"European Nucleotide Archive" -> enaCollectedFrom,
- "Europe PMC" -> pubmedCollectedFrom
+ "Europe PMC" -> pubmedCollectedFrom
)
}
@@ -80,18 +122,32 @@ object BioDBToOAF {
val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String])
- createRelation(target_pid, target_pid_type, generate_unresolved_id(source_pid, source_pid_type), collectedFromMap("elsevier"), "relationship", relation_semantic, date)
+ createRelation(
+ target_pid,
+ target_pid_type,
+ generate_unresolved_id(source_pid, source_pid_type),
+ collectedFromMap("elsevier"),
+ "relationship",
+ relation_semantic,
+ date
+ )
}
-
def scholixResolvedToOAF(input: ScholixResolved): Oaf = {
val d = new Dataset
d.setPid(
List(
- OafMapperUtils.structuredProperty(input.pid.toLowerCase, input.pidType.toLowerCase, input.pidType.toLowerCase, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
+ OafMapperUtils.structuredProperty(
+ input.pid.toLowerCase,
+ input.pidType.toLowerCase,
+ input.pidType.toLowerCase,
+ ModelConstants.DNET_PID_TYPES,
+ ModelConstants.DNET_PID_TYPES,
+ DATA_INFO
+ )
).asJava
)
@@ -101,7 +157,15 @@ object BioDBToOAF {
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.pid.toLowerCase}", true))
if (input.tilte != null && input.tilte.nonEmpty)
- d.setTitle(List(OafMapperUtils.structuredProperty(input.tilte.head, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
+ d.setTitle(
+ List(
+ OafMapperUtils.structuredProperty(
+ input.tilte.head,
+ ModelConstants.MAIN_TITLE_QUALIFIER,
+ DATA_INFO
+ )
+ ).asJava
+ )
d.setOriginalId(List(input.pid).asJava)
val i = new Instance
@@ -113,9 +177,23 @@ object BioDBToOAF {
}
if (input.pidType.equalsIgnoreCase("clinicaltrials.gov"))
- i.setInstancetype(OafMapperUtils.qualifier("0037", "Clinical Trial", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
+ i.setInstancetype(
+ OafMapperUtils.qualifier(
+ "0037",
+ "Clinical Trial",
+ ModelConstants.DNET_PUBLICATION_RESOURCE,
+ ModelConstants.DNET_PUBLICATION_RESOURCE
+ )
+ )
else
- i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
+ i.setInstancetype(
+ OafMapperUtils.qualifier(
+ "0046",
+ "Bioentity",
+ ModelConstants.DNET_PUBLICATION_RESOURCE,
+ ModelConstants.DNET_PUBLICATION_RESOURCE
+ )
+ )
if (input.datasource == null || input.datasource.isEmpty)
return null
@@ -141,7 +219,6 @@ object BioDBToOAF {
d
}
-
def uniprotToOAF(input: String): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
@@ -151,7 +228,14 @@ object BioDBToOAF {
d.setPid(
List(
- OafMapperUtils.structuredProperty(pid, "uniprot", "uniprot", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
+ OafMapperUtils.structuredProperty(
+ pid,
+ "uniprot",
+ "uniprot",
+ ModelConstants.DNET_PID_TYPES,
+ ModelConstants.DNET_PID_TYPES,
+ DATA_INFO
+ )
).asJava
)
@@ -162,32 +246,52 @@ object BioDBToOAF {
val title: String = (json \ "title").extractOrElse[String](null)
if (title != null)
- d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
+ d.setTitle(
+ List(
+ OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)
+ ).asJava
+ )
d.setOriginalId(List(pid).asJava)
val i = new Instance
i.setPid(d.getPid)
i.setUrl(List(s"https://www.uniprot.org/uniprot/$pid").asJava)
- i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
+ i.setInstancetype(
+ OafMapperUtils.qualifier(
+ "0046",
+ "Bioentity",
+ ModelConstants.DNET_PUBLICATION_RESOURCE,
+ ModelConstants.DNET_PUBLICATION_RESOURCE
+ )
+ )
i.setCollectedfrom(collectedFromMap("uniprot"))
d.setInstance(List(i).asJava)
val dates: List[UniprotDate] = for {
- JObject(dateOBJ) <- json \ "dates"
- JField("date", JString(date)) <- dateOBJ
+ JObject(dateOBJ) <- json \ "dates"
+ JField("date", JString(date)) <- dateOBJ
JField("date_info", JString(date_info)) <- dateOBJ
} yield UniprotDate(GraphCleaningFunctions.cleanDate(date), date_info)
val subjects: List[String] = (json \\ "subjects").extractOrElse[List[String]](null)
-
if (subjects != null) {
d.setSubject(
- subjects.map(s =>
- OafMapperUtils.structuredProperty(s, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
- ).asJava)
+ subjects
+ .map(s =>
+ OafMapperUtils.structuredProperty(
+ s,
+ SUBJ_CLASS,
+ SUBJ_CLASS,
+ ModelConstants.DNET_SUBJECT_TYPOLOGIES,
+ ModelConstants.DNET_SUBJECT_TYPOLOGIES,
+ null
+ )
+ )
+ .asJava
+ )
}
var i_date: Option[UniprotDate] = None
@@ -197,45 +301,73 @@ object BioDBToOAF {
i.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
}
- val relevant_dates: List[StructuredProperty] = dates.filter(d => !d.date_info.contains("entry version"))
- .map(date => OafMapperUtils.structuredProperty(date.date, ModelConstants.UNKNOWN, ModelConstants.UNKNOWN, ModelConstants.DNET_DATACITE_DATE, ModelConstants.DNET_DATACITE_DATE, DATA_INFO))
+ val relevant_dates: List[StructuredProperty] = dates
+ .filter(d => !d.date_info.contains("entry version"))
+ .map(date =>
+ OafMapperUtils.structuredProperty(
+ date.date,
+ ModelConstants.UNKNOWN,
+ ModelConstants.UNKNOWN,
+ ModelConstants.DNET_DATACITE_DATE,
+ ModelConstants.DNET_DATACITE_DATE,
+ DATA_INFO
+ )
+ )
if (relevant_dates != null && relevant_dates.nonEmpty)
d.setRelevantdate(relevant_dates.asJava)
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
}
-
val references_pmid: List[String] = for {
- JObject(reference) <- json \ "references"
+ JObject(reference) <- json \ "references"
JField("PubMed", JString(pid)) <- reference
} yield pid
val references_doi: List[String] = for {
- JObject(reference) <- json \ "references"
+ JObject(reference) <- json \ "references"
JField(" DOI", JString(pid)) <- reference
} yield pid
-
if (references_pmid != null && references_pmid.nonEmpty) {
- val rel = createRelation(references_pmid.head, "pmid", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
+ val rel = createRelation(
+ references_pmid.head,
+ "pmid",
+ d.getId,
+ collectedFromMap("uniprot"),
+ ModelConstants.RELATIONSHIP,
+ ModelConstants.IS_RELATED_TO,
+ if (i_date.isDefined) i_date.get.date else null
+ )
rel.getCollectedfrom
List(d, rel)
- }
- else if (references_doi != null && references_doi.nonEmpty) {
- val rel = createRelation(references_doi.head, "doi", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
+ } else if (references_doi != null && references_doi.nonEmpty) {
+ val rel = createRelation(
+ references_doi.head,
+ "doi",
+ d.getId,
+ collectedFromMap("uniprot"),
+ ModelConstants.RELATIONSHIP,
+ ModelConstants.IS_RELATED_TO,
+ if (i_date.isDefined) i_date.get.date else null
+ )
List(d, rel)
- }
- else
+ } else
List(d)
}
-
def generate_unresolved_id(pid: String, pidType: String): String = {
s"unresolved::$pid::$pidType"
}
-
- def createRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, subRelType: String, relClass: String, date: String): Relation = {
+ def createRelation(
+ pid: String,
+ pidType: String,
+ sourceId: String,
+ collectedFrom: KeyValue,
+ subRelType: String,
+ relClass: String,
+ date: String
+ ): Relation = {
val rel = new Relation
rel.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
@@ -248,7 +380,6 @@ object BioDBToOAF {
rel.setSource(sourceId)
rel.setTarget(s"unresolved::$pid::$pidType")
-
val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
rel.setProperties(List(dateProps).asJava)
@@ -259,12 +390,24 @@ object BioDBToOAF {
}
-
- def createSupplementaryRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, date: String): Relation = {
- createRelation(pid, pidType, sourceId, collectedFrom, ModelConstants.SUPPLEMENT, ModelConstants.IS_SUPPLEMENT_TO, date)
+ def createSupplementaryRelation(
+ pid: String,
+ pidType: String,
+ sourceId: String,
+ collectedFrom: KeyValue,
+ date: String
+ ): Relation = {
+ createRelation(
+ pid,
+ pidType,
+ sourceId,
+ collectedFrom,
+ ModelConstants.SUPPLEMENT,
+ ModelConstants.IS_SUPPLEMENT_TO,
+ date
+ )
}
-
def pdbTOOaf(input: String): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
@@ -277,7 +420,14 @@ object BioDBToOAF {
d.setPid(
List(
- OafMapperUtils.structuredProperty(pdb, "pdb", "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
+ OafMapperUtils.structuredProperty(
+ pdb,
+ "pdb",
+ "Protein Data Bank Identifier",
+ ModelConstants.DNET_PID_TYPES,
+ ModelConstants.DNET_PID_TYPES,
+ DATA_INFO
+ )
).asJava
)
@@ -290,13 +440,16 @@ object BioDBToOAF {
if (title == null)
return List()
- d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
+ d.setTitle(
+ List(
+ OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)
+ ).asJava
+ )
val authors: List[String] = (json \ "authors").extractOrElse[List[String]](null)
if (authors != null) {
val convertedAuthors = authors.zipWithIndex.map { a =>
-
val res = new Author
res.setFullname(a._1)
res.setRank(a._2 + 1)
@@ -310,7 +463,14 @@ object BioDBToOAF {
i.setPid(d.getPid)
i.setUrl(List(s"https://www.rcsb.org/structure/$pdb").asJava)
- i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
+ i.setInstancetype(
+ OafMapperUtils.qualifier(
+ "0046",
+ "Bioentity",
+ ModelConstants.DNET_PUBLICATION_RESOURCE,
+ ModelConstants.DNET_PUBLICATION_RESOURCE
+ )
+ )
i.setCollectedfrom(collectedFromMap("pdb"))
d.setInstance(List(i).asJava)
@@ -323,7 +483,6 @@ object BioDBToOAF {
List(d)
}
-
def extractEBILinksFromDump(input: String): EBILinkItem = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
@@ -333,49 +492,70 @@ object BioDBToOAF {
EBILinkItem(pmid.toLong, compact(render(links)))
}
-
def EBITargetLinksFilter(input: EBILinks): Boolean = {
- input.targetPidType.equalsIgnoreCase("ena") || input.targetPidType.equalsIgnoreCase("pdb") || input.targetPidType.equalsIgnoreCase("uniprot")
+ input.targetPidType.equalsIgnoreCase("ena") || input.targetPidType.equalsIgnoreCase(
+ "pdb"
+ ) || input.targetPidType.equalsIgnoreCase("uniprot")
}
-
def parse_ebi_links(input: String): List[EBILinks] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
val pmid = (json \ "request" \ "id").extract[String]
for {
- JObject(link) <- json \\ "Link"
- JField("Target", JObject(target)) <- link
- JField("RelationshipType", JObject(relType)) <- link
- JField("Name", JString(relation)) <- relType
+ JObject(link) <- json \\ "Link"
+ JField("Target", JObject(target)) <- link
+ JField("RelationshipType", JObject(relType)) <- link
+ JField("Name", JString(relation)) <- relType
JField("PublicationDate", JString(publicationDate)) <- link
- JField("Title", JString(title)) <- target
- JField("Identifier", JObject(identifier)) <- target
- JField("IDScheme", JString(idScheme)) <- identifier
- JField("IDURL", JString(idUrl)) <- identifier
- JField("ID", JString(id)) <- identifier
+ JField("Title", JString(title)) <- target
+ JField("Identifier", JObject(identifier)) <- target
+ JField("IDScheme", JString(idScheme)) <- identifier
+ JField("IDURL", JString(idUrl)) <- identifier
+ JField("ID", JString(id)) <- identifier
- } yield EBILinks(relation, GraphCleaningFunctions.cleanDate(publicationDate), title, pmid, id, idScheme, idUrl)
+ } yield EBILinks(
+ relation,
+ GraphCleaningFunctions.cleanDate(publicationDate),
+ title,
+ pmid,
+ id,
+ idScheme,
+ idUrl
+ )
}
-
def convertEBILinksToOaf(input: EBILinks): List[Oaf] = {
val d = new Dataset
d.setCollectedfrom(List(collectedFromMap("ebi")).asJava)
d.setDataInfo(DATA_INFO)
- d.setTitle(List(OafMapperUtils.structuredProperty(input.title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
+ d.setTitle(
+ List(
+ OafMapperUtils.structuredProperty(
+ input.title,
+ ModelConstants.MAIN_TITLE_QUALIFIER,
+ DATA_INFO
+ )
+ ).asJava
+ )
val nsPrefix = input.targetPidType.toLowerCase.padTo(12, '_')
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.targetPid.toLowerCase}", true))
d.setOriginalId(List(input.targetPid.toLowerCase).asJava)
-
d.setPid(
List(
- OafMapperUtils.structuredProperty(input.targetPid.toLowerCase, input.targetPidType.toLowerCase, "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
+ OafMapperUtils.structuredProperty(
+ input.targetPid.toLowerCase,
+ input.targetPidType.toLowerCase,
+ "Protein Data Bank Identifier",
+ ModelConstants.DNET_PID_TYPES,
+ ModelConstants.DNET_PID_TYPES,
+ DATA_INFO
+ )
).asJava
)
@@ -383,13 +563,35 @@ object BioDBToOAF {
i.setPid(d.getPid)
i.setUrl(List(input.targetUrl).asJava)
- i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
+ i.setInstancetype(
+ OafMapperUtils.qualifier(
+ "0046",
+ "Bioentity",
+ ModelConstants.DNET_PUBLICATION_RESOURCE,
+ ModelConstants.DNET_PUBLICATION_RESOURCE
+ )
+ )
i.setCollectedfrom(collectedFromMap("ebi"))
d.setInstance(List(i).asJava)
- i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO))
- d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO))
+ i.setDateofacceptance(
+ OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO)
+ )
+ d.setDateofacceptance(
+ OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO)
+ )
- List(d, createRelation(input.pmid, "pmid", d.getId, collectedFromMap("ebi"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, GraphCleaningFunctions.cleanDate(input.date)))
+ List(
+ d,
+ createRelation(
+ input.pmid,
+ "pmid",
+ d.getId,
+ collectedFromMap("ebi"),
+ ModelConstants.RELATIONSHIP,
+ ModelConstants.IS_RELATED_TO,
+ GraphCleaningFunctions.cleanDate(input.date)
+ )
+ )
}
}
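
Throughout BioDBToOAF, fields are extracted from the json4s AST with for-comprehensions over JObject/JField patterns, which silently skip elements that do not match. A self-contained sketch of the idiom on a toy payload (the JSON literal is made up for illustration):

  import org.json4s._
  import org.json4s.jackson.JsonMethods.parse

  implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats

  lazy val json = parse("""{"references":[{"PubMed":"12345"},{"DOI":"10.1/x"},{"PubMed":"67890"}]}""")

  // Collects only the references that actually carry a PubMed field.
  val references_pmid: List[String] = for {
    JObject(reference)             <- json \ "references"
    JField("PubMed", JString(pid)) <- reference
  } yield pid

  println(references_pmid) // List(12345, 67890)
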
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala
index 27caa8f36..96075b4f3 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala
@@ -14,7 +14,11 @@ object SparkTransformBioDatabaseToOAF {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
val log: Logger = LoggerFactory.getLogger(getClass)
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json")
+ )
+ )
parser.parseArgument(args)
val database: String = parser.get("database")
log.info("database: {}", database)
@@ -29,20 +33,33 @@ object SparkTransformBioDatabaseToOAF {
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
+ .master(parser.get("master"))
+ .getOrCreate()
val sc = spark.sparkContext
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
import spark.implicits._
database.toUpperCase() match {
case "UNIPROT" =>
- CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))), targetPath)
+ CollectionUtils.saveDataset(
+ spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))),
+ targetPath
+ )
case "PDB" =>
- CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))), targetPath)
+ CollectionUtils.saveDataset(
+ spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))),
+ targetPath
+ )
case "SCHOLIX" =>
- CollectionUtils.saveDataset(spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)), targetPath)
+ CollectionUtils.saveDataset(
+ spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)),
+ targetPath
+ )
case "CROSSREF_LINKS" =>
- CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))), targetPath)
+ CollectionUtils.saveDataset(
+ spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))),
+ targetPath
+ )
}
}
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala
index 0fea4ff7f..9c55ec7be 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala
@@ -24,31 +24,37 @@ import scala.xml.pull.XMLEventReader
object SparkCreateBaselineDataFrame {
-
def requestBaseLineUpdatePage(maxFile: String): List[(String, String)] = {
val data = requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/")
-    val result = data.lines.filter(l => l.startsWith("<a href=")).map { l =>
-      val end = l.lastIndexOf("\">")
-      val start = l.indexOf("<a href=\"")
-      if (start >= 0 && end > start)
-        l.substring(start + 9, end - start)
-      else
-        ""
-    }.filter(s => s.endsWith(".gz")).filter(s => s > maxFile).map(s => (s, s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s")).toList
+    val result = data.lines
+      .filter(l => l.startsWith("<a href="))
+      .map { l =>
+        val end = l.lastIndexOf("\">")
+        val start = l.indexOf("<a href=\"")
+        if (start >= 0 && end > start)
+          l.substring(start + 9, end - start)
+        else
+          ""
+      }
+ .filter(s => s.endsWith(".gz"))
+ .filter(s => s > maxFile)
+ .map(s => (s, s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s"))
+ .toList
result
}
-
def downloadBaselinePart(url: String): InputStream = {
val r = new HttpGet(url)
val timeout = 60; // seconds
- val config = RequestConfig.custom()
+ val config = RequestConfig
+ .custom()
.setConnectTimeout(timeout * 1000)
.setConnectionRequestTimeout(timeout * 1000)
- .setSocketTimeout(timeout * 1000).build()
+ .setSocketTimeout(timeout * 1000)
+ .build()
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
val response = client.execute(r)
println(s"get response with status${response.getStatusLine.getStatusCode}")
@@ -59,10 +65,12 @@ object SparkCreateBaselineDataFrame {
def requestPage(url: String): String = {
val r = new HttpGet(url)
val timeout = 60; // seconds
- val config = RequestConfig.custom()
+ val config = RequestConfig
+ .custom()
.setConnectTimeout(timeout * 1000)
.setConnectionRequestTimeout(timeout * 1000)
- .setSocketTimeout(timeout * 1000).build()
+ .setSocketTimeout(timeout * 1000)
+ .build()
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
try {
var tries = 4
@@ -73,8 +81,7 @@ object SparkCreateBaselineDataFrame {
println(s"get response with status${response.getStatusLine.getStatusCode}")
if (response.getStatusLine.getStatusCode > 400) {
tries -= 1
- }
- else
+ } else
return IOUtils.toString(response.getEntity.getContent)
} catch {
case e: Throwable =>
@@ -90,10 +97,8 @@ object SparkCreateBaselineDataFrame {
}
}
-
def downloadBaseLineUpdate(baselinePath: String, hdfsServerUri: String): Unit = {
-
val conf = new Configuration
conf.set("fs.defaultFS", hdfsServerUri)
val fs = FileSystem.get(conf)
@@ -122,31 +127,36 @@ object SparkCreateBaselineDataFrame {
}
+ val pmArticleAggregator: Aggregator[(String, PMArticle), PMArticle, PMArticle] =
+ new Aggregator[(String, PMArticle), PMArticle, PMArticle] with Serializable {
+ override def zero: PMArticle = new PMArticle
- val pmArticleAggregator: Aggregator[(String, PMArticle), PMArticle, PMArticle] = new Aggregator[(String, PMArticle), PMArticle, PMArticle] with Serializable {
- override def zero: PMArticle = new PMArticle
+ override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle = {
+ if (b != null && b.getPmid != null) b else a._2
+ }
- override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle = {
- if (b != null && b.getPmid != null) b else a._2
+ override def merge(b1: PMArticle, b2: PMArticle): PMArticle = {
+ if (b1 != null && b1.getPmid != null) b1 else b2
+
+ }
+
+ override def finish(reduction: PMArticle): PMArticle = reduction
+
+ override def bufferEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
+
+ override def outputEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
}
- override def merge(b1: PMArticle, b2: PMArticle): PMArticle = {
- if (b1 != null && b1.getPmid != null) b1 else b2
-
- }
-
- override def finish(reduction: PMArticle): PMArticle = reduction
-
- override def bufferEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
-
- override def outputEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
- }
-
-
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
val log: Logger = LoggerFactory.getLogger(getClass)
- val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEBILinksToOaf.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ SparkEBILinksToOaf.getClass.getResourceAsStream(
+ "/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
+ )
+ )
+ )
parser.parseArgument(args)
val isLookupUrl: String = parser.get("isLookupUrl")
log.info("isLookupUrl: {}", isLookupUrl)
@@ -162,7 +172,6 @@ object SparkCreateBaselineDataFrame {
val skipUpdate = parser.get("skipUpdate")
log.info("skipUpdate: {}", skipUpdate)
-
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
val spark: SparkSession =
@@ -170,7 +179,8 @@ object SparkCreateBaselineDataFrame {
.builder()
.config(conf)
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
+ .master(parser.get("master"))
+ .getOrCreate()
val sc = spark.sparkContext
import spark.implicits._
@@ -183,20 +193,30 @@ object SparkCreateBaselineDataFrame {
if (!"true".equalsIgnoreCase(skipUpdate)) {
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
- val ds: Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i => {
- val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
- new PMParser(xml)
- }))
- ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder)).groupByKey(_._1)
+ val ds: Dataset[PMArticle] = spark.createDataset(
+ k.filter(i => i._1.endsWith(".gz"))
+ .flatMap(i => {
+ val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
+ new PMParser(xml)
+ })
+ )
+ ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder))
+ .groupByKey(_._1)
.agg(pmArticleAggregator.toColumn)
- .map(p => p._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
+ .map(p => p._2)
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingPath/baseline_dataset")
}
val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle]
- CollectionUtils.saveDataset(exported_dataset
- .map(a => PubMedToOaf.convert(a, vocabularies)).as[Oaf]
- .filter(p => p != null),
- targetPath)
+ CollectionUtils.saveDataset(
+ exported_dataset
+ .map(a => PubMedToOaf.convert(a, vocabularies))
+ .as[Oaf]
+ .filter(p => p != null),
+ targetPath
+ )
}
}
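
requestPage wraps an Apache HttpClient GET with explicit connect and socket timeouts and a small retry budget before giving up. The essence of that pattern, stripped of the PubMed-specific parts (the URL below is a placeholder):

  import org.apache.commons.io.IOUtils
  import org.apache.http.client.config.RequestConfig
  import org.apache.http.client.methods.HttpGet
  import org.apache.http.impl.client.HttpClientBuilder

  def fetchWithRetry(url: String, retries: Int = 4): String = {
    val timeout = 60 // seconds
    val config = RequestConfig
      .custom()
      .setConnectTimeout(timeout * 1000)
      .setConnectionRequestTimeout(timeout * 1000)
      .setSocketTimeout(timeout * 1000)
      .build()
    val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
    try {
      var tries = retries
      while (tries > 0) {
        val response = client.execute(new HttpGet(url))
        if (response.getStatusLine.getStatusCode < 400)
          return IOUtils.toString(response.getEntity.getContent)
        tries -= 1
      }
      ""
    } finally client.close()
  }

  // println(fetchWithRetry("https://example.org/some/listing"))
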
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala
index 18e39387f..44e9e22ea 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala
@@ -25,10 +25,12 @@ object SparkDownloadEBILinks {
def requestPage(url: String): String = {
val r = new HttpGet(url)
val timeout = 60; // seconds
- val config = RequestConfig.custom()
+ val config = RequestConfig
+ .custom()
.setConnectTimeout(timeout * 1000)
.setConnectionRequestTimeout(timeout * 1000)
- .setSocketTimeout(timeout * 1000).build()
+ .setSocketTimeout(timeout * 1000)
+ .build()
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
try {
var tries = 4
@@ -39,8 +41,7 @@ object SparkDownloadEBILinks {
println(s"get response with status${response.getStatusLine.getStatusCode}")
if (response.getStatusLine.getStatusCode > 400) {
tries -= 1
- }
- else
+ } else
return IOUtils.toString(response.getEntity.getContent)
} catch {
case e: Throwable =>
@@ -66,14 +67,19 @@ object SparkDownloadEBILinks {
val log: Logger = LoggerFactory.getLogger(getClass)
val MAX_ITEM_PER_PARTITION = 20000
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json")
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
+ .master(parser.get("master"))
+ .getOrCreate()
import spark.implicits._
@@ -87,22 +93,40 @@ object SparkDownloadEBILinks {
log.info(s"workingPath -> $workingPath")
log.info("Getting max pubmedId where the links have already requested")
- val links: Dataset[EBILinkItem] = spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
+ val links: Dataset[EBILinkItem] =
+ spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
val lastPMIDRequested = links.map(l => l.id).select(max("value")).first.getLong(0)
log.info("Retrieving PMID to request links")
val pubmed = spark.read.load(s"$sourcePath/baseline_dataset").as[PMArticle]
- pubmed.map(p => p.getPmid.toLong).where(s"value > $lastPMIDRequested").write.mode(SaveMode.Overwrite).save(s"$workingPath/id_to_request")
+ pubmed
+ .map(p => p.getPmid.toLong)
+ .where(s"value > $lastPMIDRequested")
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingPath/id_to_request")
val pmidToReq: Dataset[Long] = spark.read.load(s"$workingPath/id_to_request").as[Long]
val total = pmidToReq.count()
- spark.createDataset(pmidToReq.rdd.repartition((total / MAX_ITEM_PER_PARTITION).toInt).map(pmid => createEBILinks(pmid)).filter(l => l != null)).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_update")
+ spark
+ .createDataset(
+ pmidToReq.rdd
+ .repartition((total / MAX_ITEM_PER_PARTITION).toInt)
+ .map(pmid => createEBILinks(pmid))
+ .filter(l => l != null)
+ )
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingPath/links_update")
- val updates: Dataset[EBILinkItem] = spark.read.load(s"$workingPath/links_update").as[EBILinkItem]
+ val updates: Dataset[EBILinkItem] =
+ spark.read.load(s"$workingPath/links_update").as[EBILinkItem]
- links.union(updates).groupByKey(_.id)
+ links
+ .union(updates)
+ .groupByKey(_.id)
.reduceGroups { (x, y) =>
if (x == null || x.links == null)
y
@@ -112,6 +136,10 @@ object SparkDownloadEBILinks {
x
else
y
- }.map(_._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_final")
+ }
+ .map(_._2)
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingPath/links_final")
}
}
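
Merging the previously harvested links with the new batch uses groupByKey followed by reduceGroups, so that a single EBILinkItem per id survives, preferring the version that actually carries a links payload. The same keep-one-per-key pattern on a toy dataset (the case class and values are illustrative):

  import org.apache.spark.sql.SparkSession

  case class Link(id: Long, links: String)

  val spark = SparkSession.builder().master("local[*]").appName("reduce-groups-sketch").getOrCreate()
  import spark.implicits._

  val previous = Seq(Link(1L, null), Link(2L, """{"a":1}""")).toDS()
  val updates  = Seq(Link(1L, """{"b":2}""")).toDS()

  // One record per id: keep the entry whose links payload is non-null.
  val merged = previous
    .union(updates)
    .groupByKey(_.id)
    .reduceGroups((x, y) => if (x == null || x.links == null) y else x)
    .map(_._2)

  merged.show(truncate = false)
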
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala
index cd03f004d..7cb6153ff 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala
@@ -15,15 +15,19 @@ object SparkEBILinksToOaf {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json")
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
-
+ .master(parser.get("master"))
+ .getOrCreate()
import spark.implicits._
val sourcePath = parser.get("sourcePath")
@@ -32,11 +36,17 @@ object SparkEBILinksToOaf {
log.info(s"targetPath -> $targetPath")
implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
- val ebLinks: Dataset[EBILinkItem] = spark.read.load(sourcePath).as[EBILinkItem].filter(l => l.links != null && l.links.startsWith("{"))
+ val ebLinks: Dataset[EBILinkItem] = spark.read
+ .load(sourcePath)
+ .as[EBILinkItem]
+ .filter(l => l.links != null && l.links.startsWith("{"))
- CollectionUtils.saveDataset(ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
- .filter(p => BioDBToOAF.EBITargetLinksFilter(p))
- .flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)),
- targetPath)
+ CollectionUtils.saveDataset(
+ ebLinks
+ .flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
+ .filter(p => BioDBToOAF.EBITargetLinksFilter(p))
+ .flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)),
+ targetPath
+ )
}
}
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala
index c6d5fdf74..49a271641 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala
@@ -3,16 +3,13 @@ package eu.dnetlib.dhp.sx.bio.pubmed
import scala.xml.MetaData
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
+/** Pull-based parser that turns a stream of PubMed XML events into PMArticle records.
+  * @param xml the XMLEventReader over a PubMed baseline/update file
+  */
+class PMParser(xml: XMLEventReader) extends Iterator[PMArticle] {
-/**
- *
- * @param xml
- */
-class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
+ var currentArticle: PMArticle = generateNextArticle()
- var currentArticle:PMArticle = generateNextArticle()
-
- override def hasNext: Boolean = currentArticle!= null
+ override def hasNext: Boolean = currentArticle != null
override def next(): PMArticle = {
val tmp = currentArticle
@@ -20,33 +17,30 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
tmp
}
- def extractAttributes(attrs:MetaData, key:String):String = {
+ def extractAttributes(attrs: MetaData, key: String): String = {
val res = attrs.get(key)
if (res.isDefined) {
- val s =res.get
+ val s = res.get
if (s != null && s.nonEmpty)
s.head.text
else
null
- }
- else null
+ } else null
}
-
- def validate_Date(year:String, month:String, day:String):String = {
+ def validate_Date(year: String, month: String, day: String): String = {
try {
f"${year.toInt}-${month.toInt}%02d-${day.toInt}%02d"
} catch {
- case _: Throwable =>null
+ case _: Throwable => null
}
}
- def generateNextArticle():PMArticle = {
+ def generateNextArticle(): PMArticle = {
-
- var currentSubject:PMSubject = null
+ var currentSubject: PMSubject = null
var currentAuthor: PMAuthor = null
var currentJournal: PMJournal = null
var currentGrant: PMGrant = null
@@ -54,12 +48,7 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
var currentYear = "0"
var currentMonth = "01"
var currentDay = "01"
- var currentArticleType:String = null
-
-
-
-
-
+ var currentArticleType: String = null
while (xml.hasNext) {
xml.next match {
@@ -68,64 +57,67 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
label match {
case "PubmedArticle" => currentArticle = new PMArticle
- case "Author" => currentAuthor = new PMAuthor
- case "Journal" => currentJournal = new PMJournal
- case "Grant" => currentGrant = new PMGrant
+ case "Author" => currentAuthor = new PMAuthor
+ case "Journal" => currentJournal = new PMJournal
+ case "Grant" => currentGrant = new PMGrant
case "PublicationType" | "DescriptorName" =>
currentSubject = new PMSubject
currentSubject.setMeshId(extractAttributes(attrs, "UI"))
- case "ArticleId" => currentArticleType = extractAttributes(attrs,"IdType")
- case _ =>
+ case "ArticleId" => currentArticleType = extractAttributes(attrs, "IdType")
+ case _ =>
}
case EvElemEnd(_, label) =>
label match {
case "PubmedArticle" => return currentArticle
- case "Author" => currentArticle.getAuthors.add(currentAuthor)
- case "Journal" => currentArticle.setJournal(currentJournal)
- case "Grant" => currentArticle.getGrants.add(currentGrant)
- case "PubMedPubDate" => if (currentArticle.getDate== null)
- currentArticle.setDate(validate_Date(currentYear,currentMonth,currentDay))
- case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
- case "DescriptorName" => currentArticle.getSubjects.add(currentSubject)
- case "PublicationType" =>currentArticle.getPublicationTypes.add(currentSubject)
- case _ =>
+ case "Author" => currentArticle.getAuthors.add(currentAuthor)
+ case "Journal" => currentArticle.setJournal(currentJournal)
+ case "Grant" => currentArticle.getGrants.add(currentGrant)
+ case "PubMedPubDate" =>
+ if (currentArticle.getDate == null)
+ currentArticle.setDate(validate_Date(currentYear, currentMonth, currentDay))
+ case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
+ case "DescriptorName" => currentArticle.getSubjects.add(currentSubject)
+ case "PublicationType" => currentArticle.getPublicationTypes.add(currentSubject)
+ case _ =>
}
case EvText(text) =>
- if (currNode!= null && text.trim.nonEmpty)
+ if (currNode != null && text.trim.nonEmpty)
currNode match {
case "ArticleTitle" => {
- if (currentArticle.getTitle==null)
+ if (currentArticle.getTitle == null)
currentArticle.setTitle(text.trim)
else
currentArticle.setTitle(currentArticle.getTitle + text.trim)
}
case "AbstractText" => {
- if (currentArticle.getDescription==null)
+ if (currentArticle.getDescription == null)
currentArticle.setDescription(text.trim)
else
currentArticle.setDescription(currentArticle.getDescription + text.trim)
}
case "PMID" => currentArticle.setPmid(text.trim)
- case "ArticleId" => if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim)
- case "Language" => currentArticle.setLanguage(text.trim)
- case "ISSN" => currentJournal.setIssn(text.trim)
- case "GrantID" => currentGrant.setGrantID(text.trim)
- case "Agency" => currentGrant.setAgency(text.trim)
- case "Country" => if (currentGrant != null) currentGrant.setCountry(text.trim)
- case "Year" => currentYear = text.trim
- case "Month" => currentMonth = text.trim
- case "Day" => currentDay = text.trim
- case "Volume" => currentJournal.setVolume( text.trim)
- case "Issue" => currentJournal.setIssue (text.trim)
+ case "ArticleId" =>
+ if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim)
+ case "Language" => currentArticle.setLanguage(text.trim)
+ case "ISSN" => currentJournal.setIssn(text.trim)
+ case "GrantID" => currentGrant.setGrantID(text.trim)
+ case "Agency" => currentGrant.setAgency(text.trim)
+ case "Country" => if (currentGrant != null) currentGrant.setCountry(text.trim)
+ case "Year" => currentYear = text.trim
+ case "Month" => currentMonth = text.trim
+ case "Day" => currentDay = text.trim
+ case "Volume" => currentJournal.setVolume(text.trim)
+ case "Issue" => currentJournal.setIssue(text.trim)
case "PublicationType" | "DescriptorName" => currentSubject.setValue(text.trim)
case "LastName" => {
if (currentAuthor != null)
currentAuthor.setLastName(text.trim)
}
- case "ForeName" => if (currentAuthor != null)
- currentAuthor.setForeName(text.trim)
+ case "ForeName" =>
+ if (currentAuthor != null)
+ currentAuthor.setForeName(text.trim)
case "Title" =>
- if (currentJournal.getTitle==null)
+ if (currentJournal.getTitle == null)
currentJournal.setTitle(text.trim)
else
currentJournal.setTitle(currentJournal.getTitle + text.trim)
@@ -139,8 +131,3 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
null
}
}
-
-
-
-
-
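For context on the class reformatted above: PMParser is consumed as a plain Iterator[PMArticle], pre-fetching one article so that hasNext only checks the look-ahead slot. A minimal usage sketch, mirroring the existing BioScholixTest (the resource path is the one used by that test; everything else is illustrative):

import scala.io.Source
import scala.xml.pull.XMLEventReader

// Read a PubMed XML dump from the test resources and wrap it in a pull parser.
val inputXML = Source
  .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
  .mkString
val xml = new XMLEventReader(Source.fromString(inputXML))

// PMParser eagerly parses one PMArticle ahead; iteration ends when the
// look-ahead slot becomes null (no more PubmedArticle elements).
new PMParser(xml).foreach(article => println(article.getPmid))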
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala
index 65717adff..92ad22c57 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala
@@ -9,31 +9,38 @@ import collection.JavaConverters._
import java.util.regex.Pattern
/**
- *
- */
+ */
object PubMedToOaf {
val SUBJ_CLASS = "keywords"
+
val urlMap = Map(
"pmid" -> "https://pubmed.ncbi.nlm.nih.gov/",
- "doi" -> "https://dx.doi.org/"
+ "doi" -> "https://dx.doi.org/"
)
- val dataInfo: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
- val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
+ val dataInfo: DataInfo = OafMapperUtils.dataInfo(
+ false,
+ null,
+ false,
+ false,
+ ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
+ "0.9"
+ )
- /**
- * Cleaning the DOI Applying regex in order to
- * remove doi starting with URL
- *
- * @param doi input DOI
- * @return cleaned DOI
- */
+ val collectedFrom: KeyValue =
+ OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
+
+ /** Cleans the DOI, applying a regex in order to
+ * remove DOIs that start with a URL
+ *
+ * @param doi input DOI
+ * @return cleaned DOI
+ */
def cleanDoi(doi: String): String = {
val regex = "^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$"
-
val pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE)
val matcher = pattern.matcher(doi)
@@ -43,33 +50,34 @@ object PubMedToOaf {
null
}
- /**
- *
- * Create an instance of class extends Result
- * starting from OAF instanceType value
- *
- * @param cobjQualifier OAF instance type
- * @param vocabularies All dnet vocabularies
- * @return the correct instance
- */
+ /** Creates an instance of a class extending Result,
+ * starting from the OAF instanceType value
+ *
+ * @param cobjQualifier OAF instance type
+ * @param vocabularies All dnet vocabularies
+ * @return the correct instance
+ */
def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = {
- val result_typologies = getVocabularyTerm(ModelConstants.DNET_RESULT_TYPOLOGIES, vocabularies, cobjQualifier.getClassid)
+ val result_typologies = getVocabularyTerm(
+ ModelConstants.DNET_RESULT_TYPOLOGIES,
+ vocabularies,
+ cobjQualifier.getClassid
+ )
result_typologies.getClassid match {
- case "dataset" => new Dataset
+ case "dataset" => new Dataset
case "publication" => new Publication
- case "other" => new OtherResearchProduct
- case "software" => new Software
- case _ => null
+ case "other" => new OtherResearchProduct
+ case "software" => new Software
+ case _ => null
}
}
- /**
- * Mapping the Pubmedjournal info into the OAF Journale
- *
- * @param j the pubmedJournal
- * @return the OAF Journal
- */
+ /** Maps the PubMed journal info into the OAF Journal
+ *
+ * @param j the PubMed journal
+ * @return the OAF Journal
+ */
def mapJournal(j: PMJournal): Journal = {
if (j == null)
return null
@@ -83,40 +91,47 @@ object PubMedToOaf {
journal.setIss(j.getIssue)
journal
-
}
- /**
- *
- * Find vocabulary term into synonyms and term in the vocabulary
- *
- * @param vocabularyName the input vocabulary name
- * @param vocabularies all the vocabularies
- * @param term the term to find
- * @return the cleaned term value
- */
- def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = {
+ /** Finds a vocabulary term among the synonyms and terms of the vocabulary
+ *
+ * @param vocabularyName the input vocabulary name
+ * @param vocabularies all the vocabularies
+ * @param term the term to find
+ * @return the cleaned term value
+ */
+ def getVocabularyTerm(
+ vocabularyName: String,
+ vocabularies: VocabularyGroup,
+ term: String
+ ): Qualifier = {
val a = vocabularies.getSynonymAsQualifier(vocabularyName, term)
val b = vocabularies.getTermAsQualifier(vocabularyName, term)
if (a == null) b else a
}
-
- /**
- * Map the Pubmed Article into the OAF instance
- *
- * @param article the pubmed articles
- * @param vocabularies the vocabularies
- * @return The OAF instance if the mapping did not fail
- */
+ /** Maps the PubMed article into the OAF instance
+ *
+ * @param article the PubMed article
+ * @param vocabularies the vocabularies
+ * @return The OAF instance if the mapping did not fail
+ */
def convert(article: PMArticle, vocabularies: VocabularyGroup): Oaf = {
if (article.getPublicationTypes == null)
return null
-
// MAP PMID into pid with classid = classname = pmid
- val pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
+ val pidList: List[StructuredProperty] = List(
+ OafMapperUtils.structuredProperty(
+ article.getPmid,
+ PidType.pmid.toString,
+ PidType.pmid.toString,
+ ModelConstants.DNET_PID_TYPES,
+ ModelConstants.DNET_PID_TYPES,
+ dataInfo
+ )
+ )
if (pidList == null)
return null
@@ -125,7 +140,14 @@ object PubMedToOaf {
if (article.getDoi != null) {
val normalizedPid = cleanDoi(article.getDoi)
if (normalizedPid != null)
- alternateIdentifier = OafMapperUtils.structuredProperty(normalizedPid, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)
+ alternateIdentifier = OafMapperUtils.structuredProperty(
+ normalizedPid,
+ PidType.doi.toString,
+ PidType.doi.toString,
+ ModelConstants.DNET_PID_TYPES,
+ ModelConstants.DNET_PID_TYPES,
+ dataInfo
+ )
}
// INSTANCE MAPPING
@@ -133,10 +155,12 @@ object PubMedToOaf {
// If the article contains the typology Journal Article then we apply this type
//else We have to find a terms that match the vocabulary otherwise we discard it
- val ja = article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue))
+ val ja =
+ article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue))
val pubmedInstance = new Instance
if (ja.isDefined) {
- val cojbCategory = getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue)
+ val cojbCategory =
+ getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue)
pubmedInstance.setInstancetype(cojbCategory)
} else {
val i_type = article.getPublicationTypes.asScala
@@ -155,7 +179,9 @@ object PubMedToOaf {
if (alternateIdentifier != null)
pubmedInstance.setAlternateIdentifier(List(alternateIdentifier).asJava)
result.setInstance(List(pubmedInstance).asJava)
- pubmedInstance.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection.breakOut)
+ pubmedInstance.getPid.asScala
+ .filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid))
+ .map(p => p.getValue)(collection.breakOut)
//CREATE URL From pmid
val urlLists: List[String] = pidList
.map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue))
@@ -165,7 +191,9 @@ object PubMedToOaf {
pubmedInstance.setUrl(urlLists.asJava)
//ASSIGN DateofAcceptance
- pubmedInstance.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
+ pubmedInstance.setDateofacceptance(
+ OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)
+ )
//ASSIGN COLLECTEDFROM
pubmedInstance.setCollectedfrom(collectedFrom)
result.setPid(pidList.asJava)
@@ -173,7 +201,6 @@ object PubMedToOaf {
//END INSTANCE MAPPING
//--------------------------------------------------------------------------------------
-
// JOURNAL MAPPING
//--------------------------------------------------------------------------------------
if (article.getJournal != null && result.isInstanceOf[Publication])
@@ -182,31 +209,48 @@ object PubMedToOaf {
//END JOURNAL MAPPING
//--------------------------------------------------------------------------------------
-
// RESULT MAPPING
//--------------------------------------------------------------------------------------
- result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
+ result.setDateofacceptance(
+ OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)
+ )
if (article.getTitle == null || article.getTitle.isEmpty)
return null
- result.setTitle(List(OafMapperUtils.structuredProperty(article.getTitle, ModelConstants.MAIN_TITLE_QUALIFIER, dataInfo)).asJava)
+ result.setTitle(
+ List(
+ OafMapperUtils.structuredProperty(
+ article.getTitle,
+ ModelConstants.MAIN_TITLE_QUALIFIER,
+ dataInfo
+ )
+ ).asJava
+ )
if (article.getDescription != null && article.getDescription.nonEmpty)
result.setDescription(List(OafMapperUtils.field(article.getDescription, dataInfo)).asJava)
if (article.getLanguage != null) {
- val term = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, article.getLanguage)
+ val term =
+ vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, article.getLanguage)
if (term != null)
result.setLanguage(term)
}
-
- val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s => OafMapperUtils.structuredProperty(s.getValue, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, dataInfo))(collection.breakOut)
+ val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s =>
+ OafMapperUtils.structuredProperty(
+ s.getValue,
+ SUBJ_CLASS,
+ SUBJ_CLASS,
+ ModelConstants.DNET_SUBJECT_TYPOLOGIES,
+ ModelConstants.DNET_SUBJECT_TYPOLOGIES,
+ dataInfo
+ )
+ )(collection.breakOut)
if (subjects != null)
result.setSubject(subjects.asJava)
-
val authors: List[Author] = article.getAuthors.asScala.zipWithIndex.map { case (a, index) =>
val author = new Author()
author.setName(a.getForeName)
@@ -216,15 +260,12 @@ object PubMedToOaf {
author
}(collection.breakOut)
-
if (authors != null && authors.nonEmpty)
result.setAuthor(authors.asJava)
result.setOriginalId(pidList.map(s => s.getValue).asJava)
-
result.setId(article.getPmid)
-
// END RESULT MAPPING
//--------------------------------------------------------------------------------------
val id = IdentifierFactory.createIdentifier(result)
@@ -234,5 +275,4 @@ object PubMedToOaf {
result
}
-
}
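As a usage reference for the mapping above, this is roughly how convert is driven in BioScholixTest: articles are deserialized from a JSON-lines dump and mapped one by one. A hedged sketch, assuming an already initialized VocabularyGroup named vocabularies (the dump path is the one used by the test):

import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.oaf.Oaf
import scala.io.Source

val mapper = new ObjectMapper()
val oafs: List[Oaf] = Source
  .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump"))
  .getLines()
  .map(line => mapper.readValue(line, classOf[PMArticle]))
  .map(article => PubMedToOaf.convert(article, vocabularies))
  // convert returns null when the article has no publication types or no title,
  // so callers are expected to drop those records.
  .filter(_ != null)
  .toList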
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkRetrieveDataciteDelta.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkRetrieveDataciteDelta.scala
index 45a6cfc89..2618d466a 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkRetrieveDataciteDelta.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkRetrieveDataciteDelta.scala
@@ -17,7 +17,8 @@ import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
import java.text.SimpleDateFormat
-class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:Logger) extends AbstractScalaApplication(propertyPath, args, log:Logger) {
+class SparkRetrieveDataciteDelta(propertyPath: String, args: Array[String], log: Logger)
+ extends AbstractScalaApplication(propertyPath, args, log: Logger) {
val ISO_DATE_PATTERN = "yyyy-MM-dd'T'HH:mm:ssZ"
val simpleFormatter = new SimpleDateFormat(ISO_DATE_PATTERN)
@@ -25,162 +26,190 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
val SCHOLIX_RESOURCE_PATH_NAME = "scholixResource"
val DATACITE_OAF_PATH_NAME = "dataciteOAFUpdate"
val PID_MAP_PATH_NAME = "pidMap"
- val RESOLVED_REL_PATH_NAME ="resolvedRelation"
+ val RESOLVED_REL_PATH_NAME = "resolvedRelation"
val SCHOLIX_PATH_NAME = "scholix"
+ def scholixResourcePath(workingPath: String) = s"$workingPath/$SCHOLIX_RESOURCE_PATH_NAME"
+ def dataciteOAFPath(workingPath: String) = s"$workingPath/$DATACITE_OAF_PATH_NAME"
+ def pidMapPath(workingPath: String) = s"$workingPath/$PID_MAP_PATH_NAME"
+ def resolvedRelationPath(workingPath: String) = s"$workingPath/$RESOLVED_REL_PATH_NAME"
+ def scholixPath(workingPath: String) = s"$workingPath/$SCHOLIX_PATH_NAME"
- def scholixResourcePath(workingPath:String) = s"$workingPath/$SCHOLIX_RESOURCE_PATH_NAME"
- def dataciteOAFPath(workingPath:String) = s"$workingPath/$DATACITE_OAF_PATH_NAME"
- def pidMapPath(workingPath:String) = s"$workingPath/$PID_MAP_PATH_NAME"
- def resolvedRelationPath(workingPath:String) = s"$workingPath/$RESOLVED_REL_PATH_NAME"
- def scholixPath(workingPath:String) = s"$workingPath/$SCHOLIX_PATH_NAME"
-
-
- /**
- * Utility to parse Date in ISO8601 to epochMillis
- * @param inputDate The String represents an input date in ISO8601
- * @return The relative epochMillis of parsed date
- */
- def ISO8601toEpochMillis(inputDate:String):Long = {
+ /** Utility to parse a date in ISO8601 format to epoch millis
+ * @param inputDate the String representing an input date in ISO8601
+ * @return the epoch millis of the parsed date
+ */
+ def ISO8601toEpochMillis(inputDate: String): Long = {
simpleFormatter.parse(inputDate).getTime
}
-
- /**
- * This method tries to retrieve the last collection date from all datacite
- * records in HDFS.
- * This method should be called before indexing scholexplorer to retrieve
- * the delta of Datacite record to download, since from the generation of
- * raw graph to the generation of Scholexplorer sometimes it takes 20 days
- * @param spark
- * @param entitiesPath
- * @return the last collection date from the current scholexplorer Graph of the datacite records
- */
- def retrieveLastCollectedFrom(spark:SparkSession, entitiesPath:String):Long = {
+ /** This method tries to retrieve the last collection date from all the Datacite
+ * records in HDFS.
+ * It should be called before indexing Scholexplorer, in order to retrieve
+ * the delta of Datacite records to download, since the gap between the generation of the
+ * raw graph and the generation of Scholexplorer can be as long as 20 days
+ * @param spark the spark session
+ * @param entitiesPath the path of the graph entities
+ * @return the last collection date of the Datacite records in the current Scholexplorer graph
+ */
+ def retrieveLastCollectedFrom(spark: SparkSession, entitiesPath: String): Long = {
log.info("Retrieve last entities collected From")
- implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf]
- implicit val resultEncoder:Encoder[Result] = Encoders.kryo[Result]
+ implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
+ implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
import spark.implicits._
- val entitiesDS = spark.read.load(s"$entitiesPath/*").as[Oaf].filter(o =>o.isInstanceOf[Result]).map(r => r.asInstanceOf[Result])
+ val entitiesDS = spark.read
+ .load(s"$entitiesPath/*")
+ .as[Oaf]
+ .filter(o => o.isInstanceOf[Result])
+ .map(r => r.asInstanceOf[Result])
- val date = entitiesDS.filter(r => r.getDateofcollection!= null).map(_.getDateofcollection).select(max("value")).first.getString(0)
+ val date = entitiesDS
+ .filter(r => r.getDateofcollection != null)
+ .map(_.getDateofcollection)
+ .select(max("value"))
+ .first
+ .getString(0)
ISO8601toEpochMillis(date) / 1000
}
-
- /**
- * The method of update Datacite relationships on Scholexplorer
- * needs some utilities data structures
- * One is the scholixResource DS that stores all the nodes in the Scholix Graph
- * in format ScholixResource
- * @param summaryPath the path of the summary in Scholix
- * @param workingPath the working path
- * @param spark the spark session
- */
- def generateScholixResource(summaryPath:String, workingPath: String, spark:SparkSession) :Unit = {
- implicit val summaryEncoder:Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
- implicit val scholixResourceEncoder:Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
+ /** Updating the Datacite relationships on Scholexplorer
+ * requires some utility data structures.
+ * One is the scholixResource Dataset, which stores all the nodes of the Scholix graph
+ * in ScholixResource format
+ * @param summaryPath the path of the summary in Scholix
+ * @param workingPath the working path
+ * @param spark the spark session
+ */
+ def generateScholixResource(
+ summaryPath: String,
+ workingPath: String,
+ spark: SparkSession
+ ): Unit = {
+ implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
+ implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
log.info("Convert All summary to ScholixResource")
- spark.read.load(summaryPath).as[ScholixSummary]
+ spark.read
+ .load(summaryPath)
+ .as[ScholixSummary]
.map(ScholixUtils.generateScholixResourceFromSummary)(scholixResourceEncoder)
- .filter(r => r.getIdentifier!= null && r.getIdentifier.size>0)
- .write.mode(SaveMode.Overwrite).save(s"${scholixResourcePath(workingPath)}_native")
+ .filter(r => r.getIdentifier != null && r.getIdentifier.size > 0)
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"${scholixResourcePath(workingPath)}_native")
}
- /**
- * This method convert the new Datacite Resource into Scholix Resource
- * Needed to fill the source and the type of Scholix Relationships
- * @param workingPath the Working Path
- * @param spark The spark Session
- */
- def addMissingScholixResource(workingPath:String, spark:SparkSession ) :Unit = {
- implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf]
- implicit val scholixResourceEncoder:Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
- implicit val resultEncoder:Encoder[Result] = Encoders.kryo[Result]
+ /** This method converts the new Datacite resources into Scholix resources,
+ * needed to fill the source and the type of the Scholix relationships
+ * @param workingPath the Working Path
+ * @param spark The spark Session
+ */
+ def addMissingScholixResource(workingPath: String, spark: SparkSession): Unit = {
+ implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
+ implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
+ implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
import spark.implicits._
- spark.read.load(dataciteOAFPath(workingPath)).as[Oaf]
+ spark.read
+ .load(dataciteOAFPath(workingPath))
+ .as[Oaf]
.filter(_.isInstanceOf[Result])
.map(_.asInstanceOf[Result])
.map(ScholixUtils.generateScholixResourceFromResult)
- .filter(r => r.getIdentifier!= null && r.getIdentifier.size>0)
- .write.mode(SaveMode.Overwrite).save(s"${scholixResourcePath(workingPath)}_update")
+ .filter(r => r.getIdentifier != null && r.getIdentifier.size > 0)
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"${scholixResourcePath(workingPath)}_update")
val update = spark.read.load(s"${scholixResourcePath(workingPath)}_update").as[ScholixResource]
val native = spark.read.load(s"${scholixResourcePath(workingPath)}_native").as[ScholixResource]
- val graph = update.union(native)
+ val graph = update
+ .union(native)
.groupByKey(_.getDnetIdentifier)
- .reduceGroups((a,b) => if (a!= null && a.getDnetIdentifier!= null) a else b)
+ .reduceGroups((a, b) => if (a != null && a.getDnetIdentifier != null) a else b)
.map(_._2)
graph.write.mode(SaveMode.Overwrite).save(s"${scholixResourcePath(workingPath)}_graph")
}
+ /** This method gets and transforms only the Datacite records with
+ * a timestamp greater than the given timestamp
+ * @param datacitePath the datacite input Path
+ * @param timestamp the timestamp
+ * @param workingPath the working path where to save the generated Dataset
+ * @param spark SparkSession
+ * @param vocabularies Vocabularies needed for transformation
+ */
- /**
- * This method get and Transform only datacite records with
- * timestamp greater than timestamp
- * @param datacitePath the datacite input Path
- * @param timestamp the timestamp
- * @param workingPath the working path where save the generated Dataset
- * @param spark SparkSession
- * @param vocabularies Vocabularies needed for transformation
- */
-
- def getDataciteUpdate(datacitePath:String, timestamp:Long, workingPath:String, spark:SparkSession,vocabularies: VocabularyGroup): Long = {
+ def getDataciteUpdate(
+ datacitePath: String,
+ timestamp: Long,
+ workingPath: String,
+ spark: SparkSession,
+ vocabularies: VocabularyGroup
+ ): Long = {
import spark.implicits._
val ds = spark.read.load(datacitePath).as[DataciteType]
- implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf]
- val total = ds.filter(_.timestamp>=timestamp).count()
- if (total >0) {
+ implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
+ val total = ds.filter(_.timestamp >= timestamp).count()
+ if (total > 0) {
ds.filter(_.timestamp >= timestamp)
- .flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks = true))
- .flatMap(i => fixRelations(i)).filter(i => i != null)
- .write.mode(SaveMode.Overwrite).save(dataciteOAFPath(workingPath))
+ .flatMap(d =>
+ DataciteToOAFTransformation
+ .generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks = true)
+ )
+ .flatMap(i => fixRelations(i))
+ .filter(i => i != null)
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(dataciteOAFPath(workingPath))
}
total
}
- /**
- * After added the new ScholixResource, we need to update the scholix Pid Map
- * to intersected with the new Datacite Relations
-
- * @param workingPath The working Path starting from save the new Map
- * @param spark the spark session
- */
- def generatePidMap(workingPath:String, spark:SparkSession ) :Unit = {
- implicit val scholixResourceEncoder:Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
+ /** After adding the new ScholixResources, we need to update the Scholix PID map
+ * so it can be intersected with the new Datacite relations
+ *
+ * @param workingPath the working path under which the new map is saved
+ * @param spark the spark session
+ */
+ def generatePidMap(workingPath: String, spark: SparkSession): Unit = {
+ implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
import spark.implicits._
- spark.read.load(s"${scholixResourcePath(workingPath)}_graph").as[ScholixResource]
- .flatMap(r=>
- r.getIdentifier.asScala
- .map(i =>DHPUtils.generateUnresolvedIdentifier(i.getIdentifier, i.getSchema))
- .map(t =>(t, r.getDnetIdentifier))
- )(Encoders.tuple(Encoders.STRING, Encoders.STRING))
+ spark.read
+ .load(s"${scholixResourcePath(workingPath)}_graph")
+ .as[ScholixResource]
+ .flatMap(r =>
+ r.getIdentifier.asScala
+ .map(i => DHPUtils.generateUnresolvedIdentifier(i.getIdentifier, i.getSchema))
+ .map(t => (t, r.getDnetIdentifier))
+ )(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.groupByKey(_._1)
- .reduceGroups((a,b) => if (a!= null && a._2!= null) a else b)
+ .reduceGroups((a, b) => if (a != null && a._2 != null) a else b)
.map(_._2)(Encoders.tuple(Encoders.STRING, Encoders.STRING))
- .write.mode(SaveMode.Overwrite).save(pidMapPath(workingPath))
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(pidMapPath(workingPath))
}
- /**
- * This method resolve the datacite relation and filter the resolved
- * relation
- * @param workingPath the working path
- * @param spark the spark session
- */
+ /** This method resolves the Datacite relations and keeps only the resolved
+ * relations
+ * @param workingPath the working path
+ * @param spark the spark session
+ */
- def resolveUpdateRelation(workingPath:String, spark:SparkSession) :Unit = {
- implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf]
- implicit val relationEncoder:Encoder[Relation] = Encoders.kryo[Relation]
+ def resolveUpdateRelation(workingPath: String, spark: SparkSession): Unit = {
+ implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
+ implicit val relationEncoder: Encoder[Relation] = Encoders.kryo[Relation]
import spark.implicits._
- val pidMap = spark.read.load(pidMapPath(workingPath)).as[(String,String)]
+ val pidMap = spark.read.load(pidMapPath(workingPath)).as[(String, String)]
- val unresolvedRelations:Dataset[(String,Relation)] = spark.read.load(dataciteOAFPath(workingPath)).as[Oaf]
+ val unresolvedRelations: Dataset[(String, Relation)] = spark.read
+ .load(dataciteOAFPath(workingPath))
+ .as[Oaf]
.filter(_.isInstanceOf[Relation])
.map(_.asInstanceOf[Relation])
.map { r =>
@@ -193,7 +222,7 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
unresolvedRelations
.joinWith(pidMap, unresolvedRelations("_1").equalTo(pidMap("_1")))
.map(t => {
- val r =t._1._2
+ val r = t._1._2
val resolvedIdentifier = t._2._2
if (r.getSource.startsWith("unresolved"))
r.setSource(resolvedIdentifier)
@@ -201,56 +230,62 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
r.setTarget(resolvedIdentifier)
r
})(relationEncoder)
- .filter(r => !(r.getSource.startsWith("unresolved") || r.getTarget.startsWith("unresolved") ))
- .write.mode(SaveMode.Overwrite)
+ .filter(r => !(r.getSource.startsWith("unresolved") || r.getTarget.startsWith("unresolved")))
+ .write
+ .mode(SaveMode.Overwrite)
.save(resolvedRelationPath(workingPath))
}
+ /** This method generates Scholix records starting from the resolved relations
+ *
+ * @param workingPath the working path
+ * @param spark the spark session
+ */
+ def generateScholixUpdate(workingPath: String, spark: SparkSession): Unit = {
+ implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
+ implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
+ implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
+ implicit val relationEncoder: Encoder[Relation] = Encoders.kryo[Relation]
+ implicit val intermediateEncoder: Encoder[(String, Scholix)] =
+ Encoders.tuple(Encoders.STRING, scholixEncoder)
+ val relations: Dataset[(String, Relation)] = spark.read
+ .load(resolvedRelationPath(workingPath))
+ .as[Relation]
+ .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relationEncoder))
- /**
- * This method generate scholix starting from resolved relation
- *
- *
- * @param workingPath
- * @param spark
- */
- def generateScholixUpdate(workingPath:String, spark:SparkSession) :Unit = {
- implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf]
- implicit val scholixEncoder:Encoder[Scholix] = Encoders.kryo[Scholix]
- implicit val scholixResourceEncoder:Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
- implicit val relationEncoder:Encoder[Relation] = Encoders.kryo[Relation]
- implicit val intermediateEncoder :Encoder[(String,Scholix)] = Encoders.tuple(Encoders.STRING, scholixEncoder)
-
-
- val relations:Dataset[(String, Relation)] = spark.read.load(resolvedRelationPath(workingPath)).as[Relation].map(r =>(r.getSource,r))(Encoders.tuple(Encoders.STRING, relationEncoder))
-
- val id_summary:Dataset[(String,ScholixResource)] = spark.read.load(s"${scholixResourcePath(workingPath)}_graph").as[ScholixResource].map(r => (r.getDnetIdentifier,r))(Encoders.tuple(Encoders.STRING, scholixResourceEncoder))
+ val id_summary: Dataset[(String, ScholixResource)] = spark.read
+ .load(s"${scholixResourcePath(workingPath)}_graph")
+ .as[ScholixResource]
+ .map(r => (r.getDnetIdentifier, r))(Encoders.tuple(Encoders.STRING, scholixResourceEncoder))
id_summary.cache()
- relations.joinWith(id_summary, relations("_1").equalTo(id_summary("_1")),"inner")
- .map(t => (t._1._2.getTarget,ScholixUtils.scholixFromSource(t._1._2, t._2._2)))
- .write.mode(SaveMode.Overwrite).save(s"$workingPath/scholix_one_verse")
+ relations
+ .joinWith(id_summary, relations("_1").equalTo(id_summary("_1")), "inner")
+ .map(t => (t._1._2.getTarget, ScholixUtils.scholixFromSource(t._1._2, t._2._2)))
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingPath/scholix_one_verse")
- val source_scholix:Dataset[(String, Scholix)] =spark.read.load(s"$workingPath/scholix_one_verse").as[(String,Scholix)]
+ val source_scholix: Dataset[(String, Scholix)] =
+ spark.read.load(s"$workingPath/scholix_one_verse").as[(String, Scholix)]
- source_scholix.joinWith(id_summary, source_scholix("_1").equalTo(id_summary("_1")),"inner")
+ source_scholix
+ .joinWith(id_summary, source_scholix("_1").equalTo(id_summary("_1")), "inner")
.map(t => {
- val target:ScholixResource =t._2._2
- val scholix:Scholix = t._1._2
- ScholixUtils.generateCompleteScholix(scholix,target)
- })(scholixEncoder).write.mode(SaveMode.Overwrite).save(s"$workingPath/scholix")
+ val target: ScholixResource = t._2._2
+ val scholix: Scholix = t._1._2
+ ScholixUtils.generateCompleteScholix(scholix, target)
+ })(scholixEncoder)
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingPath/scholix")
}
-
-
-
-
- /**
- * Here all the spark applications runs this method
- * where the whole logic of the spark node is defined
- */
+ /** Here all the Spark applications run this method,
+ * where the whole logic of the Spark node is defined
+ */
override def run(): Unit = {
val sourcePath = parser.get("sourcePath")
log.info(s"SourcePath is '$sourcePath'")
@@ -258,7 +293,7 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
val datacitePath = parser.get("datacitePath")
log.info(s"DatacitePath is '$datacitePath'")
- val workingPath = parser.get("workingSupportPath")
+ val workingPath = parser.get("workingSupportPath")
log.info(s"workingPath is '$workingPath'")
val isLookupUrl: String = parser.get("isLookupUrl")
@@ -268,38 +303,43 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
require(vocabularies != null)
-
- val updateDS:Boolean = "true".equalsIgnoreCase(parser.get("updateDS"))
+ val updateDS: Boolean = "true".equalsIgnoreCase(parser.get("updateDS"))
log.info(s"updateDS is '$updateDS'")
var lastCollectionDate = 0L
if (updateDS) {
generateScholixResource(s"$sourcePath/provision/summaries", workingPath, spark)
log.info("Retrieve last entities collected From starting from scholix Graph")
- lastCollectionDate = retrieveLastCollectedFrom(spark, s"$sourcePath/entities")
- }
- else {
+ lastCollectionDate = retrieveLastCollectedFrom(spark, s"$sourcePath/entities")
+ } else {
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
fs.delete(new Path(s"${scholixResourcePath(workingPath)}_native"), true)
- fs.rename(new Path(s"${scholixResourcePath(workingPath)}_graph"), new Path(s"${scholixResourcePath(workingPath)}_native"))
- lastCollectionDate = retrieveLastCollectedFrom(spark, dataciteOAFPath(workingPath))
+ fs.rename(
+ new Path(s"${scholixResourcePath(workingPath)}_graph"),
+ new Path(s"${scholixResourcePath(workingPath)}_native")
+ )
+ lastCollectionDate = retrieveLastCollectedFrom(spark, dataciteOAFPath(workingPath))
}
- val numRecords = getDataciteUpdate(datacitePath, lastCollectionDate, workingPath, spark, vocabularies)
- if (numRecords>0) {
- addMissingScholixResource(workingPath,spark)
+ val numRecords =
+ getDataciteUpdate(datacitePath, lastCollectionDate, workingPath, spark, vocabularies)
+ if (numRecords > 0) {
+ addMissingScholixResource(workingPath, spark)
generatePidMap(workingPath, spark)
- resolveUpdateRelation(workingPath,spark)
+ resolveUpdateRelation(workingPath, spark)
generateScholixUpdate(workingPath, spark)
}
}
}
-
object SparkRetrieveDataciteDelta {
val log: Logger = LoggerFactory.getLogger(SparkRetrieveDataciteDelta.getClass)
def main(args: Array[String]): Unit = {
- new SparkRetrieveDataciteDelta("/eu/dnetlib/dhp/sx/graph/retrieve_datacite_delta_params.json", args, log).initialize().run()
+ new SparkRetrieveDataciteDelta(
+ "/eu/dnetlib/dhp/sx/graph/retrieve_datacite_delta_params.json",
+ args,
+ log
+ ).initialize().run()
}
}
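Two small points of reference for the class above, kept as a hedged sketch rather than part of the change: the path helpers all hang off the single workingSupportPath, and ISO8601toEpochMillis feeds retrieveLastCollectedFrom, which converts the result to seconds. The concrete values below are illustrative only:

// Illustrative working path; the real value comes from the workingSupportPath argument.
val workingPath = "/data/scholix/workingSupport"
// scholixResourcePath(workingPath)  -> /data/scholix/workingSupport/scholixResource (suffixed _native/_update/_graph by the job)
// dataciteOAFPath(workingPath)      -> /data/scholix/workingSupport/dataciteOAFUpdate
// pidMapPath(workingPath)           -> /data/scholix/workingSupport/pidMap
// resolvedRelationPath(workingPath) -> /data/scholix/workingSupport/resolvedRelation
// scholixPath(workingPath)          -> /data/scholix/workingSupport/scholix

// ISO8601toEpochMillis parses "yyyy-MM-dd'T'HH:mm:ssZ" dates; retrieveLastCollectedFrom
// divides the result by 1000, so the timestamp compared against Datacite records is in seconds.
val millis: Long = simpleFormatter.parse("2021-07-14T11:52:54+0000").getTime
val seconds: Long = millis / 1000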
diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala
index 5bb6ba67d..ca1dbc665 100644
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala
@@ -1,6 +1,5 @@
package eu.dnetlib.dhp.datacite
-
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.oaf.Oaf
@@ -20,95 +19,90 @@ import java.util.Locale
import scala.io.Source
@ExtendWith(Array(classOf[MockitoExtension]))
-class DataciteToOAFTest extends AbstractVocabularyTest{
+class DataciteToOAFTest extends AbstractVocabularyTest {
- private var workingDir:Path = null
+ private var workingDir: Path = null
val log: Logger = LoggerFactory.getLogger(getClass)
@BeforeEach
- def setUp() :Unit = {
+ def setUp(): Unit = {
- workingDir= Files.createTempDirectory(getClass.getSimpleName)
+ workingDir = Files.createTempDirectory(getClass.getSimpleName)
super.setUpVocabulary()
}
@AfterEach
- def tearDown() :Unit = {
+ def tearDown(): Unit = {
FileUtils.deleteDirectory(workingDir.toFile)
}
-
@Test
- def testDateMapping:Unit = {
+ def testDateMapping: Unit = {
val inputDate = "2021-07-14T11:52:54+0000"
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
val dt = ISO8601FORMAT.parse(inputDate)
println(dt.getTime)
-
}
-
@Test
def testConvert(): Unit = {
-
val path = getClass.getResource("/eu/dnetlib/dhp/actionmanager/datacite/dataset").getPath
val conf = new SparkConf()
- val spark:SparkSession = SparkSession.builder().config(conf)
+ val spark: SparkSession = SparkSession
+ .builder()
+ .config(conf)
.appName(getClass.getSimpleName)
.master("local[*]")
.getOrCreate()
-
-
- implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf]
+ implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
val instance = new GenerateDataciteDatasetSpark(null, null, log)
val targetPath = s"$workingDir/result"
- instance.generateDataciteDataset(path, exportLinks = true, vocabularies,targetPath, spark)
+ instance.generateDataciteDataset(path, exportLinks = true, vocabularies, targetPath, spark)
import spark.implicits._
- val nativeSize =spark.read.load(path).count()
-
+ val nativeSize = spark.read.load(path).count()
assertEquals(100, nativeSize)
- val result:Dataset[Oaf] = spark.read.load(targetPath).as[Oaf]
+ val result: Dataset[Oaf] = spark.read.load(targetPath).as[Oaf]
-
- result.map(s => s.getClass.getSimpleName).groupBy(col("value").alias("class")).agg(count("value").alias("Total")).show(false)
+ result
+ .map(s => s.getClass.getSimpleName)
+ .groupBy(col("value").alias("class"))
+ .agg(count("value").alias("Total"))
+ .show(false)
val t = spark.read.load(targetPath).count()
- assertTrue(t >0)
-
+ assertTrue(t > 0)
spark.stop()
-
-
-
}
-
@Test
- def testMapping() :Unit = {
- val record =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/record.json")).mkString
+ def testMapping(): Unit = {
+ val record = Source
+ .fromInputStream(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/record.json")
+ )
+ .mkString
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
- val res:List[Oaf] =DataciteToOAFTransformation.generateOAF(record, 0L,0L, vocabularies, true )
+ val res: List[Oaf] = DataciteToOAFTransformation.generateOAF(record, 0L, 0L, vocabularies, true)
res.foreach(r => {
- println (mapper.writeValueAsString(r))
+ println(mapper.writeValueAsString(r))
println("----------------------------")
})
-
-
}
-}
\ No newline at end of file
+}
diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
index 893a6e628..ea742a04a 100644
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
@@ -20,14 +20,13 @@ import scala.io.Source
import scala.xml.pull.XMLEventReader
@ExtendWith(Array(classOf[MockitoExtension]))
-class BioScholixTest extends AbstractVocabularyTest{
-
+class BioScholixTest extends AbstractVocabularyTest {
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
- mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES,false)
+ mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
@BeforeEach
- def setUp() :Unit = {
+ def setUp(): Unit = {
super.setUpVocabulary()
}
@@ -38,52 +37,54 @@ class BioScholixTest extends AbstractVocabularyTest{
}
object GzFileIterator {
+
def apply(is: InputStream, encoding: String) = {
new BufferedReaderIterator(
- new BufferedReader(
- new InputStreamReader(
- new GZIPInputStream(
- is), encoding)))
+ new BufferedReader(new InputStreamReader(new GZIPInputStream(is), encoding))
+ )
}
}
-
-
-
@Test
def testEBIData() = {
- val inputXML = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")).mkString
+ val inputXML = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
+ .mkString
val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
- new PMParser(xml).foreach(s =>println(mapper.writeValueAsString(s)))
+ new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
}
-
@Test
def testPubmedToOaf(): Unit = {
assertNotNull(vocabularies)
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
- val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump")).mkString
- val r:List[Oaf] = records.lines.toList.map(s=>mapper.readValue(s, classOf[PMArticle])).map(a => PubMedToOaf.convert(a, vocabularies))
+ val records: String = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump"))
+ .mkString
+ val r: List[Oaf] = records.lines.toList
+ .map(s => mapper.readValue(s, classOf[PMArticle]))
+ .map(a => PubMedToOaf.convert(a, vocabularies))
assertEquals(10, r.size)
- assertTrue(r.map(p => p.asInstanceOf[Result]).flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid)).exists(p => "0037".equalsIgnoreCase(p)))
+ assertTrue(
+ r.map(p => p.asInstanceOf[Result])
+ .flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid))
+ .exists(p => "0037".equalsIgnoreCase(p))
+ )
println(mapper.writeValueAsString(r.head))
-
-
}
-
@Test
- def testPDBToOAF():Unit = {
+ def testPDBToOAF(): Unit = {
assertNotNull(vocabularies)
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
- val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pdb_dump")).mkString
+ val records: String = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pdb_dump"))
+ .mkString
records.lines.foreach(s => assertTrue(s.nonEmpty))
- val result:List[Oaf]= records.lines.toList.flatMap(o => BioDBToOAF.pdbTOOaf(o))
-
-
+ val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.pdbTOOaf(o))
assertTrue(result.nonEmpty)
result.foreach(r => assertNotNull(r))
@@ -93,19 +94,18 @@ class BioScholixTest extends AbstractVocabularyTest{
}
-
@Test
- def testUNIprotToOAF():Unit = {
+ def testUNIprotToOAF(): Unit = {
assertNotNull(vocabularies)
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
- val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump")).mkString
+ val records: String = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump"))
+ .mkString
records.lines.foreach(s => assertTrue(s.nonEmpty))
- val result:List[Oaf]= records.lines.toList.flatMap(o => BioDBToOAF.uniprotToOAF(o))
-
-
+ val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.uniprotToOAF(o))
assertTrue(result.nonEmpty)
result.foreach(r => assertNotNull(r))
@@ -115,35 +115,42 @@ class BioScholixTest extends AbstractVocabularyTest{
}
- case class EBILinks(relType:String, date:String, title:String, pmid:String, targetPid:String, targetPidType:String) {}
+ case class EBILinks(
+ relType: String,
+ date: String,
+ title: String,
+ pmid: String,
+ targetPid: String,
+ targetPidType: String
+ ) {}
- def parse_ebi_links(input:String):List[EBILinks] ={
+ def parse_ebi_links(input: String): List[EBILinks] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
- val pmid = (json \ "publication" \"pmid").extract[String]
+ val pmid = (json \ "publication" \ "pmid").extract[String]
for {
- JObject(link) <- json \\ "Link"
- JField("Target",JObject(target)) <- link
- JField("RelationshipType",JObject(relType)) <- link
- JField("Name", JString(relation)) <- relType
- JField("PublicationDate",JString(publicationDate)) <- link
- JField("Title", JString(title)) <- target
- JField("Identifier",JObject(identifier)) <- target
- JField("IDScheme", JString(idScheme)) <- identifier
- JField("ID", JString(id)) <- identifier
+ JObject(link) <- json \\ "Link"
+ JField("Target", JObject(target)) <- link
+ JField("RelationshipType", JObject(relType)) <- link
+ JField("Name", JString(relation)) <- relType
+ JField("PublicationDate", JString(publicationDate)) <- link
+ JField("Title", JString(title)) <- target
+ JField("Identifier", JObject(identifier)) <- target
+ JField("IDScheme", JString(idScheme)) <- identifier
+ JField("ID", JString(id)) <- identifier
} yield EBILinks(relation, publicationDate, title, pmid, id, idScheme)
}
-
@Test
- def testCrossrefLinksToOAF():Unit = {
+ def testCrossrefLinksToOAF(): Unit = {
- val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/crossref_links")).mkString
+ val records: String = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/crossref_links"))
+ .mkString
records.lines.foreach(s => assertTrue(s.nonEmpty))
-
- val result:List[Oaf] =records.lines.map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList
+ val result: List[Oaf] = records.lines.map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList
assertNotNull(result)
assertTrue(result.nonEmpty)
@@ -153,36 +160,41 @@ class BioScholixTest extends AbstractVocabularyTest{
}
@Test
- def testEBILinksToOAF():Unit = {
- val iterator = GzFileIterator(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/ebi_links.gz"), "UTF-8")
+ def testEBILinksToOAF(): Unit = {
+ val iterator = GzFileIterator(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/ebi_links.gz"),
+ "UTF-8"
+ )
val data = iterator.next()
- val res = BioDBToOAF.parse_ebi_links(BioDBToOAF.extractEBILinksFromDump(data).links).filter(BioDBToOAF.EBITargetLinksFilter).flatMap(BioDBToOAF.convertEBILinksToOaf)
+ val res = BioDBToOAF
+ .parse_ebi_links(BioDBToOAF.extractEBILinksFromDump(data).links)
+ .filter(BioDBToOAF.EBITargetLinksFilter)
+ .flatMap(BioDBToOAF.convertEBILinksToOaf)
print(res.length)
-
println(mapper.writeValueAsString(res.head))
}
-
-
-
@Test
- def scholixResolvedToOAF():Unit ={
+ def scholixResolvedToOAF(): Unit = {
- val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved")).mkString
+ val records: String = Source
+ .fromInputStream(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved")
+ )
+ .mkString
records.lines.foreach(s => assertTrue(s.nonEmpty))
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
- val l:List[ScholixResolved] = records.lines.map{input =>
+ val l: List[ScholixResolved] = records.lines.map { input =>
lazy val json = parse(input)
json.extract[ScholixResolved]
}.toList
-
- val result:List[Oaf] = l.map(s => BioDBToOAF.scholixResolvedToOAF(s))
+ val result: List[Oaf] = l.map(s => BioDBToOAF.scholixResolvedToOAF(s))
assertTrue(result.nonEmpty)
}
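The json4s for-comprehension in parse_ebi_links above matches objects by field name anywhere under the document ("Link", "Target", "RelationshipType", ...) plus a top-level publication.pmid. A hedged sketch of the shape it expects, runnable inside BioScholixTest; the identifiers and nesting here are invented for illustration and real EBI dumps may differ:

val sample =
  """{"publication": {"pmid": "16761619"},
    | "LinkList": [
    |   {"Link": {
    |      "Target": {"Title": "Linked dataset",
    |                 "Identifier": {"IDScheme": "doi", "ID": "10.1234/illustrative"}},
    |      "RelationshipType": {"Name": "IsReferencedBy"},
    |      "PublicationDate": "2020-01-01"
    |   }}
    | ]}""".stripMargin

// Each yielded EBILinks carries (relType, date, title, pmid, targetPid, targetPidType).
val links: List[EBILinks] = parse_ebi_links(sample)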
diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
index 3822f40b5..20471973a 100644
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
@@ -16,10 +16,22 @@ import java.time.LocalDate
import java.time.format.DateTimeFormatter
import scala.collection.JavaConverters._
+case class HostedByItemType(
+ id: String,
+ officialname: String,
+ issn: String,
+ eissn: String,
+ lissn: String,
+ openAccess: Boolean
+) {}
-case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
-
-case class DoiBoostAffiliation(PaperId:Long, AffiliationId:Long, GridId:Option[String], OfficialPage:Option[String], DisplayName:Option[String]){}
+case class DoiBoostAffiliation(
+ PaperId: Long,
+ AffiliationId: Long,
+ GridId: Option[String],
+ OfficialPage: Option[String],
+ DisplayName: Option[String]
+) {}
object DoiBoostMappingUtil {
@@ -43,9 +55,19 @@ object DoiBoostMappingUtil {
val DOI_PREFIX_REGEX = "(^10\\.|\\/10.)"
val DOI_PREFIX = "10."
- val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")
+ val invalidName = List(
+ ",",
+ "none none",
+ "none, none",
+ "none &na;",
+ "(:null)",
+ "test test test",
+ "test test",
+ "test",
+ "&na; &na;"
+ )
- def toActionSet(item:Oaf) :(String, String) = {
+ def toActionSet(item: Oaf): (String, String) = {
val mapper = new ObjectMapper()
item match {
@@ -75,59 +97,56 @@ object DoiBoostMappingUtil {
}
-
- def toHostedByItem(input:String): (String, HostedByItemType) = {
+ def toHostedByItem(input: String): (String, HostedByItemType) = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
- val c :Map[String,HostedByItemType] = json.extract[Map[String, HostedByItemType]]
+ val c: Map[String, HostedByItemType] = json.extract[Map[String, HostedByItemType]]
(c.keys.head, c.values.head)
}
-
- def toISSNPair(publication: Publication) : (String, Publication) = {
+ def toISSNPair(publication: Publication): (String, Publication) = {
val issn = if (publication.getJournal == null) null else publication.getJournal.getIssnPrinted
- val eissn =if (publication.getJournal == null) null else publication.getJournal.getIssnOnline
- val lissn =if (publication.getJournal == null) null else publication.getJournal.getIssnLinking
+ val eissn = if (publication.getJournal == null) null else publication.getJournal.getIssnOnline
+ val lissn = if (publication.getJournal == null) null else publication.getJournal.getIssnLinking
- if (issn!= null && issn.nonEmpty)
+ if (issn != null && issn.nonEmpty)
(issn, publication)
- else if(eissn!= null && eissn.nonEmpty)
+ else if (eissn != null && eissn.nonEmpty)
(eissn, publication)
- else if(lissn!= null && lissn.nonEmpty)
+ else if (lissn != null && lissn.nonEmpty)
(lissn, publication)
else
(publication.getId, publication)
}
-
-
-
- def generateGridAffiliationId(gridId:String) :String = {
+ def generateGridAffiliationId(gridId: String): String = {
s"20|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}"
}
-
- def fixResult(result: Dataset) :Dataset = {
+ def fixResult(result: Dataset): Dataset = {
val instanceType = extractInstance(result)
if (instanceType.isDefined) {
result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
}
- result.getInstance().asScala.foreach(i => {
- i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
- })
+ result
+ .getInstance()
+ .asScala
+ .foreach(i => {
+ i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
+ })
result
}
-
- def decideAccessRight(lic : Field[String], date:String) : AccessRight = {
- if(lic == null){
+ def decideAccessRight(lic: Field[String], date: String): AccessRight = {
+ if (lic == null) {
//Default value Unknown
return getUnknownQualifier()
}
- val license : String = lic.getValue
+ val license: String = lic.getValue
//CC licenses
- if(license.startsWith("cc") ||
+ if (
+ license.startsWith("cc") ||
license.startsWith("http://creativecommons.org/licenses") ||
license.startsWith("https://creativecommons.org/licenses") ||
@@ -137,40 +156,44 @@ object DoiBoostMappingUtil {
license.equals("http://pubs.acs.org/page/policy/authorchoice_ccbyncnd_termsofuse.html") ||
//APA (considered OPEN also by Unpaywall)
- license.equals("http://www.apa.org/pubs/journals/resources/open-access.aspx")){
+ license.equals("http://www.apa.org/pubs/journals/resources/open-access.aspx")
+ ) {
- val oaq : AccessRight = getOpenAccessQualifier()
+ val oaq: AccessRight = getOpenAccessQualifier()
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
return oaq
}
//OUP (BUT ONLY AFTER 12 MONTHS FROM THE PUBLICATION DATE, OTHERWISE THEY ARE EMBARGOED)
- if(license.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")){
+ if (
+ license.equals(
+ "https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"
+ )
+ ) {
val now = java.time.LocalDate.now
- try{
+ try {
val pub_date = LocalDate.parse(date, DateTimeFormatter.ofPattern("yyyy-MM-dd"))
- if (((now.toEpochDay - pub_date.toEpochDay)/365.0) > 1){
- val oaq : AccessRight = getOpenAccessQualifier()
+ if (((now.toEpochDay - pub_date.toEpochDay) / 365.0) > 1) {
+ val oaq: AccessRight = getOpenAccessQualifier()
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
return oaq
- }
- else{
+ } else {
return getEmbargoedAccessQualifier()
}
- }catch {
+ } catch {
case e: Exception => {
- try{
- val pub_date = LocalDate.parse(date, DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'"))
- if (((now.toEpochDay - pub_date.toEpochDay)/365.0) > 1){
- val oaq : AccessRight = getOpenAccessQualifier()
- oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
- return oaq
- }
- else{
- return getEmbargoedAccessQualifier()
- }
- }catch{
+ try {
+ val pub_date =
+ LocalDate.parse(date, DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'"))
+ if (((now.toEpochDay - pub_date.toEpochDay) / 365.0) > 1) {
+ val oaq: AccessRight = getOpenAccessQualifier()
+ oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
+ return oaq
+ } else {
+ return getEmbargoedAccessQualifier()
+ }
+ } catch {
case ex: Exception => return getClosedAccessQualifier()
}
}
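To make the branching above easier to follow, a hedged summary sketch of decideAccessRight (the licence strings and dates are illustrative; getOpenAccessQualifier and the other helpers are defined just below):

// null licence -> UNKNOWN access right (the default branch above).
val unknown = decideAccessRight(null, "2020-01-01")

// A recognised CC/ACS/APA licence -> OPEN with the hybrid OpenAccessRoute.
val open = decideAccessRight(OafMapperUtils.field("cc-by-4.0", null), "2020-01-01")

// The OUP standard-publication-model URL -> OPEN (hybrid) only when the parsed
// publication date is more than 12 months old, otherwise EMBARGO; dates that
// cannot be parsed in either supported pattern fall through to CLOSED.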
@@ -183,64 +206,91 @@ object DoiBoostMappingUtil {
}
+ def getOpenAccessQualifier(): AccessRight = {
-
- def getOpenAccessQualifier():AccessRight = {
-
- OafMapperUtils.accessRight(ModelConstants.ACCESS_RIGHT_OPEN,"Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
+ OafMapperUtils.accessRight(
+ ModelConstants.ACCESS_RIGHT_OPEN,
+ "Open Access",
+ ModelConstants.DNET_ACCESS_MODES,
+ ModelConstants.DNET_ACCESS_MODES
+ )
}
- def getRestrictedQualifier():AccessRight = {
- OafMapperUtils.accessRight( "RESTRICTED","Restricted",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
+ def getRestrictedQualifier(): AccessRight = {
+ OafMapperUtils.accessRight(
+ "RESTRICTED",
+ "Restricted",
+ ModelConstants.DNET_ACCESS_MODES,
+ ModelConstants.DNET_ACCESS_MODES
+ )
}
-
- def getUnknownQualifier():AccessRight = {
- OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
+ def getUnknownQualifier(): AccessRight = {
+ OafMapperUtils.accessRight(
+ ModelConstants.UNKNOWN,
+ ModelConstants.NOT_AVAILABLE,
+ ModelConstants.DNET_ACCESS_MODES,
+ ModelConstants.DNET_ACCESS_MODES
+ )
}
-
- def getEmbargoedAccessQualifier():AccessRight = {
- OafMapperUtils.accessRight("EMBARGO","Embargo",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
+ def getEmbargoedAccessQualifier(): AccessRight = {
+ OafMapperUtils.accessRight(
+ "EMBARGO",
+ "Embargo",
+ ModelConstants.DNET_ACCESS_MODES,
+ ModelConstants.DNET_ACCESS_MODES
+ )
}
- def getClosedAccessQualifier():AccessRight = {
- OafMapperUtils.accessRight("CLOSED","Closed Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
+ def getClosedAccessQualifier(): AccessRight = {
+ OafMapperUtils.accessRight(
+ "CLOSED",
+ "Closed Access",
+ ModelConstants.DNET_ACCESS_MODES,
+ ModelConstants.DNET_ACCESS_MODES
+ )
}
-
- def extractInstance(r:Result):Option[Instance] = {
- r.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
+ def extractInstance(r: Result): Option[Instance] = {
+ r.getInstance()
+ .asScala
+ .find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
}
- def fixPublication(input:((String,Publication), (String,HostedByItemType))): Publication = {
+ def fixPublication(input: ((String, Publication), (String, HostedByItemType))): Publication = {
val publication = input._1._2
val item = if (input._2 != null) input._2._2 else null
- val instanceType:Option[Instance] = extractInstance(publication)
+ val instanceType: Option[Instance] = extractInstance(publication)
if (instanceType.isDefined) {
- publication.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
+ publication
+ .getInstance()
+ .asScala
+ .foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
}
- publication.getInstance().asScala.foreach(i => {
- var hb = new KeyValue
- if (item != null) {
- hb.setValue(item.officialname)
- hb.setKey(item.id)
- if (item.openAccess) {
- i.setAccessright(getOpenAccessQualifier())
- i.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
- }
+ publication
+ .getInstance()
+ .asScala
+ .foreach(i => {
+ var hb = new KeyValue
+ if (item != null) {
+ hb.setValue(item.officialname)
+ hb.setKey(item.id)
+ if (item.openAccess) {
+ i.setAccessright(getOpenAccessQualifier())
+ i.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
+ }
- }
- else {
- hb = ModelConstants.UNKNOWN_REPOSITORY
- }
- i.setHostedby(hb)
- })
+ } else {
+ hb = ModelConstants.UNKNOWN_REPOSITORY
+ }
+ i.setHostedby(hb)
+ })
publication.setBestaccessright(OafMapperUtils.createBestAccessRights(publication.getInstance()))
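
Note: fixPublication above does two things per instance: it propagates the instance type from the first typed instance, and it assigns hostedby from the matched HostedByItemType (marking the instance gold open access when that entry is flagged open), falling back to the unknown-repository placeholder otherwise. A plain-Scala sketch of just the hostedby decision; HostedBy, InstanceView and the placeholder key/value are illustrative stand-ins, not the real OAF classes or ModelConstants values:

object HostedBySketch {

  case class HostedBy(id: String, officialName: String, openAccess: Boolean)
  case class InstanceView(hostedByKey: String, hostedByValue: String, goldOA: Boolean)

  // Stand-in for ModelConstants.UNKNOWN_REPOSITORY (real key/value not reproduced here).
  val UnknownRepository: (String, String) = ("unknown::repository", "Unknown Repository")

  def assignHostedBy(matched: Option[HostedBy]): InstanceView =
    matched match {
      case Some(hb) => InstanceView(hb.id, hb.officialName, goldOA = hb.openAccess)
      case None     => InstanceView(UnknownRepository._1, UnknownRepository._2, goldOA = false)
    }
}
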
@@ -270,17 +320,22 @@ object DoiBoostMappingUtil {
if (publication.getTitle == null || publication.getTitle.size == 0)
return false
-
- val s = publication.getTitle.asScala.count(p => p.getValue != null
- && p.getValue.nonEmpty && !p.getValue.equalsIgnoreCase("[NO TITLE AVAILABLE]"))
+ val s = publication.getTitle.asScala.count(p =>
+ p.getValue != null
+ && p.getValue.nonEmpty && !p.getValue.equalsIgnoreCase("[NO TITLE AVAILABLE]")
+ )
if (s == 0)
return false
// fixes #4360 (test publisher)
- val publisher = if (publication.getPublisher != null) publication.getPublisher.getValue else null
+ val publisher =
+ if (publication.getPublisher != null) publication.getPublisher.getValue else null
- if (publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher.equalsIgnoreCase("CrossRef Test Account"))) {
+ if (
+ publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher
+ .equalsIgnoreCase("CrossRef Test Account"))
+ ) {
return false;
}
@@ -288,18 +343,12 @@ object DoiBoostMappingUtil {
if (publication.getAuthor == null || publication.getAuthor.size() == 0)
return false
-
//filter invalid author
val authors = publication.getAuthor.asScala.map(s => {
if (s.getFullname.nonEmpty) {
s.getFullname
- }
- else
- s"${
- s.getName
- } ${
- s.getSurname
- }"
+ } else
+ s"${s.getName} ${s.getSurname}"
})
val c = authors.count(isValidAuthorName)
@@ -307,13 +356,16 @@ object DoiBoostMappingUtil {
return false
// fixes #4368
- if (authors.count(s => s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase(publication.getPublisher.getValue))
+ if (
+ authors.count(s => s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase(
+ publication.getPublisher.getValue
+ )
+ )
return false
true
}
-
def isValidAuthorName(fullName: String): Boolean = {
if (fullName == null || fullName.isEmpty)
return false
@@ -322,32 +374,47 @@ object DoiBoostMappingUtil {
true
}
-
def generateDataInfo(trust: String): DataInfo = {
val di = new DataInfo
di.setDeletedbyinference(false)
di.setInferred(false)
di.setInvisible(false)
di.setTrust(trust)
- di.setProvenanceaction(OafMapperUtils.qualifier(ModelConstants.SYSIMPORT_ACTIONSET,ModelConstants.SYSIMPORT_ACTIONSET, ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS))
+ di.setProvenanceaction(
+ OafMapperUtils.qualifier(
+ ModelConstants.SYSIMPORT_ACTIONSET,
+ ModelConstants.SYSIMPORT_ACTIONSET,
+ ModelConstants.DNET_PROVENANCE_ACTIONS,
+ ModelConstants.DNET_PROVENANCE_ACTIONS
+ )
+ )
di
}
-
-
- def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String): StructuredProperty = {
+ def createSP(
+ value: String,
+ classId: String,
+ className: String,
+ schemeId: String,
+ schemeName: String
+ ): StructuredProperty = {
val sp = new StructuredProperty
- sp.setQualifier(OafMapperUtils.qualifier(classId,className, schemeId, schemeName))
+ sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName))
sp.setValue(value)
sp
}
-
-
- def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String, dataInfo: DataInfo): StructuredProperty = {
+ def createSP(
+ value: String,
+ classId: String,
+ className: String,
+ schemeId: String,
+ schemeName: String,
+ dataInfo: DataInfo
+ ): StructuredProperty = {
val sp = new StructuredProperty
- sp.setQualifier(OafMapperUtils.qualifier(classId,className, schemeId, schemeName))
+ sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName))
sp.setValue(value)
sp.setDataInfo(dataInfo)
sp
@@ -356,17 +423,20 @@ object DoiBoostMappingUtil {
def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
val sp = new StructuredProperty
- sp.setQualifier(OafMapperUtils.qualifier(classId,classId, schemeId, schemeId))
+ sp.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId))
sp.setValue(value)
sp
}
-
-
- def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
+ def createSP(
+ value: String,
+ classId: String,
+ schemeId: String,
+ dataInfo: DataInfo
+ ): StructuredProperty = {
val sp = new StructuredProperty
- sp.setQualifier(OafMapperUtils.qualifier(classId,classId, schemeId, schemeId))
+ sp.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId))
sp.setValue(value)
sp.setDataInfo(dataInfo)
sp
@@ -382,7 +452,6 @@ object DoiBoostMappingUtil {
}
-
def createUnpayWallCollectedFrom(): KeyValue = {
val cf = new KeyValue
@@ -401,15 +470,11 @@ object DoiBoostMappingUtil {
}
-
- def generateIdentifier (oaf: Result, doi: String): String = {
- val id = DHPUtils.md5 (doi.toLowerCase)
+ def generateIdentifier(oaf: Result, doi: String): String = {
+ val id = DHPUtils.md5(doi.toLowerCase)
s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}"
}
-
-
-
def createMAGCollectedFrom(): KeyValue = {
val cf = new KeyValue
@@ -424,19 +489,21 @@ object DoiBoostMappingUtil {
tmp.setValue(value)
tmp
-
}
def isEmpty(x: String) = x == null || x.trim.isEmpty
- def normalizeDoi(input : String) :String ={
- if(input == null)
+ def normalizeDoi(input: String): String = {
+ if (input == null)
return null
- val replaced = input.replaceAll("(?:\\n|\\r|\\t|\\s)", "").toLowerCase.replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX)
- if (isEmpty(replaced))
+ val replaced = input
+ .replaceAll("(?:\\n|\\r|\\t|\\s)", "")
+ .toLowerCase
+ .replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX)
+ if (isEmpty(replaced))
return null
- if(replaced.indexOf("10.") < 0)
+ if (replaced.indexOf("10.") < 0)
return null
val ret = replaced.substring(replaced.indexOf("10."))
@@ -446,9 +513,6 @@ object DoiBoostMappingUtil {
return ret
-
}
-
-
}
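
Note: for quick reference, the normalizeDoi method that closes this file boils down to stripping whitespace, lower-casing, and keeping everything from the first "10." onwards. A dependency-free sketch of those visible steps; the DOI_PREFIX_REGEX rewrite and the checks elided by the last hunk are omitted, and the object and method names are illustrative:

object DoiNormalizationSketch {

  // Drop whitespace, lower-case, and keep the suffix starting at "10."; reject anything else.
  def normalize(input: String): Option[String] =
    Option(input)
      .map(_.replaceAll("(?:\\n|\\r|\\t|\\s)", "").toLowerCase)
      .filter(_.nonEmpty)
      .filter(_.contains("10."))
      .map(s => s.substring(s.indexOf("10.")))

  def main(args: Array[String]): Unit = {
    println(normalize(" 10.1234/ABC\n"))  // Some(10.1234/abc)
    println(normalize("doi:10.5678/xyz")) // Some(10.5678/xyz)
    println(normalize(null))              // None
  }
}
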
diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala
index f13900abe..b6152526d 100644
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala
@@ -17,22 +17,29 @@ object SparkGenerateDOIBoostActionSet {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json")
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
+ .master(parser.get("master"))
+ .getOrCreate()
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
- implicit val mapEncoderAS: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)
+ implicit val mapEncoderAS: Encoder[(String, String)] =
+ Encoders.tuple(Encoders.STRING, Encoders.STRING)
- implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] = Encoders.kryo[AtomicAction[OafDataset]]
+ implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] =
+ Encoders.kryo[AtomicAction[OafDataset]]
val dbPublicationPath = parser.get("dbPublicationPath")
val dbDatasetPath = parser.get("dbDatasetPath")
@@ -41,35 +48,61 @@ object SparkGenerateDOIBoostActionSet {
val dbOrganizationPath = parser.get("dbOrganizationPath")
val sequenceFilePath = parser.get("sFilePath")
- val asDataset = spark.read.load(dbDatasetPath).as[OafDataset]
+ val asDataset = spark.read
+ .load(dbDatasetPath)
+ .as[OafDataset]
.filter(p => p != null || p.getId != null)
.map(d => DoiBoostMappingUtil.fixResult(d))
- .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
+ .map(d => DoiBoostMappingUtil.toActionSet(d))(
+ Encoders.tuple(Encoders.STRING, Encoders.STRING)
+ )
-
- val asPublication = spark.read.load(dbPublicationPath).as[Publication]
+ val asPublication = spark.read
+ .load(dbPublicationPath)
+ .as[Publication]
.filter(p => p != null || p.getId != null)
- .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
+ .map(d => DoiBoostMappingUtil.toActionSet(d))(
+ Encoders.tuple(Encoders.STRING, Encoders.STRING)
+ )
+ val asOrganization = spark.read
+ .load(dbOrganizationPath)
+ .as[Organization]
+ .map(d => DoiBoostMappingUtil.toActionSet(d))(
+ Encoders.tuple(Encoders.STRING, Encoders.STRING)
+ )
- val asOrganization = spark.read.load(dbOrganizationPath).as[Organization]
- .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
-
-
- val asCRelation = spark.read.load(crossRefRelation).as[Relation]
+ val asCRelation = spark.read
+ .load(crossRefRelation)
+ .as[Relation]
.filter(r => r != null && r.getSource != null && r.getTarget != null)
- .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
+ .map(d => DoiBoostMappingUtil.toActionSet(d))(
+ Encoders.tuple(Encoders.STRING, Encoders.STRING)
+ )
+ val asRelAffiliation = spark.read
+ .load(dbaffiliationRelationPath)
+ .as[Relation]
+ .map(d => DoiBoostMappingUtil.toActionSet(d))(
+ Encoders.tuple(Encoders.STRING, Encoders.STRING)
+ )
- val asRelAffiliation = spark.read.load(dbaffiliationRelationPath).as[Relation]
- .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
-
-
- val d: Dataset[(String, String)] = asDataset.union(asPublication).union(asOrganization).union(asCRelation).union(asRelAffiliation)
-
-
- d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])
+ val d: Dataset[(String, String)] = asDataset
+ .union(asPublication)
+ .union(asOrganization)
+ .union(asCRelation)
+ .union(asRelAffiliation)
+ d.rdd
+ .repartition(6000)
+ .map(s => (new Text(s._1), new Text(s._2)))
+ .saveAsHadoopFile(
+ s"$sequenceFilePath",
+ classOf[Text],
+ classOf[Text],
+ classOf[SequenceFileOutputFormat[Text, Text]],
+ classOf[GzipCodec]
+ )
}
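
Note: the tail of this main method unions the per-source Dataset[(String, String)] pairs produced by DoiBoostMappingUtil.toActionSet and writes them out as a gzip-compressed SequenceFile of Text/Text records. A sketch of just that write step, assuming Spark and Hadoop on the classpath; the object and method names are illustrative, and the real job also repartitions to 6000 before saving:

import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.sql.Dataset

object ActionSetWriteSketch {

  // Union the per-source encodings (parts must be non-empty) and persist them
  // as Text/Text SequenceFile records compressed with gzip.
  def write(parts: Seq[Dataset[(String, String)]], path: String): Unit =
    parts
      .reduce(_ union _)
      .rdd
      .map { case (key, value) => (new Text(key), new Text(value)) }
      .saveAsHadoopFile(
        path,
        classOf[Text],
        classOf[Text],
        classOf[SequenceFileOutputFormat[Text, Text]],
        classOf[GzipCodec]
      )
}
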
diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala
index 91fe56cba..9323c994c 100644
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala
@@ -15,8 +15,8 @@ import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
-object SparkGenerateDoiBoost {
+object SparkGenerateDoiBoost {
def extractIdGRID(input: String): List[(String, String)] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@@ -26,28 +26,32 @@ object SparkGenerateDoiBoost {
val grids: List[String] = for {
- JObject(pid) <- json \ "pid"
+ JObject(pid) <- json \ "pid"
JField("qualifier", JObject(qualifier)) <- pid
- JField("classid", JString(classid)) <- qualifier
- JField("value", JString(vl)) <- pid
+ JField("classid", JString(classid)) <- qualifier
+ JField("value", JString(vl)) <- pid
if classid == "GRID"
} yield vl
grids.map(g => (id, s"unresolved::grid::${g.toLowerCase}"))(collection.breakOut)
}
-
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json")
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
+ .master(parser.get("master"))
+ .getOrCreate()
import spark.implicits._
@@ -65,8 +69,7 @@ object SparkGenerateDoiBoost {
a._2.setId(a._1)
return a._2
}
- }
- else {
+ } else {
if (a != null && a._2 != null) {
b.mergeFrom(a._2)
b.setId(a._1)
@@ -82,8 +85,7 @@ object SparkGenerateDoiBoost {
if (b1 == null) {
if (b2 != null)
return b2
- }
- else {
+ } else {
if (b2 != null) {
b1.mergeFrom(b2)
val authors = AuthorMerger.mergeAuthor(b1.getAuthor, b2.getAuthor)
@@ -103,17 +105,19 @@ object SparkGenerateDoiBoost {
override def outputEncoder: Encoder[Publication] = Encoders.kryo[Publication]
}
-
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
- implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
+ implicit val tupleForJoinEncoder: Encoder[(String, Publication)] =
+ Encoders.tuple(Encoders.STRING, mapEncoderPub)
implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
logger.info("Phase 2) Join Crossref with UnpayWall")
- val crossrefPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p => (p.getId, p))
- val uwPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/uwPublication").as[Publication].map(p => (p.getId, p))
+ val crossrefPublication: Dataset[(String, Publication)] =
+ spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p => (p.getId, p))
+ val uwPublication: Dataset[(String, Publication)] =
+ spark.read.load(s"$workingDirPath/uwPublication").as[Publication].map(p => (p.getId, p))
def applyMerge(item: ((String, Publication), (String, Publication))): Publication = {
val crossrefPub = item._1._2
@@ -127,86 +131,140 @@ object SparkGenerateDoiBoost {
crossrefPub
}
- crossrefPublication.joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/firstJoin")
+ crossrefPublication
+ .joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")), "left")
+ .map(applyMerge)
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingDirPath/firstJoin")
logger.info("Phase 3) Join Result with ORCID")
- val fj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p))
- val orcidPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p))
- fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/secondJoin")
+ val fj: Dataset[(String, Publication)] =
+ spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p))
+ val orcidPublication: Dataset[(String, Publication)] =
+ spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p))
+ fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left")
+ .map(applyMerge)
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingDirPath/secondJoin")
logger.info("Phase 4) Join Result with MAG")
- val sj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p))
+ val sj: Dataset[(String, Publication)] =
+ spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p))
- val magPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p))
- sj.joinWith(magPublication, sj("_1").equalTo(magPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublication")
+ val magPublication: Dataset[(String, Publication)] =
+ spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p))
+ sj.joinWith(magPublication, sj("_1").equalTo(magPublication("_1")), "left")
+ .map(applyMerge)
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingDirPath/doiBoostPublication")
+ val doiBoostPublication: Dataset[(String, Publication)] = spark.read
+ .load(s"$workingDirPath/doiBoostPublication")
+ .as[Publication]
+ .filter(p => DoiBoostMappingUtil.filterPublication(p))
+ .map(DoiBoostMappingUtil.toISSNPair)(tupleForJoinEncoder)
- val doiBoostPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublication").as[Publication].filter(p => DoiBoostMappingUtil.filterPublication(p)).map(DoiBoostMappingUtil.toISSNPair)(tupleForJoinEncoder)
+ val hostedByDataset: Dataset[(String, HostedByItemType)] = spark.createDataset(
+ spark.sparkContext.textFile(hostedByMapPath).map(DoiBoostMappingUtil.toHostedByItem)
+ )
- val hostedByDataset: Dataset[(String, HostedByItemType)] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath).map(DoiBoostMappingUtil.toHostedByItem))
-
-
- doiBoostPublication.joinWith(hostedByDataset, doiBoostPublication("_1").equalTo(hostedByDataset("_1")), "left")
+ doiBoostPublication
+ .joinWith(hostedByDataset, doiBoostPublication("_1").equalTo(hostedByDataset("_1")), "left")
.map(DoiBoostMappingUtil.fixPublication)
.map(p => (p.getId, p))
.groupByKey(_._1)
.agg(crossrefAggregator.toColumn)
.map(p => p._2)
- .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationFiltered")
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingDirPath/doiBoostPublicationFiltered")
val affiliationPath = parser.get("affiliationPath")
val paperAffiliationPath = parser.get("paperAffiliationPath")
- val affiliation = spark.read.load(affiliationPath).select(col("AffiliationId"), col("GridId"), col("OfficialPage"), col("DisplayName"))
-
- val paperAffiliation = spark.read.load(paperAffiliationPath).select(col("AffiliationId").alias("affId"), col("PaperId"))
+ val affiliation = spark.read
+ .load(affiliationPath)
+ .select(col("AffiliationId"), col("GridId"), col("OfficialPage"), col("DisplayName"))
+ val paperAffiliation = spark.read
+ .load(paperAffiliationPath)
+ .select(col("AffiliationId").alias("affId"), col("PaperId"))
val a: Dataset[DoiBoostAffiliation] = paperAffiliation
.joinWith(affiliation, paperAffiliation("affId").equalTo(affiliation("AffiliationId")))
- .select(col("_1.PaperId"), col("_2.AffiliationId"), col("_2.GridId"), col("_2.OfficialPage"), col("_2.DisplayName")).as[DoiBoostAffiliation]
+ .select(
+ col("_1.PaperId"),
+ col("_2.AffiliationId"),
+ col("_2.GridId"),
+ col("_2.OfficialPage"),
+ col("_2.DisplayName")
+ )
+ .as[DoiBoostAffiliation]
+ val magPubs: Dataset[(String, Publication)] = spark.read
+ .load(s"$workingDirPath/doiBoostPublicationFiltered")
+ .as[Publication]
+ .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(
+ tupleForJoinEncoder
+ )
+ .filter(s => s._1 != null)
- val magPubs: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublicationFiltered").as[Publication]
- .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(tupleForJoinEncoder).filter(s => s._1 != null)
+ magPubs
+ .joinWith(a, magPubs("_1").equalTo(a("PaperId")))
+ .flatMap(item => {
+ val pub: Publication = item._1._2
+ val affiliation = item._2
+ val affId: String =
+ if (affiliation.GridId.isDefined)
+ s"unresolved::grid::${affiliation.GridId.get.toLowerCase}"
+ else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString)
+ val r: Relation = new Relation
+ r.setSource(pub.getId)
+ r.setTarget(affId)
+ r.setRelType(ModelConstants.RESULT_ORGANIZATION)
+ r.setRelClass(ModelConstants.HAS_AUTHOR_INSTITUTION)
+ r.setSubRelType(ModelConstants.AFFILIATION)
+ r.setDataInfo(pub.getDataInfo)
+ r.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
+ val r1: Relation = new Relation
+ r1.setTarget(pub.getId)
+ r1.setSource(affId)
+ r1.setRelType(ModelConstants.RESULT_ORGANIZATION)
+ r1.setRelClass(ModelConstants.IS_AUTHOR_INSTITUTION_OF)
+ r1.setSubRelType(ModelConstants.AFFILIATION)
+ r1.setDataInfo(pub.getDataInfo)
+ r1.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
+ List(r, r1)
+ })(mapEncoderRel)
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved")
+ val unresolvedRels: Dataset[(String, Relation)] = spark.read
+ .load(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved")
+ .as[Relation]
+ .map(r => {
- magPubs.joinWith(a, magPubs("_1").equalTo(a("PaperId"))).flatMap(item => {
- val pub: Publication = item._1._2
- val affiliation = item._2
- val affId: String = if (affiliation.GridId.isDefined) s"unresolved::grid::${affiliation.GridId.get.toLowerCase}" else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString)
- val r: Relation = new Relation
- r.setSource(pub.getId)
- r.setTarget(affId)
- r.setRelType(ModelConstants.RESULT_ORGANIZATION)
- r.setRelClass(ModelConstants.HAS_AUTHOR_INSTITUTION)
- r.setSubRelType(ModelConstants.AFFILIATION)
- r.setDataInfo(pub.getDataInfo)
- r.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
- val r1: Relation = new Relation
- r1.setTarget(pub.getId)
- r1.setSource(affId)
- r1.setRelType(ModelConstants.RESULT_ORGANIZATION)
- r1.setRelClass(ModelConstants.IS_AUTHOR_INSTITUTION_OF)
- r1.setSubRelType(ModelConstants.AFFILIATION)
- r1.setDataInfo(pub.getDataInfo)
- r1.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
- List(r, r1)
- })(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved")
+ if (r.getSource.startsWith("unresolved"))
+ (r.getSource, r)
+ else if (r.getTarget.startsWith("unresolved"))
+ (r.getTarget, r)
+ else
+ ("resolved", r)
+ })(Encoders.tuple(Encoders.STRING, mapEncoderRel))
+ val openaireOrganization: Dataset[(String, String)] = spark.read
+ .text(openaireOrganizationPath)
+ .as[String]
+ .flatMap(s => extractIdGRID(s))
+ .groupByKey(_._2)
+ .reduceGroups((x, y) => if (x != null) x else y)
+ .map(_._2)
- val unresolvedRels: Dataset[(String, Relation)] = spark.read.load(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved").as[Relation].map(r => {
-
- if (r.getSource.startsWith("unresolved"))
- (r.getSource, r)
- else if (r.getTarget.startsWith("unresolved"))
- (r.getTarget, r)
- else
- ("resolved", r)
- })(Encoders.tuple(Encoders.STRING, mapEncoderRel))
-
- val openaireOrganization: Dataset[(String, String)] = spark.read.text(openaireOrganizationPath).as[String].flatMap(s => extractIdGRID(s)).groupByKey(_._2).reduceGroups((x, y) => if (x != null) x else y).map(_._2)
-
- unresolvedRels.joinWith(openaireOrganization, unresolvedRels("_1").equalTo(openaireOrganization("_2")))
+ unresolvedRels
+ .joinWith(openaireOrganization, unresolvedRels("_1").equalTo(openaireOrganization("_2")))
.map { x =>
val currentRels = x._1._2
val currentOrgs = x._2
@@ -216,26 +274,35 @@ object SparkGenerateDoiBoost {
else
currentRels.setTarget(currentOrgs._1)
currentRels
- }.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved")).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation")
-
- magPubs.joinWith(a, magPubs("_1").equalTo(a("PaperId"))).map(item => {
- val affiliation = item._2
- if (affiliation.GridId.isEmpty) {
- val o = new Organization
- o.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
- o.setDataInfo(DoiBoostMappingUtil.generateDataInfo())
- o.setId(DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString))
- o.setOriginalId(List(affiliation.AffiliationId.toString).asJava)
- if (affiliation.DisplayName.nonEmpty)
- o.setLegalname(DoiBoostMappingUtil.asField(affiliation.DisplayName.get))
- if (affiliation.OfficialPage.isDefined)
- o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get))
- o.setCountry(ModelConstants.UNKNOWN_COUNTRY)
- o
}
- else
- null
- }).filter(o => o != null).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostOrganization")
+ .filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved"))
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingDirPath/doiBoostPublicationAffiliation")
+
+ magPubs
+ .joinWith(a, magPubs("_1").equalTo(a("PaperId")))
+ .map(item => {
+ val affiliation = item._2
+ if (affiliation.GridId.isEmpty) {
+ val o = new Organization
+ o.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
+ o.setDataInfo(DoiBoostMappingUtil.generateDataInfo())
+ o.setId(DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString))
+ o.setOriginalId(List(affiliation.AffiliationId.toString).asJava)
+ if (affiliation.DisplayName.nonEmpty)
+ o.setLegalname(DoiBoostMappingUtil.asField(affiliation.DisplayName.get))
+ if (affiliation.OfficialPage.isDefined)
+ o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get))
+ o.setCountry(ModelConstants.UNKNOWN_COUNTRY)
+ o
+ } else
+ null
+ })
+ .filter(o => o != null)
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingDirPath/doiBoostOrganization")
}
}
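
Note: phases 2 to 4 above repeat one pattern: key both datasets by publication id, left-join them, and merge the right-hand record into the left one when a match exists (applyMerge). A hedged, generic sketch of that step, assuming Spark on the classpath; the enrich signature is illustrative and merge stands in for Publication.mergeFrom:

import org.apache.spark.sql.{Dataset, Encoder}

object JoinAndMergeSketch {

  // Left-join a keyed base dataset with a keyed enrichment dataset and fold the
  // match, when present, into the base record.
  def enrich[T](
    base: Dataset[(String, T)],
    extra: Dataset[(String, T)],
    merge: (T, T) => T
  )(implicit enc: Encoder[T]): Dataset[T] =
    base
      .joinWith(extra, base("_1").equalTo(extra("_1")), "left")
      .map { case ((_, left), right) =>
        if (right == null) left else merge(left, right._2)
      }
}
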
diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
index edca4a180..0cb08ea94 100644
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
@@ -18,70 +18,74 @@ import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.util.matching.Regex
-case class CrossrefDT(doi: String, json:String, timestamp: Long) {}
+case class CrossrefDT(doi: String, json: String, timestamp: Long) {}
case class mappingAffiliation(name: String) {}
-case class mappingAuthor(given: Option[String], family: String, sequence:Option[String], ORCID: Option[String], affiliation: Option[mappingAffiliation]) {}
+case class mappingAuthor(
+ given: Option[String],
+ family: String,
+ sequence: Option[String],
+ ORCID: Option[String],
+ affiliation: Option[mappingAffiliation]
+) {}
case class mappingFunder(name: String, DOI: Option[String], award: Option[List[String]]) {}
-
case object Crossref2Oaf {
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
val mappingCrossrefType = Map(
- "book-section" -> "publication",
- "book" -> "publication",
- "book-chapter" -> "publication",
- "book-part" -> "publication",
- "book-series" -> "publication",
- "book-set" -> "publication",
- "book-track" -> "publication",
- "edited-book" -> "publication",
- "reference-book" -> "publication",
- "monograph" -> "publication",
- "journal-article" -> "publication",
- "dissertation" -> "publication",
- "other" -> "publication",
- "peer-review" -> "publication",
- "proceedings" -> "publication",
+ "book-section" -> "publication",
+ "book" -> "publication",
+ "book-chapter" -> "publication",
+ "book-part" -> "publication",
+ "book-series" -> "publication",
+ "book-set" -> "publication",
+ "book-track" -> "publication",
+ "edited-book" -> "publication",
+ "reference-book" -> "publication",
+ "monograph" -> "publication",
+ "journal-article" -> "publication",
+ "dissertation" -> "publication",
+ "other" -> "publication",
+ "peer-review" -> "publication",
+ "proceedings" -> "publication",
"proceedings-article" -> "publication",
- "reference-entry" -> "publication",
- "report" -> "publication",
- "report-series" -> "publication",
- "standard" -> "publication",
- "standard-series" -> "publication",
- "posted-content" -> "publication",
- "dataset" -> "dataset"
+ "reference-entry" -> "publication",
+ "report" -> "publication",
+ "report-series" -> "publication",
+ "standard" -> "publication",
+ "standard-series" -> "publication",
+ "posted-content" -> "publication",
+ "dataset" -> "dataset"
)
-
val mappingCrossrefSubType = Map(
- "book-section" -> "0013 Part of book or chapter of book",
- "book" -> "0002 Book",
- "book-chapter" -> "0013 Part of book or chapter of book",
- "book-part" -> "0013 Part of book or chapter of book",
- "book-series" -> "0002 Book",
- "book-set" -> "0002 Book",
- "book-track" -> "0002 Book",
- "edited-book" -> "0002 Book",
- "reference-book" -> "0002 Book",
- "monograph" -> "0002 Book",
- "journal-article" -> "0001 Article",
- "dissertation" -> "0044 Thesis",
- "other" -> "0038 Other literature type",
- "peer-review" -> "0015 Review",
- "proceedings" -> "0004 Conference object",
+ "book-section" -> "0013 Part of book or chapter of book",
+ "book" -> "0002 Book",
+ "book-chapter" -> "0013 Part of book or chapter of book",
+ "book-part" -> "0013 Part of book or chapter of book",
+ "book-series" -> "0002 Book",
+ "book-set" -> "0002 Book",
+ "book-track" -> "0002 Book",
+ "edited-book" -> "0002 Book",
+ "reference-book" -> "0002 Book",
+ "monograph" -> "0002 Book",
+ "journal-article" -> "0001 Article",
+ "dissertation" -> "0044 Thesis",
+ "other" -> "0038 Other literature type",
+ "peer-review" -> "0015 Review",
+ "proceedings" -> "0004 Conference object",
"proceedings-article" -> "0004 Conference object",
- "reference-entry" -> "0013 Part of book or chapter of book",
- "report" -> "0017 Report",
- "report-series" -> "0017 Report",
- "standard" -> "0038 Other literature type",
- "standard-series" -> "0038 Other literature type",
- "dataset" -> "0021 Dataset",
- "preprint" -> "0016 Preprint",
- "report" -> "0017 Report"
+ "reference-entry" -> "0013 Part of book or chapter of book",
+ "report" -> "0017 Report",
+ "report-series" -> "0017 Report",
+ "standard" -> "0038 Other literature type",
+ "standard-series" -> "0038 Other literature type",
+ "dataset" -> "0021 Dataset",
+ "preprint" -> "0016 Preprint",
+ "report" -> "0017 Report"
)
def mappingResult(result: Result, json: JValue, cobjCategory: String): Result = {
@@ -100,7 +104,6 @@ case object Crossref2Oaf {
val originalIds = new util.ArrayList(tmp.filter(id => id != null).asJava)
result.setOriginalId(originalIds)
-
// Add DataInfo
result.setDataInfo(generateDataInfo())
@@ -111,98 +114,169 @@ case object Crossref2Oaf {
// Publisher ( Name of work's publisher mapped into Result/Publisher)
val publisher = (json \ "publisher").extractOrElse[String](null)
- if (publisher!= null && publisher.nonEmpty)
+ if (publisher != null && publisher.nonEmpty)
result.setPublisher(asField(publisher))
-
// TITLE
- val mainTitles = for {JString(title) <- json \ "title" if title.nonEmpty} yield createSP(title, "main title", ModelConstants.DNET_DATACITE_TITLE)
- val originalTitles = for {JString(title) <- json \ "original-title" if title.nonEmpty} yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
- val shortTitles = for {JString(title) <- json \ "short-title" if title.nonEmpty} yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
- val subtitles = for {JString(title) <- json \ "subtitle" if title.nonEmpty} yield createSP(title, "subtitle", ModelConstants.DNET_DATACITE_TITLE)
+ val mainTitles =
+ for { JString(title) <- json \ "title" if title.nonEmpty } yield createSP(
+ title,
+ "main title",
+ ModelConstants.DNET_DATACITE_TITLE
+ )
+ val originalTitles = for {
+ JString(title) <- json \ "original-title" if title.nonEmpty
+ } yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
+ val shortTitles = for {
+ JString(title) <- json \ "short-title" if title.nonEmpty
+ } yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
+ val subtitles =
+ for { JString(title) <- json \ "subtitle" if title.nonEmpty } yield createSP(
+ title,
+ "subtitle",
+ ModelConstants.DNET_DATACITE_TITLE
+ )
result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava)
// DESCRIPTION
- val descriptionList = for {JString(description) <- json \ "abstract"} yield asField(description)
+ val descriptionList =
+ for { JString(description) <- json \ "abstract" } yield asField(description)
result.setDescription(descriptionList.asJava)
// Source
- val sourceList = for {JString(source) <- json \ "source" if source!= null && source.nonEmpty} yield asField(source)
+ val sourceList = for {
+ JString(source) <- json \ "source" if source != null && source.nonEmpty
+ } yield asField(source)
result.setSource(sourceList.asJava)
//RELEVANT DATE Mapping
- val createdDate = generateDate((json \ "created" \ "date-time").extract[String], (json \ "created" \ "date-parts").extract[List[List[Int]]], "created", ModelConstants.DNET_DATACITE_DATE)
- val postedDate = generateDate((json \ "posted" \ "date-time").extractOrElse[String](null), (json \ "posted" \ "date-parts").extract[List[List[Int]]], "available", ModelConstants.DNET_DATACITE_DATE)
- val acceptedDate = generateDate((json \ "accepted" \ "date-time").extractOrElse[String](null), (json \ "accepted" \ "date-parts").extract[List[List[Int]]], "accepted", ModelConstants.DNET_DATACITE_DATE)
- val publishedPrintDate = generateDate((json \ "published-print" \ "date-time").extractOrElse[String](null), (json \ "published-print" \ "date-parts").extract[List[List[Int]]], "published-print", ModelConstants.DNET_DATACITE_DATE)
- val publishedOnlineDate = generateDate((json \ "published-online" \ "date-time").extractOrElse[String](null), (json \ "published-online" \ "date-parts").extract[List[List[Int]]], "published-online", ModelConstants.DNET_DATACITE_DATE)
+ val createdDate = generateDate(
+ (json \ "created" \ "date-time").extract[String],
+ (json \ "created" \ "date-parts").extract[List[List[Int]]],
+ "created",
+ ModelConstants.DNET_DATACITE_DATE
+ )
+ val postedDate = generateDate(
+ (json \ "posted" \ "date-time").extractOrElse[String](null),
+ (json \ "posted" \ "date-parts").extract[List[List[Int]]],
+ "available",
+ ModelConstants.DNET_DATACITE_DATE
+ )
+ val acceptedDate = generateDate(
+ (json \ "accepted" \ "date-time").extractOrElse[String](null),
+ (json \ "accepted" \ "date-parts").extract[List[List[Int]]],
+ "accepted",
+ ModelConstants.DNET_DATACITE_DATE
+ )
+ val publishedPrintDate = generateDate(
+ (json \ "published-print" \ "date-time").extractOrElse[String](null),
+ (json \ "published-print" \ "date-parts").extract[List[List[Int]]],
+ "published-print",
+ ModelConstants.DNET_DATACITE_DATE
+ )
+ val publishedOnlineDate = generateDate(
+ (json \ "published-online" \ "date-time").extractOrElse[String](null),
+ (json \ "published-online" \ "date-parts").extract[List[List[Int]]],
+ "published-online",
+ ModelConstants.DNET_DATACITE_DATE
+ )
- val issuedDate = extractDate((json \ "issued" \ "date-time").extractOrElse[String](null), (json \ "issued" \ "date-parts").extract[List[List[Int]]])
+ val issuedDate = extractDate(
+ (json \ "issued" \ "date-time").extractOrElse[String](null),
+ (json \ "issued" \ "date-parts").extract[List[List[Int]]]
+ )
if (StringUtils.isNotBlank(issuedDate)) {
result.setDateofacceptance(asField(issuedDate))
- }
- else {
+ } else {
result.setDateofacceptance(asField(createdDate.getValue))
}
- result.setRelevantdate(List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate).filter(p => p != null).asJava)
+ result.setRelevantdate(
+ List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate)
+ .filter(p => p != null)
+ .asJava
+ )
//Mapping Subject
- val subjectList:List[String] = (json \ "subject").extractOrElse[List[String]](List())
+ val subjectList: List[String] = (json \ "subject").extractOrElse[List[String]](List())
if (subjectList.nonEmpty) {
- result.setSubject(subjectList.map(s=> createSP(s, "keywords", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava)
+ result.setSubject(
+ subjectList.map(s => createSP(s, "keywords", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava
+ )
}
-
-
//Mapping Author
- val authorList: List[mappingAuthor] = (json \ "author").extractOrElse[List[mappingAuthor]](List())
+ val authorList: List[mappingAuthor] =
+ (json \ "author").extractOrElse[List[mappingAuthor]](List())
+ val sorted_list = authorList.sortWith((a: mappingAuthor, b: mappingAuthor) =>
+ a.sequence.isDefined && a.sequence.get.equalsIgnoreCase("first")
+ )
-
- val sorted_list = authorList.sortWith((a:mappingAuthor, b:mappingAuthor) => a.sequence.isDefined && a.sequence.get.equalsIgnoreCase("first"))
-
- result.setAuthor(sorted_list.zipWithIndex.map{case (a, index) => generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull, index)}.asJava)
+ result.setAuthor(sorted_list.zipWithIndex.map { case (a, index) =>
+ generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull, index)
+ }.asJava)
// Mapping instance
val instance = new Instance()
val license = for {
- JObject(license) <- json \ "license"
- JField("URL", JString(lic)) <- license
+ JObject(license) <- json \ "license"
+ JField("URL", JString(lic)) <- license
JField("content-version", JString(content_version)) <- license
} yield (asField(lic), content_version)
val l = license.filter(d => StringUtils.isNotBlank(d._1.getValue))
- if (l.nonEmpty){
- if (l exists (d => d._2.equals("vor"))){
- for(d <- l){
- if (d._2.equals("vor")){
+ if (l.nonEmpty) {
+ if (l exists (d => d._2.equals("vor"))) {
+ for (d <- l) {
+ if (d._2.equals("vor")) {
instance.setLicense(d._1)
}
}
+ } else {
+ instance.setLicense(l.head._1)
}
- else{
- instance.setLicense(l.head._1)}
}
// Ticket #6281 added pid to Instance
instance.setPid(result.getPid)
- val has_review = json \ "relation" \"has-review" \ "id"
+ val has_review = json \ "relation" \ "has-review" \ "id"
- if(has_review != JNothing) {
+ if (has_review != JNothing) {
instance.setRefereed(
- OafMapperUtils.qualifier("0001", "peerReviewed", ModelConstants.DNET_REVIEW_LEVELS, ModelConstants.DNET_REVIEW_LEVELS))
+ OafMapperUtils.qualifier(
+ "0001",
+ "peerReviewed",
+ ModelConstants.DNET_REVIEW_LEVELS,
+ ModelConstants.DNET_REVIEW_LEVELS
+ )
+ )
}
- instance.setAccessright(decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue))
- instance.setInstancetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
- result.setResourcetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
+ instance.setAccessright(
+ decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue)
+ )
+ instance.setInstancetype(
+ OafMapperUtils.qualifier(
+ cobjCategory.substring(0, 4),
+ cobjCategory.substring(5),
+ ModelConstants.DNET_PUBLICATION_RESOURCE,
+ ModelConstants.DNET_PUBLICATION_RESOURCE
+ )
+ )
+ result.setResourcetype(
+ OafMapperUtils.qualifier(
+ cobjCategory.substring(0, 4),
+ cobjCategory.substring(5),
+ ModelConstants.DNET_PUBLICATION_RESOURCE,
+ ModelConstants.DNET_PUBLICATION_RESOURCE
+ )
+ )
instance.setCollectedfrom(createCrossrefCollectedFrom())
if (StringUtils.isNotBlank(issuedDate)) {
instance.setDateofacceptance(asField(issuedDate))
- }
- else {
+ } else {
instance.setDateofacceptance(asField(createdDate.getValue))
}
val s: List[String] = List("https://doi.org/" + doi)
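
Note: the cobjCategory string threaded through mappingResult comes from the mappingCrossrefType / mappingCrossrefSubType tables at the top of this file, and the instancetype / resourcetype qualifiers built just above split it into a four-character code and a label. A tiny sketch of that split (object and method names are illustrative):

object CobjCategorySketch {

  // "0001 Article" => ("0001", "Article"), mirroring substring(0, 4) / substring(5) above.
  def split(cobjCategory: String): (String, String) =
    (cobjCategory.substring(0, 4), cobjCategory.substring(5))

  def main(args: Array[String]): Unit =
    println(split("0001 Article")) // (0001,Article)
}
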
@@ -210,10 +284,9 @@ case object Crossref2Oaf {
// if (links.nonEmpty) {
// instance.setUrl(links.asJava)
// }
- if(s.nonEmpty)
- {
- instance.setUrl(s.asJava)
- }
+ if (s.nonEmpty) {
+ instance.setUrl(s.asJava)
+ }
result.setInstance(List(instance).asJava)
@@ -236,15 +309,23 @@ case object Crossref2Oaf {
result
}
-
- def generateAuhtor(given: String, family: String, orcid: String, index:Int): Author = {
+ def generateAuhtor(given: String, family: String, orcid: String, index: Int): Author = {
val a = new Author
a.setName(given)
a.setSurname(family)
a.setFullname(s"$given $family")
- a.setRank(index+1)
+ a.setRank(index + 1)
if (StringUtils.isNotBlank(orcid))
- a.setPid(List(createSP(orcid, ModelConstants.ORCID_PENDING, ModelConstants.DNET_PID_TYPES, generateDataInfo())).asJava)
+ a.setPid(
+ List(
+ createSP(
+ orcid,
+ ModelConstants.ORCID_PENDING,
+ ModelConstants.DNET_PID_TYPES,
+ generateDataInfo()
+ )
+ ).asJava
+ )
a
}
@@ -255,54 +336,62 @@ case object Crossref2Oaf {
var resultList: List[Oaf] = List()
-
val objectType = (json \ "type").extractOrElse[String](null)
val objectSubType = (json \ "subtype").extractOrElse[String](null)
if (objectType == null)
return resultList
-
val result = generateItemFromType(objectType, objectSubType)
if (result == null)
return List()
- val cOBJCategory = mappingCrossrefSubType.getOrElse(objectType, mappingCrossrefSubType.getOrElse(objectSubType, "0038 Other literature type"))
+ val cOBJCategory = mappingCrossrefSubType.getOrElse(
+ objectType,
+ mappingCrossrefSubType.getOrElse(objectSubType, "0038 Other literature type")
+ )
mappingResult(result, json, cOBJCategory)
if (result == null || result.getId == null)
return List()
-
- val funderList: List[mappingFunder] = (json \ "funder").extractOrElse[List[mappingFunder]](List())
+ val funderList: List[mappingFunder] =
+ (json \ "funder").extractOrElse[List[mappingFunder]](List())
if (funderList.nonEmpty) {
- resultList = resultList ::: mappingFunderToRelations(funderList, result.getId, createCrossrefCollectedFrom(), result.getDataInfo, result.getLastupdatetimestamp)
+ resultList = resultList ::: mappingFunderToRelations(
+ funderList,
+ result.getId,
+ createCrossrefCollectedFrom(),
+ result.getDataInfo,
+ result.getLastupdatetimestamp
+ )
}
-
result match {
case publication: Publication => convertPublication(publication, json, cOBJCategory)
- case dataset: Dataset => convertDataset(dataset)
+ case dataset: Dataset => convertDataset(dataset)
}
resultList = resultList ::: List(result)
resultList
}
-
- def mappingFunderToRelations(funders: List[mappingFunder], sourceId: String, cf: KeyValue, di: DataInfo, ts: Long): List[Relation] = {
+ def mappingFunderToRelations(
+ funders: List[mappingFunder],
+ sourceId: String,
+ cf: KeyValue,
+ di: DataInfo,
+ ts: Long
+ ): List[Relation] = {
val queue = new mutable.Queue[Relation]
-
- def snsfRule(award:String): String = {
- val tmp1 = StringUtils.substringAfter(award,"_")
- val tmp2 = StringUtils.substringBefore(tmp1,"/")
+ def snsfRule(award: String): String = {
+ val tmp1 = StringUtils.substringAfter(award, "_")
+ val tmp2 = StringUtils.substringBefore(tmp1, "/")
logger.debug(s"From $award to $tmp2")
tmp2
-
}
-
def extractECAward(award: String): String = {
val awardECRegex: Regex = "[0-9]{4,9}".r
if (awardECRegex.findAllIn(award).hasNext)
@@ -310,8 +399,7 @@ case object Crossref2Oaf {
null
}
-
- def generateRelation(sourceId:String, targetId:String, relClass:String) :Relation = {
+ def generateRelation(sourceId: String, targetId: String, relClass: String): Relation = {
val r = new Relation
r.setSource(sourceId)
@@ -324,98 +412,119 @@ case object Crossref2Oaf {
r.setLastupdatetimestamp(ts)
r
-
}
-
- def generateSimpleRelationFromAward(funder: mappingFunder, nsPrefix: String, extractField: String => String): Unit = {
+ def generateSimpleRelationFromAward(
+ funder: mappingFunder,
+ nsPrefix: String,
+ extractField: String => String
+ ): Unit = {
if (funder.award.isDefined && funder.award.get.nonEmpty)
- funder.award.get.map(extractField).filter(a => a!= null && a.nonEmpty).foreach(
- award => {
+ funder.award.get
+ .map(extractField)
+ .filter(a => a != null && a.nonEmpty)
+ .foreach(award => {
val targetId = getProjectId(nsPrefix, DHPUtils.md5(award))
- queue += generateRelation(sourceId, targetId , ModelConstants.IS_PRODUCED_BY)
- queue += generateRelation(targetId , sourceId, ModelConstants.PRODUCES)
- }
- )
+ queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+ queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+ })
}
- def getProjectId (nsPrefix:String, targetId:String):String = {
+ def getProjectId(nsPrefix: String, targetId: String): String = {
s"40|$nsPrefix::$targetId"
}
-
if (funders != null)
- funders.foreach(funder => {
- if (funder.DOI.isDefined && funder.DOI.get.nonEmpty) {
- funder.DOI.get match {
- case "10.13039/100010663" |
- "10.13039/100010661" |
- "10.13039/501100007601" |
- "10.13039/501100000780" |
- "10.13039/100010665" => generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
- case "10.13039/100011199" |
- "10.13039/100004431" |
- "10.13039/501100004963" |
- "10.13039/501100000780" => generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
- case "10.13039/501100000781" => generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
- generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
- case "10.13039/100000001" => generateSimpleRelationFromAward(funder, "nsf_________", a => a)
- case "10.13039/501100001665" => generateSimpleRelationFromAward(funder, "anr_________", a => a)
- case "10.13039/501100002341" => generateSimpleRelationFromAward(funder, "aka_________", a => a)
- case "10.13039/501100001602" => generateSimpleRelationFromAward(funder, "aka_________", a => a.replace("SFI", ""))
- case "10.13039/501100000923" => generateSimpleRelationFromAward(funder, "arc_________", a => a)
- case "10.13039/501100000038"=> val targetId = getProjectId("nserc_______" , "1e5e62235d094afd01cd56e65112fc63")
- queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
- queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
- case "10.13039/501100000155"=> val targetId = getProjectId("sshrc_______" , "1e5e62235d094afd01cd56e65112fc63")
- queue += generateRelation(sourceId,targetId, ModelConstants.IS_PRODUCED_BY)
- queue += generateRelation(targetId,sourceId, ModelConstants.PRODUCES)
- case "10.13039/501100000024"=> val targetId = getProjectId("cihr________" , "1e5e62235d094afd01cd56e65112fc63")
- queue += generateRelation(sourceId,targetId, ModelConstants.IS_PRODUCED_BY)
- queue += generateRelation(targetId,sourceId, ModelConstants.PRODUCES)
- case "10.13039/501100002848" => generateSimpleRelationFromAward(funder, "conicytf____", a => a)
- case "10.13039/501100003448" => generateSimpleRelationFromAward(funder, "gsrt________", extractECAward)
- case "10.13039/501100010198" => generateSimpleRelationFromAward(funder, "sgov________", a=>a)
- case "10.13039/501100004564" => generateSimpleRelationFromAward(funder, "mestd_______", extractECAward)
- case "10.13039/501100003407" => generateSimpleRelationFromAward(funder, "miur________", a=>a)
- val targetId = getProjectId("miur________" , "1e5e62235d094afd01cd56e65112fc63")
- queue += generateRelation(sourceId,targetId, ModelConstants.IS_PRODUCED_BY)
- queue += generateRelation(targetId,sourceId, ModelConstants.PRODUCES)
- case "10.13039/501100006588" |
- "10.13039/501100004488" => generateSimpleRelationFromAward(funder, "irb_hr______", a=>a.replaceAll("Project No.", "").replaceAll("HRZZ-","") )
- case "10.13039/501100006769"=> generateSimpleRelationFromAward(funder, "rsf_________", a=>a)
- case "10.13039/501100001711"=> generateSimpleRelationFromAward(funder, "snsf________", snsfRule)
- case "10.13039/501100004410"=> generateSimpleRelationFromAward(funder, "tubitakf____", a =>a)
- case "10.10.13039/100004440"=> generateSimpleRelationFromAward(funder, "wt__________", a =>a)
- case "10.13039/100004440"=> val targetId = getProjectId("wt__________" , "1e5e62235d094afd01cd56e65112fc63")
- queue += generateRelation(sourceId,targetId, ModelConstants.IS_PRODUCED_BY)
- queue += generateRelation(targetId,sourceId, ModelConstants.PRODUCES)
+ funders.foreach(funder => {
+ if (funder.DOI.isDefined && funder.DOI.get.nonEmpty) {
+ funder.DOI.get match {
+ case "10.13039/100010663" | "10.13039/100010661" | "10.13039/501100007601" | "10.13039/501100000780" |
+ "10.13039/100010665" =>
+ generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
+ case "10.13039/100011199" | "10.13039/100004431" | "10.13039/501100004963" | "10.13039/501100000780" =>
+ generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
+ case "10.13039/501100000781" =>
+ generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
+ generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
+ case "10.13039/100000001" =>
+ generateSimpleRelationFromAward(funder, "nsf_________", a => a)
+ case "10.13039/501100001665" =>
+ generateSimpleRelationFromAward(funder, "anr_________", a => a)
+ case "10.13039/501100002341" =>
+ generateSimpleRelationFromAward(funder, "aka_________", a => a)
+ case "10.13039/501100001602" =>
+ generateSimpleRelationFromAward(funder, "aka_________", a => a.replace("SFI", ""))
+ case "10.13039/501100000923" =>
+ generateSimpleRelationFromAward(funder, "arc_________", a => a)
+ case "10.13039/501100000038" =>
+ val targetId = getProjectId("nserc_______", "1e5e62235d094afd01cd56e65112fc63")
+ queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+ queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+ case "10.13039/501100000155" =>
+ val targetId = getProjectId("sshrc_______", "1e5e62235d094afd01cd56e65112fc63")
+ queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+ queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+ case "10.13039/501100000024" =>
+ val targetId = getProjectId("cihr________", "1e5e62235d094afd01cd56e65112fc63")
+ queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+ queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+ case "10.13039/501100002848" =>
+ generateSimpleRelationFromAward(funder, "conicytf____", a => a)
+ case "10.13039/501100003448" =>
+ generateSimpleRelationFromAward(funder, "gsrt________", extractECAward)
+ case "10.13039/501100010198" =>
+ generateSimpleRelationFromAward(funder, "sgov________", a => a)
+ case "10.13039/501100004564" =>
+ generateSimpleRelationFromAward(funder, "mestd_______", extractECAward)
+ case "10.13039/501100003407" =>
+ generateSimpleRelationFromAward(funder, "miur________", a => a)
+ val targetId = getProjectId("miur________", "1e5e62235d094afd01cd56e65112fc63")
+ queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+ queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+ case "10.13039/501100006588" | "10.13039/501100004488" =>
+ generateSimpleRelationFromAward(
+ funder,
+ "irb_hr______",
+ a => a.replaceAll("Project No.", "").replaceAll("HRZZ-", "")
+ )
+ case "10.13039/501100006769" =>
+ generateSimpleRelationFromAward(funder, "rsf_________", a => a)
+ case "10.13039/501100001711" =>
+ generateSimpleRelationFromAward(funder, "snsf________", snsfRule)
+ case "10.13039/501100004410" =>
+ generateSimpleRelationFromAward(funder, "tubitakf____", a => a)
+ case "10.10.13039/100004440" =>
+ generateSimpleRelationFromAward(funder, "wt__________", a => a)
+ case "10.13039/100004440" =>
+ val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
+ queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+ queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
- case _ => logger.debug("no match for "+funder.DOI.get )
+ case _ => logger.debug("no match for " + funder.DOI.get)
+ }
+ } else {
+ funder.name match {
+ case "European Union’s Horizon 2020 research and innovation program" =>
+ generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
+ case "European Union's" =>
+ generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
+ generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
+ case "The French National Research Agency (ANR)" | "The French National Research Agency" =>
+ generateSimpleRelationFromAward(funder, "anr_________", a => a)
+ case "CONICYT, Programa de Formación de Capital Humano Avanzado" =>
+ generateSimpleRelationFromAward(funder, "conicytf____", extractECAward)
+ case "Wellcome Trust Masters Fellowship" =>
+ val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
+ queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+ queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+ case _ => logger.debug("no match for " + funder.name)
+
+ }
}
-
- } else {
- funder.name match {
- case "European Union’s Horizon 2020 research and innovation program" => generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
- case "European Union's" =>
- generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
- generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
- case "The French National Research Agency (ANR)" |
- "The French National Research Agency" => generateSimpleRelationFromAward(funder, "anr_________", a => a)
- case "CONICYT, Programa de Formación de Capital Humano Avanzado" => generateSimpleRelationFromAward(funder, "conicytf____", extractECAward)
- case "Wellcome Trust Masters Fellowship" => val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
- queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY )
- queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES )
- case _ => logger.debug("no match for "+funder.name )
-
- }
- }
-
- }
- )
+ })
queue.toList
}
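
Note: most branches of the funder match above follow the same recipe: pull a grant code out of the award string, hash it, and emit an IS_PRODUCED_BY / PRODUCES pair of relations against a "40|<nsPrefix>::<md5>" project identifier. A dependency-free sketch of the identifier construction; md5 here is a stand-in for DHPUtils.md5 and is assumed to be the usual hex-encoded MD5 digest, and the object name is illustrative:

import java.security.MessageDigest

object AwardProjectIdSketch {

  private val awardECRegex = "[0-9]{4,9}".r

  // Stand-in for DHPUtils.md5 (assumed: lower-case hex MD5 of the award code).
  private def md5(s: String): String =
    MessageDigest.getInstance("MD5").digest(s.getBytes("UTF-8")).map("%02x".format(_)).mkString

  // Same shape as extractECAward: keep only the 4-9 digit grant number, if any.
  def extractECAward(award: String): Option[String] = awardECRegex.findFirstIn(award)

  // Same shape as getProjectId: "40|<namespace prefix>::<md5 of the award code>".
  def projectId(nsPrefix: String, award: String): String = s"40|$nsPrefix::${md5(award)}"

  def main(args: Array[String]): Unit =
    extractECAward("H2020 grant agreement 654321")
      .foreach(code => println(projectId("corda__h2020", code)))
}
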
@@ -423,33 +532,31 @@ case object Crossref2Oaf {
// TODO check if there are other info to map into the Dataset
}
-
def convertPublication(publication: Publication, json: JValue, cobjCategory: String): Unit = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
- val containerTitles = for {JString(ct) <- json \ "container-title"} yield ct
-
+ val containerTitles = for { JString(ct) <- json \ "container-title" } yield ct
//Mapping book
if (cobjCategory.toLowerCase.contains("book")) {
- val ISBN = for {JString(isbn) <- json \ "ISBN"} yield isbn
+ val ISBN = for { JString(isbn) <- json \ "ISBN" } yield isbn
if (ISBN.nonEmpty && containerTitles.nonEmpty) {
val source = s"${containerTitles.head} ISBN: ${ISBN.head}"
if (publication.getSource != null) {
val l: List[Field[String]] = publication.getSource.asScala.toList
val ll: List[Field[String]] = l ::: List(asField(source))
publication.setSource(ll.asJava)
- }
- else
+ } else
publication.setSource(List(asField(source)).asJava)
}
} else {
// Mapping Journal
- val issnInfos = for {JArray(issn_types) <- json \ "issn-type"
- JObject(issn_type) <- issn_types
- JField("type", JString(tp)) <- issn_type
- JField("value", JString(vl)) <- issn_type
- } yield Tuple2(tp, vl)
+ val issnInfos = for {
+ JArray(issn_types) <- json \ "issn-type"
+ JObject(issn_type) <- issn_types
+ JField("type", JString(tp)) <- issn_type
+ JField("value", JString(vl)) <- issn_type
+ } yield Tuple2(tp, vl)
val volume = (json \ "volume").extractOrElse[String](null)
if (containerTitles.nonEmpty) {
@@ -460,7 +567,7 @@ case object Crossref2Oaf {
issnInfos.foreach(tp => {
tp._1 match {
case "electronic" => journal.setIssnOnline(tp._2)
- case "print" => journal.setIssnPrinted(tp._2)
+ case "print" => journal.setIssnPrinted(tp._2)
}
})
}
@@ -494,7 +601,12 @@ case object Crossref2Oaf {
}
- def generateDate(dt: String, datePart: List[List[Int]], classId: String, schemeId: String): StructuredProperty = {
+ def generateDate(
+ dt: String,
+ datePart: List[List[Int]],
+ classId: String,
+ schemeId: String
+ ): StructuredProperty = {
val dp = extractDate(dt, datePart)
if (StringUtils.isNotBlank(dp))
return createSP(dp, classId, schemeId)
diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala
index 6a1c701af..c6e4706d7 100644
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala
@@ -16,7 +16,6 @@ object CrossrefDataset {
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
-
def to_item(input: String): CrossrefDT = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@@ -29,19 +28,24 @@ object CrossrefDataset {
def main(args: Array[String]): Unit = {
-
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ CrossrefDataset.getClass.getResourceAsStream(
+ "/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json"
+ )
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(SparkMapDumpIntoOAF.getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
+ .master(parser.get("master"))
+ .getOrCreate()
import spark.implicits._
-
val crossrefAggregator = new Aggregator[CrossrefDT, CrossrefDT, CrossrefDT] with Serializable {
override def zero: CrossrefDT = null
@@ -52,7 +56,6 @@ object CrossrefDataset {
if (a == null)
return b
-
if (a.timestamp > b.timestamp) {
return a
}
@@ -80,19 +83,24 @@ object CrossrefDataset {
val workingPath: String = parser.get("workingPath")
-
val main_ds: Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT]
-
val update =
- spark.createDataset(spark.sparkContext.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text])
- .map(i => CrossrefImporter.decompressBlob(i._2.toString))
- .map(i => to_item(i)))
+ spark.createDataset(
+ spark.sparkContext
+ .sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text])
+ .map(i => CrossrefImporter.decompressBlob(i._2.toString))
+ .map(i => to_item(i))
+ )
- main_ds.union(update).groupByKey(_.doi)
+ main_ds
+ .union(update)
+ .groupByKey(_.doi)
.agg(crossrefAggregator.toColumn)
.map(s => s._2)
- .write.mode(SaveMode.Overwrite).save(s"$workingPath/crossref_ds_updated")
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingPath/crossref_ds_updated")
}
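
Note for readers following the CrossrefDataset hunks above: per DOI, the record with the newest timestamp wins, implemented with a typed Aggregator. A minimal, self-contained sketch of that pattern follows; CrossrefRecord and LatestByDoiExample are illustrative names, not part of the codebase.

import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Dataset, Encoder, Encoders}

// Illustrative record shape; mirrors the fields referenced above (doi, json, timestamp).
case class CrossrefRecord(doi: String, json: String, timestamp: Long)

object LatestByDoiExample {

  // One output row per DOI; the record carrying the highest timestamp wins.
  def deduplicate(ds: Dataset[CrossrefRecord]): Dataset[CrossrefRecord] = {
    val spark = ds.sparkSession
    import spark.implicits._

    val latestRecord = new Aggregator[CrossrefRecord, CrossrefRecord, CrossrefRecord] with Serializable {
      override def zero: CrossrefRecord = null

      override def reduce(b: CrossrefRecord, a: CrossrefRecord): CrossrefRecord =
        if (b == null) a
        else if (a == null) b
        else if (a.timestamp > b.timestamp) a
        else b

      override def merge(b1: CrossrefRecord, b2: CrossrefRecord): CrossrefRecord = reduce(b1, b2)

      override def finish(reduction: CrossrefRecord): CrossrefRecord = reduction

      override def bufferEncoder: Encoder[CrossrefRecord] = Encoders.product[CrossrefRecord]

      override def outputEncoder: Encoder[CrossrefRecord] = Encoders.product[CrossrefRecord]
    }

    ds.groupByKey(_.doi)
      .agg(latestRecord.toColumn)
      .map(_._2)
  }
}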
diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala
index 6d03abc25..df185910e 100644
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala
@@ -18,7 +18,6 @@ object GenerateCrossrefDataset {
implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT]
-
def crossrefElement(meta: String): CrossrefDT = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(meta)
@@ -30,13 +29,23 @@ object GenerateCrossrefDataset {
def main(args: Array[String]): Unit = {
val conf = new SparkConf
- val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json")).mkString)
+ val parser = new ArgumentApplicationParser(
+ Source
+ .fromInputStream(
+ getClass.getResourceAsStream(
+ "/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json"
+ )
+ )
+ .mkString
+ )
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
- val spark: SparkSession = SparkSession.builder().config(conf)
+ val spark: SparkSession = SparkSession
+ .builder()
+ .config(conf)
.appName(UnpackCrtossrefEntries.getClass.getSimpleName)
.master(master)
.getOrCreate()
@@ -44,12 +53,14 @@ object GenerateCrossrefDataset {
import spark.implicits._
-
val tmp: RDD[String] = sc.textFile(sourcePath, 6000)
- spark.createDataset(tmp)
+ spark
+ .createDataset(tmp)
.map(entry => crossrefElement(entry))
- .write.mode(SaveMode.Overwrite).save(targetPath)
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(targetPath)
// .map(meta => crossrefElement(meta))
// .toDS.as[CrossrefDT]
// .write.mode(SaveMode.Overwrite).save(targetPath)
diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala
index fa55b9fb9..96923f000 100644
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala
@@ -8,7 +8,6 @@ import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
-
case class Reference(author: String, firstPage: String) {}
object SparkMapDumpIntoOAF {
@@ -19,14 +18,21 @@ object SparkMapDumpIntoOAF {
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(SparkMapDumpIntoOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_crossref_dump_to_oaf_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ SparkMapDumpIntoOAF.getClass.getResourceAsStream(
+ "/eu/dnetlib/dhp/doiboost/convert_crossref_dump_to_oaf_params.json"
+ )
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(SparkMapDumpIntoOAF.getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
+ .master(parser.get("master"))
+ .getOrCreate()
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
@@ -35,19 +41,34 @@ object SparkMapDumpIntoOAF {
val targetPath = parser.get("targetPath")
- spark.read.load(parser.get("sourcePath")).as[CrossrefDT]
+ spark.read
+ .load(parser.get("sourcePath"))
+ .as[CrossrefDT]
.flatMap(k => Crossref2Oaf.convert(k.json))
.filter(o => o != null)
- .write.mode(SaveMode.Overwrite).save(s"$targetPath/mixObject")
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$targetPath/mixObject")
- val ds:Dataset[Oaf] = spark.read.load(s"$targetPath/mixObject").as[Oaf]
+ val ds: Dataset[Oaf] = spark.read.load(s"$targetPath/mixObject").as[Oaf]
- ds.filter(o => o.isInstanceOf[Publication]).map(o => o.asInstanceOf[Publication]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefPublication")
+ ds.filter(o => o.isInstanceOf[Publication])
+ .map(o => o.asInstanceOf[Publication])
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$targetPath/crossrefPublication")
- ds.filter(o => o.isInstanceOf[Relation]).map(o => o.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefRelation")
+ ds.filter(o => o.isInstanceOf[Relation])
+ .map(o => o.asInstanceOf[Relation])
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$targetPath/crossrefRelation")
- ds.filter(o => o.isInstanceOf[OafDataset]).map(o => o.asInstanceOf[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefDataset")
+ ds.filter(o => o.isInstanceOf[OafDataset])
+ .map(o => o.asInstanceOf[OafDataset])
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$targetPath/crossrefDataset")
}
-
}
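
The three filter/asInstanceOf/write chains above split one kryo-encoded Dataset[Oaf] into per-type outputs. A hedged sketch of how that idiom can be generalised is shown below; the OafLike/PublicationLike stand-ins and the writeSubset helper are hypothetical, the real classes live in the dhp-schemas module.

import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode}
import scala.reflect.ClassTag

// Minimal stand-ins for the OAF hierarchy; purely illustrative.
trait OafLike extends Serializable
class PublicationLike extends OafLike
class RelationLike extends OafLike

object SplitByTypeExample {

  // Keeps only entities of runtime type T and writes them to their own path.
  def writeSubset[T <: OafLike: ClassTag](ds: Dataset[OafLike], path: String): Unit = {
    implicit val enc: Encoder[T] = Encoders.kryo[T]
    val cls = implicitly[ClassTag[T]].runtimeClass
    ds.filter(o => cls.isInstance(o))
      .map(o => o.asInstanceOf[T])
      .write
      .mode(SaveMode.Overwrite)
      .save(path)
  }
}

// Usage, mirroring the writes above:
//   writeSubset[PublicationLike](ds, s"$targetPath/crossrefPublication")
//   writeSubset[RelationLike](ds, s"$targetPath/crossrefRelation")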
diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala
index 191c4587e..3fea9695c 100644
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala
@@ -16,7 +16,6 @@ object UnpackCrtossrefEntries {
val log: Logger = LoggerFactory.getLogger(UnpackCrtossrefEntries.getClass)
-
def extractDump(input: String): List[String] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
@@ -24,28 +23,36 @@ object UnpackCrtossrefEntries {
val a = (json \ "items").extract[JArray]
a.arr.map(s => compact(render(s)))
-
}
-
def main(args: Array[String]): Unit = {
val conf = new SparkConf
- val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json")).mkString)
+ val parser = new ArgumentApplicationParser(
+ Source
+ .fromInputStream(
+ getClass.getResourceAsStream(
+ "/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json"
+ )
+ )
+ .mkString
+ )
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
- val spark: SparkSession = SparkSession.builder().config(conf)
+ val spark: SparkSession = SparkSession
+ .builder()
+ .config(conf)
.appName(UnpackCrtossrefEntries.getClass.getSimpleName)
.master(master)
.getOrCreate()
val sc: SparkContext = spark.sparkContext
- sc.wholeTextFiles(sourcePath, 6000).flatMap(d => extractDump(d._2))
+ sc.wholeTextFiles(sourcePath, 6000)
+ .flatMap(d => extractDump(d._2))
.saveAsTextFile(targetPath, classOf[GzipCodec])
-
}
}
diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala
index 0a6fa00f0..18ba864ce 100644
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala
@@ -1,6 +1,5 @@
package eu.dnetlib.doiboost.mag
-
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty}
@@ -14,59 +13,134 @@ import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.util.matching.Regex
-
-case class MagPapers(PaperId: Long, Rank: Integer, Doi: String,
- DocType: String, PaperTitle: String, OriginalTitle: String,
- BookTitle: String, Year: Option[Integer], Date: Option[java.sql.Timestamp], Publisher: String,
- JournalId: Option[Long], ConferenceSeriesId: Option[Long], ConferenceInstanceId: Option[Long],
- Volume: String, Issue: String, FirstPage: String, LastPage: String,
- ReferenceCount: Option[Long], CitationCount: Option[Long], EstimatedCitation: Option[Long],
- OriginalVenue: String, FamilyId: Option[Long], CreatedDate: java.sql.Timestamp) {}
-
+case class MagPapers(
+ PaperId: Long,
+ Rank: Integer,
+ Doi: String,
+ DocType: String,
+ PaperTitle: String,
+ OriginalTitle: String,
+ BookTitle: String,
+ Year: Option[Integer],
+ Date: Option[java.sql.Timestamp],
+ Publisher: String,
+ JournalId: Option[Long],
+ ConferenceSeriesId: Option[Long],
+ ConferenceInstanceId: Option[Long],
+ Volume: String,
+ Issue: String,
+ FirstPage: String,
+ LastPage: String,
+ ReferenceCount: Option[Long],
+ CitationCount: Option[Long],
+ EstimatedCitation: Option[Long],
+ OriginalVenue: String,
+ FamilyId: Option[Long],
+ CreatedDate: java.sql.Timestamp
+) {}
case class MagPaperAbstract(PaperId: Long, IndexedAbstract: String) {}
-case class MagAuthor(AuthorId: Long, Rank: Option[Int], NormalizedName: Option[String], DisplayName: Option[String], LastKnownAffiliationId: Option[Long], PaperCount: Option[Long], CitationCount: Option[Long], CreatedDate: Option[java.sql.Timestamp]) {}
+case class MagAuthor(
+ AuthorId: Long,
+ Rank: Option[Int],
+ NormalizedName: Option[String],
+ DisplayName: Option[String],
+ LastKnownAffiliationId: Option[Long],
+ PaperCount: Option[Long],
+ CitationCount: Option[Long],
+ CreatedDate: Option[java.sql.Timestamp]
+) {}
-case class MagAffiliation(AffiliationId: Long, Rank: Int, NormalizedName: String, DisplayName: String, GridId: String, OfficialPage: String, WikiPage: String, PaperCount: Long, CitationCount: Long, Latitude: Option[Float], Longitude: Option[Float], CreatedDate: java.sql.Timestamp) {}
+case class MagAffiliation(
+ AffiliationId: Long,
+ Rank: Int,
+ NormalizedName: String,
+ DisplayName: String,
+ GridId: String,
+ OfficialPage: String,
+ WikiPage: String,
+ PaperCount: Long,
+ CitationCount: Long,
+ Latitude: Option[Float],
+ Longitude: Option[Float],
+ CreatedDate: java.sql.Timestamp
+) {}
-case class MagPaperAuthorAffiliation(PaperId: Long, AuthorId: Long, AffiliationId: Option[Long], AuthorSequenceNumber: Int, OriginalAuthor: String, OriginalAffiliation: String) {}
+case class MagPaperAuthorAffiliation(
+ PaperId: Long,
+ AuthorId: Long,
+ AffiliationId: Option[Long],
+ AuthorSequenceNumber: Int,
+ OriginalAuthor: String,
+ OriginalAffiliation: String
+) {}
-
-case class MagAuthorAffiliation(author: MagAuthor, affiliation:String, sequenceNumber:Int)
+case class MagAuthorAffiliation(author: MagAuthor, affiliation: String, sequenceNumber: Int)
case class MagPaperWithAuthorList(PaperId: Long, authors: List[MagAuthorAffiliation]) {}
-case class MagPaperAuthorDenormalized(PaperId: Long, author: MagAuthor, affiliation:String, sequenceNumber:Int) {}
+case class MagPaperAuthorDenormalized(
+ PaperId: Long,
+ author: MagAuthor,
+ affiliation: String,
+ sequenceNumber: Int
+) {}
-case class MagPaperUrl(PaperId: Long, SourceType: Option[Int], SourceUrl: Option[String], LanguageCode: Option[String]) {}
+case class MagPaperUrl(
+ PaperId: Long,
+ SourceType: Option[Int],
+ SourceUrl: Option[String],
+ LanguageCode: Option[String]
+) {}
-case class MagUrlInstance(SourceUrl:String){}
+case class MagUrlInstance(SourceUrl: String) {}
case class MagUrl(PaperId: Long, instances: List[MagUrlInstance])
-case class MagSubject(FieldOfStudyId:Long, DisplayName:String, MainType:Option[String], Score:Float){}
+case class MagSubject(
+ FieldOfStudyId: Long,
+ DisplayName: String,
+ MainType: Option[String],
+ Score: Float
+) {}
-case class MagFieldOfStudy(PaperId:Long, subjects:List[MagSubject]) {}
+case class MagFieldOfStudy(PaperId: Long, subjects: List[MagSubject]) {}
-case class MagJournal(JournalId: Long, Rank: Option[Int], NormalizedName: Option[String], DisplayName: Option[String], Issn: Option[String], Publisher: Option[String], Webpage: Option[String], PaperCount: Option[Long], CitationCount: Option[Long], CreatedDate: Option[java.sql.Timestamp]) {}
+case class MagJournal(
+ JournalId: Long,
+ Rank: Option[Int],
+ NormalizedName: Option[String],
+ DisplayName: Option[String],
+ Issn: Option[String],
+ Publisher: Option[String],
+ Webpage: Option[String],
+ PaperCount: Option[Long],
+ CitationCount: Option[Long],
+ CreatedDate: Option[java.sql.Timestamp]
+) {}
-
-case class MagConferenceInstance(ci:Long, DisplayName:Option[String], Location:Option[String], StartDate:Option[java.sql.Timestamp], EndDate:Option[java.sql.Timestamp], PaperId:Long){}
+case class MagConferenceInstance(
+ ci: Long,
+ DisplayName: Option[String],
+ Location: Option[String],
+ StartDate: Option[java.sql.Timestamp],
+ EndDate: Option[java.sql.Timestamp],
+ PaperId: Long
+) {}
case object ConversionUtil {
- def extractMagIdentifier(pids:mutable.Buffer[String]) :String ={
+ def extractMagIdentifier(pids: mutable.Buffer[String]): String = {
val magIDRegex: Regex = "^[0-9]+$".r
- val s =pids.filter(p=> magIDRegex.findAllIn(p).hasNext)
+ val s = pids.filter(p => magIDRegex.findAllIn(p).hasNext)
if (s.nonEmpty)
return s.head
null
}
-
- def mergePublication(a: Publication, b:Publication) : Publication = {
+ def mergePublication(a: Publication, b: Publication): Publication = {
if ((a != null) && (b != null)) {
a.mergeFrom(b)
a
@@ -74,10 +148,9 @@ case object ConversionUtil {
if (a == null) b else a
}
-
}
- def choiceLatestMagArtitcle(p1: MagPapers, p2:MagPapers) :MagPapers = {
+ def choiceLatestMagArtitcle(p1: MagPapers, p2: MagPapers): MagPapers = {
var r = if (p1 == null) p2 else p1
if (p1 != null && p2 != null) {
if (p1.CreatedDate != null && p2.CreatedDate != null) {
@@ -93,8 +166,9 @@ case object ConversionUtil {
}
-
- def updatePubsWithDescription(inputItem:((String, Publication), MagPaperAbstract)) : Publication = {
+ def updatePubsWithDescription(
+ inputItem: ((String, Publication), MagPaperAbstract)
+ ): Publication = {
val pub = inputItem._1._2
val abst = inputItem._2
if (abst != null) {
@@ -104,20 +178,22 @@ case object ConversionUtil {
}
+ def updatePubsWithConferenceInfo(
+ inputItem: ((String, Publication), MagConferenceInstance)
+ ): Publication = {
+ val publication: Publication = inputItem._1._2
+ val ci: MagConferenceInstance = inputItem._2
- def updatePubsWithConferenceInfo(inputItem:((String, Publication), MagConferenceInstance)) : Publication = {
- val publication:Publication= inputItem._1._2
- val ci:MagConferenceInstance = inputItem._2
+ if (ci != null) {
- if (ci!= null){
-
- val j:Journal = new Journal
+ val j: Journal = new Journal
if (ci.Location.isDefined)
j.setConferenceplace(ci.Location.get)
j.setName(ci.DisplayName.get)
- if (ci.StartDate.isDefined && ci.EndDate.isDefined)
- {
- j.setConferencedate(s"${ci.StartDate.get.toString.substring(0,10)} - ${ci.EndDate.get.toString.substring(0,10)}")
+ if (ci.StartDate.isDefined && ci.EndDate.isDefined) {
+ j.setConferencedate(
+ s"${ci.StartDate.get.toString.substring(0, 10)} - ${ci.EndDate.get.toString.substring(0, 10)}"
+ )
}
publication.setJournal(j)
@@ -125,7 +201,7 @@ case object ConversionUtil {
publication
}
- def updatePubsWithSubject(item:((String, Publication), MagFieldOfStudy)) : Publication = {
+ def updatePubsWithSubject(item: ((String, Publication), MagFieldOfStudy)): Publication = {
val publication = item._1._2
val fieldOfStudy = item._2
@@ -135,16 +211,34 @@ case object ConversionUtil {
val classid = "MAG"
val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap(s => {
- val s1 = createSP(s.DisplayName, classid,className, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES)
+ val s1 = createSP(
+ s.DisplayName,
+ classid,
+ className,
+ ModelConstants.DNET_SUBJECT_TYPOLOGIES,
+ ModelConstants.DNET_SUBJECT_TYPOLOGIES
+ )
val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString)
var resList: List[StructuredProperty] = List(s1)
if (s.MainType.isDefined) {
val maintp = s.MainType.get
- val s2 = createSP(s.MainType.get, classid,className, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES)
+ val s2 = createSP(
+ s.MainType.get,
+ classid,
+ className,
+ ModelConstants.DNET_SUBJECT_TYPOLOGIES,
+ ModelConstants.DNET_SUBJECT_TYPOLOGIES
+ )
s2.setDataInfo(di)
resList = resList ::: List(s2)
if (maintp.contains(".")) {
- val s3 = createSP(maintp.split("\\.").head, classid,className, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES)
+ val s3 = createSP(
+ maintp.split("\\.").head,
+ classid,
+ className,
+ ModelConstants.DNET_SUBJECT_TYPOLOGIES,
+ ModelConstants.DNET_SUBJECT_TYPOLOGIES
+ )
s3.setDataInfo(di)
resList = resList ::: List(s3)
}
@@ -156,25 +250,27 @@ case object ConversionUtil {
publication
}
-
-
def addInstances(a: (Publication, MagUrl)): Publication = {
val pub = a._1
val urls = a._2
-
-
val i = new Instance
+ if (urls != null) {
- if (urls!= null) {
-
- val l:List[String] = urls.instances.filter(k=>k.SourceUrl.nonEmpty).map(k=>k.SourceUrl):::List(s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}")
+ val l: List[String] = urls.instances
+ .filter(k => k.SourceUrl.nonEmpty)
+ .map(k => k.SourceUrl) ::: List(
+ s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}"
+ )
i.setUrl(l.asJava)
- }
- else
- i.setUrl(List(s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}").asJava)
+ } else
+ i.setUrl(
+ List(
+ s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}"
+ ).asJava
+ )
// Ticket #6281 added pid to Instance
i.setPid(pub.getPid)
@@ -184,13 +280,13 @@ case object ConversionUtil {
pub
}
-
def transformPaperAbstract(input: MagPaperAbstract): MagPaperAbstract = {
MagPaperAbstract(input.PaperId, convertInvertedIndexString(input.IndexedAbstract))
}
-
- def createOAFFromJournalAuthorPaper(inputParams: ((MagPapers, MagJournal), MagPaperWithAuthorList)): Publication = {
+ def createOAFFromJournalAuthorPaper(
+ inputParams: ((MagPapers, MagJournal), MagPaperWithAuthorList)
+ ): Publication = {
val paper = inputParams._1._1
val journal = inputParams._1._2
val authors = inputParams._2
@@ -206,31 +302,37 @@ case object ConversionUtil {
pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub))
val mainTitles = createSP(paper.PaperTitle, "main title", ModelConstants.DNET_DATACITE_TITLE)
- val originalTitles = createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
+ val originalTitles =
+ createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
pub.setTitle(List(mainTitles, originalTitles).asJava)
pub.setSource(List(asField(paper.BookTitle)).asJava)
val authorsOAF = authors.authors.map { f: MagAuthorAffiliation =>
-
val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author
a.setRank(f.sequenceNumber)
if (f.author.DisplayName.isDefined)
a.setFullname(f.author.DisplayName.get)
- if(f.affiliation!= null)
+ if (f.affiliation != null)
a.setAffiliation(List(asField(f.affiliation)).asJava)
- a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", ModelConstants.DNET_PID_TYPES)).asJava)
+ a.setPid(
+ List(
+ createSP(
+ s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}",
+ "URL",
+ ModelConstants.DNET_PID_TYPES
+ )
+ ).asJava
+ )
a
}
pub.setAuthor(authorsOAF.asJava)
-
if (paper.Date != null && paper.Date.isDefined) {
- pub.setDateofacceptance(asField(paper.Date.get.toString.substring(0,10)))
+ pub.setDateofacceptance(asField(paper.Date.get.toString.substring(0, 10)))
}
pub.setPublisher(asField(paper.Publisher))
-
if (journal != null && journal.DisplayName.isDefined) {
val j = new Journal
@@ -250,8 +352,9 @@ case object ConversionUtil {
pub
}
-
- def createOAF(inputParams: ((MagPapers, MagPaperWithAuthorList), MagPaperAbstract)): Publication = {
+ def createOAF(
+ inputParams: ((MagPapers, MagPaperWithAuthorList), MagPaperAbstract)
+ ): Publication = {
val paper = inputParams._1._1
val authors = inputParams._1._2
@@ -268,46 +371,48 @@ case object ConversionUtil {
pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub))
val mainTitles = createSP(paper.PaperTitle, "main title", ModelConstants.DNET_DATACITE_TITLE)
- val originalTitles = createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
+ val originalTitles =
+ createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
pub.setTitle(List(mainTitles, originalTitles).asJava)
pub.setSource(List(asField(paper.BookTitle)).asJava)
-
if (description != null) {
pub.setDescription(List(asField(description.IndexedAbstract)).asJava)
}
-
val authorsOAF = authors.authors.map { f: MagAuthorAffiliation =>
-
val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author
a.setFullname(f.author.DisplayName.get)
- if(f.affiliation!= null)
+ if (f.affiliation != null)
a.setAffiliation(List(asField(f.affiliation)).asJava)
-
- a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", ModelConstants.DNET_PID_TYPES)).asJava)
+ a.setPid(
+ List(
+ createSP(
+ s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}",
+ "URL",
+ ModelConstants.DNET_PID_TYPES
+ )
+ ).asJava
+ )
a
}
-
if (paper.Date != null) {
- pub.setDateofacceptance(asField(paper.Date.toString.substring(0,10)))
+ pub.setDateofacceptance(asField(paper.Date.toString.substring(0, 10)))
}
pub.setAuthor(authorsOAF.asJava)
-
pub
}
-
def convertInvertedIndexString(json_input: String): String = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(json_input)
@@ -317,13 +422,13 @@ case object ConversionUtil {
val iid = (json \ "InvertedIndex").extract[Map[String, List[Int]]]
- for {(k: String, v: List[Int]) <- iid} {
+ for { (k: String, v: List[Int]) <- iid } {
v.foreach(item => res(item) = k)
}
- (0 until idl).foreach(i => {
- if (res(i) == null)
- res(i) = ""
- })
+ (0 until idl).foreach(i => {
+ if (res(i) == null)
+ res(i) = ""
+ })
return res.mkString(" ")
}
""
diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala
index 039c935f3..316bd91ac 100644
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala
@@ -8,44 +8,245 @@ import org.apache.spark.sql.{SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
object SparkImportMagIntoDataset {
+
val datatypedict = Map(
- "bool" -> BooleanType,
- "int" -> IntegerType,
- "uint" -> IntegerType,
- "long" -> LongType,
- "ulong" -> LongType,
- "float" -> FloatType,
- "string" -> StringType,
+ "bool" -> BooleanType,
+ "int" -> IntegerType,
+ "uint" -> IntegerType,
+ "long" -> LongType,
+ "ulong" -> LongType,
+ "float" -> FloatType,
+ "string" -> StringType,
"DateTime" -> DateType
)
-
val stream = Map(
- "Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Iso3166Code:string", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
- "AuthorExtendedAttributes" -> Tuple2("mag/AuthorExtendedAttributes.txt", Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")),
- "Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
- "ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
- "ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
- "EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")),
- "FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")),
- "FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")),
- "FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
- "Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
- "PaperAbstractsInvertedIndex" -> Tuple2("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", "IndexedAbstract:string")),
- "PaperAuthorAffiliations" -> Tuple2("mag/PaperAuthorAffiliations.txt", Seq("PaperId:long", "AuthorId:long", "AffiliationId:long?", "AuthorSequenceNumber:uint", "OriginalAuthor:string", "OriginalAffiliation:string")),
- "PaperCitationContexts" -> Tuple2("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")),
- "PaperExtendedAttributes" -> Tuple2("mag/PaperExtendedAttributes.txt", Seq("PaperId:long", "AttributeType:int", "AttributeValue:string")),
- "PaperFieldsOfStudy" -> Tuple2("advanced/PaperFieldsOfStudy.txt", Seq("PaperId:long", "FieldOfStudyId:long", "Score:float")),
- "PaperMeSH" -> Tuple2("advanced/PaperMeSH.txt", Seq("PaperId:long", "DescriptorUI:string", "DescriptorName:string", "QualifierUI:string", "QualifierName:string", "IsMajorTopic:bool")),
- "PaperRecommendations" -> Tuple2("advanced/PaperRecommendations.txt", Seq("PaperId:long", "RecommendedPaperId:long", "Score:float")),
- "PaperReferences" -> Tuple2("mag/PaperReferences.txt", Seq("PaperId:long", "PaperReferenceId:long")),
- "PaperResources" -> Tuple2("mag/PaperResources.txt", Seq("PaperId:long", "ResourceType:int", "ResourceUrl:string", "SourceUrl:string", "RelationshipType:int")),
- "PaperUrls" -> Tuple2("mag/PaperUrls.txt", Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")),
- "Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "OnlineDate:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "FamilyRank:uint?", "DocSubTypes:string", "CreatedDate:DateTime")),
- "RelatedFieldOfStudy" -> Tuple2("advanced/RelatedFieldOfStudy.txt", Seq("FieldOfStudyId1:long", "Type1:string", "FieldOfStudyId2:long", "Type2:string", "Rank:float"))
+ "Affiliations" -> Tuple2(
+ "mag/Affiliations.txt",
+ Seq(
+ "AffiliationId:long",
+ "Rank:uint",
+ "NormalizedName:string",
+ "DisplayName:string",
+ "GridId:string",
+ "OfficialPage:string",
+ "WikiPage:string",
+ "PaperCount:long",
+ "PaperFamilyCount:long",
+ "CitationCount:long",
+ "Iso3166Code:string",
+ "Latitude:float?",
+ "Longitude:float?",
+ "CreatedDate:DateTime"
+ )
+ ),
+ "AuthorExtendedAttributes" -> Tuple2(
+ "mag/AuthorExtendedAttributes.txt",
+ Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")
+ ),
+ "Authors" -> Tuple2(
+ "mag/Authors.txt",
+ Seq(
+ "AuthorId:long",
+ "Rank:uint",
+ "NormalizedName:string",
+ "DisplayName:string",
+ "LastKnownAffiliationId:long?",
+ "PaperCount:long",
+ "PaperFamilyCount:long",
+ "CitationCount:long",
+ "CreatedDate:DateTime"
+ )
+ ),
+ "ConferenceInstances" -> Tuple2(
+ "mag/ConferenceInstances.txt",
+ Seq(
+ "ConferenceInstanceId:long",
+ "NormalizedName:string",
+ "DisplayName:string",
+ "ConferenceSeriesId:long",
+ "Location:string",
+ "OfficialUrl:string",
+ "StartDate:DateTime?",
+ "EndDate:DateTime?",
+ "AbstractRegistrationDate:DateTime?",
+ "SubmissionDeadlineDate:DateTime?",
+ "NotificationDueDate:DateTime?",
+ "FinalVersionDueDate:DateTime?",
+ "PaperCount:long",
+ "PaperFamilyCount:long",
+ "CitationCount:long",
+ "Latitude:float?",
+ "Longitude:float?",
+ "CreatedDate:DateTime"
+ )
+ ),
+ "ConferenceSeries" -> Tuple2(
+ "mag/ConferenceSeries.txt",
+ Seq(
+ "ConferenceSeriesId:long",
+ "Rank:uint",
+ "NormalizedName:string",
+ "DisplayName:string",
+ "PaperCount:long",
+ "PaperFamilyCount:long",
+ "CitationCount:long",
+ "CreatedDate:DateTime"
+ )
+ ),
+ "EntityRelatedEntities" -> Tuple2(
+ "advanced/EntityRelatedEntities.txt",
+ Seq(
+ "EntityId:long",
+ "EntityType:string",
+ "RelatedEntityId:long",
+ "RelatedEntityType:string",
+ "RelatedType:int",
+ "Score:float"
+ )
+ ),
+ "FieldOfStudyChildren" -> Tuple2(
+ "advanced/FieldOfStudyChildren.txt",
+ Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")
+ ),
+ "FieldOfStudyExtendedAttributes" -> Tuple2(
+ "advanced/FieldOfStudyExtendedAttributes.txt",
+ Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")
+ ),
+ "FieldsOfStudy" -> Tuple2(
+ "advanced/FieldsOfStudy.txt",
+ Seq(
+ "FieldOfStudyId:long",
+ "Rank:uint",
+ "NormalizedName:string",
+ "DisplayName:string",
+ "MainType:string",
+ "Level:int",
+ "PaperCount:long",
+ "PaperFamilyCount:long",
+ "CitationCount:long",
+ "CreatedDate:DateTime"
+ )
+ ),
+ "Journals" -> Tuple2(
+ "mag/Journals.txt",
+ Seq(
+ "JournalId:long",
+ "Rank:uint",
+ "NormalizedName:string",
+ "DisplayName:string",
+ "Issn:string",
+ "Publisher:string",
+ "Webpage:string",
+ "PaperCount:long",
+ "PaperFamilyCount:long",
+ "CitationCount:long",
+ "CreatedDate:DateTime"
+ )
+ ),
+ "PaperAbstractsInvertedIndex" -> Tuple2(
+ "nlp/PaperAbstractsInvertedIndex.txt.*",
+ Seq("PaperId:long", "IndexedAbstract:string")
+ ),
+ "PaperAuthorAffiliations" -> Tuple2(
+ "mag/PaperAuthorAffiliations.txt",
+ Seq(
+ "PaperId:long",
+ "AuthorId:long",
+ "AffiliationId:long?",
+ "AuthorSequenceNumber:uint",
+ "OriginalAuthor:string",
+ "OriginalAffiliation:string"
+ )
+ ),
+ "PaperCitationContexts" -> Tuple2(
+ "nlp/PaperCitationContexts.txt",
+ Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")
+ ),
+ "PaperExtendedAttributes" -> Tuple2(
+ "mag/PaperExtendedAttributes.txt",
+ Seq("PaperId:long", "AttributeType:int", "AttributeValue:string")
+ ),
+ "PaperFieldsOfStudy" -> Tuple2(
+ "advanced/PaperFieldsOfStudy.txt",
+ Seq("PaperId:long", "FieldOfStudyId:long", "Score:float")
+ ),
+ "PaperMeSH" -> Tuple2(
+ "advanced/PaperMeSH.txt",
+ Seq(
+ "PaperId:long",
+ "DescriptorUI:string",
+ "DescriptorName:string",
+ "QualifierUI:string",
+ "QualifierName:string",
+ "IsMajorTopic:bool"
+ )
+ ),
+ "PaperRecommendations" -> Tuple2(
+ "advanced/PaperRecommendations.txt",
+ Seq("PaperId:long", "RecommendedPaperId:long", "Score:float")
+ ),
+ "PaperReferences" -> Tuple2(
+ "mag/PaperReferences.txt",
+ Seq("PaperId:long", "PaperReferenceId:long")
+ ),
+ "PaperResources" -> Tuple2(
+ "mag/PaperResources.txt",
+ Seq(
+ "PaperId:long",
+ "ResourceType:int",
+ "ResourceUrl:string",
+ "SourceUrl:string",
+ "RelationshipType:int"
+ )
+ ),
+ "PaperUrls" -> Tuple2(
+ "mag/PaperUrls.txt",
+ Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")
+ ),
+ "Papers" -> Tuple2(
+ "mag/Papers.txt",
+ Seq(
+ "PaperId:long",
+ "Rank:uint",
+ "Doi:string",
+ "DocType:string",
+ "PaperTitle:string",
+ "OriginalTitle:string",
+ "BookTitle:string",
+ "Year:int?",
+ "Date:DateTime?",
+ "OnlineDate:DateTime?",
+ "Publisher:string",
+ "JournalId:long?",
+ "ConferenceSeriesId:long?",
+ "ConferenceInstanceId:long?",
+ "Volume:string",
+ "Issue:string",
+ "FirstPage:string",
+ "LastPage:string",
+ "ReferenceCount:long",
+ "CitationCount:long",
+ "EstimatedCitation:long",
+ "OriginalVenue:string",
+ "FamilyId:long?",
+ "FamilyRank:uint?",
+ "DocSubTypes:string",
+ "CreatedDate:DateTime"
+ )
+ ),
+ "RelatedFieldOfStudy" -> Tuple2(
+ "advanced/RelatedFieldOfStudy.txt",
+ Seq(
+ "FieldOfStudyId1:long",
+ "Type1:string",
+ "FieldOfStudyId2:long",
+ "Type2:string",
+ "Rank:float"
+ )
+ )
)
-
def getSchema(streamName: String): StructType = {
var schema = new StructType()
val d: Seq[String] = stream(streamName)._2
@@ -61,19 +262,22 @@ object SparkImportMagIntoDataset {
schema
}
-
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/convert_mag_to_oaf_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/convert_mag_to_oaf_params.json")
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
-
+ .master(parser.get("master"))
+ .getOrCreate()
stream.foreach { case (k, v) =>
val s: StructType = getSchema(k)
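
The datatypedict above feeds getSchema, whose body is only partly visible in this hunk. For reference, here is one self-contained way to turn the "Name:type" descriptors into a Spark StructType; treating a trailing '?' as nullable is an assumption, not something read off this diff.

import org.apache.spark.sql.types._

object MagSchemaExample {

  // Same dictionary as above, translating MAG column types into Spark SQL types.
  val datatypedict = Map(
    "bool"     -> BooleanType,
    "int"      -> IntegerType,
    "uint"     -> IntegerType,
    "long"     -> LongType,
    "ulong"    -> LongType,
    "float"    -> FloatType,
    "string"   -> StringType,
    "DateTime" -> DateType
  )

  // Builds a StructType from "Name:type" tokens; a trailing '?' marks the column nullable.
  def schemaFor(columns: Seq[String]): StructType =
    columns.foldLeft(new StructType()) { (schema, column) =>
      val Array(name, tpe) = column.split(":", 2)
      val nullable = tpe.endsWith("?")
      schema.add(StructField(name, datatypedict(tpe.stripSuffix("?")), nullable))
    }

  def main(args: Array[String]): Unit = {
    val s = schemaFor(Seq("PaperId:long", "Doi:string", "Year:int?"))
    s.printTreeString()
  }
}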
diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala
index 41e95baa1..eae669853 100644
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala
@@ -9,6 +9,7 @@ import org.apache.spark.sql.functions.{col, collect_list, struct}
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
+
object SparkProcessMAG {
def getDistinctResults(d: Dataset[MagPapers]): Dataset[MagPapers] = {
@@ -17,13 +18,31 @@ object SparkProcessMAG {
.reduceGroups((p1: MagPapers, p2: MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1, p2))
.map(_._2)(Encoders.product[MagPapers])
.map(mp => {
- MagPapers(mp.PaperId, mp.Rank, DoiBoostMappingUtil.normalizeDoi(mp.Doi),
- mp.DocType, mp.PaperTitle, mp.OriginalTitle,
- mp.BookTitle, mp.Year, mp.Date, mp.Publisher: String,
- mp.JournalId, mp.ConferenceSeriesId, mp.ConferenceInstanceId,
- mp.Volume, mp.Issue, mp.FirstPage, mp.LastPage,
- mp.ReferenceCount, mp.CitationCount, mp.EstimatedCitation,
- mp.OriginalVenue, mp.FamilyId, mp.CreatedDate)
+ MagPapers(
+ mp.PaperId,
+ mp.Rank,
+ DoiBoostMappingUtil.normalizeDoi(mp.Doi),
+ mp.DocType,
+ mp.PaperTitle,
+ mp.OriginalTitle,
+ mp.BookTitle,
+ mp.Year,
+ mp.Date,
+ mp.Publisher: String,
+ mp.JournalId,
+ mp.ConferenceSeriesId,
+ mp.ConferenceInstanceId,
+ mp.Volume,
+ mp.Issue,
+ mp.FirstPage,
+ mp.LastPage,
+ mp.ReferenceCount,
+ mp.CitationCount,
+ mp.EstimatedCitation,
+ mp.OriginalVenue,
+ mp.FamilyId,
+ mp.CreatedDate
+ )
})(Encoders.product[MagPapers])
}
@@ -31,22 +50,29 @@ object SparkProcessMAG {
val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json")
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
+ .master(parser.get("master"))
+ .getOrCreate()
val sourcePath = parser.get("sourcePath")
val workingPath = parser.get("workingPath")
val targetPath = parser.get("targetPath")
import spark.implicits._
- implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication]
- implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
+ implicit val mapEncoderPubs: Encoder[Publication] =
+ org.apache.spark.sql.Encoders.kryo[Publication]
+ implicit val tupleForJoinEncoder: Encoder[(String, Publication)] =
+ Encoders.tuple(Encoders.STRING, mapEncoderPubs)
logger.info("Phase 1) make uninue DOI in Papers:")
val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers]
@@ -58,16 +84,23 @@ object SparkProcessMAG {
logger.info("Phase 0) Enrich Publication with description")
val pa = spark.read.load(s"$sourcePath/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
- pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"$workingPath/PaperAbstract")
+ pa.map(ConversionUtil.transformPaperAbstract)
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingPath/PaperAbstract")
logger.info("Phase 3) Group Author by PaperId")
val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]
val affiliation = spark.read.load(s"$sourcePath/Affiliations").as[MagAffiliation]
- val paperAuthorAffiliation = spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation]
+ val paperAuthorAffiliation =
+ spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation]
- paperAuthorAffiliation.joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId")))
- .map { case (a: MagPaperAuthorAffiliation, b: MagAuthor) => (a.AffiliationId, MagPaperAuthorDenormalized(a.PaperId, b, null, a.AuthorSequenceNumber)) }
+ paperAuthorAffiliation
+ .joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId")))
+ .map { case (a: MagPaperAuthorAffiliation, b: MagAuthor) =>
+ (a.AffiliationId, MagPaperAuthorDenormalized(a.PaperId, b, null, a.AuthorSequenceNumber))
+ }
.joinWith(affiliation, affiliation("AffiliationId").equalTo(col("_1")), "left")
.map(s => {
val mpa = s._1._2
@@ -76,79 +109,133 @@ object SparkProcessMAG {
MagPaperAuthorDenormalized(mpa.PaperId, mpa.author, af.DisplayName, mpa.sequenceNumber)
} else
mpa
- }).groupBy("PaperId").agg(collect_list(struct($"author", $"affiliation", $"sequenceNumber")).as("authors"))
- .write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_1_paper_authors")
+ })
+ .groupBy("PaperId")
+ .agg(collect_list(struct($"author", $"affiliation", $"sequenceNumber")).as("authors"))
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingPath/merge_step_1_paper_authors")
- logger.info("Phase 4) create First Version of publication Entity with Paper Journal and Authors")
+ logger.info(
+ "Phase 4) create First Version of publication Entity with Paper Journal and Authors"
+ )
val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal]
- val papers = spark.read.load((s"$workingPath/Papers_distinct")).as[MagPapers]
+ val papers = spark.read.load(s"$workingPath/Papers_distinct").as[MagPapers]
- val paperWithAuthors = spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
+ val paperWithAuthors =
+ spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
- val firstJoin = papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
- firstJoin.joinWith(paperWithAuthors, firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")), "left")
+ val firstJoin =
+ papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
+ firstJoin
+ .joinWith(
+ paperWithAuthors,
+ firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")),
+ "left"
+ )
.map { a => ConversionUtil.createOAFFromJournalAuthorPaper(a) }
- .write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_2")
-
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingPath/merge_step_2")
var magPubs: Dataset[(String, Publication)] =
- spark.read.load(s"$workingPath/merge_step_2").as[Publication]
- .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
+ spark.read
+ .load(s"$workingPath/merge_step_2")
+ .as[Publication]
+ .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
+ .as[(String, Publication)]
+ val conference = spark.read
+ .load(s"$sourcePath/ConferenceInstances")
+ .select(
+ $"ConferenceInstanceId".as("ci"),
+ $"DisplayName",
+ $"Location",
+ $"StartDate",
+ $"EndDate"
+ )
+ val conferenceInstance = conference
+ .joinWith(papers, papers("ConferenceInstanceId").equalTo(conference("ci")))
+ .select(
+ $"_1.ci",
+ $"_1.DisplayName",
+ $"_1.Location",
+ $"_1.StartDate",
+ $"_1.EndDate",
+ $"_2.PaperId"
+ )
+ .as[MagConferenceInstance]
- val conference = spark.read.load(s"$sourcePath/ConferenceInstances")
- .select($"ConferenceInstanceId".as("ci"), $"DisplayName", $"Location", $"StartDate", $"EndDate")
- val conferenceInstance = conference.joinWith(papers, papers("ConferenceInstanceId").equalTo(conference("ci")))
- .select($"_1.ci", $"_1.DisplayName", $"_1.Location", $"_1.StartDate", $"_1.EndDate", $"_2.PaperId").as[MagConferenceInstance]
-
-
- magPubs.joinWith(conferenceInstance, col("_1").equalTo(conferenceInstance("PaperId")), "left")
+ magPubs
+ .joinWith(conferenceInstance, col("_1").equalTo(conferenceInstance("PaperId")), "left")
.map(item => ConversionUtil.updatePubsWithConferenceInfo(item))
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/merge_step_3")
+ val paperAbstract = spark.read.load(s"$workingPath/PaperAbstract").as[MagPaperAbstract]
- val paperAbstract = spark.read.load((s"$workingPath/PaperAbstract")).as[MagPaperAbstract]
-
-
- magPubs = spark.read.load(s"$workingPath/merge_step_3").as[Publication]
- .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
-
- magPubs.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
- .map(item => ConversionUtil.updatePubsWithDescription(item)
- ).write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_4")
+ magPubs = spark.read
+ .load(s"$workingPath/merge_step_3")
+ .as[Publication]
+ .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
+ .as[(String, Publication)]
+ magPubs
+ .joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
+ .map(item => ConversionUtil.updatePubsWithDescription(item))
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingPath/merge_step_4")
logger.info("Phase 7) Enrich Publication with FieldOfStudy")
- magPubs = spark.read.load(s"$workingPath/merge_step_4").as[Publication]
- .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
+ magPubs = spark.read
+ .load(s"$workingPath/merge_step_4")
+ .as[Publication]
+ .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
+ .as[(String, Publication)]
- val fos = spark.read.load(s"$sourcePath/FieldsOfStudy").select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")
+ val fos = spark.read
+ .load(s"$sourcePath/FieldsOfStudy")
+ .select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")
val pfos = spark.read.load(s"$sourcePath/PaperFieldsOfStudy")
- val paperField = pfos.joinWith(fos, fos("fos").equalTo(pfos("FieldOfStudyId")))
+ val paperField = pfos
+ .joinWith(fos, fos("fos").equalTo(pfos("FieldOfStudyId")))
.select($"_1.FieldOfStudyId", $"_2.DisplayName", $"_2.MainType", $"_1.PaperId", $"_1.Score")
- .groupBy($"PaperId").agg(collect_list(struct($"FieldOfStudyId", $"DisplayName", $"MainType", $"Score")).as("subjects"))
+ .groupBy($"PaperId")
+ .agg(
+ collect_list(struct($"FieldOfStudyId", $"DisplayName", $"MainType", $"Score"))
+ .as("subjects")
+ )
.as[MagFieldOfStudy]
- magPubs.joinWith(paperField, col("_1")
- .equalTo(paperField("PaperId")), "left")
+ magPubs
+ .joinWith(
+ paperField,
+ col("_1")
+ .equalTo(paperField("PaperId")),
+ "left"
+ )
.map(item => ConversionUtil.updatePubsWithSubject(item))
- .write.mode(SaveMode.Overwrite)
+ .write
+ .mode(SaveMode.Overwrite)
.save(s"$workingPath/mag_publication")
- spark.read.load(s"$workingPath/mag_publication").as[Publication]
+ spark.read
+ .load(s"$workingPath/mag_publication")
+ .as[Publication]
.filter(p => p.getId != null)
.groupByKey(p => p.getId)
.reduceGroups((a: Publication, b: Publication) => ConversionUtil.mergePublication(a, b))
.map(_._2)
- .write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
-
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$targetPath/magPublication")
}
}
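
Phase 3 above folds one row per (paper, author) pair into a single row per paper with collect_list(struct(...)). A minimal version of that aggregation, using simplified case classes rather than the ones from MagDataModel, is sketched here.

import org.apache.spark.sql.functions.{collect_list, struct}
import org.apache.spark.sql.{Dataset, SparkSession}

case class AuthorRow(PaperId: Long, name: String, sequenceNumber: Int)
case class AuthorRef(name: String, sequenceNumber: Int)
case class PaperAuthors(PaperId: Long, authors: Seq[AuthorRef])

object CollectAuthorsExample {

  // One input row per (paper, author) -> one output row per paper with the author list.
  def groupAuthors(rows: Dataset[AuthorRow]): Dataset[PaperAuthors] = {
    val spark = rows.sparkSession
    import spark.implicits._
    rows
      .groupBy($"PaperId")
      .agg(collect_list(struct($"name", $"sequenceNumber")).as("authors"))
      .as[PaperAuthors]
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("collect-authors").getOrCreate()
    import spark.implicits._
    val in = Seq(AuthorRow(1L, "Ada", 1), AuthorRow(1L, "Grace", 2), AuthorRow(2L, "Edsger", 1)).toDS()
    groupAuthors(in).show(false)
    spark.stop()
  }
}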
diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala
index 11031f9ca..7c58afc09 100644
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala
@@ -15,15 +15,20 @@ import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
+case class ORCIDItem(doi: String, authors: List[OrcidAuthor]) {}
-case class ORCIDItem(doi:String, authors:List[OrcidAuthor]){}
-case class OrcidAuthor(oid:String, name:Option[String], surname:Option[String], creditName:Option[String], otherNames:Option[List[String]], errorCode:Option[String]){}
-case class OrcidWork(oid:String, doi:String)
+case class OrcidAuthor(
+ oid: String,
+ name: Option[String],
+ surname: Option[String],
+ creditName: Option[String],
+ otherNames: Option[List[String]],
+ errorCode: Option[String]
+) {}
+case class OrcidWork(oid: String, doi: String)
+case class ORCIDElement(doi: String, authors: List[ORCIDItem]) {}
-
-
-case class ORCIDElement(doi:String, authors:List[ORCIDItem]) {}
object ORCIDToOAF {
val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
val mapper = new ObjectMapper()
@@ -41,7 +46,7 @@ object ORCIDToOAF {
def extractValueFromInputString(input: String): (String, String) = {
val i = input.indexOf('[')
- if (i <5) {
+ if (i < 5) {
return null
}
val orcidList = input.substring(i, input.length - 1)
@@ -51,17 +56,16 @@ object ORCIDToOAF {
} else null
}
-
- def strValid(s:Option[String]) : Boolean = {
+ def strValid(s: Option[String]): Boolean = {
s.isDefined && s.get.nonEmpty
}
- def authorValid(author:OrcidAuthor): Boolean ={
+ def authorValid(author: OrcidAuthor): Boolean = {
if (strValid(author.name) && strValid(author.surname)) {
return true
}
if (strValid(author.surname)) {
- return true
+ return true
}
if (strValid(author.creditName)) {
return true
@@ -70,37 +74,35 @@ object ORCIDToOAF {
false
}
-
- def extractDOIWorks(input:String): List[OrcidWork] = {
+ def extractDOIWorks(input: String): List[OrcidWork] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
- val oid = (json \ "workDetail" \"oid").extractOrElse[String](null)
+ val oid = (json \ "workDetail" \ "oid").extractOrElse[String](null)
if (oid == null)
return List()
- val doi:List[(String, String)] = for {
- JObject(extIds) <- json \ "workDetail" \"extIds"
+ val doi: List[(String, String)] = for {
+ JObject(extIds) <- json \ "workDetail" \ "extIds"
JField("type", JString(typeValue)) <- extIds
- JField("value", JString(value)) <- extIds
+ JField("value", JString(value)) <- extIds
if "doi".equalsIgnoreCase(typeValue)
} yield (typeValue, DoiBoostMappingUtil.normalizeDoi(value))
if (doi.nonEmpty) {
- return doi.map(l =>OrcidWork(oid, l._2))
+ return doi.map(l => OrcidWork(oid, l._2))
}
List()
}
- def convertORCIDAuthor(input:String): OrcidAuthor = {
+ def convertORCIDAuthor(input: String): OrcidAuthor = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
- (json \"authorData" ).extractOrElse[OrcidAuthor](null)
- }
+ (json \ "authorData").extractOrElse[OrcidAuthor](null)
+ }
-
- def convertTOOAF(input:ORCIDItem) :Publication = {
+ def convertTOOAF(input: ORCIDItem): Publication = {
val doi = input.doi
- val pub:Publication = new Publication
+ val pub: Publication = new Publication
pub.setPid(List(createSP(doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
pub.setDataInfo(generateDataInfo())
@@ -108,9 +110,9 @@ object ORCIDToOAF {
if (pub.getId == null)
return null
- try{
+ try {
- val l:List[Author]= input.authors.map(a=> {
+ val l: List[Author] = input.authors.map(a => {
generateAuthor(a)
})(collection.breakOut)
@@ -125,30 +127,38 @@ object ORCIDToOAF {
}
}
- def generateOricPIDDatainfo():DataInfo = {
- val di =DoiBoostMappingUtil.generateDataInfo("0.91")
+ def generateOricPIDDatainfo(): DataInfo = {
+ val di = DoiBoostMappingUtil.generateDataInfo("0.91")
di.getProvenanceaction.setClassid(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)
di.getProvenanceaction.setClassname(ModelConstants.HARVESTED)
di
}
- def generateAuthor(o : OrcidAuthor): Author = {
+ def generateAuthor(o: OrcidAuthor): Author = {
val a = new Author
if (strValid(o.name)) {
- a.setName(o.name.get.capitalize)
+ a.setName(o.name.get.capitalize)
}
if (strValid(o.surname)) {
a.setSurname(o.surname.get.capitalize)
}
- if(strValid(o.name) && strValid(o.surname))
+ if (strValid(o.name) && strValid(o.surname))
a.setFullname(s"${o.name.get.capitalize} ${o.surname.get.capitalize}")
else if (strValid(o.creditName))
a.setFullname(o.creditName.get)
if (StringUtils.isNotBlank(o.oid))
- a.setPid(List(createSP(o.oid, ModelConstants.ORCID, ModelConstants.DNET_PID_TYPES, generateOricPIDDatainfo())).asJava)
+ a.setPid(
+ List(
+ createSP(
+ o.oid,
+ ModelConstants.ORCID,
+ ModelConstants.DNET_PID_TYPES,
+ generateOricPIDDatainfo()
+ )
+ ).asJava
+ )
a
}
-
-}
\ No newline at end of file
+}
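
extractDOIWorks above relies on a json4s for-comprehension to pull (type, value) pairs out of workDetail.extIds. The sketch below reproduces that extraction against a hand-written document; it omits the DOI normalisation step and uses hypothetical names.

import org.json4s._
import org.json4s.jackson.JsonMethods.parse

object ExtIdsExample {

  // Returns the DOI values listed under workDetail.extIds, ignoring other identifier types.
  def extractDois(input: String): List[String] = {
    val json = parse(input)
    for {
      JObject(extIds) <- json \ "workDetail" \ "extIds"
      JField("type", JString(typeValue)) <- extIds
      JField("value", JString(value)) <- extIds
      if "doi".equalsIgnoreCase(typeValue)
    } yield value
  }

  def main(args: Array[String]): Unit = {
    val doc =
      """{"workDetail":{"oid":"0000-0002-1825-0097",
        |"extIds":[{"type":"doi","value":"10.1000/xyz123"},{"type":"eid","value":"2-s2.0-1"}]}}""".stripMargin
    println(extractDois(doc)) // List(10.1000/xyz123)
  }
}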
diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala
index 1b189e296..95a1f5a19 100644
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala
@@ -10,11 +10,11 @@ import org.slf4j.{Logger, LoggerFactory}
object SparkConvertORCIDToOAF {
val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
-
def run(spark: SparkSession, workingPath: String, targetPath: String): Unit = {
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
import spark.implicits._
- val dataset: Dataset[ORCIDItem] = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem]
+ val dataset: Dataset[ORCIDItem] =
+ spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem]
logger.info("Converting ORCID to OAF")
dataset.map(o => ORCIDToOAF.convertTOOAF(o)).write.mode(SaveMode.Overwrite).save(targetPath)
@@ -22,15 +22,21 @@ object SparkConvertORCIDToOAF {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_orcid_to_oaf_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ SparkConvertORCIDToOAF.getClass.getResourceAsStream(
+ "/eu/dnetlib/dhp/doiboost/convert_orcid_to_oaf_params.json"
+ )
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
-
+ .master(parser.get("master"))
+ .getOrCreate()
val workingPath = parser.get("workingPath")
val targetPath = parser.get("targetPath")
diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala
index 153be5dd1..7b6408417 100644
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala
@@ -17,45 +17,72 @@ object SparkPreprocessORCID {
}
-
def run(spark: SparkSession, sourcePath: String, workingPath: String): Unit = {
import spark.implicits._
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
- val inputRDD: RDD[OrcidAuthor] = spark.sparkContext.textFile(s"$sourcePath/authors").map(s => ORCIDToOAF.convertORCIDAuthor(s)).filter(s => s != null).filter(s => ORCIDToOAF.authorValid(s))
+ val inputRDD: RDD[OrcidAuthor] = spark.sparkContext
+ .textFile(s"$sourcePath/authors")
+ .map(s => ORCIDToOAF.convertORCIDAuthor(s))
+ .filter(s => s != null)
+ .filter(s => ORCIDToOAF.authorValid(s))
- spark.createDataset(inputRDD).as[OrcidAuthor].write.mode(SaveMode.Overwrite).save(s"$workingPath/author")
+ spark
+ .createDataset(inputRDD)
+ .as[OrcidAuthor]
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingPath/author")
- val res = spark.sparkContext.textFile(s"$sourcePath/works").flatMap(s => ORCIDToOAF.extractDOIWorks(s)).filter(s => s != null)
+ val res = spark.sparkContext
+ .textFile(s"$sourcePath/works")
+ .flatMap(s => ORCIDToOAF.extractDOIWorks(s))
+ .filter(s => s != null)
- spark.createDataset(res).as[OrcidWork].write.mode(SaveMode.Overwrite).save(s"$workingPath/works")
+ spark
+ .createDataset(res)
+ .as[OrcidWork]
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingPath/works")
val authors: Dataset[OrcidAuthor] = spark.read.load(s"$workingPath/author").as[OrcidAuthor]
val works: Dataset[OrcidWork] = spark.read.load(s"$workingPath/works").as[OrcidWork]
- works.joinWith(authors, authors("oid").equalTo(works("oid")))
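+ // join works with their authors on the ORCID id and collect the author list per DOI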
+ works
+ .joinWith(authors, authors("oid").equalTo(works("oid")))
.map(i => {
val doi = i._1.doi
val author = i._2
(doi, author)
- }).groupBy(col("_1").alias("doi"))
- .agg(collect_list(col("_2")).alias("authors")).as[ORCIDItem]
+ })
+ .groupBy(col("_1").alias("doi"))
+ .agg(collect_list(col("_2")).alias("authors"))
+ .as[ORCIDItem]
.map(s => fixORCIDItem(s))
- .write.mode(SaveMode.Overwrite).save(s"$workingPath/orcidworksWithAuthor")
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingPath/orcidworksWithAuthor")
}
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/preprocess_orcid_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ SparkConvertORCIDToOAF.getClass.getResourceAsStream(
+ "/eu/dnetlib/dhp/doiboost/preprocess_orcid_params.json"
+ )
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
-
+ .master(parser.get("master"))
+ .getOrCreate()
val sourcePath = parser.get("sourcePath")
val workingPath = parser.get("workingPath")
diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala
index 70290018d..9f7f9d18f 100644
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala
@@ -13,28 +13,35 @@ object SparkMapUnpayWallToOAF {
def main(args: Array[String]): Unit = {
-
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(SparkMapDumpIntoOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_uw_to_oaf_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ SparkMapDumpIntoOAF.getClass.getResourceAsStream(
+ "/eu/dnetlib/dhp/doiboost/convert_uw_to_oaf_params.json"
+ )
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
+ .master(parser.get("master"))
+ .getOrCreate()
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
-
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
val inputRDD: RDD[String] = spark.sparkContext.textFile(s"$sourcePath")
logger.info("Converting UnpayWall to OAF")
- val d: Dataset[Publication] = spark.createDataset(inputRDD.map(UnpayWallToOAF.convertToOAF).filter(p => p != null)).as[Publication]
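+ // map each UnpayWall JSON line to an OAF Publication, discarding records that convert to null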
+ val d: Dataset[Publication] = spark
+ .createDataset(inputRDD.map(UnpayWallToOAF.convertToOAF).filter(p => p != null))
+ .as[Publication]
d.write.mode(SaveMode.Overwrite).save(targetPath)
}
diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala
index bf5694965..bbdc80b1d 100644
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala
@@ -12,33 +12,41 @@ import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
-
-
-case class OALocation(evidence:Option[String], host_type:Option[String], is_best:Option[Boolean], license: Option[String], pmh_id:Option[String], updated:Option[String],
- url:Option[String], url_for_landing_page:Option[String], url_for_pdf:Option[String], version:Option[String]) {}
-
-
-
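+// mirrors the "best_oa_location" object of an UnpayWall record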
+case class OALocation(
+ evidence: Option[String],
+ host_type: Option[String],
+ is_best: Option[Boolean],
+ license: Option[String],
+ pmh_id: Option[String],
+ updated: Option[String],
+ url: Option[String],
+ url_for_landing_page: Option[String],
+ url_for_pdf: Option[String],
+ version: Option[String]
+) {}
object UnpayWallToOAF {
val logger: Logger = LoggerFactory.getLogger(getClass)
-
- def get_unpaywall_color(input:String):Option[OpenAccessRoute] = {
- if(input == null || input.equalsIgnoreCase("close"))
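+ // map the UnpayWall oa_status onto an OpenAccessRoute: null or "close" gives None, unrecognised values fall back to gold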
+ def get_unpaywall_color(input: String): Option[OpenAccessRoute] = {
+ if (input == null || input.equalsIgnoreCase("close"))
return None
- if(input.equalsIgnoreCase("green"))
+ if (input.equalsIgnoreCase("green"))
return Some(OpenAccessRoute.green)
- if(input.equalsIgnoreCase("bronze"))
+ if (input.equalsIgnoreCase("bronze"))
return Some(OpenAccessRoute.bronze)
- if(input.equalsIgnoreCase("hybrid"))
+ if (input.equalsIgnoreCase("hybrid"))
return Some(OpenAccessRoute.hybrid)
else
return Some(OpenAccessRoute.gold)
}
- def get_color(is_oa:Boolean, location: OALocation, journal_is_oa:Boolean):Option[OpenAccessRoute] = {
+ def get_color(
+ is_oa: Boolean,
+ location: OALocation,
+ journal_is_oa: Boolean
+ ): Option[OpenAccessRoute] = {
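+ // derive the access route from the host type of the best OA location and the journal_is_oa flag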
if (is_oa) {
if (location.host_type.isDefined) {
{
@@ -62,23 +70,22 @@ object UnpayWallToOAF {
None
}
-
- def convertToOAF(input:String):Publication = {
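+ // build a Publication from one UnpayWall JSON line; returns null when the DOI does not normalise or the record is not open access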
+ def convertToOAF(input: String): Publication = {
val pub = new Publication
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
- val doi = DoiBoostMappingUtil.normalizeDoi((json \"doi").extract[String])
+ val doi = DoiBoostMappingUtil.normalizeDoi((json \ "doi").extract[String])
- if(doi == null)
+ if (doi == null)
return null
- val is_oa = (json\ "is_oa").extract[Boolean]
+ val is_oa = (json \ "is_oa").extract[Boolean]
- val journal_is_oa= (json\ "journal_is_oa").extract[Boolean]
+ val journal_is_oa = (json \ "journal_is_oa").extract[Boolean]
- val oaLocation:OALocation = (json \ "best_oa_location").extractOrElse[OALocation](null)
+ val oaLocation: OALocation = (json \ "best_oa_location").extractOrElse[OALocation](null)
val colour = get_unpaywall_color((json \ "oa_status").extractOrElse[String](null))
@@ -88,9 +95,9 @@ object UnpayWallToOAF {
if (!is_oa)
return null
- if(oaLocation== null || oaLocation.url.isEmpty)
- return null
- val i :Instance= new Instance()
+ if (oaLocation == null || oaLocation.url.isEmpty)
+ return null
+ val i: Instance = new Instance()
i.setCollectedfrom(createUnpayWallCollectedFrom())
// i.setAccessright(getOpenAccessQualifier())
@@ -122,7 +129,4 @@ object UnpayWallToOAF {
}
-
-
-
}
diff --git a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/DoiBoostHostedByMapTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/DoiBoostHostedByMapTest.scala
index 41730ade0..61d2eef29 100644
--- a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/DoiBoostHostedByMapTest.scala
+++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/DoiBoostHostedByMapTest.scala
@@ -6,15 +6,11 @@ import org.junit.jupiter.api.Test
class DoiBoostHostedByMapTest {
@Test
- def idDSGeneration():Unit = {
- val s ="doajarticles::0066-782X"
-
-
+ def idDSGeneration(): Unit = {
+ val s = "doajarticles::0066-782X"
println(DoiBoostMappingUtil.generateDSId(s))
-
}
-
}
diff --git a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala
index a9a841ee9..391d45b10 100644
--- a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala
+++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala
@@ -6,41 +6,39 @@ import org.junit.jupiter.api.Test
class NormalizeDOITest {
@Test
- def doiDSLowerCase():Unit = {
- val doi ="10.1042/BCJ20160876"
+ def doiDSLowerCase(): Unit = {
+ val doi = "10.1042/BCJ20160876"
assert(DoiBoostMappingUtil.normalizeDoi(doi).equals(doi.toLowerCase()))
}
-
@Test
- def doiFiltered():Unit = {
+ def doiFiltered(): Unit = {
val doi = "0.1042/BCJ20160876"
assert(DoiBoostMappingUtil.normalizeDoi(doi) == null)
}
@Test
- def doiFiltered2():Unit = {
+ def doiFiltered2(): Unit = {
val doi = "https://doi.org/0.1042/BCJ20160876"
assert(DoiBoostMappingUtil.normalizeDoi(doi) == null)
}
-
@Test
- def doiCleaned():Unit = {
+ def doiCleaned(): Unit = {
val doi = "https://doi.org/10.1042/BCJ20160876"
assert(DoiBoostMappingUtil.normalizeDoi(doi).equals("10.1042/BCJ20160876".toLowerCase()))
}
@Test
- def doiCleaned1():Unit = {
+ def doiCleaned1(): Unit = {
val doi = "https://doi.org/10.1042/ BCJ20160876"
assert(DoiBoostMappingUtil.normalizeDoi(doi).equals("10.1042/BCJ20160876".toLowerCase()))
}
-}
\ No newline at end of file
+}
diff --git a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/crossref/CrossrefMappingTest.scala
index 71dbf27be..8124a5aae 100644
--- a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/crossref/CrossrefMappingTest.scala
+++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/crossref/CrossrefMappingTest.scala
@@ -12,20 +12,24 @@ import scala.collection.JavaConverters._
import scala.io.Source
import scala.util.matching.Regex
-
class CrossrefMappingTest {
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
val mapper = new ObjectMapper()
-
-
@Test
def testFunderRelationshipsMapping(): Unit = {
- val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")).mkString
- val funder_doi = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi")).mkString
- val funder_name = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi")).mkString
-
+ val template = Source
+ .fromInputStream(
+ getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")
+ )
+ .mkString
+ val funder_doi = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi"))
+ .mkString
+ val funder_name = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi"))
+ .mkString
for (line <- funder_doi.lines) {
val json = template.replace("%s", line)
@@ -43,7 +47,8 @@ class CrossrefMappingTest {
def checkRelation(generatedOAF: List[Oaf]): Unit = {
- val rels: List[Relation] = generatedOAF.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]]
+ val rels: List[Relation] =
+ generatedOAF.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]]
assertFalse(rels.isEmpty)
rels.foreach(relation => {
val relJson = mapper.writeValueAsString(relation)
@@ -59,22 +64,22 @@ class CrossrefMappingTest {
}
-
@Test
- def testSum() :Unit = {
- val from:Long = 1613135645000L
- val delta:Long = 1000000L
-
-
- println(s"updating from value: $from -> ${from+delta}")
+ def testSum(): Unit = {
+ val from: Long = 1613135645000L
+ val delta: Long = 1000000L
+ println(s"updating from value: $from -> ${from + delta}")
}
@Test
- def testOrcidID() :Unit = {
- val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/orcid_data.json")).mkString
-
+ def testOrcidID(): Unit = {
+ val json = Source
+ .fromInputStream(
+ getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/orcid_data.json")
+ )
+ .mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@@ -85,17 +90,18 @@ class CrossrefMappingTest {
val items = resultList.filter(p => p.isInstanceOf[Result])
-
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
items.foreach(p => println(mapper.writeValueAsString(p)))
-
}
@Test
- def testEmptyTitle() :Unit = {
- val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/empty_title.json")).mkString
-
+ def testEmptyTitle(): Unit = {
+ val json = Source
+ .fromInputStream(
+ getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/empty_title.json")
+ )
+ .mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@@ -106,17 +112,16 @@ class CrossrefMappingTest {
val items = resultList.filter(p => p.isInstanceOf[Result])
-
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
items.foreach(p => println(mapper.writeValueAsString(p)))
-
}
-
@Test
def testPeerReviewed(): Unit = {
- val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/prwTest.json")).mkString
+ val json = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/prwTest.json"))
+ .mkString
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
assertNotNull(json)
@@ -128,12 +133,8 @@ class CrossrefMappingTest {
val items = resultList.filter(p => p.isInstanceOf[Result])
-
items.foreach(p => logger.info(mapper.writeValueAsString(p)))
-
-
-
}
def extractECAward(award: String): String = {
@@ -143,21 +144,21 @@ class CrossrefMappingTest {
null
}
-
@Test
def extractECTest(): Unit = {
- val s = "FP7/2007-2013"
+ val s = "FP7/2007-2013"
val awardExtracted = extractECAward(s)
println(awardExtracted)
println(DHPUtils.md5(awardExtracted))
-
}
@Test
def testJournalRelation(): Unit = {
- val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/awardTest.json")).mkString
+ val json = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/awardTest.json"))
+ .mkString
assertNotNull(json)
assertFalse(json.isEmpty)
@@ -165,20 +166,19 @@ class CrossrefMappingTest {
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
assertTrue(resultList.nonEmpty)
- val rels:List[Relation] = resultList.filter(p => p.isInstanceOf[Relation]).map(r=> r.asInstanceOf[Relation])
-
-
+ val rels: List[Relation] =
+ resultList.filter(p => p.isInstanceOf[Relation]).map(r => r.asInstanceOf[Relation])
rels.foreach(s => logger.info(s.getTarget))
- assertEquals(rels.size, 6 )
-
+ assertEquals(rels.size, 6)
}
-
@Test
def testConvertBookFromCrossRef2Oaf(): Unit = {
- val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/book.json")).mkString
+ val json = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/book.json"))
+ .mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@@ -199,42 +199,62 @@ class CrossrefMappingTest {
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
assertNotNull(
result.getDataInfo.getProvenanceaction,
- "DataInfo/Provenance test not null Failed");
+ "DataInfo/Provenance test not null Failed"
+ );
assertFalse(
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
- "DataInfo/Provenance/classId test not null Failed");
+ "DataInfo/Provenance/classId test not null Failed"
+ );
assertFalse(
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
- "DataInfo/Provenance/className test not null Failed");
+ "DataInfo/Provenance/className test not null Failed"
+ );
assertFalse(
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
- "DataInfo/Provenance/SchemeId test not null Failed");
+ "DataInfo/Provenance/SchemeId test not null Failed"
+ );
assertFalse(
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
- "DataInfo/Provenance/SchemeName test not null Failed");
+ "DataInfo/Provenance/SchemeName test not null Failed"
+ );
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
assertFalse(result.getCollectedfrom.isEmpty);
val collectedFromList = result.getCollectedfrom.asScala
- assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from assertion")
-
- assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion")
+ assert(
+ collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")),
+ "Wrong collected from assertion"
+ )
+ assert(
+ collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")),
+ "Wrong collected from assertion"
+ )
val relevantDates = result.getRelevantdate.asScala
- assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created")
- assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")), "Missing relevant date of type published-online")
- assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")), "Missing relevant date of type published-print")
+ assert(
+ relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")),
+ "Missing relevant date of type created"
+ )
+ assert(
+ relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")),
+ "Missing relevant date of type published-online"
+ )
+ assert(
+ relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")),
+ "Missing relevant date of type published-print"
+ )
val rels = resultList.filter(p => p.isInstanceOf[Relation])
assert(rels.isEmpty)
}
-
@Test
def testConvertPreprintFromCrossRef2Oaf(): Unit = {
- val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/preprint.json")).mkString
+ val json = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/preprint.json"))
+ .mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@@ -255,44 +275,70 @@ class CrossrefMappingTest {
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
assertNotNull(
result.getDataInfo.getProvenanceaction,
- "DataInfo/Provenance test not null Failed");
+ "DataInfo/Provenance test not null Failed"
+ );
assertFalse(
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
- "DataInfo/Provenance/classId test not null Failed");
+ "DataInfo/Provenance/classId test not null Failed"
+ );
assertFalse(
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
- "DataInfo/Provenance/className test not null Failed");
+ "DataInfo/Provenance/className test not null Failed"
+ );
assertFalse(
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
- "DataInfo/Provenance/SchemeId test not null Failed");
+ "DataInfo/Provenance/SchemeId test not null Failed"
+ );
assertFalse(
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
- "DataInfo/Provenance/SchemeName test not null Failed");
+ "DataInfo/Provenance/SchemeName test not null Failed"
+ );
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
assertFalse(result.getCollectedfrom.isEmpty);
val collectedFromList = result.getCollectedfrom.asScala
- assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from assertion")
-
- assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion")
+ assert(
+ collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")),
+ "Wrong collected from assertion"
+ )
+ assert(
+ collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")),
+ "Wrong collected from assertion"
+ )
val relevantDates = result.getRelevantdate.asScala
- assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created")
- assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("available")), "Missing relevant date of type available")
- assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("accepted")), "Missing relevant date of type accepted")
- assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")), "Missing relevant date of type published-online")
- assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")), "Missing relevant date of type published-print")
+ assert(
+ relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")),
+ "Missing relevant date of type created"
+ )
+ assert(
+ relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("available")),
+ "Missing relevant date of type available"
+ )
+ assert(
+ relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("accepted")),
+ "Missing relevant date of type accepted"
+ )
+ assert(
+ relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")),
+ "Missing relevant date of type published-online"
+ )
+ assert(
+ relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")),
+ "Missing relevant date of type published-print"
+ )
val rels = resultList.filter(p => p.isInstanceOf[Relation])
assert(rels.isEmpty)
}
-
@Test
def testConvertDatasetFromCrossRef2Oaf(): Unit = {
- val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dataset.json")).mkString
+ val json = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dataset.json"))
+ .mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@@ -313,19 +359,24 @@ class CrossrefMappingTest {
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
assertNotNull(
result.getDataInfo.getProvenanceaction,
- "DataInfo/Provenance test not null Failed");
+ "DataInfo/Provenance test not null Failed"
+ );
assertFalse(
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
- "DataInfo/Provenance/classId test not null Failed");
+ "DataInfo/Provenance/classId test not null Failed"
+ );
assertFalse(
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
- "DataInfo/Provenance/className test not null Failed");
+ "DataInfo/Provenance/className test not null Failed"
+ );
assertFalse(
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
- "DataInfo/Provenance/SchemeId test not null Failed");
+ "DataInfo/Provenance/SchemeId test not null Failed"
+ );
assertFalse(
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
- "DataInfo/Provenance/SchemeName test not null Failed");
+ "DataInfo/Provenance/SchemeName test not null Failed"
+ );
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
assertFalse(result.getCollectedfrom.isEmpty);
@@ -333,7 +384,9 @@ class CrossrefMappingTest {
@Test
def testConvertArticleFromCrossRef2Oaf(): Unit = {
- val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json")).mkString
+ val json = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json"))
+ .mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@@ -354,32 +407,45 @@ class CrossrefMappingTest {
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
assertNotNull(
result.getDataInfo.getProvenanceaction,
- "DataInfo/Provenance test not null Failed");
+ "DataInfo/Provenance test not null Failed"
+ );
assertFalse(
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
- "DataInfo/Provenance/classId test not null Failed");
+ "DataInfo/Provenance/classId test not null Failed"
+ );
assertFalse(
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
- "DataInfo/Provenance/className test not null Failed");
+ "DataInfo/Provenance/className test not null Failed"
+ );
assertFalse(
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
- "DataInfo/Provenance/SchemeId test not null Failed");
+ "DataInfo/Provenance/SchemeId test not null Failed"
+ );
assertFalse(
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
- "DataInfo/Provenance/SchemeName test not null Failed");
+ "DataInfo/Provenance/SchemeName test not null Failed"
+ );
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
assertFalse(result.getCollectedfrom.isEmpty);
val collectedFromList = result.getCollectedfrom.asScala
- assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from assertion")
-
- assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion")
+ assert(
+ collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")),
+ "Wrong collected from assertion"
+ )
+ assert(
+ collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")),
+ "Wrong collected from assertion"
+ )
val relevantDates = result.getRelevantdate.asScala
- assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created")
+ assert(
+ relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")),
+ "Missing relevant date of type created"
+ )
val rels = resultList.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]]
assertFalse(rels.isEmpty)
@@ -393,15 +459,14 @@ class CrossrefMappingTest {
})
-
}
-
-
@Test
def testSetDateOfAcceptanceCrossRef2Oaf(): Unit = {
- val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dump_file.json")).mkString
+ val json = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dump_file.json"))
+ .mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@@ -421,8 +486,13 @@ class CrossrefMappingTest {
@Test
def testNormalizeDOI(): Unit = {
- val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")).mkString
- val line :String = "\"funder\": [{\"name\": \"Wellcome Trust Masters Fellowship\",\"award\": [\"090633\"]}],"
+ val template = Source
+ .fromInputStream(
+ getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")
+ )
+ .mkString
+ val line: String =
+ "\"funder\": [{\"name\": \"Wellcome Trust Masters Fellowship\",\"award\": [\"090633\"]}],"
val json = template.replace("%s", line)
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
assertTrue(resultList.nonEmpty)
@@ -431,13 +501,17 @@ class CrossrefMappingTest {
result.getPid.asScala.foreach(pid => assertTrue(pid.getQualifier.getClassid.equals("doi")))
assertTrue(result.getPid.size() == 1)
- result.getPid.asScala.foreach(pid => assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase())))
+ result.getPid.asScala.foreach(pid =>
+ assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase()))
+ )
}
@Test
def testNormalizeDOI2(): Unit = {
- val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json")).mkString
+ val template = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json"))
+ .mkString
val resultList: List[Oaf] = Crossref2Oaf.convert(template)
assertTrue(resultList.nonEmpty)
@@ -446,14 +520,19 @@ class CrossrefMappingTest {
result.getPid.asScala.foreach(pid => assertTrue(pid.getQualifier.getClassid.equals("doi")))
assertTrue(result.getPid.size() == 1)
- result.getPid.asScala.foreach(pid => assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase())))
+ result.getPid.asScala.foreach(pid =>
+ assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase()))
+ )
}
@Test
- def testLicenseVorClosed() :Unit = {
- val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_vor.json")).mkString
-
+ def testLicenseVorClosed(): Unit = {
+ val json = Source
+ .fromInputStream(
+ getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_vor.json")
+ )
+ .mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@@ -462,25 +541,28 @@ class CrossrefMappingTest {
assertTrue(resultList.nonEmpty)
-
- val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
+ val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
println(mapper.writeValueAsString(item))
- assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://www.springer.com/vor")))
- assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("CLOSED")))
+ assertTrue(
+ item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://www.springer.com/vor"))
+ )
+ assertTrue(
+ item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("CLOSED"))
+ )
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
-
-
-
}
@Test
- def testLicenseOpen() :Unit = {
- val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_open.json")).mkString
-
+ def testLicenseOpen(): Unit = {
+ val json = Source
+ .fromInputStream(
+ getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_open.json")
+ )
+ .mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@@ -489,21 +571,33 @@ class CrossrefMappingTest {
assertTrue(resultList.nonEmpty)
+ val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
- val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
-
- assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("http://pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html")))
+ assertTrue(
+ item.getInstance().asScala exists (i =>
+ i.getLicense.getValue.equals(
+ "http://pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html"
+ )
+ )
+ )
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN")))
- assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid))
+ assertTrue(
+ item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid)
+ )
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
println(mapper.writeValueAsString(item))
}
@Test
- def testLicenseEmbargoOpen() :Unit = {
- val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo_open.json")).mkString
-
+ def testLicenseEmbargoOpen(): Unit = {
+ val json = Source
+ .fromInputStream(
+ getClass.getResourceAsStream(
+ "/eu/dnetlib/doiboost/crossref/publication_license_embargo_open.json"
+ )
+ )
+ .mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@@ -512,21 +606,33 @@ class CrossrefMappingTest {
assertTrue(resultList.nonEmpty)
+ val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
- val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
-
- assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")))
+ assertTrue(
+ item.getInstance().asScala exists (i =>
+ i.getLicense.getValue.equals(
+ "https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"
+ )
+ )
+ )
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN")))
- assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid))
+ assertTrue(
+ item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid)
+ )
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
println(mapper.writeValueAsString(item))
}
@Test
- def testLicenseEmbargo() :Unit = {
- val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo.json")).mkString
-
+ def testLicenseEmbargo(): Unit = {
+ val json = Source
+ .fromInputStream(
+ getClass.getResourceAsStream(
+ "/eu/dnetlib/doiboost/crossref/publication_license_embargo.json"
+ )
+ )
+ .mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@@ -535,35 +641,18 @@ class CrossrefMappingTest {
assertTrue(resultList.nonEmpty)
+ val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
- val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
-
- assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")))
- assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO")))
- assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
- mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
- println(mapper.writeValueAsString(item))
-
- }
-
-
- @Test
- def testLicenseEmbargoDateTime() :Unit = {
- val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo_datetime.json")).mkString
-
-
- assertNotNull(json)
- assertFalse(json.isEmpty);
-
- val resultList: List[Oaf] = Crossref2Oaf.convert(json)
-
- assertTrue(resultList.nonEmpty)
-
-
- val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
-
- assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")))
- assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO")))
+ assertTrue(
+ item.getInstance().asScala exists (i =>
+ i.getLicense.getValue.equals(
+ "https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"
+ )
+ )
+ )
+ assertTrue(
+ item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO"))
+ )
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
println(mapper.writeValueAsString(item))
@@ -571,9 +660,14 @@ class CrossrefMappingTest {
}
@Test
- def testMultipleURLs() :Unit = {
- val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/multiple_urls.json")).mkString
-
+ def testLicenseEmbargoDateTime(): Unit = {
+ val json = Source
+ .fromInputStream(
+ getClass.getResourceAsStream(
+ "/eu/dnetlib/doiboost/crossref/publication_license_embargo_datetime.json"
+ )
+ )
+ .mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@@ -582,12 +676,47 @@ class CrossrefMappingTest {
assertTrue(resultList.nonEmpty)
+ val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
- val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
+ assertTrue(
+ item.getInstance().asScala exists (i =>
+ i.getLicense.getValue.equals(
+ "https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"
+ )
+ )
+ )
+ assertTrue(
+ item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO"))
+ )
+ assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
+ mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
+ println(mapper.writeValueAsString(item))
+
+ }
+
+ @Test
+ def testMultipleURLs(): Unit = {
+ val json = Source
+ .fromInputStream(
+ getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/multiple_urls.json")
+ )
+ .mkString
+
+ assertNotNull(json)
+ assertFalse(json.isEmpty);
+
+ val resultList: List[Oaf] = Crossref2Oaf.convert(json)
+
+ assertTrue(resultList.nonEmpty)
+
+ val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
assertEquals(1, item.getInstance().size())
assertEquals(1, item.getInstance().get(0).getUrl().size())
- assertEquals("https://doi.org/10.1016/j.jas.2019.105013", item.getInstance().get(0).getUrl().get(0))
+ assertEquals(
+ "https://doi.org/10.1016/j.jas.2019.105013",
+ item.getInstance().get(0).getUrl().get(0)
+ )
//println(mapper.writeValueAsString(item))
}
diff --git a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/mag/MAGMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/mag/MAGMappingTest.scala
index 611f3b323..882c0d8a0 100644
--- a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/mag/MAGMappingTest.scala
+++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/mag/MAGMappingTest.scala
@@ -12,43 +12,35 @@ import org.slf4j.{Logger, LoggerFactory}
import java.sql.Timestamp
import scala.io.Source
-
-
class MAGMappingTest {
val logger: Logger = LoggerFactory.getLogger(getClass)
val mapper = new ObjectMapper()
-
-
-
@Test
- def testSplitter():Unit = {
+ def testSplitter(): Unit = {
val s = "sports.team"
-
if (s.contains(".")) {
- println(s.split("\\.")head)
+ println(s.split("\\.").head)
}
}
-
-
@Test
- def testDate() :Unit = {
+ def testDate(): Unit = {
- val p:Timestamp = Timestamp.valueOf("2011-10-02 00:00:00")
+ val p: Timestamp = Timestamp.valueOf("2011-10-02 00:00:00")
- println(p.toString.substring(0,10))
+ println(p.toString.substring(0, 10))
}
-
-
@Test
def buildInvertedIndexTest(): Unit = {
- val json_input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/mag/invertedIndex.json")).mkString
+ val json_input = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/mag/invertedIndex.json"))
+ .mkString
val description = ConversionUtil.convertInvertedIndexString(json_input)
assertNotNull(description)
assertTrue(description.nonEmpty)
@@ -56,10 +48,9 @@ class MAGMappingTest {
logger.debug(description)
}
+
@Test
- def normalizeDoiTest():Unit = {
-
-
+ def normalizeDoiTest(): Unit = {
implicit val formats = DefaultFormats
@@ -78,8 +69,9 @@ class MAGMappingTest {
val schema = Encoders.product[MagPapers].schema
import spark.implicits._
- val magPapers :Dataset[MagPapers] = spark.read.option("multiline",true).schema(schema).json(path).as[MagPapers]
- val ret :Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
+ val magPapers: Dataset[MagPapers] =
+ spark.read.option("multiline", true).schema(schema).json(path).as[MagPapers]
+ val ret: Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
assertTrue(ret.count == 10)
ret.take(10).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase())))
@@ -87,7 +79,7 @@ class MAGMappingTest {
}
@Test
- def normalizeDoiTest2():Unit = {
+ def normalizeDoiTest2(): Unit = {
import org.json4s.DefaultFormats
@@ -108,15 +100,13 @@ class MAGMappingTest {
val schema = Encoders.product[MagPapers].schema
import spark.implicits._
- val magPapers :Dataset[MagPapers] = spark.read.option("multiline",true).schema(schema).json(path).as[MagPapers]
- val ret :Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
+ val magPapers: Dataset[MagPapers] =
+ spark.read.option("multiline", true).schema(schema).json(path).as[MagPapers]
+ val ret: Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
assertTrue(ret.count == 8)
ret.take(8).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase())))
spark.close()
//ret.take(8).foreach(mp => println(write(mp)))
}
-
}
-
-
diff --git a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/orcid/MappingORCIDToOAFTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/orcid/MappingORCIDToOAFTest.scala
index 7c8f01f81..e5bf1bd5f 100644
--- a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/orcid/MappingORCIDToOAFTest.scala
+++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/orcid/MappingORCIDToOAFTest.scala
@@ -19,8 +19,10 @@ class MappingORCIDToOAFTest {
val mapper = new ObjectMapper()
@Test
- def testExtractData():Unit ={
- val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/orcid/dataOutput")).mkString
+ def testExtractData(): Unit = {
+ val json = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/orcid/dataOutput"))
+ .mkString
assertNotNull(json)
assertFalse(json.isEmpty)
json.lines.foreach(s => {
@@ -29,10 +31,10 @@ class MappingORCIDToOAFTest {
}
@Test
- def testOAFConvert(@TempDir testDir: Path):Unit ={
- val sourcePath:String = getClass.getResource("/eu/dnetlib/doiboost/orcid/datasets").getPath
- val targetPath: String =s"${testDir.toString}/output/orcidPublication"
- val workingPath =s"${testDir.toString}/wp/"
+ def testOAFConvert(@TempDir testDir: Path): Unit = {
+ val sourcePath: String = getClass.getResource("/eu/dnetlib/doiboost/orcid/datasets").getPath
+ val targetPath: String = s"${testDir.toString}/output/orcidPublication"
+ val workingPath = s"${testDir.toString}/wp/"
val conf = new SparkConf()
conf.setMaster("local[*]")
@@ -46,18 +48,14 @@ class MappingORCIDToOAFTest {
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
import spark.implicits._
- SparkPreprocessORCID.run( spark,sourcePath, workingPath)
+ SparkPreprocessORCID.run(spark, sourcePath, workingPath)
- SparkConvertORCIDToOAF.run(spark, workingPath,targetPath)
+ SparkConvertORCIDToOAF.run(spark, workingPath, targetPath)
val mapper = new ObjectMapper()
-
-
val oA = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem].count()
-
-
val p: Dataset[Publication] = spark.read.load(targetPath).as[Publication]
assertTrue(oA == p.count())
@@ -65,19 +63,18 @@ class MappingORCIDToOAFTest {
spark.close()
-
}
-
@Test
- def testExtractDat1():Unit ={
+ def testExtractDat1(): Unit = {
+ val aList: List[OrcidAuthor] = List(
+ OrcidAuthor("0000-0002-4335-5309", Some("Lucrecia"), Some("Curto"), null, null, null),
+ OrcidAuthor("0000-0001-7501-3330", Some("Emilio"), Some("Malchiodi"), null, null, null),
+ OrcidAuthor("0000-0002-5490-9186", Some("Sofia"), Some("Noli Truant"), null, null, null)
+ )
-
- val aList: List[OrcidAuthor] = List(OrcidAuthor("0000-0002-4335-5309", Some("Lucrecia"), Some("Curto"), null, null, null ),
- OrcidAuthor("0000-0001-7501-3330", Some("Emilio"), Some("Malchiodi"), null, null, null ), OrcidAuthor("0000-0002-5490-9186", Some("Sofia"), Some("Noli Truant"), null, null, null ))
-
- val orcid:ORCIDItem = ORCIDItem("10.1042/BCJ20160876", aList)
+ val orcid: ORCIDItem = ORCIDItem("10.1042/BCJ20160876", aList)
val oaf = ORCIDToOAF.convertTOOAF(orcid)
assert(oaf.getPid.size() == 1)
@@ -85,10 +82,6 @@ class MappingORCIDToOAFTest {
oaf.getPid.toList.foreach(pid => assert(pid.getValue.equals("10.1042/BCJ20160876")))
//println(mapper.writeValueAsString(ORCIDToOAF.convertTOOAF(orcid)))
-
}
-
-
-
}
diff --git a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/uw/UnpayWallMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/uw/UnpayWallMappingTest.scala
index 6671758b2..542faa8ad 100644
--- a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/uw/UnpayWallMappingTest.scala
+++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/uw/UnpayWallMappingTest.scala
@@ -14,41 +14,43 @@ class UnpayWallMappingTest {
val logger: Logger = LoggerFactory.getLogger(getClass)
val mapper = new ObjectMapper()
-
@Test
- def testMappingToOAF():Unit ={
+ def testMappingToOAF(): Unit = {
- val Ilist = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/uw/input.json")).mkString
+ val Ilist = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/uw/input.json"))
+ .mkString
- var i:Int = 0
- for (line <-Ilist.lines) {
+ var i: Int = 0
+ for (line <- Ilist.lines) {
val p = UnpayWallToOAF.convertToOAF(line)
- if(p!= null) {
- assertTrue(p.getInstance().size()==1)
- if (i== 0){
+ if (p != null) {
+ assertTrue(p.getInstance().size() == 1)
+ if (i == 0) {
assertTrue(p.getPid.get(0).getValue.equals("10.1038/2211089b0"))
}
- if (i== 1){
+ if (i == 1) {
assertTrue(p.getPid.get(0).getValue.equals("10.1021/acs.bioconjchem.8b00058.s001"))
}
- if (i== 2){
+ if (i == 2) {
assertTrue(p.getPid.get(0).getValue.equals("10.1021/acs.bioconjchem.8b00086.s001"))
}
logger.info(s"ID : ${p.getId}")
}
assertNotNull(line)
assertTrue(line.nonEmpty)
- i = i+1
+ i = i + 1
}
-
-
- val l = Ilist.lines.next()
+ val l = Ilist.lines.next()
val item = UnpayWallToOAF.convertToOAF(l)
- assertEquals(item.getInstance().get(0).getAccessright.getOpenAccessRoute, OpenAccessRoute.bronze)
+ assertEquals(
+ item.getInstance().get(0).getAccessright.getOpenAccessRoute,
+ OpenAccessRoute.bronze
+ )
logger.info(mapper.writeValueAsString(item))
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala
index ad4e1c96e..c5a2b4024 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala
@@ -4,137 +4,190 @@ import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn}
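+// an entry of the hosted-by map: identifier, official name, ISSN variants and an open-access flag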
+case class HostedByItemType(
+ id: String,
+ officialname: String,
+ issn: String,
+ eissn: String,
+ lissn: String,
+ openAccess: Boolean
+) {}
-case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
-case class HostedByInfo(id: String, officialname: String, journal_id: String, provenance : String, id_type: String) {}
+case class HostedByInfo(
+ id: String,
+ officialname: String,
+ journal_id: String,
+ provenance: String,
+ id_type: String
+) {}
object Aggregators {
-
-
- def getId(s1:String, s2:String) : String = {
- if (s1.startsWith("10|")){
- return s1}
- s2
- }
-
- def getValue(s1:String, s2:String) : String = {
- if(!s1.equals("")){
+ def getId(s1: String, s2: String): String = {
+ if (s1.startsWith("10|")) {
return s1
}
s2
}
+ def getValue(s1: String, s2: String): String = {
+ if (!s1.equals("")) {
+ return s1
+ }
+ s2
+ }
- def explodeHostedByItemType(df: Dataset[(String, HostedByItemType)]): Dataset[(String, HostedByItemType)] = {
- val transformedData : Dataset[(String, HostedByItemType)] = df
+ def explodeHostedByItemType(
+ df: Dataset[(String, HostedByItemType)]
+ ): Dataset[(String, HostedByItemType)] = {
+ val transformedData: Dataset[(String, HostedByItemType)] = df
.groupByKey(_._1)(Encoders.STRING)
.agg(Aggregators.hostedByAggregator)
- .map{
- case (id:String , res:(String, HostedByItemType)) => res
+ .map { case (id: String, res: (String, HostedByItemType)) =>
+ res
}(Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]))
transformedData
}
- val hostedByAggregator: TypedColumn[(String, HostedByItemType), (String, HostedByItemType)] = new Aggregator[(String, HostedByItemType), (String, HostedByItemType), (String, HostedByItemType)] {
- override def zero: (String, HostedByItemType) = ("", HostedByItemType("","","","","",false))
- override def reduce(b: (String, HostedByItemType), a:(String,HostedByItemType)): (String, HostedByItemType) = {
- return merge(b, a)
- }
- override def merge(b1: (String, HostedByItemType), b2: (String, HostedByItemType)): (String, HostedByItemType) = {
- if (b1 == null){
- return b2
+ val hostedByAggregator: TypedColumn[(String, HostedByItemType), (String, HostedByItemType)] =
+ new Aggregator[
+ (String, HostedByItemType),
+ (String, HostedByItemType),
+ (String, HostedByItemType)
+ ] {
+
+ override def zero: (String, HostedByItemType) =
+ ("", HostedByItemType("", "", "", "", "", false))
+
+ override def reduce(
+ b: (String, HostedByItemType),
+ a: (String, HostedByItemType)
+ ): (String, HostedByItemType) = {
+ return merge(b, a)
}
- if(b2 == null){
- return b1
- }
- if(b1._2.id.startsWith("10|")){
- return (b1._1, HostedByItemType(b1._2.id, b1._2.officialname, b1._2.issn, b1._2.eissn, b1._2.lissn, b1._2.openAccess || b2._2.openAccess))
+
+ override def merge(
+ b1: (String, HostedByItemType),
+ b2: (String, HostedByItemType)
+ ): (String, HostedByItemType) = {
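+ // prefer the tuple whose item already has an OpenAIRE datasource id ("10|" prefix); the openAccess flags are OR-ed in both branches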
+ if (b1 == null) {
+ return b2
+ }
+ if (b2 == null) {
+ return b1
+ }
+ if (b1._2.id.startsWith("10|")) {
+ return (
+ b1._1,
+ HostedByItemType(
+ b1._2.id,
+ b1._2.officialname,
+ b1._2.issn,
+ b1._2.eissn,
+ b1._2.lissn,
+ b1._2.openAccess || b2._2.openAccess
+ )
+ )
+
+ }
+ return (
+ b2._1,
+ HostedByItemType(
+ b2._2.id,
+ b2._2.officialname,
+ b2._2.issn,
+ b2._2.eissn,
+ b2._2.lissn,
+ b1._2.openAccess || b2._2.openAccess
+ )
+ )
}
- return (b2._1, HostedByItemType(b2._2.id, b2._2.officialname, b2._2.issn, b2._2.eissn, b2._2.lissn, b1._2.openAccess || b2._2.openAccess))
- }
- override def finish(reduction: (String,HostedByItemType)): (String, HostedByItemType) = reduction
- override def bufferEncoder: Encoder[(String,HostedByItemType)] = Encoders.tuple(Encoders.STRING,Encoders.product[HostedByItemType])
+ override def finish(reduction: (String, HostedByItemType)): (String, HostedByItemType) =
+ reduction
- override def outputEncoder: Encoder[(String,HostedByItemType)] = Encoders.tuple(Encoders.STRING,Encoders.product[HostedByItemType])
- }.toColumn
+ override def bufferEncoder: Encoder[(String, HostedByItemType)] =
+ Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
+ override def outputEncoder: Encoder[(String, HostedByItemType)] =
+ Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
+ }.toColumn
+ def resultToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] =
+ new Aggregator[EntityInfo, EntityInfo, EntityInfo] {
+ override def zero: EntityInfo = EntityInfo.newInstance("", "", "")
-
- def resultToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] = new Aggregator[EntityInfo, EntityInfo, EntityInfo]{
- override def zero: EntityInfo = EntityInfo.newInstance("","","")
-
- override def reduce(b: EntityInfo, a:EntityInfo): EntityInfo = {
- return merge(b, a)
- }
- override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = {
- if (b1 == null){
- return b2
+ override def reduce(b: EntityInfo, a: EntityInfo): EntityInfo = {
+ return merge(b, a)
}
- if(b2 == null){
- return b1
+
+ override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = {
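+ // prefer the info that already resolved a hostedById; openAccess flags are OR-ed in either case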
+ if (b1 == null) {
+ return b2
+ }
+ if (b2 == null) {
+ return b1
+ }
+ if (!b1.getHostedById.equals("")) {
+ b1.setOpenAccess(b1.getOpenAccess || b2.getOpenAccess)
+ return b1
+ }
+ b2.setOpenAccess(b1.getOpenAccess || b2.getOpenAccess)
+ b2
+
}
- if(!b1.getHostedById.equals("")){
- b1.setOpenAccess(b1.getOpenAccess || b2.getOpenAccess)
- return b1
- }
- b2.setOpenAccess(b1.getOpenAccess || b2.getOpenAccess)
- b2
+ override def finish(reduction: EntityInfo): EntityInfo = reduction
+ override def bufferEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
- }
- override def finish(reduction: EntityInfo): EntityInfo = reduction
- override def bufferEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
+ override def outputEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
+ }.toColumn
- override def outputEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
- }.toColumn
-
- def resultToSingleId(df:Dataset[EntityInfo]): Dataset[EntityInfo] = {
- val transformedData : Dataset[EntityInfo] = df
+ def resultToSingleId(df: Dataset[EntityInfo]): Dataset[EntityInfo] = {
+ val transformedData: Dataset[EntityInfo] = df
.groupByKey(_.getId)(Encoders.STRING)
.agg(Aggregators.resultToSingleIdAggregator)
- .map{
- case (id:String , res: EntityInfo) => res
+ .map { case (id: String, res: EntityInfo) =>
+ res
}(Encoders.bean(classOf[EntityInfo]))
transformedData
}
- def datasourceToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] = new Aggregator[EntityInfo, EntityInfo, EntityInfo]{
- override def zero: EntityInfo = EntityInfo.newInstance("","","")
+ def datasourceToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] =
+ new Aggregator[EntityInfo, EntityInfo, EntityInfo] {
+ override def zero: EntityInfo = EntityInfo.newInstance("", "", "")
- override def reduce(b: EntityInfo, a:EntityInfo): EntityInfo = {
- return merge(b, a)
- }
- override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = {
- if (b1 == null){
- return b2
+ override def reduce(b: EntityInfo, a: EntityInfo): EntityInfo = {
+ return merge(b, a)
}
- if(b2 == null){
- return b1
+
+ override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = {
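+ // prefer the info with a non-empty hostedById; openAccess flags are left untouched here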
+ if (b1 == null) {
+ return b2
+ }
+ if (b2 == null) {
+ return b1
+ }
+ if (!b1.getHostedById.equals("")) {
+ return b1
+ }
+ b2
+
}
- if(!b1.getHostedById.equals("")){
- return b1
- }
- b2
+ override def finish(reduction: EntityInfo): EntityInfo = reduction
+ override def bufferEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
- }
- override def finish(reduction: EntityInfo): EntityInfo = reduction
- override def bufferEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
+ override def outputEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
+ }.toColumn
- override def outputEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
- }.toColumn
-
-
- def datasourceToSingleId(df:Dataset[EntityInfo]): Dataset[EntityInfo] = {
- val transformedData : Dataset[EntityInfo] = df
+ def datasourceToSingleId(df: Dataset[EntityInfo]): Dataset[EntityInfo] = {
+ val transformedData: Dataset[EntityInfo] = df
.groupByKey(_.getHostedById)(Encoders.STRING)
.agg(Aggregators.datasourceToSingleIdAggregator)
- .map{
- case (id:String , res: EntityInfo) => res
+ .map { case (id: String, res: EntityInfo) =>
+ res
}(Encoders.bean(classOf[EntityInfo]))
transformedData
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala
index 38af3eee4..80c672929 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala
@@ -14,7 +14,8 @@ import org.slf4j.{Logger, LoggerFactory}
object SparkApplyHostedByMapToDatasource {
def applyHBtoDats(join: Dataset[EntityInfo], dats: Dataset[Datasource]): Dataset[Datasource] = {
- dats.joinWith(join, dats.col("id").equalTo(join.col("hostedById")), "left")
+ dats
+ .joinWith(join, dats.col("id").equalTo(join.col("hostedById")), "left")
.map(t2 => {
val d: Datasource = t2._1
if (t2._2 != null) {
@@ -31,14 +32,21 @@ object SparkApplyHostedByMapToDatasource {
val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ getClass.getResourceAsStream(
+ "/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json"
+ )
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
+ .master(parser.get("master"))
+ .getOrCreate()
val graphPath = parser.get("graphPath")
val outputPath = parser.get("outputPath")
@@ -51,20 +59,27 @@ object SparkApplyHostedByMapToDatasource {
val mapper = new ObjectMapper()
- val dats: Dataset[Datasource] = spark.read.textFile(graphPath + "/datasource")
+ val dats: Dataset[Datasource] = spark.read
+ .textFile(graphPath + "/datasource")
.map(r => mapper.readValue(r, classOf[Datasource]))
- val pinfo: Dataset[EntityInfo] = Aggregators.datasourceToSingleId(spark.read.textFile(preparedInfoPath)
- .map(ei => mapper.readValue(ei, classOf[EntityInfo])))
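+ // collapse the prepared hosted-by info to a single entry per datasource id before applying it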
+ val pinfo: Dataset[EntityInfo] = Aggregators.datasourceToSingleId(
+ spark.read
+ .textFile(preparedInfoPath)
+ .map(ei => mapper.readValue(ei, classOf[EntityInfo]))
+ )
- applyHBtoDats(pinfo, dats).write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath)
+ applyHBtoDats(pinfo, dats).write
+ .mode(SaveMode.Overwrite)
+ .option("compression", "gzip")
+ .json(outputPath)
- spark.read.textFile(outputPath)
+ spark.read
+ .textFile(outputPath)
.write
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.text(graphPath + "/datasource")
}
-
}
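
applyHBtoDats is a left joinWith followed by in-place enrichment of the datasource when a matching EntityInfo exists, with the result written back as gzip-compressed JSON. A hedged sketch of that join-and-patch pattern; Ds and Info are made-up stand-ins for Datasource and EntityInfo, and the output path is arbitrary.

import org.apache.spark.sql.{Dataset, SaveMode, SparkSession}

case class Ds(id: String, openAccess: Boolean)
case class Info(hostedById: String, openAccess: Boolean)

object ApplyMapSketch {

  // Left join the datasources with the prepared info and copy fields over when a match is found.
  def applyInfo(ds: Dataset[Ds], info: Dataset[Info])(implicit spark: SparkSession): Dataset[Ds] = {
    import spark.implicits._
    ds.joinWith(info, ds.col("id").equalTo(info.col("hostedById")), "left")
      .map { case (d, i) =>
        if (i != null) d.copy(openAccess = i.openAccess) else d
      }
  }

  def main(args: Array[String]): Unit = {
    implicit val spark: SparkSession =
      SparkSession.builder().master("local[*]").appName("apply-map-sketch").getOrCreate()
    import spark.implicits._
    val ds   = Seq(Ds("d1", openAccess = false), Ds("d2", openAccess = false)).toDS()
    val info = Seq(Info("d1", openAccess = true)).toDS()
    applyInfo(ds, info).write
      .mode(SaveMode.Overwrite)
      .option("compression", "gzip")
      .json("/tmp/ds_enriched")
    spark.stop()
  }
}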
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala
index 204325982..a900fc241 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala
@@ -16,7 +16,8 @@ import scala.collection.JavaConverters._
object SparkApplyHostedByMapToResult {
def applyHBtoPubs(join: Dataset[EntityInfo], pubs: Dataset[Publication]) = {
- pubs.joinWith(join, pubs.col("id").equalTo(join.col("id")), "left")
+ pubs
+ .joinWith(join, pubs.col("id").equalTo(join.col("id")), "left")
.map(t2 => {
val p: Publication = t2._1
if (t2._2 != null) {
@@ -27,7 +28,14 @@ object SparkApplyHostedByMapToResult {
inst.getHostedby.setKey(ei.getHostedById)
inst.getHostedby.setValue(ei.getName)
if (ei.getOpenAccess) {
- inst.setAccessright(OafMapperUtils.accessRight(ModelConstants.ACCESS_RIGHT_OPEN, "Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES))
+ inst.setAccessright(
+ OafMapperUtils.accessRight(
+ ModelConstants.ACCESS_RIGHT_OPEN,
+ "Open Access",
+ ModelConstants.DNET_ACCESS_MODES,
+ ModelConstants.DNET_ACCESS_MODES
+ )
+ )
inst.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
p.setBestaccessright(OafMapperUtils.createBestAccessRights(p.getInstance()));
}
@@ -40,46 +48,54 @@ object SparkApplyHostedByMapToResult {
def main(args: Array[String]): Unit = {
-
val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ getClass.getResourceAsStream(
+ "/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json"
+ )
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
-
+ .master(parser.get("master"))
+ .getOrCreate()
val graphPath = parser.get("graphPath")
val outputPath = parser.get("outputPath")
val preparedInfoPath = parser.get("preparedInfoPath")
-
implicit val formats = DefaultFormats
-
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.bean(classOf[Publication])
implicit val mapEncoderEinfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
val mapper = new ObjectMapper()
- val pubs: Dataset[Publication] = spark.read.textFile(graphPath + "/publication")
+ val pubs: Dataset[Publication] = spark.read
+ .textFile(graphPath + "/publication")
.map(r => mapper.readValue(r, classOf[Publication]))
- val pinfo: Dataset[EntityInfo] = spark.read.textFile(preparedInfoPath)
+ val pinfo: Dataset[EntityInfo] = spark.read
+ .textFile(preparedInfoPath)
.map(ei => mapper.readValue(ei, classOf[EntityInfo]))
- applyHBtoPubs(pinfo, pubs).write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath)
+ applyHBtoPubs(pinfo, pubs).write
+ .mode(SaveMode.Overwrite)
+ .option("compression", "gzip")
+ .json(outputPath)
- spark.read.textFile(outputPath)
+ spark.read
+ .textFile(outputPath)
.write
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.text(graphPath + "/publication")
}
-
}

diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala
index 87e203e4b..34798b147 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala
@@ -19,7 +19,6 @@ object SparkPrepareHostedByInfoToApply {
def getList(id: String, j: Journal, name: String): List[EntityInfo] = {
var lst: List[EntityInfo] = List()
-
if (j.getIssnLinking != null && !j.getIssnLinking.equals("")) {
lst = EntityInfo.newInstance(id, j.getIssnLinking, name) :: lst
}
@@ -37,14 +36,14 @@ object SparkPrepareHostedByInfoToApply {
val mapper = new ObjectMapper()
- val dd: Dataset[Publication] = spark.read.textFile(publicationPath)
+ val dd: Dataset[Publication] = spark.read
+ .textFile(publicationPath)
.map(r => mapper.readValue(r, classOf[Publication]))
dd.filter(p => p.getJournal != null).flatMap(p => getList(p.getId, p.getJournal, ""))
}
-
def toEntityInfo(input: String): EntityInfo = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@@ -53,7 +52,6 @@ object SparkPrepareHostedByInfoToApply {
toEntityItem(c.keys.head, c.values.head)
}
-
def toEntityItem(journal_id: String, hbi: HostedByItemType): EntityInfo = {
EntityInfo.newInstance(hbi.id, journal_id, hbi.officialname, hbi.openAccess)
@@ -61,62 +59,69 @@ object SparkPrepareHostedByInfoToApply {
}
def joinResHBM(res: Dataset[EntityInfo], hbm: Dataset[EntityInfo]): Dataset[EntityInfo] = {
- Aggregators.resultToSingleId(res.joinWith(hbm, res.col("journalId").equalTo(hbm.col("journalId")), "left")
- .map(t2 => {
- val res: EntityInfo = t2._1
- if (t2._2 != null) {
- val ds = t2._2
- res.setHostedById(ds.getId)
- res.setOpenAccess(ds.getOpenAccess)
- res.setName(ds.getName)
- }
- res
- }))
+ Aggregators.resultToSingleId(
+ res
+ .joinWith(hbm, res.col("journalId").equalTo(hbm.col("journalId")), "left")
+ .map(t2 => {
+ val res: EntityInfo = t2._1
+ if (t2._2 != null) {
+ val ds = t2._2
+ res.setHostedById(ds.getId)
+ res.setOpenAccess(ds.getOpenAccess)
+ res.setName(ds.getName)
+ }
+ res
+ })
+ )
}
def main(args: Array[String]): Unit = {
-
val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_prepare_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ getClass.getResourceAsStream(
+ "/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_prepare_params.json"
+ )
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
-
+ .master(parser.get("master"))
+ .getOrCreate()
val graphPath = parser.get("graphPath")
val outputPath = parser.get("preparedInfoPath")
val hostedByMapPath = parser.get("hostedByMapPath")
-
implicit val formats = DefaultFormats
-
logger.info("Getting the Datasources")
import spark.implicits._
-
    //STEP1: read the hostedbymap and transform it into EntityInfo
- val hostedByInfo: Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo)
+ val hostedByInfo: Dataset[EntityInfo] =
+ spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo)
//STEP2: create association (publication, issn), (publication, eissn), (publication, lissn)
- val resultInfoDataset: Dataset[EntityInfo] = prepareResultInfo(spark, graphPath + "/publication")
+ val resultInfoDataset: Dataset[EntityInfo] =
+ prepareResultInfo(spark, graphPath + "/publication")
//STEP3: left join resultInfo with hostedByInfo on journal_id. Reduction of all the results with the same id in just
    //one entry (one result could be associated with issn and eissn and so possibly match more than once against the map)
//to this entry we add the id of the datasource for the next step
- joinResHBM(resultInfoDataset, hostedByInfo)
- .write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath)
-
+ joinResHBM(resultInfoDataset, hostedByInfo).write
+ .mode(SaveMode.Overwrite)
+ .option("compression", "gzip")
+ .json(outputPath)
}
-
}
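
toEntityInfo above parses each hosted-by map line, a one-entry JSON object keyed by journal id, with json4s and forwards the single key/value pair. A small sketch of that extraction, assuming a hypothetical Item case class in place of HostedByItemType and a simplified payload:

import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse

case class Item(id: String, officialname: String, openAccess: Boolean)

object HostedByLineSketch {

  // Each input line looks like {"<journal id>": {"id": "...", "officialname": "...", "openAccess": true}}
  def toPair(line: String): (String, Item) = {
    implicit val formats: DefaultFormats.type = DefaultFormats
    val parsed = parse(line).extract[Map[String, Item]]
    (parsed.keys.head, parsed.values.head)
  }

  def main(args: Array[String]): Unit = {
    val (journalId, item) =
      toPair("""{"1234-5678": {"id": "10|doaj::x", "officialname": "Some Journal", "openAccess": true}}""")
    println(s"$journalId -> ${item.officialname} (OA=${item.openAccess})")
  }
}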
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala
index 6dfe35623..8d8965866 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala
@@ -17,9 +17,8 @@ import java.io.PrintWriter
object SparkProduceHostedByMap {
-
- implicit val tupleForJoinEncoder: Encoder[(String, HostedByItemType)] = Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
-
+ implicit val tupleForJoinEncoder: Encoder[(String, HostedByItemType)] =
+ Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
def toHostedByItemType(input: ((HostedByInfo, HostedByInfo), HostedByInfo)): HostedByItemType = {
val openaire: HostedByInfo = input._1._1
@@ -28,9 +27,33 @@ object SparkProduceHostedByMap {
val isOpenAccess: Boolean = doaj == null && gold == null
openaire.journal_id match {
- case Constants.ISSN => HostedByItemType(openaire.id, openaire.officialname, openaire.journal_id, "", "", isOpenAccess)
- case Constants.EISSN => HostedByItemType(openaire.id, openaire.officialname, "", openaire.journal_id, "", isOpenAccess)
- case Constants.ISSNL => HostedByItemType(openaire.id, openaire.officialname, "", "", openaire.journal_id, isOpenAccess)
+ case Constants.ISSN =>
+ HostedByItemType(
+ openaire.id,
+ openaire.officialname,
+ openaire.journal_id,
+ "",
+ "",
+ isOpenAccess
+ )
+ case Constants.EISSN =>
+ HostedByItemType(
+ openaire.id,
+ openaire.officialname,
+ "",
+ openaire.journal_id,
+ "",
+ isOpenAccess
+ )
+ case Constants.ISSNL =>
+ HostedByItemType(
+ openaire.id,
+ openaire.officialname,
+ "",
+ "",
+ openaire.journal_id,
+ isOpenAccess
+ )
// catch the default with a variable so you can print it
case whoa => null
@@ -46,11 +69,16 @@ object SparkProduceHostedByMap {
Serialization.write(map)
-
}
-
- def getHostedByItemType(id: String, officialname: String, issn: String, eissn: String, issnl: String, oa: Boolean): HostedByItemType = {
+ def getHostedByItemType(
+ id: String,
+ officialname: String,
+ issn: String,
+ eissn: String,
+ issnl: String,
+ oa: Boolean
+ ): HostedByItemType = {
if (issn != null) {
if (eissn != null) {
if (issnl != null) {
@@ -85,7 +113,14 @@ object SparkProduceHostedByMap {
def oaToHostedbyItemType(dats: Datasource): HostedByItemType = {
if (dats.getJournal != null) {
- return getHostedByItemType(dats.getId, dats.getOfficialname.getValue, dats.getJournal.getIssnPrinted, dats.getJournal.getIssnOnline, dats.getJournal.getIssnLinking, false)
+ return getHostedByItemType(
+ dats.getId,
+ dats.getOfficialname.getValue,
+ dats.getJournal.getIssnPrinted,
+ dats.getJournal.getIssnOnline,
+ dats.getJournal.getIssnLinking,
+ false
+ )
}
HostedByItemType("", "", "", "", "", false)
}
@@ -94,32 +129,41 @@ object SparkProduceHostedByMap {
import spark.implicits._
-
val mapper = new ObjectMapper()
implicit var encoderD = Encoders.kryo[Datasource]
- val dd: Dataset[Datasource] = spark.read.textFile(datasourcePath)
+ val dd: Dataset[Datasource] = spark.read
+ .textFile(datasourcePath)
.map(r => mapper.readValue(r, classOf[Datasource]))
dd.map { ddt => oaToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
}
-
def goldToHostedbyItemType(gold: UnibiGoldModel): HostedByItemType = {
- return getHostedByItemType(Constants.UNIBI, gold.getTitle, gold.getIssn, "", gold.getIssnL, true)
+ return getHostedByItemType(
+ Constants.UNIBI,
+ gold.getTitle,
+ gold.getIssn,
+ "",
+ gold.getIssnL,
+ true
+ )
}
-
- def goldHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = {
+ def goldHostedByDataset(
+ spark: SparkSession,
+ datasourcePath: String
+ ): Dataset[HostedByItemType] = {
import spark.implicits._
implicit val mapEncoderUnibi: Encoder[UnibiGoldModel] = Encoders.kryo[UnibiGoldModel]
val mapper = new ObjectMapper()
- val dd: Dataset[UnibiGoldModel] = spark.read.textFile(datasourcePath)
+ val dd: Dataset[UnibiGoldModel] = spark.read
+ .textFile(datasourcePath)
.map(r => mapper.readValue(r, classOf[UnibiGoldModel]))
dd.map { ddt => goldToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
@@ -128,17 +172,28 @@ object SparkProduceHostedByMap {
def doajToHostedbyItemType(doaj: DOAJModel): HostedByItemType = {
- return getHostedByItemType(Constants.DOAJ, doaj.getJournalTitle, doaj.getIssn, doaj.getEissn, "", true)
+ return getHostedByItemType(
+ Constants.DOAJ,
+ doaj.getJournalTitle,
+ doaj.getIssn,
+ doaj.getEissn,
+ "",
+ true
+ )
}
- def doajHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = {
+ def doajHostedByDataset(
+ spark: SparkSession,
+ datasourcePath: String
+ ): Dataset[HostedByItemType] = {
import spark.implicits._
implicit val mapEncoderDOAJ: Encoder[DOAJModel] = Encoders.kryo[DOAJModel]
val mapper = new ObjectMapper()
- val dd: Dataset[DOAJModel] = spark.read.textFile(datasourcePath)
+ val dd: Dataset[DOAJModel] = spark.read
+ .textFile(datasourcePath)
.map(r => mapper.readValue(r, classOf[DOAJModel]))
dd.map { ddt => doajToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
@@ -159,7 +214,6 @@ object SparkProduceHostedByMap {
lst
}
-
def writeToHDFS(input: Array[String], outputPath: String, hdfsNameNode: String): Unit = {
val conf = new Configuration()
@@ -169,49 +223,51 @@ object SparkProduceHostedByMap {
val writer = new PrintWriter(output)
try {
input.foreach(hbi => writer.println(hbi))
- }
- finally {
+ } finally {
writer.close()
}
}
-
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_params.json")
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
-
+ .master(parser.get("master"))
+ .getOrCreate()
val datasourcePath = parser.get("datasourcePath")
val workingDirPath = parser.get("workingPath")
val outputPath = parser.get("outputPath")
-
implicit val formats = DefaultFormats
-
logger.info("Getting the Datasources")
-
- Aggregators.explodeHostedByItemType(oaHostedByDataset(spark, datasourcePath)
- .union(goldHostedByDataset(spark, workingDirPath + "/unibi_gold.json"))
- .union(doajHostedByDataset(spark, workingDirPath + "/doaj.json"))
- .flatMap(hbi => toList(hbi))).filter(hbi => hbi._2.id.startsWith("10|"))
+ Aggregators
+ .explodeHostedByItemType(
+ oaHostedByDataset(spark, datasourcePath)
+ .union(goldHostedByDataset(spark, workingDirPath + "/unibi_gold.json"))
+ .union(doajHostedByDataset(spark, workingDirPath + "/doaj.json"))
+ .flatMap(hbi => toList(hbi))
+ )
+ .filter(hbi => hbi._2.id.startsWith("10|"))
.map(hbi => toHostedByMap(hbi))(Encoders.STRING)
- .rdd.saveAsTextFile(outputPath, classOf[GzipCodec])
-
+ .rdd
+ .saveAsTextFile(outputPath, classOf[GzipCodec])
}
-
}
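
The produce step serialises every (journal id, item) pair as a one-key JSON object (toHostedByMap) and stores the lines as gzip-compressed text via saveAsTextFile. A minimal sketch of that output shape, with a hypothetical Entry case class and an arbitrary output path:

import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.sql.{Encoders, SparkSession}
import org.json4s.DefaultFormats
import org.json4s.jackson.Serialization

case class Entry(id: String, officialname: String, openAccess: Boolean)

object HostedByMapOutputSketch {

  // Serialise each pair as {"<journal id>": {...entry...}}, mirroring toHostedByMap above.
  def toLine(journalId: String, entry: Entry): String = {
    implicit val formats: DefaultFormats.type = DefaultFormats
    Serialization.write(Map(journalId -> entry))
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("hostedby-out-sketch").getOrCreate()
    import spark.implicits._
    val entries = Seq(("1234-5678", Entry("10|doaj::x", "Some Journal", openAccess = true))).toDS()
    entries
      .map { case (j, e) => toLine(j, e) }(Encoders.STRING)
      .rdd
      .saveAsTextFile("/tmp/hostedByMap", classOf[GzipCodec])
    spark.stop()
  }
}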
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala
index fa13f477c..533948289 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala
@@ -20,7 +20,13 @@ object CopyHdfsOafSparkApplication {
def main(args: Array[String]): Unit = {
val log = LoggerFactory.getLogger(getClass)
val conf = new SparkConf()
- val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/copy_hdfs_oaf_parameters.json")).mkString)
+ val parser = new ArgumentApplicationParser(
+ Source
+ .fromInputStream(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/copy_hdfs_oaf_parameters.json")
+ )
+ .mkString
+ )
parser.parseArgument(args)
val spark =
@@ -28,7 +34,8 @@ object CopyHdfsOafSparkApplication {
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
+ .master(parser.get("master"))
+ .getOrCreate()
val sc: SparkContext = spark.sparkContext
@@ -49,19 +56,22 @@ object CopyHdfsOafSparkApplication {
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
- val paths = DHPUtils.mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation, true).asScala
+ val paths =
+ DHPUtils.mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation, true).asScala
- val validPaths: List[String] = paths.filter(p => HdfsSupport.exists(p, sc.hadoopConfiguration)).toList
+ val validPaths: List[String] =
+ paths.filter(p => HdfsSupport.exists(p, sc.hadoopConfiguration)).toList
- val types = ModelSupport.oafTypes.entrySet
- .asScala
+ val types = ModelSupport.oafTypes.entrySet.asScala
.map(e => Tuple2(e.getKey, e.getValue))
if (validPaths.nonEmpty) {
val oaf = spark.read.textFile(validPaths: _*)
- val mapper = new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
+ val mapper =
+ new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
- types.foreach(t => oaf
+ types.foreach(t =>
+ oaf
.filter(o => isOafType(o, t._1))
.map(j => mapper.readValue(j, t._2).asInstanceOf[Oaf])
.map(s => mapper.writeValueAsString(s))(Encoders.STRING)
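
CopyHdfsOafSparkApplication deserialises raw mdstore records with a Jackson mapper that ignores unknown properties, so extra fields in older or newer records do not break the import. A short sketch of that lenient mapper setup; Record is a made-up bean, not one of the Oaf subclasses listed in ModelSupport.oafTypes:

import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
import scala.beans.BeanProperty

// Hypothetical record type standing in for the real Oaf classes.
class Record {
  @BeanProperty var id: String = _
  @BeanProperty var title: String = _
}

object LenientMapperSketch {

  def main(args: Array[String]): Unit = {
    val mapper =
      new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
    // "extraField" would fail with the default configuration; here it is silently dropped.
    val json = """{"id": "50|doi_xyz", "title": "A title", "extraField": 42}"""
    val r = mapper.readValue(json, classOf[Record])
    println(s"${r.getId} / ${r.getTitle}")
  }
}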
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala
index 8e15063c2..f5a13e72b 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala
@@ -3,7 +3,7 @@ package eu.dnetlib.dhp.oa.graph.resolution
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.common.EntityType
-import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset,_}
+import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
import org.apache.commons.io.IOUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkConf
@@ -13,20 +13,32 @@ import org.slf4j.{Logger, LoggerFactory}
object SparkResolveEntities {
val mapper = new ObjectMapper()
- val entities = List(EntityType.dataset, EntityType.publication, EntityType.software, EntityType.otherresearchproduct)
+
+ val entities = List(
+ EntityType.dataset,
+ EntityType.publication,
+ EntityType.software,
+ EntityType.otherresearchproduct
+ )
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/resolution/resolve_entities_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ getClass.getResourceAsStream(
+ "/eu/dnetlib/dhp/oa/graph/resolution/resolve_entities_params.json"
+ )
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
-
+ .master(parser.get("master"))
+ .getOrCreate()
val graphBasePath = parser.get("graphBasePath")
log.info(s"graphBasePath -> $graphBasePath")
@@ -38,7 +50,6 @@ object SparkResolveEntities {
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
-
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
fs.mkdirs(new Path(workingPath))
@@ -46,60 +57,84 @@ object SparkResolveEntities {
generateResolvedEntities(spark, workingPath, graphBasePath, targetPath)
}
-
def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: String) = {
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
import spark.implicits._
- val rPid: Dataset[(String, String)] = spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]
- val up: Dataset[(String, Result)] = spark.read.text(unresolvedPath).as[String].map(s => mapper.readValue(s, classOf[Result])).map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
+ val rPid: Dataset[(String, String)] =
+ spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]
+ val up: Dataset[(String, Result)] = spark.read
+ .text(unresolvedPath)
+ .as[String]
+ .map(s => mapper.readValue(s, classOf[Result]))
+ .map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
- rPid.joinWith(up, rPid("_2").equalTo(up("_1")), "inner").map {
- r =>
+ rPid
+ .joinWith(up, rPid("_2").equalTo(up("_1")), "inner")
+ .map { r =>
val result = r._2._2
val dnetId = r._1._1
result.setId(dnetId)
result
- }.write.mode(SaveMode.Overwrite).save(s"$workingPath/resolvedEntities")
+ }
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingPath/resolvedEntities")
}
-
def deserializeObject(input: String, entity: EntityType): Result = {
entity match {
- case EntityType.publication => mapper.readValue(input, classOf[Publication])
- case EntityType.dataset => mapper.readValue(input, classOf[OafDataset])
- case EntityType.software => mapper.readValue(input, classOf[Software])
+ case EntityType.publication => mapper.readValue(input, classOf[Publication])
+ case EntityType.dataset => mapper.readValue(input, classOf[OafDataset])
+ case EntityType.software => mapper.readValue(input, classOf[Software])
case EntityType.otherresearchproduct => mapper.readValue(input, classOf[OtherResearchProduct])
}
}
- def generateResolvedEntities(spark: SparkSession, workingPath: String, graphBasePath: String, targetPath: String) = {
+ def generateResolvedEntities(
+ spark: SparkSession,
+ workingPath: String,
+ graphBasePath: String,
+ targetPath: String
+ ) = {
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
import spark.implicits._
- val re: Dataset[(String, Result)] = spark.read.load(s"$workingPath/resolvedEntities").as[Result].map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
- entities.foreach {
- e => {
+ val re: Dataset[(String, Result)] = spark.read
+ .load(s"$workingPath/resolvedEntities")
+ .as[Result]
+ .map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
+ entities.foreach { e =>
+ {
- val currentEntityDataset: Dataset[(String, Result)] = spark.read.text(s"$graphBasePath/$e").as[String].map(s => deserializeObject(s, e)).map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
+ val currentEntityDataset: Dataset[(String, Result)] = spark.read
+ .text(s"$graphBasePath/$e")
+ .as[String]
+ .map(s => deserializeObject(s, e))
+ .map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
- currentEntityDataset.joinWith(re, currentEntityDataset("_1").equalTo(re("_1")), "left").map(k => {
+ currentEntityDataset
+ .joinWith(re, currentEntityDataset("_1").equalTo(re("_1")), "left")
+ .map(k => {
- val a = k._1
- val b = k._2
- if (b == null)
- a._2
- else {
- a._2.mergeFrom(b._2)
- a._2
- }
- }).map(r => mapper.writeValueAsString(r))(Encoders.STRING)
- .write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$targetPath/$e")
+ val a = k._1
+ val b = k._2
+ if (b == null)
+ a._2
+ else {
+ a._2.mergeFrom(b._2)
+ a._2
+ }
+ })
+ .map(r => mapper.writeValueAsString(r))(Encoders.STRING)
+ .write
+ .mode(SaveMode.Overwrite)
+ .option("compression", "gzip")
+ .text(s"$targetPath/$e")
}
-
}
}
}
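
generateResolvedEntities left-joins each graph entity with its resolved counterpart on the id and merges the two when a match exists. A compact sketch of that join-and-merge step under a toy merge rule; the real code calls Result.mergeFrom instead of the copy shown here, and Entity is an invented stand-in:

import org.apache.spark.sql.{Dataset, SparkSession}

// Hypothetical entity; pidCount stands in for whatever mergeFrom accumulates.
case class Entity(id: String, pidCount: Int)

object MergeResolvedSketch {

  // Keep every graph record and fold in the resolved copy when one exists (left join semantics).
  def mergeWithResolved(
    graph: Dataset[(String, Entity)],
    resolved: Dataset[(String, Entity)]
  )(implicit spark: SparkSession): Dataset[Entity] = {
    import spark.implicits._
    graph
      .joinWith(resolved, graph("_1").equalTo(resolved("_1")), "left")
      .map { case (g, r) =>
        if (r == null) g._2
        else g._2.copy(pidCount = g._2.pidCount + r._2.pidCount) // stand-in for a.mergeFrom(b)
      }
  }

  def main(args: Array[String]): Unit = {
    implicit val spark: SparkSession =
      SparkSession.builder().master("local[*]").appName("merge-resolved-sketch").getOrCreate()
    import spark.implicits._
    val graph    = Seq(("id1", Entity("id1", 1)), ("id2", Entity("id2", 0))).toDS()
    val resolved = Seq(("id1", Entity("id1", 2))).toDS()
    mergeWithResolved(graph, resolved).show(false)
    spark.stop()
  }
}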
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala
index 80c09940f..2567a30a6 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala
@@ -17,18 +17,25 @@ import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}
object SparkResolveRelation {
+
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/resolution/resolve_relations_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ getClass.getResourceAsStream(
+ "/eu/dnetlib/dhp/oa/graph/resolution/resolve_relations_params.json"
+ )
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
-
+ .master(parser.get("master"))
+ .getOrCreate()
val graphBasePath = parser.get("graphBasePath")
log.info(s"graphBasePath -> $graphBasePath")
@@ -41,7 +48,6 @@ object SparkResolveRelation {
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
import spark.implicits._
-
//CLEANING TEMPORARY FOLDER
HdfsSupport.remove(workingPath, spark.sparkContext.hadoopConfiguration)
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
@@ -51,39 +57,49 @@ object SparkResolveRelation {
val mapper: ObjectMapper = new ObjectMapper()
- val rPid: Dataset[(String, String)] = spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]
+ val rPid: Dataset[(String, String)] =
+ spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]
- val relationDs: Dataset[(String, Relation)] = spark.read.text(s"$graphBasePath/relation").as[String]
- .map(s => mapper.readValue(s, classOf[Relation])).as[Relation]
+ val relationDs: Dataset[(String, Relation)] = spark.read
+ .text(s"$graphBasePath/relation")
+ .as[String]
+ .map(s => mapper.readValue(s, classOf[Relation]))
+ .as[Relation]
.map(r => (r.getSource.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
- relationDs.joinWith(rPid, relationDs("_1").equalTo(rPid("_2")), "left").map {
- m =>
+ relationDs
+ .joinWith(rPid, relationDs("_1").equalTo(rPid("_2")), "left")
+ .map { m =>
val sourceResolved = m._2
val currentRelation = m._1._2
if (sourceResolved != null && sourceResolved._1 != null && sourceResolved._1.nonEmpty)
currentRelation.setSource(sourceResolved._1)
currentRelation
- }.write
+ }
+ .write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/relationResolvedSource")
-
- val relationSourceResolved: Dataset[(String, Relation)] = spark.read.load(s"$workingPath/relationResolvedSource").as[Relation]
+ val relationSourceResolved: Dataset[(String, Relation)] = spark.read
+ .load(s"$workingPath/relationResolvedSource")
+ .as[Relation]
.map(r => (r.getTarget.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
- relationSourceResolved.joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_2")), "left").map {
- m =>
+ relationSourceResolved
+ .joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_2")), "left")
+ .map { m =>
val targetResolved = m._2
val currentRelation = m._1._2
if (targetResolved != null && targetResolved._1.nonEmpty)
currentRelation.setTarget(targetResolved._1)
currentRelation
- }
+ }
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/relation_resolved")
- spark.read.load(s"$workingPath/relation_resolved").as[Relation]
+ spark.read
+ .load(s"$workingPath/relation_resolved")
+ .as[Relation]
.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved"))
.map(r => mapper.writeValueAsString(r))
.write
@@ -96,33 +112,31 @@ object SparkResolveRelation {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
val result: List[(String, String)] = for {
- JObject(iObj) <- json \ "instance"
- JField("collectedfrom", JObject(cf)) <- iObj
+ JObject(iObj) <- json \ "instance"
+ JField("collectedfrom", JObject(cf)) <- iObj
JField("instancetype", JObject(instancetype)) <- iObj
- JField("value", JString(collectedFrom)) <- cf
- JField("classname", JString(classname)) <- instancetype
+ JField("value", JString(collectedFrom)) <- cf
+ JField("classname", JString(classname)) <- instancetype
} yield (classname, collectedFrom)
result
}
-
def extractPidsFromRecord(input: String): (String, List[(String, String)]) = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
val id: String = (json \ "id").extract[String]
val result: List[(String, String)] = for {
- JObject(pids) <- json \\ "instance" \ "pid"
- JField("value", JString(pidValue)) <- pids
+ JObject(pids) <- json \\ "instance" \ "pid"
+ JField("value", JString(pidValue)) <- pids
JField("qualifier", JObject(qualifier)) <- pids
- JField("classid", JString(pidType)) <- qualifier
+ JField("classid", JString(pidType)) <- qualifier
} yield (pidValue, pidType)
(id, result)
}
-
private def isRelation(input: String): Boolean = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@@ -132,20 +146,25 @@ object SparkResolveRelation {
source != null
}
- def extractPidResolvedTableFromJsonRDD(spark: SparkSession, graphPath: String, workingPath: String) = {
+ def extractPidResolvedTableFromJsonRDD(
+ spark: SparkSession,
+ graphPath: String,
+ workingPath: String
+ ) = {
import spark.implicits._
- val d: RDD[(String, String)] = spark.sparkContext.textFile(s"$graphPath/*")
+ val d: RDD[(String, String)] = spark.sparkContext
+ .textFile(s"$graphPath/*")
.filter(i => !isRelation(i))
.map(i => extractPidsFromRecord(i))
.filter(s => s != null && s._1 != null && s._2 != null && s._2.nonEmpty)
.flatMap { p =>
- p._2.map(pid =>
- (p._1, DHPUtils.generateUnresolvedIdentifier(pid._1, pid._2))
- )
- }.filter(r => r._1 != null || r._2 != null)
+ p._2.map(pid => (p._1, DHPUtils.generateUnresolvedIdentifier(pid._1, pid._2)))
+ }
+ .filter(r => r._1 != null || r._2 != null)
- spark.createDataset(d)
+ spark
+ .createDataset(d)
.groupByKey(_._2)
.reduceGroups((x, y) => if (x._1.startsWith("50|doi") || x._1.startsWith("50|pmid")) x else y)
.map(s => s._2)
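
extractPidsFromRecord relies on json4s for-comprehensions to collect (pid value, pid type) pairs from nested JSON. A self-contained sketch of that extraction style with a simplified record layout (the real records nest the pids under instance):

import org.json4s._
import org.json4s.jackson.JsonMethods.parse

object PidExtractionSketch {

  def extractPids(input: String): (String, List[(String, String)]) = {
    implicit lazy val formats: DefaultFormats.type = DefaultFormats
    lazy val json: JValue = parse(input)
    val id: String = (json \ "id").extract[String]
    val pids: List[(String, String)] = for {
      JObject(pid)                        <- json \ "pid"
      JField("value", JString(pidValue))  <- pid
      JField("qualifier", JObject(q))     <- pid
      JField("classid", JString(pidType)) <- q
    } yield (pidValue, pidType)
    (id, pids)
  }

  def main(args: Array[String]): Unit = {
    val record =
      """{"id":"unresolved::x","pid":[{"value":"10.1234/abc","qualifier":{"classid":"doi"}}]}"""
    println(extractPids(record)) // -> (unresolved::x,List((10.1234/abc,doi)))
  }
}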
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala
index 9df3b41bd..79b1c22cd 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala
@@ -7,24 +7,26 @@ import org.apache.spark.sql.SparkSession
object SparkDataciteToOAF {
-
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/datacite_to_df_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/datacite_to_df_params.json")
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
-
+ .master(parser.get("master"))
+ .getOrCreate()
val sc = spark.sparkContext
val inputPath = parser.get("inputPath")
-
}
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala
index 9d16cf907..fb90531c5 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala
@@ -11,18 +11,22 @@ import org.slf4j.{Logger, LoggerFactory}
object SparkConvertDatasetToJsonRDD {
-
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
+ .master(parser.get("master"))
+ .getOrCreate()
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
@@ -33,9 +37,13 @@ object SparkConvertDatasetToJsonRDD {
val mapper = new ObjectMapper()
implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
-
resultObject.foreach { item =>
- spark.read.load(s"$sourcePath/$item").as[Result].map(r => mapper.writeValueAsString(r))(Encoders.STRING).rdd.saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec])
+ spark.read
+ .load(s"$sourcePath/$item")
+ .as[Result]
+ .map(r => mapper.writeValueAsString(r))(Encoders.STRING)
+ .rdd
+ .saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec])
}
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala
index 0c54de7c8..bfa07eb69 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala
@@ -15,14 +15,19 @@ object SparkConvertObjectToJson {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_object_json_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_object_json_params.json")
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
+ .master(parser.get("master"))
+ .getOrCreate()
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
@@ -33,24 +38,28 @@ object SparkConvertObjectToJson {
val scholixUpdatePath = parser.get("scholixUpdatePath")
log.info(s"scholixUpdatePath -> $scholixUpdatePath")
-
-
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
-
val mapper = new ObjectMapper
objectType.toLowerCase match {
case "scholix" =>
log.info("Serialize Scholix")
val d: Dataset[Scholix] = spark.read.load(sourcePath).as[Scholix]
- val u :Dataset[Scholix]= spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix]
- d.union(u).repartition(8000).map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.saveAsTextFile(targetPath, classOf[GzipCodec])
+ val u: Dataset[Scholix] = spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix]
+ d.union(u)
+ .repartition(8000)
+ .map(s => mapper.writeValueAsString(s))(Encoders.STRING)
+ .rdd
+ .saveAsTextFile(targetPath, classOf[GzipCodec])
case "summary" =>
log.info("Serialize Summary")
val d: Dataset[ScholixSummary] = spark.read.load(sourcePath).as[ScholixSummary]
- d.map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.repartition(1000).saveAsTextFile(targetPath, classOf[GzipCodec])
+ d.map(s => mapper.writeValueAsString(s))(Encoders.STRING)
+ .rdd
+ .repartition(1000)
+ .saveAsTextFile(targetPath, classOf[GzipCodec])
}
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala
index 2115df1fd..f13c14da5 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala
@@ -7,21 +7,26 @@ import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
+
object SparkConvertRDDtoDataset {
def main(args: Array[String]): Unit = {
-
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
+ .master(parser.get("master"))
+ .getOrCreate()
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
@@ -31,43 +36,79 @@ object SparkConvertRDDtoDataset {
val entityPath = s"$t/entities"
val relPath = s"$t/relation"
val mapper = new ObjectMapper()
- implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
- implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
- implicit val relationEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
- implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct])
- implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software])
-
+ implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
+ implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
+ implicit val relationEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
+ implicit val orpEncoder: Encoder[OtherResearchProduct] =
+ Encoders.kryo(classOf[OtherResearchProduct])
+ implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software])
log.info("Converting dataset")
- val rddDataset =spark.sparkContext.textFile(s"$sourcePath/dataset").map(s => mapper.readValue(s, classOf[OafDataset])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false)
- spark.createDataset(rddDataset).as[OafDataset].write.mode(SaveMode.Overwrite).save(s"$entityPath/dataset")
-
+ val rddDataset = spark.sparkContext
+ .textFile(s"$sourcePath/dataset")
+ .map(s => mapper.readValue(s, classOf[OafDataset]))
+ .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
+ spark
+ .createDataset(rddDataset)
+ .as[OafDataset]
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$entityPath/dataset")
log.info("Converting publication")
- val rddPublication =spark.sparkContext.textFile(s"$sourcePath/publication").map(s => mapper.readValue(s, classOf[Publication])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false)
- spark.createDataset(rddPublication).as[Publication].write.mode(SaveMode.Overwrite).save(s"$entityPath/publication")
+ val rddPublication = spark.sparkContext
+ .textFile(s"$sourcePath/publication")
+ .map(s => mapper.readValue(s, classOf[Publication]))
+ .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
+ spark
+ .createDataset(rddPublication)
+ .as[Publication]
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$entityPath/publication")
log.info("Converting software")
- val rddSoftware =spark.sparkContext.textFile(s"$sourcePath/software").map(s => mapper.readValue(s, classOf[Software])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false)
- spark.createDataset(rddSoftware).as[Software].write.mode(SaveMode.Overwrite).save(s"$entityPath/software")
+ val rddSoftware = spark.sparkContext
+ .textFile(s"$sourcePath/software")
+ .map(s => mapper.readValue(s, classOf[Software]))
+ .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
+ spark
+ .createDataset(rddSoftware)
+ .as[Software]
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$entityPath/software")
log.info("Converting otherresearchproduct")
- val rddOtherResearchProduct =spark.sparkContext.textFile(s"$sourcePath/otherresearchproduct").map(s => mapper.readValue(s, classOf[OtherResearchProduct])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false)
- spark.createDataset(rddOtherResearchProduct).as[OtherResearchProduct].write.mode(SaveMode.Overwrite).save(s"$entityPath/otherresearchproduct")
-
+ val rddOtherResearchProduct = spark.sparkContext
+ .textFile(s"$sourcePath/otherresearchproduct")
+ .map(s => mapper.readValue(s, classOf[OtherResearchProduct]))
+ .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
+ spark
+ .createDataset(rddOtherResearchProduct)
+ .as[OtherResearchProduct]
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$entityPath/otherresearchproduct")
log.info("Converting Relation")
+ val relationSemanticFilter = List(
+ "cites",
+ "iscitedby",
+ "merges",
+ "ismergedin",
+ "HasAmongTopNSimilarDocuments",
+ "IsAmongTopNSimilarDocuments"
+ )
- val relationSemanticFilter = List("cites", "iscitedby","merges", "ismergedin", "HasAmongTopNSimilarDocuments","IsAmongTopNSimilarDocuments" )
-
- val rddRelation =spark.sparkContext.textFile(s"$sourcePath/relation")
+ val rddRelation = spark.sparkContext
+ .textFile(s"$sourcePath/relation")
.map(s => mapper.readValue(s, classOf[Relation]))
- .filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false)
- .filter(r=> r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
+ .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
+ .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
.filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
-
}
}
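
SparkConvertRDDtoDataset parses each JSON line with Jackson, drops records flagged as deletedbyinference, and persists the result as a kryo-encoded Dataset. A hedged sketch of that textFile-to-Dataset conversion with inline sample data and a hypothetical Rec bean instead of the Oaf classes:

import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import scala.beans.BeanProperty

// Hypothetical record; the real job targets Publication, OafDataset, Software, ... with kryo encoders.
class Rec {
  @BeanProperty var id: String = _
  @BeanProperty var deletedbyinference: Boolean = false
}

object RddToDatasetSketch {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("rdd-to-ds-sketch").getOrCreate()
    implicit val recEncoder: Encoder[Rec] = Encoders.kryo(classOf[Rec])
    val mapper = new ObjectMapper()
    // Inline sample; the real job reads sc.textFile(s"$sourcePath/...") with one JSON object per line.
    val lines = spark.sparkContext.parallelize(
      Seq(
        """{"id":"50|abc","deletedbyinference":false}""",
        """{"id":"50|def","deletedbyinference":true}"""
      )
    )
    val rdd = lines
      .map(s => mapper.readValue(s, classOf[Rec]))
      .filter(r => !r.getDeletedbyinference)
    spark.createDataset(rdd).write.mode(SaveMode.Overwrite).save("/tmp/entities/rec")
    spark.stop()
  }
}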
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala
index ed88cfaa6..9d57e5869 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala
@@ -1,7 +1,7 @@
package eu.dnetlib.dhp.sx.graph
import eu.dnetlib.dhp.application.ArgumentApplicationParser
-import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset,_}
+import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql._
@@ -13,82 +13,131 @@ object SparkCreateInputGraph {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/extract_entities_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/extract_entities_params.json")
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
-
+ .master(parser.get("master"))
+ .getOrCreate()
val resultObject = List(
("publication", classOf[Publication]),
("dataset", classOf[OafDataset]),
("software", classOf[Software]),
("otherResearchProduct", classOf[OtherResearchProduct])
-
)
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software])
- implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct])
+ implicit val orpEncoder: Encoder[OtherResearchProduct] =
+ Encoders.kryo(classOf[OtherResearchProduct])
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
-
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
-
val oafDs: Dataset[Oaf] = spark.read.load(s"$sourcePath/*").as[Oaf]
-
log.info("Extract Publication")
- oafDs.filter(o => o.isInstanceOf[Publication]).map(p => p.asInstanceOf[Publication]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/publication")
+ oafDs
+ .filter(o => o.isInstanceOf[Publication])
+ .map(p => p.asInstanceOf[Publication])
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$targetPath/extracted/publication")
log.info("Extract dataset")
- oafDs.filter(o => o.isInstanceOf[OafDataset]).map(p => p.asInstanceOf[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/dataset")
+ oafDs
+ .filter(o => o.isInstanceOf[OafDataset])
+ .map(p => p.asInstanceOf[OafDataset])
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$targetPath/extracted/dataset")
log.info("Extract software")
- oafDs.filter(o => o.isInstanceOf[Software]).map(p => p.asInstanceOf[Software]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/software")
+ oafDs
+ .filter(o => o.isInstanceOf[Software])
+ .map(p => p.asInstanceOf[Software])
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$targetPath/extracted/software")
log.info("Extract otherResearchProduct")
- oafDs.filter(o => o.isInstanceOf[OtherResearchProduct]).map(p => p.asInstanceOf[OtherResearchProduct]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/otherResearchProduct")
+ oafDs
+ .filter(o => o.isInstanceOf[OtherResearchProduct])
+ .map(p => p.asInstanceOf[OtherResearchProduct])
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$targetPath/extracted/otherResearchProduct")
log.info("Extract Relation")
- oafDs.filter(o => o.isInstanceOf[Relation]).map(p => p.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/relation")
+ oafDs
+ .filter(o => o.isInstanceOf[Relation])
+ .map(p => p.asInstanceOf[Relation])
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$targetPath/extracted/relation")
resultObject.foreach { r =>
log.info(s"Make ${r._1} unique")
- makeDatasetUnique(s"$targetPath/extracted/${r._1}", s"$targetPath/preprocess/${r._1}", spark, r._2)
+ makeDatasetUnique(
+ s"$targetPath/extracted/${r._1}",
+ s"$targetPath/preprocess/${r._1}",
+ spark,
+ r._2
+ )
}
}
-
- def extractEntities[T <: Oaf](oafDs: Dataset[Oaf], targetPath: String, clazz: Class[T], log: Logger): Unit = {
+ def extractEntities[T <: Oaf](
+ oafDs: Dataset[Oaf],
+ targetPath: String,
+ clazz: Class[T],
+ log: Logger
+ ): Unit = {
implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz)
log.info(s"Extract ${clazz.getSimpleName}")
- oafDs.filter(o => o.isInstanceOf[T]).map(p => p.asInstanceOf[T]).write.mode(SaveMode.Overwrite).save(targetPath)
+ oafDs
+ .filter(o => o.isInstanceOf[T])
+ .map(p => p.asInstanceOf[T])
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(targetPath)
}
-
- def makeDatasetUnique[T <: Result](sourcePath: String, targetPath: String, spark: SparkSession, clazz: Class[T]): Unit = {
+ def makeDatasetUnique[T <: Result](
+ sourcePath: String,
+ targetPath: String,
+ spark: SparkSession,
+ clazz: Class[T]
+ ): Unit = {
import spark.implicits._
implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz)
val ds: Dataset[T] = spark.read.load(sourcePath).as[T]
- ds.groupByKey(_.getId).reduceGroups { (x, y) =>
- x.mergeFrom(y)
- x
- }.map(_._2).write.mode(SaveMode.Overwrite).save(targetPath)
+ ds.groupByKey(_.getId)
+ .reduceGroups { (x, y) =>
+ x.mergeFrom(y)
+ x
+ }
+ .map(_._2)
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(targetPath)
}
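
makeDatasetUnique deduplicates entities by id with groupByKey plus reduceGroups, merging duplicates pairwise. A minimal sketch under a toy merge rule; the real reduce delegates to Result.mergeFrom and keeps kryo encoders, while Ent here is an invented case class:

import org.apache.spark.sql.{Dataset, SaveMode, SparkSession}

// Hypothetical entity; 'score' stands in for whatever mergeFrom accumulates.
case class Ent(id: String, score: Int)

object MakeUniqueSketch {

  def makeUnique(ds: Dataset[Ent])(implicit spark: SparkSession): Dataset[Ent] = {
    import spark.implicits._
    ds.groupByKey(_.id)
      .reduceGroups((x, y) => x.copy(score = x.score + y.score)) // real code: x.mergeFrom(y); x
      .map(_._2)
  }

  def main(args: Array[String]): Unit = {
    implicit val spark: SparkSession =
      SparkSession.builder().master("local[*]").appName("make-unique-sketch").getOrCreate()
    import spark.implicits._
    val ds = Seq(Ent("a", 1), Ent("a", 2), Ent("b", 5)).toDS()
    makeUnique(ds).write.mode(SaveMode.Overwrite).save("/tmp/unique_ent")
    spark.stop()
  }
}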
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala
index 9930c57af..af19b9698 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala
@@ -17,14 +17,19 @@ object SparkCreateScholix {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_scholix_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_scholix_params.json")
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
+ .master(parser.get("master"))
+ .getOrCreate()
val relationPath = parser.get("relationPath")
log.info(s"relationPath -> $relationPath")
@@ -33,37 +38,46 @@ object SparkCreateScholix {
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
-
implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
import spark.implicits._
-
- val relationDS: Dataset[(String, Relation)] = spark.read.load(relationPath).as[Relation]
- .filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
+ val relationDS: Dataset[(String, Relation)] = spark.read
+ .load(relationPath)
+ .as[Relation]
+ .filter(r =>
+ (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase
+ .contains("merge")
+ )
.map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder))
- val summaryDS: Dataset[(String, ScholixSummary)] = spark.read.load(summaryPath).as[ScholixSummary]
+ val summaryDS: Dataset[(String, ScholixSummary)] = spark.read
+ .load(summaryPath)
+ .as[ScholixSummary]
.map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, summaryEncoder))
-
- relationDS.joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left")
+ relationDS
+ .joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left")
.map { input: ((String, Relation), (String, ScholixSummary)) =>
if (input._1 != null && input._2 != null) {
val rel: Relation = input._1._2
val source: ScholixSummary = input._2._2
(rel.getTarget, ScholixUtils.scholixFromSource(rel, source))
- }
- else null
+ } else null
}(Encoders.tuple(Encoders.STRING, scholixEncoder))
.filter(r => r != null)
- .write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_from_source")
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$targetPath/scholix_from_source")
- val scholixSource: Dataset[(String, Scholix)] = spark.read.load(s"$targetPath/scholix_from_source").as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder))
+ val scholixSource: Dataset[(String, Scholix)] = spark.read
+ .load(s"$targetPath/scholix_from_source")
+ .as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder))
- scholixSource.joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left")
+ scholixSource
+ .joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left")
.map { input: ((String, Scholix), (String, ScholixSummary)) =>
if (input._2 == null) {
null
@@ -72,40 +86,73 @@ object SparkCreateScholix {
val target: ScholixSummary = input._2._2
ScholixUtils.generateCompleteScholix(s, target)
}
- }.filter(s => s != null).write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_one_verse")
+ }
+ .filter(s => s != null)
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$targetPath/scholix_one_verse")
+ val scholix_o_v: Dataset[Scholix] =
+ spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix]
- val scholix_o_v: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix]
-
- scholix_o_v.flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s))).as[Scholix]
+ scholix_o_v
+ .flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s)))
+ .as[Scholix]
.map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, scholixEncoder))
.groupByKey(_._1)
.agg(ScholixUtils.scholixAggregator.toColumn)
.map(s => s._2)
- .write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix")
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$targetPath/scholix")
val scholix_final: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix").as[Scholix]
- val stats: Dataset[(String, String, Long)] = scholix_final.map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType)).groupBy("_1", "_2").agg(count("_1")).as[(String, String, Long)]
-
+ val stats: Dataset[(String, String, Long)] = scholix_final
+ .map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType))
+ .groupBy("_1", "_2")
+ .agg(count("_1"))
+ .as[(String, String, Long)]
stats
- .map(s => RelatedEntities(s._1, if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0, if ("publication".equalsIgnoreCase(s._2)) s._3 else 0))
+ .map(s =>
+ RelatedEntities(
+ s._1,
+ if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0,
+ if ("publication".equalsIgnoreCase(s._2)) s._3 else 0
+ )
+ )
.groupByKey(_.id)
- .reduceGroups((a, b) => RelatedEntities(a.id, a.relatedDataset + b.relatedDataset, a.relatedPublication + b.relatedPublication))
+ .reduceGroups((a, b) =>
+ RelatedEntities(
+ a.id,
+ a.relatedDataset + b.relatedDataset,
+ a.relatedPublication + b.relatedPublication
+ )
+ )
.map(_._2)
- .write.mode(SaveMode.Overwrite).save(s"$targetPath/related_entities")
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$targetPath/related_entities")
- val relatedEntitiesDS: Dataset[RelatedEntities] = spark.read.load(s"$targetPath/related_entities").as[RelatedEntities].filter(r => r.relatedPublication > 0 || r.relatedDataset > 0)
+ val relatedEntitiesDS: Dataset[RelatedEntities] = spark.read
+ .load(s"$targetPath/related_entities")
+ .as[RelatedEntities]
+ .filter(r => r.relatedPublication > 0 || r.relatedDataset > 0)
- relatedEntitiesDS.joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner").map { i =>
- val re = i._1
- val sum = i._2._2
+ relatedEntitiesDS
+ .joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner")
+ .map { i =>
+ val re = i._1
+ val sum = i._2._2
- sum.setRelatedDatasets(re.relatedDataset)
- sum.setRelatedPublications(re.relatedPublication)
- sum
- }.write.mode(SaveMode.Overwrite).save(s"${summaryPath}_filtered")
+ sum.setRelatedDatasets(re.relatedDataset)
+ sum.setRelatedPublications(re.relatedPublication)
+ sum
+ }
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"${summaryPath}_filtered")
}
}
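
The stats block above counts related targets per source identifier and object type with an untyped groupBy/agg(count), then folds the counts back into RelatedEntities per id. A small sketch of that round trip; Related stands in for RelatedEntities and the sample pairs are invented:

import org.apache.spark.sql.functions.count
import org.apache.spark.sql.{Dataset, SparkSession}

// Hypothetical counterpart of RelatedEntities.
case class Related(id: String, relatedDataset: Long, relatedPublication: Long)

object ScholixStatsSketch {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("scholix-stats-sketch").getOrCreate()
    import spark.implicits._
    // (source dnet identifier, target object type) pairs, as produced by the map over scholix_final.
    val pairs: Dataset[(String, String)] =
      Seq(("50|a", "dataset"), ("50|a", "publication"), ("50|a", "dataset"), ("50|b", "publication")).toDS()
    val related: Dataset[Related] = pairs
      .groupBy("_1", "_2")
      .agg(count("_1"))
      .as[(String, String, Long)]
      .map(s =>
        Related(
          s._1,
          if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0,
          if ("publication".equalsIgnoreCase(s._2)) s._3 else 0
        )
      )
      .groupByKey(_.id)
      .reduceGroups((a, b) =>
        Related(a.id, a.relatedDataset + b.relatedDataset, a.relatedPublication + b.relatedPublication)
      )
      .map(_._2)
    related.show(false)
    spark.stop()
  }
}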
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala
index 4274cae5a..6d489e8cb 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala
@@ -14,14 +14,19 @@ object SparkCreateSummaryObject {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_summaries_params.json")))
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_summaries_params.json")
+ )
+ )
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
+ .master(parser.get("master"))
+ .getOrCreate()
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
@@ -33,10 +38,17 @@ object SparkCreateSummaryObject {
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
+ val ds: Dataset[Result] = spark.read
+ .load(s"$sourcePath/*")
+ .as[Result]
+ .filter(r => r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false)
- val ds: Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Result].filter(r => r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false)
-
- ds.repartition(6000).map(r => ScholixUtils.resultToSummary(r)).filter(s => s != null).write.mode(SaveMode.Overwrite).save(targetPath)
+ ds.repartition(6000)
+ .map(r => ScholixUtils.resultToSummary(r))
+ .filter(s => s != null)
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(targetPath)
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala
index c70397d04..23f4da6c7 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala
@@ -10,61 +10,88 @@ import java.util.regex.Pattern
import scala.language.postfixOps
import scala.xml.{Elem, Node, XML}
-case class PangaeaDataModel(identifier:String, title:List[String], objectType:List[String], creator:List[String],
- publisher:List[String], dataCenter :List[String],subject :List[String], language:String,
- rights:String, parent:String,relation :List[String],linkage:List[(String,String)] ) {}
+case class PangaeaDataModel(
+ identifier: String,
+ title: List[String],
+ objectType: List[String],
+ creator: List[String],
+ publisher: List[String],
+ dataCenter: List[String],
+ subject: List[String],
+ language: String,
+ rights: String,
+ parent: String,
+ relation: List[String],
+ linkage: List[(String, String)]
+) {}
object PangaeaUtils {
-
- def toDataset(input:String):PangaeaDataModel = {
+ def toDataset(input: String): PangaeaDataModel = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
- val xml= (json \ "xml").extract[String]
+ val xml = (json \ "xml").extract[String]
parseXml(xml)
}
- def findDOIInRelation( input:List[String]):List[String] = {
+ def findDOIInRelation(input: List[String]): List[String] = {
val pattern = Pattern.compile("\\b(10[.][0-9]{4,}(?:[.][0-9]+)*\\/(?:(?![\"&\\'<>])\\S)+)\\b")
- input.map(i => {
- val matcher = pattern.matcher(i)
- if (matcher.find())
- matcher.group(0)
- else
- null
- }).filter(i => i!= null)
+ input
+ .map(i => {
+ val matcher = pattern.matcher(i)
+ if (matcher.find())
+ matcher.group(0)
+ else
+ null
+ })
+ .filter(i => i != null)
}
- def attributeOpt(attribute: String, node:Node): Option[String] =
+ def attributeOpt(attribute: String, node: Node): Option[String] =
node.attribute(attribute) flatMap (_.headOption) map (_.text)
- def extractLinkage(node:Elem):List[(String, String)] = {
- (node \ "linkage").map(n =>(attributeOpt("type",n), n.text)).filter(t => t._1.isDefined).map(t=> (t._1.get, t._2))(collection.breakOut)
+ def extractLinkage(node: Elem): List[(String, String)] = {
+ (node \ "linkage")
+ .map(n => (attributeOpt("type", n), n.text))
+ .filter(t => t._1.isDefined)
+ .map(t => (t._1.get, t._2))(collection.breakOut)
}
- def parseXml(input:String):PangaeaDataModel = {
+ def parseXml(input: String): PangaeaDataModel = {
val xml = XML.loadString(input)
val identifier = (xml \ "identifier").text
- val title :List[String] = (xml \ "title").map(n => n.text)(collection.breakOut)
- val pType :List[String] = (xml \ "type").map(n => n.text)(collection.breakOut)
- val creators:List[String] = (xml \ "creator").map(n => n.text)(collection.breakOut)
- val publisher :List[String] = (xml \ "publisher").map(n => n.text)(collection.breakOut)
- val dataCenter :List[String] = (xml \ "dataCenter").map(n => n.text)(collection.breakOut)
- val subject :List[String] = (xml \ "subject").map(n => n.text)(collection.breakOut)
- val language= (xml \ "language").text
- val rights= (xml \ "rights").text
- val parentIdentifier= (xml \ "parentIdentifier").text
- val relation :List[String] = (xml \ "relation").map(n => n.text)(collection.breakOut)
+ val title: List[String] = (xml \ "title").map(n => n.text)(collection.breakOut)
+ val pType: List[String] = (xml \ "type").map(n => n.text)(collection.breakOut)
+ val creators: List[String] = (xml \ "creator").map(n => n.text)(collection.breakOut)
+ val publisher: List[String] = (xml \ "publisher").map(n => n.text)(collection.breakOut)
+ val dataCenter: List[String] = (xml \ "dataCenter").map(n => n.text)(collection.breakOut)
+ val subject: List[String] = (xml \ "subject").map(n => n.text)(collection.breakOut)
+ val language = (xml \ "language").text
+ val rights = (xml \ "rights").text
+ val parentIdentifier = (xml \ "parentIdentifier").text
+ val relation: List[String] = (xml \ "relation").map(n => n.text)(collection.breakOut)
val relationFiltered = findDOIInRelation(relation)
- val linkage:List[(String,String)] = extractLinkage(xml)
+ val linkage: List[(String, String)] = extractLinkage(xml)
- PangaeaDataModel(identifier,title, pType, creators,publisher, dataCenter, subject, language, rights, parentIdentifier, relationFiltered, linkage)
+ PangaeaDataModel(
+ identifier,
+ title,
+ pType,
+ creators,
+ publisher,
+ dataCenter,
+ subject,
+ language,
+ rights,
+ parentIdentifier,
+ relationFiltered,
+ linkage
+ )
}
-
- def getDatasetAggregator(): Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] = new Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel]{
-
+ def getDatasetAggregator(): Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] =
+ new Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] {
override def zero: PangaeaDataModel = null
@@ -77,7 +104,7 @@ object PangaeaUtils {
else {
if (b.title != null && b.title.nonEmpty)
b
- else
+ else
a._2
}
@@ -106,7 +133,4 @@ object PangaeaUtils {
override def outputEncoder: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
}
-
-
-
-}
\ No newline at end of file
+}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala
index 2717b7b80..8ff8a8b1a 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala
@@ -11,20 +11,25 @@ import scala.io.Source
object SparkGeneratePanagaeaDataset {
-
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/pangaea/pangaea_to_dataset.json")).mkString)
+ val parser = new ArgumentApplicationParser(
+ Source
+ .fromInputStream(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/pangaea/pangaea_to_dataset.json")
+ )
+ .mkString
+ )
parser.parseArgument(args)
-
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(SparkGeneratePanagaeaDataset.getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
+ .master(parser.get("master"))
+ .getOrCreate()
parser.getObjectMap.asScala.foreach(s => logger.info(s"${s._1} -> ${s._2}"))
logger.info("Converting sequential file into Dataset")
@@ -34,16 +39,20 @@ object SparkGeneratePanagaeaDataset {
implicit val pangaeaEncoders: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
- val inputRDD: RDD[PangaeaDataModel] = sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s))
+ val inputRDD: RDD[PangaeaDataModel] =
+ sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s))
- spark.createDataset(inputRDD).as[PangaeaDataModel]
+ spark
+ .createDataset(inputRDD)
+ .as[PangaeaDataModel]
.map(s => (s.identifier, s))(Encoders.tuple(Encoders.STRING, pangaeaEncoders))
.groupByKey(_._1)(Encoders.STRING)
.agg(PangaeaUtils.getDatasetAggregator().toColumn)
.map(s => s._2)
- .write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset")
+ .write
+ .mode(SaveMode.Overwrite)
+ .save(s"$workingPath/dataset")
}
-
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala
index 4613d5636..7e41e993f 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala
@@ -9,10 +9,10 @@ import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue}
import org.junit.jupiter.api.Test
-class TestApply extends java.io.Serializable{
+class TestApply extends java.io.Serializable {
@Test
- def testApplyOnResult (): Unit = {
+ def testApplyOnResult(): Unit = {
val conf = new SparkConf()
conf.setMaster("local[*]")
conf.set("spark.driver.host", "localhost")
@@ -25,54 +25,104 @@ class TestApply extends java.io.Serializable{
val pub = getClass.getResource("publication.json").getPath
val hbm = getClass.getResource("preparedInfo.json").getPath
- val mapper:ObjectMapper = new ObjectMapper()
+ val mapper: ObjectMapper = new ObjectMapper()
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
implicit val mapEncoderPubInfo: Encoder[Publication] = Encoders.bean(classOf[Publication])
-
- val pub_ds :Dataset[Publication] = spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[Publication]))
- val hbm_ds :Dataset[EntityInfo] = spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
-
+ val pub_ds: Dataset[Publication] =
+ spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[Publication]))
+ val hbm_ds: Dataset[EntityInfo] =
+ spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
assertEquals(13, pub_ds.count())
- val ds:Dataset[Publication] = SparkApplyHostedByMapToResult.applyHBtoPubs(hbm_ds, pub_ds)
+ val ds: Dataset[Publication] = SparkApplyHostedByMapToResult.applyHBtoPubs(hbm_ds, pub_ds)
- assertEquals(13, ds.count)
+ assertEquals(13, ds.count)
- val temp: Dataset[(Publication, Publication)] = pub_ds.joinWith(ds, pub_ds.col("id").equalTo(ds.col("id")), "left")
+ val temp: Dataset[(Publication, Publication)] =
+ pub_ds.joinWith(ds, pub_ds.col("id").equalTo(ds.col("id")), "left")
assertEquals(13, temp.count())
temp.foreach(t2 => {
- val pb : Publication = t2._1
- val pa : Publication = t2._2
+ val pb: Publication = t2._1
+ val pa: Publication = t2._2
assertEquals(1, pa.getInstance().size())
assertEquals(1, pb.getInstance().size())
assertTrue(t2._1.getId.equals(t2._2.getId))
- if(pb.getId.equals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9")){
- assertTrue(pa.getInstance().get(0).getHostedby.getKey.equals("10|issn___print::e4b6d6d978f67520f6f37679a98c5735"))
+ if (pb.getId.equals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9")) {
+ assertTrue(
+ pa.getInstance()
+ .get(0)
+ .getHostedby
+ .getKey
+ .equals("10|issn___print::e4b6d6d978f67520f6f37679a98c5735")
+ )
assertTrue(pa.getInstance().get(0).getHostedby.getValue.equals("Academic Therapy"))
assertTrue(pa.getInstance().get(0).getAccessright.getClassid.equals("OPEN"))
assertTrue(pa.getInstance().get(0).getAccessright.getClassname.equals("Open Access"))
- assertTrue(pa.getInstance().get(0).getAccessright.getOpenAccessRoute.equals(OpenAccessRoute.gold))
+ assertTrue(
+ pa.getInstance().get(0).getAccessright.getOpenAccessRoute.equals(OpenAccessRoute.gold)
+ )
assertTrue(pa.getBestaccessright.getClassid.equals("OPEN"))
assertTrue(pa.getBestaccessright.getClassname.equals("Open Access"))
-
- assertTrue(pb.getInstance().get(0).getHostedby.getKey.equals("10|openaire____::0b74b6a356bbf23c245f9ae9a748745c"))
- assertTrue(pb.getInstance().get(0).getHostedby.getValue.equals("Revistas de investigación Universidad Nacional Mayor de San Marcos"))
+ assertTrue(
+ pb.getInstance()
+ .get(0)
+ .getHostedby
+ .getKey
+ .equals("10|openaire____::0b74b6a356bbf23c245f9ae9a748745c")
+ )
+ assertTrue(
+ pb.getInstance()
+ .get(0)
+ .getHostedby
+ .getValue
+ .equals("Revistas de investigación Universidad Nacional Mayor de San Marcos")
+ )
assertTrue(pb.getInstance().get(0).getAccessright.getClassname.equals("not available"))
assertTrue(pb.getInstance().get(0).getAccessright.getClassid.equals("UNKNOWN"))
assertTrue(pb.getInstance().get(0).getAccessright.getOpenAccessRoute == null)
assertTrue(pb.getBestaccessright.getClassid.equals("UNKNOWN"))
assertTrue(pb.getBestaccessright.getClassname.equals("not available"))
- }else{
- assertTrue(pa.getInstance().get(0).getHostedby.getKey.equals(pb.getInstance().get(0).getHostedby.getKey))
- assertTrue(pa.getInstance().get(0).getHostedby.getValue.equals(pb.getInstance().get(0).getHostedby.getValue))
- assertTrue(pa.getInstance().get(0).getAccessright.getClassid.equals(pb.getInstance().get(0).getAccessright.getClassid))
- assertTrue(pa.getInstance().get(0).getAccessright.getClassname.equals(pb.getInstance().get(0).getAccessright.getClassname))
- assertTrue(pa.getInstance().get(0).getAccessright.getOpenAccessRoute == pb.getInstance().get(0).getAccessright.getOpenAccessRoute)
+ } else {
+ assertTrue(
+ pa.getInstance()
+ .get(0)
+ .getHostedby
+ .getKey
+ .equals(pb.getInstance().get(0).getHostedby.getKey)
+ )
+ assertTrue(
+ pa.getInstance()
+ .get(0)
+ .getHostedby
+ .getValue
+ .equals(pb.getInstance().get(0).getHostedby.getValue)
+ )
+ assertTrue(
+ pa.getInstance()
+ .get(0)
+ .getAccessright
+ .getClassid
+ .equals(pb.getInstance().get(0).getAccessright.getClassid)
+ )
+ assertTrue(
+ pa.getInstance()
+ .get(0)
+ .getAccessright
+ .getClassname
+ .equals(pb.getInstance().get(0).getAccessright.getClassname)
+ )
+ assertTrue(
+ pa.getInstance().get(0).getAccessright.getOpenAccessRoute == pb
+ .getInstance()
+ .get(0)
+ .getAccessright
+ .getOpenAccessRoute
+ )
}
})
@@ -80,9 +130,8 @@ class TestApply extends java.io.Serializable{
spark.close()
}
-
@Test
- def testApplyOnDatasource():Unit = {
+ def testApplyOnDatasource(): Unit = {
val conf = new SparkConf()
conf.setMaster("local[*]")
conf.set("spark.driver.host", "localhost")
@@ -95,38 +144,49 @@ class TestApply extends java.io.Serializable{
val dats = getClass.getResource("datasource.json").getPath
val hbm = getClass.getResource("preparedInfo2.json").getPath
- val mapper:ObjectMapper = new ObjectMapper()
+ val mapper: ObjectMapper = new ObjectMapper()
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
implicit val mapEncoderPubInfo: Encoder[Datasource] = Encoders.bean(classOf[Datasource])
-
- val dats_ds :Dataset[Datasource] = spark.read.textFile(dats).map(p => mapper.readValue(p, classOf[Datasource]))
- val hbm_ds :Dataset[EntityInfo] = Aggregators.datasourceToSingleId(spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo])))
-
+ val dats_ds: Dataset[Datasource] =
+ spark.read.textFile(dats).map(p => mapper.readValue(p, classOf[Datasource]))
+ val hbm_ds: Dataset[EntityInfo] = Aggregators.datasourceToSingleId(
+ spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
+ )
assertEquals(10, dats_ds.count())
- val ds:Dataset[Datasource] = SparkApplyHostedByMapToDatasource.applyHBtoDats(hbm_ds, dats_ds)
+ val ds: Dataset[Datasource] = SparkApplyHostedByMapToDatasource.applyHBtoDats(hbm_ds, dats_ds)
- assertEquals(10, ds.count)
+ assertEquals(10, ds.count)
- val temp: Dataset[(Datasource, Datasource)] = dats_ds.joinWith(ds, dats_ds.col("id").equalTo(ds.col("id")), "left")
+ val temp: Dataset[(Datasource, Datasource)] =
+ dats_ds.joinWith(ds, dats_ds.col("id").equalTo(ds.col("id")), "left")
assertEquals(10, temp.count())
temp.foreach(t2 => {
- val pb : Datasource = t2._1
- val pa : Datasource = t2._2
+ val pb: Datasource = t2._1
+ val pa: Datasource = t2._2
assertTrue(t2._1.getId.equals(t2._2.getId))
- if(pb.getId.equals("10|doajarticles::0ab37b7620eb9a73ac95d3ca4320c97d")) {
+ if (pb.getId.equals("10|doajarticles::0ab37b7620eb9a73ac95d3ca4320c97d")) {
assertTrue(pa.getOpenairecompatibility().getClassid.equals("hostedBy"))
- assertTrue(pa.getOpenairecompatibility().getClassname.equals("collected from a compatible aggregator"))
+ assertTrue(
+ pa.getOpenairecompatibility()
+ .getClassname
+ .equals("collected from a compatible aggregator")
+ )
assertTrue(pb.getOpenairecompatibility().getClassid.equals(ModelConstants.UNKNOWN))
-
} else {
- assertTrue(pa.getOpenairecompatibility().getClassid.equals(pb.getOpenairecompatibility.getClassid))
- assertTrue(pa.getOpenairecompatibility().getClassname.equals(pb.getOpenairecompatibility.getClassname))
+ assertTrue(
+ pa.getOpenairecompatibility().getClassid.equals(pb.getOpenairecompatibility.getClassid)
+ )
+ assertTrue(
+ pa.getOpenairecompatibility()
+ .getClassname
+ .equals(pb.getOpenairecompatibility.getClassname)
+ )
}
})
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala
index 7abce547f..5fc29e3b0 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala
@@ -9,9 +9,9 @@ import org.json4s.DefaultFormats
import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue}
import org.junit.jupiter.api.Test
-class TestPrepare extends java.io.Serializable{
+class TestPrepare extends java.io.Serializable {
- def getString(input:HostedByItemType):String = {
+ def getString(input: HostedByItemType): String = {
import org.json4s.jackson.Serialization.write
implicit val formats = DefaultFormats
@@ -19,9 +19,8 @@ class TestPrepare extends java.io.Serializable{
write(input)
}
-
@Test
- def testHostedByMaptoEntityInfo() : Unit = {
+ def testHostedByMaptoEntityInfo(): Unit = {
val conf = new SparkConf()
conf.setMaster("local[*]")
conf.set("spark.driver.host", "localhost")
@@ -33,23 +32,23 @@ class TestPrepare extends java.io.Serializable{
.getOrCreate()
val hbm = getClass.getResource("hostedbymap.json").getPath
-
import spark.implicits._
- val mapper:ObjectMapper = new ObjectMapper()
+ val mapper: ObjectMapper = new ObjectMapper()
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
- val ds :Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hbm)).map(toEntityInfo)
+ val ds: Dataset[EntityInfo] =
+ spark.createDataset(spark.sparkContext.textFile(hbm)).map(toEntityInfo)
ds.foreach(e => println(mapper.writeValueAsString(e)))
- assertEquals(20, ds.count)
+ assertEquals(20, ds.count)
spark.close()
}
@Test
- def testPublicationtoEntityInfo() : Unit = {
+ def testPublicationtoEntityInfo(): Unit = {
val conf = new SparkConf()
conf.setMaster("local[*]")
conf.set("spark.driver.host", "localhost")
@@ -61,24 +60,30 @@ class TestPrepare extends java.io.Serializable{
.getOrCreate()
val path = getClass.getResource("publication.json").getPath
- val mapper:ObjectMapper = new ObjectMapper()
+ val mapper: ObjectMapper = new ObjectMapper()
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
- val ds :Dataset[EntityInfo] = prepareResultInfo(spark, path)
+ val ds: Dataset[EntityInfo] = prepareResultInfo(spark, path)
ds.foreach(e => println(mapper.writeValueAsString(e)))
- assertEquals(2, ds.count)
+ assertEquals(2, ds.count)
- assertEquals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9", ds.filter(ei => ei.getJournalId.equals("1728-5852")).first().getId)
- assertEquals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9", ds.filter(ei => ei.getJournalId.equals("0001-396X")).first().getId)
+ assertEquals(
+ "50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9",
+ ds.filter(ei => ei.getJournalId.equals("1728-5852")).first().getId
+ )
+ assertEquals(
+ "50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9",
+ ds.filter(ei => ei.getJournalId.equals("0001-396X")).first().getId
+ )
spark.close()
}
@Test
- def testJoinResHBM (): Unit = {
+ def testJoinResHBM(): Unit = {
val conf = new SparkConf()
conf.setMaster("local[*]")
conf.set("spark.driver.host", "localhost")
@@ -91,18 +96,20 @@ class TestPrepare extends java.io.Serializable{
val pub = getClass.getResource("iteminfofrompublication").getPath
val hbm = getClass.getResource("iteminfofromhostedbymap.json").getPath
- val mapper:ObjectMapper = new ObjectMapper()
+ val mapper: ObjectMapper = new ObjectMapper()
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
- val pub_ds :Dataset[EntityInfo] = spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
- val hbm_ds :Dataset[EntityInfo] = spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
+ val pub_ds: Dataset[EntityInfo] =
+ spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
+ val hbm_ds: Dataset[EntityInfo] =
+ spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
val ds: Dataset[EntityInfo] = joinResHBM(pub_ds, hbm_ds)
- assertEquals(1, ds.count)
+ assertEquals(1, ds.count)
- val ei:EntityInfo = ds.first()
+ val ei: EntityInfo = ds.first()
assertEquals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9", ei.getId)
assertEquals("10|issn___print::e4b6d6d978f67520f6f37679a98c5735", ei.getHostedById)
@@ -114,7 +121,7 @@ class TestPrepare extends java.io.Serializable{
}
@Test
- def testJoinResHBM2 (): Unit = {
+ def testJoinResHBM2(): Unit = {
val conf = new SparkConf()
conf.setMaster("local[*]")
conf.set("spark.driver.host", "localhost")
@@ -127,18 +134,20 @@ class TestPrepare extends java.io.Serializable{
val pub = getClass.getResource("iteminfofrompublication2").getPath
val hbm = getClass.getResource("iteminfofromhostedbymap2.json").getPath
- val mapper:ObjectMapper = new ObjectMapper()
+ val mapper: ObjectMapper = new ObjectMapper()
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
- val pub_ds :Dataset[EntityInfo] = spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
- val hbm_ds :Dataset[EntityInfo] = spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
+ val pub_ds: Dataset[EntityInfo] =
+ spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
+ val hbm_ds: Dataset[EntityInfo] =
+ spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
val ds: Dataset[EntityInfo] = joinResHBM(pub_ds, hbm_ds)
- assertEquals(1, ds.count)
+ assertEquals(1, ds.count)
- val ei:EntityInfo = ds.first()
+ val ei: EntityInfo = ds.first()
assertEquals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9", ei.getId)
assertEquals("10|issn___print::e4b6d6d978f67520f6f37679a98c5735", ei.getHostedById)
@@ -150,6 +159,4 @@ class TestPrepare extends java.io.Serializable{
spark.close()
}
-
-
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala
index 0922f2e19..12879c466 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala
@@ -8,20 +8,19 @@ import org.json4s.jackson.Serialization.write
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test
-class TestPreprocess extends java.io.Serializable{
+class TestPreprocess extends java.io.Serializable {
implicit val mapEncoderDats: Encoder[Datasource] = Encoders.kryo[Datasource]
implicit val schema = Encoders.product[HostedByInfo]
-
- def toHBIString (hbi:HostedByItemType): String = {
+ def toHBIString(hbi: HostedByItemType): String = {
implicit val formats = DefaultFormats
write(hbi)
}
@Test
- def readDatasource():Unit = {
+ def readDatasource(): Unit = {
val conf = new SparkConf()
conf.setMaster("local[*]")
conf.set("spark.driver.host", "localhost")
@@ -33,29 +32,40 @@ class TestPreprocess extends java.io.Serializable{
.getOrCreate()
val path = getClass.getResource("datasource.json").getPath
- val ds :Dataset[HostedByItemType]= SparkProduceHostedByMap.oaHostedByDataset(spark, path)
+ val ds: Dataset[HostedByItemType] = SparkProduceHostedByMap.oaHostedByDataset(spark, path)
- assertEquals(9, ds.count)
+ assertEquals(9, ds.count)
assertEquals(8, ds.filter(hbi => !hbi.issn.equals("")).count)
assertEquals(5, ds.filter(hbi => !hbi.eissn.equals("")).count)
assertEquals(0, ds.filter(hbi => !hbi.lissn.equals("")).count)
- assertEquals(0, ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count)
+ assertEquals(
+ 0,
+ ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count
+ )
assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365")).count == 1)
assertTrue(ds.filter(hbi => hbi.eissn.equals("2253-900X")).count == 1)
- assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.eissn.equals("2253-900X")).count == 1)
- assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.officialname.equals("Thémata")).count == 1)
- assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.id.equals("10|doajarticles::abbc9265bea9ff62776a1c39785af00c")).count == 1)
+ assertTrue(
+ ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.eissn.equals("2253-900X")).count == 1
+ )
+ assertTrue(
+ ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.officialname.equals("Thémata")).count == 1
+ )
+ assertTrue(
+ ds.filter(hbi =>
+ hbi.issn.equals("0212-8365") && hbi.id
+ .equals("10|doajarticles::abbc9265bea9ff62776a1c39785af00c")
+ ).count == 1
+ )
ds.foreach(hbi => assertTrue(hbi.id.startsWith("10|")))
ds.foreach(hbi => println(toHBIString(hbi)))
spark.close()
}
-
@Test
- def readGold():Unit = {
+ def readGold(): Unit = {
val conf = new SparkConf()
conf.setMaster("local[*]")
conf.set("spark.driver.host", "localhost")
@@ -67,8 +77,7 @@ class TestPreprocess extends java.io.Serializable{
.getOrCreate()
val path = getClass.getResource("unibi_transformed.json").getPath
-
- val ds :Dataset[HostedByItemType]= SparkProduceHostedByMap.goldHostedByDataset(spark, path)
+ val ds: Dataset[HostedByItemType] = SparkProduceHostedByMap.goldHostedByDataset(spark, path)
assertEquals(29, ds.count)
@@ -76,9 +85,17 @@ class TestPreprocess extends java.io.Serializable{
assertEquals(0, ds.filter(hbi => !hbi.eissn.equals("")).count)
assertEquals(29, ds.filter(hbi => !hbi.lissn.equals("")).count)
- assertEquals(0, ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count)
+ assertEquals(
+ 0,
+ ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count
+ )
- assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).first().officialname.equals("European journal of sustainable development."))
+ assertTrue(
+ ds.filter(hbi => hbi.issn.equals("2239-6101"))
+ .first()
+ .officialname
+ .equals("European journal of sustainable development.")
+ )
assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).first().lissn.equals("2239-5938"))
assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).count == 1)
ds.foreach(hbi => assertTrue(hbi.id.equals(Constants.UNIBI)))
@@ -88,7 +105,7 @@ class TestPreprocess extends java.io.Serializable{
}
@Test
- def readDoaj():Unit = {
+ def readDoaj(): Unit = {
val conf = new SparkConf()
conf.setMaster("local[*]")
conf.set("spark.driver.host", "localhost")
@@ -100,7 +117,7 @@ class TestPreprocess extends java.io.Serializable{
.getOrCreate()
val path = getClass.getResource("doaj_transformed.json").getPath
- val ds :Dataset[HostedByItemType]= SparkProduceHostedByMap.doajHostedByDataset(spark, path)
+ val ds: Dataset[HostedByItemType] = SparkProduceHostedByMap.doajHostedByDataset(spark, path)
assertEquals(25, ds.count)
@@ -108,9 +125,17 @@ class TestPreprocess extends java.io.Serializable{
assertEquals(21, ds.filter(hbi => !hbi.eissn.equals("")).count)
assertEquals(0, ds.filter(hbi => !hbi.lissn.equals("")).count)
- assertEquals(0, ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count)
+ assertEquals(
+ 0,
+ ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count
+ )
- assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).first().officialname.equals("Journal of Space Technology"))
+ assertTrue(
+ ds.filter(hbi => hbi.issn.equals("2077-3099"))
+ .first()
+ .officialname
+ .equals("Journal of Space Technology")
+ )
assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).first().eissn.equals("2411-5029"))
assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).count == 1)
assertTrue(ds.filter(hbi => hbi.eissn.equals("2077-2955")).first().issn.equals(""))
@@ -121,7 +146,7 @@ class TestPreprocess extends java.io.Serializable{
}
@Test
- def testAggregator() : Unit = {
+ def testAggregator(): Unit = {
val conf = new SparkConf()
conf.setMaster("local[*]")
@@ -133,22 +158,40 @@ class TestPreprocess extends java.io.Serializable{
.config(conf)
.getOrCreate()
-
- val tmp = SparkProduceHostedByMap.oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath)
- .union(SparkProduceHostedByMap.goldHostedByDataset(spark,getClass.getResource("unibi_transformed.json").getPath))
- .union(SparkProduceHostedByMap.doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath))
- .flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]))
+ val tmp = SparkProduceHostedByMap
+ .oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath)
+ .union(
+ SparkProduceHostedByMap
+ .goldHostedByDataset(spark, getClass.getResource("unibi_transformed.json").getPath)
+ )
+ .union(
+ SparkProduceHostedByMap
+ .doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath)
+ )
+ .flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(
+ Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
+ )
assertEquals(106, tmp.count)
assertEquals(82, tmp.map(i => i._1)(Encoders.STRING).distinct().count)
+ val ds: Dataset[(String, HostedByItemType)] = Aggregators.explodeHostedByItemType(
+ SparkProduceHostedByMap
+ .oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath)
+ .union(
+ SparkProduceHostedByMap
+ .goldHostedByDataset(spark, getClass.getResource("unibi_transformed.json").getPath)
+ )
+ .union(
+ SparkProduceHostedByMap
+ .doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath)
+ )
+ .flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(
+ Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
+ )
+ )
- val ds :Dataset[(String, HostedByItemType)] = Aggregators.explodeHostedByItemType(SparkProduceHostedByMap.oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath)
- .union(SparkProduceHostedByMap.goldHostedByDataset(spark,getClass.getResource("unibi_transformed.json").getPath))
- .union(SparkProduceHostedByMap.doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath))
- .flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])))
-
- assertEquals(82, ds.count)
+ assertEquals(82, ds.count)
assertEquals(13, ds.filter(i => i._2.id.startsWith("10|")).count)
@@ -156,14 +199,13 @@ class TestPreprocess extends java.io.Serializable{
assertTrue(ds.filter(i => i._1.equals("2077-3757")).first()._2.openAccess)
assertEquals(1, ds.filter(i => i._1.equals("2077-3757")).count)
- val hbmap : Dataset[String] = ds.filter(hbi => hbi._2.id.startsWith("10|")).map(SparkProduceHostedByMap.toHostedByMap)(Encoders.STRING)
+ val hbmap: Dataset[String] = ds
+ .filter(hbi => hbi._2.id.startsWith("10|"))
+ .map(SparkProduceHostedByMap.toHostedByMap)(Encoders.STRING)
hbmap.foreach(entry => println(entry))
spark.close()
}
-
-
-
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala
index c22243f94..c8e41743f 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala
@@ -1,6 +1,5 @@
package eu.dnetlib.dhp.oa.graph.resolution
-
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.common.EntityType
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
@@ -19,174 +18,225 @@ import scala.io.Source
@TestInstance(Lifecycle.PER_CLASS)
class ResolveEntitiesTest extends Serializable {
- var workingDir:Path = null
+ var workingDir: Path = null
val FAKE_TITLE = "FAKETITLE"
val FAKE_SUBJECT = "FAKESUBJECT"
- var sparkSession:Option[SparkSession] = None
-
+ var sparkSession: Option[SparkSession] = None
@BeforeAll
- def setUp() :Unit = {
+ def setUp(): Unit = {
workingDir = Files.createTempDirectory(getClass.getSimpleName)
val conf = new SparkConf()
- sparkSession = Some(SparkSession
- .builder()
- .config(conf)
- .appName(getClass.getSimpleName)
- .master("local[*]").getOrCreate())
+ sparkSession = Some(
+ SparkSession
+ .builder()
+ .config(conf)
+ .appName(getClass.getSimpleName)
+ .master("local[*]")
+ .getOrCreate()
+ )
populateDatasets(sparkSession.get)
generateUpdates(sparkSession.get)
}
-
@AfterAll
- def tearDown():Unit = {
+ def tearDown(): Unit = {
FileUtils.deleteDirectory(workingDir.toFile)
sparkSession.get.stop()
-
}
-
- def generateUpdates(spark:SparkSession):Unit = {
+ def generateUpdates(spark: SparkSession): Unit = {
val template = Source.fromInputStream(this.getClass.getResourceAsStream("updates")).mkString
+ val pids: List[String] = template.lines
+ .map { id =>
+ val r = new Result
+ r.setId(id.toLowerCase.trim)
+ r.setSubject(
+ List(
+ OafMapperUtils.structuredProperty(
+ FAKE_SUBJECT,
+ OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
+ null
+ )
+ ).asJava
+ )
+ r.setTitle(
+ List(
+ OafMapperUtils.structuredProperty(
+ FAKE_TITLE,
+ OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
+ null
+ )
+ ).asJava
+ )
+ r
+ }
+ .map { r =>
+ val mapper = new ObjectMapper()
- val pids:List[String] = template.lines.map{id =>
- val r = new Result
- r.setId(id.toLowerCase.trim)
- r.setSubject(List(OafMapperUtils.structuredProperty(FAKE_SUBJECT, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", "fossiFIgo"), null)).asJava)
- r.setTitle(List(OafMapperUtils.structuredProperty(FAKE_TITLE, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", "fossiFIgo"), null)).asJava)
- r
- }.map{r =>
- val mapper = new ObjectMapper()
+ mapper.writeValueAsString(r)
+ }
+ .toList
- mapper.writeValueAsString(r)}.toList
-
-
- val sc =spark.sparkContext
+ val sc = spark.sparkContext
println(sc.parallelize(pids).count())
- spark.createDataset(sc.parallelize(pids))(Encoders.STRING).write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$workingDir/updates")
-
-
-
-
+ spark
+ .createDataset(sc.parallelize(pids))(Encoders.STRING)
+ .write
+ .mode(SaveMode.Overwrite)
+ .option("compression", "gzip")
+ .text(s"$workingDir/updates")
import spark.implicits._
implicit val resEncoder: Encoder[Result] = Encoders.bean(classOf[Result])
- val ds = spark.read.text(s"$workingDir/updates").as[String].map{s => val mapper = new ObjectMapper()
- mapper.readValue(s, classOf[Result])}.collect()
-
-
-
+ val ds = spark.read
+ .text(s"$workingDir/updates")
+ .as[String]
+ .map { s =>
+ val mapper = new ObjectMapper()
+ mapper.readValue(s, classOf[Result])
+ }
+ .collect()
assertEquals(4, ds.length)
- ds.foreach{r => assertNotNull(r.getSubject)}
- ds.foreach{r => assertEquals(1,r.getSubject.size())}
- ds.foreach{r => assertNotNull(r.getTitle)}
- ds.foreach{r => assertEquals(1,r.getTitle.size())}
+ ds.foreach { r => assertNotNull(r.getSubject) }
+ ds.foreach { r => assertEquals(1, r.getSubject.size()) }
+ ds.foreach { r => assertNotNull(r.getTitle) }
+ ds.foreach { r => assertEquals(1, r.getTitle.size()) }
-
-
- ds.flatMap(r => r.getTitle.asScala.map(t => t.getValue)).foreach(t => assertEquals(FAKE_TITLE,t))
- ds.flatMap(r => r.getSubject.asScala.map(t => t.getValue)).foreach(t => assertEquals(FAKE_SUBJECT,t))
+ ds.flatMap(r => r.getTitle.asScala.map(t => t.getValue))
+ .foreach(t => assertEquals(FAKE_TITLE, t))
+ ds.flatMap(r => r.getSubject.asScala.map(t => t.getValue))
+ .foreach(t => assertEquals(FAKE_SUBJECT, t))
println("generated Updates")
}
-
- def populateDatasets(spark:SparkSession):Unit = {
+ def populateDatasets(spark: SparkSession): Unit = {
import spark.implicits._
- val entities =SparkResolveEntities.entities
+ val entities = SparkResolveEntities.entities
- entities.foreach{
- e =>
- val template = Source.fromInputStream(this.getClass.getResourceAsStream(s"$e")).mkString
- spark.createDataset(spark.sparkContext.parallelize(template.lines.toList)).as[String].write.option("compression", "gzip").text(s"$workingDir/graph/$e")
- println(s"Created Dataset $e")
+ entities.foreach { e =>
+ val template = Source.fromInputStream(this.getClass.getResourceAsStream(s"$e")).mkString
+ spark
+ .createDataset(spark.sparkContext.parallelize(template.lines.toList))
+ .as[String]
+ .write
+ .option("compression", "gzip")
+ .text(s"$workingDir/graph/$e")
+ println(s"Created Dataset $e")
}
- SparkResolveRelation.extractPidResolvedTableFromJsonRDD(spark, s"$workingDir/graph", s"$workingDir/work")
+ SparkResolveRelation.extractPidResolvedTableFromJsonRDD(
+ spark,
+ s"$workingDir/graph",
+ s"$workingDir/work"
+ )
}
-
@Test
- def testResolution():Unit = {
- val spark:SparkSession = sparkSession.get
+ def testResolution(): Unit = {
+ val spark: SparkSession = sparkSession.get
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
- SparkResolveEntities.resolveEntities(spark,s"$workingDir/work", s"$workingDir/updates" )
+ SparkResolveEntities.resolveEntities(spark, s"$workingDir/work", s"$workingDir/updates")
val ds = spark.read.load(s"$workingDir/work/resolvedEntities").as[Result]
assertEquals(3, ds.count())
- ds.collect().foreach{
- r =>
+ ds.collect().foreach { r =>
assertTrue(r.getId.startsWith("50"))
}
}
-
-
-
- private def structuredPContainsValue(l:java.util.List[StructuredProperty], exptectedValue:String):Boolean = {
- l.asScala.exists(p =>p.getValue!= null && p.getValue.equalsIgnoreCase(exptectedValue))
+ private def structuredPContainsValue(
+ l: java.util.List[StructuredProperty],
+ exptectedValue: String
+ ): Boolean = {
+ l.asScala.exists(p => p.getValue != null && p.getValue.equalsIgnoreCase(exptectedValue))
}
@Test
- def testUpdate():Unit = {
- val spark:SparkSession = sparkSession.get
+ def testUpdate(): Unit = {
+ val spark: SparkSession = sparkSession.get
import spark.implicits._
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
val m = new ObjectMapper()
- SparkResolveEntities.resolveEntities(spark,s"$workingDir/work", s"$workingDir/updates" )
- SparkResolveEntities.generateResolvedEntities(spark,s"$workingDir/work",s"$workingDir/graph", s"$workingDir/target" )
-
-
-
- val pubDS:Dataset[Result] = spark.read.text(s"$workingDir/target/publication").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.publication))
- val t = pubDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
+ SparkResolveEntities.resolveEntities(spark, s"$workingDir/work", s"$workingDir/updates")
+ SparkResolveEntities.generateResolvedEntities(
+ spark,
+ s"$workingDir/work",
+ s"$workingDir/graph",
+ s"$workingDir/target"
+ )
+ val pubDS: Dataset[Result] = spark.read
+ .text(s"$workingDir/target/publication")
+ .as[String]
+ .map(s => SparkResolveEntities.deserializeObject(s, EntityType.publication))
+ val t = pubDS
+ .filter(p => p.getTitle != null && p.getSubject != null)
+ .filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE")))
+ .count()
var ct = pubDS.count()
- var et = pubDS.filter(p => p.getTitle!= null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)).count()
+ var et = pubDS
+ .filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty))
+ .count()
assertEquals(ct, et)
-
-
- val datDS:Dataset[Result] = spark.read.text(s"$workingDir/target/dataset").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.dataset))
- val td = datDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
+ val datDS: Dataset[Result] = spark.read
+ .text(s"$workingDir/target/dataset")
+ .as[String]
+ .map(s => SparkResolveEntities.deserializeObject(s, EntityType.dataset))
+ val td = datDS
+ .filter(p => p.getTitle != null && p.getSubject != null)
+ .filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE")))
+ .count()
ct = datDS.count()
- et = datDS.filter(p => p.getTitle!= null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)).count()
+ et = datDS
+ .filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty))
+ .count()
assertEquals(ct, et)
-
- val softDS:Dataset[Result] = spark.read.text(s"$workingDir/target/software").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.software))
- val ts = softDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
+ val softDS: Dataset[Result] = spark.read
+ .text(s"$workingDir/target/software")
+ .as[String]
+ .map(s => SparkResolveEntities.deserializeObject(s, EntityType.software))
+ val ts = softDS
+ .filter(p => p.getTitle != null && p.getSubject != null)
+ .filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE")))
+ .count()
ct = softDS.count()
- et = softDS.filter(p => p.getTitle!= null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)).count()
+ et = softDS
+ .filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty))
+ .count()
assertEquals(ct, et)
-
- val orpDS:Dataset[Result] = spark.read.text(s"$workingDir/target/otherresearchproduct").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.otherresearchproduct))
- val to = orpDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
-
+ val orpDS: Dataset[Result] = spark.read
+ .text(s"$workingDir/target/otherresearchproduct")
+ .as[String]
+ .map(s => SparkResolveEntities.deserializeObject(s, EntityType.otherresearchproduct))
+ val to = orpDS
+ .filter(p => p.getTitle != null && p.getSubject != null)
+ .filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE")))
+ .count()
ct = orpDS.count()
- et = orpDS.filter(p => p.getTitle!= null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)).count()
+ et = orpDS
+ .filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty))
+ .count()
assertEquals(ct, et)
-
-
-
-
assertEquals(0, t)
assertEquals(2, td)
assertEquals(1, ts)
@@ -194,40 +244,35 @@ class ResolveEntitiesTest extends Serializable {
}
-
-
-
-
@Test
- def testMerge():Unit = {
+ def testMerge(): Unit = {
val r = new Result
- r.setSubject(List(OafMapperUtils.structuredProperty(FAKE_SUBJECT, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", "fossiFIgo"), null)).asJava)
+ r.setSubject(
+ List(
+ OafMapperUtils.structuredProperty(
+ FAKE_SUBJECT,
+ OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
+ null
+ )
+ ).asJava
+ )
val mapper = new ObjectMapper()
- val p = mapper.readValue(Source.fromInputStream(this.getClass.getResourceAsStream(s"publication")).mkString.lines.next(), classOf[Publication])
-
+ val p = mapper.readValue(
+ Source
+ .fromInputStream(this.getClass.getResourceAsStream(s"publication"))
+ .mkString
+ .lines
+ .next(),
+ classOf[Publication]
+ )
r.mergeFrom(p)
-
println(mapper.writeValueAsString(r))
-
-
-
-
-
-
-
}
-
-
-
-
-
-
-
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/RetrieveDataciteDeltaTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/RetrieveDataciteDeltaTest.scala
index c277b0aa1..80ea9d59c 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/RetrieveDataciteDeltaTest.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/RetrieveDataciteDeltaTest.scala
@@ -1,26 +1,20 @@
package eu.dnetlib.dhp.sx.graph
+
import org.junit.jupiter.api.Test
import java.text.SimpleDateFormat
-
-
class RetrieveDataciteDeltaTest {
@Test
def testParsingDate(): Unit = {
-
val inputDate = "2021-12-02T11:17:36+0000"
val t = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ").parse(inputDate).getTime
-
println(t)
-
-
}
-
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala
index 04b1f9ecd..e92f36896 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala
@@ -18,37 +18,40 @@ import scala.collection.JavaConverters._
import scala.io.Source
@ExtendWith(Array(classOf[MockitoExtension]))
-class ScholixGraphTest extends AbstractVocabularyTest{
-
+class ScholixGraphTest extends AbstractVocabularyTest {
val mapper: ObjectMapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
- mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES,false)
+ mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
@BeforeEach
- def setUp() :Unit = {
+ def setUp(): Unit = {
super.setUpVocabulary()
}
-
@Test
- def testExtractPids():Unit = {
+ def testExtractPids(): Unit = {
- val input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/scholix/result.json")).mkString
- val res =SparkResolveRelation.extractPidsFromRecord(input)
+ val input = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/scholix/result.json"))
+ .mkString
+ val res = SparkResolveRelation.extractPidsFromRecord(input)
assertNotNull(res)
- assertEquals(1,res._2.size)
+ assertEquals(1, res._2.size)
}
@Test
- def testOAFToSummary():Unit= {
- val inputRelations = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/oaf_to_summary")).mkString
+ def testOAFToSummary(): Unit = {
+ val inputRelations = Source
+ .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/oaf_to_summary"))
+ .mkString
val items = inputRelations.lines.toList
assertNotNull(items)
- items.foreach(i =>assertTrue(i.nonEmpty))
- val result = items.map(r => mapper.readValue(r, classOf[Result])).map(i => ScholixUtils.resultToSummary(i))
+ items.foreach(i => assertTrue(i.nonEmpty))
+ val result =
+ items.map(r => mapper.readValue(r, classOf[Result])).map(i => ScholixUtils.resultToSummary(i))
assertNotNull(result)
@@ -59,37 +62,41 @@ class ScholixGraphTest extends AbstractVocabularyTest{
}
-
-
@Test
- def testScholixMergeOnSource():Unit = {
- val inputRelations = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/merge_result_scholix")).mkString
- val result:List[(Relation,ScholixSummary)] =inputRelations.lines.sliding(2).map(s => (s.head, s(1))).map(p => (mapper.readValue(p._1, classOf[Relation]),mapper.readValue(p._2, classOf[ScholixSummary]) )).toList
+ def testScholixMergeOnSource(): Unit = {
+ val inputRelations = Source
+ .fromInputStream(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/merge_result_scholix")
+ )
+ .mkString
+ val result: List[(Relation, ScholixSummary)] = inputRelations.lines
+ .sliding(2)
+ .map(s => (s.head, s(1)))
+ .map(p => (mapper.readValue(p._1, classOf[Relation]), mapper.readValue(p._2, classOf[ScholixSummary])))
+ .toList
assertNotNull(result)
assertTrue(result.nonEmpty)
result.foreach(r => assertEquals(r._1.getSource, r._2.getId))
- val scholix:List[Scholix] = result.map(r => ScholixUtils.scholixFromSource(r._1, r._2))
+ val scholix: List[Scholix] = result.map(r => ScholixUtils.scholixFromSource(r._1, r._2))
println(mapper.writeValueAsString(scholix.head))
}
-
-
-
@Test
def testScholixRelationshipsClean(): Unit = {
- val inputRelations = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/relation_transform.json")).mkString
+ val inputRelations = Source
+ .fromInputStream(
+ getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/relation_transform.json")
+ )
+ .mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(inputRelations)
- val l:List[String] =json.extract[List[String]]
+ val l: List[String] = json.extract[List[String]]
assertNotNull(l)
assertTrue(l.nonEmpty)
- val relVocbaulary =ScholixUtils.relations
- l.foreach(r => assertTrue(relVocbaulary.contains(r.toLowerCase)))
+ val relVocbaulary = ScholixUtils.relations
+ l.foreach(r => assertTrue(relVocbaulary.contains(r.toLowerCase)))
}
-
-
-
}
diff --git a/pom.xml b/pom.xml
index ed7b8a2ca..b68671aec 100644
--- a/pom.xml
+++ b/pom.xml
@@ -620,6 +620,18 @@
+                <plugin>
+                    <groupId>org.antipathy</groupId>
+                    <artifactId>mvn-scalafmt_2.11</artifactId>
+                    <version>1.0.1640073709.733712b</version>
+                    <dependencies>
+                        <dependency>
+                            <groupId>eu.dnetlib.dhp</groupId>
+                            <artifactId>dhp-code-style</artifactId>
+                            <version>${project.version}</version>
+                        </dependency>
+                    </dependencies>
+                </plugin>
@@ -665,6 +677,33 @@
+                <plugin>
+                    <groupId>org.antipathy</groupId>
+                    <artifactId>mvn-scalafmt_2.11</artifactId>
+                    <configuration>
+                        <configLocation>dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf</configLocation>
+                        <skipTestSources>false</skipTestSources>
+                        <skipSources>false</skipSources>
+                        <sourceDirectories>
+                            <param>${project.basedir}/src/main/scala</param>
+                        </sourceDirectories>
+                        <testSourceDirectories>
+                            <param>${project.basedir}/src/test/scala</param>
+                        </testSourceDirectories>
+                        <validateOnly>false</validateOnly>
+                        <onlyChangedFiles>false</onlyChangedFiles>
+                        <branch>: git rev-parse --abbrev-ref HEAD</branch>
+                        <useSpecifiedRepositories>false</useSpecifiedRepositories>
+                    </configuration>
+                    <executions>
+                        <execution>
+                            <phase>validate</phase>
+                            <goals>
+                                <goal>format</goal>
+                            </goals>
+                        </execution>
+                    </executions>
+                </plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-release-plugin</artifactId>