master #59

Closed
claudio.atzori wants to merge 3221 commits from master into stable_ids
67 changed files with 4841 additions and 2629 deletions
Showing only changes of commit 1b9e8378b3 - Show all commits

View File

@@ -0,0 +1,21 @@
style = defaultWithAlign
align.openParenCallSite = false
align.openParenDefnSite = false
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
continuationIndent.callSite = 2
continuationIndent.defnSite = 2
danglingParentheses = true
indentOperator = spray
maxColumn = 120
newlines.alwaysBeforeTopLevelStatements = true
project.excludeFilters = [".*\\.sbt"]
rewrite.rules = [AvoidInfix]
rewrite.rules = [ExpandImportSelectors]
rewrite.rules = [RedundantBraces]
rewrite.rules = [RedundantParens]
rewrite.rules = [SortImports]
rewrite.rules = [SortModifiers]
rewrite.rules = [PreferCurlyFors]
spaces.inImportCurlyBraces = false
unindentTopLevelOperators = true
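
For illustration only (not part of the commit), a hypothetical Scala snippet laid out the way these settings format it: a definition longer than maxColumn = 120 is broken one parameter per line with a 2-space continuation indent, and a blank line is forced before top-level statements.

// Hypothetical example; formatted as scalafmt would emit it under the configuration above.
case class RelatedCounts(
  identifier: String,
  relatedDatasets: Long,
  relatedPublications: Long,
  relatedSoftware: Long,
  relatedOtherResearchProducts: Long
)

object RelatedCounts {

  def describe(c: RelatedCounts): String =
    s"${c.identifier}: datasets=${c.relatedDatasets}, publications=${c.relatedPublications}"
}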

View File

@@ -2,58 +2,57 @@ package eu.dnetlib.dhp.application

import scala.io.Source

/** This is the main Interface SparkApplication
  * where all the Spark Scala class should inherit
  */
trait SparkScalaApplication {

  /** This is the path in the classpath of the json
    * describes all the argument needed to run
    */
  val propertyPath: String

  /** Utility to parse the arguments using the
    * property json in the classpath identified from
    * the variable propertyPath
    *
    * @param args the list of arguments
    */
  def parseArguments(args: Array[String]): ArgumentApplicationParser = {
    val parser = new ArgumentApplicationParser(
      Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString
    )
    parser.parseArgument(args)
    parser
  }

  /** Here all the spark applications runs this method
    * where the whole logic of the spark node is defined
    */
  def run(): Unit
}

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.slf4j.Logger

abstract class AbstractScalaApplication(
  val propertyPath: String,
  val args: Array[String],
  log: Logger
) extends SparkScalaApplication {

  var parser: ArgumentApplicationParser = null

  var spark: SparkSession = null

  def initialize(): SparkScalaApplication = {
    parser = parseArguments(args)
    spark = createSparkSession()
    this
  }

  /** Utility for creating a spark session starting from parser
    *
    * @return a spark Session
    */

@@ -63,7 +62,9 @@ abstract class AbstractScalaApplication (val propertyPath:String, val args:Array
    val conf: SparkConf = new SparkConf()
    val master = parser.get("master")
    log.info(s"Creating Spark session: Master: $master")
    SparkSession
      .builder()
      .config(conf)
      .appName(getClass.getSimpleName)
      .master(master)
      .getOrCreate()
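
For context, a minimal sketch of how a concrete job is expected to plug into this abstraction; the class name and the parameters json path are illustrative, not taken from this commit.

import org.slf4j.{Logger, LoggerFactory}

// Hypothetical Spark job built on AbstractScalaApplication: the constructor wires the
// argument-parser property path, initialize() builds `parser` and `spark`, run() holds
// the node's logic.
class ExampleGraphJob(propertyPath: String, args: Array[String], log: Logger)
    extends AbstractScalaApplication(propertyPath, args, log) {

  override def run(): Unit = {
    val sourcePath = parser.get("sourcePath") // argument declared in the property json
    log.info(s"Reading from $sourcePath with master ${parser.get("master")}")
    log.info(s"Active Spark session version: ${spark.version}")
  }
}

object ExampleGraphJob {

  val log: Logger = LoggerFactory.getLogger(classOf[ExampleGraphJob])

  def main(args: Array[String]): Unit =
    // the json path below is a placeholder, not a resource shipped by this commit
    new ExampleGraphJob("/eu/dnetlib/dhp/example_job_parameters.json", args, log)
      .initialize()
      .run()
}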

View File

@@ -14,7 +14,6 @@ import scala.io.Source

object ScholixUtils extends Serializable {

  val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier"

  val DATE_RELATION_KEY: String = "RelationDate"

@@ -24,7 +23,11 @@ object ScholixUtils extends Serializable {
  case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {}

  val relations: Map[String, RelationVocabulary] = {
    val input = Source
      .fromInputStream(
        getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")
      )
      .mkString
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats

    lazy val json: json4s.JValue = parse(input)

@@ -32,13 +35,14 @@ object ScholixUtils extends Serializable {
    json.extract[Map[String, RelationVocabulary]]
  }

  def extractRelationDate(relation: Relation): String = {
    if (relation.getProperties == null || !relation.getProperties.isEmpty)
      null
    else {
      val date = relation.getProperties.asScala
        .find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey))
        .map(p => p.getValue)
      if (date.isDefined)
        date.get
      else

@@ -58,16 +62,14 @@ object ScholixUtils extends Serializable {
  def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = {
    new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName)
  }

  def generateScholixResourceFromResult(r: Result): ScholixResource = {
    generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
  }

  val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
    new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable {

      override def zero: RelatedEntities = null

      override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = {

@@ -77,13 +79,20 @@ object ScholixUtils extends Serializable {
        if (b == null)
          RelatedEntities(a._1, relatedDataset, relatedPublication)
        else
          RelatedEntities(
            a._1,
            b.relatedDataset + relatedDataset,
            b.relatedPublication + relatedPublication
          )
      }

      override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = {
        if (b1 != null && b2 != null)
          RelatedEntities(
            b1.id,
            b1.relatedDataset + b2.relatedDataset,
            b1.relatedPublication + b2.relatedPublication
          )
        else if (b1 != null)
          b1
        else

@@ -97,18 +106,16 @@ object ScholixUtils extends Serializable {
      override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
    }

  val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] =
    new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable {

      override def zero: Scholix = null

      def scholix_complete(s: Scholix): Boolean = {
        if (s == null || s.getIdentifier == null) {
          false
        } else if (s.getSource == null || s.getTarget == null) {
          false
        } else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
          false
        else
          true

@@ -129,7 +136,6 @@ object ScholixUtils extends Serializable {
      override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
    }

  def createInverseScholixRelation(scholix: Scholix): Scholix = {
    val s = new Scholix
    s.setPublicationDate(scholix.getPublicationDate)

@@ -138,16 +144,19 @@ object ScholixUtils extends Serializable {
    s.setRelationship(inverseRelationShip(scholix.getRelationship))
    s.setSource(scholix.getTarget)
    s.setTarget(scholix.getSource)
    s.setIdentifier(
      DHPUtils.md5(
        s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
      )
    )
    s
  }

  def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
    if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
      val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
        new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers)
      }(collection.breakOut)
      l
    } else List()

@@ -155,8 +164,11 @@ object ScholixUtils extends Serializable {
  def extractCollectedFrom(summary: ScholixSummary): List[ScholixEntityId] = {
    if (summary.getDatasources != null && !summary.getDatasources.isEmpty) {
      val l: List[ScholixEntityId] = summary.getDatasources.asScala.map { d =>
        new ScholixEntityId(
          d.getDatasourceName,
          List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava
        )
      }(collection.breakOut)
      l
    } else List()

@@ -165,17 +177,16 @@ object ScholixUtils extends Serializable {
  def extractCollectedFrom(relation: Relation): List[ScholixEntityId] = {
    if (relation.getCollectedfrom != null && !relation.getCollectedfrom.isEmpty) {
      val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map { c =>
        new ScholixEntityId(
          c.getValue,
          List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava
        )
      }.toList
      l
    } else List()
  }

  def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = {
    val s = new Scholix
    s.setPublicationDate(scholix.getPublicationDate)

@@ -184,11 +195,14 @@ object ScholixUtils extends Serializable {
    s.setRelationship(scholix.getRelationship)
    s.setSource(scholix.getSource)
    s.setTarget(generateScholixResourceFromSummary(target))
    s.setIdentifier(
      DHPUtils.md5(
        s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
      )
    )
    s
  }

  def generateCompleteScholix(scholix: Scholix, target: ScholixResource): Scholix = {
    val s = new Scholix
    s.setPublicationDate(scholix.getPublicationDate)

@@ -197,11 +211,14 @@ object ScholixUtils extends Serializable {
    s.setRelationship(scholix.getRelationship)
    s.setSource(scholix.getSource)
    s.setTarget(target)
    s.setIdentifier(
      DHPUtils.md5(
        s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
      )
    )
    s
  }

  def generateScholixResourceFromSummary(summaryObject: ScholixSummary): ScholixResource = {
    val r = new ScholixResource
    r.setIdentifier(summaryObject.getLocalIdentifier)

@@ -214,7 +231,8 @@ object ScholixUtils extends Serializable {
      r.setTitle(summaryObject.getTitle.get(0))

    if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) {
      val l: List[ScholixEntityId] =
        summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
      if (l.nonEmpty)
        r.setCreator(l.asJava)
    }

@@ -222,20 +240,27 @@ object ScholixUtils extends Serializable {
    if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty)
      r.setPublicationDate(summaryObject.getDate.get(0))

    if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) {
      val plist: List[ScholixEntityId] =
        summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList

      if (plist.nonEmpty)
        r.setPublisher(plist.asJava)
    }

    if (summaryObject.getDatasources != null && !summaryObject.getDatasources.isEmpty) {
      val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala
        .map(c =>
          new ScholixCollectedFrom(
            new ScholixEntityId(
              c.getDatasourceName,
              List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava
            ),
            "collected",
            "complete"
          )
        )
        .toList

      if (l.nonEmpty)
        r.setCollectedFrom(l.asJava)

@@ -244,8 +269,6 @@ object ScholixUtils extends Serializable {
    r
  }

  def scholixFromSource(relation: Relation, source: ScholixResource): Scholix = {
    if (relation == null || source == null)
      return null

@@ -262,7 +285,6 @@ object ScholixUtils extends Serializable {
      s.setPublicationDate(d)

    if (source.getPublisher != null && !source.getPublisher.isEmpty) {
      s.setPublisher(source.getPublisher)
    }

@@ -270,13 +292,14 @@ object ScholixUtils extends Serializable {
    val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
    if (semanticRelation == null)
      return null
    s.setRelationship(
      new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
    )
    s.setSource(source)

    s
  }

  def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = {
    if (relation == null || source == null)

@@ -298,11 +321,9 @@ object ScholixUtils extends Serializable {
      s.setPublicationDate(d)

    if (source.getPublisher != null && !source.getPublisher.isEmpty) {
      val l: List[ScholixEntityId] = source.getPublisher.asScala
        .map { p =>
          new ScholixEntityId(p, null)
        }(collection.breakOut)

@@ -313,16 +334,19 @@ object ScholixUtils extends Serializable {
    val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
    if (semanticRelation == null)
      return null
    s.setRelationship(
      new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
    )
    s.setSource(generateScholixResourceFromSummary(source))

    s
  }

  def findURLForPID(
    pidValue: List[StructuredProperty],
    urls: List[String]
  ): List[(StructuredProperty, String)] = {
    pidValue.map { p =>
      val pv = p.getValue
      val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))

@@ -330,14 +354,17 @@ object ScholixUtils extends Serializable {
    }
  }

  def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
    if (r.getInstance() == null || r.getInstance().isEmpty)
      return List()
    r.getInstance()
      .asScala
      .filter(i => i.getUrl != null && !i.getUrl.isEmpty)
      .filter(i => i.getPid != null && i.getUrl != null)
      .flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
      .map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2))
      .distinct
      .toList
  }

  def resultToSummary(r: Result): ScholixSummary = {

@@ -371,7 +398,12 @@ object ScholixUtils extends Serializable {
      s.setAuthor(authors.asJava)
    }
    if (r.getInstance() != null) {
      val dt: List[String] = r
        .getInstance()
        .asScala
        .filter(i => i.getDateofacceptance != null)
        .map(i => i.getDateofacceptance.getValue)
        .toList
      if (dt.nonEmpty)
        s.setDate(dt.distinct.asJava)
    }

@@ -382,7 +414,9 @@ object ScholixUtils extends Serializable {
    }

    if (r.getSubject != null && !r.getSubject.isEmpty) {
      val subjects: List[SchemeValue] = r.getSubject.asScala
        .map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))
        .toList
      if (subjects.nonEmpty)
        s.setSubject(subjects.asJava)
    }

@@ -391,7 +425,9 @@ object ScholixUtils extends Serializable {
      s.setPublisher(List(r.getPublisher.getValue).asJava)

    if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) {
      val cf: List[CollectedFromType] = r.getCollectedfrom.asScala
        .map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))
        .toList
      if (cf.nonEmpty)
        s.setDatasources(cf.distinct.asJava)
    }
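
As a usage sketch (not part of the commit), the typed aggregator above is meant to be driven through Spark's groupByKey/agg; the dataset contents and column meanings here are illustrative, and ScholixUtils is assumed to be in scope.

import org.apache.spark.sql.{Dataset, SparkSession}

// Hypothetical driver: counts related datasets and publications per source identifier
// from (sourceId, targetType, count) triples by pushing statsAggregator through
// Spark's typed aggregation API.
object RelatedEntitiesAggregationExample {

  def aggregate(
    spark: SparkSession,
    triples: Dataset[(String, String, Long)]
  ): Dataset[ScholixUtils.RelatedEntities] = {
    import spark.implicits._
    triples
      .groupByKey(_._1)
      .agg(ScholixUtils.statsAggregator.toColumn)
      .map(_._2)
  }
}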

View File

@@ -7,15 +7,13 @@ import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode}

object CollectionUtils {

  /** This method in pipeline to the transformation phase,
    * generates relations in both verse, typically it should be a phase of flatMap
    *
    * @param i input OAF
    * @return
    *   If the input OAF is an entity -> List(i)
    *   If the input OAF is a relation -> List(relation, inverseRelation)
    */
  def fixRelations(i: Oaf): List[Oaf] = {
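
A hedged sketch of the flatMap phase that the comment describes; the dataset handling, the kryo encoder choice and the dhp-schemas import path are assumptions, not taken from this commit.

import eu.dnetlib.dhp.schema.oaf.Oaf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode}

// Hypothetical pipeline step: entities pass through unchanged, relations are expanded
// into (relation, inverseRelation) before writing the dataset out.
object FixRelationsExample {

  def saveWithInverseRelations(input: Dataset[Oaf], targetPath: String): Unit = {
    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
    input
      .flatMap(i => CollectionUtils.fixRelations(i))
      .filter(i => i != null)
      .write
      .mode(SaveMode.Overwrite)
      .save(targetPath)
  }
}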

View File

@@ -6,7 +6,6 @@ import org.apache.http.client.methods.{HttpGet, HttpPost, HttpUriRequest}
import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.HttpClientBuilder

abstract class AbstractRestClient extends Iterator[String] {

  var buffer: List[String] = List()

@@ -16,12 +15,10 @@ abstract class AbstractRestClient extends Iterator[String] {
  var complete: Boolean = false

  def extractInfo(input: String): Unit

  protected def getBufferData(): Unit

  def doHTTPGETRequest(url: String): String = {
    val httpGet = new HttpGet(url)
    doHTTPRequest(httpGet)

@@ -43,7 +40,6 @@ abstract class AbstractRestClient extends Iterator[String] {
    buffer.nonEmpty && current_index < buffer.size
  }

  override def next(): String = {
    val next_item: String = buffer(current_index)
    current_index = current_index + 1

@@ -52,13 +48,14 @@ abstract class AbstractRestClient extends Iterator[String] {
    next_item
  }

  private def doHTTPRequest[A <: HttpUriRequest](r: A): String = {
    val timeout = 60; // seconds
    val config = RequestConfig
      .custom()
      .setConnectTimeout(timeout * 1000)
      .setConnectionRequestTimeout(timeout * 1000)
      .setSocketTimeout(timeout * 1000)
      .build()
    val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
    try {
      var tries = 4

@@ -69,8 +66,7 @@ abstract class AbstractRestClient extends Iterator[String] {
        println(s"get response with status${response.getStatusLine.getStatusCode}")
        if (response.getStatusLine.getStatusCode > 400) {
          tries -= 1
        } else
          return IOUtils.toString(response.getEntity.getContent)
      } catch {
        case e: Throwable =>
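
A minimal sketch of a concrete client built on this iterator; the endpoint shape and the response handling are illustrative only, while the real subclasses parse a JSON payload to refill the buffer and decide when the stream is complete.

// Hypothetical paged client: each call to getBufferData() fetches one page and refills
// `buffer`; `complete` stops the iteration when a page comes back empty.
class ExamplePagedClient(baseUrl: String) extends AbstractRestClient {

  private var page: Int = 0

  override def extractInfo(input: String): Unit = {
    // a real implementation would parse JSON here instead of splitting raw lines
    buffer = input.split("\n").toList.filter(_.nonEmpty)
    complete = buffer.isEmpty
    page += 1
  }

  override protected def getBufferData(): Unit =
    if (!complete)
      extractInfo(doHTTPGETRequest(s"$baseUrl?page=$page"))
}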

View File

@@ -24,7 +24,9 @@ class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -
  override def getBufferData(): Unit = {
    if (!complete) {
      val response =
        if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get)
        else doHTTPGETRequest(get_url())
      extractInfo(response)
    }
  }

View File

@@ -10,8 +10,7 @@ import java.util.Locale
import java.util.regex.Pattern
import scala.io.Source

/** This class represent the dataModel of the input Dataset of Datacite
  * @param doi THE DOI
  * @param timestamp timestamp of last update date
  * @param isActive the record is active or deleted

@@ -23,11 +22,26 @@ case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: S
The following class are utility class used for the mapping from
json datacite to OAF Shema
 */

case class RelatedIdentifierType(
  relationType: String,
  relatedIdentifier: String,
  relatedIdentifierType: String
) {}

case class NameIdentifiersType(
  nameIdentifierScheme: Option[String],
  schemeUri: Option[String],
  nameIdentifier: Option[String]
) {}

case class CreatorType(
  nameType: Option[String],
  nameIdentifiers: Option[List[NameIdentifiersType]],
  name: Option[String],
  familyName: Option[String],
  givenName: Option[String],
  affiliation: Option[List[String]]
) {}

case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}

@@ -35,16 +49,20 @@ case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}

case class FundingReferenceType(
  funderIdentifierType: Option[String],
  awardTitle: Option[String],
  awardUri: Option[String],
  funderName: Option[String],
  funderIdentifier: Option[String],
  awardNumber: Option[String]
) {}

case class DateType(date: Option[String], dateType: Option[String]) {}

case class OAFRelations(relation: String, inverse: String, relType: String)

class DataciteModelConstants extends Serializable {}

object DataciteModelConstants {

@@ -55,51 +73,147 @@ object DataciteModelConstants {
  val SUBJ_CLASS = "keywords"
  val DATACITE_NAME = "Datacite"
  val dataInfo: DataInfo = dataciteDataInfo("0.9")

  val DATACITE_COLLECTED_FROM: KeyValue =
    OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)

  val subRelTypeMapping: Map[String, OAFRelations] = Map(
    ModelConstants.REFERENCES -> OAFRelations(
      ModelConstants.REFERENCES,
      ModelConstants.IS_REFERENCED_BY,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_REFERENCED_BY -> OAFRelations(
      ModelConstants.IS_REFERENCED_BY,
      ModelConstants.REFERENCES,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(
      ModelConstants.IS_SUPPLEMENTED_BY,
      ModelConstants.IS_SUPPLEMENT_TO,
      ModelConstants.SUPPLEMENT
    ),
    ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(
      ModelConstants.IS_SUPPLEMENT_TO,
      ModelConstants.IS_SUPPLEMENTED_BY,
      ModelConstants.SUPPLEMENT
    ),
    ModelConstants.HAS_PART -> OAFRelations(
      ModelConstants.HAS_PART,
      ModelConstants.IS_PART_OF,
      ModelConstants.PART
    ),
    ModelConstants.IS_PART_OF -> OAFRelations(
      ModelConstants.IS_PART_OF,
      ModelConstants.HAS_PART,
      ModelConstants.PART
    ),
    ModelConstants.IS_VERSION_OF -> OAFRelations(
      ModelConstants.IS_VERSION_OF,
      ModelConstants.HAS_VERSION,
      ModelConstants.VERSION
    ),
    ModelConstants.HAS_VERSION -> OAFRelations(
      ModelConstants.HAS_VERSION,
      ModelConstants.IS_VERSION_OF,
      ModelConstants.VERSION
    ),
    ModelConstants.IS_IDENTICAL_TO -> OAFRelations(
      ModelConstants.IS_IDENTICAL_TO,
      ModelConstants.IS_IDENTICAL_TO,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_CONTINUED_BY -> OAFRelations(
      ModelConstants.IS_CONTINUED_BY,
      ModelConstants.CONTINUES,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.CONTINUES -> OAFRelations(
      ModelConstants.CONTINUES,
      ModelConstants.IS_CONTINUED_BY,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_NEW_VERSION_OF -> OAFRelations(
      ModelConstants.IS_NEW_VERSION_OF,
      ModelConstants.IS_PREVIOUS_VERSION_OF,
      ModelConstants.VERSION
    ),
    ModelConstants.IS_PREVIOUS_VERSION_OF -> OAFRelations(
      ModelConstants.IS_PREVIOUS_VERSION_OF,
      ModelConstants.IS_NEW_VERSION_OF,
      ModelConstants.VERSION
    ),
    ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
      ModelConstants.IS_DOCUMENTED_BY,
      ModelConstants.DOCUMENTS,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.DOCUMENTS -> OAFRelations(
      ModelConstants.DOCUMENTS,
      ModelConstants.IS_DOCUMENTED_BY,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_SOURCE_OF -> OAFRelations(
      ModelConstants.IS_SOURCE_OF,
      ModelConstants.IS_DERIVED_FROM,
      ModelConstants.VERSION
    ),
    ModelConstants.IS_DERIVED_FROM -> OAFRelations(
      ModelConstants.IS_DERIVED_FROM,
      ModelConstants.IS_SOURCE_OF,
      ModelConstants.VERSION
    ),
    ModelConstants.CITES -> OAFRelations(
      ModelConstants.CITES,
      ModelConstants.IS_CITED_BY,
      ModelConstants.CITATION
    ),
    ModelConstants.IS_CITED_BY -> OAFRelations(
      ModelConstants.IS_CITED_BY,
      ModelConstants.CITES,
      ModelConstants.CITATION
    ),
    ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(
      ModelConstants.IS_VARIANT_FORM_OF,
      ModelConstants.IS_DERIVED_FROM,
      ModelConstants.VERSION
    ),
    ModelConstants.IS_OBSOLETED_BY -> OAFRelations(
      ModelConstants.IS_OBSOLETED_BY,
      ModelConstants.IS_NEW_VERSION_OF,
      ModelConstants.VERSION
    ),
    ModelConstants.REVIEWS -> OAFRelations(
      ModelConstants.REVIEWS,
      ModelConstants.IS_REVIEWED_BY,
      ModelConstants.REVIEW
    ),
    ModelConstants.IS_REVIEWED_BY -> OAFRelations(
      ModelConstants.IS_REVIEWED_BY,
      ModelConstants.REVIEWS,
      ModelConstants.REVIEW
    ),
    ModelConstants.DOCUMENTS -> OAFRelations(
      ModelConstants.DOCUMENTS,
      ModelConstants.IS_DOCUMENTED_BY,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
      ModelConstants.IS_DOCUMENTED_BY,
      ModelConstants.DOCUMENTS,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.COMPILES -> OAFRelations(
      ModelConstants.COMPILES,
      ModelConstants.IS_COMPILED_BY,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_COMPILED_BY -> OAFRelations(
      ModelConstants.IS_COMPILED_BY,
      ModelConstants.COMPILES,
      ModelConstants.RELATIONSHIP
    )
  )

  val datacite_filter: List[String] = {
    val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)

@@ -107,28 +221,58 @@ object DataciteModelConstants {
    Source.fromInputStream(stream).getLines().toList
  }

  def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(
    false,
    null,
    false,
    false,
    ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
    trust
  )

  val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern(
    "[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
    Locale.ENGLISH
  )

  val df_it: DateTimeFormatter =
    DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)

  val funder_regex: List[(Pattern, String)] = List(
    (
      Pattern.compile(
        "(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)",
        Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
      ),
      "40|corda__h2020::"
    ),
    (
      Pattern.compile(
        "(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)",
        Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
      ),
      "40|corda_______::"
    )
  )

  val Date_regex: List[Pattern] = List(
    //Y-M-D
    Pattern.compile(
      "(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])",
      Pattern.MULTILINE
    ),
    //M-D-Y
    Pattern.compile(
      "((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
      Pattern.MULTILINE
    ),
    //D-M-Y
    Pattern.compile(
      "(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
      Pattern.MULTILINE
    ),
    //Y
    Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
  )
}
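
A small usage sketch for the constants above (not part of the commit): the object name in scope and the ModelConstants import path are assumed from dhp-schemas, and the sample strings are illustrative.

import eu.dnetlib.dhp.schema.common.ModelConstants

// Hypothetical lookups: resolve the inverse of a Datacite relation via subRelTypeMapping
// and scan a free-text field for the first date-like token using Date_regex.
object DataciteConstantsExample {

  def inverseOf(relClass: String): Option[String] =
    DataciteModelConstants.subRelTypeMapping.get(relClass).map(_.inverse)

  def firstDate(text: String): Option[String] =
    DataciteModelConstants.Date_regex
      .map(_.matcher(text))
      .find(_.find())
      .map(_.group(0))

  def main(args: Array[String]): Unit = {
    println(inverseOf(ModelConstants.CITES)) // Some(ModelConstants.IS_CITED_BY)
    println(firstDate("first online 2021-05-17, print 2022")) // Some(2021-05-17)
  }
}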

View File

@ -20,14 +20,11 @@ import java.time.format.DateTimeFormatter
import java.util.{Date, Locale} import java.util.{Date, Locale}
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
object DataciteToOAFTransformation { object DataciteToOAFTransformation {
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
/** This method should skip record if json contains invalid text
/**
* This method should skip record if json contains invalid text
* defined in gile datacite_filter * defined in gile datacite_filter
* *
* @param json * @param json
@ -74,30 +71,30 @@ object DataciteToOAFTransformation {
} }
def embargo_end(embargo_end_date: String): Boolean = { def embargo_end(embargo_end_date: String): Boolean = {
val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]")) val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
val td = LocalDate.now() val td = LocalDate.now()
td.isAfter(dt) td.isAfter(dt)
} }
def extract_date(input: String): Option[String] = { def extract_date(input: String): Option[String] = {
val d = Date_regex.map(pattern => { val d = Date_regex
.map(pattern => {
val matcher = pattern.matcher(input) val matcher = pattern.matcher(input)
if (matcher.find()) if (matcher.find())
matcher.group(0) matcher.group(0)
else else
null null
} })
).find(s => s != null) .find(s => s != null)
if (d.isDefined) { if (d.isDefined) {
val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
try { try {
return Some(LocalDate.parse(a_date, df_en).toString) return Some(LocalDate.parse(a_date, df_en).toString)
} catch { } catch {
case _: Throwable => try { case _: Throwable =>
try {
return Some(LocalDate.parse(a_date, df_it).toString) return Some(LocalDate.parse(a_date, df_it).toString)
} catch { } catch {
case _: Throwable => case _: Throwable =>
@ -118,31 +115,63 @@ object DataciteToOAFTransformation {
} }
} }
def getTypeQualifier(
def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = { resourceType: String,
resourceTypeGeneral: String,
schemaOrg: String,
vocabularies: VocabularyGroup
): (Qualifier, Qualifier) = {
if (resourceType != null && resourceType.nonEmpty) { if (resourceType != null && resourceType.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType) val typeQualifier =
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
if (typeQualifier != null) if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid)) return (
typeQualifier,
vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_RESULT_TYPOLOGIES,
typeQualifier.getClassid
)
)
} }
if (schemaOrg != null && schemaOrg.nonEmpty) { if (schemaOrg != null && schemaOrg.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg) val typeQualifier =
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
if (typeQualifier != null) if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid)) return (
typeQualifier,
vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_RESULT_TYPOLOGIES,
typeQualifier.getClassid
)
)
} }
if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) { if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceTypeGeneral) val typeQualifier = vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_PUBLICATION_RESOURCE,
resourceTypeGeneral
)
if (typeQualifier != null) if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid)) return (
typeQualifier,
vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_RESULT_TYPOLOGIES,
typeQualifier.getClassid
)
)
} }
null null
} }
def getResult(
def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): Result = { resourceType: String,
val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies) resourceTypeGeneral: String,
schemaOrg: String,
vocabularies: VocabularyGroup
): Result = {
val typeQualifiers: (Qualifier, Qualifier) =
getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
if (typeQualifiers == null) if (typeQualifiers == null)
return null return null
val i = new Instance val i = new Instance
@ -168,7 +197,6 @@ object DataciteToOAFTransformation {
null null
} }
def available_date(input: String): Boolean = { def available_date(input: String): Boolean = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -182,9 +210,7 @@ object DataciteToOAFTransformation {
} }
/** As describe in ticket #6377
/**
* As describe in ticket #6377
* when the result come from figshare we need to remove subject * when the result come from figshare we need to remove subject
* and set Access rights OPEN. * and set Access rights OPEN.
* *
@ -193,7 +219,10 @@ object DataciteToOAFTransformation {
def fix_figshare(r: Result): Unit = { def fix_figshare(r: Result): Unit = {
if (r.getInstance() != null) { if (r.getInstance() != null) {
val hosted_by_figshare = r.getInstance().asScala.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue)) val hosted_by_figshare = r
.getInstance()
.asScala
.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
if (hosted_by_figshare) { if (hosted_by_figshare) {
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT())) r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
val l: List[StructuredProperty] = List() val l: List[StructuredProperty] = List()
@ -201,10 +230,8 @@ object DataciteToOAFTransformation {
} }
} }
} }
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = { def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_') val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}" s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
@ -214,7 +241,13 @@ object DataciteToOAFTransformation {
OafMapperUtils.structuredProperty(dt, q, null) OafMapperUtils.structuredProperty(dt, q, null)
} }
def generateRelation(sourceId: String, targetId: String, relClass: String, cf: KeyValue, di: DataInfo): Relation = { def generateRelation(
sourceId: String,
targetId: String,
relClass: String,
cf: KeyValue,
di: DataInfo
): Relation = {
val r = new Relation val r = new Relation
r.setSource(sourceId) r.setSource(sourceId)
@ -226,7 +259,6 @@ object DataciteToOAFTransformation {
r.setDataInfo(di) r.setDataInfo(di)
r r
} }
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = { def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
@ -238,14 +270,18 @@ object DataciteToOAFTransformation {
val grantId = m.matcher(awardUri).replaceAll("$2") val grantId = m.matcher(awardUri).replaceAll("$2")
val targetId = s"$p${DHPUtils.md5(grantId)}" val targetId = s"$p${DHPUtils.md5(grantId)}"
List(generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo)) List(generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo))
} } else
else
List() List()
} }
def generateOAF(
def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = { input: String,
ts: Long,
dateOfCollection: Long,
vocabularies: VocabularyGroup,
exportLinks: Boolean
): List[Oaf] = {
if (skip_record(input)) if (skip_record(input))
return List() return List()
@ -253,7 +289,8 @@ object DataciteToOAFTransformation {
lazy val json = parse(input) lazy val json = parse(input)
val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null) val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null) val resourceTypeGeneral =
(json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null) val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)
val doi = (json \ "attributes" \ "doi").extract[String] val doi = (json \ "attributes" \ "doi").extract[String]
@ -265,8 +302,12 @@ object DataciteToOAFTransformation {
if (result == null) if (result == null)
return List() return List()
val doi_q = OafMapperUtils.qualifier(
val doi_q = OafMapperUtils.qualifier("doi", "doi", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES) "doi",
"doi",
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES
)
val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo) val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
result.setPid(List(pid).asJava) result.setPid(List(pid).asJava)
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true)) result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
@ -275,48 +316,70 @@ object DataciteToOAFTransformation {
val d = new Date(dateOfCollection * 1000) val d = new Date(dateOfCollection * 1000)
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US) val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
result.setDateofcollection(ISO8601FORMAT.format(d)) result.setDateofcollection(ISO8601FORMAT.format(d))
result.setDateoftransformation(ISO8601FORMAT.format(d)) result.setDateoftransformation(ISO8601FORMAT.format(d))
result.setDataInfo(dataInfo) result.setDataInfo(dataInfo)
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List()) val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
val authors = creators.zipWithIndex.map { case (c, idx) => val authors = creators.zipWithIndex.map { case (c, idx) =>
val a = new Author val a = new Author
a.setFullname(c.name.orNull) a.setFullname(c.name.orNull)
a.setName(c.givenName.orNull) a.setName(c.givenName.orNull)
a.setSurname(c.familyName.orNull) a.setSurname(c.familyName.orNull)
if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) { if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
a.setPid(c.nameIdentifiers.get.map(ni => { a.setPid(
val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(ModelConstants.DNET_PID_TYPES, ni.nameIdentifierScheme.get.toLowerCase()) else null c.nameIdentifiers.get
.map(ni => {
val q =
if (ni.nameIdentifierScheme.isDefined)
vocabularies.getTermAsQualifier(
ModelConstants.DNET_PID_TYPES,
ni.nameIdentifierScheme.get.toLowerCase()
)
else null
if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) { if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo) OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
} } else
else
null null
} })
.asJava
) )
.asJava)
} }
if (c.affiliation.isDefined) if (c.affiliation.isDefined)
a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava) a.setAffiliation(
c.affiliation.get
.filter(af => af.nonEmpty)
.map(af => OafMapperUtils.field(af, dataInfo))
.asJava
)
a.setRank(idx + 1) a.setRank(idx + 1)
a a
} }
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List()) val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => { result.setTitle(
titles
.filter(t => t.title.nonEmpty)
.map(t => {
if (t.titleType.isEmpty) { if (t.titleType.isEmpty) {
OafMapperUtils.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null) OafMapperUtils
.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
} else { } else {
OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, ModelConstants.DNET_DATACITE_TITLE, ModelConstants.DNET_DATACITE_TITLE, null) OafMapperUtils.structuredProperty(
t.title.get,
t.titleType.get,
t.titleType.get,
ModelConstants.DNET_DATACITE_TITLE,
ModelConstants.DNET_DATACITE_TITLE,
null
)
} }
}).asJava) })
.asJava
)
if (authors == null || authors.isEmpty || !authors.exists(a => a != null)) if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
return List() return List()
@ -337,46 +400,81 @@ object DataciteToOAFTransformation {
    if (a_date.isDefined) {
      if (doi.startsWith("10.14457"))
        result.setEmbargoenddate(
          OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null)
        )
      else
        result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
    }
    if (i_date.isDefined && i_date.get.isDefined) {
      if (doi.startsWith("10.14457")) {
        result.setDateofacceptance(
          OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)
        )
        result
          .getInstance()
          .get(0)
          .setDateofacceptance(
            OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)
          )
      } else {
        result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
        result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
      }
    } else if (publication_year != null) {
      if (doi.startsWith("10.14457")) {
        result.setDateofacceptance(
          OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)
        )
        result
          .getInstance()
          .get(0)
          .setDateofacceptance(
            OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)
          )
      } else {
        result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
        result
          .getInstance()
          .get(0)
          .setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
      }
    }

    result.setRelevantdate(
      dates
        .filter(d => d.date.isDefined && d.dateType.isDefined)
        .map(d => (extract_date(d.date.get), d.dateType.get))
        .filter(d => d._1.isDefined)
        .map(d =>
          (
            d._1.get,
            vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())
          )
        )
        .filter(d => d._2 != null)
        .map(d => generateOAFDate(d._1, d._2))
        .asJava
    )

    val subjects = (json \\ "subjects").extract[List[SubjectType]]
    result.setSubject(
      subjects
        .filter(s => s.subject.nonEmpty)
        .map(s =>
          OafMapperUtils.structuredProperty(
            s.subject.get,
            SUBJ_CLASS,
            SUBJ_CLASS,
            ModelConstants.DNET_SUBJECT_TYPOLOGIES,
            ModelConstants.DNET_SUBJECT_TYPOLOGIES,
            null
          )
        )
        .asJava
    )

    result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
@ -384,22 +482,22 @@ object DataciteToOAFTransformation {
    result.setDescription(
      descriptions
        .filter(d => d.description.isDefined)
        .map(d => OafMapperUtils.field(d.description.get, null))
        .filter(s => s != null)
        .asJava
    )

    val publisher = (json \\ "publisher").extractOrElse[String](null)
    if (publisher != null)
      result.setPublisher(OafMapperUtils.field(publisher, null))

    val language: String = (json \\ "language").extractOrElse[String](null)
    if (language != null)
      result.setLanguage(
        vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language)
      )

    val instance = result.getInstance().get(0)
@ -410,9 +508,12 @@ object DataciteToOAFTransformation {
      JField("rightsUri", JString(rightsUri)) <- rightsList
    } yield rightsUri

    val aRights: Option[AccessRight] = accessRights
      .map(r => {
        vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
      })
      .find(q => q != null)
      .map(q => {
        val a = new AccessRight
        a.setClassid(q.getClassid)
        a.setClassname(q.getClassname)
@ -421,18 +522,34 @@ object DataciteToOAFTransformation {
        a
      })

    val access_rights_qualifier =
      if (aRights.isDefined) aRights.get
      else
        OafMapperUtils.accessRight(
          ModelConstants.UNKNOWN,
          ModelConstants.NOT_AVAILABLE,
          ModelConstants.DNET_ACCESS_MODES,
          ModelConstants.DNET_ACCESS_MODES
        )

    if (client.isDefined) {
      instance.setHostedby(
        OafMapperUtils.keyValue(
          generateDSId(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID),
          ModelConstants.UNKNOWN_REPOSITORY.getValue
        )
      )
      instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
      instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
      instance.setAccessright(access_rights_qualifier)
      instance.setPid(result.getPid)
      val license = accessRights
        .find(r =>
          r.startsWith("http") && r.matches(
            ".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"
          )
        )
      if (license.isDefined)
        instance.setLicense(OafMapperUtils.field(license.get, null))
    }
@ -443,7 +560,8 @@ object DataciteToOAFTransformation {
    } yield awardUri

    result.setId(IdentifierFactory.createIdentifier(result))
    var relations: List[Relation] =
      awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)

    fix_figshare(result)
@ -458,20 +576,27 @@ object DataciteToOAFTransformation {
        JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
      } yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)

      relations = relations ::: generateRelations(
        rels,
        result.getId,
        if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null
      )
    }
    if (relations != null && relations.nonEmpty) {
      List(result) ::: relations
    } else
      List(result)
  }

  private def generateRelations(
    rels: List[RelatedIdentifierType],
    id: String,
    date: String
  ): List[Relation] = {
    rels
      .filter(r =>
        subRelTypeMapping
          .contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
          r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
          r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
      )
@ -490,19 +615,19 @@ object DataciteToOAFTransformation {
        rel.setProperties(List(dateProps).asJava)
        rel.setSource(id)
        rel.setTarget(
          DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
        )
        rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
        rel.getCollectedfrom.asScala.map(c => c.getValue).toList
        rel
      })
  }

  def generateDSId(input: String): String = {
    val b = StringUtils.substringBefore(input, "::")
    val a = StringUtils.substringAfter(input, "::")
    s"10|$b::${DHPUtils.md5(a)}"
  }

}
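
The identifier scheme above is worth spelling out: generateDSId keeps the namespace prefix before the "::" separator verbatim and hashes only the remainder. A minimal, self-contained sketch of the same shape, assuming DHPUtils.md5 returns the lowercase hex MD5 digest of its argument (the object name and the sample input are hypothetical):

import java.security.MessageDigest

// Illustrative sketch only: mirrors generateDSId, assuming DHPUtils.md5
// returns the lowercase hex MD5 digest of its argument.
object DSIdSketch {

  private def md5Hex(s: String): String =
    MessageDigest
      .getInstance("MD5")
      .digest(s.getBytes("UTF-8"))
      .map(b => f"$b%02x")
      .mkString

  def generateDSId(input: String): String = {
    // same split as StringUtils.substringBefore / substringAfter on "::"
    val Array(prefix, suffix) = input.split("::", 2)
    s"10|$prefix::${md5Hex(suffix)}"
  }

  def main(args: Array[String]): Unit =
    // hypothetical datasource original id, only to show the output shape
    println(generateDSId("re3data_____::r3d100010468"))
}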

@ -12,10 +12,10 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}

class GenerateDataciteDatasetSpark(propertyPath: String, args: Array[String], log: Logger)
    extends AbstractScalaApplication(propertyPath, args, log: Logger) {

  /** Here all the spark applications runs this method
    * where the whole logic of the spark node is defined
    */
  override def run(): Unit = {
@ -46,27 +46,34 @@ class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log
    reportTotalSize(targetPath, outputBasePath)
  }

  /** For working with MDStore we need to store in a file on hdfs the size of
    * the current dataset
    * @param targetPath
    * @param outputBasePath
    */
  def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
    val total_items = spark.read.text(targetPath).count()
    writeHdfsFile(
      spark.sparkContext.hadoopConfiguration,
      s"$total_items",
      outputBasePath + MDSTORE_SIZE_PATH
    )
  }
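
writeHdfsFile is a project utility; the size report it produces is just a one-line text file written next to the MDStore output. A minimal sketch of an equivalent helper using the standard Hadoop FileSystem API (the object name is made up, and the real helper may differ):

import java.nio.charset.StandardCharsets

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

// Illustrative sketch only: the real writeHdfsFile helper lives in the
// project's utility classes; this shows the minimal Hadoop calls involved.
object HdfsTextWriterSketch {

  def writeHdfsFile(conf: Configuration, content: String, path: String): Unit = {
    val fs  = FileSystem.get(conf)
    val out = fs.create(new Path(path), true) // true = overwrite if present
    try out.write(content.getBytes(StandardCharsets.UTF_8))
    finally out.close()
  }
}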
  /** Generate the transformed and cleaned OAF Dataset from the native one
    *
    * @param sourcePath sourcePath of the native Dataset in format JSON/Datacite
    * @param exportLinks If true it generates unresolved links
    * @param vocabularies vocabularies for cleaning
    * @param targetPath the targetPath of the result Dataset
    */
  def generateDataciteDataset(
    sourcePath: String,
    exportLinks: Boolean,
    vocabularies: VocabularyGroup,
    targetPath: String,
    spark: SparkSession
  ): Unit = {
    require(spark != null)

    import spark.implicits._
@ -74,21 +81,30 @@ class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log
    implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
    CollectionUtils.saveDataset(
      spark.read
        .load(sourcePath)
        .as[DataciteType]
        .filter(d => d.isActive)
        .flatMap(d =>
          DataciteToOAFTransformation
            .generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks)
        )
        .filter(d => d != null),
      targetPath
    )
  }
}
object GenerateDataciteDatasetSpark {

  val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)

  def main(args: Array[String]): Unit = {
    new GenerateDataciteDatasetSpark(
      "/eu/dnetlib/dhp/datacite/generate_dataset_params.json",
      args,
      log
    ).initialize().run()
  }
}

@ -22,7 +22,6 @@ object ImportDatacite {
val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass) val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass)
def convertAPIStringToDataciteItem(input: String): DataciteType = { def convertAPIStringToDataciteItem(input: String): DataciteType = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input) lazy val json: org.json4s.JValue = parse(input)
@ -32,14 +31,26 @@ object ImportDatacite {
val timestamp_string = (json \ "attributes" \ "updated").extract[String] val timestamp_string = (json \ "attributes" \ "updated").extract[String]
val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME) val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME)
DataciteType(doi = doi, timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000, isActive = isActive, json = input) DataciteType(
doi = doi,
timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000,
isActive = isActive,
json = input
)
} }
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json")).mkString) val parser = new ArgumentApplicationParser(
Source
.fromInputStream(
getClass.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json"
)
)
.mkString
)
parser.parseArgument(args) parser.parseArgument(args)
val master = parser.get("master") val master = parser.get("master")
@ -60,7 +71,8 @@ object ImportDatacite {
val spkipImport = parser.get("skipImport") val spkipImport = parser.get("skipImport")
log.info(s"skipImport is $spkipImport") log.info(s"skipImport is $spkipImport")
val spark: SparkSession = SparkSession.builder() val spark: SparkSession = SparkSession
.builder()
.appName(ImportDatacite.getClass.getSimpleName) .appName(ImportDatacite.getClass.getSimpleName)
.master(master) .master(master)
.getOrCreate() .getOrCreate()
@ -78,8 +90,8 @@ object ImportDatacite {
import spark.implicits._ import spark.implicits._
val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] =
val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] = new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable { new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
override def zero: DataciteType = null override def zero: DataciteType = null
@ -110,13 +122,16 @@ object ImportDatacite {
println(s"last Timestamp is $ts") println(s"last Timestamp is $ts")
val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf, bs) val cnt =
if ("true".equalsIgnoreCase(spkipImport)) 1
else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
println(s"Imported from Datacite API $cnt documents") println(s"Imported from Datacite API $cnt documents")
if (cnt > 0) { if (cnt > 0) {
val inputRdd: RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[Int], classOf[Text]) val inputRdd: RDD[DataciteType] = sc
.sequenceFile(targetPath, classOf[Int], classOf[Text])
.map(s => s._2.toString) .map(s => s._2.toString)
.map(s => convertAPIStringToDataciteItem(s)) .map(s => convertAPIStringToDataciteItem(s))
spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset") spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset")
@ -129,7 +144,9 @@ object ImportDatacite {
.agg(dataciteAggregator.toColumn) .agg(dataciteAggregator.toColumn)
.map(s => s._2) .map(s => s._2)
.repartition(4000) .repartition(4000)
.write.mode(SaveMode.Overwrite).save(s"${dataciteDump}_updated") .write
.mode(SaveMode.Overwrite)
.save(s"${dataciteDump}_updated")
val fs = FileSystem.get(sc.hadoopConfiguration) val fs = FileSystem.get(sc.hadoopConfiguration)
fs.delete(new Path(s"$dataciteDump"), true) fs.delete(new Path(s"$dataciteDump"), true)
@ -137,14 +154,24 @@ object ImportDatacite {
} }
} }
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs: Int): Long = { private def writeSequenceFile(
hdfsTargetPath: Path,
timestamp: Long,
conf: Configuration,
bs: Int
): Long = {
var from: Long = timestamp * 1000 var from: Long = timestamp * 1000
val delta: Long = 100000000L val delta: Long = 100000000L
var client: DataciteAPIImporter = null var client: DataciteAPIImporter = null
val now: Long = System.currentTimeMillis() val now: Long = System.currentTimeMillis()
var i = 0 var i = 0
try { try {
val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text])) val writer = SequenceFile.createWriter(
conf,
SequenceFile.Writer.file(hdfsTargetPath),
SequenceFile.Writer.keyClass(classOf[IntWritable]),
SequenceFile.Writer.valueClass(classOf[Text])
)
try { try {
var start: Long = System.currentTimeMillis var start: Long = System.currentTimeMillis
while (from < now) { while (from < now) {
@ -153,16 +180,16 @@ object ImportDatacite {
val key: IntWritable = new IntWritable(i) val key: IntWritable = new IntWritable(i)
val value: Text = new Text val value: Text = new Text
while (client.hasNext) { while (client.hasNext) {
key.set({ key.set {
i += 1; i += 1;
i - 1 i - 1
}) }
value.set(client.next()) value.set(client.next())
writer.append(key, value) writer.append(key, value)
writer.hflush() writer.hflush()
if (i % 1000 == 0) { if (i % 1000 == 0) {
end = System.currentTimeMillis end = System.currentTimeMillis
val time = (end - start) / 1000.0F val time = (end - start) / 1000.0f
println(s"Imported $i in $time seconds") println(s"Imported $i in $time seconds")
start = System.currentTimeMillis start = System.currentTimeMillis
} }
@ -174,8 +201,7 @@ object ImportDatacite {
case e: Throwable => case e: Throwable =>
println("Error", e) println("Error", e)
} finally if (writer != null) writer.close() } finally if (writer != null) writer.close()
} } catch {
catch {
case e: Throwable => case e: Throwable =>
log.error("Error", e) log.error("Error", e)
} }

@ -17,7 +17,13 @@ object SparkDownloadUpdateDatacite {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val conf = new SparkConf val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")).mkString) val parser = new ArgumentApplicationParser(
Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")
)
.mkString
)
parser.parseArgument(args) parser.parseArgument(args)
val master = parser.get("master") val master = parser.get("master")
val sourcePath = parser.get("sourcePath") val sourcePath = parser.get("sourcePath")
@ -26,8 +32,9 @@ object SparkDownloadUpdateDatacite {
val hdfsuri = parser.get("namenode") val hdfsuri = parser.get("namenode")
log.info(s"namenode is $hdfsuri") log.info(s"namenode is $hdfsuri")
val spark: SparkSession = SparkSession
val spark: SparkSession = SparkSession.builder().config(conf) .builder()
.config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(master) .master(master)
.getOrCreate() .getOrCreate()
@ -37,13 +44,18 @@ object SparkDownloadUpdateDatacite {
import spark.implicits._ import spark.implicits._
    val maxDate: String = spark.read
      .load(workingPath)
      .as[Oaf]
      .filter(s => s.isInstanceOf[Result])
      .map(r => r.asInstanceOf[Result].getDateofcollection)
      .select(max("value"))
      .first()
      .getString(0)

    val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
    val string_to_date = ISO8601FORMAT.parse(maxDate)
    val ts = string_to_date.getTime
  }
}
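
The latest collection date drives the incremental download: it is parsed with the yyyy-MM-dd'T'HH:mm:ssZ pattern and turned into an epoch timestamp. A small sketch of that conversion, assuming the stored dates carry an RFC 822 offset such as +0000 (the sample value below is invented):

import java.text.SimpleDateFormat
import java.util.Locale

// Illustrative sketch only: converts the newest collection date into the
// epoch timestamp used as lower bound for the incremental download.
// Note: the 'Z' pattern letter expects an RFC 822 offset such as +0000,
// not a literal 'Z' suffix.
object MaxDateSketch {

  def main(args: Array[String]): Unit = {
    val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
    val maxDate = "2021-09-30T12:00:00+0000" // invented sample value
    val ts = ISO8601FORMAT.parse(maxDate).getTime
    println(s"requesting records updated after epoch millis $ts")
  }
}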

@ -12,13 +12,36 @@ object BioDBToOAF {
case class EBILinkItem(id: Long, links: String) {} case class EBILinkItem(id: Long, links: String) {}
case class EBILinks(relType: String, date: String, title: String, pmid: String, targetPid: String, targetPidType: String, targetUrl: String) {} case class EBILinks(
relType: String,
date: String,
title: String,
pmid: String,
targetPid: String,
targetPidType: String,
targetUrl: String
) {}
case class UniprotDate(date: String, date_info: String) {} case class UniprotDate(date: String, date_info: String) {}
case class ScholixResolved(pid: String, pidType: String, typology: String, tilte: List[String], datasource: List[String], date: List[String], authors: List[String]) {} case class ScholixResolved(
pid: String,
pidType: String,
typology: String,
tilte: List[String],
datasource: List[String],
date: List[String],
authors: List[String]
) {}
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9") val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(
false,
null,
false,
false,
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
"0.9"
)
val SUBJ_CLASS = "Keywords" val SUBJ_CLASS = "Keywords"
val DATE_RELATION_KEY = "RelationDate" val DATE_RELATION_KEY = "RelationDate"
@ -35,16 +58,35 @@ object BioDBToOAF {
"geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=" "geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
) )
val collectedFromMap: Map[String, KeyValue] = { val collectedFromMap: Map[String, KeyValue] = {
val PDBCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2", "Protein Data Bank") val PDBCollectedFrom: KeyValue = OafMapperUtils.keyValue(
val enaCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::c2a591f440598b63d854556beaf01591", "European Nucleotide Archive") "10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2",
val ncbiCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6", "NCBI Nucleotide") "Protein Data Bank"
val UNIPROTCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::296e1abaf1302897a6838d3588cd0310", "UniProtKB/Swiss-Prot") )
val ElsevierCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier") val enaCollectedFrom: KeyValue = OafMapperUtils.keyValue(
val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature") "10|re3data_____::c2a591f440598b63d854556beaf01591",
val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::83e60e09c222f206c725385f53d7e567c", "EMBL-EBIs Protein Data Bank in Europe (PDBe)") "European Nucleotide Archive"
val pubmedCollectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central") )
val ncbiCollectedFrom: KeyValue = OafMapperUtils.keyValue(
"10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6",
"NCBI Nucleotide"
)
val UNIPROTCollectedFrom: KeyValue = OafMapperUtils.keyValue(
"10|re3data_____::296e1abaf1302897a6838d3588cd0310",
"UniProtKB/Swiss-Prot"
)
val ElsevierCollectedFrom: KeyValue =
OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue(
"10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e",
"Springer Nature"
)
val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue(
"10|opendoar____::83e60e09c222f206c725385f53d7e567c",
"EMBL-EBIs Protein Data Bank in Europe (PDBe)"
)
val pubmedCollectedFrom: KeyValue =
OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
UNIPROTCollectedFrom.setDataInfo(DATA_INFO) UNIPROTCollectedFrom.setDataInfo(DATA_INFO)
PDBCollectedFrom.setDataInfo(DATA_INFO) PDBCollectedFrom.setDataInfo(DATA_INFO)
@ -80,18 +122,32 @@ object BioDBToOAF {
val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String]) val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String])
createRelation(target_pid, target_pid_type, generate_unresolved_id(source_pid, source_pid_type), collectedFromMap("elsevier"), "relationship", relation_semantic, date) createRelation(
target_pid,
target_pid_type,
generate_unresolved_id(source_pid, source_pid_type),
collectedFromMap("elsevier"),
"relationship",
relation_semantic,
date
)
} }
def scholixResolvedToOAF(input: ScholixResolved): Oaf = { def scholixResolvedToOAF(input: ScholixResolved): Oaf = {
val d = new Dataset val d = new Dataset
d.setPid( d.setPid(
List( List(
OafMapperUtils.structuredProperty(input.pid.toLowerCase, input.pidType.toLowerCase, input.pidType.toLowerCase, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO) OafMapperUtils.structuredProperty(
input.pid.toLowerCase,
input.pidType.toLowerCase,
input.pidType.toLowerCase,
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES,
DATA_INFO
)
).asJava ).asJava
) )
@ -101,7 +157,15 @@ object BioDBToOAF {
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.pid.toLowerCase}", true)) d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.pid.toLowerCase}", true))
if (input.tilte != null && input.tilte.nonEmpty) if (input.tilte != null && input.tilte.nonEmpty)
d.setTitle(List(OafMapperUtils.structuredProperty(input.tilte.head, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava) d.setTitle(
List(
OafMapperUtils.structuredProperty(
input.tilte.head,
ModelConstants.MAIN_TITLE_QUALIFIER,
DATA_INFO
)
).asJava
)
d.setOriginalId(List(input.pid).asJava) d.setOriginalId(List(input.pid).asJava)
val i = new Instance val i = new Instance
@ -113,9 +177,23 @@ object BioDBToOAF {
} }
if (input.pidType.equalsIgnoreCase("clinicaltrials.gov")) if (input.pidType.equalsIgnoreCase("clinicaltrials.gov"))
i.setInstancetype(OafMapperUtils.qualifier("0037", "Clinical Trial", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) i.setInstancetype(
OafMapperUtils.qualifier(
"0037",
"Clinical Trial",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
else else
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) i.setInstancetype(
OafMapperUtils.qualifier(
"0046",
"Bioentity",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
if (input.datasource == null || input.datasource.isEmpty) if (input.datasource == null || input.datasource.isEmpty)
return null return null
@ -141,7 +219,6 @@ object BioDBToOAF {
d d
} }
def uniprotToOAF(input: String): List[Oaf] = { def uniprotToOAF(input: String): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input) lazy val json = parse(input)
@ -151,7 +228,14 @@ object BioDBToOAF {
d.setPid( d.setPid(
List( List(
OafMapperUtils.structuredProperty(pid, "uniprot", "uniprot", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO) OafMapperUtils.structuredProperty(
pid,
"uniprot",
"uniprot",
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES,
DATA_INFO
)
).asJava ).asJava
) )
@ -162,14 +246,25 @@ object BioDBToOAF {
val title: String = (json \ "title").extractOrElse[String](null) val title: String = (json \ "title").extractOrElse[String](null)
if (title != null) if (title != null)
d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava) d.setTitle(
List(
OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)
).asJava
)
d.setOriginalId(List(pid).asJava) d.setOriginalId(List(pid).asJava)
val i = new Instance val i = new Instance
i.setPid(d.getPid) i.setPid(d.getPid)
i.setUrl(List(s"https://www.uniprot.org/uniprot/$pid").asJava) i.setUrl(List(s"https://www.uniprot.org/uniprot/$pid").asJava)
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) i.setInstancetype(
OafMapperUtils.qualifier(
"0046",
"Bioentity",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
i.setCollectedfrom(collectedFromMap("uniprot")) i.setCollectedfrom(collectedFromMap("uniprot"))
d.setInstance(List(i).asJava) d.setInstance(List(i).asJava)
@ -182,12 +277,21 @@ object BioDBToOAF {
val subjects: List[String] = (json \\ "subjects").extractOrElse[List[String]](null) val subjects: List[String] = (json \\ "subjects").extractOrElse[List[String]](null)
if (subjects != null) { if (subjects != null) {
d.setSubject( d.setSubject(
subjects.map(s => subjects
OafMapperUtils.structuredProperty(s, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null) .map(s =>
).asJava) OafMapperUtils.structuredProperty(
s,
SUBJ_CLASS,
SUBJ_CLASS,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
null
)
)
.asJava
)
} }
var i_date: Option[UniprotDate] = None var i_date: Option[UniprotDate] = None
@ -197,14 +301,23 @@ object BioDBToOAF {
i.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO)) i.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO)) d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
} }
val relevant_dates: List[StructuredProperty] = dates.filter(d => !d.date_info.contains("entry version")) val relevant_dates: List[StructuredProperty] = dates
.map(date => OafMapperUtils.structuredProperty(date.date, ModelConstants.UNKNOWN, ModelConstants.UNKNOWN, ModelConstants.DNET_DATACITE_DATE, ModelConstants.DNET_DATACITE_DATE, DATA_INFO)) .filter(d => !d.date_info.contains("entry version"))
.map(date =>
OafMapperUtils.structuredProperty(
date.date,
ModelConstants.UNKNOWN,
ModelConstants.UNKNOWN,
ModelConstants.DNET_DATACITE_DATE,
ModelConstants.DNET_DATACITE_DATE,
DATA_INFO
)
)
if (relevant_dates != null && relevant_dates.nonEmpty) if (relevant_dates != null && relevant_dates.nonEmpty)
d.setRelevantdate(relevant_dates.asJava) d.setRelevantdate(relevant_dates.asJava)
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO)) d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
} }
val references_pmid: List[String] = for { val references_pmid: List[String] = for {
JObject(reference) <- json \ "references" JObject(reference) <- json \ "references"
JField("PubMed", JString(pid)) <- reference JField("PubMed", JString(pid)) <- reference
@ -215,27 +328,46 @@ object BioDBToOAF {
JField(" DOI", JString(pid)) <- reference JField(" DOI", JString(pid)) <- reference
} yield pid } yield pid
    if (references_pmid != null && references_pmid.nonEmpty) {
      val rel = createRelation(
        references_pmid.head,
        "pmid",
        d.getId,
        collectedFromMap("uniprot"),
        ModelConstants.RELATIONSHIP,
        ModelConstants.IS_RELATED_TO,
        if (i_date.isDefined) i_date.get.date else null
      )
      rel.getCollectedfrom
      List(d, rel)
    } else if (references_doi != null && references_doi.nonEmpty) {
      val rel = createRelation(
        references_doi.head,
        "doi",
        d.getId,
        collectedFromMap("uniprot"),
        ModelConstants.RELATIONSHIP,
        ModelConstants.IS_RELATED_TO,
        if (i_date.isDefined) i_date.get.date else null
      )
      List(d, rel)
    } else
      List(d)
  }

  def generate_unresolved_id(pid: String, pidType: String): String = {
    s"unresolved::$pid::$pidType"
  }
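
The unresolved::<pid>::<pidType> string is the only contract between the relation producers above and the later resolution step. A quick illustration of the shape produced (the DOI below is a made-up example):

// Illustrative sketch only: shows the identifier shape used for
// not-yet-resolved relation targets.
object UnresolvedIdSketch {

  def generate_unresolved_id(pid: String, pidType: String): String =
    s"unresolved::$pid::$pidType"

  def main(args: Array[String]): Unit =
    // made-up DOI, only to show the output: unresolved::10.1000/example.123::doi
    println(generate_unresolved_id("10.1000/example.123", "doi"))
}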
def createRelation(
def createRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, subRelType: String, relClass: String, date: String): Relation = { pid: String,
pidType: String,
sourceId: String,
collectedFrom: KeyValue,
subRelType: String,
relClass: String,
date: String
): Relation = {
val rel = new Relation val rel = new Relation
rel.setCollectedfrom(List(collectedFromMap("pdb")).asJava) rel.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
@ -248,7 +380,6 @@ object BioDBToOAF {
rel.setSource(sourceId) rel.setSource(sourceId)
rel.setTarget(s"unresolved::$pid::$pidType") rel.setTarget(s"unresolved::$pid::$pidType")
val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date) val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
rel.setProperties(List(dateProps).asJava) rel.setProperties(List(dateProps).asJava)
@ -259,12 +390,24 @@ object BioDBToOAF {
} }
def createSupplementaryRelation(
def createSupplementaryRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, date: String): Relation = { pid: String,
createRelation(pid, pidType, sourceId, collectedFrom, ModelConstants.SUPPLEMENT, ModelConstants.IS_SUPPLEMENT_TO, date) pidType: String,
sourceId: String,
collectedFrom: KeyValue,
date: String
): Relation = {
createRelation(
pid,
pidType,
sourceId,
collectedFrom,
ModelConstants.SUPPLEMENT,
ModelConstants.IS_SUPPLEMENT_TO,
date
)
} }
def pdbTOOaf(input: String): List[Oaf] = { def pdbTOOaf(input: String): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input) lazy val json = parse(input)
@ -277,7 +420,14 @@ object BioDBToOAF {
d.setPid( d.setPid(
List( List(
OafMapperUtils.structuredProperty(pdb, "pdb", "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO) OafMapperUtils.structuredProperty(
pdb,
"pdb",
"Protein Data Bank Identifier",
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES,
DATA_INFO
)
).asJava ).asJava
) )
@ -290,13 +440,16 @@ object BioDBToOAF {
if (title == null) if (title == null)
return List() return List()
d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava) d.setTitle(
List(
OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)
).asJava
)
val authors: List[String] = (json \ "authors").extractOrElse[List[String]](null) val authors: List[String] = (json \ "authors").extractOrElse[List[String]](null)
if (authors != null) { if (authors != null) {
val convertedAuthors = authors.zipWithIndex.map { a => val convertedAuthors = authors.zipWithIndex.map { a =>
val res = new Author val res = new Author
res.setFullname(a._1) res.setFullname(a._1)
res.setRank(a._2 + 1) res.setRank(a._2 + 1)
@ -310,7 +463,14 @@ object BioDBToOAF {
i.setPid(d.getPid) i.setPid(d.getPid)
i.setUrl(List(s"https://www.rcsb.org/structure/$pdb").asJava) i.setUrl(List(s"https://www.rcsb.org/structure/$pdb").asJava)
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) i.setInstancetype(
OafMapperUtils.qualifier(
"0046",
"Bioentity",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
i.setCollectedfrom(collectedFromMap("pdb")) i.setCollectedfrom(collectedFromMap("pdb"))
d.setInstance(List(i).asJava) d.setInstance(List(i).asJava)
@ -323,7 +483,6 @@ object BioDBToOAF {
List(d) List(d)
} }
def extractEBILinksFromDump(input: String): EBILinkItem = { def extractEBILinksFromDump(input: String): EBILinkItem = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input) lazy val json = parse(input)
@ -333,14 +492,14 @@ object BioDBToOAF {
EBILinkItem(pmid.toLong, compact(render(links))) EBILinkItem(pmid.toLong, compact(render(links)))
} }
  def EBITargetLinksFilter(input: EBILinks): Boolean = {
    input.targetPidType.equalsIgnoreCase("ena") || input.targetPidType.equalsIgnoreCase(
      "pdb"
    ) || input.targetPidType.equalsIgnoreCase("uniprot")
  }
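
The filter above keeps only links whose target pid type is ena, pdb or uniprot. An equivalent, arguably more readable formulation against a whitelist set (a sketch, not the project's code):

// Illustrative sketch only: an equivalent whitelist-based formulation of the
// target pid type filter; behaviour matches the three equalsIgnoreCase checks.
object EbiLinkFilterSketch {

  private val keptTargetPidTypes = Set("ena", "pdb", "uniprot")

  def keepLink(targetPidType: String): Boolean =
    keptTargetPidTypes.contains(targetPidType.toLowerCase)

  def main(args: Array[String]): Unit =
    println(Seq("PDB", "uniprot", "doi").map(keepLink)) // List(true, true, false)
}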
def parse_ebi_links(input: String): List[EBILinks] = { def parse_ebi_links(input: String): List[EBILinks] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input) lazy val json = parse(input)
@ -357,25 +516,46 @@ object BioDBToOAF {
JField("IDURL", JString(idUrl)) <- identifier JField("IDURL", JString(idUrl)) <- identifier
JField("ID", JString(id)) <- identifier JField("ID", JString(id)) <- identifier
} yield EBILinks(relation, GraphCleaningFunctions.cleanDate(publicationDate), title, pmid, id, idScheme, idUrl) } yield EBILinks(
relation,
GraphCleaningFunctions.cleanDate(publicationDate),
title,
pmid,
id,
idScheme,
idUrl
)
} }
def convertEBILinksToOaf(input: EBILinks): List[Oaf] = { def convertEBILinksToOaf(input: EBILinks): List[Oaf] = {
val d = new Dataset val d = new Dataset
d.setCollectedfrom(List(collectedFromMap("ebi")).asJava) d.setCollectedfrom(List(collectedFromMap("ebi")).asJava)
d.setDataInfo(DATA_INFO) d.setDataInfo(DATA_INFO)
d.setTitle(List(OafMapperUtils.structuredProperty(input.title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava) d.setTitle(
List(
OafMapperUtils.structuredProperty(
input.title,
ModelConstants.MAIN_TITLE_QUALIFIER,
DATA_INFO
)
).asJava
)
val nsPrefix = input.targetPidType.toLowerCase.padTo(12, '_') val nsPrefix = input.targetPidType.toLowerCase.padTo(12, '_')
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.targetPid.toLowerCase}", true)) d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.targetPid.toLowerCase}", true))
d.setOriginalId(List(input.targetPid.toLowerCase).asJava) d.setOriginalId(List(input.targetPid.toLowerCase).asJava)
d.setPid( d.setPid(
List( List(
OafMapperUtils.structuredProperty(input.targetPid.toLowerCase, input.targetPidType.toLowerCase, "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO) OafMapperUtils.structuredProperty(
input.targetPid.toLowerCase,
input.targetPidType.toLowerCase,
"Protein Data Bank Identifier",
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES,
DATA_INFO
)
).asJava ).asJava
) )
@ -383,13 +563,35 @@ object BioDBToOAF {
i.setPid(d.getPid) i.setPid(d.getPid)
i.setUrl(List(input.targetUrl).asJava) i.setUrl(List(input.targetUrl).asJava)
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) i.setInstancetype(
OafMapperUtils.qualifier(
"0046",
"Bioentity",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
i.setCollectedfrom(collectedFromMap("ebi")) i.setCollectedfrom(collectedFromMap("ebi"))
d.setInstance(List(i).asJava) d.setInstance(List(i).asJava)
i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO)) i.setDateofacceptance(
d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO)) OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO)
)
d.setDateofacceptance(
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO)
)
List(d, createRelation(input.pmid, "pmid", d.getId, collectedFromMap("ebi"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, GraphCleaningFunctions.cleanDate(input.date))) List(
d,
createRelation(
input.pmid,
"pmid",
d.getId,
collectedFromMap("ebi"),
ModelConstants.RELATIONSHIP,
ModelConstants.IS_RELATED_TO,
GraphCleaningFunctions.cleanDate(input.date)
)
)
} }
} }

@ -14,7 +14,11 @@ object SparkTransformBioDatabaseToOAF {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val log: Logger = LoggerFactory.getLogger(getClass) val log: Logger = LoggerFactory.getLogger(getClass)
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json")
)
)
parser.parseArgument(args) parser.parseArgument(args)
val database: String = parser.get("database") val database: String = parser.get("database")
log.info("database: {}", database) log.info("database: {}", database)
@ -29,20 +33,33 @@ object SparkTransformBioDatabaseToOAF {
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
val sc = spark.sparkContext val sc = spark.sparkContext
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
import spark.implicits._ import spark.implicits._
database.toUpperCase() match { database.toUpperCase() match {
case "UNIPROT" => case "UNIPROT" =>
CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))), targetPath) CollectionUtils.saveDataset(
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))),
targetPath
)
case "PDB" => case "PDB" =>
CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))), targetPath) CollectionUtils.saveDataset(
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))),
targetPath
)
case "SCHOLIX" => case "SCHOLIX" =>
CollectionUtils.saveDataset(spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)), targetPath) CollectionUtils.saveDataset(
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)),
targetPath
)
case "CROSSREF_LINKS" => case "CROSSREF_LINKS" =>
CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))), targetPath) CollectionUtils.saveDataset(
spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))),
targetPath
)
} }
} }

@ -24,11 +24,12 @@ import scala.xml.pull.XMLEventReader
object SparkCreateBaselineDataFrame { object SparkCreateBaselineDataFrame {
def requestBaseLineUpdatePage(maxFile: String): List[(String, String)] = { def requestBaseLineUpdatePage(maxFile: String): List[(String, String)] = {
val data = requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/") val data = requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/")
val result = data.lines.filter(l => l.startsWith("<a href=")).map { l => val result = data.lines
.filter(l => l.startsWith("<a href="))
.map { l =>
val end = l.lastIndexOf("\">") val end = l.lastIndexOf("\">")
val start = l.indexOf("<a href=\"") val start = l.indexOf("<a href=\"")
@ -36,19 +37,24 @@ object SparkCreateBaselineDataFrame {
l.substring(start + 9, end - start) l.substring(start + 9, end - start)
else else
"" ""
}.filter(s => s.endsWith(".gz")).filter(s => s > maxFile).map(s => (s, s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s")).toList }
.filter(s => s.endsWith(".gz"))
.filter(s => s > maxFile)
.map(s => (s, s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s"))
.toList
result result
} }
def downloadBaselinePart(url: String): InputStream = { def downloadBaselinePart(url: String): InputStream = {
val r = new HttpGet(url) val r = new HttpGet(url)
val timeout = 60; // seconds val timeout = 60; // seconds
val config = RequestConfig.custom() val config = RequestConfig
.custom()
.setConnectTimeout(timeout * 1000) .setConnectTimeout(timeout * 1000)
.setConnectionRequestTimeout(timeout * 1000) .setConnectionRequestTimeout(timeout * 1000)
.setSocketTimeout(timeout * 1000).build() .setSocketTimeout(timeout * 1000)
.build()
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build() val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
val response = client.execute(r) val response = client.execute(r)
println(s"get response with status${response.getStatusLine.getStatusCode}") println(s"get response with status${response.getStatusLine.getStatusCode}")
@ -59,10 +65,12 @@ object SparkCreateBaselineDataFrame {
def requestPage(url: String): String = { def requestPage(url: String): String = {
val r = new HttpGet(url) val r = new HttpGet(url)
val timeout = 60; // seconds val timeout = 60; // seconds
val config = RequestConfig.custom() val config = RequestConfig
.custom()
.setConnectTimeout(timeout * 1000) .setConnectTimeout(timeout * 1000)
.setConnectionRequestTimeout(timeout * 1000) .setConnectionRequestTimeout(timeout * 1000)
.setSocketTimeout(timeout * 1000).build() .setSocketTimeout(timeout * 1000)
.build()
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build() val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
try { try {
var tries = 4 var tries = 4
@ -73,8 +81,7 @@ object SparkCreateBaselineDataFrame {
println(s"get response with status${response.getStatusLine.getStatusCode}") println(s"get response with status${response.getStatusLine.getStatusCode}")
if (response.getStatusLine.getStatusCode > 400) { if (response.getStatusLine.getStatusCode > 400) {
tries -= 1 tries -= 1
} } else
else
return IOUtils.toString(response.getEntity.getContent) return IOUtils.toString(response.getEntity.getContent)
} catch { } catch {
case e: Throwable => case e: Throwable =>
@ -90,10 +97,8 @@ object SparkCreateBaselineDataFrame {
} }
} }
def downloadBaseLineUpdate(baselinePath: String, hdfsServerUri: String): Unit = { def downloadBaseLineUpdate(baselinePath: String, hdfsServerUri: String): Unit = {
val conf = new Configuration val conf = new Configuration
conf.set("fs.defaultFS", hdfsServerUri) conf.set("fs.defaultFS", hdfsServerUri)
val fs = FileSystem.get(conf) val fs = FileSystem.get(conf)
@ -122,8 +127,8 @@ object SparkCreateBaselineDataFrame {
} }
val pmArticleAggregator: Aggregator[(String, PMArticle), PMArticle, PMArticle] =
val pmArticleAggregator: Aggregator[(String, PMArticle), PMArticle, PMArticle] = new Aggregator[(String, PMArticle), PMArticle, PMArticle] with Serializable { new Aggregator[(String, PMArticle), PMArticle, PMArticle] with Serializable {
override def zero: PMArticle = new PMArticle override def zero: PMArticle = new PMArticle
override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle = { override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle = {
@ -142,11 +147,16 @@ object SparkCreateBaselineDataFrame {
override def outputEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle] override def outputEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
} }
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val log: Logger = LoggerFactory.getLogger(getClass) val log: Logger = LoggerFactory.getLogger(getClass)
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEBILinksToOaf.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
SparkEBILinksToOaf.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
)
)
)
parser.parseArgument(args) parser.parseArgument(args)
val isLookupUrl: String = parser.get("isLookupUrl") val isLookupUrl: String = parser.get("isLookupUrl")
log.info("isLookupUrl: {}", isLookupUrl) log.info("isLookupUrl: {}", isLookupUrl)
@ -162,7 +172,6 @@ object SparkCreateBaselineDataFrame {
val skipUpdate = parser.get("skipUpdate") val skipUpdate = parser.get("skipUpdate")
log.info("skipUpdate: {}", skipUpdate) log.info("skipUpdate: {}", skipUpdate)
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl) val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService) val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
val spark: SparkSession = val spark: SparkSession =
@ -170,7 +179,8 @@ object SparkCreateBaselineDataFrame {
.builder() .builder()
.config(conf) .config(conf)
.appName(SparkEBILinksToOaf.getClass.getSimpleName) .appName(SparkEBILinksToOaf.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
val sc = spark.sparkContext val sc = spark.sparkContext
import spark.implicits._ import spark.implicits._
@ -183,20 +193,30 @@ object SparkCreateBaselineDataFrame {
if (!"true".equalsIgnoreCase(skipUpdate)) { if (!"true".equalsIgnoreCase(skipUpdate)) {
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri) downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000) val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
val ds: Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i => { val ds: Dataset[PMArticle] = spark.createDataset(
k.filter(i => i._1.endsWith(".gz"))
.flatMap(i => {
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes())) val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
new PMParser(xml) new PMParser(xml)
})) })
ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder)).groupByKey(_._1) )
ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder))
.groupByKey(_._1)
.agg(pmArticleAggregator.toColumn) .agg(pmArticleAggregator.toColumn)
.map(p => p._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset") .map(p => p._2)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/baseline_dataset")
} }
val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle] val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle]
CollectionUtils.saveDataset(exported_dataset CollectionUtils.saveDataset(
.map(a => PubMedToOaf.convert(a, vocabularies)).as[Oaf] exported_dataset
.map(a => PubMedToOaf.convert(a, vocabularies))
.as[Oaf]
.filter(p => p != null), .filter(p => p != null),
targetPath) targetPath
)
} }
} }

@ -25,10 +25,12 @@ object SparkDownloadEBILinks {
def requestPage(url: String): String = { def requestPage(url: String): String = {
val r = new HttpGet(url) val r = new HttpGet(url)
val timeout = 60; // seconds val timeout = 60; // seconds
val config = RequestConfig.custom() val config = RequestConfig
.custom()
.setConnectTimeout(timeout * 1000) .setConnectTimeout(timeout * 1000)
.setConnectionRequestTimeout(timeout * 1000) .setConnectionRequestTimeout(timeout * 1000)
.setSocketTimeout(timeout * 1000).build() .setSocketTimeout(timeout * 1000)
.build()
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build() val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
try { try {
var tries = 4 var tries = 4
@ -39,8 +41,7 @@ object SparkDownloadEBILinks {
println(s"get response with status${response.getStatusLine.getStatusCode}") println(s"get response with status${response.getStatusLine.getStatusCode}")
if (response.getStatusLine.getStatusCode > 400) { if (response.getStatusLine.getStatusCode > 400) {
tries -= 1 tries -= 1
} } else
else
return IOUtils.toString(response.getEntity.getContent) return IOUtils.toString(response.getEntity.getContent)
} catch { } catch {
case e: Throwable => case e: Throwable =>
@ -66,14 +67,19 @@ object SparkDownloadEBILinks {
val log: Logger = LoggerFactory.getLogger(getClass) val log: Logger = LoggerFactory.getLogger(getClass)
val MAX_ITEM_PER_PARTITION = 20000 val MAX_ITEM_PER_PARTITION = 20000
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json")
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(SparkEBILinksToOaf.getClass.getSimpleName) .appName(SparkEBILinksToOaf.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
import spark.implicits._ import spark.implicits._
@ -87,22 +93,40 @@ object SparkDownloadEBILinks {
log.info(s"workingPath -> $workingPath") log.info(s"workingPath -> $workingPath")
log.info("Getting max pubmedId where the links have already requested") log.info("Getting max pubmedId where the links have already requested")
val links: Dataset[EBILinkItem] = spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem] val links: Dataset[EBILinkItem] =
spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
val lastPMIDRequested = links.map(l => l.id).select(max("value")).first.getLong(0) val lastPMIDRequested = links.map(l => l.id).select(max("value")).first.getLong(0)
log.info("Retrieving PMID to request links") log.info("Retrieving PMID to request links")
val pubmed = spark.read.load(s"$sourcePath/baseline_dataset").as[PMArticle] val pubmed = spark.read.load(s"$sourcePath/baseline_dataset").as[PMArticle]
pubmed.map(p => p.getPmid.toLong).where(s"value > $lastPMIDRequested").write.mode(SaveMode.Overwrite).save(s"$workingPath/id_to_request") pubmed
.map(p => p.getPmid.toLong)
.where(s"value > $lastPMIDRequested")
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/id_to_request")
val pmidToReq: Dataset[Long] = spark.read.load(s"$workingPath/id_to_request").as[Long] val pmidToReq: Dataset[Long] = spark.read.load(s"$workingPath/id_to_request").as[Long]
val total = pmidToReq.count() val total = pmidToReq.count()
spark.createDataset(pmidToReq.rdd.repartition((total / MAX_ITEM_PER_PARTITION).toInt).map(pmid => createEBILinks(pmid)).filter(l => l != null)).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_update") spark
.createDataset(
pmidToReq.rdd
.repartition((total / MAX_ITEM_PER_PARTITION).toInt)
.map(pmid => createEBILinks(pmid))
.filter(l => l != null)
)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/links_update")
val updates: Dataset[EBILinkItem] = spark.read.load(s"$workingPath/links_update").as[EBILinkItem] val updates: Dataset[EBILinkItem] =
spark.read.load(s"$workingPath/links_update").as[EBILinkItem]
links.union(updates).groupByKey(_.id) links
.union(updates)
.groupByKey(_.id)
.reduceGroups { (x, y) => .reduceGroups { (x, y) =>
if (x == null || x.links == null) if (x == null || x.links == null)
y y
@ -112,6 +136,10 @@ object SparkDownloadEBILinks {
x x
else else
y y
}.map(_._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_final") }
.map(_._2)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/links_final")
} }
} }
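
The reformatted job above unions the previously harvested links with the freshly downloaded ones and keeps, per pubmed id, the record that actually carries a payload. A minimal, self-contained sketch of the same union/groupByKey/reduceGroups pattern, using a simplified stand-in case class and a local SparkSession; class and path names here are illustrative, and the tie-breaking rule for two non-null payloads is an assumption, since that branch is elided in the hunk above:

import org.apache.spark.sql.{Dataset, SaveMode, SparkSession}

object MergeLinksSketch {

  // simplified stand-in for EBILinkItem: a numeric id plus a JSON payload that may be null
  case class LinkItem(id: Long, links: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("merge-links-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    val existing: Dataset[LinkItem] = Seq(LinkItem(1L, """{"v":1}"""), LinkItem(2L, null)).toDS()
    val updates: Dataset[LinkItem] = Seq(LinkItem(2L, """{"v":2}""")).toDS()

    // union old and new, group by id, keep the row that actually carries links
    val merged = existing
      .union(updates)
      .groupByKey(_.id)
      .reduceGroups { (x, y) =>
        if (x.links == null) y
        else if (y.links == null) x
        else if (x.links.length >= y.links.length) x // assumption: prefer the richer payload
        else y
      }
      .map(_._2)

    merged.write.mode(SaveMode.Overwrite).save("/tmp/links_final_sketch") // parquet by default
    spark.stop()
  }
}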


@ -15,15 +15,19 @@ object SparkEBILinksToOaf {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass) val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json")
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(SparkEBILinksToOaf.getClass.getSimpleName) .appName(SparkEBILinksToOaf.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
import spark.implicits._ import spark.implicits._
val sourcePath = parser.get("sourcePath") val sourcePath = parser.get("sourcePath")
@ -32,11 +36,17 @@ object SparkEBILinksToOaf {
log.info(s"targetPath -> $targetPath") log.info(s"targetPath -> $targetPath")
implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
val ebLinks: Dataset[EBILinkItem] = spark.read.load(sourcePath).as[EBILinkItem].filter(l => l.links != null && l.links.startsWith("{")) val ebLinks: Dataset[EBILinkItem] = spark.read
.load(sourcePath)
.as[EBILinkItem]
.filter(l => l.links != null && l.links.startsWith("{"))
CollectionUtils.saveDataset(ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links)) CollectionUtils.saveDataset(
ebLinks
.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
.filter(p => BioDBToOAF.EBITargetLinksFilter(p)) .filter(p => BioDBToOAF.EBITargetLinksFilter(p))
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)), .flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)),
targetPath) targetPath
)
} }
} }
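
SparkEBILinksToOaf only parses rows whose links field holds a JSON object (links != null && links.startsWith("{")). A small sketch of that guard followed by a json4s extraction; the LinkRow case class and the "titles" field are hypothetical stand-ins, not part of the real EBI payload schema:

import org.json4s._
import org.json4s.jackson.JsonMethods.parse

object LinkPayloadSketch {

  // hypothetical row: only entries whose `links` field holds a JSON object are worth parsing
  case class LinkRow(id: Long, links: String)

  def titlesOf(row: LinkRow): List[String] = {
    implicit val formats: DefaultFormats.type = DefaultFormats
    if (row.links == null || !row.links.startsWith("{")) Nil // same guard as the job above
    else (parse(row.links) \ "titles").extractOrElse[List[String]](Nil) // "titles" is an invented field
  }

  def main(args: Array[String]): Unit = {
    val rows = List(LinkRow(1L, """{"titles":["a","b"]}"""), LinkRow(2L, null), LinkRow(3L, "not json"))
    println(rows.flatMap(titlesOf)) // List(a, b)
  }
}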


@ -3,10 +3,7 @@ package eu.dnetlib.dhp.sx.bio.pubmed
import scala.xml.MetaData import scala.xml.MetaData
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader} import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
/** @param xml
/**
*
* @param xml
*/ */
class PMParser(xml: XMLEventReader) extends Iterator[PMArticle] { class PMParser(xml: XMLEventReader) extends Iterator[PMArticle] {
@ -29,10 +26,8 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
s.head.text s.head.text
else else
null null
} else null
} }
else null
}
def validate_Date(year: String, month: String, day: String): String = { def validate_Date(year: String, month: String, day: String): String = {
try { try {
@ -45,7 +40,6 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
def generateNextArticle(): PMArticle = { def generateNextArticle(): PMArticle = {
var currentSubject: PMSubject = null var currentSubject: PMSubject = null
var currentAuthor: PMAuthor = null var currentAuthor: PMAuthor = null
var currentJournal: PMJournal = null var currentJournal: PMJournal = null
@ -56,11 +50,6 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
var currentDay = "01" var currentDay = "01"
var currentArticleType: String = null var currentArticleType: String = null
while (xml.hasNext) { while (xml.hasNext) {
xml.next match { xml.next match {
case EvElemStart(_, label, attrs, _) => case EvElemStart(_, label, attrs, _) =>
@ -83,7 +72,8 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
case "Author" => currentArticle.getAuthors.add(currentAuthor) case "Author" => currentArticle.getAuthors.add(currentAuthor)
case "Journal" => currentArticle.setJournal(currentJournal) case "Journal" => currentArticle.setJournal(currentJournal)
case "Grant" => currentArticle.getGrants.add(currentGrant) case "Grant" => currentArticle.getGrants.add(currentGrant)
case "PubMedPubDate" => if (currentArticle.getDate== null) case "PubMedPubDate" =>
if (currentArticle.getDate == null)
currentArticle.setDate(validate_Date(currentYear, currentMonth, currentDay)) currentArticle.setDate(validate_Date(currentYear, currentMonth, currentDay))
case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay") case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
case "DescriptorName" => currentArticle.getSubjects.add(currentSubject) case "DescriptorName" => currentArticle.getSubjects.add(currentSubject)
@ -106,7 +96,8 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
currentArticle.setDescription(currentArticle.getDescription + text.trim) currentArticle.setDescription(currentArticle.getDescription + text.trim)
} }
case "PMID" => currentArticle.setPmid(text.trim) case "PMID" => currentArticle.setPmid(text.trim)
case "ArticleId" => if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim) case "ArticleId" =>
if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim)
case "Language" => currentArticle.setLanguage(text.trim) case "Language" => currentArticle.setLanguage(text.trim)
case "ISSN" => currentJournal.setIssn(text.trim) case "ISSN" => currentJournal.setIssn(text.trim)
case "GrantID" => currentGrant.setGrantID(text.trim) case "GrantID" => currentGrant.setGrantID(text.trim)
@ -122,7 +113,8 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
if (currentAuthor != null) if (currentAuthor != null)
currentAuthor.setLastName(text.trim) currentAuthor.setLastName(text.trim)
} }
case "ForeName" => if (currentAuthor != null) case "ForeName" =>
if (currentAuthor != null)
currentAuthor.setForeName(text.trim) currentAuthor.setForeName(text.trim)
case "Title" => case "Title" =>
if (currentJournal.getTitle == null) if (currentJournal.getTitle == null)
@ -139,8 +131,3 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
null null
} }
} }
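
PMParser drives scala.xml.pull.XMLEventReader as an iterator over start-tag, text and end-tag events. A compact sketch of that event loop, reduced to collecting PMID values from an in-memory document; the XML snippet is invented for illustration:

import scala.io.Source
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}

object PullParserSketch {

  def main(args: Array[String]): Unit = {
    // tiny stand-in document; the real parser consumes a PubMed baseline dump
    val xml = "<Set><Article><PMID>123</PMID></Article><Article><PMID>456</PMID></Article></Set>"
    val reader = new XMLEventReader(Source.fromString(xml))

    var insidePmid = false
    val pmids = scala.collection.mutable.ListBuffer[String]()

    // the same event loop shape as PMParser: start tag, text node, end tag
    while (reader.hasNext) {
      reader.next match {
        case EvElemStart(_, "PMID", _, _) => insidePmid = true
        case EvText(text) if insidePmid   => pmids += text.trim
        case EvElemEnd(_, "PMID")         => insidePmid = false
        case _                            => // ignore everything else
      }
    }
    reader.stop()
    println(pmids.toList) // List(123, 456)
  }
}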


@ -9,21 +9,29 @@ import collection.JavaConverters._
import java.util.regex.Pattern import java.util.regex.Pattern
/** /**
*
*/ */
object PubMedToOaf { object PubMedToOaf {
val SUBJ_CLASS = "keywords" val SUBJ_CLASS = "keywords"
val urlMap = Map( val urlMap = Map(
"pmid" -> "https://pubmed.ncbi.nlm.nih.gov/", "pmid" -> "https://pubmed.ncbi.nlm.nih.gov/",
"doi" -> "https://dx.doi.org/" "doi" -> "https://dx.doi.org/"
) )
val dataInfo: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
val dataInfo: DataInfo = OafMapperUtils.dataInfo(
false,
null,
false,
false,
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
"0.9"
)
/** val collectedFrom: KeyValue =
* Cleaning the DOI Applying regex in order to OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
/** Cleaning the DOI Applying regex in order to
* remove doi starting with URL * remove doi starting with URL
* *
* @param doi input DOI * @param doi input DOI
@ -33,7 +41,6 @@ object PubMedToOaf {
val regex = "^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$" val regex = "^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$"
val pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE) val pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE)
val matcher = pattern.matcher(doi) val matcher = pattern.matcher(doi)
@ -43,9 +50,7 @@ object PubMedToOaf {
null null
} }
/** /** Create an instance of class extends Result
*
* Create an instance of class extends Result
* starting from OAF instanceType value * starting from OAF instanceType value
* *
* @param cobjQualifier OAF instance type * @param cobjQualifier OAF instance type
@ -53,7 +58,11 @@ object PubMedToOaf {
* @return the correct instance * @return the correct instance
*/ */
def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = { def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = {
val result_typologies = getVocabularyTerm(ModelConstants.DNET_RESULT_TYPOLOGIES, vocabularies, cobjQualifier.getClassid) val result_typologies = getVocabularyTerm(
ModelConstants.DNET_RESULT_TYPOLOGIES,
vocabularies,
cobjQualifier.getClassid
)
result_typologies.getClassid match { result_typologies.getClassid match {
case "dataset" => new Dataset case "dataset" => new Dataset
case "publication" => new Publication case "publication" => new Publication
@ -64,8 +73,7 @@ object PubMedToOaf {
} }
} }
/** /** Mapping the PubMed journal info into the OAF Journal
* Mapping the PubMed journal info into the OAF Journal * 
* *
* @param j the pubmedJournal * @param j the pubmedJournal
* @return the OAF Journal * @return the OAF Journal
@ -83,27 +91,26 @@ object PubMedToOaf {
journal.setIss(j.getIssue) journal.setIss(j.getIssue)
journal journal
} }
/** /** Find vocabulary term into synonyms and term in the vocabulary
*
* Find vocabulary term into synonyms and term in the vocabulary
* *
* @param vocabularyName the input vocabulary name * @param vocabularyName the input vocabulary name
* @param vocabularies all the vocabularies * @param vocabularies all the vocabularies
* @param term the term to find * @param term the term to find
* @return the cleaned term value * @return the cleaned term value
*/ */
def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = { def getVocabularyTerm(
vocabularyName: String,
vocabularies: VocabularyGroup,
term: String
): Qualifier = {
val a = vocabularies.getSynonymAsQualifier(vocabularyName, term) val a = vocabularies.getSynonymAsQualifier(vocabularyName, term)
val b = vocabularies.getTermAsQualifier(vocabularyName, term) val b = vocabularies.getTermAsQualifier(vocabularyName, term)
if (a == null) b else a if (a == null) b else a
} }
/** Map the Pubmed Article into the OAF instance
/**
* Map the Pubmed Article into the OAF instance
* *
* @param article the pubmed articles * @param article the pubmed articles
* @param vocabularies the vocabularies * @param vocabularies the vocabularies
@ -114,9 +121,17 @@ object PubMedToOaf {
if (article.getPublicationTypes == null) if (article.getPublicationTypes == null)
return null return null
// MAP PMID into pid with classid = classname = pmid // MAP PMID into pid with classid = classname = pmid
val pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)) val pidList: List[StructuredProperty] = List(
OafMapperUtils.structuredProperty(
article.getPmid,
PidType.pmid.toString,
PidType.pmid.toString,
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES,
dataInfo
)
)
if (pidList == null) if (pidList == null)
return null return null
@ -125,7 +140,14 @@ object PubMedToOaf {
if (article.getDoi != null) { if (article.getDoi != null) {
val normalizedPid = cleanDoi(article.getDoi) val normalizedPid = cleanDoi(article.getDoi)
if (normalizedPid != null) if (normalizedPid != null)
alternateIdentifier = OafMapperUtils.structuredProperty(normalizedPid, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo) alternateIdentifier = OafMapperUtils.structuredProperty(
normalizedPid,
PidType.doi.toString,
PidType.doi.toString,
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES,
dataInfo
)
} }
// INSTANCE MAPPING // INSTANCE MAPPING
@ -133,10 +155,12 @@ object PubMedToOaf {
// If the article contains the typology Journal Article then we apply this type // If the article contains the typology Journal Article then we apply this type
//else we have to find a term that matches the vocabulary, otherwise we discard it //else we have to find a term that matches the vocabulary, otherwise we discard it
val ja = article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue)) val ja =
article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue))
val pubmedInstance = new Instance val pubmedInstance = new Instance
if (ja.isDefined) { if (ja.isDefined) {
val cojbCategory = getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue) val cojbCategory =
getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue)
pubmedInstance.setInstancetype(cojbCategory) pubmedInstance.setInstancetype(cojbCategory)
} else { } else {
val i_type = article.getPublicationTypes.asScala val i_type = article.getPublicationTypes.asScala
@ -155,7 +179,9 @@ object PubMedToOaf {
if (alternateIdentifier != null) if (alternateIdentifier != null)
pubmedInstance.setAlternateIdentifier(List(alternateIdentifier).asJava) pubmedInstance.setAlternateIdentifier(List(alternateIdentifier).asJava)
result.setInstance(List(pubmedInstance).asJava) result.setInstance(List(pubmedInstance).asJava)
pubmedInstance.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection.breakOut) pubmedInstance.getPid.asScala
.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid))
.map(p => p.getValue)(collection.breakOut)
//CREATE URL From pmid //CREATE URL From pmid
val urlLists: List[String] = pidList val urlLists: List[String] = pidList
.map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue)) .map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue))
@ -165,7 +191,9 @@ object PubMedToOaf {
pubmedInstance.setUrl(urlLists.asJava) pubmedInstance.setUrl(urlLists.asJava)
//ASSIGN DateofAcceptance //ASSIGN DateofAcceptance
pubmedInstance.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)) pubmedInstance.setDateofacceptance(
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)
)
//ASSIGN COLLECTEDFROM //ASSIGN COLLECTEDFROM
pubmedInstance.setCollectedfrom(collectedFrom) pubmedInstance.setCollectedfrom(collectedFrom)
result.setPid(pidList.asJava) result.setPid(pidList.asJava)
@ -173,7 +201,6 @@ object PubMedToOaf {
//END INSTANCE MAPPING //END INSTANCE MAPPING
//-------------------------------------------------------------------------------------- //--------------------------------------------------------------------------------------
// JOURNAL MAPPING // JOURNAL MAPPING
//-------------------------------------------------------------------------------------- //--------------------------------------------------------------------------------------
if (article.getJournal != null && result.isInstanceOf[Publication]) if (article.getJournal != null && result.isInstanceOf[Publication])
@ -182,31 +209,48 @@ object PubMedToOaf {
//END JOURNAL MAPPING //END JOURNAL MAPPING
//-------------------------------------------------------------------------------------- //--------------------------------------------------------------------------------------
// RESULT MAPPING // RESULT MAPPING
//-------------------------------------------------------------------------------------- //--------------------------------------------------------------------------------------
result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)) result.setDateofacceptance(
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)
)
if (article.getTitle == null || article.getTitle.isEmpty) if (article.getTitle == null || article.getTitle.isEmpty)
return null return null
result.setTitle(List(OafMapperUtils.structuredProperty(article.getTitle, ModelConstants.MAIN_TITLE_QUALIFIER, dataInfo)).asJava) result.setTitle(
List(
OafMapperUtils.structuredProperty(
article.getTitle,
ModelConstants.MAIN_TITLE_QUALIFIER,
dataInfo
)
).asJava
)
if (article.getDescription != null && article.getDescription.nonEmpty) if (article.getDescription != null && article.getDescription.nonEmpty)
result.setDescription(List(OafMapperUtils.field(article.getDescription, dataInfo)).asJava) result.setDescription(List(OafMapperUtils.field(article.getDescription, dataInfo)).asJava)
if (article.getLanguage != null) { if (article.getLanguage != null) {
val term = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, article.getLanguage) val term =
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, article.getLanguage)
if (term != null) if (term != null)
result.setLanguage(term) result.setLanguage(term)
} }
val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s =>
val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s => OafMapperUtils.structuredProperty(s.getValue, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, dataInfo))(collection.breakOut) OafMapperUtils.structuredProperty(
s.getValue,
SUBJ_CLASS,
SUBJ_CLASS,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
dataInfo
)
)(collection.breakOut)
if (subjects != null) if (subjects != null)
result.setSubject(subjects.asJava) result.setSubject(subjects.asJava)
val authors: List[Author] = article.getAuthors.asScala.zipWithIndex.map { case (a, index) => val authors: List[Author] = article.getAuthors.asScala.zipWithIndex.map { case (a, index) =>
val author = new Author() val author = new Author()
author.setName(a.getForeName) author.setName(a.getForeName)
@ -216,15 +260,12 @@ object PubMedToOaf {
author author
}(collection.breakOut) }(collection.breakOut)
if (authors != null && authors.nonEmpty) if (authors != null && authors.nonEmpty)
result.setAuthor(authors.asJava) result.setAuthor(authors.asJava)
result.setOriginalId(pidList.map(s => s.getValue).asJava) result.setOriginalId(pidList.map(s => s.getValue).asJava)
result.setId(article.getPmid) result.setId(article.getPmid)
// END RESULT MAPPING // END RESULT MAPPING
//-------------------------------------------------------------------------------------- //--------------------------------------------------------------------------------------
val id = IdentifierFactory.createIdentifier(result) val id = IdentifierFactory.createIdentifier(result)
@ -234,5 +275,4 @@ object PubMedToOaf {
result result
} }
} }
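
cleanDoi validates candidate DOIs against the regex shown above (compiled with Pattern.CASE_INSENSITIVE). A hedged sketch of such a cleaner: the validation regex is copied from the hunk, while the URL-prefix stripping step is an assumption, since that part of the method is not visible in this diff:

import java.util.regex.Pattern

object CleanDoiSketch {

  // validation regex copied from PubMedToOaf.cleanDoi
  private val DoiPattern =
    Pattern.compile("^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$", Pattern.CASE_INSENSITIVE)

  // assumption: strip a URL prefix first, then keep the DOI only if it matches the pattern
  def cleanDoi(doi: String): String = {
    if (doi == null) return null
    val normalized = doi.trim.toLowerCase.replaceAll("^https?://(dx\\.)?doi\\.org/", "")
    if (DoiPattern.matcher(normalized).matches()) normalized else null
  }

  def main(args: Array[String]): Unit = {
    println(cleanDoi("https://doi.org/10.1093/nar/gkab1112")) // 10.1093/nar/gkab1112
    println(cleanDoi("not-a-doi"))                            // null
  }
}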


@ -17,7 +17,8 @@ import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import java.text.SimpleDateFormat import java.text.SimpleDateFormat
class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:Logger) extends AbstractScalaApplication(propertyPath, args, log:Logger) { class SparkRetrieveDataciteDelta(propertyPath: String, args: Array[String], log: Logger)
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
val ISO_DATE_PATTERN = "yyyy-MM-dd'T'HH:mm:ssZ" val ISO_DATE_PATTERN = "yyyy-MM-dd'T'HH:mm:ssZ"
val simpleFormatter = new SimpleDateFormat(ISO_DATE_PATTERN) val simpleFormatter = new SimpleDateFormat(ISO_DATE_PATTERN)
@ -28,16 +29,13 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
val RESOLVED_REL_PATH_NAME = "resolvedRelation" val RESOLVED_REL_PATH_NAME = "resolvedRelation"
val SCHOLIX_PATH_NAME = "scholix" val SCHOLIX_PATH_NAME = "scholix"
def scholixResourcePath(workingPath: String) = s"$workingPath/$SCHOLIX_RESOURCE_PATH_NAME" def scholixResourcePath(workingPath: String) = s"$workingPath/$SCHOLIX_RESOURCE_PATH_NAME"
def dataciteOAFPath(workingPath: String) = s"$workingPath/$DATACITE_OAF_PATH_NAME" def dataciteOAFPath(workingPath: String) = s"$workingPath/$DATACITE_OAF_PATH_NAME"
def pidMapPath(workingPath: String) = s"$workingPath/$PID_MAP_PATH_NAME" def pidMapPath(workingPath: String) = s"$workingPath/$PID_MAP_PATH_NAME"
def resolvedRelationPath(workingPath: String) = s"$workingPath/$RESOLVED_REL_PATH_NAME" def resolvedRelationPath(workingPath: String) = s"$workingPath/$RESOLVED_REL_PATH_NAME"
def scholixPath(workingPath: String) = s"$workingPath/$SCHOLIX_PATH_NAME" def scholixPath(workingPath: String) = s"$workingPath/$SCHOLIX_PATH_NAME"
/** Utility to parse Date in ISO8601 to epochMillis
/**
* Utility to parse Date in ISO8601 to epochMillis
* @param inputDate The String represents an input date in ISO8601 * @param inputDate The String represents an input date in ISO8601
* @return The relative epochMillis of parsed date * @return The relative epochMillis of parsed date
*/ */
@ -45,9 +43,7 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
simpleFormatter.parse(inputDate).getTime simpleFormatter.parse(inputDate).getTime
} }
/** This method tries to retrieve the last collection date from all datacite
/**
* This method tries to retrieve the last collection date from all datacite
* records in HDFS. * records in HDFS.
* This method should be called before indexing scholexplorer to retrieve * This method should be called before indexing scholexplorer to retrieve
* the delta of Datacite record to download, since from the generation of * the delta of Datacite record to download, since from the generation of
@ -63,16 +59,23 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result] implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
import spark.implicits._ import spark.implicits._
val entitiesDS = spark.read.load(s"$entitiesPath/*").as[Oaf].filter(o =>o.isInstanceOf[Result]).map(r => r.asInstanceOf[Result]) val entitiesDS = spark.read
.load(s"$entitiesPath/*")
.as[Oaf]
.filter(o => o.isInstanceOf[Result])
.map(r => r.asInstanceOf[Result])
val date = entitiesDS.filter(r => r.getDateofcollection!= null).map(_.getDateofcollection).select(max("value")).first.getString(0) val date = entitiesDS
.filter(r => r.getDateofcollection != null)
.map(_.getDateofcollection)
.select(max("value"))
.first
.getString(0)
ISO8601toEpochMillis(date) / 1000 ISO8601toEpochMillis(date) / 1000
} }
/** The method of update Datacite relationships on Scholexplorer
/**
* The method of update Datacite relationships on Scholexplorer
* needs some utilities data structures * needs some utilities data structures
* One is the scholixResource DS that stores all the nodes in the Scholix Graph * One is the scholixResource DS that stores all the nodes in the Scholix Graph
* in format ScholixResource * in format ScholixResource
@ -80,19 +83,26 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
* @param workingPath the working path * @param workingPath the working path
* @param spark the spark session * @param spark the spark session
*/ */
def generateScholixResource(summaryPath:String, workingPath: String, spark:SparkSession) :Unit = { def generateScholixResource(
summaryPath: String,
workingPath: String,
spark: SparkSession
): Unit = {
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource] implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
log.info("Convert All summary to ScholixResource") log.info("Convert All summary to ScholixResource")
spark.read.load(summaryPath).as[ScholixSummary] spark.read
.load(summaryPath)
.as[ScholixSummary]
.map(ScholixUtils.generateScholixResourceFromSummary)(scholixResourceEncoder) .map(ScholixUtils.generateScholixResourceFromSummary)(scholixResourceEncoder)
.filter(r => r.getIdentifier != null && r.getIdentifier.size > 0) .filter(r => r.getIdentifier != null && r.getIdentifier.size > 0)
.write.mode(SaveMode.Overwrite).save(s"${scholixResourcePath(workingPath)}_native") .write
.mode(SaveMode.Overwrite)
.save(s"${scholixResourcePath(workingPath)}_native")
} }
/** /** This method convert the new Datacite Resource into Scholix Resource
* This method convert the new Datacite Resource into Scholix Resource
* Needed to fill the source and the type of Scholix Relationships * Needed to fill the source and the type of Scholix Relationships
* @param workingPath the Working Path * @param workingPath the Working Path
* @param spark The spark Session * @param spark The spark Session
@ -103,25 +113,28 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result] implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
import spark.implicits._ import spark.implicits._
spark.read.load(dataciteOAFPath(workingPath)).as[Oaf] spark.read
.load(dataciteOAFPath(workingPath))
.as[Oaf]
.filter(_.isInstanceOf[Result]) .filter(_.isInstanceOf[Result])
.map(_.asInstanceOf[Result]) .map(_.asInstanceOf[Result])
.map(ScholixUtils.generateScholixResourceFromResult) .map(ScholixUtils.generateScholixResourceFromResult)
.filter(r => r.getIdentifier != null && r.getIdentifier.size > 0) .filter(r => r.getIdentifier != null && r.getIdentifier.size > 0)
.write.mode(SaveMode.Overwrite).save(s"${scholixResourcePath(workingPath)}_update") .write
.mode(SaveMode.Overwrite)
.save(s"${scholixResourcePath(workingPath)}_update")
val update = spark.read.load(s"${scholixResourcePath(workingPath)}_update").as[ScholixResource] val update = spark.read.load(s"${scholixResourcePath(workingPath)}_update").as[ScholixResource]
val native = spark.read.load(s"${scholixResourcePath(workingPath)}_native").as[ScholixResource] val native = spark.read.load(s"${scholixResourcePath(workingPath)}_native").as[ScholixResource]
val graph = update.union(native) val graph = update
.union(native)
.groupByKey(_.getDnetIdentifier) .groupByKey(_.getDnetIdentifier)
.reduceGroups((a, b) => if (a != null && a.getDnetIdentifier != null) a else b) .reduceGroups((a, b) => if (a != null && a.getDnetIdentifier != null) a else b)
.map(_._2) .map(_._2)
graph.write.mode(SaveMode.Overwrite).save(s"${scholixResourcePath(workingPath)}_graph") graph.write.mode(SaveMode.Overwrite).save(s"${scholixResourcePath(workingPath)}_graph")
} }
/** This method get and Transform only datacite records with
/**
* This method get and Transform only datacite records with
* timestamp greater than timestamp * timestamp greater than timestamp
* @param datacitePath the datacite input Path * @param datacitePath the datacite input Path
* @param timestamp the timestamp * @param timestamp the timestamp
@ -130,31 +143,44 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
* @param vocabularies Vocabularies needed for transformation * @param vocabularies Vocabularies needed for transformation
*/ */
def getDataciteUpdate(datacitePath:String, timestamp:Long, workingPath:String, spark:SparkSession,vocabularies: VocabularyGroup): Long = { def getDataciteUpdate(
datacitePath: String,
timestamp: Long,
workingPath: String,
spark: SparkSession,
vocabularies: VocabularyGroup
): Long = {
import spark.implicits._ import spark.implicits._
val ds = spark.read.load(datacitePath).as[DataciteType] val ds = spark.read.load(datacitePath).as[DataciteType]
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
val total = ds.filter(_.timestamp >= timestamp).count() val total = ds.filter(_.timestamp >= timestamp).count()
if (total > 0) { if (total > 0) {
ds.filter(_.timestamp >= timestamp) ds.filter(_.timestamp >= timestamp)
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks = true)) .flatMap(d =>
.flatMap(i => fixRelations(i)).filter(i => i != null) DataciteToOAFTransformation
.write.mode(SaveMode.Overwrite).save(dataciteOAFPath(workingPath)) .generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks = true)
)
.flatMap(i => fixRelations(i))
.filter(i => i != null)
.write
.mode(SaveMode.Overwrite)
.save(dataciteOAFPath(workingPath))
} }
total total
} }
/** /** After added the new ScholixResource, we need to update the scholix Pid Map
* After added the new ScholixResource, we need to update the scholix Pid Map
* to intersected with the new Datacite Relations * to intersected with the new Datacite Relations
*
* @param workingPath The working Path starting from save the new Map * @param workingPath The working Path starting from save the new Map
* @param spark the spark session * @param spark the spark session
*/ */
def generatePidMap(workingPath: String, spark: SparkSession): Unit = { def generatePidMap(workingPath: String, spark: SparkSession): Unit = {
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource] implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
import spark.implicits._ import spark.implicits._
spark.read.load(s"${scholixResourcePath(workingPath)}_graph").as[ScholixResource] spark.read
.load(s"${scholixResourcePath(workingPath)}_graph")
.as[ScholixResource]
.flatMap(r => .flatMap(r =>
r.getIdentifier.asScala r.getIdentifier.asScala
.map(i => DHPUtils.generateUnresolvedIdentifier(i.getIdentifier, i.getSchema)) .map(i => DHPUtils.generateUnresolvedIdentifier(i.getIdentifier, i.getSchema))
@ -163,11 +189,12 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
.groupByKey(_._1) .groupByKey(_._1)
.reduceGroups((a, b) => if (a != null && a._2 != null) a else b) .reduceGroups((a, b) => if (a != null && a._2 != null) a else b)
.map(_._2)(Encoders.tuple(Encoders.STRING, Encoders.STRING)) .map(_._2)(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.write.mode(SaveMode.Overwrite).save(pidMapPath(workingPath)) .write
.mode(SaveMode.Overwrite)
.save(pidMapPath(workingPath))
} }
/** /** This method resolve the datacite relation and filter the resolved
* This method resolve the datacite relation and filter the resolved
* relation * relation
* @param workingPath the working path * @param workingPath the working path
* @param spark the spark session * @param spark the spark session
@ -180,7 +207,9 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
val pidMap = spark.read.load(pidMapPath(workingPath)).as[(String, String)] val pidMap = spark.read.load(pidMapPath(workingPath)).as[(String, String)]
val unresolvedRelations:Dataset[(String,Relation)] = spark.read.load(dataciteOAFPath(workingPath)).as[Oaf] val unresolvedRelations: Dataset[(String, Relation)] = spark.read
.load(dataciteOAFPath(workingPath))
.as[Oaf]
.filter(_.isInstanceOf[Relation]) .filter(_.isInstanceOf[Relation])
.map(_.asInstanceOf[Relation]) .map(_.asInstanceOf[Relation])
.map { r => .map { r =>
@ -202,15 +231,12 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
r r
})(relationEncoder) })(relationEncoder)
.filter(r => !(r.getSource.startsWith("unresolved") || r.getTarget.startsWith("unresolved"))) .filter(r => !(r.getSource.startsWith("unresolved") || r.getTarget.startsWith("unresolved")))
.write.mode(SaveMode.Overwrite) .write
.mode(SaveMode.Overwrite)
.save(resolvedRelationPath(workingPath)) .save(resolvedRelationPath(workingPath))
} }
/** This method generate scholix starting from resolved relation
/**
* This method generate scholix starting from resolved relation
*
* *
* @param workingPath * @param workingPath
* @param spark * @param spark
@ -220,35 +246,44 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix] implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource] implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
implicit val relationEncoder: Encoder[Relation] = Encoders.kryo[Relation] implicit val relationEncoder: Encoder[Relation] = Encoders.kryo[Relation]
implicit val intermediateEncoder :Encoder[(String,Scholix)] = Encoders.tuple(Encoders.STRING, scholixEncoder) implicit val intermediateEncoder: Encoder[(String, Scholix)] =
Encoders.tuple(Encoders.STRING, scholixEncoder)
val relations: Dataset[(String, Relation)] = spark.read
.load(resolvedRelationPath(workingPath))
.as[Relation]
.map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relationEncoder))
val relations:Dataset[(String, Relation)] = spark.read.load(resolvedRelationPath(workingPath)).as[Relation].map(r =>(r.getSource,r))(Encoders.tuple(Encoders.STRING, relationEncoder)) val id_summary: Dataset[(String, ScholixResource)] = spark.read
.load(s"${scholixResourcePath(workingPath)}_graph")
val id_summary:Dataset[(String,ScholixResource)] = spark.read.load(s"${scholixResourcePath(workingPath)}_graph").as[ScholixResource].map(r => (r.getDnetIdentifier,r))(Encoders.tuple(Encoders.STRING, scholixResourceEncoder)) .as[ScholixResource]
.map(r => (r.getDnetIdentifier, r))(Encoders.tuple(Encoders.STRING, scholixResourceEncoder))
id_summary.cache() id_summary.cache()
relations.joinWith(id_summary, relations("_1").equalTo(id_summary("_1")),"inner") relations
.joinWith(id_summary, relations("_1").equalTo(id_summary("_1")), "inner")
.map(t => (t._1._2.getTarget, ScholixUtils.scholixFromSource(t._1._2, t._2._2))) .map(t => (t._1._2.getTarget, ScholixUtils.scholixFromSource(t._1._2, t._2._2)))
.write.mode(SaveMode.Overwrite).save(s"$workingPath/scholix_one_verse") .write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/scholix_one_verse")
val source_scholix:Dataset[(String, Scholix)] =spark.read.load(s"$workingPath/scholix_one_verse").as[(String,Scholix)] val source_scholix: Dataset[(String, Scholix)] =
spark.read.load(s"$workingPath/scholix_one_verse").as[(String, Scholix)]
source_scholix.joinWith(id_summary, source_scholix("_1").equalTo(id_summary("_1")),"inner") source_scholix
.joinWith(id_summary, source_scholix("_1").equalTo(id_summary("_1")), "inner")
.map(t => { .map(t => {
val target: ScholixResource = t._2._2 val target: ScholixResource = t._2._2
val scholix: Scholix = t._1._2 val scholix: Scholix = t._1._2
ScholixUtils.generateCompleteScholix(scholix, target) ScholixUtils.generateCompleteScholix(scholix, target)
})(scholixEncoder).write.mode(SaveMode.Overwrite).save(s"$workingPath/scholix") })(scholixEncoder)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/scholix")
} }
/** Here all the spark applications runs this method
/**
* Here all the spark applications runs this method
* where the whole logic of the spark node is defined * where the whole logic of the spark node is defined
*/ */
override def run(): Unit = { override def run(): Unit = {
@ -268,7 +303,6 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService) val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
require(vocabularies != null) require(vocabularies != null)
val updateDS: Boolean = "true".equalsIgnoreCase(parser.get("updateDS")) val updateDS: Boolean = "true".equalsIgnoreCase(parser.get("updateDS"))
log.info(s"updateDS is '$updateDS'") log.info(s"updateDS is '$updateDS'")
@ -277,15 +311,18 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
generateScholixResource(s"$sourcePath/provision/summaries", workingPath, spark) generateScholixResource(s"$sourcePath/provision/summaries", workingPath, spark)
log.info("Retrieve last entities collected From starting from scholix Graph") log.info("Retrieve last entities collected From starting from scholix Graph")
lastCollectionDate = retrieveLastCollectedFrom(spark, s"$sourcePath/entities") lastCollectionDate = retrieveLastCollectedFrom(spark, s"$sourcePath/entities")
} } else {
else {
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
fs.delete(new Path(s"${scholixResourcePath(workingPath)}_native"), true) fs.delete(new Path(s"${scholixResourcePath(workingPath)}_native"), true)
fs.rename(new Path(s"${scholixResourcePath(workingPath)}_graph"), new Path(s"${scholixResourcePath(workingPath)}_native")) fs.rename(
new Path(s"${scholixResourcePath(workingPath)}_graph"),
new Path(s"${scholixResourcePath(workingPath)}_native")
)
lastCollectionDate = retrieveLastCollectedFrom(spark, dataciteOAFPath(workingPath)) lastCollectionDate = retrieveLastCollectedFrom(spark, dataciteOAFPath(workingPath))
} }
val numRecords = getDataciteUpdate(datacitePath, lastCollectionDate, workingPath, spark, vocabularies) val numRecords =
getDataciteUpdate(datacitePath, lastCollectionDate, workingPath, spark, vocabularies)
if (numRecords > 0) { if (numRecords > 0) {
addMissingScholixResource(workingPath, spark) addMissingScholixResource(workingPath, spark)
generatePidMap(workingPath, spark) generatePidMap(workingPath, spark)
@ -295,11 +332,14 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
} }
} }
object SparkRetrieveDataciteDelta { object SparkRetrieveDataciteDelta {
val log: Logger = LoggerFactory.getLogger(SparkRetrieveDataciteDelta.getClass) val log: Logger = LoggerFactory.getLogger(SparkRetrieveDataciteDelta.getClass)
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
new SparkRetrieveDataciteDelta("/eu/dnetlib/dhp/sx/graph/retrieve_datacite_delta_params.json", args, log).initialize().run() new SparkRetrieveDataciteDelta(
"/eu/dnetlib/dhp/sx/graph/retrieve_datacite_delta_params.json",
args,
log
).initialize().run()
} }
} }
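
The delta computation hinges on ISO8601toEpochMillis, which parses dates with the pattern yyyy-MM-dd'T'HH:mm:ssZ and divides by 1000 where epoch seconds are needed. A tiny stand-alone sketch of that conversion; object and method names are illustrative:

import java.text.SimpleDateFormat

object Iso8601Sketch {

  // same pattern as SparkRetrieveDataciteDelta.ISO_DATE_PATTERN
  private val IsoDatePattern = "yyyy-MM-dd'T'HH:mm:ssZ"

  // a new formatter per call keeps the sketch thread-safe (SimpleDateFormat is not)
  def toEpochMillis(inputDate: String): Long =
    new SimpleDateFormat(IsoDatePattern).parse(inputDate).getTime

  def main(args: Array[String]): Unit = {
    val millis = toEpochMillis("2021-07-14T11:52:54+0000")
    println(millis)        // epoch milliseconds
    println(millis / 1000) // epoch seconds, as used to filter the Datacite delta by timestamp
  }
}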


@ -1,6 +1,5 @@
package eu.dnetlib.dhp.datacite package eu.dnetlib.dhp.datacite
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature} import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.oaf.Oaf import eu.dnetlib.dhp.schema.oaf.Oaf
@ -37,7 +36,6 @@ class DataciteToOAFTest extends AbstractVocabularyTest{
FileUtils.deleteDirectory(workingDir.toFile) FileUtils.deleteDirectory(workingDir.toFile)
} }
@Test @Test
def testDateMapping: Unit = { def testDateMapping: Unit = {
val inputDate = "2021-07-14T11:52:54+0000" val inputDate = "2021-07-14T11:52:54+0000"
@ -45,24 +43,21 @@ class DataciteToOAFTest extends AbstractVocabularyTest{
val dt = ISO8601FORMAT.parse(inputDate) val dt = ISO8601FORMAT.parse(inputDate)
println(dt.getTime) println(dt.getTime)
} }
@Test @Test
def testConvert(): Unit = { def testConvert(): Unit = {
val path = getClass.getResource("/eu/dnetlib/dhp/actionmanager/datacite/dataset").getPath val path = getClass.getResource("/eu/dnetlib/dhp/actionmanager/datacite/dataset").getPath
val conf = new SparkConf() val conf = new SparkConf()
val spark:SparkSession = SparkSession.builder().config(conf) val spark: SparkSession = SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master("local[*]") .master("local[*]")
.getOrCreate() .getOrCreate()
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
val instance = new GenerateDataciteDatasetSpark(null, null, log) val instance = new GenerateDataciteDatasetSpark(null, null, log)
val targetPath = s"$workingDir/result" val targetPath = s"$workingDir/result"
@ -73,30 +68,31 @@ class DataciteToOAFTest extends AbstractVocabularyTest{
val nativeSize = spark.read.load(path).count() val nativeSize = spark.read.load(path).count()
assertEquals(100, nativeSize) assertEquals(100, nativeSize)
val result: Dataset[Oaf] = spark.read.load(targetPath).as[Oaf] val result: Dataset[Oaf] = spark.read.load(targetPath).as[Oaf]
result
result.map(s => s.getClass.getSimpleName).groupBy(col("value").alias("class")).agg(count("value").alias("Total")).show(false) .map(s => s.getClass.getSimpleName)
.groupBy(col("value").alias("class"))
.agg(count("value").alias("Total"))
.show(false)
val t = spark.read.load(targetPath).count() val t = spark.read.load(targetPath).count()
assertTrue(t > 0) assertTrue(t > 0)
spark.stop() spark.stop()
} }
@Test @Test
def testMapping(): Unit = { def testMapping(): Unit = {
val record =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/record.json")).mkString val record = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/record.json")
)
.mkString
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT) val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
val res: List[Oaf] = DataciteToOAFTransformation.generateOAF(record, 0L, 0L, vocabularies, true) val res: List[Oaf] = DataciteToOAFTransformation.generateOAF(record, 0L, 0L, vocabularies, true)
@ -107,8 +103,6 @@ class DataciteToOAFTest extends AbstractVocabularyTest{
}) })
} }
} }
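
The reformatted test summarizes the generated Oaf records by grouping on the class simple name and counting. A minimal sketch of the same groupBy/agg/show aggregation over stand-in values with a local SparkSession:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, count}

object ClassCountSketch {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("class-count-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    // stand-in records; the test above applies the same aggregation to the generated Oaf dataset
    val values: Seq[Any] = Seq(1, 2L, "a", "b", "c")
    val names = spark.createDataset(values.map(_.getClass.getSimpleName))

    names
      .groupBy(col("value").alias("class"))
      .agg(count("value").alias("Total"))
      .show(false)

    spark.stop()
  }
}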


@ -22,7 +22,6 @@ import scala.xml.pull.XMLEventReader
@ExtendWith(Array(classOf[MockitoExtension])) @ExtendWith(Array(classOf[MockitoExtension]))
class BioScholixTest extends AbstractVocabularyTest { class BioScholixTest extends AbstractVocabularyTest {
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT) val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
@ -38,53 +37,55 @@ class BioScholixTest extends AbstractVocabularyTest{
} }
object GzFileIterator { object GzFileIterator {
def apply(is: InputStream, encoding: String) = { def apply(is: InputStream, encoding: String) = {
new BufferedReaderIterator( new BufferedReaderIterator(
new BufferedReader( new BufferedReader(new InputStreamReader(new GZIPInputStream(is), encoding))
new InputStreamReader( )
new GZIPInputStream(
is), encoding)))
} }
} }
@Test @Test
def testEBIData() = { def testEBIData() = {
val inputXML = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")).mkString val inputXML = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
.mkString
val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes())) val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s))) new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
} }
@Test @Test
def testPubmedToOaf(): Unit = { def testPubmedToOaf(): Unit = {
assertNotNull(vocabularies) assertNotNull(vocabularies)
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource")) assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump")).mkString val records: String = Source
val r:List[Oaf] = records.lines.toList.map(s=>mapper.readValue(s, classOf[PMArticle])).map(a => PubMedToOaf.convert(a, vocabularies)) .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump"))
.mkString
val r: List[Oaf] = records.lines.toList
.map(s => mapper.readValue(s, classOf[PMArticle]))
.map(a => PubMedToOaf.convert(a, vocabularies))
assertEquals(10, r.size) assertEquals(10, r.size)
assertTrue(r.map(p => p.asInstanceOf[Result]).flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid)).exists(p => "0037".equalsIgnoreCase(p))) assertTrue(
r.map(p => p.asInstanceOf[Result])
.flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid))
.exists(p => "0037".equalsIgnoreCase(p))
)
println(mapper.writeValueAsString(r.head)) println(mapper.writeValueAsString(r.head))
} }
@Test @Test
def testPDBToOAF(): Unit = { def testPDBToOAF(): Unit = {
assertNotNull(vocabularies) assertNotNull(vocabularies)
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource")) assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pdb_dump")).mkString val records: String = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pdb_dump"))
.mkString
records.lines.foreach(s => assertTrue(s.nonEmpty)) records.lines.foreach(s => assertTrue(s.nonEmpty))
val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.pdbTOOaf(o)) val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.pdbTOOaf(o))
assertTrue(result.nonEmpty) assertTrue(result.nonEmpty)
result.foreach(r => assertNotNull(r)) result.foreach(r => assertNotNull(r))
@ -93,20 +94,19 @@ class BioScholixTest extends AbstractVocabularyTest{
} }
@Test @Test
def testUNIprotToOAF(): Unit = { def testUNIprotToOAF(): Unit = {
assertNotNull(vocabularies) assertNotNull(vocabularies)
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource")) assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump")).mkString val records: String = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump"))
.mkString
records.lines.foreach(s => assertTrue(s.nonEmpty)) records.lines.foreach(s => assertTrue(s.nonEmpty))
val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.uniprotToOAF(o)) val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.uniprotToOAF(o))
assertTrue(result.nonEmpty) assertTrue(result.nonEmpty)
result.foreach(r => assertNotNull(r)) result.foreach(r => assertNotNull(r))
@ -115,7 +115,14 @@ class BioScholixTest extends AbstractVocabularyTest{
} }
case class EBILinks(relType:String, date:String, title:String, pmid:String, targetPid:String, targetPidType:String) {} case class EBILinks(
relType: String,
date: String,
title: String,
pmid: String,
targetPid: String,
targetPidType: String
) {}
def parse_ebi_links(input: String): List[EBILinks] = { def parse_ebi_links(input: String): List[EBILinks] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -135,14 +142,14 @@ class BioScholixTest extends AbstractVocabularyTest{
} yield EBILinks(relation, publicationDate, title, pmid, id, idScheme) } yield EBILinks(relation, publicationDate, title, pmid, id, idScheme)
} }
@Test @Test
def testCrossrefLinksToOAF(): Unit = { def testCrossrefLinksToOAF(): Unit = {
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/crossref_links")).mkString val records: String = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/crossref_links"))
.mkString
records.lines.foreach(s => assertTrue(s.nonEmpty)) records.lines.foreach(s => assertTrue(s.nonEmpty))
val result: List[Oaf] = records.lines.map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList val result: List[Oaf] = records.lines.map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList
assertNotNull(result) assertNotNull(result)
@ -154,24 +161,30 @@ class BioScholixTest extends AbstractVocabularyTest{
@Test @Test
def testEBILinksToOAF(): Unit = { def testEBILinksToOAF(): Unit = {
val iterator = GzFileIterator(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/ebi_links.gz"), "UTF-8") val iterator = GzFileIterator(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/ebi_links.gz"),
"UTF-8"
)
val data = iterator.next() val data = iterator.next()
val res = BioDBToOAF.parse_ebi_links(BioDBToOAF.extractEBILinksFromDump(data).links).filter(BioDBToOAF.EBITargetLinksFilter).flatMap(BioDBToOAF.convertEBILinksToOaf) val res = BioDBToOAF
.parse_ebi_links(BioDBToOAF.extractEBILinksFromDump(data).links)
.filter(BioDBToOAF.EBITargetLinksFilter)
.flatMap(BioDBToOAF.convertEBILinksToOaf)
print(res.length) print(res.length)
println(mapper.writeValueAsString(res.head)) println(mapper.writeValueAsString(res.head))
} }
@Test @Test
def scholixResolvedToOAF(): Unit = { def scholixResolvedToOAF(): Unit = {
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved")).mkString val records: String = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved")
)
.mkString
records.lines.foreach(s => assertTrue(s.nonEmpty)) records.lines.foreach(s => assertTrue(s.nonEmpty))
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -181,7 +194,6 @@ class BioScholixTest extends AbstractVocabularyTest{
json.extract[ScholixResolved] json.extract[ScholixResolved]
}.toList }.toList
val result: List[Oaf] = l.map(s => BioDBToOAF.scholixResolvedToOAF(s)) val result: List[Oaf] = l.map(s => BioDBToOAF.scholixResolvedToOAF(s))
assertTrue(result.nonEmpty) assertTrue(result.nonEmpty)
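
GzFileIterator in the test wraps a GZIPInputStream in a BufferedReader to stream a gzipped resource line by line. A self-contained sketch of the same idea using scala.io.Source, with an in-memory gzip payload so it runs without the test resources; names are illustrative:

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, InputStream}
import java.util.zip.{GZIPInputStream, GZIPOutputStream}
import scala.io.Source

object GzLinesSketch {

  // stream a gzipped input line by line, as the test's GzFileIterator does for ebi_links.gz
  def gzLines(is: InputStream, encoding: String = "UTF-8"): Iterator[String] =
    Source.fromInputStream(new GZIPInputStream(is), encoding).getLines()

  def main(args: Array[String]): Unit = {
    // build a tiny gzipped payload in memory so the sketch runs without the test resources
    val out = new ByteArrayOutputStream()
    val gz = new GZIPOutputStream(out)
    gz.write("first line\nsecond line\n".getBytes("UTF-8"))
    gz.close()

    gzLines(new ByteArrayInputStream(out.toByteArray)).foreach(println)
  }
}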


@ -16,10 +16,22 @@ import java.time.LocalDate
import java.time.format.DateTimeFormatter import java.time.format.DateTimeFormatter
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
case class HostedByItemType(
id: String,
officialname: String,
issn: String,
eissn: String,
lissn: String,
openAccess: Boolean
) {}
case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {} case class DoiBoostAffiliation(
PaperId: Long,
case class DoiBoostAffiliation(PaperId:Long, AffiliationId:Long, GridId:Option[String], OfficialPage:Option[String], DisplayName:Option[String]){} AffiliationId: Long,
GridId: Option[String],
OfficialPage: Option[String],
DisplayName: Option[String]
) {}
object DoiBoostMappingUtil { object DoiBoostMappingUtil {
@ -43,7 +55,17 @@ object DoiBoostMappingUtil {
val DOI_PREFIX_REGEX = "(^10\\.|\\/10.)" val DOI_PREFIX_REGEX = "(^10\\.|\\/10.)"
val DOI_PREFIX = "10." val DOI_PREFIX = "10."
val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;") val invalidName = List(
",",
"none none",
"none, none",
"none &na;",
"(:null)",
"test test test",
"test test",
"test",
"&na; &na;"
)
def toActionSet(item: Oaf): (String, String) = { def toActionSet(item: Oaf): (String, String) = {
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
@ -75,7 +97,6 @@ object DoiBoostMappingUtil {
} }
def toHostedByItem(input: String): (String, HostedByItemType) = { def toHostedByItem(input: String): (String, HostedByItemType) = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -84,7 +105,6 @@ object DoiBoostMappingUtil {
(c.keys.head, c.values.head) (c.keys.head, c.values.head)
} }
def toISSNPair(publication: Publication): (String, Publication) = { def toISSNPair(publication: Publication): (String, Publication) = {
val issn = if (publication.getJournal == null) null else publication.getJournal.getIssnPrinted val issn = if (publication.getJournal == null) null else publication.getJournal.getIssnPrinted
val eissn = if (publication.getJournal == null) null else publication.getJournal.getIssnOnline val eissn = if (publication.getJournal == null) null else publication.getJournal.getIssnOnline
@ -100,26 +120,24 @@ object DoiBoostMappingUtil {
(publication.getId, publication) (publication.getId, publication)
} }
def generateGridAffiliationId(gridId: String): String = { def generateGridAffiliationId(gridId: String): String = {
s"20|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}" s"20|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}"
} }
def fixResult(result: Dataset): Dataset = { def fixResult(result: Dataset): Dataset = {
val instanceType = extractInstance(result) val instanceType = extractInstance(result)
if (instanceType.isDefined) { if (instanceType.isDefined) {
result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype)) result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
} }
result.getInstance().asScala.foreach(i => { result
.getInstance()
.asScala
.foreach(i => {
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY) i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
}) })
result result
} }
def decideAccessRight(lic: Field[String], date: String): AccessRight = { def decideAccessRight(lic: Field[String], date: String): AccessRight = {
if (lic == null) { if (lic == null) {
//Default value Unknown //Default value Unknown
@ -127,7 +145,8 @@ object DoiBoostMappingUtil {
} }
val license: String = lic.getValue val license: String = lic.getValue
//CC licenses //CC licenses
if(license.startsWith("cc") || if (
license.startsWith("cc") ||
license.startsWith("http://creativecommons.org/licenses") || license.startsWith("http://creativecommons.org/licenses") ||
license.startsWith("https://creativecommons.org/licenses") || license.startsWith("https://creativecommons.org/licenses") ||
@ -137,7 +156,8 @@ object DoiBoostMappingUtil {
license.equals("http://pubs.acs.org/page/policy/authorchoice_ccbyncnd_termsofuse.html") || license.equals("http://pubs.acs.org/page/policy/authorchoice_ccbyncnd_termsofuse.html") ||
//APA (considered OPEN also by Unpaywall) //APA (considered OPEN also by Unpaywall)
license.equals("http://www.apa.org/pubs/journals/resources/open-access.aspx")){ license.equals("http://www.apa.org/pubs/journals/resources/open-access.aspx")
) {
val oaq: AccessRight = getOpenAccessQualifier() val oaq: AccessRight = getOpenAccessQualifier()
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid) oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
@ -145,7 +165,11 @@ object DoiBoostMappingUtil {
} }
//OUP (BUT ONLY AFTER 12 MONTHS FROM THE PUBLICATION DATE, OTHERWISE THEY ARE EMBARGOED) //OUP (BUT ONLY AFTER 12 MONTHS FROM THE PUBLICATION DATE, OTHERWISE THEY ARE EMBARGOED)
if(license.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")){ if (
license.equals(
"https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"
)
) {
val now = java.time.LocalDate.now val now = java.time.LocalDate.now
try { try {
@ -154,20 +178,19 @@ object DoiBoostMappingUtil {
val oaq: AccessRight = getOpenAccessQualifier() val oaq: AccessRight = getOpenAccessQualifier()
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid) oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
return oaq return oaq
} } else {
else{
return getEmbargoedAccessQualifier() return getEmbargoedAccessQualifier()
} }
} catch { } catch {
case e: Exception => { case e: Exception => {
try { try {
val pub_date = LocalDate.parse(date, DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'")) val pub_date =
LocalDate.parse(date, DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'"))
if (((now.toEpochDay - pub_date.toEpochDay) / 365.0) > 1) { if (((now.toEpochDay - pub_date.toEpochDay) / 365.0) > 1) {
val oaq: AccessRight = getOpenAccessQualifier() val oaq: AccessRight = getOpenAccessQualifier()
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid) oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
return oaq return oaq
} } else {
else{
return getEmbargoedAccessQualifier() return getEmbargoedAccessQualifier()
} }
} catch { } catch {
@ -183,34 +206,56 @@ object DoiBoostMappingUtil {
} }
def getOpenAccessQualifier(): AccessRight = { def getOpenAccessQualifier(): AccessRight = {
OafMapperUtils.accessRight(ModelConstants.ACCESS_RIGHT_OPEN,"Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) OafMapperUtils.accessRight(
ModelConstants.ACCESS_RIGHT_OPEN,
"Open Access",
ModelConstants.DNET_ACCESS_MODES,
ModelConstants.DNET_ACCESS_MODES
)
} }
def getRestrictedQualifier(): AccessRight = { def getRestrictedQualifier(): AccessRight = {
OafMapperUtils.accessRight( "RESTRICTED","Restricted",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) OafMapperUtils.accessRight(
"RESTRICTED",
"Restricted",
ModelConstants.DNET_ACCESS_MODES,
ModelConstants.DNET_ACCESS_MODES
)
} }
def getUnknownQualifier(): AccessRight = { def getUnknownQualifier(): AccessRight = {
OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) OafMapperUtils.accessRight(
ModelConstants.UNKNOWN,
ModelConstants.NOT_AVAILABLE,
ModelConstants.DNET_ACCESS_MODES,
ModelConstants.DNET_ACCESS_MODES
)
} }
def getEmbargoedAccessQualifier(): AccessRight = { def getEmbargoedAccessQualifier(): AccessRight = {
OafMapperUtils.accessRight("EMBARGO","Embargo",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) OafMapperUtils.accessRight(
"EMBARGO",
"Embargo",
ModelConstants.DNET_ACCESS_MODES,
ModelConstants.DNET_ACCESS_MODES
)
} }
def getClosedAccessQualifier(): AccessRight = { def getClosedAccessQualifier(): AccessRight = {
OafMapperUtils.accessRight("CLOSED","Closed Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) OafMapperUtils.accessRight(
"CLOSED",
"Closed Access",
ModelConstants.DNET_ACCESS_MODES,
ModelConstants.DNET_ACCESS_MODES
)
} }
def extractInstance(r: Result): Option[Instance] = { def extractInstance(r: Result): Option[Instance] = {
r.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty) r.getInstance()
.asScala
.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
} }
def fixPublication(input: ((String, Publication), (String, HostedByItemType))): Publication = { def fixPublication(input: ((String, Publication), (String, HostedByItemType))): Publication = {
@ -222,10 +267,16 @@ object DoiBoostMappingUtil {
val instanceType: Option[Instance] = extractInstance(publication) val instanceType: Option[Instance] = extractInstance(publication)
if (instanceType.isDefined) { if (instanceType.isDefined) {
publication.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype)) publication
.getInstance()
.asScala
.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
} }
publication.getInstance().asScala.foreach(i => { publication
.getInstance()
.asScala
.foreach(i => {
var hb = new KeyValue var hb = new KeyValue
if (item != null) { if (item != null) {
hb.setValue(item.officialname) hb.setValue(item.officialname)
@ -235,8 +286,7 @@ object DoiBoostMappingUtil {
i.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold) i.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
} }
} } else {
else {
hb = ModelConstants.UNKNOWN_REPOSITORY hb = ModelConstants.UNKNOWN_REPOSITORY
} }
i.setHostedby(hb) i.setHostedby(hb)
@ -270,17 +320,22 @@ object DoiBoostMappingUtil {
if (publication.getTitle == null || publication.getTitle.size == 0) if (publication.getTitle == null || publication.getTitle.size == 0)
return false return false
val s = publication.getTitle.asScala.count(p =>
val s = publication.getTitle.asScala.count(p => p.getValue != null p.getValue != null
&& p.getValue.nonEmpty && !p.getValue.equalsIgnoreCase("[NO TITLE AVAILABLE]")) && p.getValue.nonEmpty && !p.getValue.equalsIgnoreCase("[NO TITLE AVAILABLE]")
)
if (s == 0) if (s == 0)
return false return false
// fixes #4360 (test publisher) // fixes #4360 (test publisher)
val publisher = if (publication.getPublisher != null) publication.getPublisher.getValue else null val publisher =
if (publication.getPublisher != null) publication.getPublisher.getValue else null
if (publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher.equalsIgnoreCase("CrossRef Test Account"))) { if (
publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher
.equalsIgnoreCase("CrossRef Test Account"))
) {
return false; return false;
} }
@ -288,18 +343,12 @@ object DoiBoostMappingUtil {
if (publication.getAuthor == null || publication.getAuthor.size() == 0) if (publication.getAuthor == null || publication.getAuthor.size() == 0)
return false return false
//filter invalid author //filter invalid author
val authors = publication.getAuthor.asScala.map(s => { val authors = publication.getAuthor.asScala.map(s => {
if (s.getFullname.nonEmpty) { if (s.getFullname.nonEmpty) {
s.getFullname s.getFullname
} } else
else s"${s.getName} ${s.getSurname}"
s"${
s.getName
} ${
s.getSurname
}"
}) })
val c = authors.count(isValidAuthorName) val c = authors.count(isValidAuthorName)
@ -307,13 +356,16 @@ object DoiBoostMappingUtil {
return false return false
// fixes #4368 // fixes #4368
if (authors.count(s => s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase(publication.getPublisher.getValue)) if (
authors.count(s => s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase(
publication.getPublisher.getValue
)
)
return false return false
true true
} }
def isValidAuthorName(fullName: String): Boolean = { def isValidAuthorName(fullName: String): Boolean = {
if (fullName == null || fullName.isEmpty) if (fullName == null || fullName.isEmpty)
return false return false
@ -322,20 +374,30 @@ object DoiBoostMappingUtil {
true true
} }
def generateDataInfo(trust: String): DataInfo = { def generateDataInfo(trust: String): DataInfo = {
val di = new DataInfo val di = new DataInfo
di.setDeletedbyinference(false) di.setDeletedbyinference(false)
di.setInferred(false) di.setInferred(false)
di.setInvisible(false) di.setInvisible(false)
di.setTrust(trust) di.setTrust(trust)
di.setProvenanceaction(OafMapperUtils.qualifier(ModelConstants.SYSIMPORT_ACTIONSET,ModelConstants.SYSIMPORT_ACTIONSET, ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS)) di.setProvenanceaction(
OafMapperUtils.qualifier(
ModelConstants.SYSIMPORT_ACTIONSET,
ModelConstants.SYSIMPORT_ACTIONSET,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS
)
)
di di
} }
def createSP(
value: String,
def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String): StructuredProperty = { classId: String,
className: String,
schemeId: String,
schemeName: String
): StructuredProperty = {
val sp = new StructuredProperty val sp = new StructuredProperty
sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName)) sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName))
sp.setValue(value) sp.setValue(value)
@ -343,9 +405,14 @@ object DoiBoostMappingUtil {
} }
def createSP(
value: String,
def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String, dataInfo: DataInfo): StructuredProperty = { classId: String,
className: String,
schemeId: String,
schemeName: String,
dataInfo: DataInfo
): StructuredProperty = {
val sp = new StructuredProperty val sp = new StructuredProperty
sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName)) sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName))
sp.setValue(value) sp.setValue(value)
@ -362,9 +429,12 @@ object DoiBoostMappingUtil {
} }
def createSP(
value: String,
def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = { classId: String,
schemeId: String,
dataInfo: DataInfo
): StructuredProperty = {
val sp = new StructuredProperty val sp = new StructuredProperty
sp.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId)) sp.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId))
sp.setValue(value) sp.setValue(value)
@ -382,7 +452,6 @@ object DoiBoostMappingUtil {
} }
def createUnpayWallCollectedFrom(): KeyValue = { def createUnpayWallCollectedFrom(): KeyValue = {
val cf = new KeyValue val cf = new KeyValue
@ -401,15 +470,11 @@ object DoiBoostMappingUtil {
} }
def generateIdentifier(oaf: Result, doi: String): String = { def generateIdentifier(oaf: Result, doi: String): String = {
val id = DHPUtils.md5(doi.toLowerCase) val id = DHPUtils.md5(doi.toLowerCase)
s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}" s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}"
} }
def createMAGCollectedFrom(): KeyValue = { def createMAGCollectedFrom(): KeyValue = {
val cf = new KeyValue val cf = new KeyValue
@ -424,7 +489,6 @@ object DoiBoostMappingUtil {
tmp.setValue(value) tmp.setValue(value)
tmp tmp
} }
def isEmpty(x: String) = x == null || x.trim.isEmpty def isEmpty(x: String) = x == null || x.trim.isEmpty
@ -432,7 +496,10 @@ object DoiBoostMappingUtil {
def normalizeDoi(input: String): String = { def normalizeDoi(input: String): String = {
if (input == null) if (input == null)
return null return null
val replaced = input.replaceAll("(?:\\n|\\r|\\t|\\s)", "").toLowerCase.replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX) val replaced = input
.replaceAll("(?:\\n|\\r|\\t|\\s)", "")
.toLowerCase
.replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX)
if (isEmpty(replaced)) if (isEmpty(replaced))
return null return null
@ -446,9 +513,6 @@ object DoiBoostMappingUtil {
return ret return ret
} }
} }
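The last hunks of DoiBoostMappingUtil touch generateIdentifier and normalizeDoi. A self-contained sketch of how the two are typically combined (illustration only): the values of DOI_PREFIX_REGEX, DOI_PREFIX, doiBoostNSPREFIX and SEPARATOR below are assumptions, since the real constants are defined elsewhere in DoiBoostMappingUtil and not shown in this diff.

import java.security.MessageDigest

object DoiIdentifierSketch {
  // Assumed values for illustration; the real constants live in DoiBoostMappingUtil.
  val DOI_PREFIX_REGEX = "^.*?10\\."
  val DOI_PREFIX       = "10."
  val doiBoostNSPREFIX = "doiboost____"
  val SEPARATOR        = "::"

  def md5(s: String): String =
    MessageDigest.getInstance("MD5").digest(s.getBytes("UTF-8")).map("%02x".format(_)).mkString

  def isEmpty(x: String): Boolean = x == null || x.trim.isEmpty

  // Mirrors the reformatted normalizeDoi: strip whitespace, lowercase, normalise the "10." prefix.
  def normalizeDoi(input: String): String = {
    if (input == null) return null
    val replaced = input
      .replaceAll("(?:\\n|\\r|\\t|\\s)", "")
      .toLowerCase
      .replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX)
    if (isEmpty(replaced)) null else replaced
  }

  // Mirrors generateIdentifier: md5 of the lower-cased DOI under the 50| result namespace.
  def generateIdentifier(doi: String): String =
    s"50|$doiBoostNSPREFIX$SEPARATOR${md5(doi.toLowerCase)}"

  def main(args: Array[String]): Unit =
    println(generateIdentifier(normalizeDoi(" https://doi.org/10.1000/XYZ.123 ")))
}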


@ -17,22 +17,29 @@ object SparkGenerateDOIBoostActionSet {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json")
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication] implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization] implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset] implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation] implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
implicit val mapEncoderAS: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING) implicit val mapEncoderAS: Encoder[(String, String)] =
Encoders.tuple(Encoders.STRING, Encoders.STRING)
implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] = Encoders.kryo[AtomicAction[OafDataset]] implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] =
Encoders.kryo[AtomicAction[OafDataset]]
val dbPublicationPath = parser.get("dbPublicationPath") val dbPublicationPath = parser.get("dbPublicationPath")
val dbDatasetPath = parser.get("dbDatasetPath") val dbDatasetPath = parser.get("dbDatasetPath")
@ -41,35 +48,61 @@ object SparkGenerateDOIBoostActionSet {
val dbOrganizationPath = parser.get("dbOrganizationPath") val dbOrganizationPath = parser.get("dbOrganizationPath")
val sequenceFilePath = parser.get("sFilePath") val sequenceFilePath = parser.get("sFilePath")
val asDataset = spark.read.load(dbDatasetPath).as[OafDataset] val asDataset = spark.read
.load(dbDatasetPath)
.as[OafDataset]
.filter(p => p != null || p.getId != null) .filter(p => p != null || p.getId != null)
.map(d => DoiBoostMappingUtil.fixResult(d)) .map(d => DoiBoostMappingUtil.fixResult(d))
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) .map(d => DoiBoostMappingUtil.toActionSet(d))(
Encoders.tuple(Encoders.STRING, Encoders.STRING)
)
val asPublication = spark.read
val asPublication = spark.read.load(dbPublicationPath).as[Publication] .load(dbPublicationPath)
.as[Publication]
.filter(p => p != null || p.getId != null) .filter(p => p != null || p.getId != null)
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) .map(d => DoiBoostMappingUtil.toActionSet(d))(
Encoders.tuple(Encoders.STRING, Encoders.STRING)
)
val asOrganization = spark.read
.load(dbOrganizationPath)
.as[Organization]
.map(d => DoiBoostMappingUtil.toActionSet(d))(
Encoders.tuple(Encoders.STRING, Encoders.STRING)
)
val asOrganization = spark.read.load(dbOrganizationPath).as[Organization] val asCRelation = spark.read
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) .load(crossRefRelation)
.as[Relation]
val asCRelation = spark.read.load(crossRefRelation).as[Relation]
.filter(r => r != null && r.getSource != null && r.getTarget != null) .filter(r => r != null && r.getSource != null && r.getTarget != null)
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) .map(d => DoiBoostMappingUtil.toActionSet(d))(
Encoders.tuple(Encoders.STRING, Encoders.STRING)
)
val asRelAffiliation = spark.read
.load(dbaffiliationRelationPath)
.as[Relation]
.map(d => DoiBoostMappingUtil.toActionSet(d))(
Encoders.tuple(Encoders.STRING, Encoders.STRING)
)
val asRelAffiliation = spark.read.load(dbaffiliationRelationPath).as[Relation] val d: Dataset[(String, String)] = asDataset
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) .union(asPublication)
.union(asOrganization)
.union(asCRelation)
val d: Dataset[(String, String)] = asDataset.union(asPublication).union(asOrganization).union(asCRelation).union(asRelAffiliation) .union(asRelAffiliation)
d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])
d.rdd
.repartition(6000)
.map(s => (new Text(s._1), new Text(s._2)))
.saveAsHadoopFile(
s"$sequenceFilePath",
classOf[Text],
classOf[Text],
classOf[SequenceFileOutputFormat[Text, Text]],
classOf[GzipCodec]
)
} }
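SparkGenerateDOIBoostActionSet serialises every OAF entity into (class name, JSON) pairs and writes them as a gzip-compressed Text/Text SequenceFile. A small sketch, assuming a local placeholder path, of reading such a file back to inspect the action-set payload; the path and SparkSession settings are illustrative, not part of the workflow.

import org.apache.hadoop.io.Text
import org.apache.spark.sql.SparkSession

object InspectActionSetSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("inspect-actionset").master("local[*]").getOrCreate()
    val sc    = spark.sparkContext

    // "/tmp/doiboost_actionset" is an assumed path standing in for parser.get("sFilePath")
    sc.sequenceFile("/tmp/doiboost_actionset", classOf[Text], classOf[Text])
      .map { case (clazz, payload) => (clazz.toString, payload.toString) } // copy out of reused Writables
      .take(5)
      .foreach { case (clazz, payload) => println(s"$clazz -> ${payload.take(120)}") }

    spark.stop()
  }
}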


@ -15,8 +15,8 @@ import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
object SparkGenerateDoiBoost {
object SparkGenerateDoiBoost {
def extractIdGRID(input: String): List[(String, String)] = { def extractIdGRID(input: String): List[(String, String)] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -35,19 +35,23 @@ object SparkGenerateDoiBoost {
grids.map(g => (id, s"unresolved::grid::${g.toLowerCase}"))(collection.breakOut) grids.map(g => (id, s"unresolved::grid::${g.toLowerCase}"))(collection.breakOut)
} }
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass) val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json")
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
import spark.implicits._ import spark.implicits._
@ -65,8 +69,7 @@ object SparkGenerateDoiBoost {
a._2.setId(a._1) a._2.setId(a._1)
return a._2 return a._2
} }
} } else {
else {
if (a != null && a._2 != null) { if (a != null && a._2 != null) {
b.mergeFrom(a._2) b.mergeFrom(a._2)
b.setId(a._1) b.setId(a._1)
@ -82,8 +85,7 @@ object SparkGenerateDoiBoost {
if (b1 == null) { if (b1 == null) {
if (b2 != null) if (b2 != null)
return b2 return b2
} } else {
else {
if (b2 != null) { if (b2 != null) {
b1.mergeFrom(b2) b1.mergeFrom(b2)
val authors = AuthorMerger.mergeAuthor(b1.getAuthor, b2.getAuthor) val authors = AuthorMerger.mergeAuthor(b1.getAuthor, b2.getAuthor)
@ -103,17 +105,19 @@ object SparkGenerateDoiBoost {
override def outputEncoder: Encoder[Publication] = Encoders.kryo[Publication] override def outputEncoder: Encoder[Publication] = Encoders.kryo[Publication]
} }
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication] implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization] implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset] implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub) implicit val tupleForJoinEncoder: Encoder[(String, Publication)] =
Encoders.tuple(Encoders.STRING, mapEncoderPub)
implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation] implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
logger.info("Phase 2) Join Crossref with UnpayWall") logger.info("Phase 2) Join Crossref with UnpayWall")
val crossrefPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p => (p.getId, p)) val crossrefPublication: Dataset[(String, Publication)] =
val uwPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/uwPublication").as[Publication].map(p => (p.getId, p)) spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p => (p.getId, p))
val uwPublication: Dataset[(String, Publication)] =
spark.read.load(s"$workingDirPath/uwPublication").as[Publication].map(p => (p.getId, p))
def applyMerge(item: ((String, Publication), (String, Publication))): Publication = { def applyMerge(item: ((String, Publication), (String, Publication))): Publication = {
val crossrefPub = item._1._2 val crossrefPub = item._1._2
@ -127,53 +131,95 @@ object SparkGenerateDoiBoost {
crossrefPub crossrefPub
} }
crossrefPublication.joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/firstJoin") crossrefPublication
.joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")), "left")
.map(applyMerge)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingDirPath/firstJoin")
logger.info("Phase 3) Join Result with ORCID") logger.info("Phase 3) Join Result with ORCID")
val fj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p)) val fj: Dataset[(String, Publication)] =
val orcidPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p)) spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p))
fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/secondJoin") val orcidPublication: Dataset[(String, Publication)] =
spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p))
fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left")
.map(applyMerge)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingDirPath/secondJoin")
logger.info("Phase 4) Join Result with MAG") logger.info("Phase 4) Join Result with MAG")
val sj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p)) val sj: Dataset[(String, Publication)] =
spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p))
val magPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p)) val magPublication: Dataset[(String, Publication)] =
sj.joinWith(magPublication, sj("_1").equalTo(magPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublication") spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p))
sj.joinWith(magPublication, sj("_1").equalTo(magPublication("_1")), "left")
.map(applyMerge)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingDirPath/doiBoostPublication")
val doiBoostPublication: Dataset[(String, Publication)] = spark.read
.load(s"$workingDirPath/doiBoostPublication")
.as[Publication]
.filter(p => DoiBoostMappingUtil.filterPublication(p))
.map(DoiBoostMappingUtil.toISSNPair)(tupleForJoinEncoder)
val doiBoostPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublication").as[Publication].filter(p => DoiBoostMappingUtil.filterPublication(p)).map(DoiBoostMappingUtil.toISSNPair)(tupleForJoinEncoder) val hostedByDataset: Dataset[(String, HostedByItemType)] = spark.createDataset(
spark.sparkContext.textFile(hostedByMapPath).map(DoiBoostMappingUtil.toHostedByItem)
)
val hostedByDataset: Dataset[(String, HostedByItemType)] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath).map(DoiBoostMappingUtil.toHostedByItem)) doiBoostPublication
.joinWith(hostedByDataset, doiBoostPublication("_1").equalTo(hostedByDataset("_1")), "left")
doiBoostPublication.joinWith(hostedByDataset, doiBoostPublication("_1").equalTo(hostedByDataset("_1")), "left")
.map(DoiBoostMappingUtil.fixPublication) .map(DoiBoostMappingUtil.fixPublication)
.map(p => (p.getId, p)) .map(p => (p.getId, p))
.groupByKey(_._1) .groupByKey(_._1)
.agg(crossrefAggregator.toColumn) .agg(crossrefAggregator.toColumn)
.map(p => p._2) .map(p => p._2)
.write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationFiltered") .write
.mode(SaveMode.Overwrite)
.save(s"$workingDirPath/doiBoostPublicationFiltered")
val affiliationPath = parser.get("affiliationPath") val affiliationPath = parser.get("affiliationPath")
val paperAffiliationPath = parser.get("paperAffiliationPath") val paperAffiliationPath = parser.get("paperAffiliationPath")
val affiliation = spark.read.load(affiliationPath).select(col("AffiliationId"), col("GridId"), col("OfficialPage"), col("DisplayName")) val affiliation = spark.read
.load(affiliationPath)
val paperAffiliation = spark.read.load(paperAffiliationPath).select(col("AffiliationId").alias("affId"), col("PaperId")) .select(col("AffiliationId"), col("GridId"), col("OfficialPage"), col("DisplayName"))
val paperAffiliation = spark.read
.load(paperAffiliationPath)
.select(col("AffiliationId").alias("affId"), col("PaperId"))
val a: Dataset[DoiBoostAffiliation] = paperAffiliation val a: Dataset[DoiBoostAffiliation] = paperAffiliation
.joinWith(affiliation, paperAffiliation("affId").equalTo(affiliation("AffiliationId"))) .joinWith(affiliation, paperAffiliation("affId").equalTo(affiliation("AffiliationId")))
.select(col("_1.PaperId"), col("_2.AffiliationId"), col("_2.GridId"), col("_2.OfficialPage"), col("_2.DisplayName")).as[DoiBoostAffiliation] .select(
col("_1.PaperId"),
col("_2.AffiliationId"),
col("_2.GridId"),
col("_2.OfficialPage"),
col("_2.DisplayName")
)
.as[DoiBoostAffiliation]
val magPubs: Dataset[(String, Publication)] = spark.read
.load(s"$workingDirPath/doiBoostPublicationFiltered")
.as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(
tupleForJoinEncoder
)
.filter(s => s._1 != null)
val magPubs: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublicationFiltered").as[Publication] magPubs
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(tupleForJoinEncoder).filter(s => s._1 != null) .joinWith(a, magPubs("_1").equalTo(a("PaperId")))
.flatMap(item => {
magPubs.joinWith(a, magPubs("_1").equalTo(a("PaperId"))).flatMap(item => {
val pub: Publication = item._1._2 val pub: Publication = item._1._2
val affiliation = item._2 val affiliation = item._2
val affId: String = if (affiliation.GridId.isDefined) s"unresolved::grid::${affiliation.GridId.get.toLowerCase}" else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString) val affId: String =
if (affiliation.GridId.isDefined)
s"unresolved::grid::${affiliation.GridId.get.toLowerCase}"
else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString)
val r: Relation = new Relation val r: Relation = new Relation
r.setSource(pub.getId) r.setSource(pub.getId)
r.setTarget(affId) r.setTarget(affId)
@ -191,10 +237,15 @@ object SparkGenerateDoiBoost {
r1.setDataInfo(pub.getDataInfo) r1.setDataInfo(pub.getDataInfo)
r1.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava) r1.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
List(r, r1) List(r, r1)
})(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved") })(mapEncoderRel)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved")
val unresolvedRels: Dataset[(String, Relation)] = spark.read
val unresolvedRels: Dataset[(String, Relation)] = spark.read.load(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved").as[Relation].map(r => { .load(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved")
.as[Relation]
.map(r => {
if (r.getSource.startsWith("unresolved")) if (r.getSource.startsWith("unresolved"))
(r.getSource, r) (r.getSource, r)
@ -204,9 +255,16 @@ object SparkGenerateDoiBoost {
("resolved", r) ("resolved", r)
})(Encoders.tuple(Encoders.STRING, mapEncoderRel)) })(Encoders.tuple(Encoders.STRING, mapEncoderRel))
val openaireOrganization: Dataset[(String, String)] = spark.read.text(openaireOrganizationPath).as[String].flatMap(s => extractIdGRID(s)).groupByKey(_._2).reduceGroups((x, y) => if (x != null) x else y).map(_._2) val openaireOrganization: Dataset[(String, String)] = spark.read
.text(openaireOrganizationPath)
.as[String]
.flatMap(s => extractIdGRID(s))
.groupByKey(_._2)
.reduceGroups((x, y) => if (x != null) x else y)
.map(_._2)
unresolvedRels.joinWith(openaireOrganization, unresolvedRels("_1").equalTo(openaireOrganization("_2"))) unresolvedRels
.joinWith(openaireOrganization, unresolvedRels("_1").equalTo(openaireOrganization("_2")))
.map { x => .map { x =>
val currentRels = x._1._2 val currentRels = x._1._2
val currentOrgs = x._2 val currentOrgs = x._2
@ -216,9 +274,15 @@ object SparkGenerateDoiBoost {
else else
currentRels.setTarget(currentOrgs._1) currentRels.setTarget(currentOrgs._1)
currentRels currentRels
}.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved")).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation") }
.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved"))
.write
.mode(SaveMode.Overwrite)
.save(s"$workingDirPath/doiBoostPublicationAffiliation")
magPubs.joinWith(a, magPubs("_1").equalTo(a("PaperId"))).map(item => { magPubs
.joinWith(a, magPubs("_1").equalTo(a("PaperId")))
.map(item => {
val affiliation = item._2 val affiliation = item._2
if (affiliation.GridId.isEmpty) { if (affiliation.GridId.isEmpty) {
val o = new Organization val o = new Organization
@ -232,10 +296,13 @@ object SparkGenerateDoiBoost {
o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get)) o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get))
o.setCountry(ModelConstants.UNKNOWN_COUNTRY) o.setCountry(ModelConstants.UNKNOWN_COUNTRY)
o o
} } else
else
null null
}).filter(o => o != null).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostOrganization") })
.filter(o => o != null)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingDirPath/doiBoostOrganization")
} }
} }
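Phases 2-4 of SparkGenerateDoiBoost repeat the same pattern: key both Datasets by record id, left-join them, and merge the right side into the left when a match exists. A reduced sketch of that pattern (illustration only), with plain strings in place of Publication and a string concatenation in place of mergeFrom; all names here are hypothetical.

import org.apache.spark.sql.{Dataset, SparkSession}

object JoinMergeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("join-merge-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    val crossref: Dataset[(String, String)]  = Seq(("doi1", "crossref"), ("doi2", "crossref")).toDS()
    val unpaywall: Dataset[(String, String)] = Seq(("doi1", "unpaywall")).toDS()

    // left outer join keeps every crossref record and enriches it when a match exists
    val merged: Dataset[String] = crossref
      .joinWith(unpaywall, crossref("_1") === unpaywall("_1"), "left")
      .map { case (l, r) => if (r == null || r._2 == null) l._2 else s"${l._2} merged with ${r._2}" }

    merged.show(false)
    spark.stop()
  }
}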


@ -22,11 +22,16 @@ case class CrossrefDT(doi: String, json:String, timestamp: Long) {}
case class mappingAffiliation(name: String) {} case class mappingAffiliation(name: String) {}
case class mappingAuthor(given: Option[String], family: String, sequence:Option[String], ORCID: Option[String], affiliation: Option[mappingAffiliation]) {} case class mappingAuthor(
given: Option[String],
family: String,
sequence: Option[String],
ORCID: Option[String],
affiliation: Option[mappingAffiliation]
) {}
case class mappingFunder(name: String, DOI: Option[String], award: Option[List[String]]) {} case class mappingFunder(name: String, DOI: Option[String], award: Option[List[String]]) {}
case object Crossref2Oaf { case object Crossref2Oaf {
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass) val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
@ -56,7 +61,6 @@ case object Crossref2Oaf {
"dataset" -> "dataset" "dataset" -> "dataset"
) )
val mappingCrossrefSubType = Map( val mappingCrossrefSubType = Map(
"book-section" -> "0013 Part of book or chapter of book", "book-section" -> "0013 Part of book or chapter of book",
"book" -> "0002 Book", "book" -> "0002 Book",
@ -100,7 +104,6 @@ case object Crossref2Oaf {
val originalIds = new util.ArrayList(tmp.filter(id => id != null).asJava) val originalIds = new util.ArrayList(tmp.filter(id => id != null).asJava)
result.setOriginalId(originalIds) result.setOriginalId(originalIds)
// Add DataInfo // Add DataInfo
result.setDataInfo(generateDataInfo()) result.setDataInfo(generateDataInfo())
@ -114,55 +117,105 @@ case object Crossref2Oaf {
if (publisher != null && publisher.nonEmpty) if (publisher != null && publisher.nonEmpty)
result.setPublisher(asField(publisher)) result.setPublisher(asField(publisher))
// TITLE // TITLE
val mainTitles = for {JString(title) <- json \ "title" if title.nonEmpty} yield createSP(title, "main title", ModelConstants.DNET_DATACITE_TITLE) val mainTitles =
val originalTitles = for {JString(title) <- json \ "original-title" if title.nonEmpty} yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE) for { JString(title) <- json \ "title" if title.nonEmpty } yield createSP(
val shortTitles = for {JString(title) <- json \ "short-title" if title.nonEmpty} yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE) title,
val subtitles = for {JString(title) <- json \ "subtitle" if title.nonEmpty} yield createSP(title, "subtitle", ModelConstants.DNET_DATACITE_TITLE) "main title",
ModelConstants.DNET_DATACITE_TITLE
)
val originalTitles = for {
JString(title) <- json \ "original-title" if title.nonEmpty
} yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
val shortTitles = for {
JString(title) <- json \ "short-title" if title.nonEmpty
} yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
val subtitles =
for { JString(title) <- json \ "subtitle" if title.nonEmpty } yield createSP(
title,
"subtitle",
ModelConstants.DNET_DATACITE_TITLE
)
result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava) result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava)
// DESCRIPTION // DESCRIPTION
val descriptionList = for {JString(description) <- json \ "abstract"} yield asField(description) val descriptionList =
for { JString(description) <- json \ "abstract" } yield asField(description)
result.setDescription(descriptionList.asJava) result.setDescription(descriptionList.asJava)
// Source // Source
val sourceList = for {JString(source) <- json \ "source" if source!= null && source.nonEmpty} yield asField(source) val sourceList = for {
JString(source) <- json \ "source" if source != null && source.nonEmpty
} yield asField(source)
result.setSource(sourceList.asJava) result.setSource(sourceList.asJava)
//RELEVANT DATE Mapping //RELEVANT DATE Mapping
val createdDate = generateDate((json \ "created" \ "date-time").extract[String], (json \ "created" \ "date-parts").extract[List[List[Int]]], "created", ModelConstants.DNET_DATACITE_DATE) val createdDate = generateDate(
val postedDate = generateDate((json \ "posted" \ "date-time").extractOrElse[String](null), (json \ "posted" \ "date-parts").extract[List[List[Int]]], "available", ModelConstants.DNET_DATACITE_DATE) (json \ "created" \ "date-time").extract[String],
val acceptedDate = generateDate((json \ "accepted" \ "date-time").extractOrElse[String](null), (json \ "accepted" \ "date-parts").extract[List[List[Int]]], "accepted", ModelConstants.DNET_DATACITE_DATE) (json \ "created" \ "date-parts").extract[List[List[Int]]],
val publishedPrintDate = generateDate((json \ "published-print" \ "date-time").extractOrElse[String](null), (json \ "published-print" \ "date-parts").extract[List[List[Int]]], "published-print", ModelConstants.DNET_DATACITE_DATE) "created",
val publishedOnlineDate = generateDate((json \ "published-online" \ "date-time").extractOrElse[String](null), (json \ "published-online" \ "date-parts").extract[List[List[Int]]], "published-online", ModelConstants.DNET_DATACITE_DATE) ModelConstants.DNET_DATACITE_DATE
)
val postedDate = generateDate(
(json \ "posted" \ "date-time").extractOrElse[String](null),
(json \ "posted" \ "date-parts").extract[List[List[Int]]],
"available",
ModelConstants.DNET_DATACITE_DATE
)
val acceptedDate = generateDate(
(json \ "accepted" \ "date-time").extractOrElse[String](null),
(json \ "accepted" \ "date-parts").extract[List[List[Int]]],
"accepted",
ModelConstants.DNET_DATACITE_DATE
)
val publishedPrintDate = generateDate(
(json \ "published-print" \ "date-time").extractOrElse[String](null),
(json \ "published-print" \ "date-parts").extract[List[List[Int]]],
"published-print",
ModelConstants.DNET_DATACITE_DATE
)
val publishedOnlineDate = generateDate(
(json \ "published-online" \ "date-time").extractOrElse[String](null),
(json \ "published-online" \ "date-parts").extract[List[List[Int]]],
"published-online",
ModelConstants.DNET_DATACITE_DATE
)
val issuedDate = extractDate((json \ "issued" \ "date-time").extractOrElse[String](null), (json \ "issued" \ "date-parts").extract[List[List[Int]]]) val issuedDate = extractDate(
(json \ "issued" \ "date-time").extractOrElse[String](null),
(json \ "issued" \ "date-parts").extract[List[List[Int]]]
)
if (StringUtils.isNotBlank(issuedDate)) { if (StringUtils.isNotBlank(issuedDate)) {
result.setDateofacceptance(asField(issuedDate)) result.setDateofacceptance(asField(issuedDate))
} } else {
else {
result.setDateofacceptance(asField(createdDate.getValue)) result.setDateofacceptance(asField(createdDate.getValue))
} }
result.setRelevantdate(List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate).filter(p => p != null).asJava) result.setRelevantdate(
List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate)
.filter(p => p != null)
.asJava
)
//Mapping Subject //Mapping Subject
val subjectList: List[String] = (json \ "subject").extractOrElse[List[String]](List()) val subjectList: List[String] = (json \ "subject").extractOrElse[List[String]](List())
if (subjectList.nonEmpty) { if (subjectList.nonEmpty) {
result.setSubject(subjectList.map(s=> createSP(s, "keywords", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava) result.setSubject(
subjectList.map(s => createSP(s, "keywords", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava
)
} }
//Mapping Author //Mapping Author
val authorList: List[mappingAuthor] = (json \ "author").extractOrElse[List[mappingAuthor]](List()) val authorList: List[mappingAuthor] =
(json \ "author").extractOrElse[List[mappingAuthor]](List())
val sorted_list = authorList.sortWith((a: mappingAuthor, b: mappingAuthor) =>
a.sequence.isDefined && a.sequence.get.equalsIgnoreCase("first")
)
result.setAuthor(sorted_list.zipWithIndex.map { case (a, index) =>
val sorted_list = authorList.sortWith((a:mappingAuthor, b:mappingAuthor) => a.sequence.isDefined && a.sequence.get.equalsIgnoreCase("first")) generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull, index)
}.asJava)
result.setAuthor(sorted_list.zipWithIndex.map{case (a, index) => generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull, index)}.asJava)
// Mapping instance // Mapping instance
val instance = new Instance() val instance = new Instance()
@ -179,9 +232,9 @@ case object Crossref2Oaf {
instance.setLicense(d._1) instance.setLicense(d._1)
} }
} }
} else {
instance.setLicense(l.head._1)
} }
else{
instance.setLicense(l.head._1)}
} }
// Ticket #6281 added pid to Instance // Ticket #6281 added pid to Instance
@ -191,18 +244,39 @@ case object Crossref2Oaf {
if (has_review != JNothing) { if (has_review != JNothing) {
instance.setRefereed( instance.setRefereed(
OafMapperUtils.qualifier("0001", "peerReviewed", ModelConstants.DNET_REVIEW_LEVELS, ModelConstants.DNET_REVIEW_LEVELS)) OafMapperUtils.qualifier(
"0001",
"peerReviewed",
ModelConstants.DNET_REVIEW_LEVELS,
ModelConstants.DNET_REVIEW_LEVELS
)
)
} }
instance.setAccessright(decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue)) instance.setAccessright(
instance.setInstancetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue)
result.setResourcetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) )
instance.setInstancetype(
OafMapperUtils.qualifier(
cobjCategory.substring(0, 4),
cobjCategory.substring(5),
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
result.setResourcetype(
OafMapperUtils.qualifier(
cobjCategory.substring(0, 4),
cobjCategory.substring(5),
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
instance.setCollectedfrom(createCrossrefCollectedFrom()) instance.setCollectedfrom(createCrossrefCollectedFrom())
if (StringUtils.isNotBlank(issuedDate)) { if (StringUtils.isNotBlank(issuedDate)) {
instance.setDateofacceptance(asField(issuedDate)) instance.setDateofacceptance(asField(issuedDate))
} } else {
else {
instance.setDateofacceptance(asField(createdDate.getValue)) instance.setDateofacceptance(asField(createdDate.getValue))
} }
val s: List[String] = List("https://doi.org/" + doi) val s: List[String] = List("https://doi.org/" + doi)
@ -210,8 +284,7 @@ case object Crossref2Oaf {
// if (links.nonEmpty) { // if (links.nonEmpty) {
// instance.setUrl(links.asJava) // instance.setUrl(links.asJava)
// } // }
if(s.nonEmpty) if (s.nonEmpty) {
{
instance.setUrl(s.asJava) instance.setUrl(s.asJava)
} }
@ -236,7 +309,6 @@ case object Crossref2Oaf {
result result
} }
def generateAuhtor(given: String, family: String, orcid: String, index: Int): Author = { def generateAuhtor(given: String, family: String, orcid: String, index: Int): Author = {
val a = new Author val a = new Author
a.setName(given) a.setName(given)
@ -244,7 +316,16 @@ case object Crossref2Oaf {
a.setFullname(s"$given $family") a.setFullname(s"$given $family")
a.setRank(index + 1) a.setRank(index + 1)
if (StringUtils.isNotBlank(orcid)) if (StringUtils.isNotBlank(orcid))
a.setPid(List(createSP(orcid, ModelConstants.ORCID_PENDING, ModelConstants.DNET_PID_TYPES, generateDataInfo())).asJava) a.setPid(
List(
createSP(
orcid,
ModelConstants.ORCID_PENDING,
ModelConstants.DNET_PID_TYPES,
generateDataInfo()
)
).asJava
)
a a
} }
@ -255,29 +336,35 @@ case object Crossref2Oaf {
var resultList: List[Oaf] = List() var resultList: List[Oaf] = List()
val objectType = (json \ "type").extractOrElse[String](null) val objectType = (json \ "type").extractOrElse[String](null)
val objectSubType = (json \ "subtype").extractOrElse[String](null) val objectSubType = (json \ "subtype").extractOrElse[String](null)
if (objectType == null) if (objectType == null)
return resultList return resultList
val result = generateItemFromType(objectType, objectSubType) val result = generateItemFromType(objectType, objectSubType)
if (result == null) if (result == null)
return List() return List()
val cOBJCategory = mappingCrossrefSubType.getOrElse(objectType, mappingCrossrefSubType.getOrElse(objectSubType, "0038 Other literature type")) val cOBJCategory = mappingCrossrefSubType.getOrElse(
objectType,
mappingCrossrefSubType.getOrElse(objectSubType, "0038 Other literature type")
)
mappingResult(result, json, cOBJCategory) mappingResult(result, json, cOBJCategory)
if (result == null || result.getId == null) if (result == null || result.getId == null)
return List() return List()
val funderList: List[mappingFunder] =
val funderList: List[mappingFunder] = (json \ "funder").extractOrElse[List[mappingFunder]](List()) (json \ "funder").extractOrElse[List[mappingFunder]](List())
if (funderList.nonEmpty) { if (funderList.nonEmpty) {
resultList = resultList ::: mappingFunderToRelations(funderList, result.getId, createCrossrefCollectedFrom(), result.getDataInfo, result.getLastupdatetimestamp) resultList = resultList ::: mappingFunderToRelations(
funderList,
result.getId,
createCrossrefCollectedFrom(),
result.getDataInfo,
result.getLastupdatetimestamp
)
} }
result match { result match {
case publication: Publication => convertPublication(publication, json, cOBJCategory) case publication: Publication => convertPublication(publication, json, cOBJCategory)
case dataset: Dataset => convertDataset(dataset) case dataset: Dataset => convertDataset(dataset)
@ -287,22 +374,24 @@ case object Crossref2Oaf {
resultList resultList
} }
def mappingFunderToRelations(
def mappingFunderToRelations(funders: List[mappingFunder], sourceId: String, cf: KeyValue, di: DataInfo, ts: Long): List[Relation] = { funders: List[mappingFunder],
sourceId: String,
cf: KeyValue,
di: DataInfo,
ts: Long
): List[Relation] = {
val queue = new mutable.Queue[Relation] val queue = new mutable.Queue[Relation]
def snsfRule(award: String): String = { def snsfRule(award: String): String = {
val tmp1 = StringUtils.substringAfter(award, "_") val tmp1 = StringUtils.substringAfter(award, "_")
val tmp2 = StringUtils.substringBefore(tmp1, "/") val tmp2 = StringUtils.substringBefore(tmp1, "/")
logger.debug(s"From $award to $tmp2") logger.debug(s"From $award to $tmp2")
tmp2 tmp2
} }
def extractECAward(award: String): String = { def extractECAward(award: String): String = {
val awardECRegex: Regex = "[0-9]{4,9}".r val awardECRegex: Regex = "[0-9]{4,9}".r
if (awardECRegex.findAllIn(award).hasNext) if (awardECRegex.findAllIn(award).hasNext)
@ -310,7 +399,6 @@ case object Crossref2Oaf {
null null
} }
def generateRelation(sourceId: String, targetId: String, relClass: String): Relation = { def generateRelation(sourceId: String, targetId: String, relClass: String): Relation = {
val r = new Relation val r = new Relation
@ -324,89 +412,111 @@ case object Crossref2Oaf {
r.setLastupdatetimestamp(ts) r.setLastupdatetimestamp(ts)
r r
} }
def generateSimpleRelationFromAward(
def generateSimpleRelationFromAward(funder: mappingFunder, nsPrefix: String, extractField: String => String): Unit = { funder: mappingFunder,
nsPrefix: String,
extractField: String => String
): Unit = {
if (funder.award.isDefined && funder.award.get.nonEmpty) if (funder.award.isDefined && funder.award.get.nonEmpty)
funder.award.get.map(extractField).filter(a => a!= null && a.nonEmpty).foreach( funder.award.get
award => { .map(extractField)
.filter(a => a != null && a.nonEmpty)
.foreach(award => {
val targetId = getProjectId(nsPrefix, DHPUtils.md5(award)) val targetId = getProjectId(nsPrefix, DHPUtils.md5(award))
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
} })
)
} }
def getProjectId(nsPrefix: String, targetId: String): String = { def getProjectId(nsPrefix: String, targetId: String): String = {
s"40|$nsPrefix::$targetId" s"40|$nsPrefix::$targetId"
} }
if (funders != null) if (funders != null)
funders.foreach(funder => { funders.foreach(funder => {
if (funder.DOI.isDefined && funder.DOI.get.nonEmpty) { if (funder.DOI.isDefined && funder.DOI.get.nonEmpty) {
funder.DOI.get match { funder.DOI.get match {
case "10.13039/100010663" | case "10.13039/100010663" | "10.13039/100010661" | "10.13039/501100007601" | "10.13039/501100000780" |
"10.13039/100010661" | "10.13039/100010665" =>
"10.13039/501100007601" |
"10.13039/501100000780" |
"10.13039/100010665" => generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
case "10.13039/100011199" |
"10.13039/100004431" |
"10.13039/501100004963" |
"10.13039/501100000780" => generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
case "10.13039/501100000781" => generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward) generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
case "10.13039/100000001" => generateSimpleRelationFromAward(funder, "nsf_________", a => a) case "10.13039/100011199" | "10.13039/100004431" | "10.13039/501100004963" | "10.13039/501100000780" =>
case "10.13039/501100001665" => generateSimpleRelationFromAward(funder, "anr_________", a => a) generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
case "10.13039/501100002341" => generateSimpleRelationFromAward(funder, "aka_________", a => a) case "10.13039/501100000781" =>
case "10.13039/501100001602" => generateSimpleRelationFromAward(funder, "aka_________", a => a.replace("SFI", "")) generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
case "10.13039/501100000923" => generateSimpleRelationFromAward(funder, "arc_________", a => a) generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
case "10.13039/501100000038"=> val targetId = getProjectId("nserc_______" , "1e5e62235d094afd01cd56e65112fc63") case "10.13039/100000001" =>
generateSimpleRelationFromAward(funder, "nsf_________", a => a)
case "10.13039/501100001665" =>
generateSimpleRelationFromAward(funder, "anr_________", a => a)
case "10.13039/501100002341" =>
generateSimpleRelationFromAward(funder, "aka_________", a => a)
case "10.13039/501100001602" =>
generateSimpleRelationFromAward(funder, "aka_________", a => a.replace("SFI", ""))
case "10.13039/501100000923" =>
generateSimpleRelationFromAward(funder, "arc_________", a => a)
case "10.13039/501100000038" =>
val targetId = getProjectId("nserc_______", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
case "10.13039/501100000155"=> val targetId = getProjectId("sshrc_______" , "1e5e62235d094afd01cd56e65112fc63") case "10.13039/501100000155" =>
val targetId = getProjectId("sshrc_______", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
case "10.13039/501100000024"=> val targetId = getProjectId("cihr________" , "1e5e62235d094afd01cd56e65112fc63") case "10.13039/501100000024" =>
val targetId = getProjectId("cihr________", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
case "10.13039/501100002848" => generateSimpleRelationFromAward(funder, "conicytf____", a => a) case "10.13039/501100002848" =>
case "10.13039/501100003448" => generateSimpleRelationFromAward(funder, "gsrt________", extractECAward) generateSimpleRelationFromAward(funder, "conicytf____", a => a)
case "10.13039/501100010198" => generateSimpleRelationFromAward(funder, "sgov________", a=>a) case "10.13039/501100003448" =>
case "10.13039/501100004564" => generateSimpleRelationFromAward(funder, "mestd_______", extractECAward) generateSimpleRelationFromAward(funder, "gsrt________", extractECAward)
case "10.13039/501100003407" => generateSimpleRelationFromAward(funder, "miur________", a=>a) case "10.13039/501100010198" =>
generateSimpleRelationFromAward(funder, "sgov________", a => a)
case "10.13039/501100004564" =>
generateSimpleRelationFromAward(funder, "mestd_______", extractECAward)
case "10.13039/501100003407" =>
generateSimpleRelationFromAward(funder, "miur________", a => a)
val targetId = getProjectId("miur________", "1e5e62235d094afd01cd56e65112fc63") val targetId = getProjectId("miur________", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
case "10.13039/501100006588" | case "10.13039/501100006588" | "10.13039/501100004488" =>
"10.13039/501100004488" => generateSimpleRelationFromAward(funder, "irb_hr______", a=>a.replaceAll("Project No.", "").replaceAll("HRZZ-","") ) generateSimpleRelationFromAward(
case "10.13039/501100006769"=> generateSimpleRelationFromAward(funder, "rsf_________", a=>a) funder,
case "10.13039/501100001711"=> generateSimpleRelationFromAward(funder, "snsf________", snsfRule) "irb_hr______",
case "10.13039/501100004410"=> generateSimpleRelationFromAward(funder, "tubitakf____", a =>a) a => a.replaceAll("Project No.", "").replaceAll("HRZZ-", "")
case "10.10.13039/100004440"=> generateSimpleRelationFromAward(funder, "wt__________", a =>a) )
case "10.13039/100004440"=> val targetId = getProjectId("wt__________" , "1e5e62235d094afd01cd56e65112fc63") case "10.13039/501100006769" =>
generateSimpleRelationFromAward(funder, "rsf_________", a => a)
case "10.13039/501100001711" =>
generateSimpleRelationFromAward(funder, "snsf________", snsfRule)
case "10.13039/501100004410" =>
generateSimpleRelationFromAward(funder, "tubitakf____", a => a)
case "10.10.13039/100004440" =>
generateSimpleRelationFromAward(funder, "wt__________", a => a)
case "10.13039/100004440" =>
val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
case _ => logger.debug("no match for " + funder.DOI.get) case _ => logger.debug("no match for " + funder.DOI.get)
} }
} else { } else {
funder.name match { funder.name match {
case "European Unions Horizon 2020 research and innovation program" => generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward) case "European Unions Horizon 2020 research and innovation program" =>
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
case "European Union's" => case "European Union's" =>
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward) generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
generateSimpleRelationFromAward(funder, "corda_______", extractECAward) generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
case "The French National Research Agency (ANR)" | case "The French National Research Agency (ANR)" | "The French National Research Agency" =>
"The French National Research Agency" => generateSimpleRelationFromAward(funder, "anr_________", a => a) generateSimpleRelationFromAward(funder, "anr_________", a => a)
case "CONICYT, Programa de Formación de Capital Humano Avanzado" => generateSimpleRelationFromAward(funder, "conicytf____", extractECAward) case "CONICYT, Programa de Formación de Capital Humano Avanzado" =>
case "Wellcome Trust Masters Fellowship" => val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63") generateSimpleRelationFromAward(funder, "conicytf____", extractECAward)
case "Wellcome Trust Masters Fellowship" =>
val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
case _ => logger.debug("no match for " + funder.name) case _ => logger.debug("no match for " + funder.name)
@ -414,8 +524,7 @@ case object Crossref2Oaf {
} }
} }
} })
)
queue.toList queue.toList
} }
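generateSimpleRelationFromAward above turns every award string of a funder into a pair of project relations. A stripped-down sketch of that pattern (illustration only): a plain case class stands in for the OAF Relation, the grant-code regex follows extractECAward in the diff, and the namespace prefix and ids in main are hypothetical.

import scala.util.matching.Regex

object FunderRelationSketch {
  // Plain stand-in for the OAF Relation class (assumption for illustration).
  final case class Rel(source: String, target: String, relClass: String)

  def md5(s: String): String =
    java.security.MessageDigest
      .getInstance("MD5")
      .digest(s.getBytes("UTF-8"))
      .map("%02x".format(_))
      .mkString

  // Same 4-9 digit rule as extractECAward in the diff.
  def extractECAward(award: String): String = {
    val awardECRegex: Regex = "[0-9]{4,9}".r
    awardECRegex.findFirstIn(award).orNull
  }

  def getProjectId(nsPrefix: String, targetId: String): String = s"40|$nsPrefix::$targetId"

  // For each award: extract the grant code, hash it into a project id, emit both relation directions.
  def relationsFromAwards(sourceId: String, nsPrefix: String, awards: List[String]): List[Rel] =
    awards
      .map(extractECAward)
      .filter(a => a != null && a.nonEmpty)
      .flatMap { award =>
        val targetId = getProjectId(nsPrefix, md5(award))
        List(Rel(sourceId, targetId, "isProducedBy"), Rel(targetId, sourceId, "produces"))
      }

  def main(args: Array[String]): Unit =
    relationsFromAwards("50|doi_________::abc", "corda__h2020", List("H2020-654321")).foreach(println)
}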
@ -423,12 +532,10 @@ case object Crossref2Oaf {
// TODO check if there are other info to map into the Dataset // TODO check if there are other info to map into the Dataset
} }
def convertPublication(publication: Publication, json: JValue, cobjCategory: String): Unit = { def convertPublication(publication: Publication, json: JValue, cobjCategory: String): Unit = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
val containerTitles = for { JString(ct) <- json \ "container-title" } yield ct val containerTitles = for { JString(ct) <- json \ "container-title" } yield ct
//Mapping book //Mapping book
if (cobjCategory.toLowerCase.contains("book")) { if (cobjCategory.toLowerCase.contains("book")) {
val ISBN = for { JString(isbn) <- json \ "ISBN" } yield isbn val ISBN = for { JString(isbn) <- json \ "ISBN" } yield isbn
@ -438,14 +545,14 @@ case object Crossref2Oaf {
val l: List[Field[String]] = publication.getSource.asScala.toList val l: List[Field[String]] = publication.getSource.asScala.toList
val ll: List[Field[String]] = l ::: List(asField(source)) val ll: List[Field[String]] = l ::: List(asField(source))
publication.setSource(ll.asJava) publication.setSource(ll.asJava)
} } else
else
publication.setSource(List(asField(source)).asJava) publication.setSource(List(asField(source)).asJava)
} }
} else { } else {
// Mapping Journal // Mapping Journal
val issnInfos = for {JArray(issn_types) <- json \ "issn-type" val issnInfos = for {
JArray(issn_types) <- json \ "issn-type"
JObject(issn_type) <- issn_types JObject(issn_type) <- issn_types
JField("type", JString(tp)) <- issn_type JField("type", JString(tp)) <- issn_type
JField("value", JString(vl)) <- issn_type JField("value", JString(vl)) <- issn_type
@ -494,7 +601,12 @@ case object Crossref2Oaf {
} }
def generateDate(dt: String, datePart: List[List[Int]], classId: String, schemeId: String): StructuredProperty = { def generateDate(
dt: String,
datePart: List[List[Int]],
classId: String,
schemeId: String
): StructuredProperty = {
val dp = extractDate(dt, datePart) val dp = extractDate(dt, datePart)
if (StringUtils.isNotBlank(dp)) if (StringUtils.isNotBlank(dp))
return createSP(dp, classId, schemeId) return createSP(dp, classId, schemeId)
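The title, date and subject mappings in Crossref2Oaf all rely on json4s for-comprehensions over the Crossref record plus a date-parts fallback. A minimal sketch of those two extraction idioms against a tiny hand-written JSON document (illustration only): the document and the field selection are assumptions, and the results are printed instead of being turned into StructuredProperty objects.

import org.json4s._
import org.json4s.jackson.JsonMethods.parse

object CrossrefExtractSketch {
  implicit lazy val formats: DefaultFormats.type = DefaultFormats

  def main(args: Array[String]): Unit = {
    // Hand-written record for illustration; a real Crossref item has many more fields.
    val json = parse(
      """{"title":["A main title"],"subtitle":["A subtitle"],"issued":{"date-parts":[[2021,3,9]]}}"""
    )

    // Same for-comprehension idiom used for title/original-title/short-title/subtitle above
    val mainTitles = for { JString(t) <- json \ "title" if t.nonEmpty } yield t
    val subtitles  = for { JString(t) <- json \ "subtitle" if t.nonEmpty } yield t

    // "date-parts" is a list of [year, month, day] lists; pad missing parts to a full ISO date
    val issued = (json \ "issued" \ "date-parts").extract[List[List[Int]]].headOption.map {
      case y :: m :: d :: Nil => f"$y%04d-$m%02d-$d%02d"
      case y :: m :: Nil      => f"$y%04d-$m%02d-01"
      case y :: Nil           => f"$y%04d-01-01"
      case _                  => ""
    }

    println(s"main titles: $mainTitles, subtitles: $subtitles, issued: $issued")
  }
}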


@ -16,7 +16,6 @@ object CrossrefDataset {
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass) val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
def to_item(input: String): CrossrefDT = { def to_item(input: String): CrossrefDT = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -29,19 +28,24 @@ object CrossrefDataset {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
CrossrefDataset.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json"
)
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(SparkMapDumpIntoOAF.getClass.getSimpleName) .appName(SparkMapDumpIntoOAF.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
import spark.implicits._ import spark.implicits._
val crossrefAggregator = new Aggregator[CrossrefDT, CrossrefDT, CrossrefDT] with Serializable { val crossrefAggregator = new Aggregator[CrossrefDT, CrossrefDT, CrossrefDT] with Serializable {
override def zero: CrossrefDT = null override def zero: CrossrefDT = null
@ -52,7 +56,6 @@ object CrossrefDataset {
if (a == null) if (a == null)
return b return b
if (a.timestamp > b.timestamp) { if (a.timestamp > b.timestamp) {
return a return a
} }
@ -80,19 +83,24 @@ object CrossrefDataset {
val workingPath: String = parser.get("workingPath") val workingPath: String = parser.get("workingPath")
val main_ds: Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT] val main_ds: Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT]
val update = val update =
spark.createDataset(spark.sparkContext.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text]) spark.createDataset(
spark.sparkContext
.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text])
.map(i => CrossrefImporter.decompressBlob(i._2.toString)) .map(i => CrossrefImporter.decompressBlob(i._2.toString))
.map(i => to_item(i))) .map(i => to_item(i))
)
main_ds.union(update).groupByKey(_.doi) main_ds
.union(update)
.groupByKey(_.doi)
.agg(crossrefAggregator.toColumn) .agg(crossrefAggregator.toColumn)
.map(s => s._2) .map(s => s._2)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/crossref_ds_updated") .write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/crossref_ds_updated")
} }
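The update step above hinges on a typed Aggregator that keeps, per DOI, the record with the highest timestamp. A compact, self-contained sketch of that pattern, written against an illustrative Versioned case class instead of the real CrossrefDT:

import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders}

// Stand-in for CrossrefDT(doi, json, timestamp)
case class Versioned(doi: String, timestamp: Long, json: String)

// Keeps the newest record in each group, tolerating nulls as the code above does
object LatestByTimestamp extends Aggregator[Versioned, Versioned, Versioned] with Serializable {
  override def zero: Versioned = null
  override def reduce(b: Versioned, a: Versioned): Versioned = newest(b, a)
  override def merge(b1: Versioned, b2: Versioned): Versioned = newest(b1, b2)
  override def finish(reduction: Versioned): Versioned = reduction
  override def bufferEncoder: Encoder[Versioned] = Encoders.kryo[Versioned]
  override def outputEncoder: Encoder[Versioned] = Encoders.kryo[Versioned]

  private def newest(a: Versioned, b: Versioned): Versioned =
    if (a == null) b
    else if (b == null) a
    else if (a.timestamp >= b.timestamp) a
    else b
}

It plugs in exactly where the anonymous aggregator sits above: groupByKey(_.doi).agg(LatestByTimestamp.toColumn), then map(_._2) to drop the key.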

View File

@ -18,7 +18,6 @@ object GenerateCrossrefDataset {
implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT] implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT]
def crossrefElement(meta: String): CrossrefDT = { def crossrefElement(meta: String): CrossrefDT = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(meta) lazy val json: json4s.JValue = parse(meta)
@ -30,13 +29,23 @@ object GenerateCrossrefDataset {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val conf = new SparkConf val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json")).mkString) val parser = new ArgumentApplicationParser(
Source
.fromInputStream(
getClass.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json"
)
)
.mkString
)
parser.parseArgument(args) parser.parseArgument(args)
val master = parser.get("master") val master = parser.get("master")
val sourcePath = parser.get("sourcePath") val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath") val targetPath = parser.get("targetPath")
val spark: SparkSession = SparkSession.builder().config(conf) val spark: SparkSession = SparkSession
.builder()
.config(conf)
.appName(UnpackCrtossrefEntries.getClass.getSimpleName) .appName(UnpackCrtossrefEntries.getClass.getSimpleName)
.master(master) .master(master)
.getOrCreate() .getOrCreate()
@ -44,12 +53,14 @@ object GenerateCrossrefDataset {
import spark.implicits._ import spark.implicits._
val tmp: RDD[String] = sc.textFile(sourcePath, 6000) val tmp: RDD[String] = sc.textFile(sourcePath, 6000)
spark.createDataset(tmp) spark
.createDataset(tmp)
.map(entry => crossrefElement(entry)) .map(entry => crossrefElement(entry))
.write.mode(SaveMode.Overwrite).save(targetPath) .write
.mode(SaveMode.Overwrite)
.save(targetPath)
// .map(meta => crossrefElement(meta)) // .map(meta => crossrefElement(meta))
// .toDS.as[CrossrefDT] // .toDS.as[CrossrefDT]
// .write.mode(SaveMode.Overwrite).save(targetPath) // .write.mode(SaveMode.Overwrite).save(targetPath)
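crossrefElement boils each dump record down to a CrossrefDT of DOI, timestamp and raw JSON. Below is a standalone guess at that step; the DOI and indexed/timestamp paths follow the public Crossref record layout and are assumptions here, not taken from the code above:

import org.json4s._
import org.json4s.jackson.JsonMethods.parse

object CrossrefElementSketch {
  // Returns (doi, timestamp, original json), a stand-in for the real CrossrefDT
  def toItem(meta: String): (String, Long, String) = {
    implicit val formats: DefaultFormats.type = DefaultFormats
    val json = parse(meta)
    val doi = (json \ "DOI").extract[String].toLowerCase
    val timestamp = (json \ "indexed" \ "timestamp").extract[Long]
    (doi, timestamp, meta)
  }

  def main(args: Array[String]): Unit = {
    val sample = """{"DOI":"10.1000/Demo.1","indexed":{"timestamp":1591234567890}}""" // illustrative only
    println(toItem(sample))
  }
}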

View File

@ -8,7 +8,6 @@ import org.apache.spark.SparkConf
import org.apache.spark.sql._ import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
case class Reference(author: String, firstPage: String) {} case class Reference(author: String, firstPage: String) {}
object SparkMapDumpIntoOAF { object SparkMapDumpIntoOAF {
@ -19,14 +18,21 @@ object SparkMapDumpIntoOAF {
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass) val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkMapDumpIntoOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_crossref_dump_to_oaf_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
SparkMapDumpIntoOAF.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/convert_crossref_dump_to_oaf_params.json"
)
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(SparkMapDumpIntoOAF.getClass.getSimpleName) .appName(SparkMapDumpIntoOAF.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication] implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
@ -35,19 +41,34 @@ object SparkMapDumpIntoOAF {
val targetPath = parser.get("targetPath") val targetPath = parser.get("targetPath")
spark.read.load(parser.get("sourcePath")).as[CrossrefDT] spark.read
.load(parser.get("sourcePath"))
.as[CrossrefDT]
.flatMap(k => Crossref2Oaf.convert(k.json)) .flatMap(k => Crossref2Oaf.convert(k.json))
.filter(o => o != null) .filter(o => o != null)
.write.mode(SaveMode.Overwrite).save(s"$targetPath/mixObject") .write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/mixObject")
val ds: Dataset[Oaf] = spark.read.load(s"$targetPath/mixObject").as[Oaf] val ds: Dataset[Oaf] = spark.read.load(s"$targetPath/mixObject").as[Oaf]
ds.filter(o => o.isInstanceOf[Publication]).map(o => o.asInstanceOf[Publication]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefPublication") ds.filter(o => o.isInstanceOf[Publication])
.map(o => o.asInstanceOf[Publication])
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/crossrefPublication")
ds.filter(o => o.isInstanceOf[Relation]).map(o => o.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefRelation") ds.filter(o => o.isInstanceOf[Relation])
.map(o => o.asInstanceOf[Relation])
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/crossrefRelation")
ds.filter(o => o.isInstanceOf[OafDataset]).map(o => o.asInstanceOf[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefDataset") ds.filter(o => o.isInstanceOf[OafDataset])
.map(o => o.asInstanceOf[OafDataset])
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/crossrefDataset")
} }
} }
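The three writes above share one shape: filter the mixed Dataset[Oaf] down to a concrete subtype and persist it. A generic sketch of that step; saveSubtype is a hypothetical helper, assuming kryo-serialisable entity classes as in the job above:

import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode}
import scala.reflect.ClassTag

object SaveSubtypeSketch {
  // Writes only the rows of ds that are instances of S, e.g.
  // saveSubtype[Oaf, Publication](ds, s"$targetPath/crossrefPublication")
  def saveSubtype[T, S <: T](ds: Dataset[T], path: String)(implicit ct: ClassTag[S]): Unit = {
    implicit val enc: Encoder[S] = Encoders.kryo[S]
    ds.filter(o => ct.runtimeClass.isInstance(o))
      .map(o => o.asInstanceOf[S])
      .write
      .mode(SaveMode.Overwrite)
      .save(path)
  }
}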

View File

@ -16,7 +16,6 @@ object UnpackCrtossrefEntries {
val log: Logger = LoggerFactory.getLogger(UnpackCrtossrefEntries.getClass) val log: Logger = LoggerFactory.getLogger(UnpackCrtossrefEntries.getClass)
def extractDump(input: String): List[String] = { def extractDump(input: String): List[String] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input) lazy val json: json4s.JValue = parse(input)
@ -24,28 +23,36 @@ object UnpackCrtossrefEntries {
val a = (json \ "items").extract[JArray] val a = (json \ "items").extract[JArray]
a.arr.map(s => compact(render(s))) a.arr.map(s => compact(render(s)))
} }
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val conf = new SparkConf val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json")).mkString) val parser = new ArgumentApplicationParser(
Source
.fromInputStream(
getClass.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json"
)
)
.mkString
)
parser.parseArgument(args) parser.parseArgument(args)
val master = parser.get("master") val master = parser.get("master")
val sourcePath = parser.get("sourcePath") val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath") val targetPath = parser.get("targetPath")
val spark: SparkSession = SparkSession.builder().config(conf) val spark: SparkSession = SparkSession
.builder()
.config(conf)
.appName(UnpackCrtossrefEntries.getClass.getSimpleName) .appName(UnpackCrtossrefEntries.getClass.getSimpleName)
.master(master) .master(master)
.getOrCreate() .getOrCreate()
val sc: SparkContext = spark.sparkContext val sc: SparkContext = spark.sparkContext
sc.wholeTextFiles(sourcePath, 6000).flatMap(d => extractDump(d._2)) sc.wholeTextFiles(sourcePath, 6000)
.flatMap(d => extractDump(d._2))
.saveAsTextFile(targetPath, classOf[GzipCodec]) .saveAsTextFile(targetPath, classOf[GzipCodec])
} }
} }
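For reference, extractDump above is a pure json4s step: each dump record wraps the individual works in an "items" array, and each element is re-serialised as its own line. The same logic runs standalone like this (the sample input is illustrative only):

import org.json4s._
import org.json4s.jackson.JsonMethods.{compact, parse, render}

object ExtractDumpSketch {
  def extractDump(input: String): List[String] = {
    implicit val formats: DefaultFormats.type = DefaultFormats
    val json = parse(input)
    val items = (json \ "items").extract[JArray]
    items.arr.map(s => compact(render(s)))
  }

  def main(args: Array[String]): Unit = {
    val sample = """{"items":[{"DOI":"10.1000/demo.1"},{"DOI":"10.1000/demo.2"}]}"""
    extractDump(sample).foreach(println)
  }
}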

View File

@ -1,6 +1,5 @@
package eu.dnetlib.doiboost.mag package eu.dnetlib.doiboost.mag
import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty} import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty}
@ -14,45 +13,121 @@ import scala.collection.JavaConverters._
import scala.collection.mutable import scala.collection.mutable
import scala.util.matching.Regex import scala.util.matching.Regex
case class MagPapers(
case class MagPapers(PaperId: Long, Rank: Integer, Doi: String, PaperId: Long,
DocType: String, PaperTitle: String, OriginalTitle: String, Rank: Integer,
BookTitle: String, Year: Option[Integer], Date: Option[java.sql.Timestamp], Publisher: String, Doi: String,
JournalId: Option[Long], ConferenceSeriesId: Option[Long], ConferenceInstanceId: Option[Long], DocType: String,
Volume: String, Issue: String, FirstPage: String, LastPage: String, PaperTitle: String,
ReferenceCount: Option[Long], CitationCount: Option[Long], EstimatedCitation: Option[Long], OriginalTitle: String,
OriginalVenue: String, FamilyId: Option[Long], CreatedDate: java.sql.Timestamp) {} BookTitle: String,
Year: Option[Integer],
Date: Option[java.sql.Timestamp],
Publisher: String,
JournalId: Option[Long],
ConferenceSeriesId: Option[Long],
ConferenceInstanceId: Option[Long],
Volume: String,
Issue: String,
FirstPage: String,
LastPage: String,
ReferenceCount: Option[Long],
CitationCount: Option[Long],
EstimatedCitation: Option[Long],
OriginalVenue: String,
FamilyId: Option[Long],
CreatedDate: java.sql.Timestamp
) {}
case class MagPaperAbstract(PaperId: Long, IndexedAbstract: String) {} case class MagPaperAbstract(PaperId: Long, IndexedAbstract: String) {}
case class MagAuthor(AuthorId: Long, Rank: Option[Int], NormalizedName: Option[String], DisplayName: Option[String], LastKnownAffiliationId: Option[Long], PaperCount: Option[Long], CitationCount: Option[Long], CreatedDate: Option[java.sql.Timestamp]) {} case class MagAuthor(
AuthorId: Long,
Rank: Option[Int],
NormalizedName: Option[String],
DisplayName: Option[String],
LastKnownAffiliationId: Option[Long],
PaperCount: Option[Long],
CitationCount: Option[Long],
CreatedDate: Option[java.sql.Timestamp]
) {}
case class MagAffiliation(AffiliationId: Long, Rank: Int, NormalizedName: String, DisplayName: String, GridId: String, OfficialPage: String, WikiPage: String, PaperCount: Long, CitationCount: Long, Latitude: Option[Float], Longitude: Option[Float], CreatedDate: java.sql.Timestamp) {} case class MagAffiliation(
AffiliationId: Long,
case class MagPaperAuthorAffiliation(PaperId: Long, AuthorId: Long, AffiliationId: Option[Long], AuthorSequenceNumber: Int, OriginalAuthor: String, OriginalAffiliation: String) {} Rank: Int,
NormalizedName: String,
DisplayName: String,
GridId: String,
OfficialPage: String,
WikiPage: String,
PaperCount: Long,
CitationCount: Long,
Latitude: Option[Float],
Longitude: Option[Float],
CreatedDate: java.sql.Timestamp
) {}
case class MagPaperAuthorAffiliation(
PaperId: Long,
AuthorId: Long,
AffiliationId: Option[Long],
AuthorSequenceNumber: Int,
OriginalAuthor: String,
OriginalAffiliation: String
) {}
case class MagAuthorAffiliation(author: MagAuthor, affiliation: String, sequenceNumber: Int) case class MagAuthorAffiliation(author: MagAuthor, affiliation: String, sequenceNumber: Int)
case class MagPaperWithAuthorList(PaperId: Long, authors: List[MagAuthorAffiliation]) {} case class MagPaperWithAuthorList(PaperId: Long, authors: List[MagAuthorAffiliation]) {}
case class MagPaperAuthorDenormalized(PaperId: Long, author: MagAuthor, affiliation:String, sequenceNumber:Int) {} case class MagPaperAuthorDenormalized(
PaperId: Long,
author: MagAuthor,
affiliation: String,
sequenceNumber: Int
) {}
case class MagPaperUrl(PaperId: Long, SourceType: Option[Int], SourceUrl: Option[String], LanguageCode: Option[String]) {} case class MagPaperUrl(
PaperId: Long,
SourceType: Option[Int],
SourceUrl: Option[String],
LanguageCode: Option[String]
) {}
case class MagUrlInstance(SourceUrl: String) {} case class MagUrlInstance(SourceUrl: String) {}
case class MagUrl(PaperId: Long, instances: List[MagUrlInstance]) case class MagUrl(PaperId: Long, instances: List[MagUrlInstance])
case class MagSubject(FieldOfStudyId:Long, DisplayName:String, MainType:Option[String], Score:Float){} case class MagSubject(
FieldOfStudyId: Long,
DisplayName: String,
MainType: Option[String],
Score: Float
) {}
case class MagFieldOfStudy(PaperId: Long, subjects: List[MagSubject]) {} case class MagFieldOfStudy(PaperId: Long, subjects: List[MagSubject]) {}
case class MagJournal(JournalId: Long, Rank: Option[Int], NormalizedName: Option[String], DisplayName: Option[String], Issn: Option[String], Publisher: Option[String], Webpage: Option[String], PaperCount: Option[Long], CitationCount: Option[Long], CreatedDate: Option[java.sql.Timestamp]) {} case class MagJournal(
JournalId: Long,
Rank: Option[Int],
NormalizedName: Option[String],
DisplayName: Option[String],
Issn: Option[String],
Publisher: Option[String],
Webpage: Option[String],
PaperCount: Option[Long],
CitationCount: Option[Long],
CreatedDate: Option[java.sql.Timestamp]
) {}
case class MagConferenceInstance(
case class MagConferenceInstance(ci:Long, DisplayName:Option[String], Location:Option[String], StartDate:Option[java.sql.Timestamp], EndDate:Option[java.sql.Timestamp], PaperId:Long){} ci: Long,
DisplayName: Option[String],
Location: Option[String],
StartDate: Option[java.sql.Timestamp],
EndDate: Option[java.sql.Timestamp],
PaperId: Long
) {}
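These case classes are what spark.read.load(...).as[...] decodes into later in the pipeline, which works because Spark derives a product encoder for each of them; Option[...] fields become nullable columns. A tiny check of that derivation, using MagSubjectLite as an illustrative renamed copy of MagSubject:

import org.apache.spark.sql.{Encoder, Encoders}

case class MagSubjectLite(FieldOfStudyId: Long, DisplayName: String, MainType: Option[String], Score: Float)

object MagEncoderSketch {
  def main(args: Array[String]): Unit = {
    val enc: Encoder[MagSubjectLite] = Encoders.product[MagSubjectLite]
    enc.schema.printTreeString() // MainType shows up as a nullable string column
  }
}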
case object ConversionUtil { case object ConversionUtil {
@ -65,7 +140,6 @@ case object ConversionUtil {
null null
} }
def mergePublication(a: Publication, b: Publication): Publication = { def mergePublication(a: Publication, b: Publication): Publication = {
if ((a != null) && (b != null)) { if ((a != null) && (b != null)) {
a.mergeFrom(b) a.mergeFrom(b)
@ -74,7 +148,6 @@ case object ConversionUtil {
if (a == null) b else a if (a == null) b else a
} }
} }
def choiceLatestMagArtitcle(p1: MagPapers, p2: MagPapers): MagPapers = { def choiceLatestMagArtitcle(p1: MagPapers, p2: MagPapers): MagPapers = {
@ -93,8 +166,9 @@ case object ConversionUtil {
} }
def updatePubsWithDescription(
def updatePubsWithDescription(inputItem:((String, Publication), MagPaperAbstract)) : Publication = { inputItem: ((String, Publication), MagPaperAbstract)
): Publication = {
val pub = inputItem._1._2 val pub = inputItem._1._2
val abst = inputItem._2 val abst = inputItem._2
if (abst != null) { if (abst != null) {
@ -104,8 +178,9 @@ case object ConversionUtil {
} }
def updatePubsWithConferenceInfo(
def updatePubsWithConferenceInfo(inputItem:((String, Publication), MagConferenceInstance)) : Publication = { inputItem: ((String, Publication), MagConferenceInstance)
): Publication = {
val publication: Publication = inputItem._1._2 val publication: Publication = inputItem._1._2
val ci: MagConferenceInstance = inputItem._2 val ci: MagConferenceInstance = inputItem._2
@ -115,9 +190,10 @@ case object ConversionUtil {
if (ci.Location.isDefined) if (ci.Location.isDefined)
j.setConferenceplace(ci.Location.get) j.setConferenceplace(ci.Location.get)
j.setName(ci.DisplayName.get) j.setName(ci.DisplayName.get)
if (ci.StartDate.isDefined && ci.EndDate.isDefined) if (ci.StartDate.isDefined && ci.EndDate.isDefined) {
{ j.setConferencedate(
j.setConferencedate(s"${ci.StartDate.get.toString.substring(0,10)} - ${ci.EndDate.get.toString.substring(0,10)}") s"${ci.StartDate.get.toString.substring(0, 10)} - ${ci.EndDate.get.toString.substring(0, 10)}"
)
} }
publication.setJournal(j) publication.setJournal(j)
@ -135,16 +211,34 @@ case object ConversionUtil {
val classid = "MAG" val classid = "MAG"
val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap(s => { val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap(s => {
val s1 = createSP(s.DisplayName, classid,className, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES) val s1 = createSP(
s.DisplayName,
classid,
className,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES
)
val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString) val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString)
var resList: List[StructuredProperty] = List(s1) var resList: List[StructuredProperty] = List(s1)
if (s.MainType.isDefined) { if (s.MainType.isDefined) {
val maintp = s.MainType.get val maintp = s.MainType.get
val s2 = createSP(s.MainType.get, classid,className, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES) val s2 = createSP(
s.MainType.get,
classid,
className,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES
)
s2.setDataInfo(di) s2.setDataInfo(di)
resList = resList ::: List(s2) resList = resList ::: List(s2)
if (maintp.contains(".")) { if (maintp.contains(".")) {
val s3 = createSP(maintp.split("\\.").head, classid,className, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES) val s3 = createSP(
maintp.split("\\.").head,
classid,
className,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES
)
s3.setDataInfo(di) s3.setDataInfo(di)
resList = resList ::: List(s3) resList = resList ::: List(s3)
} }
@ -156,25 +250,27 @@ case object ConversionUtil {
publication publication
} }
def addInstances(a: (Publication, MagUrl)): Publication = { def addInstances(a: (Publication, MagUrl)): Publication = {
val pub = a._1 val pub = a._1
val urls = a._2 val urls = a._2
val i = new Instance val i = new Instance
if (urls != null) { if (urls != null) {
val l:List[String] = urls.instances.filter(k=>k.SourceUrl.nonEmpty).map(k=>k.SourceUrl):::List(s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}") val l: List[String] = urls.instances
.filter(k => k.SourceUrl.nonEmpty)
.map(k => k.SourceUrl) ::: List(
s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}"
)
i.setUrl(l.asJava) i.setUrl(l.asJava)
} } else
else i.setUrl(
i.setUrl(List(s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}").asJava) List(
s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}"
).asJava
)
// Ticket #6281 added pid to Instance // Ticket #6281 added pid to Instance
i.setPid(pub.getPid) i.setPid(pub.getPid)
@ -184,13 +280,13 @@ case object ConversionUtil {
pub pub
} }
def transformPaperAbstract(input: MagPaperAbstract): MagPaperAbstract = { def transformPaperAbstract(input: MagPaperAbstract): MagPaperAbstract = {
MagPaperAbstract(input.PaperId, convertInvertedIndexString(input.IndexedAbstract)) MagPaperAbstract(input.PaperId, convertInvertedIndexString(input.IndexedAbstract))
} }
def createOAFFromJournalAuthorPaper(
def createOAFFromJournalAuthorPaper(inputParams: ((MagPapers, MagJournal), MagPaperWithAuthorList)): Publication = { inputParams: ((MagPapers, MagJournal), MagPaperWithAuthorList)
): Publication = {
val paper = inputParams._1._1 val paper = inputParams._1._1
val journal = inputParams._1._2 val journal = inputParams._1._2
val authors = inputParams._2 val authors = inputParams._2
@ -206,31 +302,37 @@ case object ConversionUtil {
pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub)) pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub))
val mainTitles = createSP(paper.PaperTitle, "main title", ModelConstants.DNET_DATACITE_TITLE) val mainTitles = createSP(paper.PaperTitle, "main title", ModelConstants.DNET_DATACITE_TITLE)
val originalTitles = createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE) val originalTitles =
createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
pub.setTitle(List(mainTitles, originalTitles).asJava) pub.setTitle(List(mainTitles, originalTitles).asJava)
pub.setSource(List(asField(paper.BookTitle)).asJava) pub.setSource(List(asField(paper.BookTitle)).asJava)
val authorsOAF = authors.authors.map { f: MagAuthorAffiliation => val authorsOAF = authors.authors.map { f: MagAuthorAffiliation =>
val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author
a.setRank(f.sequenceNumber) a.setRank(f.sequenceNumber)
if (f.author.DisplayName.isDefined) if (f.author.DisplayName.isDefined)
a.setFullname(f.author.DisplayName.get) a.setFullname(f.author.DisplayName.get)
if (f.affiliation != null) if (f.affiliation != null)
a.setAffiliation(List(asField(f.affiliation)).asJava) a.setAffiliation(List(asField(f.affiliation)).asJava)
a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", ModelConstants.DNET_PID_TYPES)).asJava) a.setPid(
List(
createSP(
s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}",
"URL",
ModelConstants.DNET_PID_TYPES
)
).asJava
)
a a
} }
pub.setAuthor(authorsOAF.asJava) pub.setAuthor(authorsOAF.asJava)
if (paper.Date != null && paper.Date.isDefined) { if (paper.Date != null && paper.Date.isDefined) {
pub.setDateofacceptance(asField(paper.Date.get.toString.substring(0, 10))) pub.setDateofacceptance(asField(paper.Date.get.toString.substring(0, 10)))
} }
pub.setPublisher(asField(paper.Publisher)) pub.setPublisher(asField(paper.Publisher))
if (journal != null && journal.DisplayName.isDefined) { if (journal != null && journal.DisplayName.isDefined) {
val j = new Journal val j = new Journal
@ -250,8 +352,9 @@ case object ConversionUtil {
pub pub
} }
def createOAF(
def createOAF(inputParams: ((MagPapers, MagPaperWithAuthorList), MagPaperAbstract)): Publication = { inputParams: ((MagPapers, MagPaperWithAuthorList), MagPaperAbstract)
): Publication = {
val paper = inputParams._1._1 val paper = inputParams._1._1
val authors = inputParams._1._2 val authors = inputParams._1._2
@ -268,19 +371,17 @@ case object ConversionUtil {
pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub)) pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub))
val mainTitles = createSP(paper.PaperTitle, "main title", ModelConstants.DNET_DATACITE_TITLE) val mainTitles = createSP(paper.PaperTitle, "main title", ModelConstants.DNET_DATACITE_TITLE)
val originalTitles = createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE) val originalTitles =
createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
pub.setTitle(List(mainTitles, originalTitles).asJava) pub.setTitle(List(mainTitles, originalTitles).asJava)
pub.setSource(List(asField(paper.BookTitle)).asJava) pub.setSource(List(asField(paper.BookTitle)).asJava)
if (description != null) { if (description != null) {
pub.setDescription(List(asField(description.IndexedAbstract)).asJava) pub.setDescription(List(asField(description.IndexedAbstract)).asJava)
} }
val authorsOAF = authors.authors.map { f: MagAuthorAffiliation => val authorsOAF = authors.authors.map { f: MagAuthorAffiliation =>
val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author
a.setFullname(f.author.DisplayName.get) a.setFullname(f.author.DisplayName.get)
@ -288,26 +389,30 @@ case object ConversionUtil {
if (f.affiliation != null) if (f.affiliation != null)
a.setAffiliation(List(asField(f.affiliation)).asJava) a.setAffiliation(List(asField(f.affiliation)).asJava)
a.setPid(
a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", ModelConstants.DNET_PID_TYPES)).asJava) List(
createSP(
s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}",
"URL",
ModelConstants.DNET_PID_TYPES
)
).asJava
)
a a
} }
if (paper.Date != null) { if (paper.Date != null) {
pub.setDateofacceptance(asField(paper.Date.toString.substring(0, 10))) pub.setDateofacceptance(asField(paper.Date.toString.substring(0, 10)))
} }
pub.setAuthor(authorsOAF.asJava) pub.setAuthor(authorsOAF.asJava)
pub pub
} }
def convertInvertedIndexString(json_input: String): String = { def convertInvertedIndexString(json_input: String): String = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(json_input) lazy val json: json4s.JValue = parse(json_input)
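convertInvertedIndexString has to turn MAG's inverted abstract representation (token mapped to its positions) back into plain text. A standalone sketch of that reconstruction; the IndexLength and InvertedIndex field names follow the published MAG format and are assumptions here, not taken from the code above:

import org.json4s._
import org.json4s.jackson.JsonMethods.parse

object InvertedAbstractSketch {
  def rebuild(jsonInput: String): String = {
    implicit val formats: DefaultFormats.type = DefaultFormats
    val json = parse(jsonInput)
    val length = (json \ "IndexLength").extract[Int]
    val slots = new Array[String](length)
    val placed = for {
      JObject(index)                   <- json \ "InvertedIndex"
      JField(token, JArray(positions)) <- index
      JInt(pos)                        <- positions
    } yield (pos.toInt, token)
    placed.foreach { case (p, t) => if (p >= 0 && p < length) slots(p) = t }
    slots.filter(_ != null).mkString(" ")
  }

  def main(args: Array[String]): Unit = {
    val sample = """{"IndexLength":4,"InvertedIndex":{"An":[0],"inverted":[1],"abstract":[2],"rebuilt":[3]}}"""
    println(rebuild(sample)) // An inverted abstract rebuilt
  }
}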

View File

@ -8,6 +8,7 @@ import org.apache.spark.sql.{SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
object SparkImportMagIntoDataset { object SparkImportMagIntoDataset {
val datatypedict = Map( val datatypedict = Map(
"bool" -> BooleanType, "bool" -> BooleanType,
"int" -> IntegerType, "int" -> IntegerType,
@ -19,32 +20,232 @@ object SparkImportMagIntoDataset {
"DateTime" -> DateType "DateTime" -> DateType
) )
val stream = Map( val stream = Map(
"Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Iso3166Code:string", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")), "Affiliations" -> Tuple2(
"AuthorExtendedAttributes" -> Tuple2("mag/AuthorExtendedAttributes.txt", Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")), "mag/Affiliations.txt",
"Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), Seq(
"ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")), "AffiliationId:long",
"ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), "Rank:uint",
"EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")), "NormalizedName:string",
"FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")), "DisplayName:string",
"FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")), "GridId:string",
"FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), "OfficialPage:string",
"Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), "WikiPage:string",
"PaperAbstractsInvertedIndex" -> Tuple2("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", "IndexedAbstract:string")), "PaperCount:long",
"PaperAuthorAffiliations" -> Tuple2("mag/PaperAuthorAffiliations.txt", Seq("PaperId:long", "AuthorId:long", "AffiliationId:long?", "AuthorSequenceNumber:uint", "OriginalAuthor:string", "OriginalAffiliation:string")), "PaperFamilyCount:long",
"PaperCitationContexts" -> Tuple2("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")), "CitationCount:long",
"PaperExtendedAttributes" -> Tuple2("mag/PaperExtendedAttributes.txt", Seq("PaperId:long", "AttributeType:int", "AttributeValue:string")), "Iso3166Code:string",
"PaperFieldsOfStudy" -> Tuple2("advanced/PaperFieldsOfStudy.txt", Seq("PaperId:long", "FieldOfStudyId:long", "Score:float")), "Latitude:float?",
"PaperMeSH" -> Tuple2("advanced/PaperMeSH.txt", Seq("PaperId:long", "DescriptorUI:string", "DescriptorName:string", "QualifierUI:string", "QualifierName:string", "IsMajorTopic:bool")), "Longitude:float?",
"PaperRecommendations" -> Tuple2("advanced/PaperRecommendations.txt", Seq("PaperId:long", "RecommendedPaperId:long", "Score:float")), "CreatedDate:DateTime"
"PaperReferences" -> Tuple2("mag/PaperReferences.txt", Seq("PaperId:long", "PaperReferenceId:long")), )
"PaperResources" -> Tuple2("mag/PaperResources.txt", Seq("PaperId:long", "ResourceType:int", "ResourceUrl:string", "SourceUrl:string", "RelationshipType:int")), ),
"PaperUrls" -> Tuple2("mag/PaperUrls.txt", Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")), "AuthorExtendedAttributes" -> Tuple2(
"Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "OnlineDate:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "FamilyRank:uint?", "DocSubTypes:string", "CreatedDate:DateTime")), "mag/AuthorExtendedAttributes.txt",
"RelatedFieldOfStudy" -> Tuple2("advanced/RelatedFieldOfStudy.txt", Seq("FieldOfStudyId1:long", "Type1:string", "FieldOfStudyId2:long", "Type2:string", "Rank:float")) Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")
),
"Authors" -> Tuple2(
"mag/Authors.txt",
Seq(
"AuthorId:long",
"Rank:uint",
"NormalizedName:string",
"DisplayName:string",
"LastKnownAffiliationId:long?",
"PaperCount:long",
"PaperFamilyCount:long",
"CitationCount:long",
"CreatedDate:DateTime"
)
),
"ConferenceInstances" -> Tuple2(
"mag/ConferenceInstances.txt",
Seq(
"ConferenceInstanceId:long",
"NormalizedName:string",
"DisplayName:string",
"ConferenceSeriesId:long",
"Location:string",
"OfficialUrl:string",
"StartDate:DateTime?",
"EndDate:DateTime?",
"AbstractRegistrationDate:DateTime?",
"SubmissionDeadlineDate:DateTime?",
"NotificationDueDate:DateTime?",
"FinalVersionDueDate:DateTime?",
"PaperCount:long",
"PaperFamilyCount:long",
"CitationCount:long",
"Latitude:float?",
"Longitude:float?",
"CreatedDate:DateTime"
)
),
"ConferenceSeries" -> Tuple2(
"mag/ConferenceSeries.txt",
Seq(
"ConferenceSeriesId:long",
"Rank:uint",
"NormalizedName:string",
"DisplayName:string",
"PaperCount:long",
"PaperFamilyCount:long",
"CitationCount:long",
"CreatedDate:DateTime"
)
),
"EntityRelatedEntities" -> Tuple2(
"advanced/EntityRelatedEntities.txt",
Seq(
"EntityId:long",
"EntityType:string",
"RelatedEntityId:long",
"RelatedEntityType:string",
"RelatedType:int",
"Score:float"
)
),
"FieldOfStudyChildren" -> Tuple2(
"advanced/FieldOfStudyChildren.txt",
Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")
),
"FieldOfStudyExtendedAttributes" -> Tuple2(
"advanced/FieldOfStudyExtendedAttributes.txt",
Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")
),
"FieldsOfStudy" -> Tuple2(
"advanced/FieldsOfStudy.txt",
Seq(
"FieldOfStudyId:long",
"Rank:uint",
"NormalizedName:string",
"DisplayName:string",
"MainType:string",
"Level:int",
"PaperCount:long",
"PaperFamilyCount:long",
"CitationCount:long",
"CreatedDate:DateTime"
)
),
"Journals" -> Tuple2(
"mag/Journals.txt",
Seq(
"JournalId:long",
"Rank:uint",
"NormalizedName:string",
"DisplayName:string",
"Issn:string",
"Publisher:string",
"Webpage:string",
"PaperCount:long",
"PaperFamilyCount:long",
"CitationCount:long",
"CreatedDate:DateTime"
)
),
"PaperAbstractsInvertedIndex" -> Tuple2(
"nlp/PaperAbstractsInvertedIndex.txt.*",
Seq("PaperId:long", "IndexedAbstract:string")
),
"PaperAuthorAffiliations" -> Tuple2(
"mag/PaperAuthorAffiliations.txt",
Seq(
"PaperId:long",
"AuthorId:long",
"AffiliationId:long?",
"AuthorSequenceNumber:uint",
"OriginalAuthor:string",
"OriginalAffiliation:string"
)
),
"PaperCitationContexts" -> Tuple2(
"nlp/PaperCitationContexts.txt",
Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")
),
"PaperExtendedAttributes" -> Tuple2(
"mag/PaperExtendedAttributes.txt",
Seq("PaperId:long", "AttributeType:int", "AttributeValue:string")
),
"PaperFieldsOfStudy" -> Tuple2(
"advanced/PaperFieldsOfStudy.txt",
Seq("PaperId:long", "FieldOfStudyId:long", "Score:float")
),
"PaperMeSH" -> Tuple2(
"advanced/PaperMeSH.txt",
Seq(
"PaperId:long",
"DescriptorUI:string",
"DescriptorName:string",
"QualifierUI:string",
"QualifierName:string",
"IsMajorTopic:bool"
)
),
"PaperRecommendations" -> Tuple2(
"advanced/PaperRecommendations.txt",
Seq("PaperId:long", "RecommendedPaperId:long", "Score:float")
),
"PaperReferences" -> Tuple2(
"mag/PaperReferences.txt",
Seq("PaperId:long", "PaperReferenceId:long")
),
"PaperResources" -> Tuple2(
"mag/PaperResources.txt",
Seq(
"PaperId:long",
"ResourceType:int",
"ResourceUrl:string",
"SourceUrl:string",
"RelationshipType:int"
)
),
"PaperUrls" -> Tuple2(
"mag/PaperUrls.txt",
Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")
),
"Papers" -> Tuple2(
"mag/Papers.txt",
Seq(
"PaperId:long",
"Rank:uint",
"Doi:string",
"DocType:string",
"PaperTitle:string",
"OriginalTitle:string",
"BookTitle:string",
"Year:int?",
"Date:DateTime?",
"OnlineDate:DateTime?",
"Publisher:string",
"JournalId:long?",
"ConferenceSeriesId:long?",
"ConferenceInstanceId:long?",
"Volume:string",
"Issue:string",
"FirstPage:string",
"LastPage:string",
"ReferenceCount:long",
"CitationCount:long",
"EstimatedCitation:long",
"OriginalVenue:string",
"FamilyId:long?",
"FamilyRank:uint?",
"DocSubTypes:string",
"CreatedDate:DateTime"
)
),
"RelatedFieldOfStudy" -> Tuple2(
"advanced/RelatedFieldOfStudy.txt",
Seq(
"FieldOfStudyId1:long",
"Type1:string",
"FieldOfStudyId2:long",
"Type2:string",
"Rank:float"
)
)
) )
def getSchema(streamName: String): StructType = { def getSchema(streamName: String): StructType = {
var schema = new StructType() var schema = new StructType()
@ -61,19 +262,22 @@ object SparkImportMagIntoDataset {
schema schema
} }
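getSchema above folds a list of "Name:type" strings into a Spark StructType using datatypedict. Its body is only partly visible here, so the following is a guess at the mechanics; the dictionary entries other than bool, int and DateTime, and the handling of the trailing "?" as a nullability marker, are assumptions:

import org.apache.spark.sql.types._

object MagSchemaSketch {
  // Entries beyond those shown in the diff above are filled in as plausible guesses
  val datatypedict = Map(
    "bool"     -> BooleanType,
    "int"      -> IntegerType,
    "uint"     -> IntegerType,
    "long"     -> LongType,
    "float"    -> FloatType,
    "string"   -> StringType,
    "DateTime" -> DateType
  )

  def getSchema(columns: Seq[String]): StructType =
    columns.foldLeft(new StructType()) { (schema, col) =>
      val Array(name, tpe) = col.split(":", 2)
      schema.add(StructField(name, datatypedict(tpe.stripSuffix("?")), tpe.endsWith("?")))
    }

  def main(args: Array[String]): Unit =
    getSchema(Seq("AuthorId:long", "LastKnownAffiliationId:long?", "DisplayName:string")).printTreeString()
}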
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass) val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/convert_mag_to_oaf_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/convert_mag_to_oaf_params.json")
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
stream.foreach { case (k, v) => stream.foreach { case (k, v) =>
val s: StructType = getSchema(k) val s: StructType = getSchema(k)

View File

@ -9,6 +9,7 @@ import org.apache.spark.sql.functions.{col, collect_list, struct}
import org.apache.spark.sql._ import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
object SparkProcessMAG { object SparkProcessMAG {
def getDistinctResults(d: Dataset[MagPapers]): Dataset[MagPapers] = { def getDistinctResults(d: Dataset[MagPapers]): Dataset[MagPapers] = {
@ -17,13 +18,31 @@ object SparkProcessMAG {
.reduceGroups((p1: MagPapers, p2: MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1, p2)) .reduceGroups((p1: MagPapers, p2: MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1, p2))
.map(_._2)(Encoders.product[MagPapers]) .map(_._2)(Encoders.product[MagPapers])
.map(mp => { .map(mp => {
MagPapers(mp.PaperId, mp.Rank, DoiBoostMappingUtil.normalizeDoi(mp.Doi), MagPapers(
mp.DocType, mp.PaperTitle, mp.OriginalTitle, mp.PaperId,
mp.BookTitle, mp.Year, mp.Date, mp.Publisher: String, mp.Rank,
mp.JournalId, mp.ConferenceSeriesId, mp.ConferenceInstanceId, DoiBoostMappingUtil.normalizeDoi(mp.Doi),
mp.Volume, mp.Issue, mp.FirstPage, mp.LastPage, mp.DocType,
mp.ReferenceCount, mp.CitationCount, mp.EstimatedCitation, mp.PaperTitle,
mp.OriginalVenue, mp.FamilyId, mp.CreatedDate) mp.OriginalTitle,
mp.BookTitle,
mp.Year,
mp.Date,
mp.Publisher: String,
mp.JournalId,
mp.ConferenceSeriesId,
mp.ConferenceInstanceId,
mp.Volume,
mp.Issue,
mp.FirstPage,
mp.LastPage,
mp.ReferenceCount,
mp.CitationCount,
mp.EstimatedCitation,
mp.OriginalVenue,
mp.FamilyId,
mp.CreatedDate
)
})(Encoders.product[MagPapers]) })(Encoders.product[MagPapers])
} }
@ -31,22 +50,29 @@ object SparkProcessMAG {
val logger: Logger = LoggerFactory.getLogger(getClass) val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json")
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
val sourcePath = parser.get("sourcePath") val sourcePath = parser.get("sourcePath")
val workingPath = parser.get("workingPath") val workingPath = parser.get("workingPath")
val targetPath = parser.get("targetPath") val targetPath = parser.get("targetPath")
import spark.implicits._ import spark.implicits._
implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication] implicit val mapEncoderPubs: Encoder[Publication] =
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs) org.apache.spark.sql.Encoders.kryo[Publication]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] =
Encoders.tuple(Encoders.STRING, mapEncoderPubs)
logger.info("Phase 1) make unique DOI in Papers:") logger.info("Phase 1) make unique DOI in Papers:")
val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers] val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers]
@ -58,16 +84,23 @@ object SparkProcessMAG {
logger.info("Phase 0) Enrich Publication with description") logger.info("Phase 0) Enrich Publication with description")
val pa = spark.read.load(s"$sourcePath/PaperAbstractsInvertedIndex").as[MagPaperAbstract] val pa = spark.read.load(s"$sourcePath/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"$workingPath/PaperAbstract") pa.map(ConversionUtil.transformPaperAbstract)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/PaperAbstract")
logger.info("Phase 3) Group Author by PaperId") logger.info("Phase 3) Group Author by PaperId")
val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor] val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]
val affiliation = spark.read.load(s"$sourcePath/Affiliations").as[MagAffiliation] val affiliation = spark.read.load(s"$sourcePath/Affiliations").as[MagAffiliation]
val paperAuthorAffiliation = spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation] val paperAuthorAffiliation =
spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation]
paperAuthorAffiliation.joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId"))) paperAuthorAffiliation
.map { case (a: MagPaperAuthorAffiliation, b: MagAuthor) => (a.AffiliationId, MagPaperAuthorDenormalized(a.PaperId, b, null, a.AuthorSequenceNumber)) } .joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId")))
.map { case (a: MagPaperAuthorAffiliation, b: MagAuthor) =>
(a.AffiliationId, MagPaperAuthorDenormalized(a.PaperId, b, null, a.AuthorSequenceNumber))
}
.joinWith(affiliation, affiliation("AffiliationId").equalTo(col("_1")), "left") .joinWith(affiliation, affiliation("AffiliationId").equalTo(col("_1")), "left")
.map(s => { .map(s => {
val mpa = s._1._2 val mpa = s._1._2
@ -76,79 +109,133 @@ object SparkProcessMAG {
MagPaperAuthorDenormalized(mpa.PaperId, mpa.author, af.DisplayName, mpa.sequenceNumber) MagPaperAuthorDenormalized(mpa.PaperId, mpa.author, af.DisplayName, mpa.sequenceNumber)
} else } else
mpa mpa
}).groupBy("PaperId").agg(collect_list(struct($"author", $"affiliation", $"sequenceNumber")).as("authors")) })
.write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_1_paper_authors") .groupBy("PaperId")
.agg(collect_list(struct($"author", $"affiliation", $"sequenceNumber")).as("authors"))
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/merge_step_1_paper_authors")
logger.info("Phase 4) create First Version of publication Entity with Paper Journal and Authors") logger.info(
"Phase 4) create First Version of publication Entity with Paper Journal and Authors"
)
val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal] val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal]
val papers = spark.read.load((s"$workingPath/Papers_distinct")).as[MagPapers] val papers = spark.read.load(s"$workingPath/Papers_distinct").as[MagPapers]
val paperWithAuthors = spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList] val paperWithAuthors =
spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
val firstJoin = papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left") val firstJoin =
firstJoin.joinWith(paperWithAuthors, firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")), "left") papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
firstJoin
.joinWith(
paperWithAuthors,
firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")),
"left"
)
.map { a => ConversionUtil.createOAFFromJournalAuthorPaper(a) } .map { a => ConversionUtil.createOAFFromJournalAuthorPaper(a) }
.write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_2") .write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/merge_step_2")
var magPubs: Dataset[(String, Publication)] = var magPubs: Dataset[(String, Publication)] =
spark.read.load(s"$workingPath/merge_step_2").as[Publication] spark.read
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)] .load(s"$workingPath/merge_step_2")
.as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
.as[(String, Publication)]
val conference = spark.read
.load(s"$sourcePath/ConferenceInstances")
.select(
$"ConferenceInstanceId".as("ci"),
$"DisplayName",
$"Location",
$"StartDate",
$"EndDate"
)
val conferenceInstance = conference
.joinWith(papers, papers("ConferenceInstanceId").equalTo(conference("ci")))
.select(
$"_1.ci",
$"_1.DisplayName",
$"_1.Location",
$"_1.StartDate",
$"_1.EndDate",
$"_2.PaperId"
)
.as[MagConferenceInstance]
val conference = spark.read.load(s"$sourcePath/ConferenceInstances") magPubs
.select($"ConferenceInstanceId".as("ci"), $"DisplayName", $"Location", $"StartDate", $"EndDate") .joinWith(conferenceInstance, col("_1").equalTo(conferenceInstance("PaperId")), "left")
val conferenceInstance = conference.joinWith(papers, papers("ConferenceInstanceId").equalTo(conference("ci")))
.select($"_1.ci", $"_1.DisplayName", $"_1.Location", $"_1.StartDate", $"_1.EndDate", $"_2.PaperId").as[MagConferenceInstance]
magPubs.joinWith(conferenceInstance, col("_1").equalTo(conferenceInstance("PaperId")), "left")
.map(item => ConversionUtil.updatePubsWithConferenceInfo(item)) .map(item => ConversionUtil.updatePubsWithConferenceInfo(item))
.write .write
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.save(s"$workingPath/merge_step_3") .save(s"$workingPath/merge_step_3")
val paperAbstract = spark.read.load(s"$workingPath/PaperAbstract").as[MagPaperAbstract]
val paperAbstract = spark.read.load((s"$workingPath/PaperAbstract")).as[MagPaperAbstract] magPubs = spark.read
.load(s"$workingPath/merge_step_3")
.as[Publication]
magPubs = spark.read.load(s"$workingPath/merge_step_3").as[Publication] .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)] .as[(String, Publication)]
magPubs.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
.map(item => ConversionUtil.updatePubsWithDescription(item)
).write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_4")
magPubs
.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
.map(item => ConversionUtil.updatePubsWithDescription(item))
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/merge_step_4")
logger.info("Phase 7) Enrich Publication with FieldOfStudy") logger.info("Phase 7) Enrich Publication with FieldOfStudy")
magPubs = spark.read.load(s"$workingPath/merge_step_4").as[Publication] magPubs = spark.read
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)] .load(s"$workingPath/merge_step_4")
.as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
.as[(String, Publication)]
val fos = spark.read.load(s"$sourcePath/FieldsOfStudy").select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType") val fos = spark.read
.load(s"$sourcePath/FieldsOfStudy")
.select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")
val pfos = spark.read.load(s"$sourcePath/PaperFieldsOfStudy") val pfos = spark.read.load(s"$sourcePath/PaperFieldsOfStudy")
val paperField = pfos.joinWith(fos, fos("fos").equalTo(pfos("FieldOfStudyId"))) val paperField = pfos
.joinWith(fos, fos("fos").equalTo(pfos("FieldOfStudyId")))
.select($"_1.FieldOfStudyId", $"_2.DisplayName", $"_2.MainType", $"_1.PaperId", $"_1.Score") .select($"_1.FieldOfStudyId", $"_2.DisplayName", $"_2.MainType", $"_1.PaperId", $"_1.Score")
.groupBy($"PaperId").agg(collect_list(struct($"FieldOfStudyId", $"DisplayName", $"MainType", $"Score")).as("subjects")) .groupBy($"PaperId")
.agg(
collect_list(struct($"FieldOfStudyId", $"DisplayName", $"MainType", $"Score"))
.as("subjects")
)
.as[MagFieldOfStudy] .as[MagFieldOfStudy]
magPubs.joinWith(paperField, col("_1") magPubs
.equalTo(paperField("PaperId")), "left") .joinWith(
paperField,
col("_1")
.equalTo(paperField("PaperId")),
"left"
)
.map(item => ConversionUtil.updatePubsWithSubject(item)) .map(item => ConversionUtil.updatePubsWithSubject(item))
.write.mode(SaveMode.Overwrite) .write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/mag_publication") .save(s"$workingPath/mag_publication")
spark.read.load(s"$workingPath/mag_publication").as[Publication] spark.read
.load(s"$workingPath/mag_publication")
.as[Publication]
.filter(p => p.getId != null) .filter(p => p.getId != null)
.groupByKey(p => p.getId) .groupByKey(p => p.getId)
.reduceGroups((a: Publication, b: Publication) => ConversionUtil.mergePublication(a, b)) .reduceGroups((a: Publication, b: Publication) => ConversionUtil.mergePublication(a, b))
.map(_._2) .map(_._2)
.write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication") .write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/magPublication")
} }
} }
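Phase 3 above turns one row per (paper, author, affiliation) into one row per paper with a nested authors array, via groupBy plus collect_list(struct(...)). A self-contained, locally runnable illustration of just that step, with toy rows and hypothetical names:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{collect_list, struct}

object GroupAuthorsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("GroupAuthorsSketch").master("local[*]").getOrCreate()
    import spark.implicits._

    val rows = Seq(
      (1L, "Ada Lovelace", "Analytical Engine Society", 1),
      (1L, "Charles Babbage", "Analytical Engine Society", 2),
      (2L, "Alan Turing", "NPL", 1)
    ).toDF("PaperId", "author", "affiliation", "sequenceNumber")

    rows
      .groupBy("PaperId")
      .agg(collect_list(struct($"author", $"affiliation", $"sequenceNumber")).as("authors"))
      .show(truncate = false)

    spark.stop()
  }
}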

View File

@ -15,15 +15,20 @@ import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
case class ORCIDItem(doi: String, authors: List[OrcidAuthor]) {} case class ORCIDItem(doi: String, authors: List[OrcidAuthor]) {}
case class OrcidAuthor(oid:String, name:Option[String], surname:Option[String], creditName:Option[String], otherNames:Option[List[String]], errorCode:Option[String]){}
case class OrcidAuthor(
oid: String,
name: Option[String],
surname: Option[String],
creditName: Option[String],
otherNames: Option[List[String]],
errorCode: Option[String]
) {}
case class OrcidWork(oid: String, doi: String) case class OrcidWork(oid: String, doi: String)
case class ORCIDElement(doi: String, authors: List[ORCIDItem]) {} case class ORCIDElement(doi: String, authors: List[ORCIDItem]) {}
object ORCIDToOAF { object ORCIDToOAF {
val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass) val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
@ -51,7 +56,6 @@ object ORCIDToOAF {
} else null } else null
} }
def strValid(s: Option[String]): Boolean = { def strValid(s: Option[String]): Boolean = {
s.isDefined && s.get.nonEmpty s.isDefined && s.get.nonEmpty
} }
@ -70,7 +74,6 @@ object ORCIDToOAF {
false false
} }
def extractDOIWorks(input: String): List[OrcidWork] = { def extractDOIWorks(input: String): List[OrcidWork] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input) lazy val json: json4s.JValue = parse(input)
@ -97,7 +100,6 @@ object ORCIDToOAF {
(json \ "authorData").extractOrElse[OrcidAuthor](null) (json \ "authorData").extractOrElse[OrcidAuthor](null)
} }
def convertTOOAF(input: ORCIDItem): Publication = { def convertTOOAF(input: ORCIDItem): Publication = {
val doi = input.doi val doi = input.doi
val pub: Publication = new Publication val pub: Publication = new Publication
@ -145,10 +147,18 @@ object ORCIDToOAF {
else if (strValid(o.creditName)) else if (strValid(o.creditName))
a.setFullname(o.creditName.get) a.setFullname(o.creditName.get)
if (StringUtils.isNotBlank(o.oid)) if (StringUtils.isNotBlank(o.oid))
a.setPid(List(createSP(o.oid, ModelConstants.ORCID, ModelConstants.DNET_PID_TYPES, generateOricPIDDatainfo())).asJava) a.setPid(
List(
createSP(
o.oid,
ModelConstants.ORCID,
ModelConstants.DNET_PID_TYPES,
generateOricPIDDatainfo()
)
).asJava
)
a a
} }
} }
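Only the creditName fallback is visible in the hunk above; the given and family names are presumably preferred when both are present. The sketch below is a guess at that selection rule, shown standalone:

object OrcidNameSketch {
  def strValid(s: Option[String]): Boolean = s.isDefined && s.get.nonEmpty

  // Assumed rule: given + family name when both are valid, otherwise the credit name
  def fullname(name: Option[String], surname: Option[String], creditName: Option[String]): Option[String] =
    if (strValid(name) && strValid(surname)) Some(s"${name.get} ${surname.get}")
    else if (strValid(creditName)) creditName
    else None

  def main(args: Array[String]): Unit = {
    println(fullname(Some("Grace"), Some("Hopper"), None)) // Some(Grace Hopper)
    println(fullname(None, None, Some("G. M. Hopper")))    // Some(G. M. Hopper)
  }
}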

View File

@ -10,11 +10,11 @@ import org.slf4j.{Logger, LoggerFactory}
object SparkConvertORCIDToOAF { object SparkConvertORCIDToOAF {
val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass) val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
def run(spark: SparkSession, workingPath: String, targetPath: String): Unit = { def run(spark: SparkSession, workingPath: String, targetPath: String): Unit = {
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication] implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
import spark.implicits._ import spark.implicits._
val dataset: Dataset[ORCIDItem] = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem] val dataset: Dataset[ORCIDItem] =
spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem]
logger.info("Converting ORCID to OAF") logger.info("Converting ORCID to OAF")
dataset.map(o => ORCIDToOAF.convertTOOAF(o)).write.mode(SaveMode.Overwrite).save(targetPath) dataset.map(o => ORCIDToOAF.convertTOOAF(o)).write.mode(SaveMode.Overwrite).save(targetPath)
@ -22,15 +22,21 @@ object SparkConvertORCIDToOAF {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_orcid_to_oaf_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
SparkConvertORCIDToOAF.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/convert_orcid_to_oaf_params.json"
)
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
val workingPath = parser.get("workingPath") val workingPath = parser.get("workingPath")
val targetPath = parser.get("targetPath") val targetPath = parser.get("targetPath")

View File

@ -17,45 +17,72 @@ object SparkPreprocessORCID {
} }
def run(spark: SparkSession, sourcePath: String, workingPath: String): Unit = { def run(spark: SparkSession, sourcePath: String, workingPath: String): Unit = {
import spark.implicits._ import spark.implicits._
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication] implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
val inputRDD: RDD[OrcidAuthor] = spark.sparkContext.textFile(s"$sourcePath/authors").map(s => ORCIDToOAF.convertORCIDAuthor(s)).filter(s => s != null).filter(s => ORCIDToOAF.authorValid(s)) val inputRDD: RDD[OrcidAuthor] = spark.sparkContext
.textFile(s"$sourcePath/authors")
.map(s => ORCIDToOAF.convertORCIDAuthor(s))
.filter(s => s != null)
.filter(s => ORCIDToOAF.authorValid(s))
spark.createDataset(inputRDD).as[OrcidAuthor].write.mode(SaveMode.Overwrite).save(s"$workingPath/author") spark
.createDataset(inputRDD)
.as[OrcidAuthor]
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/author")
val res = spark.sparkContext.textFile(s"$sourcePath/works").flatMap(s => ORCIDToOAF.extractDOIWorks(s)).filter(s => s != null) val res = spark.sparkContext
.textFile(s"$sourcePath/works")
.flatMap(s => ORCIDToOAF.extractDOIWorks(s))
.filter(s => s != null)
spark.createDataset(res).as[OrcidWork].write.mode(SaveMode.Overwrite).save(s"$workingPath/works") spark
.createDataset(res)
.as[OrcidWork]
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/works")
val authors: Dataset[OrcidAuthor] = spark.read.load(s"$workingPath/author").as[OrcidAuthor] val authors: Dataset[OrcidAuthor] = spark.read.load(s"$workingPath/author").as[OrcidAuthor]
val works: Dataset[OrcidWork] = spark.read.load(s"$workingPath/works").as[OrcidWork] val works: Dataset[OrcidWork] = spark.read.load(s"$workingPath/works").as[OrcidWork]
works.joinWith(authors, authors("oid").equalTo(works("oid"))) works
.joinWith(authors, authors("oid").equalTo(works("oid")))
.map(i => { .map(i => {
val doi = i._1.doi val doi = i._1.doi
val author = i._2 val author = i._2
(doi, author) (doi, author)
}).groupBy(col("_1").alias("doi")) })
.agg(collect_list(col("_2")).alias("authors")).as[ORCIDItem] .groupBy(col("_1").alias("doi"))
.agg(collect_list(col("_2")).alias("authors"))
.as[ORCIDItem]
.map(s => fixORCIDItem(s)) .map(s => fixORCIDItem(s))
.write.mode(SaveMode.Overwrite).save(s"$workingPath/orcidworksWithAuthor") .write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/orcidworksWithAuthor")
} }
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/preprocess_orcid_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
SparkConvertORCIDToOAF.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/preprocess_orcid_params.json"
)
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
val sourcePath = parser.get("sourcePath") val sourcePath = parser.get("sourcePath")
val workingPath = parser.get("workingPath") val workingPath = parser.get("workingPath")
View File
@ -13,28 +13,35 @@ object SparkMapUnpayWallToOAF {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass) val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkMapDumpIntoOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_uw_to_oaf_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
SparkMapDumpIntoOAF.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/convert_uw_to_oaf_params.json"
)
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication] implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
val sourcePath = parser.get("sourcePath") val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath") val targetPath = parser.get("targetPath")
val inputRDD: RDD[String] = spark.sparkContext.textFile(s"$sourcePath") val inputRDD: RDD[String] = spark.sparkContext.textFile(s"$sourcePath")
logger.info("Converting UnpayWall to OAF") logger.info("Converting UnpayWall to OAF")
val d: Dataset[Publication] = spark.createDataset(inputRDD.map(UnpayWallToOAF.convertToOAF).filter(p => p != null)).as[Publication] val d: Dataset[Publication] = spark
.createDataset(inputRDD.map(UnpayWallToOAF.convertToOAF).filter(p => p != null))
.as[Publication]
d.write.mode(SaveMode.Overwrite).save(targetPath) d.write.mode(SaveMode.Overwrite).save(targetPath)
} }
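The conversion above builds a Dataset[Publication] from raw JSON lines with a kryo encoder, because Publication is a plain model class from the dhp schema rather than a Scala case class. A compact sketch of that pattern with a hypothetical stand-in class and converter (returning null for unparseable lines, as convertToOAF does):

import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}

class Record(val id: String) extends Serializable // stand-in for a non-case-class model such as Publication

object KryoDatasetSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("KryoDatasetSketch").getOrCreate()
    // product encoders only cover case classes and tuples, so fall back to kryo here
    implicit val recordEncoder: Encoder[Record] = Encoders.kryo[Record]

    val lines = spark.sparkContext.parallelize(Seq("""{"id":"a"}""", "garbage"))
    // hypothetical converter: null marks lines that could not be parsed
    val convert = (s: String) => if (s.startsWith("{")) new Record(s) else null
    val records: Dataset[Record] = spark.createDataset(lines.map(convert).filter(_ != null))

    println(records.count()) // 1
    spark.stop()
  }
}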
View File
@ -12,18 +12,22 @@ import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
case class OALocation(
evidence: Option[String],
case class OALocation(evidence:Option[String], host_type:Option[String], is_best:Option[Boolean], license: Option[String], pmh_id:Option[String], updated:Option[String], host_type: Option[String],
url:Option[String], url_for_landing_page:Option[String], url_for_pdf:Option[String], version:Option[String]) {} is_best: Option[Boolean],
license: Option[String],
pmh_id: Option[String],
updated: Option[String],
url: Option[String],
url_for_landing_page: Option[String],
url_for_pdf: Option[String],
version: Option[String]
) {}
object UnpayWallToOAF { object UnpayWallToOAF {
val logger: Logger = LoggerFactory.getLogger(getClass) val logger: Logger = LoggerFactory.getLogger(getClass)
def get_unpaywall_color(input: String): Option[OpenAccessRoute] = { def get_unpaywall_color(input: String): Option[OpenAccessRoute] = {
if (input == null || input.equalsIgnoreCase("close")) if (input == null || input.equalsIgnoreCase("close"))
return None return None
@ -38,7 +42,11 @@ object UnpayWallToOAF {
} }
def get_color(is_oa:Boolean, location: OALocation, journal_is_oa:Boolean):Option[OpenAccessRoute] = { def get_color(
is_oa: Boolean,
location: OALocation,
journal_is_oa: Boolean
): Option[OpenAccessRoute] = {
if (is_oa) { if (is_oa) {
if (location.host_type.isDefined) { if (location.host_type.isDefined) {
{ {
@ -62,7 +70,6 @@ object UnpayWallToOAF {
None None
} }
def convertToOAF(input: String): Publication = { def convertToOAF(input: String): Publication = {
val pub = new Publication val pub = new Publication
@ -122,7 +129,4 @@ object UnpayWallToOAF {
} }
} }
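OALocation above declares every UnpayWall location field as an Option, so members missing from the JSON simply extract to None. A small illustration of that behaviour with json4s (DefaultFormats appears elsewhere in this diff), on a reduced, hypothetical Location case class:

import org.json4s._
import org.json4s.jackson.JsonMethods._

// reduced stand-in for OALocation: all fields optional
case class Location(host_type: Option[String], license: Option[String], url: Option[String])

object LocationParseSketch {
  def main(args: Array[String]): Unit = {
    implicit val formats: Formats = DefaultFormats
    val json = """{"host_type":"publisher","url":"https://example.org/article.pdf"}"""
    val loc  = parse(json).extract[Location]
    println(loc.host_type) // Some(publisher)
    println(loc.license)   // None, because the field is absent from the input
  }
}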
View File
@ -9,12 +9,8 @@ class DoiBoostHostedByMapTest {
def idDSGeneration(): Unit = { def idDSGeneration(): Unit = {
val s = "doajarticles::0066-782X" val s = "doajarticles::0066-782X"
println(DoiBoostMappingUtil.generateDSId(s)) println(DoiBoostMappingUtil.generateDSId(s))
} }
} }
View File
@ -13,7 +13,6 @@ class NormalizeDOITest {
} }
@Test @Test
def doiFiltered(): Unit = { def doiFiltered(): Unit = {
val doi = "0.1042/BCJ20160876" val doi = "0.1042/BCJ20160876"
@ -28,7 +27,6 @@ class NormalizeDOITest {
assert(DoiBoostMappingUtil.normalizeDoi(doi) == null) assert(DoiBoostMappingUtil.normalizeDoi(doi) == null)
} }
@Test @Test
def doiCleaned(): Unit = { def doiCleaned(): Unit = {
val doi = "https://doi.org/10.1042/BCJ20160876" val doi = "https://doi.org/10.1042/BCJ20160876"
View File
@ -12,20 +12,24 @@ import scala.collection.JavaConverters._
import scala.io.Source import scala.io.Source
import scala.util.matching.Regex import scala.util.matching.Regex
class CrossrefMappingTest { class CrossrefMappingTest {
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass) val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
@Test @Test
def testFunderRelationshipsMapping(): Unit = { def testFunderRelationshipsMapping(): Unit = {
val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")).mkString val template = Source
val funder_doi = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi")).mkString .fromInputStream(
val funder_name = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi")).mkString getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")
)
.mkString
val funder_doi = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi"))
.mkString
val funder_name = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi"))
.mkString
for (line <- funder_doi.lines) { for (line <- funder_doi.lines) {
val json = template.replace("%s", line) val json = template.replace("%s", line)
@ -43,7 +47,8 @@ class CrossrefMappingTest {
def checkRelation(generatedOAF: List[Oaf]): Unit = { def checkRelation(generatedOAF: List[Oaf]): Unit = {
val rels: List[Relation] = generatedOAF.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]] val rels: List[Relation] =
generatedOAF.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]]
assertFalse(rels.isEmpty) assertFalse(rels.isEmpty)
rels.foreach(relation => { rels.foreach(relation => {
val relJson = mapper.writeValueAsString(relation) val relJson = mapper.writeValueAsString(relation)
@ -59,22 +64,22 @@ class CrossrefMappingTest {
} }
@Test @Test
def testSum(): Unit = { def testSum(): Unit = {
val from: Long = 1613135645000L val from: Long = 1613135645000L
val delta: Long = 1000000L val delta: Long = 1000000L
println(s"updating from value: $from -> ${from + delta}") println(s"updating from value: $from -> ${from + delta}")
} }
@Test @Test
def testOrcidID(): Unit = { def testOrcidID(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/orcid_data.json")).mkString val json = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/orcid_data.json")
)
.mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty); assertFalse(json.isEmpty);
@ -85,17 +90,18 @@ class CrossrefMappingTest {
val items = resultList.filter(p => p.isInstanceOf[Result]) val items = resultList.filter(p => p.isInstanceOf[Result])
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
items.foreach(p => println(mapper.writeValueAsString(p))) items.foreach(p => println(mapper.writeValueAsString(p)))
} }
@Test @Test
def testEmptyTitle(): Unit = { def testEmptyTitle(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/empty_title.json")).mkString val json = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/empty_title.json")
)
.mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty); assertFalse(json.isEmpty);
@ -106,17 +112,16 @@ class CrossrefMappingTest {
val items = resultList.filter(p => p.isInstanceOf[Result]) val items = resultList.filter(p => p.isInstanceOf[Result])
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
items.foreach(p => println(mapper.writeValueAsString(p))) items.foreach(p => println(mapper.writeValueAsString(p)))
} }
@Test @Test
def testPeerReviewed(): Unit = { def testPeerReviewed(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/prwTest.json")).mkString val json = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/prwTest.json"))
.mkString
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
assertNotNull(json) assertNotNull(json)
@ -128,12 +133,8 @@ class CrossrefMappingTest {
val items = resultList.filter(p => p.isInstanceOf[Result]) val items = resultList.filter(p => p.isInstanceOf[Result])
items.foreach(p => logger.info(mapper.writeValueAsString(p))) items.foreach(p => logger.info(mapper.writeValueAsString(p)))
} }
def extractECAward(award: String): String = { def extractECAward(award: String): String = {
@ -143,7 +144,6 @@ class CrossrefMappingTest {
null null
} }
@Test @Test
def extractECTest(): Unit = { def extractECTest(): Unit = {
val s = "FP7/2007-2013" val s = "FP7/2007-2013"
@ -152,12 +152,13 @@ class CrossrefMappingTest {
println(DHPUtils.md5(awardExtracted)) println(DHPUtils.md5(awardExtracted))
} }
@Test @Test
def testJournalRelation(): Unit = { def testJournalRelation(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/awardTest.json")).mkString val json = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/awardTest.json"))
.mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty) assertFalse(json.isEmpty)
@ -165,20 +166,19 @@ class CrossrefMappingTest {
val resultList: List[Oaf] = Crossref2Oaf.convert(json) val resultList: List[Oaf] = Crossref2Oaf.convert(json)
assertTrue(resultList.nonEmpty) assertTrue(resultList.nonEmpty)
val rels:List[Relation] = resultList.filter(p => p.isInstanceOf[Relation]).map(r=> r.asInstanceOf[Relation]) val rels: List[Relation] =
resultList.filter(p => p.isInstanceOf[Relation]).map(r => r.asInstanceOf[Relation])
rels.foreach(s => logger.info(s.getTarget)) rels.foreach(s => logger.info(s.getTarget))
assertEquals(rels.size, 6) assertEquals(rels.size, 6)
} }
@Test @Test
def testConvertBookFromCrossRef2Oaf(): Unit = { def testConvertBookFromCrossRef2Oaf(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/book.json")).mkString val json = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/book.json"))
.mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty); assertFalse(json.isEmpty);
@ -199,42 +199,62 @@ class CrossrefMappingTest {
assertNotNull(result.getDataInfo, "Datainfo test not null Failed"); assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
assertNotNull( assertNotNull(
result.getDataInfo.getProvenanceaction, result.getDataInfo.getProvenanceaction,
"DataInfo/Provenance test not null Failed"); "DataInfo/Provenance test not null Failed"
);
assertFalse( assertFalse(
result.getDataInfo.getProvenanceaction.getClassid.isEmpty, result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
"DataInfo/Provenance/classId test not null Failed"); "DataInfo/Provenance/classId test not null Failed"
);
assertFalse( assertFalse(
result.getDataInfo.getProvenanceaction.getClassname.isEmpty, result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
"DataInfo/Provenance/className test not null Failed"); "DataInfo/Provenance/className test not null Failed"
);
assertFalse( assertFalse(
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty, result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
"DataInfo/Provenance/SchemeId test not null Failed"); "DataInfo/Provenance/SchemeId test not null Failed"
);
assertFalse( assertFalse(
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty, result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
"DataInfo/Provenance/SchemeName test not null Failed"); "DataInfo/Provenance/SchemeName test not null Failed"
);
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed"); assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
assertFalse(result.getCollectedfrom.isEmpty); assertFalse(result.getCollectedfrom.isEmpty);
val collectedFromList = result.getCollectedfrom.asScala val collectedFromList = result.getCollectedfrom.asScala
assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from assertion") assert(
collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")),
assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion") "Wrong collected from assertion"
)
assert(
collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")),
"Wrong collected from assertion"
)
val relevantDates = result.getRelevantdate.asScala val relevantDates = result.getRelevantdate.asScala
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created") assert(
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")), "Missing relevant date of type published-online") relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")),
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")), "Missing relevant date of type published-print") "Missing relevant date of type created"
)
assert(
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")),
"Missing relevant date of type published-online"
)
assert(
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")),
"Missing relevant date of type published-print"
)
val rels = resultList.filter(p => p.isInstanceOf[Relation]) val rels = resultList.filter(p => p.isInstanceOf[Relation])
assert(rels.isEmpty) assert(rels.isEmpty)
} }
@Test @Test
def testConvertPreprintFromCrossRef2Oaf(): Unit = { def testConvertPreprintFromCrossRef2Oaf(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/preprint.json")).mkString val json = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/preprint.json"))
.mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty); assertFalse(json.isEmpty);
@ -255,44 +275,70 @@ class CrossrefMappingTest {
assertNotNull(result.getDataInfo, "Datainfo test not null Failed"); assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
assertNotNull( assertNotNull(
result.getDataInfo.getProvenanceaction, result.getDataInfo.getProvenanceaction,
"DataInfo/Provenance test not null Failed"); "DataInfo/Provenance test not null Failed"
);
assertFalse( assertFalse(
result.getDataInfo.getProvenanceaction.getClassid.isEmpty, result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
"DataInfo/Provenance/classId test not null Failed"); "DataInfo/Provenance/classId test not null Failed"
);
assertFalse( assertFalse(
result.getDataInfo.getProvenanceaction.getClassname.isEmpty, result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
"DataInfo/Provenance/className test not null Failed"); "DataInfo/Provenance/className test not null Failed"
);
assertFalse( assertFalse(
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty, result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
"DataInfo/Provenance/SchemeId test not null Failed"); "DataInfo/Provenance/SchemeId test not null Failed"
);
assertFalse( assertFalse(
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty, result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
"DataInfo/Provenance/SchemeName test not null Failed"); "DataInfo/Provenance/SchemeName test not null Failed"
);
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed"); assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
assertFalse(result.getCollectedfrom.isEmpty); assertFalse(result.getCollectedfrom.isEmpty);
val collectedFromList = result.getCollectedfrom.asScala val collectedFromList = result.getCollectedfrom.asScala
assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from assertion") assert(
collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")),
assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion") "Wrong collected from assertion"
)
assert(
collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")),
"Wrong collected from assertion"
)
val relevantDates = result.getRelevantdate.asScala val relevantDates = result.getRelevantdate.asScala
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created") assert(
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("available")), "Missing relevant date of type available") relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")),
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("accepted")), "Missing relevant date of type accepted") "Missing relevant date of type created"
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")), "Missing relevant date of type published-online") )
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")), "Missing relevant date of type published-print") assert(
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("available")),
"Missing relevant date of type available"
)
assert(
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("accepted")),
"Missing relevant date of type accepted"
)
assert(
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")),
"Missing relevant date of type published-online"
)
assert(
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")),
"Missing relevant date of type published-print"
)
val rels = resultList.filter(p => p.isInstanceOf[Relation]) val rels = resultList.filter(p => p.isInstanceOf[Relation])
assert(rels.isEmpty) assert(rels.isEmpty)
} }
@Test @Test
def testConvertDatasetFromCrossRef2Oaf(): Unit = { def testConvertDatasetFromCrossRef2Oaf(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dataset.json")).mkString val json = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dataset.json"))
.mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty); assertFalse(json.isEmpty);
@ -313,19 +359,24 @@ class CrossrefMappingTest {
assertNotNull(result.getDataInfo, "Datainfo test not null Failed"); assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
assertNotNull( assertNotNull(
result.getDataInfo.getProvenanceaction, result.getDataInfo.getProvenanceaction,
"DataInfo/Provenance test not null Failed"); "DataInfo/Provenance test not null Failed"
);
assertFalse( assertFalse(
result.getDataInfo.getProvenanceaction.getClassid.isEmpty, result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
"DataInfo/Provenance/classId test not null Failed"); "DataInfo/Provenance/classId test not null Failed"
);
assertFalse( assertFalse(
result.getDataInfo.getProvenanceaction.getClassname.isEmpty, result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
"DataInfo/Provenance/className test not null Failed"); "DataInfo/Provenance/className test not null Failed"
);
assertFalse( assertFalse(
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty, result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
"DataInfo/Provenance/SchemeId test not null Failed"); "DataInfo/Provenance/SchemeId test not null Failed"
);
assertFalse( assertFalse(
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty, result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
"DataInfo/Provenance/SchemeName test not null Failed"); "DataInfo/Provenance/SchemeName test not null Failed"
);
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed"); assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
assertFalse(result.getCollectedfrom.isEmpty); assertFalse(result.getCollectedfrom.isEmpty);
@ -333,7 +384,9 @@ class CrossrefMappingTest {
@Test @Test
def testConvertArticleFromCrossRef2Oaf(): Unit = { def testConvertArticleFromCrossRef2Oaf(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json")).mkString val json = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json"))
.mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty); assertFalse(json.isEmpty);
@ -354,32 +407,45 @@ class CrossrefMappingTest {
assertNotNull(result.getDataInfo, "Datainfo test not null Failed"); assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
assertNotNull( assertNotNull(
result.getDataInfo.getProvenanceaction, result.getDataInfo.getProvenanceaction,
"DataInfo/Provenance test not null Failed"); "DataInfo/Provenance test not null Failed"
);
assertFalse( assertFalse(
result.getDataInfo.getProvenanceaction.getClassid.isEmpty, result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
"DataInfo/Provenance/classId test not null Failed"); "DataInfo/Provenance/classId test not null Failed"
);
assertFalse( assertFalse(
result.getDataInfo.getProvenanceaction.getClassname.isEmpty, result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
"DataInfo/Provenance/className test not null Failed"); "DataInfo/Provenance/className test not null Failed"
);
assertFalse( assertFalse(
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty, result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
"DataInfo/Provenance/SchemeId test not null Failed"); "DataInfo/Provenance/SchemeId test not null Failed"
);
assertFalse( assertFalse(
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty, result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
"DataInfo/Provenance/SchemeName test not null Failed"); "DataInfo/Provenance/SchemeName test not null Failed"
);
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed"); assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
assertFalse(result.getCollectedfrom.isEmpty); assertFalse(result.getCollectedfrom.isEmpty);
val collectedFromList = result.getCollectedfrom.asScala val collectedFromList = result.getCollectedfrom.asScala
assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from assertion") assert(
collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")),
assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion") "Wrong collected from assertion"
)
assert(
collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")),
"Wrong collected from assertion"
)
val relevantDates = result.getRelevantdate.asScala val relevantDates = result.getRelevantdate.asScala
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created") assert(
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")),
"Missing relevant date of type created"
)
val rels = resultList.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]] val rels = resultList.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]]
assertFalse(rels.isEmpty) assertFalse(rels.isEmpty)
@ -393,15 +459,14 @@ class CrossrefMappingTest {
}) })
} }
@Test @Test
def testSetDateOfAcceptanceCrossRef2Oaf(): Unit = { def testSetDateOfAcceptanceCrossRef2Oaf(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dump_file.json")).mkString val json = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dump_file.json"))
.mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty); assertFalse(json.isEmpty);
@ -421,8 +486,13 @@ class CrossrefMappingTest {
@Test @Test
def testNormalizeDOI(): Unit = { def testNormalizeDOI(): Unit = {
val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")).mkString val template = Source
val line :String = "\"funder\": [{\"name\": \"Wellcome Trust Masters Fellowship\",\"award\": [\"090633\"]}]," .fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")
)
.mkString
val line: String =
"\"funder\": [{\"name\": \"Wellcome Trust Masters Fellowship\",\"award\": [\"090633\"]}],"
val json = template.replace("%s", line) val json = template.replace("%s", line)
val resultList: List[Oaf] = Crossref2Oaf.convert(json) val resultList: List[Oaf] = Crossref2Oaf.convert(json)
assertTrue(resultList.nonEmpty) assertTrue(resultList.nonEmpty)
@ -431,13 +501,17 @@ class CrossrefMappingTest {
result.getPid.asScala.foreach(pid => assertTrue(pid.getQualifier.getClassid.equals("doi"))) result.getPid.asScala.foreach(pid => assertTrue(pid.getQualifier.getClassid.equals("doi")))
assertTrue(result.getPid.size() == 1) assertTrue(result.getPid.size() == 1)
result.getPid.asScala.foreach(pid => assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase()))) result.getPid.asScala.foreach(pid =>
assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase()))
)
} }
@Test @Test
def testNormalizeDOI2(): Unit = { def testNormalizeDOI2(): Unit = {
val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json")).mkString val template = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json"))
.mkString
val resultList: List[Oaf] = Crossref2Oaf.convert(template) val resultList: List[Oaf] = Crossref2Oaf.convert(template)
assertTrue(resultList.nonEmpty) assertTrue(resultList.nonEmpty)
@ -446,14 +520,19 @@ class CrossrefMappingTest {
result.getPid.asScala.foreach(pid => assertTrue(pid.getQualifier.getClassid.equals("doi"))) result.getPid.asScala.foreach(pid => assertTrue(pid.getQualifier.getClassid.equals("doi")))
assertTrue(result.getPid.size() == 1) assertTrue(result.getPid.size() == 1)
result.getPid.asScala.foreach(pid => assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase()))) result.getPid.asScala.foreach(pid =>
assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase()))
)
} }
@Test @Test
def testLicenseVorClosed(): Unit = { def testLicenseVorClosed(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_vor.json")).mkString val json = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_vor.json")
)
.mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty); assertFalse(json.isEmpty);
@ -462,25 +541,28 @@ class CrossrefMappingTest {
assertTrue(resultList.nonEmpty) assertTrue(resultList.nonEmpty)
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result] val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
println(mapper.writeValueAsString(item)) println(mapper.writeValueAsString(item))
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://www.springer.com/vor"))) assertTrue(
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("CLOSED"))) item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://www.springer.com/vor"))
)
assertTrue(
item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("CLOSED"))
)
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null)) assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
} }
@Test @Test
def testLicenseOpen(): Unit = { def testLicenseOpen(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_open.json")).mkString val json = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_open.json")
)
.mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty); assertFalse(json.isEmpty);
@ -489,12 +571,19 @@ class CrossrefMappingTest {
assertTrue(resultList.nonEmpty) assertTrue(resultList.nonEmpty)
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result] val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("http://pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html"))) assertTrue(
item.getInstance().asScala exists (i =>
i.getLicense.getValue.equals(
"http://pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html"
)
)
)
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN"))) assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN")))
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid)) assertTrue(
item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid)
)
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
println(mapper.writeValueAsString(item)) println(mapper.writeValueAsString(item))
@ -502,8 +591,13 @@ class CrossrefMappingTest {
@Test @Test
def testLicenseEmbargoOpen(): Unit = { def testLicenseEmbargoOpen(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo_open.json")).mkString val json = Source
.fromInputStream(
getClass.getResourceAsStream(
"/eu/dnetlib/doiboost/crossref/publication_license_embargo_open.json"
)
)
.mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty); assertFalse(json.isEmpty);
@ -512,12 +606,19 @@ class CrossrefMappingTest {
assertTrue(resultList.nonEmpty) assertTrue(resultList.nonEmpty)
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result] val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"))) assertTrue(
item.getInstance().asScala exists (i =>
i.getLicense.getValue.equals(
"https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"
)
)
)
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN"))) assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN")))
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid)) assertTrue(
item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid)
)
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
println(mapper.writeValueAsString(item)) println(mapper.writeValueAsString(item))
@ -525,8 +626,13 @@ class CrossrefMappingTest {
@Test @Test
def testLicenseEmbargo(): Unit = { def testLicenseEmbargo(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo.json")).mkString val json = Source
.fromInputStream(
getClass.getResourceAsStream(
"/eu/dnetlib/doiboost/crossref/publication_license_embargo.json"
)
)
.mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty); assertFalse(json.isEmpty);
@ -535,22 +641,33 @@ class CrossrefMappingTest {
assertTrue(resultList.nonEmpty) assertTrue(resultList.nonEmpty)
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result] val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"))) assertTrue(
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO"))) item.getInstance().asScala exists (i =>
i.getLicense.getValue.equals(
"https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"
)
)
)
assertTrue(
item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO"))
)
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null)) assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
println(mapper.writeValueAsString(item)) println(mapper.writeValueAsString(item))
} }
@Test @Test
def testLicenseEmbargoDateTime(): Unit = { def testLicenseEmbargoDateTime(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo_datetime.json")).mkString val json = Source
.fromInputStream(
getClass.getResourceAsStream(
"/eu/dnetlib/doiboost/crossref/publication_license_embargo_datetime.json"
)
)
.mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty); assertFalse(json.isEmpty);
@ -559,11 +676,18 @@ class CrossrefMappingTest {
assertTrue(resultList.nonEmpty) assertTrue(resultList.nonEmpty)
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result] val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"))) assertTrue(
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO"))) item.getInstance().asScala exists (i =>
i.getLicense.getValue.equals(
"https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"
)
)
)
assertTrue(
item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO"))
)
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null)) assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
println(mapper.writeValueAsString(item)) println(mapper.writeValueAsString(item))
@ -572,8 +696,11 @@ class CrossrefMappingTest {
@Test @Test
def testMultipleURLs(): Unit = { def testMultipleURLs(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/multiple_urls.json")).mkString val json = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/multiple_urls.json")
)
.mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty); assertFalse(json.isEmpty);
@ -582,12 +709,14 @@ class CrossrefMappingTest {
assertTrue(resultList.nonEmpty) assertTrue(resultList.nonEmpty)
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result] val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
assertEquals(1, item.getInstance().size()) assertEquals(1, item.getInstance().size())
assertEquals(1, item.getInstance().get(0).getUrl().size()) assertEquals(1, item.getInstance().get(0).getUrl().size())
assertEquals("https://doi.org/10.1016/j.jas.2019.105013", item.getInstance().get(0).getUrl().get(0)) assertEquals(
"https://doi.org/10.1016/j.jas.2019.105013",
item.getInstance().get(0).getUrl().get(0)
)
//println(mapper.writeValueAsString(item)) //println(mapper.writeValueAsString(item))
} }
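The reflowed assertions above repeat the same relevant-date check for each classid (created, published-online, published-print, available, accepted). Purely as an illustration, not part of this commit, a hypothetical helper that would factor out that repetition; the Result import path is assumed from the surrounding tests:

import eu.dnetlib.dhp.schema.oaf.Result // assumed package, as used by these tests
import org.junit.jupiter.api.Assertions.assertTrue

import scala.collection.JavaConverters._

object RelevantDateAssertions {
  // hypothetical helper: assert that the result carries a relevant date for every given classid
  def assertHasRelevantDates(result: Result, classids: String*): Unit = {
    val dates = result.getRelevantdate.asScala
    classids.foreach { id =>
      assertTrue(
        dates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase(id)),
        s"Missing relevant date of type $id"
      )
    }
  }
}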
View File
@ -12,29 +12,21 @@ import org.slf4j.{Logger, LoggerFactory}
import java.sql.Timestamp import java.sql.Timestamp
import scala.io.Source import scala.io.Source
class MAGMappingTest { class MAGMappingTest {
val logger: Logger = LoggerFactory.getLogger(getClass) val logger: Logger = LoggerFactory.getLogger(getClass)
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
@Test @Test
def testSplitter(): Unit = { def testSplitter(): Unit = {
val s = "sports.team" val s = "sports.team"
if (s.contains(".")) { if (s.contains(".")) {
println(s.split("\\.") head) println(s.split("\\.") head)
} }
} }
@Test @Test
def testDate(): Unit = { def testDate(): Unit = {
@ -44,11 +36,11 @@ class MAGMappingTest {
} }
@Test @Test
def buildInvertedIndexTest(): Unit = { def buildInvertedIndexTest(): Unit = {
val json_input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/mag/invertedIndex.json")).mkString val json_input = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/mag/invertedIndex.json"))
.mkString
val description = ConversionUtil.convertInvertedIndexString(json_input) val description = ConversionUtil.convertInvertedIndexString(json_input)
assertNotNull(description) assertNotNull(description)
assertTrue(description.nonEmpty) assertTrue(description.nonEmpty)
@ -56,11 +48,10 @@ class MAGMappingTest {
logger.debug(description) logger.debug(description)
} }
@Test @Test
def normalizeDoiTest(): Unit = { def normalizeDoiTest(): Unit = {
implicit val formats = DefaultFormats implicit val formats = DefaultFormats
val conf = new SparkConf() val conf = new SparkConf()
@ -78,7 +69,8 @@ class MAGMappingTest {
val schema = Encoders.product[MagPapers].schema val schema = Encoders.product[MagPapers].schema
import spark.implicits._ import spark.implicits._
val magPapers :Dataset[MagPapers] = spark.read.option("multiline",true).schema(schema).json(path).as[MagPapers] val magPapers: Dataset[MagPapers] =
spark.read.option("multiline", true).schema(schema).json(path).as[MagPapers]
val ret: Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers) val ret: Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
assertTrue(ret.count == 10) assertTrue(ret.count == 10)
ret.take(10).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase()))) ret.take(10).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase())))
@ -108,7 +100,8 @@ class MAGMappingTest {
val schema = Encoders.product[MagPapers].schema val schema = Encoders.product[MagPapers].schema
import spark.implicits._ import spark.implicits._
val magPapers :Dataset[MagPapers] = spark.read.option("multiline",true).schema(schema).json(path).as[MagPapers] val magPapers: Dataset[MagPapers] =
spark.read.option("multiline", true).schema(schema).json(path).as[MagPapers]
val ret: Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers) val ret: Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
assertTrue(ret.count == 8) assertTrue(ret.count == 8)
ret.take(8).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase()))) ret.take(8).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase())))
@ -116,7 +109,4 @@ class MAGMappingTest {
//ret.take(8).foreach(mp => println(write(mp))) //ret.take(8).foreach(mp => println(write(mp)))
} }
} }
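The MAG tests above read typed JSON by deriving the expected schema from the MagPapers case class instead of letting Spark infer it. A minimal sketch of that reading pattern with a toy Paper case class and a hypothetical input path:

import org.apache.spark.sql.{Dataset, Encoders, SparkSession}

case class Paper(Doi: String, Title: String) // toy stand-in for MagPapers

object SchemaReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("SchemaReadSketch").getOrCreate()
    import spark.implicits._

    // fixing the schema up front keeps missing or mistyped columns from silently changing the types
    val schema = Encoders.product[Paper].schema
    val papers: Dataset[Paper] = spark.read
      .option("multiline", true)
      .schema(schema)
      .json("/tmp/papers.json") // hypothetical path
      .as[Paper]

    papers.show()
    spark.stop()
  }
}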
View File
@ -20,7 +20,9 @@ class MappingORCIDToOAFTest {
@Test @Test
def testExtractData(): Unit = { def testExtractData(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/orcid/dataOutput")).mkString val json = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/orcid/dataOutput"))
.mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty) assertFalse(json.isEmpty)
json.lines.foreach(s => { json.lines.foreach(s => {
@ -52,12 +54,8 @@ class MappingORCIDToOAFTest {
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
val oA = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem].count() val oA = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem].count()
val p: Dataset[Publication] = spark.read.load(targetPath).as[Publication] val p: Dataset[Publication] = spark.read.load(targetPath).as[Publication]
assertTrue(oA == p.count()) assertTrue(oA == p.count())
@ -65,17 +63,16 @@ class MappingORCIDToOAFTest {
spark.close() spark.close()
} }
@Test @Test
def testExtractDat1(): Unit = { def testExtractDat1(): Unit = {
val aList: List[OrcidAuthor] = List(
OrcidAuthor("0000-0002-4335-5309", Some("Lucrecia"), Some("Curto"), null, null, null),
val aList: List[OrcidAuthor] = List(OrcidAuthor("0000-0002-4335-5309", Some("Lucrecia"), Some("Curto"), null, null, null ), OrcidAuthor("0000-0001-7501-3330", Some("Emilio"), Some("Malchiodi"), null, null, null),
OrcidAuthor("0000-0001-7501-3330", Some("Emilio"), Some("Malchiodi"), null, null, null ), OrcidAuthor("0000-0002-5490-9186", Some("Sofia"), Some("Noli Truant"), null, null, null )) OrcidAuthor("0000-0002-5490-9186", Some("Sofia"), Some("Noli Truant"), null, null, null)
)
val orcid: ORCIDItem = ORCIDItem("10.1042/BCJ20160876", aList) val orcid: ORCIDItem = ORCIDItem("10.1042/BCJ20160876", aList)
@ -85,10 +82,6 @@ class MappingORCIDToOAFTest {
oaf.getPid.toList.foreach(pid => assert(pid.getValue.equals("10.1042/BCJ20160876"))) oaf.getPid.toList.foreach(pid => assert(pid.getValue.equals("10.1042/BCJ20160876")))
//println(mapper.writeValueAsString(ORCIDToOAF.convertTOOAF(orcid))) //println(mapper.writeValueAsString(ORCIDToOAF.convertTOOAF(orcid)))
} }
} }
View File
@ -14,11 +14,12 @@ class UnpayWallMappingTest {
val logger: Logger = LoggerFactory.getLogger(getClass) val logger: Logger = LoggerFactory.getLogger(getClass)
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
@Test @Test
def testMappingToOAF(): Unit = { def testMappingToOAF(): Unit = {
val Ilist = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/uw/input.json")).mkString val Ilist = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/uw/input.json"))
.mkString
var i: Int = 0 var i: Int = 0
for (line <- Ilist.lines) { for (line <- Ilist.lines) {
@ -42,13 +43,14 @@ class UnpayWallMappingTest {
i = i + 1 i = i + 1
} }
val l = Ilist.lines.next() val l = Ilist.lines.next()
val item = UnpayWallToOAF.convertToOAF(l) val item = UnpayWallToOAF.convertToOAF(l)
assertEquals(item.getInstance().get(0).getAccessright.getOpenAccessRoute, OpenAccessRoute.bronze) assertEquals(
item.getInstance().get(0).getAccessright.getOpenAccessRoute,
OpenAccessRoute.bronze
)
logger.info(mapper.writeValueAsString(item)) logger.info(mapper.writeValueAsString(item))
View File
@ -4,17 +4,29 @@ import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn} import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn}
case class HostedByItemType(
id: String,
officialname: String,
issn: String,
eissn: String,
lissn: String,
openAccess: Boolean
) {}
case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {} case class HostedByInfo(
case class HostedByInfo(id: String, officialname: String, journal_id: String, provenance : String, id_type: String) {} id: String,
officialname: String,
journal_id: String,
provenance: String,
id_type: String
) {}
object Aggregators { object Aggregators {
def getId(s1: String, s2: String): String = { def getId(s1: String, s2: String): String = {
if (s1.startsWith("10|")) { if (s1.startsWith("10|")) {
return s1} return s1
}
s2 s2
} }
@ -25,24 +37,40 @@ object Aggregators {
s2 s2
} }
def explodeHostedByItemType(
def explodeHostedByItemType(df: Dataset[(String, HostedByItemType)]): Dataset[(String, HostedByItemType)] = { df: Dataset[(String, HostedByItemType)]
): Dataset[(String, HostedByItemType)] = {
val transformedData: Dataset[(String, HostedByItemType)] = df val transformedData: Dataset[(String, HostedByItemType)] = df
.groupByKey(_._1)(Encoders.STRING) .groupByKey(_._1)(Encoders.STRING)
.agg(Aggregators.hostedByAggregator) .agg(Aggregators.hostedByAggregator)
.map{ .map { case (id: String, res: (String, HostedByItemType)) =>
case (id:String , res:(String, HostedByItemType)) => res res
}(Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])) }(Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]))
transformedData transformedData
} }
val hostedByAggregator: TypedColumn[(String, HostedByItemType), (String, HostedByItemType)] = new Aggregator[(String, HostedByItemType), (String, HostedByItemType), (String, HostedByItemType)] { val hostedByAggregator: TypedColumn[(String, HostedByItemType), (String, HostedByItemType)] =
override def zero: (String, HostedByItemType) = ("", HostedByItemType("","","","","",false)) new Aggregator[
override def reduce(b: (String, HostedByItemType), a:(String,HostedByItemType)): (String, HostedByItemType) = { (String, HostedByItemType),
(String, HostedByItemType),
(String, HostedByItemType)
] {
override def zero: (String, HostedByItemType) =
("", HostedByItemType("", "", "", "", "", false))
override def reduce(
b: (String, HostedByItemType),
a: (String, HostedByItemType)
): (String, HostedByItemType) = {
return merge(b, a) return merge(b, a)
} }
override def merge(b1: (String, HostedByItemType), b2: (String, HostedByItemType)): (String, HostedByItemType) = {
override def merge(
b1: (String, HostedByItemType),
b2: (String, HostedByItemType)
): (String, HostedByItemType) = {
if (b1 == null) { if (b1 == null) {
return b2 return b2
} }
@ -50,27 +78,51 @@ object Aggregators {
return b1 return b1
} }
if (b1._2.id.startsWith("10|")) { if (b1._2.id.startsWith("10|")) {
return (b1._1, HostedByItemType(b1._2.id, b1._2.officialname, b1._2.issn, b1._2.eissn, b1._2.lissn, b1._2.openAccess || b2._2.openAccess)) return (
b1._1,
HostedByItemType(
b1._2.id,
b1._2.officialname,
b1._2.issn,
b1._2.eissn,
b1._2.lissn,
b1._2.openAccess || b2._2.openAccess
)
)
} }
return (b2._1, HostedByItemType(b2._2.id, b2._2.officialname, b2._2.issn, b2._2.eissn, b2._2.lissn, b1._2.openAccess || b2._2.openAccess)) return (
b2._1,
HostedByItemType(
b2._2.id,
b2._2.officialname,
b2._2.issn,
b2._2.eissn,
b2._2.lissn,
b1._2.openAccess || b2._2.openAccess
)
)
} }
override def finish(reduction: (String,HostedByItemType)): (String, HostedByItemType) = reduction
override def bufferEncoder: Encoder[(String,HostedByItemType)] = Encoders.tuple(Encoders.STRING,Encoders.product[HostedByItemType])
override def outputEncoder: Encoder[(String,HostedByItemType)] = Encoders.tuple(Encoders.STRING,Encoders.product[HostedByItemType]) override def finish(reduction: (String, HostedByItemType)): (String, HostedByItemType) =
reduction
override def bufferEncoder: Encoder[(String, HostedByItemType)] =
Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
override def outputEncoder: Encoder[(String, HostedByItemType)] =
Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
}.toColumn }.toColumn
def resultToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] =
new Aggregator[EntityInfo, EntityInfo, EntityInfo] {
def resultToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] = new Aggregator[EntityInfo, EntityInfo, EntityInfo]{
override def zero: EntityInfo = EntityInfo.newInstance("", "", "") override def zero: EntityInfo = EntityInfo.newInstance("", "", "")
override def reduce(b: EntityInfo, a: EntityInfo): EntityInfo = { override def reduce(b: EntityInfo, a: EntityInfo): EntityInfo = {
return merge(b, a) return merge(b, a)
} }
override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = { override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = {
if (b1 == null) { if (b1 == null) {
return b2 return b2
@ -96,19 +148,21 @@ object Aggregators {
val transformedData: Dataset[EntityInfo] = df val transformedData: Dataset[EntityInfo] = df
.groupByKey(_.getId)(Encoders.STRING) .groupByKey(_.getId)(Encoders.STRING)
.agg(Aggregators.resultToSingleIdAggregator) .agg(Aggregators.resultToSingleIdAggregator)
.map{ .map { case (id: String, res: EntityInfo) =>
case (id:String , res: EntityInfo) => res res
}(Encoders.bean(classOf[EntityInfo])) }(Encoders.bean(classOf[EntityInfo]))
transformedData transformedData
} }
def datasourceToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] = new Aggregator[EntityInfo, EntityInfo, EntityInfo]{ def datasourceToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] =
new Aggregator[EntityInfo, EntityInfo, EntityInfo] {
override def zero: EntityInfo = EntityInfo.newInstance("", "", "") override def zero: EntityInfo = EntityInfo.newInstance("", "", "")
override def reduce(b: EntityInfo, a: EntityInfo): EntityInfo = { override def reduce(b: EntityInfo, a: EntityInfo): EntityInfo = {
return merge(b, a) return merge(b, a)
} }
override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = { override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = {
if (b1 == null) { if (b1 == null) {
return b2 return b2
@ -128,13 +182,12 @@ object Aggregators {
override def outputEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo]) override def outputEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
}.toColumn }.toColumn
def datasourceToSingleId(df: Dataset[EntityInfo]): Dataset[EntityInfo] = { def datasourceToSingleId(df: Dataset[EntityInfo]): Dataset[EntityInfo] = {
val transformedData: Dataset[EntityInfo] = df val transformedData: Dataset[EntityInfo] = df
.groupByKey(_.getHostedById)(Encoders.STRING) .groupByKey(_.getHostedById)(Encoders.STRING)
.agg(Aggregators.datasourceToSingleIdAggregator) .agg(Aggregators.datasourceToSingleIdAggregator)
.map{ .map { case (id: String, res: EntityInfo) =>
case (id:String , res: EntityInfo) => res res
}(Encoders.bean(classOf[EntityInfo])) }(Encoders.bean(classOf[EntityInfo]))
transformedData transformedData
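All the aggregators reformatted above share the same typed Aggregator shape: zero, reduce, merge and finish plus buffer and output encoders, turned into a TypedColumn with toColumn and applied after groupByKey. A self-contained sketch of that shape with a much simpler aggregator (keep the longest string per key); the prefix-based grouping key is only illustrative:

import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders, SparkSession, TypedColumn}

// minimal typed aggregator: keeps the longest string seen for a key
object LongestString extends Aggregator[String, String, String] {
  override def zero: String = ""
  override def reduce(acc: String, value: String): String = merge(acc, value)
  override def merge(b1: String, b2: String): String = if (b1.length >= b2.length) b1 else b2
  override def finish(reduction: String): String = reduction
  override def bufferEncoder: Encoder[String] = Encoders.STRING
  override def outputEncoder: Encoder[String] = Encoders.STRING
}

object AggregatorSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("AggregatorSketch").getOrCreate()
    import spark.implicits._

    val longest: TypedColumn[String, String] = LongestString.toColumn
    val ds = Seq("10|a", "10|abc", "20|x").toDS()
    // group by the two-character prefix and keep the longest value per group
    ds.groupByKey(_.take(2)).agg(longest).show()
    spark.stop()
  }
}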
View File
@ -14,7 +14,8 @@ import org.slf4j.{Logger, LoggerFactory}
object SparkApplyHostedByMapToDatasource { object SparkApplyHostedByMapToDatasource {
def applyHBtoDats(join: Dataset[EntityInfo], dats: Dataset[Datasource]): Dataset[Datasource] = { def applyHBtoDats(join: Dataset[EntityInfo], dats: Dataset[Datasource]): Dataset[Datasource] = {
dats.joinWith(join, dats.col("id").equalTo(join.col("hostedById")), "left") dats
.joinWith(join, dats.col("id").equalTo(join.col("hostedById")), "left")
.map(t2 => { .map(t2 => {
val d: Datasource = t2._1 val d: Datasource = t2._1
if (t2._2 != null) { if (t2._2 != null) {
@ -31,14 +32,21 @@ object SparkApplyHostedByMapToDatasource {
val logger: Logger = LoggerFactory.getLogger(getClass) val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json"
)
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
val graphPath = parser.get("graphPath") val graphPath = parser.get("graphPath")
val outputPath = parser.get("outputPath") val outputPath = parser.get("outputPath")
@ -51,20 +59,27 @@ object SparkApplyHostedByMapToDatasource {
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
val dats: Dataset[Datasource] = spark.read.textFile(graphPath + "/datasource") val dats: Dataset[Datasource] = spark.read
.textFile(graphPath + "/datasource")
.map(r => mapper.readValue(r, classOf[Datasource])) .map(r => mapper.readValue(r, classOf[Datasource]))
val pinfo: Dataset[EntityInfo] = Aggregators.datasourceToSingleId(spark.read.textFile(preparedInfoPath) val pinfo: Dataset[EntityInfo] = Aggregators.datasourceToSingleId(
.map(ei => mapper.readValue(ei, classOf[EntityInfo]))) spark.read
.textFile(preparedInfoPath)
.map(ei => mapper.readValue(ei, classOf[EntityInfo]))
)
applyHBtoDats(pinfo, dats).write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath) applyHBtoDats(pinfo, dats).write
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath)
spark.read.textFile(outputPath) spark.read
.textFile(outputPath)
.write .write
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.text(graphPath + "/datasource") .text(graphPath + "/datasource")
} }
} }
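applyHBtoDats above depends on the semantics of a "left" joinWith: datasources with no entry in the hosted-by map come back paired with null, which is why the mapping function checks t2._2 before enriching. A small sketch of that behaviour with hypothetical Ds and Info case classes:

import org.apache.spark.sql.SparkSession

case class Ds(id: String, name: String)
case class Info(hostedById: String, openAccess: Boolean)

object LeftJoinSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("LeftJoinSketch").getOrCreate()
    import spark.implicits._

    val dats = Seq(Ds("10|a", "A"), Ds("10|b", "B")).toDS()
    val info = Seq(Info("10|a", openAccess = true)).toDS()

    // unmatched left rows come back as (left, null), so guard before using the right side
    val enriched = dats
      .joinWith(info, dats("id") === info("hostedById"), "left")
      .map { case (d, i) => if (i != null && i.openAccess) d.copy(name = d.name + " [OA]") else d }

    enriched.show()
    spark.stop()
  }
}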
View File
@ -16,7 +16,8 @@ import scala.collection.JavaConverters._
object SparkApplyHostedByMapToResult { object SparkApplyHostedByMapToResult {
def applyHBtoPubs(join: Dataset[EntityInfo], pubs: Dataset[Publication]) = { def applyHBtoPubs(join: Dataset[EntityInfo], pubs: Dataset[Publication]) = {
pubs.joinWith(join, pubs.col("id").equalTo(join.col("id")), "left") pubs
.joinWith(join, pubs.col("id").equalTo(join.col("id")), "left")
.map(t2 => { .map(t2 => {
val p: Publication = t2._1 val p: Publication = t2._1
if (t2._2 != null) { if (t2._2 != null) {
@ -27,7 +28,14 @@ object SparkApplyHostedByMapToResult {
inst.getHostedby.setKey(ei.getHostedById) inst.getHostedby.setKey(ei.getHostedById)
inst.getHostedby.setValue(ei.getName) inst.getHostedby.setValue(ei.getName)
if (ei.getOpenAccess) { if (ei.getOpenAccess) {
inst.setAccessright(OafMapperUtils.accessRight(ModelConstants.ACCESS_RIGHT_OPEN, "Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)) inst.setAccessright(
OafMapperUtils.accessRight(
ModelConstants.ACCESS_RIGHT_OPEN,
"Open Access",
ModelConstants.DNET_ACCESS_MODES,
ModelConstants.DNET_ACCESS_MODES
)
)
inst.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold) inst.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
p.setBestaccessright(OafMapperUtils.createBestAccessRights(p.getInstance())); p.setBestaccessright(OafMapperUtils.createBestAccessRights(p.getInstance()));
} }
@ -40,46 +48,54 @@ object SparkApplyHostedByMapToResult {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass) val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json"
)
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
val graphPath = parser.get("graphPath") val graphPath = parser.get("graphPath")
val outputPath = parser.get("outputPath") val outputPath = parser.get("outputPath")
val preparedInfoPath = parser.get("preparedInfoPath") val preparedInfoPath = parser.get("preparedInfoPath")
implicit val formats = DefaultFormats implicit val formats = DefaultFormats
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.bean(classOf[Publication]) implicit val mapEncoderPubs: Encoder[Publication] = Encoders.bean(classOf[Publication])
implicit val mapEncoderEinfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo]) implicit val mapEncoderEinfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
val pubs: Dataset[Publication] = spark.read.textFile(graphPath + "/publication") val pubs: Dataset[Publication] = spark.read
.textFile(graphPath + "/publication")
.map(r => mapper.readValue(r, classOf[Publication])) .map(r => mapper.readValue(r, classOf[Publication]))
val pinfo: Dataset[EntityInfo] = spark.read.textFile(preparedInfoPath) val pinfo: Dataset[EntityInfo] = spark.read
.textFile(preparedInfoPath)
.map(ei => mapper.readValue(ei, classOf[EntityInfo])) .map(ei => mapper.readValue(ei, classOf[EntityInfo]))
applyHBtoPubs(pinfo, pubs).write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath) applyHBtoPubs(pinfo, pubs).write
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath)
spark.read.textFile(outputPath) spark.read
.textFile(outputPath)
.write .write
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.text(graphPath + "/publication") .text(graphPath + "/publication")
} }
} }
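
Both apply jobs above first write the patched records as gzipped JSON to outputPath and then, in a second pass, read that output back and overwrite the original graph table. A condensed sketch of the copy-back step follows; the paths are placeholders, and the two-phase shape is presumably there because Spark cannot read from and overwrite the same path inside a single job.

import org.apache.spark.sql.{SaveMode, SparkSession}

object TwoPhaseOverwriteSketch {

  def copyBack(spark: SparkSession, tmpOutputPath: String, graphTablePath: String): Unit =
    spark.read
      .textFile(tmpOutputPath) // gzipped JSON produced by the previous step
      .write
      .mode(SaveMode.Overwrite)
      .option("compression", "gzip")
      .text(graphTablePath) // replaces the original table in place
}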

View File

@ -19,7 +19,6 @@ object SparkPrepareHostedByInfoToApply {
def getList(id: String, j: Journal, name: String): List[EntityInfo] = { def getList(id: String, j: Journal, name: String): List[EntityInfo] = {
var lst: List[EntityInfo] = List() var lst: List[EntityInfo] = List()
if (j.getIssnLinking != null && !j.getIssnLinking.equals("")) { if (j.getIssnLinking != null && !j.getIssnLinking.equals("")) {
lst = EntityInfo.newInstance(id, j.getIssnLinking, name) :: lst lst = EntityInfo.newInstance(id, j.getIssnLinking, name) :: lst
} }
@ -37,14 +36,14 @@ object SparkPrepareHostedByInfoToApply {
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
val dd: Dataset[Publication] = spark.read.textFile(publicationPath) val dd: Dataset[Publication] = spark.read
.textFile(publicationPath)
.map(r => mapper.readValue(r, classOf[Publication])) .map(r => mapper.readValue(r, classOf[Publication]))
dd.filter(p => p.getJournal != null).flatMap(p => getList(p.getId, p.getJournal, "")) dd.filter(p => p.getJournal != null).flatMap(p => getList(p.getId, p.getJournal, ""))
} }
def toEntityInfo(input: String): EntityInfo = { def toEntityInfo(input: String): EntityInfo = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -53,7 +52,6 @@ object SparkPrepareHostedByInfoToApply {
toEntityItem(c.keys.head, c.values.head) toEntityItem(c.keys.head, c.values.head)
} }
def toEntityItem(journal_id: String, hbi: HostedByItemType): EntityInfo = { def toEntityItem(journal_id: String, hbi: HostedByItemType): EntityInfo = {
EntityInfo.newInstance(hbi.id, journal_id, hbi.officialname, hbi.openAccess) EntityInfo.newInstance(hbi.id, journal_id, hbi.officialname, hbi.openAccess)
@ -61,7 +59,9 @@ object SparkPrepareHostedByInfoToApply {
} }
def joinResHBM(res: Dataset[EntityInfo], hbm: Dataset[EntityInfo]): Dataset[EntityInfo] = { def joinResHBM(res: Dataset[EntityInfo], hbm: Dataset[EntityInfo]): Dataset[EntityInfo] = {
Aggregators.resultToSingleId(res.joinWith(hbm, res.col("journalId").equalTo(hbm.col("journalId")), "left") Aggregators.resultToSingleId(
res
.joinWith(hbm, res.col("journalId").equalTo(hbm.col("journalId")), "left")
.map(t2 => { .map(t2 => {
val res: EntityInfo = t2._1 val res: EntityInfo = t2._1
if (t2._2 != null) { if (t2._2 != null) {
@ -71,52 +71,57 @@ object SparkPrepareHostedByInfoToApply {
res.setName(ds.getName) res.setName(ds.getName)
} }
res res
})) })
)
} }
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass) val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_prepare_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_prepare_params.json"
)
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
val graphPath = parser.get("graphPath") val graphPath = parser.get("graphPath")
val outputPath = parser.get("preparedInfoPath") val outputPath = parser.get("preparedInfoPath")
val hostedByMapPath = parser.get("hostedByMapPath") val hostedByMapPath = parser.get("hostedByMapPath")
implicit val formats = DefaultFormats implicit val formats = DefaultFormats
logger.info("Getting the Datasources") logger.info("Getting the Datasources")
import spark.implicits._ import spark.implicits._
//STEP1: read the hostedbymap and transform it into EntityInfo //STEP1: read the hostedbymap and transform it into EntityInfo
val hostedByInfo: Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo) val hostedByInfo: Dataset[EntityInfo] =
spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo)
//STEP2: create association (publication, issn), (publication, eissn), (publication, lissn) //STEP2: create association (publication, issn), (publication, eissn), (publication, lissn)
val resultInfoDataset: Dataset[EntityInfo] = prepareResultInfo(spark, graphPath + "/publication") val resultInfoDataset: Dataset[EntityInfo] =
prepareResultInfo(spark, graphPath + "/publication")
//STEP3: left join resultInfo with hostedByInfo on journal_id. Reduction of all the results with the same id into just //STEP3: left join resultInfo with hostedByInfo on journal_id. Reduction of all the results with the same id into just
//one entry (one result could be associated to issn and eissn and so possibly matching more than once against the map) //one entry (one result could be associated to issn and eissn and so possibly matching more than once against the map)
//to this entry we add the id of the datasource for the next step //to this entry we add the id of the datasource for the next step
joinResHBM(resultInfoDataset, hostedByInfo) joinResHBM(resultInfoDataset, hostedByInfo).write
.write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath) .mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath)
} }
} }
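
The Aggregators.resultToSingleId helper called above is not part of this diff; the sketch below only illustrates the kind of reduction it performs, collapsing the several ISSN/EISSN/ISSNL matches of a result into one record via groupByKey/reduceGroups. Info is a simplified stand-in for EntityInfo and the tie-breaking rule is an assumption, not the actual aggregator logic.

import org.apache.spark.sql.{Dataset, SparkSession}

object SingleEntryPerIdSketch {

  // Simplified stand-in for EntityInfo after the left join with the hosted-by map.
  case class Info(id: String, journalId: String, hostedById: String)

  def toSingleId(spark: SparkSession, joined: Dataset[Info]): Dataset[Info] = {
    import spark.implicits._
    joined
      .groupByKey(_.id)
      // assumed tie-break: prefer an entry that actually resolved a datasource id
      .reduceGroups((a, b) => if (a.hostedById.nonEmpty) a else b)
      .map(_._2)
  }
}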

View File

@ -17,9 +17,8 @@ import java.io.PrintWriter
object SparkProduceHostedByMap { object SparkProduceHostedByMap {
implicit val tupleForJoinEncoder: Encoder[(String, HostedByItemType)] =
implicit val tupleForJoinEncoder: Encoder[(String, HostedByItemType)] = Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]) Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
def toHostedByItemType(input: ((HostedByInfo, HostedByInfo), HostedByInfo)): HostedByItemType = { def toHostedByItemType(input: ((HostedByInfo, HostedByInfo), HostedByInfo)): HostedByItemType = {
val openaire: HostedByInfo = input._1._1 val openaire: HostedByInfo = input._1._1
@ -28,9 +27,33 @@ object SparkProduceHostedByMap {
val isOpenAccess: Boolean = doaj == null && gold == null val isOpenAccess: Boolean = doaj == null && gold == null
openaire.journal_id match { openaire.journal_id match {
case Constants.ISSN => HostedByItemType(openaire.id, openaire.officialname, openaire.journal_id, "", "", isOpenAccess) case Constants.ISSN =>
case Constants.EISSN => HostedByItemType(openaire.id, openaire.officialname, "", openaire.journal_id, "", isOpenAccess) HostedByItemType(
case Constants.ISSNL => HostedByItemType(openaire.id, openaire.officialname, "", "", openaire.journal_id, isOpenAccess) openaire.id,
openaire.officialname,
openaire.journal_id,
"",
"",
isOpenAccess
)
case Constants.EISSN =>
HostedByItemType(
openaire.id,
openaire.officialname,
"",
openaire.journal_id,
"",
isOpenAccess
)
case Constants.ISSNL =>
HostedByItemType(
openaire.id,
openaire.officialname,
"",
"",
openaire.journal_id,
isOpenAccess
)
// catch the default with a variable so you can print it // catch the default with a variable so you can print it
case whoa => null case whoa => null
@ -46,11 +69,16 @@ object SparkProduceHostedByMap {
Serialization.write(map) Serialization.write(map)
} }
def getHostedByItemType(
def getHostedByItemType(id: String, officialname: String, issn: String, eissn: String, issnl: String, oa: Boolean): HostedByItemType = { id: String,
officialname: String,
issn: String,
eissn: String,
issnl: String,
oa: Boolean
): HostedByItemType = {
if (issn != null) { if (issn != null) {
if (eissn != null) { if (eissn != null) {
if (issnl != null) { if (issnl != null) {
@ -85,7 +113,14 @@ object SparkProduceHostedByMap {
def oaToHostedbyItemType(dats: Datasource): HostedByItemType = { def oaToHostedbyItemType(dats: Datasource): HostedByItemType = {
if (dats.getJournal != null) { if (dats.getJournal != null) {
return getHostedByItemType(dats.getId, dats.getOfficialname.getValue, dats.getJournal.getIssnPrinted, dats.getJournal.getIssnOnline, dats.getJournal.getIssnLinking, false) return getHostedByItemType(
dats.getId,
dats.getOfficialname.getValue,
dats.getJournal.getIssnPrinted,
dats.getJournal.getIssnOnline,
dats.getJournal.getIssnLinking,
false
)
} }
HostedByItemType("", "", "", "", "", false) HostedByItemType("", "", "", "", "", false)
} }
@ -94,32 +129,41 @@ object SparkProduceHostedByMap {
import spark.implicits._ import spark.implicits._
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
implicit var encoderD = Encoders.kryo[Datasource] implicit var encoderD = Encoders.kryo[Datasource]
val dd: Dataset[Datasource] = spark.read.textFile(datasourcePath) val dd: Dataset[Datasource] = spark.read
.textFile(datasourcePath)
.map(r => mapper.readValue(r, classOf[Datasource])) .map(r => mapper.readValue(r, classOf[Datasource]))
dd.map { ddt => oaToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals(""))) dd.map { ddt => oaToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
} }
def goldToHostedbyItemType(gold: UnibiGoldModel): HostedByItemType = { def goldToHostedbyItemType(gold: UnibiGoldModel): HostedByItemType = {
return getHostedByItemType(Constants.UNIBI, gold.getTitle, gold.getIssn, "", gold.getIssnL, true) return getHostedByItemType(
Constants.UNIBI,
gold.getTitle,
gold.getIssn,
"",
gold.getIssnL,
true
)
} }
def goldHostedByDataset(
def goldHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = { spark: SparkSession,
datasourcePath: String
): Dataset[HostedByItemType] = {
import spark.implicits._ import spark.implicits._
implicit val mapEncoderUnibi: Encoder[UnibiGoldModel] = Encoders.kryo[UnibiGoldModel] implicit val mapEncoderUnibi: Encoder[UnibiGoldModel] = Encoders.kryo[UnibiGoldModel]
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
val dd: Dataset[UnibiGoldModel] = spark.read.textFile(datasourcePath) val dd: Dataset[UnibiGoldModel] = spark.read
.textFile(datasourcePath)
.map(r => mapper.readValue(r, classOf[UnibiGoldModel])) .map(r => mapper.readValue(r, classOf[UnibiGoldModel]))
dd.map { ddt => goldToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals(""))) dd.map { ddt => goldToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
@ -128,17 +172,28 @@ object SparkProduceHostedByMap {
def doajToHostedbyItemType(doaj: DOAJModel): HostedByItemType = { def doajToHostedbyItemType(doaj: DOAJModel): HostedByItemType = {
return getHostedByItemType(Constants.DOAJ, doaj.getJournalTitle, doaj.getIssn, doaj.getEissn, "", true) return getHostedByItemType(
Constants.DOAJ,
doaj.getJournalTitle,
doaj.getIssn,
doaj.getEissn,
"",
true
)
} }
def doajHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = { def doajHostedByDataset(
spark: SparkSession,
datasourcePath: String
): Dataset[HostedByItemType] = {
import spark.implicits._ import spark.implicits._
implicit val mapEncoderDOAJ: Encoder[DOAJModel] = Encoders.kryo[DOAJModel] implicit val mapEncoderDOAJ: Encoder[DOAJModel] = Encoders.kryo[DOAJModel]
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
val dd: Dataset[DOAJModel] = spark.read.textFile(datasourcePath) val dd: Dataset[DOAJModel] = spark.read
.textFile(datasourcePath)
.map(r => mapper.readValue(r, classOf[DOAJModel])) .map(r => mapper.readValue(r, classOf[DOAJModel]))
dd.map { ddt => doajToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals(""))) dd.map { ddt => doajToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
@ -159,7 +214,6 @@ object SparkProduceHostedByMap {
lst lst
} }
def writeToHDFS(input: Array[String], outputPath: String, hdfsNameNode: String): Unit = { def writeToHDFS(input: Array[String], outputPath: String, hdfsNameNode: String): Unit = {
val conf = new Configuration() val conf = new Configuration()
@ -169,49 +223,51 @@ object SparkProduceHostedByMap {
val writer = new PrintWriter(output) val writer = new PrintWriter(output)
try { try {
input.foreach(hbi => writer.println(hbi)) input.foreach(hbi => writer.println(hbi))
} } finally {
finally {
writer.close() writer.close()
} }
} }
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass) val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_params.json")
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
val datasourcePath = parser.get("datasourcePath") val datasourcePath = parser.get("datasourcePath")
val workingDirPath = parser.get("workingPath") val workingDirPath = parser.get("workingPath")
val outputPath = parser.get("outputPath") val outputPath = parser.get("outputPath")
implicit val formats = DefaultFormats implicit val formats = DefaultFormats
logger.info("Getting the Datasources") logger.info("Getting the Datasources")
Aggregators
Aggregators.explodeHostedByItemType(oaHostedByDataset(spark, datasourcePath) .explodeHostedByItemType(
oaHostedByDataset(spark, datasourcePath)
.union(goldHostedByDataset(spark, workingDirPath + "/unibi_gold.json")) .union(goldHostedByDataset(spark, workingDirPath + "/unibi_gold.json"))
.union(doajHostedByDataset(spark, workingDirPath + "/doaj.json")) .union(doajHostedByDataset(spark, workingDirPath + "/doaj.json"))
.flatMap(hbi => toList(hbi))).filter(hbi => hbi._2.id.startsWith("10|")) .flatMap(hbi => toList(hbi))
)
.filter(hbi => hbi._2.id.startsWith("10|"))
.map(hbi => toHostedByMap(hbi))(Encoders.STRING) .map(hbi => toHostedByMap(hbi))(Encoders.STRING)
.rdd.saveAsTextFile(outputPath, classOf[GzipCodec]) .rdd
.saveAsTextFile(outputPath, classOf[GzipCodec])
} }
} }
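
toHostedByMap above serialises each (journal identifier, HostedByItemType) pair as a one-entry JSON map with json4s before the result is saved as gzipped text. A stand-alone example of that serialisation step; Item and all values are invented stand-ins, and the printed layout is indicative.

import org.json4s.DefaultFormats
import org.json4s.jackson.Serialization

object HostedByMapSerializationSketch {

  // Invented, reduced stand-in for HostedByItemType.
  case class Item(id: String, officialname: String, issn: String, openAccess: Boolean)

  def main(args: Array[String]): Unit = {
    implicit val formats: DefaultFormats.type = DefaultFormats
    val entry = Map("1234-5678" -> Item("datasource-id", "Some Journal", "1234-5678", openAccess = true))
    // prints a single-line JSON object keyed by the journal identifier, roughly:
    // {"1234-5678":{"id":"datasource-id","officialname":"Some Journal","issn":"1234-5678","openAccess":true}}
    println(Serialization.write(entry))
  }
}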

View File

@ -20,7 +20,13 @@ object CopyHdfsOafSparkApplication {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val log = LoggerFactory.getLogger(getClass) val log = LoggerFactory.getLogger(getClass)
val conf = new SparkConf() val conf = new SparkConf()
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/copy_hdfs_oaf_parameters.json")).mkString) val parser = new ArgumentApplicationParser(
Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/copy_hdfs_oaf_parameters.json")
)
.mkString
)
parser.parseArgument(args) parser.parseArgument(args)
val spark = val spark =
@ -28,7 +34,8 @@ object CopyHdfsOafSparkApplication {
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
val sc: SparkContext = spark.sparkContext val sc: SparkContext = spark.sparkContext
@ -49,19 +56,22 @@ object CopyHdfsOafSparkApplication {
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
val paths = DHPUtils.mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation, true).asScala val paths =
DHPUtils.mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation, true).asScala
val validPaths: List[String] = paths.filter(p => HdfsSupport.exists(p, sc.hadoopConfiguration)).toList val validPaths: List[String] =
paths.filter(p => HdfsSupport.exists(p, sc.hadoopConfiguration)).toList
val types = ModelSupport.oafTypes.entrySet val types = ModelSupport.oafTypes.entrySet.asScala
.asScala
.map(e => Tuple2(e.getKey, e.getValue)) .map(e => Tuple2(e.getKey, e.getValue))
if (validPaths.nonEmpty) { if (validPaths.nonEmpty) {
val oaf = spark.read.textFile(validPaths: _*) val oaf = spark.read.textFile(validPaths: _*)
val mapper = new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) val mapper =
new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
types.foreach(t => oaf types.foreach(t =>
oaf
.filter(o => isOafType(o, t._1)) .filter(o => isOafType(o, t._1))
.map(j => mapper.readValue(j, t._2).asInstanceOf[Oaf]) .map(j => mapper.readValue(j, t._2).asInstanceOf[Oaf])
.map(s => mapper.writeValueAsString(s))(Encoders.STRING) .map(s => mapper.writeValueAsString(s))(Encoders.STRING)
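
The hunk above (cut short by the diff) configures Jackson with FAIL_ON_UNKNOWN_PROPERTIES disabled before mapping the raw text records onto the model classes. A minimal stand-alone illustration of what that flag buys; Record and the sample JSON are placeholders.

import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}

import scala.beans.BeanProperty

object TolerantMapperSketch {

  class Record {
    @BeanProperty var id: String = _
  }

  def main(args: Array[String]): Unit = {
    val mapper = new ObjectMapper()
      .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
    // "extra" is not a Record property; a strict mapper would throw, this one ignores it
    val r = mapper.readValue("""{"id":"50|abc","extra":1}""", classOf[Record])
    println(r.getId) // 50|abc
  }
}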

View File

@ -13,20 +13,32 @@ import org.slf4j.{Logger, LoggerFactory}
object SparkResolveEntities { object SparkResolveEntities {
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
val entities = List(EntityType.dataset, EntityType.publication, EntityType.software, EntityType.otherresearchproduct)
val entities = List(
EntityType.dataset,
EntityType.publication,
EntityType.software,
EntityType.otherresearchproduct
)
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass) val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/resolution/resolve_entities_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/resolution/resolve_entities_params.json"
)
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
val graphBasePath = parser.get("graphBasePath") val graphBasePath = parser.get("graphBasePath")
log.info(s"graphBasePath -> $graphBasePath") log.info(s"graphBasePath -> $graphBasePath")
@ -38,7 +50,6 @@ object SparkResolveEntities {
val targetPath = parser.get("targetPath") val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath") log.info(s"targetPath -> $targetPath")
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
fs.mkdirs(new Path(workingPath)) fs.mkdirs(new Path(workingPath))
@ -46,23 +57,30 @@ object SparkResolveEntities {
generateResolvedEntities(spark, workingPath, graphBasePath, targetPath) generateResolvedEntities(spark, workingPath, graphBasePath, targetPath)
} }
def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: String) = { def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: String) = {
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
import spark.implicits._ import spark.implicits._
val rPid: Dataset[(String, String)] = spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)] val rPid: Dataset[(String, String)] =
val up: Dataset[(String, Result)] = spark.read.text(unresolvedPath).as[String].map(s => mapper.readValue(s, classOf[Result])).map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder)) spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]
val up: Dataset[(String, Result)] = spark.read
.text(unresolvedPath)
.as[String]
.map(s => mapper.readValue(s, classOf[Result]))
.map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
rPid.joinWith(up, rPid("_2").equalTo(up("_1")), "inner").map { rPid
r => .joinWith(up, rPid("_2").equalTo(up("_1")), "inner")
.map { r =>
val result = r._2._2 val result = r._2._2
val dnetId = r._1._1 val dnetId = r._1._1
result.setId(dnetId) result.setId(dnetId)
result result
}.write.mode(SaveMode.Overwrite).save(s"$workingPath/resolvedEntities")
} }
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/resolvedEntities")
}
def deserializeObject(input: String, entity: EntityType): Result = { def deserializeObject(input: String, entity: EntityType): Result = {
@ -74,18 +92,32 @@ object SparkResolveEntities {
} }
} }
def generateResolvedEntities(spark: SparkSession, workingPath: String, graphBasePath: String, targetPath: String) = { def generateResolvedEntities(
spark: SparkSession,
workingPath: String,
graphBasePath: String,
targetPath: String
) = {
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
import spark.implicits._ import spark.implicits._
val re: Dataset[(String, Result)] = spark.read.load(s"$workingPath/resolvedEntities").as[Result].map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder)) val re: Dataset[(String, Result)] = spark.read
entities.foreach { .load(s"$workingPath/resolvedEntities")
e => { .as[Result]
.map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
entities.foreach { e =>
{
val currentEntityDataset: Dataset[(String, Result)] = spark.read.text(s"$graphBasePath/$e").as[String].map(s => deserializeObject(s, e)).map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder)) val currentEntityDataset: Dataset[(String, Result)] = spark.read
.text(s"$graphBasePath/$e")
.as[String]
.map(s => deserializeObject(s, e))
.map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
currentEntityDataset.joinWith(re, currentEntityDataset("_1").equalTo(re("_1")), "left").map(k => { currentEntityDataset
.joinWith(re, currentEntityDataset("_1").equalTo(re("_1")), "left")
.map(k => {
val a = k._1 val a = k._1
val b = k._2 val b = k._2
@ -95,11 +127,14 @@ object SparkResolveEntities {
a._2.mergeFrom(b._2) a._2.mergeFrom(b._2)
a._2 a._2
} }
}).map(r => mapper.writeValueAsString(r))(Encoders.STRING) })
.write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$targetPath/$e") .map(r => mapper.writeValueAsString(r))(Encoders.STRING)
.write
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.text(s"$targetPath/$e")
} }
} }
} }
} }
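
generateResolvedEntities above left-joins every entity table with the resolved entities on id and merges the two records when the join finds a match. The sketch below reproduces that join-and-merge shape; Entity and its mergeWith rule are invented placeholders for the Result model and its mergeFrom method.

import org.apache.spark.sql.{Dataset, SparkSession}

object MergeResolvedSketch {

  // Simplified stand-in for (id, Result) pairs; the merge rule is invented.
  case class Entity(id: String, pids: Seq[String]) {
    def mergeWith(other: Entity): Entity = copy(pids = (pids ++ other.pids).distinct)
  }

  def merge(
    spark: SparkSession,
    current: Dataset[(String, Entity)],
    resolved: Dataset[(String, Entity)]
  ): Dataset[Entity] = {
    import spark.implicits._
    current
      .joinWith(resolved, current("_1").equalTo(resolved("_1")), "left")
      .map { case (cur, res) =>
        // res is null when nothing was resolved for this id
        if (res == null) cur._2 else cur._2.mergeWith(res._2)
      }
  }
}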

View File

@ -17,18 +17,25 @@ import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
object SparkResolveRelation { object SparkResolveRelation {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass) val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/resolution/resolve_relations_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/resolution/resolve_relations_params.json"
)
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
val graphBasePath = parser.get("graphBasePath") val graphBasePath = parser.get("graphBasePath")
log.info(s"graphBasePath -> $graphBasePath") log.info(s"graphBasePath -> $graphBasePath")
@ -41,7 +48,6 @@ object SparkResolveRelation {
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
import spark.implicits._ import spark.implicits._
//CLEANING TEMPORARY FOLDER //CLEANING TEMPORARY FOLDER
HdfsSupport.remove(workingPath, spark.sparkContext.hadoopConfiguration) HdfsSupport.remove(workingPath, spark.sparkContext.hadoopConfiguration)
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
@ -51,28 +57,36 @@ object SparkResolveRelation {
val mapper: ObjectMapper = new ObjectMapper() val mapper: ObjectMapper = new ObjectMapper()
val rPid: Dataset[(String, String)] = spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)] val rPid: Dataset[(String, String)] =
spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]
val relationDs: Dataset[(String, Relation)] = spark.read.text(s"$graphBasePath/relation").as[String] val relationDs: Dataset[(String, Relation)] = spark.read
.map(s => mapper.readValue(s, classOf[Relation])).as[Relation] .text(s"$graphBasePath/relation")
.as[String]
.map(s => mapper.readValue(s, classOf[Relation]))
.as[Relation]
.map(r => (r.getSource.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder)) .map(r => (r.getSource.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
relationDs.joinWith(rPid, relationDs("_1").equalTo(rPid("_2")), "left").map { relationDs
m => .joinWith(rPid, relationDs("_1").equalTo(rPid("_2")), "left")
.map { m =>
val sourceResolved = m._2 val sourceResolved = m._2
val currentRelation = m._1._2 val currentRelation = m._1._2
if (sourceResolved != null && sourceResolved._1 != null && sourceResolved._1.nonEmpty) if (sourceResolved != null && sourceResolved._1 != null && sourceResolved._1.nonEmpty)
currentRelation.setSource(sourceResolved._1) currentRelation.setSource(sourceResolved._1)
currentRelation currentRelation
}.write }
.write
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.save(s"$workingPath/relationResolvedSource") .save(s"$workingPath/relationResolvedSource")
val relationSourceResolved: Dataset[(String, Relation)] = spark.read
val relationSourceResolved: Dataset[(String, Relation)] = spark.read.load(s"$workingPath/relationResolvedSource").as[Relation] .load(s"$workingPath/relationResolvedSource")
.as[Relation]
.map(r => (r.getTarget.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder)) .map(r => (r.getTarget.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
relationSourceResolved.joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_2")), "left").map { relationSourceResolved
m => .joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_2")), "left")
.map { m =>
val targetResolved = m._2 val targetResolved = m._2
val currentRelation = m._1._2 val currentRelation = m._1._2
if (targetResolved != null && targetResolved._1.nonEmpty) if (targetResolved != null && targetResolved._1.nonEmpty)
@ -83,7 +97,9 @@ object SparkResolveRelation {
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.save(s"$workingPath/relation_resolved") .save(s"$workingPath/relation_resolved")
spark.read.load(s"$workingPath/relation_resolved").as[Relation] spark.read
.load(s"$workingPath/relation_resolved")
.as[Relation]
.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved")) .filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved"))
.map(r => mapper.writeValueAsString(r)) .map(r => mapper.writeValueAsString(r))
.write .write
@ -107,7 +123,6 @@ object SparkResolveRelation {
} }
def extractPidsFromRecord(input: String): (String, List[(String, String)]) = { def extractPidsFromRecord(input: String): (String, List[(String, String)]) = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input) lazy val json: json4s.JValue = parse(input)
@ -122,7 +137,6 @@ object SparkResolveRelation {
(id, result) (id, result)
} }
private def isRelation(input: String): Boolean = { private def isRelation(input: String): Boolean = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -132,20 +146,25 @@ object SparkResolveRelation {
source != null source != null
} }
def extractPidResolvedTableFromJsonRDD(spark: SparkSession, graphPath: String, workingPath: String) = { def extractPidResolvedTableFromJsonRDD(
spark: SparkSession,
graphPath: String,
workingPath: String
) = {
import spark.implicits._ import spark.implicits._
val d: RDD[(String, String)] = spark.sparkContext.textFile(s"$graphPath/*") val d: RDD[(String, String)] = spark.sparkContext
.textFile(s"$graphPath/*")
.filter(i => !isRelation(i)) .filter(i => !isRelation(i))
.map(i => extractPidsFromRecord(i)) .map(i => extractPidsFromRecord(i))
.filter(s => s != null && s._1 != null && s._2 != null && s._2.nonEmpty) .filter(s => s != null && s._1 != null && s._2 != null && s._2.nonEmpty)
.flatMap { p => .flatMap { p =>
p._2.map(pid => p._2.map(pid => (p._1, DHPUtils.generateUnresolvedIdentifier(pid._1, pid._2)))
(p._1, DHPUtils.generateUnresolvedIdentifier(pid._1, pid._2)) }
) .filter(r => r._1 != null || r._2 != null)
}.filter(r => r._1 != null || r._2 != null)
spark.createDataset(d) spark
.createDataset(d)
.groupByKey(_._2) .groupByKey(_._2)
.reduceGroups((x, y) => if (x._1.startsWith("50|doi") || x._1.startsWith("50|pmid")) x else y) .reduceGroups((x, y) => if (x._1.startsWith("50|doi") || x._1.startsWith("50|pmid")) x else y)
.map(s => s._2) .map(s => s._2)
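
extractPidsFromRecord above (only partially visible in this hunk) walks the record JSON with json4s to collect the id and the pid list. A reduced stand-alone example of that extraction; the input fragment and its exact field layout are assumptions shaped after the fields the job reads.

import org.json4s._
import org.json4s.jackson.JsonMethods.parse

object PidExtractionSketch {

  def main(args: Array[String]): Unit = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    // invented record fragment
    val input =
      """{"id":"50|abc","pid":[{"qualifier":{"classid":"doi"},"value":"10.1000/xyz"}]}"""
    lazy val json: JValue = parse(input)

    val id = (json \ "id").extract[String]
    val pids: List[(String, String)] = (json \ "pid").children.map { p =>
      ((p \ "qualifier" \ "classid").extract[String], (p \ "value").extract[String])
    }
    println((id, pids)) // (50|abc,List((doi,10.1000/xyz)))
  }
}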

View File

@ -7,24 +7,26 @@ import org.apache.spark.sql.SparkSession
object SparkDataciteToOAF { object SparkDataciteToOAF {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/datacite_to_df_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/datacite_to_df_params.json")
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
val sc = spark.sparkContext val sc = spark.sparkContext
val inputPath = parser.get("inputPath") val inputPath = parser.get("inputPath")
} }
} }
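
The jobs in this changeset load their argument descriptors from the classpath in one of two equivalent ways, commons-io IOUtils.toString or scala.io.Source; a side-by-side sketch of the two idioms, with the resource path left as a parameter.

import org.apache.commons.io.IOUtils

import scala.io.Source

object ResourceLoadingSketch {

  // e.g. path = "/eu/dnetlib/dhp/sx/ebi/datacite_to_df_params.json"
  def viaIoUtils(path: String): String =
    IOUtils.toString(getClass.getResourceAsStream(path))

  def viaSource(path: String): String =
    Source.fromInputStream(getClass.getResourceAsStream(path)).mkString
}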

View File

@ -11,18 +11,22 @@ import org.slf4j.{Logger, LoggerFactory}
object SparkConvertDatasetToJsonRDD { object SparkConvertDatasetToJsonRDD {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass) val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
val sourcePath = parser.get("sourcePath") val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath") log.info(s"sourcePath -> $sourcePath")
@ -33,9 +37,13 @@ object SparkConvertDatasetToJsonRDD {
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
resultObject.foreach { item => resultObject.foreach { item =>
spark.read.load(s"$sourcePath/$item").as[Result].map(r => mapper.writeValueAsString(r))(Encoders.STRING).rdd.saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec]) spark.read
.load(s"$sourcePath/$item")
.as[Result]
.map(r => mapper.writeValueAsString(r))(Encoders.STRING)
.rdd
.saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec])
} }
} }
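
SparkConvertDatasetToJsonRDD above turns each kryo-encoded table back into gzipped JSON text through an ObjectMapper, Encoders.STRING and RDD.saveAsTextFile. A generic sketch of that conversion; T is assumed to be a Jackson-serialisable bean, as the model classes are.

import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.sql.{Dataset, Encoders}

object DatasetToJsonTextSketch {

  def writeAsJsonText[T](ds: Dataset[T], targetPath: String): Unit = {
    // ObjectMapper is Serializable, so it can be captured by the map closure
    val mapper = new ObjectMapper()
    ds.map(r => mapper.writeValueAsString(r))(Encoders.STRING)
      .rdd
      .saveAsTextFile(targetPath, classOf[GzipCodec])
  }
}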

View File

@ -15,14 +15,19 @@ object SparkConvertObjectToJson {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass) val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_object_json_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_object_json_params.json")
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
val sourcePath = parser.get("sourcePath") val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath") log.info(s"sourcePath -> $sourcePath")
@ -33,12 +38,9 @@ object SparkConvertObjectToJson {
val scholixUpdatePath = parser.get("scholixUpdatePath") val scholixUpdatePath = parser.get("scholixUpdatePath")
log.info(s"scholixUpdatePath -> $scholixUpdatePath") log.info(s"scholixUpdatePath -> $scholixUpdatePath")
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix] implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
val mapper = new ObjectMapper val mapper = new ObjectMapper
objectType.toLowerCase match { objectType.toLowerCase match {
@ -46,11 +48,18 @@ object SparkConvertObjectToJson {
log.info("Serialize Scholix") log.info("Serialize Scholix")
val d: Dataset[Scholix] = spark.read.load(sourcePath).as[Scholix] val d: Dataset[Scholix] = spark.read.load(sourcePath).as[Scholix]
val u: Dataset[Scholix] = spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix] val u: Dataset[Scholix] = spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix]
d.union(u).repartition(8000).map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.saveAsTextFile(targetPath, classOf[GzipCodec]) d.union(u)
.repartition(8000)
.map(s => mapper.writeValueAsString(s))(Encoders.STRING)
.rdd
.saveAsTextFile(targetPath, classOf[GzipCodec])
case "summary" => case "summary" =>
log.info("Serialize Summary") log.info("Serialize Summary")
val d: Dataset[ScholixSummary] = spark.read.load(sourcePath).as[ScholixSummary] val d: Dataset[ScholixSummary] = spark.read.load(sourcePath).as[ScholixSummary]
d.map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.repartition(1000).saveAsTextFile(targetPath, classOf[GzipCodec]) d.map(s => mapper.writeValueAsString(s))(Encoders.STRING)
.rdd
.repartition(1000)
.saveAsTextFile(targetPath, classOf[GzipCodec])
} }
} }

View File

@ -7,21 +7,26 @@ import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
object SparkConvertRDDtoDataset { object SparkConvertRDDtoDataset {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass) val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
val sourcePath = parser.get("sourcePath") val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath") log.info(s"sourcePath -> $sourcePath")
@ -34,40 +39,76 @@ object SparkConvertRDDtoDataset {
implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset]) implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication]) implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
implicit val relationEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) implicit val relationEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct]) implicit val orpEncoder: Encoder[OtherResearchProduct] =
Encoders.kryo(classOf[OtherResearchProduct])
implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software]) implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software])
log.info("Converting dataset") log.info("Converting dataset")
val rddDataset =spark.sparkContext.textFile(s"$sourcePath/dataset").map(s => mapper.readValue(s, classOf[OafDataset])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false) val rddDataset = spark.sparkContext
spark.createDataset(rddDataset).as[OafDataset].write.mode(SaveMode.Overwrite).save(s"$entityPath/dataset") .textFile(s"$sourcePath/dataset")
.map(s => mapper.readValue(s, classOf[OafDataset]))
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
spark
.createDataset(rddDataset)
.as[OafDataset]
.write
.mode(SaveMode.Overwrite)
.save(s"$entityPath/dataset")
log.info("Converting publication") log.info("Converting publication")
val rddPublication =spark.sparkContext.textFile(s"$sourcePath/publication").map(s => mapper.readValue(s, classOf[Publication])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false) val rddPublication = spark.sparkContext
spark.createDataset(rddPublication).as[Publication].write.mode(SaveMode.Overwrite).save(s"$entityPath/publication") .textFile(s"$sourcePath/publication")
.map(s => mapper.readValue(s, classOf[Publication]))
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
spark
.createDataset(rddPublication)
.as[Publication]
.write
.mode(SaveMode.Overwrite)
.save(s"$entityPath/publication")
log.info("Converting software") log.info("Converting software")
val rddSoftware =spark.sparkContext.textFile(s"$sourcePath/software").map(s => mapper.readValue(s, classOf[Software])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false) val rddSoftware = spark.sparkContext
spark.createDataset(rddSoftware).as[Software].write.mode(SaveMode.Overwrite).save(s"$entityPath/software") .textFile(s"$sourcePath/software")
.map(s => mapper.readValue(s, classOf[Software]))
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
spark
.createDataset(rddSoftware)
.as[Software]
.write
.mode(SaveMode.Overwrite)
.save(s"$entityPath/software")
log.info("Converting otherresearchproduct") log.info("Converting otherresearchproduct")
val rddOtherResearchProduct =spark.sparkContext.textFile(s"$sourcePath/otherresearchproduct").map(s => mapper.readValue(s, classOf[OtherResearchProduct])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false) val rddOtherResearchProduct = spark.sparkContext
spark.createDataset(rddOtherResearchProduct).as[OtherResearchProduct].write.mode(SaveMode.Overwrite).save(s"$entityPath/otherresearchproduct") .textFile(s"$sourcePath/otherresearchproduct")
.map(s => mapper.readValue(s, classOf[OtherResearchProduct]))
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
spark
.createDataset(rddOtherResearchProduct)
.as[OtherResearchProduct]
.write
.mode(SaveMode.Overwrite)
.save(s"$entityPath/otherresearchproduct")
log.info("Converting Relation") log.info("Converting Relation")
val relationSemanticFilter = List(
"cites",
"iscitedby",
"merges",
"ismergedin",
"HasAmongTopNSimilarDocuments",
"IsAmongTopNSimilarDocuments"
)
val relationSemanticFilter = List("cites", "iscitedby","merges", "ismergedin", "HasAmongTopNSimilarDocuments","IsAmongTopNSimilarDocuments" ) val rddRelation = spark.sparkContext
.textFile(s"$sourcePath/relation")
val rddRelation =spark.sparkContext.textFile(s"$sourcePath/relation")
.map(s => mapper.readValue(s, classOf[Relation])) .map(s => mapper.readValue(s, classOf[Relation]))
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false) .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
.filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50")) .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
.filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass))) .filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath") spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
} }
} }
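
The relation conversion above keeps only non-deleted, result-to-result links (both endpoints start with "50|") and drops a fixed list of semantic classes, compared case-insensitively. A compact sketch of that predicate over a simplified relation type; Rel and the sample values are invented, while the excluded class list is the one from the diff.

object RelationFilterSketch {

  case class Rel(source: String, target: String, relClass: String, deleted: Boolean)

  // lower-cased equivalent of the relationSemanticFilter list above
  val excludedRelClasses: Set[String] = Set(
    "cites", "iscitedby", "merges", "ismergedin",
    "hasamongtopnsimilardocuments", "isamongtopnsimilardocuments"
  )

  def keep(r: Rel): Boolean =
    !r.deleted &&
      r.source.startsWith("50|") && r.target.startsWith("50|") &&
      !excludedRelClasses.contains(r.relClass.toLowerCase)

  def main(args: Array[String]): Unit = {
    val rels = Seq(
      Rel("50|a", "50|b", "Cites", deleted = false),
      Rel("50|a", "50|c", "IsSupplementTo", deleted = false),
      Rel("10|d", "50|c", "IsSupplementTo", deleted = false)
    )
    println(rels.filter(keep)) // only the 50|a -> 50|c IsSupplementTo relation survives
  }
}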

View File

@ -13,82 +13,131 @@ object SparkCreateInputGraph {
val log: Logger = LoggerFactory.getLogger(getClass) val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/extract_entities_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/extract_entities_params.json")
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
val resultObject = List( val resultObject = List(
("publication", classOf[Publication]), ("publication", classOf[Publication]),
("dataset", classOf[OafDataset]), ("dataset", classOf[OafDataset]),
("software", classOf[Software]), ("software", classOf[Software]),
("otherResearchProduct", classOf[OtherResearchProduct]) ("otherResearchProduct", classOf[OtherResearchProduct])
) )
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication]) implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset]) implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software]) implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software])
implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct]) implicit val orpEncoder: Encoder[OtherResearchProduct] =
Encoders.kryo(classOf[OtherResearchProduct])
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
val sourcePath = parser.get("sourcePath") val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath") log.info(s"sourcePath -> $sourcePath")
val targetPath = parser.get("targetPath") val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath") log.info(s"targetPath -> $targetPath")
val oafDs: Dataset[Oaf] = spark.read.load(s"$sourcePath/*").as[Oaf] val oafDs: Dataset[Oaf] = spark.read.load(s"$sourcePath/*").as[Oaf]
log.info("Extract Publication") log.info("Extract Publication")
oafDs.filter(o => o.isInstanceOf[Publication]).map(p => p.asInstanceOf[Publication]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/publication") oafDs
.filter(o => o.isInstanceOf[Publication])
.map(p => p.asInstanceOf[Publication])
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/extracted/publication")
log.info("Extract dataset") log.info("Extract dataset")
oafDs.filter(o => o.isInstanceOf[OafDataset]).map(p => p.asInstanceOf[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/dataset") oafDs
.filter(o => o.isInstanceOf[OafDataset])
.map(p => p.asInstanceOf[OafDataset])
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/extracted/dataset")
log.info("Extract software") log.info("Extract software")
oafDs.filter(o => o.isInstanceOf[Software]).map(p => p.asInstanceOf[Software]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/software") oafDs
.filter(o => o.isInstanceOf[Software])
.map(p => p.asInstanceOf[Software])
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/extracted/software")
log.info("Extract otherResearchProduct") log.info("Extract otherResearchProduct")
oafDs.filter(o => o.isInstanceOf[OtherResearchProduct]).map(p => p.asInstanceOf[OtherResearchProduct]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/otherResearchProduct") oafDs
.filter(o => o.isInstanceOf[OtherResearchProduct])
.map(p => p.asInstanceOf[OtherResearchProduct])
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/extracted/otherResearchProduct")
log.info("Extract Relation") log.info("Extract Relation")
oafDs.filter(o => o.isInstanceOf[Relation]).map(p => p.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/relation") oafDs
.filter(o => o.isInstanceOf[Relation])
.map(p => p.asInstanceOf[Relation])
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/extracted/relation")
resultObject.foreach { r => resultObject.foreach { r =>
log.info(s"Make ${r._1} unique") log.info(s"Make ${r._1} unique")
makeDatasetUnique(s"$targetPath/extracted/${r._1}", s"$targetPath/preprocess/${r._1}", spark, r._2) makeDatasetUnique(
s"$targetPath/extracted/${r._1}",
s"$targetPath/preprocess/${r._1}",
spark,
r._2
)
} }
} }
def extractEntities[T <: Oaf](
def extractEntities[T <: Oaf](oafDs: Dataset[Oaf], targetPath: String, clazz: Class[T], log: Logger): Unit = { oafDs: Dataset[Oaf],
targetPath: String,
clazz: Class[T],
log: Logger
): Unit = {
implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz) implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz)
log.info(s"Extract ${clazz.getSimpleName}") log.info(s"Extract ${clazz.getSimpleName}")
oafDs.filter(o => o.isInstanceOf[T]).map(p => p.asInstanceOf[T]).write.mode(SaveMode.Overwrite).save(targetPath) oafDs
.filter(o => o.isInstanceOf[T])
.map(p => p.asInstanceOf[T])
.write
.mode(SaveMode.Overwrite)
.save(targetPath)
} }
def makeDatasetUnique[T <: Result](
def makeDatasetUnique[T <: Result](sourcePath: String, targetPath: String, spark: SparkSession, clazz: Class[T]): Unit = { sourcePath: String,
targetPath: String,
spark: SparkSession,
clazz: Class[T]
): Unit = {
import spark.implicits._ import spark.implicits._
implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz) implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz)
val ds: Dataset[T] = spark.read.load(sourcePath).as[T] val ds: Dataset[T] = spark.read.load(sourcePath).as[T]
ds.groupByKey(_.getId).reduceGroups { (x, y) => ds.groupByKey(_.getId)
.reduceGroups { (x, y) =>
x.mergeFrom(y) x.mergeFrom(y)
x x
}.map(_._2).write.mode(SaveMode.Overwrite).save(targetPath) }
.map(_._2)
.write
.mode(SaveMode.Overwrite)
.save(targetPath)
} }
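
SparkCreateInputGraph above splits one mixed Dataset[Oaf] into per-type tables with isInstanceOf/asInstanceOf filters and kryo encoders, then deduplicates each table by id. A reduced sketch of one such extraction over a tiny placeholder hierarchy; Base, Pub and Rel are not the real model classes.

import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode}

object ExtractByTypeSketch {

  trait Base
  case class Pub(id: String) extends Base
  case class Rel(source: String, target: String) extends Base

  def extractPublications(mixed: Dataset[Base], targetPath: String): Unit = {
    implicit val pubEncoder: Encoder[Pub] = Encoders.kryo(classOf[Pub])
    mixed
      .filter(o => o.isInstanceOf[Pub])
      .map(o => o.asInstanceOf[Pub])
      .write
      .mode(SaveMode.Overwrite)
      .save(targetPath)
  }
}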

View File

@ -17,14 +17,19 @@ object SparkCreateScholix {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass) val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_scholix_params.json"))) val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_scholix_params.json")
)
)
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master"))
.getOrCreate()
val relationPath = parser.get("relationPath") val relationPath = parser.get("relationPath")
log.info(s"relationPath -> $relationPath") log.info(s"relationPath -> $relationPath")
@ -33,37 +38,46 @@ object SparkCreateScholix {
val targetPath = parser.get("targetPath") val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath") log.info(s"targetPath -> $targetPath")
implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
import spark.implicits._

val relationDS: Dataset[(String, Relation)] = spark.read
  .load(relationPath)
  .as[Relation]
  .filter(r =>
    (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase
      .contains("merge")
  )
  .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder))

val summaryDS: Dataset[(String, ScholixSummary)] = spark.read
  .load(summaryPath)
  .as[ScholixSummary]
  .map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, summaryEncoder))

relationDS
  .joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left")
  .map { input: ((String, Relation), (String, ScholixSummary)) =>
    if (input._1 != null && input._2 != null) {
      val rel: Relation = input._1._2
      val source: ScholixSummary = input._2._2
      (rel.getTarget, ScholixUtils.scholixFromSource(rel, source))
    } else null
  }(Encoders.tuple(Encoders.STRING, scholixEncoder))
  .filter(r => r != null)
  .write
  .mode(SaveMode.Overwrite)
  .save(s"$targetPath/scholix_from_source")

val scholixSource: Dataset[(String, Scholix)] = spark.read
  .load(s"$targetPath/scholix_from_source")
  .as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder))

scholixSource
  .joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left")
  .map { input: ((String, Scholix), (String, ScholixSummary)) =>
    if (input._2 == null) {
      null
@ -72,40 +86,73 @@ object SparkCreateScholix {
      val target: ScholixSummary = input._2._2
      ScholixUtils.generateCompleteScholix(s, target)
    }
  }
  .filter(s => s != null)
  .write
  .mode(SaveMode.Overwrite)
  .save(s"$targetPath/scholix_one_verse")

val scholix_o_v: Dataset[Scholix] =
  spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix]

scholix_o_v
  .flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s)))
  .as[Scholix]
  .map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, scholixEncoder))
  .groupByKey(_._1)
  .agg(ScholixUtils.scholixAggregator.toColumn)
  .map(s => s._2)
  .write
  .mode(SaveMode.Overwrite)
  .save(s"$targetPath/scholix")

val scholix_final: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix").as[Scholix]

val stats: Dataset[(String, String, Long)] = scholix_final
  .map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType))
  .groupBy("_1", "_2")
  .agg(count("_1"))
  .as[(String, String, Long)]

stats
  .map(s =>
    RelatedEntities(
      s._1,
      if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0,
      if ("publication".equalsIgnoreCase(s._2)) s._3 else 0
    )
  )
  .groupByKey(_.id)
  .reduceGroups((a, b) =>
    RelatedEntities(
      a.id,
      a.relatedDataset + b.relatedDataset,
      a.relatedPublication + b.relatedPublication
    )
  )
  .map(_._2)
  .write
  .mode(SaveMode.Overwrite)
  .save(s"$targetPath/related_entities")

val relatedEntitiesDS: Dataset[RelatedEntities] = spark.read
  .load(s"$targetPath/related_entities")
  .as[RelatedEntities]
  .filter(r => r.relatedPublication > 0 || r.relatedDataset > 0)

relatedEntitiesDS
  .joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner")
  .map { i =>
    val re = i._1
    val sum = i._2._2

    sum.setRelatedDatasets(re.relatedDataset)
    sum.setRelatedPublications(re.relatedPublication)
    sum
  }
  .write
  .mode(SaveMode.Overwrite)
  .save(s"${summaryPath}_filtered")
  }
}
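The job above repeatedly keys kryo-encoded datasets and combines them with joinWith, passing the tuple encoders explicitly. A minimal, self-contained sketch of that pattern follows; the Link and Item case classes and the joinByKey helper are hypothetical, used only to illustrate the shape, and are not types from this codebase.

import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}

// Hypothetical payload types, only to illustrate the join pattern used above.
case class Link(source: String, target: String)
case class Item(id: String, name: String)

def joinByKey(links: Dataset[Link], items: Dataset[Item]): Dataset[(String, String)] = {
  // With kryo encoders the tuple encoder has to be supplied explicitly to map(), as in the job above.
  implicit val linkEnc: Encoder[Link] = Encoders.kryo[Link]
  implicit val itemEnc: Encoder[Item] = Encoders.kryo[Item]

  val keyedLinks = links.map(l => (l.source, l))(Encoders.tuple(Encoders.STRING, linkEnc))
  val keyedItems = items.map(i => (i.id, i))(Encoders.tuple(Encoders.STRING, itemEnc))

  keyedLinks
    .joinWith(keyedItems, keyedLinks("_1").equalTo(keyedItems("_1")), "left")
    .map { case ((_, link), right) =>
      // A left join leaves the right side null when no item matches the link source.
      (link.target, if (right == null) "unmatched" else right._2.name)
    }(Encoders.tuple(Encoders.STRING, Encoders.STRING))
}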

@ -14,14 +14,19 @@ object SparkCreateSummaryObject {
def main(args: Array[String]): Unit = {
  val log: Logger = LoggerFactory.getLogger(getClass)
  val conf: SparkConf = new SparkConf()

  val parser = new ArgumentApplicationParser(
    IOUtils.toString(
      getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_summaries_params.json")
    )
  )
  parser.parseArgument(args)
  val spark: SparkSession =
    SparkSession
      .builder()
      .config(conf)
      .appName(getClass.getSimpleName)
      .master(parser.get("master"))
      .getOrCreate()

  val sourcePath = parser.get("sourcePath")
  log.info(s"sourcePath -> $sourcePath")
@ -33,10 +38,17 @@ object SparkCreateSummaryObject {
  implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]

  val ds: Dataset[Result] = spark.read
    .load(s"$sourcePath/*")
    .as[Result]
    .filter(r => r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false)

  ds.repartition(6000)
    .map(r => ScholixUtils.resultToSummary(r))
    .filter(s => s != null)
    .write
    .mode(SaveMode.Overwrite)
    .save(targetPath)
}

@ -10,13 +10,23 @@ import java.util.regex.Pattern
import scala.language.postfixOps
import scala.xml.{Elem, Node, XML}

case class PangaeaDataModel(
  identifier: String,
  title: List[String],
  objectType: List[String],
  creator: List[String],
  publisher: List[String],
  dataCenter: List[String],
  subject: List[String],
  language: String,
  rights: String,
  parent: String,
  relation: List[String],
  linkage: List[(String, String)]
) {}

object PangaeaUtils {

  def toDataset(input: String): PangaeaDataModel = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: json4s.JValue = parse(input)
@ -26,20 +36,25 @@ object PangaeaUtils {

  def findDOIInRelation(input: List[String]): List[String] = {
    val pattern = Pattern.compile("\\b(10[.][0-9]{4,}(?:[.][0-9]+)*\\/(?:(?![\"&\\'<>])\\S)+)\\b")
    input
      .map(i => {
        val matcher = pattern.matcher(i)
        if (matcher.find())
          matcher.group(0)
        else
          null
      })
      .filter(i => i != null)
  }

  def attributeOpt(attribute: String, node: Node): Option[String] =
    node.attribute(attribute) flatMap (_.headOption) map (_.text)

  def extractLinkage(node: Elem): List[(String, String)] = {
    (node \ "linkage")
      .map(n => (attributeOpt("type", n), n.text))
      .filter(t => t._1.isDefined)
      .map(t => (t._1.get, t._2))(collection.breakOut)
  }

  def parseXml(input: String): PangaeaDataModel = {
@ -59,12 +74,24 @@ object PangaeaUtils {
    val relationFiltered = findDOIInRelation(relation)
    val linkage: List[(String, String)] = extractLinkage(xml)

    PangaeaDataModel(
      identifier,
      title,
      pType,
      creators,
      publisher,
      dataCenter,
      subject,
      language,
      rights,
      parentIdentifier,
      relationFiltered,
      linkage
    )
  }

  def getDatasetAggregator(): Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] =
    new Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] {

      override def zero: PangaeaDataModel = null
@ -106,7 +133,4 @@ object PangaeaUtils {

      override def outputEncoder: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
    }
}
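A quick way to read findDOIInRelation is with a throwaway driver; the relation strings and the DOI below are made up for illustration and are not taken from the test resources.

// Illustrative only: feeds a couple of free-text relation strings through the DOI matcher above.
val rels = List(
  "Supplement to: https://doi.org/10.1594/PANGAEA.12345",
  "Related dataset without a DOI"
)
val dois = PangaeaUtils.findDOIInRelation(rels)
// Assuming the regex behaves as written, only the bare DOI survives: List("10.1594/PANGAEA.12345")
println(dois)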

@ -11,20 +11,25 @@ import scala.io.Source
object SparkGeneratePanagaeaDataset {

  def main(args: Array[String]): Unit = {
    val logger: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()

    val parser = new ArgumentApplicationParser(
      Source
        .fromInputStream(
          getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/pangaea/pangaea_to_dataset.json")
        )
        .mkString
    )
    parser.parseArgument(args)

    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(SparkGeneratePanagaeaDataset.getClass.getSimpleName)
        .master(parser.get("master"))
        .getOrCreate()

    parser.getObjectMap.asScala.foreach(s => logger.info(s"${s._1} -> ${s._2}"))
    logger.info("Converting sequential file into Dataset")
@ -34,16 +39,20 @@ object SparkGeneratePanagaeaDataset {
    implicit val pangaeaEncoders: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]

    val inputRDD: RDD[PangaeaDataModel] =
      sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s))

    spark
      .createDataset(inputRDD)
      .as[PangaeaDataModel]
      .map(s => (s.identifier, s))(Encoders.tuple(Encoders.STRING, pangaeaEncoders))
      .groupByKey(_._1)(Encoders.STRING)
      .agg(PangaeaUtils.getDatasetAggregator().toColumn)
      .map(s => s._2)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/dataset")
  }
}
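The groupByKey / agg(aggregator.toColumn) step above relies on a typed Spark Aggregator (PangaeaUtils.getDatasetAggregator()). A self-contained sketch of the same shape follows; the Rec case class and the "keep the highest version per key" logic are hypothetical stand-ins, not the project's PangaeaDataModel aggregator.

import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.expressions.Aggregator

// Hypothetical record type: the aggregator keeps the newest version per identifier.
case class Rec(id: String, version: Int)

val latestRec: Aggregator[(String, Rec), Rec, Rec] =
  new Aggregator[(String, Rec), Rec, Rec] {
    override def zero: Rec = null
    override def reduce(b: Rec, a: (String, Rec)): Rec =
      if (b == null || a._2.version > b.version) a._2 else b
    override def merge(b1: Rec, b2: Rec): Rec =
      if (b1 == null) b2 else if (b2 == null) b1 else if (b2.version > b1.version) b2 else b1
    override def finish(r: Rec): Rec = r
    override def bufferEncoder: Encoder[Rec] = Encoders.kryo[Rec]
    override def outputEncoder: Encoder[Rec] = Encoders.kryo[Rec]
  }

// Usage mirrors the job above: key the dataset, group by the key, fold with the aggregator.
// ds.map(r => (r.id, r))(Encoders.tuple(Encoders.STRING, Encoders.kryo[Rec]))
//   .groupByKey(_._1)(Encoders.STRING)
//   .agg(latestRec.toColumn)
//   .map(_._2)(Encoders.kryo[Rec])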

@ -30,10 +30,10 @@ class TestApply extends java.io.Serializable{
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
implicit val mapEncoderPubInfo: Encoder[Publication] = Encoders.bean(classOf[Publication])

val pub_ds: Dataset[Publication] =
  spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[Publication]))
val hbm_ds: Dataset[EntityInfo] =
  spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))

assertEquals(13, pub_ds.count())
@ -41,7 +41,8 @@ class TestApply extends java.io.Serializable{
assertEquals(13, ds.count)

val temp: Dataset[(Publication, Publication)] =
  pub_ds.joinWith(ds, pub_ds.col("id").equalTo(ds.col("id")), "left")
assertEquals(13, temp.count())
temp.foreach(t2 => {
  val pb: Publication = t2._1
@ -50,17 +51,36 @@ class TestApply extends java.io.Serializable{
  assertEquals(1, pb.getInstance().size())
  assertTrue(t2._1.getId.equals(t2._2.getId))
  if (pb.getId.equals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9")) {
    assertTrue(
      pa.getInstance()
        .get(0)
        .getHostedby
        .getKey
        .equals("10|issn___print::e4b6d6d978f67520f6f37679a98c5735")
    )
    assertTrue(pa.getInstance().get(0).getHostedby.getValue.equals("Academic Therapy"))
    assertTrue(pa.getInstance().get(0).getAccessright.getClassid.equals("OPEN"))
    assertTrue(pa.getInstance().get(0).getAccessright.getClassname.equals("Open Access"))
    assertTrue(
      pa.getInstance().get(0).getAccessright.getOpenAccessRoute.equals(OpenAccessRoute.gold)
    )
    assertTrue(pa.getBestaccessright.getClassid.equals("OPEN"))
    assertTrue(pa.getBestaccessright.getClassname.equals("Open Access"))

    assertTrue(
      pb.getInstance()
        .get(0)
        .getHostedby
        .getKey
        .equals("10|openaire____::0b74b6a356bbf23c245f9ae9a748745c")
    )
    assertTrue(
      pb.getInstance()
        .get(0)
        .getHostedby
        .getValue
        .equals("Revistas de investigación Universidad Nacional Mayor de San Marcos")
    )
    assertTrue(pb.getInstance().get(0).getAccessright.getClassname.equals("not available"))
    assertTrue(pb.getInstance().get(0).getAccessright.getClassid.equals("UNKNOWN"))
    assertTrue(pb.getInstance().get(0).getAccessright.getOpenAccessRoute == null)
@ -68,11 +88,41 @@ class TestApply extends java.io.Serializable{
    assertTrue(pb.getBestaccessright.getClassname.equals("not available"))

  } else {
    assertTrue(
      pa.getInstance()
        .get(0)
        .getHostedby
        .getKey
        .equals(pb.getInstance().get(0).getHostedby.getKey)
    )
    assertTrue(
      pa.getInstance()
        .get(0)
        .getHostedby
        .getValue
        .equals(pb.getInstance().get(0).getHostedby.getValue)
    )
    assertTrue(
      pa.getInstance()
        .get(0)
        .getAccessright
        .getClassid
        .equals(pb.getInstance().get(0).getAccessright.getClassid)
    )
    assertTrue(
      pa.getInstance()
        .get(0)
        .getAccessright
        .getClassname
        .equals(pb.getInstance().get(0).getAccessright.getClassname)
    )
    assertTrue(
      pa.getInstance().get(0).getAccessright.getOpenAccessRoute == pb
        .getInstance()
        .get(0)
        .getAccessright
        .getOpenAccessRoute
    )
  }
})
@ -80,7 +130,6 @@ class TestApply extends java.io.Serializable{
spark.close()
}

@Test
def testApplyOnDatasource(): Unit = {
  val conf = new SparkConf()
@ -100,10 +149,11 @@ class TestApply extends java.io.Serializable{
  implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
  implicit val mapEncoderPubInfo: Encoder[Datasource] = Encoders.bean(classOf[Datasource])

  val dats_ds: Dataset[Datasource] =
    spark.read.textFile(dats).map(p => mapper.readValue(p, classOf[Datasource]))
  val hbm_ds: Dataset[EntityInfo] = Aggregators.datasourceToSingleId(
    spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
  )

  assertEquals(10, dats_ds.count())
@ -111,7 +161,8 @@ class TestApply extends java.io.Serializable{
  assertEquals(10, ds.count)

  val temp: Dataset[(Datasource, Datasource)] =
    dats_ds.joinWith(ds, dats_ds.col("id").equalTo(ds.col("id")), "left")
  assertEquals(10, temp.count())
  temp.foreach(t2 => {
    val pb: Datasource = t2._1
@ -119,14 +170,23 @@ class TestApply extends java.io.Serializable{
    assertTrue(t2._1.getId.equals(t2._2.getId))
    if (pb.getId.equals("10|doajarticles::0ab37b7620eb9a73ac95d3ca4320c97d")) {
      assertTrue(pa.getOpenairecompatibility().getClassid.equals("hostedBy"))
      assertTrue(
        pa.getOpenairecompatibility()
          .getClassname
          .equals("collected from a compatible aggregator")
      )
      assertTrue(pb.getOpenairecompatibility().getClassid.equals(ModelConstants.UNKNOWN))
    } else {
      assertTrue(
        pa.getOpenairecompatibility().getClassid.equals(pb.getOpenairecompatibility.getClassid)
      )
      assertTrue(
        pa.getOpenairecompatibility()
          .getClassname
          .equals(pb.getOpenairecompatibility.getClassname)
      )
    }
  })

@ -19,7 +19,6 @@ class TestPrepare extends java.io.Serializable{
    write(input)
  }

  @Test
  def testHostedByMaptoEntityInfo(): Unit = {
    val conf = new SparkConf()
@ -33,14 +32,14 @@ class TestPrepare extends java.io.Serializable{
      .getOrCreate()
    val hbm = getClass.getResource("hostedbymap.json").getPath

    import spark.implicits._

    val mapper: ObjectMapper = new ObjectMapper()

    implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])

    val ds: Dataset[EntityInfo] =
      spark.createDataset(spark.sparkContext.textFile(hbm)).map(toEntityInfo)

    ds.foreach(e => println(mapper.writeValueAsString(e)))
@ -71,8 +70,14 @@ class TestPrepare extends java.io.Serializable{
    assertEquals(2, ds.count)

    assertEquals(
      "50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9",
      ds.filter(ei => ei.getJournalId.equals("1728-5852")).first().getId
    )
    assertEquals(
      "50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9",
      ds.filter(ei => ei.getJournalId.equals("0001-396X")).first().getId
    )

    spark.close()
  }
@ -95,8 +100,10 @@ class TestPrepare extends java.io.Serializable{
    implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])

    val pub_ds: Dataset[EntityInfo] =
      spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
    val hbm_ds: Dataset[EntityInfo] =
      spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))

    val ds: Dataset[EntityInfo] = joinResHBM(pub_ds, hbm_ds)
@ -131,8 +138,10 @@ class TestPrepare extends java.io.Serializable{
    implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])

    val pub_ds: Dataset[EntityInfo] =
      spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
    val hbm_ds: Dataset[EntityInfo] =
      spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))

    val ds: Dataset[EntityInfo] = joinResHBM(pub_ds, hbm_ds)
@ -150,6 +159,4 @@ class TestPrepare extends java.io.Serializable{
    spark.close()
  }
}

@ -13,7 +13,6 @@ class TestPreprocess extends java.io.Serializable{
  implicit val mapEncoderDats: Encoder[Datasource] = Encoders.kryo[Datasource]
  implicit val schema = Encoders.product[HostedByInfo]

  def toHBIString(hbi: HostedByItemType): String = {
    implicit val formats = DefaultFormats
@ -41,19 +40,30 @@ class TestPreprocess extends java.io.Serializable{
    assertEquals(5, ds.filter(hbi => !hbi.eissn.equals("")).count)
    assertEquals(0, ds.filter(hbi => !hbi.lissn.equals("")).count)

    assertEquals(
      0,
      ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count
    )

    assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365")).count == 1)
    assertTrue(ds.filter(hbi => hbi.eissn.equals("2253-900X")).count == 1)
    assertTrue(
      ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.eissn.equals("2253-900X")).count == 1
    )
    assertTrue(
      ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.officialname.equals("Thémata")).count == 1
    )
    assertTrue(
      ds.filter(hbi =>
        hbi.issn.equals("0212-8365") && hbi.id
          .equals("10|doajarticles::abbc9265bea9ff62776a1c39785af00c")
      ).count == 1
    )
    ds.foreach(hbi => assertTrue(hbi.id.startsWith("10|")))
    ds.foreach(hbi => println(toHBIString(hbi)))

    spark.close()
  }

  @Test
  def readGold(): Unit = {
    val conf = new SparkConf()
@ -67,7 +77,6 @@ class TestPreprocess extends java.io.Serializable{
      .getOrCreate()
    val path = getClass.getResource("unibi_transformed.json").getPath

    val ds: Dataset[HostedByItemType] = SparkProduceHostedByMap.goldHostedByDataset(spark, path)

    assertEquals(29, ds.count)
@ -76,9 +85,17 @@ class TestPreprocess extends java.io.Serializable{
    assertEquals(0, ds.filter(hbi => !hbi.eissn.equals("")).count)
    assertEquals(29, ds.filter(hbi => !hbi.lissn.equals("")).count)

    assertEquals(
      0,
      ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count
    )

    assertTrue(
      ds.filter(hbi => hbi.issn.equals("2239-6101"))
        .first()
        .officialname
        .equals("European journal of sustainable development.")
    )
    assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).first().lissn.equals("2239-5938"))
    assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).count == 1)
    ds.foreach(hbi => assertTrue(hbi.id.equals(Constants.UNIBI)))
@ -108,9 +125,17 @@ class TestPreprocess extends java.io.Serializable{
    assertEquals(21, ds.filter(hbi => !hbi.eissn.equals("")).count)
    assertEquals(0, ds.filter(hbi => !hbi.lissn.equals("")).count)

    assertEquals(
      0,
      ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count
    )

    assertTrue(
      ds.filter(hbi => hbi.issn.equals("2077-3099"))
        .first()
        .officialname
        .equals("Journal of Space Technology")
    )
    assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).first().eissn.equals("2411-5029"))
    assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).count == 1)
    assertTrue(ds.filter(hbi => hbi.eissn.equals("2077-2955")).first().issn.equals(""))
@ -133,20 +158,38 @@ class TestPreprocess extends java.io.Serializable{
      .config(conf)
      .getOrCreate()

    val tmp = SparkProduceHostedByMap
      .oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath)
      .union(
        SparkProduceHostedByMap
          .goldHostedByDataset(spark, getClass.getResource("unibi_transformed.json").getPath)
      )
      .union(
        SparkProduceHostedByMap
          .doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath)
      )
      .flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(
        Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
      )
    assertEquals(106, tmp.count)
    assertEquals(82, tmp.map(i => i._1)(Encoders.STRING).distinct().count)

    val ds: Dataset[(String, HostedByItemType)] = Aggregators.explodeHostedByItemType(
      SparkProduceHostedByMap
        .oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath)
        .union(
          SparkProduceHostedByMap
            .goldHostedByDataset(spark, getClass.getResource("unibi_transformed.json").getPath)
        )
        .union(
          SparkProduceHostedByMap
            .doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath)
        )
        .flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(
          Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
        )
    )
    assertEquals(82, ds.count)
@ -156,14 +199,13 @@ class TestPreprocess extends java.io.Serializable{
    assertTrue(ds.filter(i => i._1.equals("2077-3757")).first()._2.openAccess)
    assertEquals(1, ds.filter(i => i._1.equals("2077-3757")).count)

    val hbmap: Dataset[String] = ds
      .filter(hbi => hbi._2.id.startsWith("10|"))
      .map(SparkProduceHostedByMap.toHostedByMap)(Encoders.STRING)

    hbmap.foreach(entry => println(entry))

    spark.close()
  }
}

@ -1,6 +1,5 @@
package eu.dnetlib.dhp.oa.graph.resolution

import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.common.EntityType
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
@ -26,65 +25,86 @@ class ResolveEntitiesTest extends Serializable {
  var sparkSession: Option[SparkSession] = None

  @BeforeAll
  def setUp(): Unit = {
    workingDir = Files.createTempDirectory(getClass.getSimpleName)
    val conf = new SparkConf()
    sparkSession = Some(
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master("local[*]")
        .getOrCreate()
    )
    populateDatasets(sparkSession.get)
    generateUpdates(sparkSession.get)

  }

  @AfterAll
  def tearDown(): Unit = {
    FileUtils.deleteDirectory(workingDir.toFile)
    sparkSession.get.stop()
  }

  def generateUpdates(spark: SparkSession): Unit = {
    val template = Source.fromInputStream(this.getClass.getResourceAsStream("updates")).mkString

    val pids: List[String] = template.lines
      .map { id =>
        val r = new Result
        r.setId(id.toLowerCase.trim)
        r.setSubject(
          List(
            OafMapperUtils.structuredProperty(
              FAKE_SUBJECT,
              OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
              null
            )
          ).asJava
        )
        r.setTitle(
          List(
            OafMapperUtils.structuredProperty(
              FAKE_TITLE,
              OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
              null
            )
          ).asJava
        )
        r
      }
      .map { r =>
        val mapper = new ObjectMapper()
        mapper.writeValueAsString(r)
      }
      .toList

    val sc = spark.sparkContext

    println(sc.parallelize(pids).count())

    spark
      .createDataset(sc.parallelize(pids))(Encoders.STRING)
      .write
      .mode(SaveMode.Overwrite)
      .option("compression", "gzip")
      .text(s"$workingDir/updates")

    import spark.implicits._
    implicit val resEncoder: Encoder[Result] = Encoders.bean(classOf[Result])
    val ds = spark.read
      .text(s"$workingDir/updates")
      .as[String]
      .map { s =>
        val mapper = new ObjectMapper()
        mapper.readValue(s, classOf[Result])
      }
      .collect()

    assertEquals(4, ds.length)
    ds.foreach { r => assertNotNull(r.getSubject) }
@ -92,30 +112,36 @@ class ResolveEntitiesTest extends Serializable {
    ds.foreach { r => assertNotNull(r.getTitle) }
    ds.foreach { r => assertEquals(1, r.getTitle.size()) }

    ds.flatMap(r => r.getTitle.asScala.map(t => t.getValue))
      .foreach(t => assertEquals(FAKE_TITLE, t))
    ds.flatMap(r => r.getSubject.asScala.map(t => t.getValue))
      .foreach(t => assertEquals(FAKE_SUBJECT, t))

    println("generated Updates")
  }

  def populateDatasets(spark: SparkSession): Unit = {
    import spark.implicits._
    val entities = SparkResolveEntities.entities

    entities.foreach { e =>
      val template = Source.fromInputStream(this.getClass.getResourceAsStream(s"$e")).mkString
      spark
        .createDataset(spark.sparkContext.parallelize(template.lines.toList))
        .as[String]
        .write
        .option("compression", "gzip")
        .text(s"$workingDir/graph/$e")
      println(s"Created Dataset $e")
    }
    SparkResolveRelation.extractPidResolvedTableFromJsonRDD(
      spark,
      s"$workingDir/graph",
      s"$workingDir/work"
    )

  }

  @Test
  def testResolution(): Unit = {
    val spark: SparkSession = sparkSession.get
@ -126,16 +152,15 @@ class ResolveEntitiesTest extends Serializable {
    assertEquals(3, ds.count())

    ds.collect().foreach { r =>
      assertTrue(r.getId.startsWith("50"))
    }
  }

  private def structuredPContainsValue(
    l: java.util.List[StructuredProperty],
    exptectedValue: String
  ): Boolean = {
    l.asScala.exists(p => p.getValue != null && p.getValue.equalsIgnoreCase(exptectedValue))
  }
@ -146,47 +171,72 @@ class ResolveEntitiesTest extends Serializable {
    implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
    val m = new ObjectMapper()

    SparkResolveEntities.resolveEntities(spark, s"$workingDir/work", s"$workingDir/updates")
    SparkResolveEntities.generateResolvedEntities(
      spark,
      s"$workingDir/work",
      s"$workingDir/graph",
      s"$workingDir/target"
    )

    val pubDS: Dataset[Result] = spark.read
      .text(s"$workingDir/target/publication")
      .as[String]
      .map(s => SparkResolveEntities.deserializeObject(s, EntityType.publication))
    val t = pubDS
      .filter(p => p.getTitle != null && p.getSubject != null)
      .filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE")))
      .count()

    var ct = pubDS.count()
    var et = pubDS
      .filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty))
      .count()

    assertEquals(ct, et)

    val datDS: Dataset[Result] = spark.read
      .text(s"$workingDir/target/dataset")
      .as[String]
      .map(s => SparkResolveEntities.deserializeObject(s, EntityType.dataset))
    val td = datDS
      .filter(p => p.getTitle != null && p.getSubject != null)
      .filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE")))
      .count()

    ct = datDS.count()
    et = datDS
      .filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty))
      .count()

    assertEquals(ct, et)

    val softDS: Dataset[Result] = spark.read
      .text(s"$workingDir/target/software")
      .as[String]
      .map(s => SparkResolveEntities.deserializeObject(s, EntityType.software))
    val ts = softDS
      .filter(p => p.getTitle != null && p.getSubject != null)
      .filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE")))
      .count()

    ct = softDS.count()
    et = softDS
      .filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty))
      .count()
    assertEquals(ct, et)

    val orpDS: Dataset[Result] = spark.read
      .text(s"$workingDir/target/otherresearchproduct")
      .as[String]
      .map(s => SparkResolveEntities.deserializeObject(s, EntityType.otherresearchproduct))
    val to = orpDS
      .filter(p => p.getTitle != null && p.getSubject != null)
      .filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE")))
      .count()

    ct = orpDS.count()
    et = orpDS
      .filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty))
      .count()
    assertEquals(ct, et)

    assertEquals(0, t)
    assertEquals(2, td)
    assertEquals(1, ts)
@ -194,40 +244,35 @@ class ResolveEntitiesTest extends Serializable {
  }

  @Test
  def testMerge(): Unit = {

    val r = new Result
    r.setSubject(
      List(
        OafMapperUtils.structuredProperty(
          FAKE_SUBJECT,
          OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
          null
        )
      ).asJava
    )

    val mapper = new ObjectMapper()

    val p = mapper.readValue(
      Source
        .fromInputStream(this.getClass.getResourceAsStream(s"publication"))
        .mkString
        .lines
        .next(),
      classOf[Publication]
    )

    r.mergeFrom(p)

    println(mapper.writeValueAsString(r))
  }
}

@ -1,26 +1,20 @@
package eu.dnetlib.dhp.sx.graph

import org.junit.jupiter.api.Test

import java.text.SimpleDateFormat

class RetrieveDataciteDeltaTest {

  @Test
  def testParsingDate(): Unit = {

    val inputDate = "2021-12-02T11:17:36+0000"

    val t = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ").parse(inputDate).getTime
    println(t)

  }
}
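For comparison only (not part of the patch), the same timestamp can be parsed with java.time; assuming the format string behaves as in the test above, the epoch milliseconds printed here should match SimpleDateFormat's result.

import java.time.OffsetDateTime
import java.time.format.DateTimeFormatter

// Parse the same fixed timestamp used by the test and convert it to epoch milliseconds.
val millis = OffsetDateTime
  .parse("2021-12-02T11:17:36+0000", DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ssZ"))
  .toInstant
  .toEpochMilli
println(millis)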

@ -20,7 +20,6 @@ import scala.io.Source
@ExtendWith(Array(classOf[MockitoExtension]))
class ScholixGraphTest extends AbstractVocabularyTest {

  val mapper: ObjectMapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
  mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
@ -30,11 +29,12 @@ class ScholixGraphTest extends AbstractVocabularyTest{
    super.setUpVocabulary()
  }

  @Test
  def testExtractPids(): Unit = {

    val input = Source
      .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/scholix/result.json"))
      .mkString
    val res = SparkResolveRelation.extractPidsFromRecord(input)
    assertNotNull(res)
@ -44,11 +44,14 @@ class ScholixGraphTest extends AbstractVocabularyTest{
  @Test
  def testOAFToSummary(): Unit = {
    val inputRelations = Source
      .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/oaf_to_summary"))
      .mkString
    val items = inputRelations.lines.toList
    assertNotNull(items)
    items.foreach(i => assertTrue(i.nonEmpty))
    val result =
      items.map(r => mapper.readValue(r, classOf[Result])).map(i => ScholixUtils.resultToSummary(i))

    assertNotNull(result)
@ -59,12 +62,18 @@ class ScholixGraphTest extends AbstractVocabularyTest{
  }

  @Test
  def testScholixMergeOnSource(): Unit = {
    val inputRelations = Source
      .fromInputStream(
        getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/merge_result_scholix")
      )
      .mkString
    val result: List[(Relation, ScholixSummary)] = inputRelations.lines
      .sliding(2)
      .map(s => (s.head, s(1)))
      .map(p => (mapper.readValue(p._1, classOf[Relation]), mapper.readValue(p._2, classOf[ScholixSummary])))
      .toList
    assertNotNull(result)
    assertTrue(result.nonEmpty)
    result.foreach(r => assertEquals(r._1.getSource, r._2.getId))
@ -72,12 +81,13 @@ class ScholixGraphTest extends AbstractVocabularyTest{
    println(mapper.writeValueAsString(scholix.head))
  }

  @Test
  def testScholixRelationshipsClean(): Unit = {
    val inputRelations = Source
      .fromInputStream(
        getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/relation_transform.json")
      )
      .mkString
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: json4s.JValue = parse(inputRelations)
@ -89,7 +99,4 @@ class ScholixGraphTest extends AbstractVocabularyTest{
  }
}

pom.xml
@ -620,6 +620,18 @@
</dependency>
</dependencies>
</plugin>
<plugin>
<groupId>org.antipathy</groupId>
<artifactId>mvn-scalafmt_2.11</artifactId>
<version>1.0.1640073709.733712b</version>
<dependencies>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-code-style</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
</plugin>
</plugins>
</pluginManagement>
<plugins>
@ -665,6 +677,33 @@
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.antipathy</groupId>
<artifactId>mvn-scalafmt_2.11</artifactId>
<configuration>
<configLocation>dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf</configLocation>
<skipTestSources>false</skipTestSources>
<skipSources>false</skipSources>
<sourceDirectories>
<param>${project.basedir}/src/main/scala</param>
</sourceDirectories>
<testSourceDirectories>
<param>${project.basedir}/src/test/scala</param>
</testSourceDirectories>
<validateOnly>false</validateOnly>
<onlyChangedFiles>false</onlyChangedFiles>
<branch>: git rev-parse --abbrev-ref HEAD</branch>
<useSpecifiedRepositories>false</useSpecifiedRepositories>
</configuration>
<executions>
<execution>
<phase>validate</phase>
<goals>
<goal>format</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
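Note on the scalafmt plugin added above: the execution binds the format goal to the validate phase, so in a standard Maven setup formatting should run at the start of every module build. Running something like mvn validate on a module would then rewrite its sources under src/main/scala and src/test/scala in place, using the scalafmt.conf shipped in dhp-build/dhp-code-style (pulled in through the plugin's dependency on the dhp-code-style artifact).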