1
0
Fork 0

Merge pull request 'scalafmt: code style for scala' (#184) from scalafmt into beta

Reviewed-on: D-Net/dnet-hadoop#184
This commit is contained in:
Sandro La Bruzzo 2022-01-12 09:58:39 +01:00
commit 1b9e8378b3
67 changed files with 4841 additions and 2629 deletions

View File

@ -0,0 +1,21 @@
# scalafmt formatter configuration.
style = defaultWithAlign
align.openParenCallSite = false
align.openParenDefnSite = false
# Alignment is limited to these tokens: `->`, `<-`, and `=>` when its owner is a Case.
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
continuationIndent.callSite = 2
continuationIndent.defnSite = 2
danglingParentheses = true
indentOperator = spray
maxColumn = 120
newlines.alwaysBeforeTopLevelStatements = true
# Regex filter on file paths; this one matches any path ending in ".sbt".
project.excludeFilters = [".*\\.sbt"]
# NOTE(review): `rewrite.rules` is assigned seven times below. In HOCON a later
# assignment to the same key replaces the earlier value (arrays are not merged),
# so presumably only the last line, [PreferCurlyFors], is actually in effect -- confirm.
# If all seven rules are intended, merge them into a single list, but first check
# scalafmt's rewrite validation: some combinations (e.g. SortImports together with
# ExpandImportSelectors) may be rejected as incompatible.
rewrite.rules = [AvoidInfix]
rewrite.rules = [ExpandImportSelectors]
rewrite.rules = [RedundantBraces]
rewrite.rules = [RedundantParens]
rewrite.rules = [SortImports]
rewrite.rules = [SortModifiers]
rewrite.rules = [PreferCurlyFors]
spaces.inImportCurlyBraces = false
unindentTopLevelOperators = true

View File

@ -2,58 +2,57 @@ package eu.dnetlib.dhp.application
import scala.io.Source
/**
* This is the main Interface SparkApplication
/** This is the main Interface SparkApplication
* from which all the Spark Scala classes should inherit
*
*/
trait SparkScalaApplication {
/**
* This is the path in the classpath of the json
/** This is the path in the classpath of the json
* that describes all the arguments needed to run
*/
val propertyPath: String
/**
* Utility to parse the arguments using the
/** Utility to parse the arguments using the
* property json in the classpath identified from
* the variable propertyPath
*
* @param args the list of arguments
*/
def parseArguments(args: Array[String]): ArgumentApplicationParser = {
  // Fail with an explicit message when the JSON descriptor is not on the classpath,
  // rather than with a NullPointerException raised later while reading a null stream.
  val stream = getClass.getResourceAsStream(propertyPath)
  require(stream != null, s"Argument descriptor not found in classpath: $propertyPath")

  // Read the whole descriptor, then always release the underlying stream,
  // even when reading fails.
  val source = Source.fromInputStream(stream)
  val jsonConfiguration =
    try source.mkString
    finally source.close()

  // Build the parser from the JSON descriptor and feed it the command-line arguments.
  val parser = new ArgumentApplicationParser(jsonConfiguration)
  parser.parseArgument(args)
  parser
}
/**
* Here all the spark applications runs this method
/** Here all the spark applications runs this method
* where the whole logic of the spark node is defined
*/
def run(): Unit
}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.slf4j.Logger
abstract class AbstractScalaApplication (val propertyPath:String, val args:Array[String], log:Logger) extends SparkScalaApplication {
abstract class AbstractScalaApplication(
val propertyPath: String,
val args: Array[String],
log: Logger
) extends SparkScalaApplication {
var parser: ArgumentApplicationParser = null
var spark: SparkSession = null
def initialize(): SparkScalaApplication = {
parser = parseArguments(args)
spark = createSparkSession()
this
}
/**
* Utility for creating a spark session starting from parser
/** Utility for creating a spark session starting from parser
*
* @return a spark Session
*/
@ -63,7 +62,9 @@ abstract class AbstractScalaApplication (val propertyPath:String, val args:Array
val conf: SparkConf = new SparkConf()
val master = parser.get("master")
log.info(s"Creating Spark session: Master: $master")
SparkSession.builder().config(conf)
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(master)
.getOrCreate()

View File

@ -14,7 +14,6 @@ import scala.io.Source
object ScholixUtils extends Serializable {
val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier"
val DATE_RELATION_KEY: String = "RelationDate"
@ -24,7 +23,11 @@ object ScholixUtils extends Serializable {
case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {}
val relations: Map[String, RelationVocabulary] = {
val input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")).mkString
val input = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")
)
.mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
@ -32,13 +35,14 @@ object ScholixUtils extends Serializable {
json.extract[Map[String, RelationVocabulary]]
}
def extractRelationDate(relation: Relation): String = {
if (relation.getProperties == null || !relation.getProperties.isEmpty)
null
else {
val date = relation.getProperties.asScala.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey)).map(p => p.getValue)
val date = relation.getProperties.asScala
.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey))
.map(p => p.getValue)
if (date.isDefined)
date.get
else
@ -58,16 +62,14 @@ object ScholixUtils extends Serializable {
def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = {
new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName)
}
def generateScholixResourceFromResult(r: Result): ScholixResource = {
generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
}
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable {
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable {
override def zero: RelatedEntities = null
override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = {
@ -77,13 +79,20 @@ object ScholixUtils extends Serializable {
if (b == null)
RelatedEntities(a._1, relatedDataset, relatedPublication)
else
RelatedEntities(a._1, b.relatedDataset + relatedDataset, b.relatedPublication + relatedPublication)
RelatedEntities(
a._1,
b.relatedDataset + relatedDataset,
b.relatedPublication + relatedPublication
)
}
override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = {
if (b1 != null && b2 != null)
RelatedEntities(b1.id, b1.relatedDataset + b2.relatedDataset, b1.relatedPublication + b2.relatedPublication)
RelatedEntities(
b1.id,
b1.relatedDataset + b2.relatedDataset,
b1.relatedPublication + b2.relatedPublication
)
else if (b1 != null)
b1
else
@ -97,18 +106,16 @@ object ScholixUtils extends Serializable {
override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
}
val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] = new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable {
val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] =
new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable {
override def zero: Scholix = null
def scholix_complete(s: Scholix): Boolean = {
if (s == null || s.getIdentifier == null) {
false
} else if (s.getSource == null || s.getTarget == null) {
false
}
else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
} else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
false
else
true
@ -129,7 +136,6 @@ object ScholixUtils extends Serializable {
override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
}
def createInverseScholixRelation(scholix: Scholix): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
@ -138,16 +144,19 @@ object ScholixUtils extends Serializable {
s.setRelationship(inverseRelationShip(scholix.getRelationship))
s.setSource(scholix.getTarget)
s.setTarget(scholix.getSource)
s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
s.setIdentifier(
DHPUtils.md5(
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
)
)
s
}
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map {
d => new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers)
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers)
}(collection.breakOut)
l
} else List()
@ -155,8 +164,11 @@ object ScholixUtils extends Serializable {
def extractCollectedFrom(summary: ScholixSummary): List[ScholixEntityId] = {
if (summary.getDatasources != null && !summary.getDatasources.isEmpty) {
val l: List[ScholixEntityId] = summary.getDatasources.asScala.map {
d => new ScholixEntityId(d.getDatasourceName, List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava)
val l: List[ScholixEntityId] = summary.getDatasources.asScala.map { d =>
new ScholixEntityId(
d.getDatasourceName,
List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava
)
}(collection.breakOut)
l
} else List()
@ -165,17 +177,16 @@ object ScholixUtils extends Serializable {
def extractCollectedFrom(relation: Relation): List[ScholixEntityId] = {
  // Turns every collectedfrom key/value of the relation into a ScholixEntityId:
  // the value becomes the entity name and the key becomes its single identifier,
  // tagged with DNET_IDENTIFIER_SCHEMA. A null or empty list yields an empty List.
  val sources = relation.getCollectedfrom
  if (sources == null || sources.isEmpty)
    List()
  else
    sources.asScala.toList.map { kv =>
      val identifier = new ScholixIdentifier(kv.getKey, DNET_IDENTIFIER_SCHEMA, null)
      new ScholixEntityId(kv.getValue, List(identifier).asJava)
    }
}
def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
@ -184,11 +195,14 @@ object ScholixUtils extends Serializable {
s.setRelationship(scholix.getRelationship)
s.setSource(scholix.getSource)
s.setTarget(generateScholixResourceFromSummary(target))
s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
s.setIdentifier(
DHPUtils.md5(
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
)
)
s
}
def generateCompleteScholix(scholix: Scholix, target: ScholixResource): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
@ -197,11 +211,14 @@ object ScholixUtils extends Serializable {
s.setRelationship(scholix.getRelationship)
s.setSource(scholix.getSource)
s.setTarget(target)
s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"))
s.setIdentifier(
DHPUtils.md5(
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
)
)
s
}
def generateScholixResourceFromSummary(summaryObject: ScholixSummary): ScholixResource = {
val r = new ScholixResource
r.setIdentifier(summaryObject.getLocalIdentifier)
@ -214,7 +231,8 @@ object ScholixUtils extends Serializable {
r.setTitle(summaryObject.getTitle.get(0))
if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) {
val l: List[ScholixEntityId] = summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
val l: List[ScholixEntityId] =
summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
if (l.nonEmpty)
r.setCreator(l.asJava)
}
@ -222,20 +240,27 @@ object ScholixUtils extends Serializable {
if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty)
r.setPublicationDate(summaryObject.getDate.get(0))
if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) {
val plist: List[ScholixEntityId] = summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
val plist: List[ScholixEntityId] =
summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
if (plist.nonEmpty)
r.setPublisher(plist.asJava)
}
if (summaryObject.getDatasources != null && !summaryObject.getDatasources.isEmpty) {
val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala.map(c => new ScholixCollectedFrom(
new ScholixEntityId(c.getDatasourceName, List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava)
, "collected", "complete"
)).toList
val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala
.map(c =>
new ScholixCollectedFrom(
new ScholixEntityId(
c.getDatasourceName,
List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava
),
"collected",
"complete"
)
)
.toList
if (l.nonEmpty)
r.setCollectedFrom(l.asJava)
@ -244,8 +269,6 @@ object ScholixUtils extends Serializable {
r
}
def scholixFromSource(relation: Relation, source: ScholixResource): Scholix = {
if (relation == null || source == null)
return null
@ -262,7 +285,6 @@ object ScholixUtils extends Serializable {
s.setPublicationDate(d)
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
s.setPublisher(source.getPublisher)
}
@ -270,13 +292,14 @@ object ScholixUtils extends Serializable {
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
if (semanticRelation == null)
return null
s.setRelationship(new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse))
s.setRelationship(
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
)
s.setSource(source)
s
}
def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = {
if (relation == null || source == null)
@ -298,11 +321,9 @@ object ScholixUtils extends Serializable {
s.setPublicationDate(d)
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
val l: List[ScholixEntityId] = source.getPublisher.asScala
.map {
p =>
.map { p =>
new ScholixEntityId(p, null)
}(collection.breakOut)
@ -313,16 +334,19 @@ object ScholixUtils extends Serializable {
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
if (semanticRelation == null)
return null
s.setRelationship(new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse))
s.setRelationship(
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
)
s.setSource(generateScholixResourceFromSummary(source))
s
}
def findURLForPID(pidValue: List[StructuredProperty], urls: List[String]): List[(StructuredProperty, String)] = {
pidValue.map {
p =>
def findURLForPID(
pidValue: List[StructuredProperty],
urls: List[String]
): List[(StructuredProperty, String)] = {
pidValue.map { p =>
val pv = p.getValue
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
@ -330,14 +354,17 @@ object ScholixUtils extends Serializable {
}
}
def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
  // Builds the distinct list of identifiers found in the instances of the result.
  // Only instances with a non-empty URL list and a non-null PID list are considered;
  // each (pid, url) pair returned by findURLForPID becomes one ScholixIdentifier made of
  // the pid value, the classid of its qualifier and the url.
  val instances = r.getInstance()
  if (instances == null || instances.isEmpty)
    List()
  else
    instances.asScala
      .filter(i => i.getUrl != null && !i.getUrl.isEmpty && i.getPid != null)
      .flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
      .map { case (pid, url) =>
        new ScholixIdentifier(pid.getValue, pid.getQualifier.getClassid, url)
      }
      .distinct
      .toList
}
def resultToSummary(r: Result): ScholixSummary = {
@ -371,7 +398,12 @@ object ScholixUtils extends Serializable {
s.setAuthor(authors.asJava)
}
if (r.getInstance() != null) {
val dt: List[String] = r.getInstance().asScala.filter(i => i.getDateofacceptance != null).map(i => i.getDateofacceptance.getValue).toList
val dt: List[String] = r
.getInstance()
.asScala
.filter(i => i.getDateofacceptance != null)
.map(i => i.getDateofacceptance.getValue)
.toList
if (dt.nonEmpty)
s.setDate(dt.distinct.asJava)
}
@ -382,7 +414,9 @@ object ScholixUtils extends Serializable {
}
if (r.getSubject != null && !r.getSubject.isEmpty) {
val subjects: List[SchemeValue] = r.getSubject.asScala.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue)).toList
val subjects: List[SchemeValue] = r.getSubject.asScala
.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))
.toList
if (subjects.nonEmpty)
s.setSubject(subjects.asJava)
}
@ -391,7 +425,9 @@ object ScholixUtils extends Serializable {
s.setPublisher(List(r.getPublisher.getValue).asJava)
if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) {
val cf: List[CollectedFromType] = r.getCollectedfrom.asScala.map(c => new CollectedFromType(c.getValue, c.getKey, "complete")).toList
val cf: List[CollectedFromType] = r.getCollectedfrom.asScala
.map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))
.toList
if (cf.nonEmpty)
s.setDatasources(cf.distinct.asJava)
}

View File

@ -7,15 +7,13 @@ import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode}
object CollectionUtils {
/**
* This method in pipeline to the transformation phase,
/** This method in pipeline to the transformation phase,
* generates relations in both directions; typically it should be a flatMap phase
*
* @param i input OAF
* @return
* If the input OAF is an entity -> List(i)
* If the input OAF is a relation -> List(relation, inverseRelation)
*
*/
def fixRelations(i: Oaf): List[Oaf] = {

View File

@ -6,7 +6,6 @@ import org.apache.http.client.methods.{HttpGet, HttpPost, HttpUriRequest}
import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.HttpClientBuilder
abstract class AbstractRestClient extends Iterator[String] {
var buffer: List[String] = List()
@ -16,12 +15,10 @@ abstract class AbstractRestClient extends Iterator[String] {
var complete: Boolean = false
def extractInfo(input: String): Unit
protected def getBufferData(): Unit
def doHTTPGETRequest(url: String): String = {
val httpGet = new HttpGet(url)
doHTTPRequest(httpGet)
@ -43,7 +40,6 @@ abstract class AbstractRestClient extends Iterator[String] {
buffer.nonEmpty && current_index < buffer.size
}
override def next(): String = {
val next_item: String = buffer(current_index)
current_index = current_index + 1
@ -52,13 +48,14 @@ abstract class AbstractRestClient extends Iterator[String] {
next_item
}
private def doHTTPRequest[A <: HttpUriRequest](r: A): String = {
val timeout = 60; // seconds
val config = RequestConfig.custom()
val config = RequestConfig
.custom()
.setConnectTimeout(timeout * 1000)
.setConnectionRequestTimeout(timeout * 1000)
.setSocketTimeout(timeout * 1000).build()
.setSocketTimeout(timeout * 1000)
.build()
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
try {
var tries = 4
@ -69,8 +66,7 @@ abstract class AbstractRestClient extends Iterator[String] {
println(s"get response with status${response.getStatusLine.getStatusCode}")
if (response.getStatusLine.getStatusCode > 400) {
tries -= 1
}
else
} else
return IOUtils.toString(response.getEntity.getContent)
} catch {
case e: Throwable =>

View File

@ -24,7 +24,9 @@ class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -
override def getBufferData(): Unit = {
  // Nothing is fetched once the importer has been marked as complete.
  if (!complete) {
    // Use the stored scroll value when one is defined; otherwise build the URL with
    // get_url(), which is evaluated only in that case.
    val requestUrl = scroll_value.getOrElse(get_url())
    extractInfo(doHTTPGETRequest(requestUrl))
  }
}

View File

@ -10,8 +10,7 @@ import java.util.Locale
import java.util.regex.Pattern
import scala.io.Source
/**
* This class represent the dataModel of the input Dataset of Datacite
/** This class represent the dataModel of the input Dataset of Datacite
* @param doi THE DOI
* @param timestamp timestamp of last update date
* @param isActive the record is active or deleted
@ -23,11 +22,26 @@ case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: S
The following classes are utility classes used for the mapping from
Datacite JSON to the OAF schema
*/
case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {}
case class RelatedIdentifierType(
relationType: String,
relatedIdentifier: String,
relatedIdentifierType: String
) {}
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
case class NameIdentifiersType(
nameIdentifierScheme: Option[String],
schemeUri: Option[String],
nameIdentifier: Option[String]
) {}
case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
case class CreatorType(
nameType: Option[String],
nameIdentifiers: Option[List[NameIdentifiersType]],
name: Option[String],
familyName: Option[String],
givenName: Option[String],
affiliation: Option[List[String]]
) {}
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
@ -35,16 +49,20 @@ case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
case class FundingReferenceType(
funderIdentifierType: Option[String],
awardTitle: Option[String],
awardUri: Option[String],
funderName: Option[String],
funderIdentifier: Option[String],
awardNumber: Option[String]
) {}
case class DateType(date: Option[String], dateType: Option[String]) {}
case class OAFRelations(relation: String, inverse: String, relType: String)
class DataciteModelConstants extends Serializable {
}
class DataciteModelConstants extends Serializable {}
object DataciteModelConstants {
@ -55,51 +73,147 @@ object DataciteModelConstants {
val SUBJ_CLASS = "keywords"
val DATACITE_NAME = "Datacite"
val dataInfo: DataInfo = dataciteDataInfo("0.9")
val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)
val DATACITE_COLLECTED_FROM: KeyValue =
OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)
// Lookup from a relation name to its OAFRelations descriptor (relation, inverse, relType).
// Every relation is listed exactly once as a (relation, inverse, relType) row and the map
// key is derived from the relation itself, so a key can never disagree with the `relation`
// field and no key is silently overwritten by a repeated entry.
// NOTE(review): IS_VARIANT_FORM_OF and IS_OBSOLETED_BY have no reverse row of their own and
// point to IS_DERIVED_FROM / IS_NEW_VERSION_OF as inverse -- confirm this is intended.
val subRelTypeMapping: Map[String, OAFRelations] = List(
  (ModelConstants.REFERENCES, ModelConstants.IS_REFERENCED_BY, ModelConstants.RELATIONSHIP),
  (ModelConstants.IS_REFERENCED_BY, ModelConstants.REFERENCES, ModelConstants.RELATIONSHIP),
  (ModelConstants.IS_SUPPLEMENTED_BY, ModelConstants.IS_SUPPLEMENT_TO, ModelConstants.SUPPLEMENT),
  (ModelConstants.IS_SUPPLEMENT_TO, ModelConstants.IS_SUPPLEMENTED_BY, ModelConstants.SUPPLEMENT),
  (ModelConstants.HAS_PART, ModelConstants.IS_PART_OF, ModelConstants.PART),
  (ModelConstants.IS_PART_OF, ModelConstants.HAS_PART, ModelConstants.PART),
  (ModelConstants.IS_VERSION_OF, ModelConstants.HAS_VERSION, ModelConstants.VERSION),
  (ModelConstants.HAS_VERSION, ModelConstants.IS_VERSION_OF, ModelConstants.VERSION),
  (ModelConstants.IS_IDENTICAL_TO, ModelConstants.IS_IDENTICAL_TO, ModelConstants.RELATIONSHIP),
  (ModelConstants.IS_CONTINUED_BY, ModelConstants.CONTINUES, ModelConstants.RELATIONSHIP),
  (ModelConstants.CONTINUES, ModelConstants.IS_CONTINUED_BY, ModelConstants.RELATIONSHIP),
  (ModelConstants.IS_NEW_VERSION_OF, ModelConstants.IS_PREVIOUS_VERSION_OF, ModelConstants.VERSION),
  (ModelConstants.IS_PREVIOUS_VERSION_OF, ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
  (ModelConstants.IS_DOCUMENTED_BY, ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP),
  (ModelConstants.DOCUMENTS, ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP),
  (ModelConstants.IS_SOURCE_OF, ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
  (ModelConstants.IS_DERIVED_FROM, ModelConstants.IS_SOURCE_OF, ModelConstants.VERSION),
  (ModelConstants.CITES, ModelConstants.IS_CITED_BY, ModelConstants.CITATION),
  (ModelConstants.IS_CITED_BY, ModelConstants.CITES, ModelConstants.CITATION),
  (ModelConstants.IS_VARIANT_FORM_OF, ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
  (ModelConstants.IS_OBSOLETED_BY, ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
  (ModelConstants.REVIEWS, ModelConstants.IS_REVIEWED_BY, ModelConstants.REVIEW),
  (ModelConstants.IS_REVIEWED_BY, ModelConstants.REVIEWS, ModelConstants.REVIEW),
  (ModelConstants.COMPILES, ModelConstants.IS_COMPILED_BY, ModelConstants.RELATIONSHIP),
  (ModelConstants.IS_COMPILED_BY, ModelConstants.COMPILES, ModelConstants.RELATIONSHIP)
).map { case (relation, inverse, relType) =>
  relation -> OAFRelations(relation, inverse, relType)
}.toMap
val datacite_filter: List[String] = {
val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
@ -107,28 +221,58 @@ object DataciteModelConstants {
Source.fromInputStream(stream).getLines().toList
}
def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(
false,
null,
false,
false,
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
trust
)
def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(false,null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, trust)
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern(
"[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
Locale.ENGLISH
)
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
val df_it: DateTimeFormatter =
DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
val funder_regex: List[(Pattern, String)] = List(
(Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
(Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
(
Pattern.compile(
"(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)",
Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
),
"40|corda__h2020::"
),
(
Pattern.compile(
"(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)",
Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
),
"40|corda_______::"
)
)
val Date_regex: List[Pattern] = List(
//Y-M-D
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
Pattern.compile(
"(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])",
Pattern.MULTILINE
),
//M-D-Y
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
Pattern.compile(
"((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
Pattern.MULTILINE
),
//D-M-Y
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
Pattern.compile(
"(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
Pattern.MULTILINE
),
//Y
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
)
}

View File

@ -20,14 +20,11 @@ import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}
import scala.collection.JavaConverters._
object DataciteToOAFTransformation {
val mapper = new ObjectMapper()
/**
* This method should skip record if json contains invalid text
/** This method should skip record if json contains invalid text
* defined in file datacite_filter
*
* @param json
@ -74,30 +71,30 @@ object DataciteToOAFTransformation {
}
/** Tells whether the given embargo end date already lies in the past.
  *
  * @param embargo_end_date the embargo end date, written as `yyyy-MM-dd`
  * @return true when the current local date is strictly after the parsed date
  */
def embargo_end(embargo_end_date: String): Boolean = {
  val embargoFormat = DateTimeFormatter.ofPattern("[yyyy-MM-dd]")
  val embargoDate = LocalDate.parse(embargo_end_date, embargoFormat)
  LocalDate.now().isAfter(embargoDate)
}
def extract_date(input: String): Option[String] = {
val d = Date_regex.map(pattern => {
val d = Date_regex
.map(pattern => {
val matcher = pattern.matcher(input)
if (matcher.find())
matcher.group(0)
else
null
}
).find(s => s != null)
})
.find(s => s != null)
if (d.isDefined) {
val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
try {
return Some(LocalDate.parse(a_date, df_en).toString)
} catch {
case _: Throwable => try {
case _: Throwable =>
try {
return Some(LocalDate.parse(a_date, df_it).toString)
} catch {
case _: Throwable =>
@ -118,31 +115,63 @@ object DataciteToOAFTransformation {
}
}
def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
def getTypeQualifier(
resourceType: String,
resourceTypeGeneral: String,
schemaOrg: String,
vocabularies: VocabularyGroup
): (Qualifier, Qualifier) = {
if (resourceType != null && resourceType.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
val typeQualifier =
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
return (
typeQualifier,
vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_RESULT_TYPOLOGIES,
typeQualifier.getClassid
)
)
}
if (schemaOrg != null && schemaOrg.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
val typeQualifier =
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
return (
typeQualifier,
vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_RESULT_TYPOLOGIES,
typeQualifier.getClassid
)
)
}
if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceTypeGeneral)
val typeQualifier = vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_PUBLICATION_RESOURCE,
resourceTypeGeneral
)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
return (
typeQualifier,
vocabularies.getSynonymAsQualifier(
ModelConstants.DNET_RESULT_TYPOLOGIES,
typeQualifier.getClassid
)
)
}
null
}
def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): Result = {
val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
def getResult(
resourceType: String,
resourceTypeGeneral: String,
schemaOrg: String,
vocabularies: VocabularyGroup
): Result = {
val typeQualifiers: (Qualifier, Qualifier) =
getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
if (typeQualifiers == null)
return null
val i = new Instance
@ -168,7 +197,6 @@ object DataciteToOAFTransformation {
null
}
def available_date(input: String): Boolean = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -182,9 +210,7 @@ object DataciteToOAFTransformation {
}
/**
* As described in ticket #6377
/** As described in ticket #6377
* when the result comes from figshare we need to remove subject
* and set Access rights OPEN.
*
@ -193,7 +219,10 @@ object DataciteToOAFTransformation {
def fix_figshare(r: Result): Unit = {
if (r.getInstance() != null) {
val hosted_by_figshare = r.getInstance().asScala.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
val hosted_by_figshare = r
.getInstance()
.asScala
.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
if (hosted_by_figshare) {
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
val l: List[StructuredProperty] = List()
@ -201,10 +230,8 @@ object DataciteToOAFTransformation {
}
}
}
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
@ -214,7 +241,13 @@ object DataciteToOAFTransformation {
OafMapperUtils.structuredProperty(dt, q, null)
}
def generateRelation(sourceId: String, targetId: String, relClass: String, cf: KeyValue, di: DataInfo): Relation = {
def generateRelation(
sourceId: String,
targetId: String,
relClass: String,
cf: KeyValue,
di: DataInfo
): Relation = {
val r = new Relation
r.setSource(sourceId)
@ -226,7 +259,6 @@ object DataciteToOAFTransformation {
r.setDataInfo(di)
r
}
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
@ -238,14 +270,18 @@ object DataciteToOAFTransformation {
val grantId = m.matcher(awardUri).replaceAll("$2")
val targetId = s"$p${DHPUtils.md5(grantId)}"
List(generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo))
}
else
} else
List()
}
def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = {
def generateOAF(
input: String,
ts: Long,
dateOfCollection: Long,
vocabularies: VocabularyGroup,
exportLinks: Boolean
): List[Oaf] = {
if (skip_record(input))
return List()
@ -253,7 +289,8 @@ object DataciteToOAFTransformation {
lazy val json = parse(input)
val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
val resourceTypeGeneral =
(json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)
val doi = (json \ "attributes" \ "doi").extract[String]
@ -265,8 +302,12 @@ object DataciteToOAFTransformation {
if (result == null)
return List()
val doi_q = OafMapperUtils.qualifier("doi", "doi", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES)
val doi_q = OafMapperUtils.qualifier(
"doi",
"doi",
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES
)
val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
result.setPid(List(pid).asJava)
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
@ -275,48 +316,70 @@ object DataciteToOAFTransformation {
val d = new Date(dateOfCollection * 1000)
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
result.setDateofcollection(ISO8601FORMAT.format(d))
result.setDateoftransformation(ISO8601FORMAT.format(d))
result.setDataInfo(dataInfo)
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
val authors = creators.zipWithIndex.map { case (c, idx) =>
val a = new Author
a.setFullname(c.name.orNull)
a.setName(c.givenName.orNull)
a.setSurname(c.familyName.orNull)
if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
a.setPid(c.nameIdentifiers.get.map(ni => {
val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(ModelConstants.DNET_PID_TYPES, ni.nameIdentifierScheme.get.toLowerCase()) else null
a.setPid(
c.nameIdentifiers.get
.map(ni => {
val q =
if (ni.nameIdentifierScheme.isDefined)
vocabularies.getTermAsQualifier(
ModelConstants.DNET_PID_TYPES,
ni.nameIdentifierScheme.get.toLowerCase()
)
else null
if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
}
else
} else
null
}
})
.asJava
)
.asJava)
}
if (c.affiliation.isDefined)
a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava)
a.setAffiliation(
c.affiliation.get
.filter(af => af.nonEmpty)
.map(af => OafMapperUtils.field(af, dataInfo))
.asJava
)
a.setRank(idx + 1)
a
}
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => {
result.setTitle(
titles
.filter(t => t.title.nonEmpty)
.map(t => {
if (t.titleType.isEmpty) {
OafMapperUtils.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
OafMapperUtils
.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
} else {
OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, ModelConstants.DNET_DATACITE_TITLE, ModelConstants.DNET_DATACITE_TITLE, null)
OafMapperUtils.structuredProperty(
t.title.get,
t.titleType.get,
t.titleType.get,
ModelConstants.DNET_DATACITE_TITLE,
ModelConstants.DNET_DATACITE_TITLE,
null
)
}
}).asJava)
})
.asJava
)
if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
return List()
@ -337,46 +400,81 @@ object DataciteToOAFTransformation {
if (a_date.isDefined) {
if (doi.startsWith("10.14457"))
result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null))
result.setEmbargoenddate(
OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null)
)
else
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
}
if (i_date.isDefined && i_date.get.isDefined) {
if (doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null))
}
else {
result.setDateofacceptance(
OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)
)
result
.getInstance()
.get(0)
.setDateofacceptance(
OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)
)
} else {
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
}
}
else if (publication_year != null) {
} else if (publication_year != null) {
if (doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null))
result.setDateofacceptance(
OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)
)
result
.getInstance()
.get(0)
.setDateofacceptance(
OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)
)
} else {
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
result
.getInstance()
.get(0)
.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
}
}
result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
result.setRelevantdate(
dates
.filter(d => d.date.isDefined && d.dateType.isDefined)
.map(d => (extract_date(d.date.get), d.dateType.get))
.filter(d => d._1.isDefined)
.map(d => (d._1.get, vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())))
.map(d =>
(
d._1.get,
vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())
)
)
.filter(d => d._2 != null)
.map(d => generateOAFDate(d._1, d._2)).asJava)
.map(d => generateOAFDate(d._1, d._2))
.asJava
)
val subjects = (json \\ "subjects").extract[List[SubjectType]]
result.setSubject(subjects.filter(s => s.subject.nonEmpty)
result.setSubject(
subjects
.filter(s => s.subject.nonEmpty)
.map(s =>
OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
).asJava)
OafMapperUtils.structuredProperty(
s.subject.get,
SUBJ_CLASS,
SUBJ_CLASS,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
null
)
)
.asJava
)
result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
@ -384,22 +482,22 @@ object DataciteToOAFTransformation {
result.setDescription(
descriptions
.filter(d => d.description.isDefined).
map(d =>
OafMapperUtils.field(d.description.get, null)
).filter(s => s != null).asJava)
.filter(d => d.description.isDefined)
.map(d => OafMapperUtils.field(d.description.get, null))
.filter(s => s != null)
.asJava
)
val publisher = (json \\ "publisher").extractOrElse[String](null)
if (publisher != null)
result.setPublisher(OafMapperUtils.field(publisher, null))
val language: String = (json \\ "language").extractOrElse[String](null)
if (language != null)
result.setLanguage(vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language))
result.setLanguage(
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language)
)
val instance = result.getInstance().get(0)
@ -410,9 +508,12 @@ object DataciteToOAFTransformation {
JField("rightsUri", JString(rightsUri)) <- rightsList
} yield rightsUri
val aRights: Option[AccessRight] = accessRights.map(r => {
val aRights: Option[AccessRight] = accessRights
.map(r => {
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
}).find(q => q != null).map(q => {
})
.find(q => q != null)
.map(q => {
val a = new AccessRight
a.setClassid(q.getClassid)
a.setClassname(q.getClassname)
@ -421,18 +522,34 @@ object DataciteToOAFTransformation {
a
})
val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
val access_rights_qualifier =
if (aRights.isDefined) aRights.get
else
OafMapperUtils.accessRight(
ModelConstants.UNKNOWN,
ModelConstants.NOT_AVAILABLE,
ModelConstants.DNET_ACCESS_MODES,
ModelConstants.DNET_ACCESS_MODES
)
if (client.isDefined) {
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID), ModelConstants.UNKNOWN_REPOSITORY.getValue))
instance.setHostedby(
OafMapperUtils.keyValue(
generateDSId(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID),
ModelConstants.UNKNOWN_REPOSITORY.getValue
)
)
instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
instance.setAccessright(access_rights_qualifier)
instance.setPid(result.getPid)
val license = accessRights
.find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"))
.find(r =>
r.startsWith("http") && r.matches(
".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"
)
)
if (license.isDefined)
instance.setLicense(OafMapperUtils.field(license.get, null))
}
@ -443,7 +560,8 @@ object DataciteToOAFTransformation {
} yield awardUri
result.setId(IdentifierFactory.createIdentifier(result))
var relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
var relations: List[Relation] =
awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
fix_figshare(result)
@ -458,20 +576,27 @@ object DataciteToOAFTransformation {
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
} yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
relations = relations ::: generateRelations(rels, result.getId, if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null)
relations = relations ::: generateRelations(
rels,
result.getId,
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null
)
}
if (relations != null && relations.nonEmpty) {
List(result) ::: relations
}
else
} else
List(result)
}
private def generateRelations(rels: List[RelatedIdentifierType], id: String, date: String): List[Relation] = {
private def generateRelations(
rels: List[RelatedIdentifierType],
id: String,
date: String
): List[Relation] = {
rels
.filter(r =>
subRelTypeMapping.contains(r.relationType) && (
r.relatedIdentifierType.equalsIgnoreCase("doi") ||
subRelTypeMapping
.contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
)
@ -490,19 +615,19 @@ object DataciteToOAFTransformation {
rel.setProperties(List(dateProps).asJava)
rel.setSource(id)
rel.setTarget(DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType))
rel.setTarget(
DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
)
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
rel.getCollectedfrom.asScala.map(c => c.getValue).toList
rel
})
}
/** Builds a datasource identifier of the form `10|<prefix>::<md5>`.
  *
  * The prefix is the part of `input` before the first "::"; the hashed part
  * is `DHPUtils.md5` applied to whatever follows that separator.
  *
  * @param input the original identifier, expected to contain a "::" separator
  * @return the identifier string prefixed with "10|"
  */
def generateDSId(input: String): String = {
  val nsPrefix = StringUtils.substringBefore(input, "::")
  val hashedLocalId = DHPUtils.md5(StringUtils.substringAfter(input, "::"))
  s"10|$nsPrefix::$hashedLocalId"
}
}

View File

@ -12,10 +12,10 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
class GenerateDataciteDatasetSpark(propertyPath: String, args: Array[String], log: Logger)
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log:Logger) extends AbstractScalaApplication(propertyPath, args, log:Logger) {
/**
* Here all the spark applications run this method
/** Here all the spark applications run this method
* where the whole logic of the spark node is defined
*/
override def run(): Unit = {
@ -46,27 +46,34 @@ class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log
reportTotalSize(targetPath, outputBasePath)
}
/**
* For working with MDStore we need to store in a file on hdfs the size of
/** For working with MDStore we need to store in a file on hdfs the size of
* the current dataset
* @param targetPath
* @param outputBasePath
*/
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
val total_items = spark.read.text(targetPath).count()
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$total_items", outputBasePath + MDSTORE_SIZE_PATH)
writeHdfsFile(
spark.sparkContext.hadoopConfiguration,
s"$total_items",
outputBasePath + MDSTORE_SIZE_PATH
)
}
/**
* Generate the transformed and cleaned OAF Dataset from the native one
/** Generate the transformed and cleaned OAF Dataset from the native one
*
* @param sourcePath sourcePath of the native Dataset in format JSON/Datacite
* @param exportLinks If true it generates unresolved links
* @param vocabularies vocabularies for cleaning
* @param targetPath the targetPath of the result Dataset
*/
def generateDataciteDataset(sourcePath: String, exportLinks: Boolean, vocabularies: VocabularyGroup, targetPath: String, spark:SparkSession):Unit = {
def generateDataciteDataset(
sourcePath: String,
exportLinks: Boolean,
vocabularies: VocabularyGroup,
targetPath: String,
spark: SparkSession
): Unit = {
require(spark != null)
import spark.implicits._
@ -74,21 +81,30 @@ class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
CollectionUtils.saveDataset(
spark.read.load(sourcePath).as[DataciteType]
spark.read
.load(sourcePath)
.as[DataciteType]
.filter(d => d.isActive)
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks))
.flatMap(d =>
DataciteToOAFTransformation
.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks)
)
.filter(d => d != null),
targetPath)
targetPath
)
}
}
object GenerateDataciteDatasetSpark {
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
def main(args: Array[String]): Unit = {
new GenerateDataciteDatasetSpark("/eu/dnetlib/dhp/datacite/generate_dataset_params.json", args, log).initialize().run()
new GenerateDataciteDatasetSpark(
"/eu/dnetlib/dhp/datacite/generate_dataset_params.json",
args,
log
).initialize().run()
}
}

View File

@ -22,7 +22,6 @@ object ImportDatacite {
val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass)
def convertAPIStringToDataciteItem(input: String): DataciteType = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
@ -32,14 +31,26 @@ object ImportDatacite {
val timestamp_string = (json \ "attributes" \ "updated").extract[String]
val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME)
DataciteType(doi = doi, timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000, isActive = isActive, json = input)
DataciteType(
doi = doi,
timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000,
isActive = isActive,
json = input
)
}
def main(args: Array[String]): Unit = {
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json")).mkString)
val parser = new ArgumentApplicationParser(
Source
.fromInputStream(
getClass.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json"
)
)
.mkString
)
parser.parseArgument(args)
val master = parser.get("master")
@ -60,7 +71,8 @@ object ImportDatacite {
val spkipImport = parser.get("skipImport")
log.info(s"skipImport is $spkipImport")
val spark: SparkSession = SparkSession.builder()
val spark: SparkSession = SparkSession
.builder()
.appName(ImportDatacite.getClass.getSimpleName)
.master(master)
.getOrCreate()
@ -78,8 +90,8 @@ object ImportDatacite {
import spark.implicits._
val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] = new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] =
new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
override def zero: DataciteType = null
@ -110,13 +122,16 @@ object ImportDatacite {
println(s"last Timestamp is $ts")
val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
val cnt =
if ("true".equalsIgnoreCase(spkipImport)) 1
else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
println(s"Imported from Datacite API $cnt documents")
if (cnt > 0) {
val inputRdd: RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[Int], classOf[Text])
val inputRdd: RDD[DataciteType] = sc
.sequenceFile(targetPath, classOf[Int], classOf[Text])
.map(s => s._2.toString)
.map(s => convertAPIStringToDataciteItem(s))
spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset")
@ -129,7 +144,9 @@ object ImportDatacite {
.agg(dataciteAggregator.toColumn)
.map(s => s._2)
.repartition(4000)
.write.mode(SaveMode.Overwrite).save(s"${dataciteDump}_updated")
.write
.mode(SaveMode.Overwrite)
.save(s"${dataciteDump}_updated")
val fs = FileSystem.get(sc.hadoopConfiguration)
fs.delete(new Path(s"$dataciteDump"), true)
@ -137,14 +154,24 @@ object ImportDatacite {
}
}
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs: Int): Long = {
private def writeSequenceFile(
hdfsTargetPath: Path,
timestamp: Long,
conf: Configuration,
bs: Int
): Long = {
var from: Long = timestamp * 1000
val delta: Long = 100000000L
var client: DataciteAPIImporter = null
val now: Long = System.currentTimeMillis()
var i = 0
try {
val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text]))
val writer = SequenceFile.createWriter(
conf,
SequenceFile.Writer.file(hdfsTargetPath),
SequenceFile.Writer.keyClass(classOf[IntWritable]),
SequenceFile.Writer.valueClass(classOf[Text])
)
try {
var start: Long = System.currentTimeMillis
while (from < now) {
@ -153,16 +180,16 @@ object ImportDatacite {
val key: IntWritable = new IntWritable(i)
val value: Text = new Text
while (client.hasNext) {
key.set({
key.set {
i += 1;
i - 1
})
}
value.set(client.next())
writer.append(key, value)
writer.hflush()
if (i % 1000 == 0) {
end = System.currentTimeMillis
val time = (end - start) / 1000.0F
val time = (end - start) / 1000.0f
println(s"Imported $i in $time seconds")
start = System.currentTimeMillis
}
@ -174,8 +201,7 @@ object ImportDatacite {
case e: Throwable =>
println("Error", e)
} finally if (writer != null) writer.close()
}
catch {
} catch {
case e: Throwable =>
log.error("Error", e)
}

View File

@ -17,7 +17,13 @@ object SparkDownloadUpdateDatacite {
def main(args: Array[String]): Unit = {
val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")).mkString)
val parser = new ArgumentApplicationParser(
Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")
)
.mkString
)
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
@ -26,8 +32,9 @@ object SparkDownloadUpdateDatacite {
val hdfsuri = parser.get("namenode")
log.info(s"namenode is $hdfsuri")
val spark: SparkSession = SparkSession.builder().config(conf)
val spark: SparkSession = SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(master)
.getOrCreate()
@ -37,13 +44,18 @@ object SparkDownloadUpdateDatacite {
import spark.implicits._
val maxDate: String = spark.read.load(workingPath).as[Oaf].filter(s => s.isInstanceOf[Result]).map(r => r.asInstanceOf[Result].getDateofcollection).select(max("value")).first().getString(0)
val maxDate: String = spark.read
.load(workingPath)
.as[Oaf]
.filter(s => s.isInstanceOf[Result])
.map(r => r.asInstanceOf[Result].getDateofcollection)
.select(max("value"))
.first()
.getString(0)
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
val string_to_date = ISO8601FORMAT.parse(maxDate)
val ts = string_to_date.getTime
}
}

View File

@ -12,13 +12,36 @@ object BioDBToOAF {
/** Pairs a numeric `id` with its `links` payload, carried as a raw string.
  * NOTE(review): the format of `links` is not visible here; presumably serialized JSON, to be confirmed.
  */
case class EBILinkItem(id: Long, links: String)
case class EBILinks(relType: String, date: String, title: String, pmid: String, targetPid: String, targetPidType: String, targetUrl: String) {}
case class EBILinks(
relType: String,
date: String,
title: String,
pmid: String,
targetPid: String,
targetPidType: String,
targetUrl: String
) {}
/** A date string together with a `date_info` string that accompanies it.
  * NOTE(review): the meaning of `date_info` is not shown in this chunk; confirm against the caller.
  */
case class UniprotDate(date: String, date_info: String)
case class ScholixResolved(pid: String, pidType: String, typology: String, tilte: List[String], datasource: List[String], date: List[String], authors: List[String]) {}
case class ScholixResolved(
pid: String,
pidType: String,
typology: String,
tilte: List[String],
datasource: List[String],
date: List[String],
authors: List[String]
) {}
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(
false,
null,
false,
false,
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
"0.9"
)
val SUBJ_CLASS = "Keywords"
val DATE_RELATION_KEY = "RelationDate"
@ -35,16 +58,35 @@ object BioDBToOAF {
"geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
)
val collectedFromMap: Map[String, KeyValue] = {
val PDBCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2", "Protein Data Bank")
val enaCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::c2a591f440598b63d854556beaf01591", "European Nucleotide Archive")
val ncbiCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6", "NCBI Nucleotide")
val UNIPROTCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::296e1abaf1302897a6838d3588cd0310", "UniProtKB/Swiss-Prot")
val ElsevierCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature")
val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::83e60e09c222f206c725385f53d7e567c", "EMBL-EBIs Protein Data Bank in Europe (PDBe)")
val pubmedCollectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
val PDBCollectedFrom: KeyValue = OafMapperUtils.keyValue(
"10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2",
"Protein Data Bank"
)
val enaCollectedFrom: KeyValue = OafMapperUtils.keyValue(
"10|re3data_____::c2a591f440598b63d854556beaf01591",
"European Nucleotide Archive"
)
val ncbiCollectedFrom: KeyValue = OafMapperUtils.keyValue(
"10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6",
"NCBI Nucleotide"
)
val UNIPROTCollectedFrom: KeyValue = OafMapperUtils.keyValue(
"10|re3data_____::296e1abaf1302897a6838d3588cd0310",
"UniProtKB/Swiss-Prot"
)
val ElsevierCollectedFrom: KeyValue =
OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue(
"10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e",
"Springer Nature"
)
val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue(
"10|opendoar____::83e60e09c222f206c725385f53d7e567c",
"EMBL-EBIs Protein Data Bank in Europe (PDBe)"
)
val pubmedCollectedFrom: KeyValue =
OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
UNIPROTCollectedFrom.setDataInfo(DATA_INFO)
PDBCollectedFrom.setDataInfo(DATA_INFO)
@ -80,18 +122,32 @@ object BioDBToOAF {
val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String])
createRelation(target_pid, target_pid_type, generate_unresolved_id(source_pid, source_pid_type), collectedFromMap("elsevier"), "relationship", relation_semantic, date)
createRelation(
target_pid,
target_pid_type,
generate_unresolved_id(source_pid, source_pid_type),
collectedFromMap("elsevier"),
"relationship",
relation_semantic,
date
)
}
def scholixResolvedToOAF(input: ScholixResolved): Oaf = {
val d = new Dataset
d.setPid(
List(
OafMapperUtils.structuredProperty(input.pid.toLowerCase, input.pidType.toLowerCase, input.pidType.toLowerCase, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
OafMapperUtils.structuredProperty(
input.pid.toLowerCase,
input.pidType.toLowerCase,
input.pidType.toLowerCase,
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES,
DATA_INFO
)
).asJava
)
@ -101,7 +157,15 @@ object BioDBToOAF {
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.pid.toLowerCase}", true))
if (input.tilte != null && input.tilte.nonEmpty)
d.setTitle(List(OafMapperUtils.structuredProperty(input.tilte.head, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
d.setTitle(
List(
OafMapperUtils.structuredProperty(
input.tilte.head,
ModelConstants.MAIN_TITLE_QUALIFIER,
DATA_INFO
)
).asJava
)
d.setOriginalId(List(input.pid).asJava)
val i = new Instance
@ -113,9 +177,23 @@ object BioDBToOAF {
}
if (input.pidType.equalsIgnoreCase("clinicaltrials.gov"))
i.setInstancetype(OafMapperUtils.qualifier("0037", "Clinical Trial", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
i.setInstancetype(
OafMapperUtils.qualifier(
"0037",
"Clinical Trial",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
else
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
i.setInstancetype(
OafMapperUtils.qualifier(
"0046",
"Bioentity",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
if (input.datasource == null || input.datasource.isEmpty)
return null
@ -141,7 +219,6 @@ object BioDBToOAF {
d
}
def uniprotToOAF(input: String): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
@ -151,7 +228,14 @@ object BioDBToOAF {
d.setPid(
List(
OafMapperUtils.structuredProperty(pid, "uniprot", "uniprot", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
OafMapperUtils.structuredProperty(
pid,
"uniprot",
"uniprot",
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES,
DATA_INFO
)
).asJava
)
@ -162,14 +246,25 @@ object BioDBToOAF {
val title: String = (json \ "title").extractOrElse[String](null)
if (title != null)
d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
d.setTitle(
List(
OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)
).asJava
)
d.setOriginalId(List(pid).asJava)
val i = new Instance
i.setPid(d.getPid)
i.setUrl(List(s"https://www.uniprot.org/uniprot/$pid").asJava)
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
i.setInstancetype(
OafMapperUtils.qualifier(
"0046",
"Bioentity",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
i.setCollectedfrom(collectedFromMap("uniprot"))
d.setInstance(List(i).asJava)
@ -182,12 +277,21 @@ object BioDBToOAF {
val subjects: List[String] = (json \\ "subjects").extractOrElse[List[String]](null)
if (subjects != null) {
d.setSubject(
subjects.map(s =>
OafMapperUtils.structuredProperty(s, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
).asJava)
subjects
.map(s =>
OafMapperUtils.structuredProperty(
s,
SUBJ_CLASS,
SUBJ_CLASS,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
null
)
)
.asJava
)
}
var i_date: Option[UniprotDate] = None
@ -197,14 +301,23 @@ object BioDBToOAF {
i.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
}
val relevant_dates: List[StructuredProperty] = dates.filter(d => !d.date_info.contains("entry version"))
.map(date => OafMapperUtils.structuredProperty(date.date, ModelConstants.UNKNOWN, ModelConstants.UNKNOWN, ModelConstants.DNET_DATACITE_DATE, ModelConstants.DNET_DATACITE_DATE, DATA_INFO))
val relevant_dates: List[StructuredProperty] = dates
.filter(d => !d.date_info.contains("entry version"))
.map(date =>
OafMapperUtils.structuredProperty(
date.date,
ModelConstants.UNKNOWN,
ModelConstants.UNKNOWN,
ModelConstants.DNET_DATACITE_DATE,
ModelConstants.DNET_DATACITE_DATE,
DATA_INFO
)
)
if (relevant_dates != null && relevant_dates.nonEmpty)
d.setRelevantdate(relevant_dates.asJava)
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
}
val references_pmid: List[String] = for {
JObject(reference) <- json \ "references"
JField("PubMed", JString(pid)) <- reference
@ -215,27 +328,46 @@ object BioDBToOAF {
JField(" DOI", JString(pid)) <- reference
} yield pid
if (references_pmid != null && references_pmid.nonEmpty) {
val rel = createRelation(references_pmid.head, "pmid", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
val rel = createRelation(
references_pmid.head,
"pmid",
d.getId,
collectedFromMap("uniprot"),
ModelConstants.RELATIONSHIP,
ModelConstants.IS_RELATED_TO,
if (i_date.isDefined) i_date.get.date else null
)
rel.getCollectedfrom
List(d, rel)
}
else if (references_doi != null && references_doi.nonEmpty) {
val rel = createRelation(references_doi.head, "doi", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
} else if (references_doi != null && references_doi.nonEmpty) {
val rel = createRelation(
references_doi.head,
"doi",
d.getId,
collectedFromMap("uniprot"),
ModelConstants.RELATIONSHIP,
ModelConstants.IS_RELATED_TO,
if (i_date.isDefined) i_date.get.date else null
)
List(d, rel)
}
else
} else
List(d)
}
/** Builds an identifier of the form `unresolved::<pid>::<pidType>`.
  *
  * @param pid the persistent identifier value
  * @param pidType the type of the persistent identifier
  * @return the three parts joined with `::`, prefixed by the literal `unresolved`
  */
def generate_unresolved_id(pid: String, pidType: String): String =
  s"unresolved::${pid}::${pidType}"
def createRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, subRelType: String, relClass: String, date: String): Relation = {
def createRelation(
pid: String,
pidType: String,
sourceId: String,
collectedFrom: KeyValue,
subRelType: String,
relClass: String,
date: String
): Relation = {
val rel = new Relation
rel.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
@ -248,7 +380,6 @@ object BioDBToOAF {
rel.setSource(sourceId)
rel.setTarget(s"unresolved::$pid::$pidType")
val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
rel.setProperties(List(dateProps).asJava)
@ -259,12 +390,24 @@ object BioDBToOAF {
}
/** Creates a relation by delegating to `createRelation` with the sub-relation type fixed to
  * `ModelConstants.SUPPLEMENT` and the relation class fixed to `ModelConstants.IS_SUPPLEMENT_TO`.
  *
  * @param pid the identifier forwarded as `pid`
  * @param pidType the identifier type forwarded as `pidType`
  * @param sourceId the source identifier forwarded as `sourceId`
  * @param collectedFrom the provenance key/value forwarded as `collectedFrom`
  * @param date the date forwarded as `date`
  * @return the relation produced by `createRelation`
  */
def createSupplementaryRelation(
  pid: String,
  pidType: String,
  sourceId: String,
  collectedFrom: KeyValue,
  date: String
): Relation =
  createRelation(
    pid = pid,
    pidType = pidType,
    sourceId = sourceId,
    collectedFrom = collectedFrom,
    subRelType = ModelConstants.SUPPLEMENT,
    relClass = ModelConstants.IS_SUPPLEMENT_TO,
    date = date
  )
def pdbTOOaf(input: String): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
@ -277,7 +420,14 @@ object BioDBToOAF {
d.setPid(
List(
OafMapperUtils.structuredProperty(pdb, "pdb", "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
OafMapperUtils.structuredProperty(
pdb,
"pdb",
"Protein Data Bank Identifier",
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES,
DATA_INFO
)
).asJava
)
@ -290,13 +440,16 @@ object BioDBToOAF {
if (title == null)
return List()
d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
d.setTitle(
List(
OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)
).asJava
)
val authors: List[String] = (json \ "authors").extractOrElse[List[String]](null)
if (authors != null) {
val convertedAuthors = authors.zipWithIndex.map { a =>
val res = new Author
res.setFullname(a._1)
res.setRank(a._2 + 1)
@ -310,7 +463,14 @@ object BioDBToOAF {
i.setPid(d.getPid)
i.setUrl(List(s"https://www.rcsb.org/structure/$pdb").asJava)
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
i.setInstancetype(
OafMapperUtils.qualifier(
"0046",
"Bioentity",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
i.setCollectedfrom(collectedFromMap("pdb"))
d.setInstance(List(i).asJava)
@ -323,7 +483,6 @@ object BioDBToOAF {
List(d)
}
def extractEBILinksFromDump(input: String): EBILinkItem = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
@ -333,14 +492,14 @@ object BioDBToOAF {
EBILinkItem(pmid.toLong, compact(render(links)))
}
/** Tells whether the target pid type of an EBI link is one of `ena`, `pdb` or `uniprot`.
  * The comparison ignores case.
  *
  * @param input the EBI link to check
  * @return true when `input.targetPidType` matches one of the accepted types
  */
def EBITargetLinksFilter(input: EBILinks): Boolean = {
  val acceptedTypes = Seq("ena", "pdb", "uniprot")
  acceptedTypes.exists(accepted => input.targetPidType.equalsIgnoreCase(accepted))
}
def parse_ebi_links(input: String): List[EBILinks] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
@ -357,25 +516,46 @@ object BioDBToOAF {
JField("IDURL", JString(idUrl)) <- identifier
JField("ID", JString(id)) <- identifier
} yield EBILinks(relation, GraphCleaningFunctions.cleanDate(publicationDate), title, pmid, id, idScheme, idUrl)
} yield EBILinks(
relation,
GraphCleaningFunctions.cleanDate(publicationDate),
title,
pmid,
id,
idScheme,
idUrl
)
}
def convertEBILinksToOaf(input: EBILinks): List[Oaf] = {
val d = new Dataset
d.setCollectedfrom(List(collectedFromMap("ebi")).asJava)
d.setDataInfo(DATA_INFO)
d.setTitle(List(OafMapperUtils.structuredProperty(input.title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
d.setTitle(
List(
OafMapperUtils.structuredProperty(
input.title,
ModelConstants.MAIN_TITLE_QUALIFIER,
DATA_INFO
)
).asJava
)
val nsPrefix = input.targetPidType.toLowerCase.padTo(12, '_')
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.targetPid.toLowerCase}", true))
d.setOriginalId(List(input.targetPid.toLowerCase).asJava)
d.setPid(
List(
OafMapperUtils.structuredProperty(input.targetPid.toLowerCase, input.targetPidType.toLowerCase, "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
OafMapperUtils.structuredProperty(
input.targetPid.toLowerCase,
input.targetPidType.toLowerCase,
"Protein Data Bank Identifier",
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES,
DATA_INFO
)
).asJava
)
@ -383,13 +563,35 @@ object BioDBToOAF {
i.setPid(d.getPid)
i.setUrl(List(input.targetUrl).asJava)
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
i.setInstancetype(
OafMapperUtils.qualifier(
"0046",
"Bioentity",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
i.setCollectedfrom(collectedFromMap("ebi"))
d.setInstance(List(i).asJava)
i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO))
d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO))
i.setDateofacceptance(
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO)
)
d.setDateofacceptance(
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO)
)
List(d, createRelation(input.pmid, "pmid", d.getId, collectedFromMap("ebi"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, GraphCleaningFunctions.cleanDate(input.date)))
List(
d,
createRelation(
input.pmid,
"pmid",
d.getId,
collectedFromMap("ebi"),
ModelConstants.RELATIONSHIP,
ModelConstants.IS_RELATED_TO,
GraphCleaningFunctions.cleanDate(input.date)
)
)
}
}

View File

@ -14,7 +14,11 @@ object SparkTransformBioDatabaseToOAF {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
val log: Logger = LoggerFactory.getLogger(getClass)
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json")
)
)
parser.parseArgument(args)
val database: String = parser.get("database")
log.info("database: {}", database)
@ -29,20 +33,33 @@ object SparkTransformBioDatabaseToOAF {
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
val sc = spark.sparkContext
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
import spark.implicits._
database.toUpperCase() match {
case "UNIPROT" =>
CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))), targetPath)
CollectionUtils.saveDataset(
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))),
targetPath
)
case "PDB" =>
CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))), targetPath)
CollectionUtils.saveDataset(
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))),
targetPath
)
case "SCHOLIX" =>
CollectionUtils.saveDataset(spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)), targetPath)
CollectionUtils.saveDataset(
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)),
targetPath
)
case "CROSSREF_LINKS" =>
CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))), targetPath)
CollectionUtils.saveDataset(
spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))),
targetPath
)
}
}

View File

@ -24,11 +24,12 @@ import scala.xml.pull.XMLEventReader
object SparkCreateBaselineDataFrame {
def requestBaseLineUpdatePage(maxFile: String): List[(String, String)] = {
val data = requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/")
val result = data.lines.filter(l => l.startsWith("<a href=")).map { l =>
val result = data.lines
.filter(l => l.startsWith("<a href="))
.map { l =>
val end = l.lastIndexOf("\">")
val start = l.indexOf("<a href=\"")
@ -36,19 +37,24 @@ object SparkCreateBaselineDataFrame {
l.substring(start + 9, end - start)
else
""
}.filter(s => s.endsWith(".gz")).filter(s => s > maxFile).map(s => (s, s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s")).toList
}
.filter(s => s.endsWith(".gz"))
.filter(s => s > maxFile)
.map(s => (s, s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s"))
.toList
result
}
def downloadBaselinePart(url: String): InputStream = {
val r = new HttpGet(url)
val timeout = 60; // seconds
val config = RequestConfig.custom()
val config = RequestConfig
.custom()
.setConnectTimeout(timeout * 1000)
.setConnectionRequestTimeout(timeout * 1000)
.setSocketTimeout(timeout * 1000).build()
.setSocketTimeout(timeout * 1000)
.build()
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
val response = client.execute(r)
println(s"get response with status${response.getStatusLine.getStatusCode}")
@ -59,10 +65,12 @@ object SparkCreateBaselineDataFrame {
def requestPage(url: String): String = {
val r = new HttpGet(url)
val timeout = 60; // seconds
val config = RequestConfig.custom()
val config = RequestConfig
.custom()
.setConnectTimeout(timeout * 1000)
.setConnectionRequestTimeout(timeout * 1000)
.setSocketTimeout(timeout * 1000).build()
.setSocketTimeout(timeout * 1000)
.build()
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
try {
var tries = 4
@ -73,8 +81,7 @@ object SparkCreateBaselineDataFrame {
println(s"get response with status${response.getStatusLine.getStatusCode}")
if (response.getStatusLine.getStatusCode > 400) {
tries -= 1
}
else
} else
return IOUtils.toString(response.getEntity.getContent)
} catch {
case e: Throwable =>
@ -90,10 +97,8 @@ object SparkCreateBaselineDataFrame {
}
}
def downloadBaseLineUpdate(baselinePath: String, hdfsServerUri: String): Unit = {
val conf = new Configuration
conf.set("fs.defaultFS", hdfsServerUri)
val fs = FileSystem.get(conf)
@ -122,8 +127,8 @@ object SparkCreateBaselineDataFrame {
}
val pmArticleAggregator: Aggregator[(String, PMArticle), PMArticle, PMArticle] = new Aggregator[(String, PMArticle), PMArticle, PMArticle] with Serializable {
val pmArticleAggregator: Aggregator[(String, PMArticle), PMArticle, PMArticle] =
new Aggregator[(String, PMArticle), PMArticle, PMArticle] with Serializable {
override def zero: PMArticle = new PMArticle
override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle = {
@ -142,11 +147,16 @@ object SparkCreateBaselineDataFrame {
override def outputEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
}
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
val log: Logger = LoggerFactory.getLogger(getClass)
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEBILinksToOaf.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
SparkEBILinksToOaf.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
)
)
)
parser.parseArgument(args)
val isLookupUrl: String = parser.get("isLookupUrl")
log.info("isLookupUrl: {}", isLookupUrl)
@ -162,7 +172,6 @@ object SparkCreateBaselineDataFrame {
val skipUpdate = parser.get("skipUpdate")
log.info("skipUpdate: {}", skipUpdate)
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
val spark: SparkSession =
@ -170,7 +179,8 @@ object SparkCreateBaselineDataFrame {
.builder()
.config(conf)
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
val sc = spark.sparkContext
import spark.implicits._
@ -183,20 +193,30 @@ object SparkCreateBaselineDataFrame {
if (!"true".equalsIgnoreCase(skipUpdate)) {
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
val ds: Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i => {
val ds: Dataset[PMArticle] = spark.createDataset(
k.filter(i => i._1.endsWith(".gz"))
.flatMap(i => {
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
new PMParser(xml)
}))
ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder)).groupByKey(_._1)
})
)
ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder))
.groupByKey(_._1)
.agg(pmArticleAggregator.toColumn)
.map(p => p._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
.map(p => p._2)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/baseline_dataset")
}
val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle]
CollectionUtils.saveDataset(exported_dataset
.map(a => PubMedToOaf.convert(a, vocabularies)).as[Oaf]
CollectionUtils.saveDataset(
exported_dataset
.map(a => PubMedToOaf.convert(a, vocabularies))
.as[Oaf]
.filter(p => p != null),
targetPath)
targetPath
)
}
}

View File

@ -25,10 +25,12 @@ object SparkDownloadEBILinks {
def requestPage(url: String): String = {
val r = new HttpGet(url)
val timeout = 60; // seconds
val config = RequestConfig.custom()
val config = RequestConfig
.custom()
.setConnectTimeout(timeout * 1000)
.setConnectionRequestTimeout(timeout * 1000)
.setSocketTimeout(timeout * 1000).build()
.setSocketTimeout(timeout * 1000)
.build()
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
try {
var tries = 4
@ -39,8 +41,7 @@ object SparkDownloadEBILinks {
println(s"get response with status${response.getStatusLine.getStatusCode}")
if (response.getStatusLine.getStatusCode > 400) {
tries -= 1
}
else
} else
return IOUtils.toString(response.getEntity.getContent)
} catch {
case e: Throwable =>
@ -66,14 +67,19 @@ object SparkDownloadEBILinks {
val log: Logger = LoggerFactory.getLogger(getClass)
val MAX_ITEM_PER_PARTITION = 20000
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json")
)
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
import spark.implicits._
@ -87,22 +93,40 @@ object SparkDownloadEBILinks {
log.info(s"workingPath -> $workingPath")
log.info("Getting max pubmedId where the links have already requested")
val links: Dataset[EBILinkItem] = spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
val links: Dataset[EBILinkItem] =
spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
val lastPMIDRequested = links.map(l => l.id).select(max("value")).first.getLong(0)
log.info("Retrieving PMID to request links")
val pubmed = spark.read.load(s"$sourcePath/baseline_dataset").as[PMArticle]
pubmed.map(p => p.getPmid.toLong).where(s"value > $lastPMIDRequested").write.mode(SaveMode.Overwrite).save(s"$workingPath/id_to_request")
pubmed
.map(p => p.getPmid.toLong)
.where(s"value > $lastPMIDRequested")
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/id_to_request")
val pmidToReq: Dataset[Long] = spark.read.load(s"$workingPath/id_to_request").as[Long]
val total = pmidToReq.count()
spark.createDataset(pmidToReq.rdd.repartition((total / MAX_ITEM_PER_PARTITION).toInt).map(pmid => createEBILinks(pmid)).filter(l => l != null)).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_update")
spark
.createDataset(
pmidToReq.rdd
.repartition((total / MAX_ITEM_PER_PARTITION).toInt)
.map(pmid => createEBILinks(pmid))
.filter(l => l != null)
)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/links_update")
val updates: Dataset[EBILinkItem] = spark.read.load(s"$workingPath/links_update").as[EBILinkItem]
val updates: Dataset[EBILinkItem] =
spark.read.load(s"$workingPath/links_update").as[EBILinkItem]
links.union(updates).groupByKey(_.id)
links
.union(updates)
.groupByKey(_.id)
.reduceGroups { (x, y) =>
if (x == null || x.links == null)
y
@ -112,6 +136,10 @@ object SparkDownloadEBILinks {
x
else
y
}.map(_._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_final")
}
.map(_._2)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/links_final")
}
}

View File

@ -15,15 +15,19 @@ object SparkEBILinksToOaf {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json")
)
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
import spark.implicits._
val sourcePath = parser.get("sourcePath")
@ -32,11 +36,17 @@ object SparkEBILinksToOaf {
log.info(s"targetPath -> $targetPath")
implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
val ebLinks: Dataset[EBILinkItem] = spark.read.load(sourcePath).as[EBILinkItem].filter(l => l.links != null && l.links.startsWith("{"))
val ebLinks: Dataset[EBILinkItem] = spark.read
.load(sourcePath)
.as[EBILinkItem]
.filter(l => l.links != null && l.links.startsWith("{"))
CollectionUtils.saveDataset(ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
CollectionUtils.saveDataset(
ebLinks
.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)),
targetPath)
targetPath
)
}
}

View File

@ -3,10 +3,7 @@ package eu.dnetlib.dhp.sx.bio.pubmed
import scala.xml.MetaData
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
/**
*
* @param xml
/** @param xml
*/
class PMParser(xml: XMLEventReader) extends Iterator[PMArticle] {
@ -29,10 +26,8 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
s.head.text
else
null
} else null
}
else null
}
def validate_Date(year: String, month: String, day: String): String = {
try {
@ -45,7 +40,6 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
def generateNextArticle(): PMArticle = {
var currentSubject: PMSubject = null
var currentAuthor: PMAuthor = null
var currentJournal: PMJournal = null
@ -56,11 +50,6 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
var currentDay = "01"
var currentArticleType: String = null
while (xml.hasNext) {
xml.next match {
case EvElemStart(_, label, attrs, _) =>
@ -83,7 +72,8 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
case "Author" => currentArticle.getAuthors.add(currentAuthor)
case "Journal" => currentArticle.setJournal(currentJournal)
case "Grant" => currentArticle.getGrants.add(currentGrant)
case "PubMedPubDate" => if (currentArticle.getDate== null)
case "PubMedPubDate" =>
if (currentArticle.getDate == null)
currentArticle.setDate(validate_Date(currentYear, currentMonth, currentDay))
case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
case "DescriptorName" => currentArticle.getSubjects.add(currentSubject)
@ -106,7 +96,8 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
currentArticle.setDescription(currentArticle.getDescription + text.trim)
}
case "PMID" => currentArticle.setPmid(text.trim)
case "ArticleId" => if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim)
case "ArticleId" =>
if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim)
case "Language" => currentArticle.setLanguage(text.trim)
case "ISSN" => currentJournal.setIssn(text.trim)
case "GrantID" => currentGrant.setGrantID(text.trim)
@ -122,7 +113,8 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
if (currentAuthor != null)
currentAuthor.setLastName(text.trim)
}
case "ForeName" => if (currentAuthor != null)
case "ForeName" =>
if (currentAuthor != null)
currentAuthor.setForeName(text.trim)
case "Title" =>
if (currentJournal.getTitle == null)
@ -139,8 +131,3 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
null
}
}

View File

@ -9,21 +9,29 @@ import collection.JavaConverters._
import java.util.regex.Pattern
/**
*
*/
object PubMedToOaf {
val SUBJ_CLASS = "keywords"
val urlMap = Map(
"pmid" -> "https://pubmed.ncbi.nlm.nih.gov/",
"doi" -> "https://dx.doi.org/"
)
val dataInfo: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
val dataInfo: DataInfo = OafMapperUtils.dataInfo(
false,
null,
false,
false,
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
"0.9"
)
/**
* Cleaning the DOI Applying regex in order to
val collectedFrom: KeyValue =
OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
/** Cleaning the DOI Applying regex in order to
* remove doi starting with URL
*
* @param doi input DOI
@ -33,7 +41,6 @@ object PubMedToOaf {
val regex = "^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$"
val pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE)
val matcher = pattern.matcher(doi)
@ -43,9 +50,7 @@ object PubMedToOaf {
null
}
/**
*
* Create an instance of a class that extends Result
/** Create an instance of a class that extends Result
* starting from OAF instanceType value
*
* @param cobjQualifier OAF instance type
@ -53,7 +58,11 @@ object PubMedToOaf {
* @return the correct instance
*/
def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = {
val result_typologies = getVocabularyTerm(ModelConstants.DNET_RESULT_TYPOLOGIES, vocabularies, cobjQualifier.getClassid)
val result_typologies = getVocabularyTerm(
ModelConstants.DNET_RESULT_TYPOLOGIES,
vocabularies,
cobjQualifier.getClassid
)
result_typologies.getClassid match {
case "dataset" => new Dataset
case "publication" => new Publication
@ -64,8 +73,7 @@ object PubMedToOaf {
}
}
/**
* Mapping the PubMed journal info into the OAF Journal
/** Mapping the PubMed journal info into the OAF Journal
*
* @param j the pubmedJournal
* @return the OAF Journal
@ -83,27 +91,26 @@ object PubMedToOaf {
journal.setIss(j.getIssue)
journal
}
/** Looks up a term in a vocabulary, first among the synonyms and then among the terms.
  *
  * Both lookups are always performed; the synonym match is preferred when it is not null.
  *
  * @param vocabularyName the input vocabulary name
  * @param vocabularies all the vocabularies
  * @param term the term to find
  * @return the qualifier found through the synonyms, or the one found through the terms
  *         when no synonym matches
  */
def getVocabularyTerm(
  vocabularyName: String,
  vocabularies: VocabularyGroup,
  term: String
): Qualifier = {
  val bySynonym = vocabularies.getSynonymAsQualifier(vocabularyName, term)
  val byTerm = vocabularies.getTermAsQualifier(vocabularyName, term)
  // byTerm is already computed, so the fallback does not change evaluation order
  Option(bySynonym).getOrElse(byTerm)
}
/**
* Map the Pubmed Article into the OAF instance
/** Map the Pubmed Article into the OAF instance
*
* @param article the pubmed articles
* @param vocabularies the vocabularies
@ -114,9 +121,17 @@ object PubMedToOaf {
if (article.getPublicationTypes == null)
return null
// MAP PMID into pid with classid = classname = pmid
val pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
val pidList: List[StructuredProperty] = List(
OafMapperUtils.structuredProperty(
article.getPmid,
PidType.pmid.toString,
PidType.pmid.toString,
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES,
dataInfo
)
)
if (pidList == null)
return null
@ -125,7 +140,14 @@ object PubMedToOaf {
if (article.getDoi != null) {
val normalizedPid = cleanDoi(article.getDoi)
if (normalizedPid != null)
alternateIdentifier = OafMapperUtils.structuredProperty(normalizedPid, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)
alternateIdentifier = OafMapperUtils.structuredProperty(
normalizedPid,
PidType.doi.toString,
PidType.doi.toString,
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES,
dataInfo
)
}
// INSTANCE MAPPING
@ -133,10 +155,12 @@ object PubMedToOaf {
// If the article contains the typology Journal Article then we apply this type
//else we have to find a term that matches the vocabulary, otherwise we discard it
val ja = article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue))
val ja =
article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue))
val pubmedInstance = new Instance
if (ja.isDefined) {
val cojbCategory = getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue)
val cojbCategory =
getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue)
pubmedInstance.setInstancetype(cojbCategory)
} else {
val i_type = article.getPublicationTypes.asScala
@ -155,7 +179,9 @@ object PubMedToOaf {
if (alternateIdentifier != null)
pubmedInstance.setAlternateIdentifier(List(alternateIdentifier).asJava)
result.setInstance(List(pubmedInstance).asJava)
pubmedInstance.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection.breakOut)
pubmedInstance.getPid.asScala
.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid))
.map(p => p.getValue)(collection.breakOut)
//CREATE URL From pmid
val urlLists: List[String] = pidList
.map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue))
@ -165,7 +191,9 @@ object PubMedToOaf {
pubmedInstance.setUrl(urlLists.asJava)
//ASSIGN DateofAcceptance
pubmedInstance.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
pubmedInstance.setDateofacceptance(
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)
)
//ASSIGN COLLECTEDFROM
pubmedInstance.setCollectedfrom(collectedFrom)
result.setPid(pidList.asJava)
@ -173,7 +201,6 @@ object PubMedToOaf {
//END INSTANCE MAPPING
//--------------------------------------------------------------------------------------
// JOURNAL MAPPING
//--------------------------------------------------------------------------------------
if (article.getJournal != null && result.isInstanceOf[Publication])
@ -182,31 +209,48 @@ object PubMedToOaf {
//END JOURNAL MAPPING
//--------------------------------------------------------------------------------------
// RESULT MAPPING
//--------------------------------------------------------------------------------------
result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
result.setDateofacceptance(
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)
)
if (article.getTitle == null || article.getTitle.isEmpty)
return null
result.setTitle(List(OafMapperUtils.structuredProperty(article.getTitle, ModelConstants.MAIN_TITLE_QUALIFIER, dataInfo)).asJava)
result.setTitle(
List(
OafMapperUtils.structuredProperty(
article.getTitle,
ModelConstants.MAIN_TITLE_QUALIFIER,
dataInfo
)
).asJava
)
if (article.getDescription != null && article.getDescription.nonEmpty)
result.setDescription(List(OafMapperUtils.field(article.getDescription, dataInfo)).asJava)
if (article.getLanguage != null) {
val term = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, article.getLanguage)
val term =
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, article.getLanguage)
if (term != null)
result.setLanguage(term)
}
val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s => OafMapperUtils.structuredProperty(s.getValue, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, dataInfo))(collection.breakOut)
val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s =>
OafMapperUtils.structuredProperty(
s.getValue,
SUBJ_CLASS,
SUBJ_CLASS,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
dataInfo
)
)(collection.breakOut)
if (subjects != null)
result.setSubject(subjects.asJava)
val authors: List[Author] = article.getAuthors.asScala.zipWithIndex.map { case (a, index) =>
val author = new Author()
author.setName(a.getForeName)
@ -216,15 +260,12 @@ object PubMedToOaf {
author
}(collection.breakOut)
if (authors != null && authors.nonEmpty)
result.setAuthor(authors.asJava)
result.setOriginalId(pidList.map(s => s.getValue).asJava)
result.setId(article.getPmid)
// END RESULT MAPPING
//--------------------------------------------------------------------------------------
val id = IdentifierFactory.createIdentifier(result)
@ -234,5 +275,4 @@ object PubMedToOaf {
result
}
}

View File

@ -17,7 +17,8 @@ import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
import java.text.SimpleDateFormat
class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:Logger) extends AbstractScalaApplication(propertyPath, args, log:Logger) {
class SparkRetrieveDataciteDelta(propertyPath: String, args: Array[String], log: Logger)
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
val ISO_DATE_PATTERN = "yyyy-MM-dd'T'HH:mm:ssZ"
val simpleFormatter = new SimpleDateFormat(ISO_DATE_PATTERN)
@ -28,16 +29,13 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
val RESOLVED_REL_PATH_NAME = "resolvedRelation"
val SCHOLIX_PATH_NAME = "scholix"
/** Base path of the ScholixResource datasets: `workingPath/SCHOLIX_RESOURCE_PATH_NAME`.
  * Callers in this class append a suffix such as `_native`, `_update` or `_graph`.
  */
def scholixResourcePath(workingPath: String) = {
  s"${workingPath}/${SCHOLIX_RESOURCE_PATH_NAME}"
}
/** Path of the transformed Datacite OAF records: `workingPath/DATACITE_OAF_PATH_NAME`. */
def dataciteOAFPath(workingPath: String) = {
  s"${workingPath}/${DATACITE_OAF_PATH_NAME}"
}
/** Path of the pid map dataset: `workingPath/PID_MAP_PATH_NAME`. */
def pidMapPath(workingPath: String) = {
  s"${workingPath}/${PID_MAP_PATH_NAME}"
}
/** Path of the resolved relations dataset: `workingPath/RESOLVED_REL_PATH_NAME`. */
def resolvedRelationPath(workingPath: String) = {
  s"${workingPath}/${RESOLVED_REL_PATH_NAME}"
}
/** Path of the scholix dataset: `workingPath/SCHOLIX_PATH_NAME`. */
def scholixPath(workingPath: String) = {
  s"${workingPath}/${SCHOLIX_PATH_NAME}"
}
/**
* Utility to parse Date in ISO8601 to epochMillis
/** Utility to parse Date in ISO8601 to epochMillis
* @param inputDate The String representing an input date in ISO8601
* @return The epoch milliseconds corresponding to the parsed date
*/
@ -45,9 +43,7 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
simpleFormatter.parse(inputDate).getTime
}
/**
* This method tries to retrieve the last collection date from all datacite
/** This method tries to retrieve the last collection date from all datacite
* records in HDFS.
* This method should be called before indexing scholexplorer to retrieve
* the delta of Datacite record to download, since from the generation of
@ -63,16 +59,23 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
import spark.implicits._
val entitiesDS = spark.read.load(s"$entitiesPath/*").as[Oaf].filter(o =>o.isInstanceOf[Result]).map(r => r.asInstanceOf[Result])
val entitiesDS = spark.read
.load(s"$entitiesPath/*")
.as[Oaf]
.filter(o => o.isInstanceOf[Result])
.map(r => r.asInstanceOf[Result])
val date = entitiesDS.filter(r => r.getDateofcollection!= null).map(_.getDateofcollection).select(max("value")).first.getString(0)
val date = entitiesDS
.filter(r => r.getDateofcollection != null)
.map(_.getDateofcollection)
.select(max("value"))
.first
.getString(0)
ISO8601toEpochMillis(date) / 1000
}
/**
* The method of update Datacite relationships on Scholexplorer
/** The method that updates the Datacite relationships on Scholexplorer
* needs some utility data structures.
* One is the scholixResource DS that stores all the nodes in the Scholix Graph
* in format ScholixResource
@ -80,19 +83,26 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
* @param workingPath the working path
* @param spark the spark session
*/
def generateScholixResource(summaryPath:String, workingPath: String, spark:SparkSession) :Unit = {
def generateScholixResource(
summaryPath: String,
workingPath: String,
spark: SparkSession
): Unit = {
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
log.info("Convert All summary to ScholixResource")
spark.read.load(summaryPath).as[ScholixSummary]
spark.read
.load(summaryPath)
.as[ScholixSummary]
.map(ScholixUtils.generateScholixResourceFromSummary)(scholixResourceEncoder)
.filter(r => r.getIdentifier != null && r.getIdentifier.size > 0)
.write.mode(SaveMode.Overwrite).save(s"${scholixResourcePath(workingPath)}_native")
.write
.mode(SaveMode.Overwrite)
.save(s"${scholixResourcePath(workingPath)}_native")
}
/**
* This method convert the new Datacite Resource into Scholix Resource
/** This method converts the new Datacite Resource into Scholix Resource
* Needed to fill the source and the type of Scholix Relationships
* @param workingPath the Working Path
* @param spark The spark Session
@ -103,25 +113,28 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
import spark.implicits._
spark.read.load(dataciteOAFPath(workingPath)).as[Oaf]
spark.read
.load(dataciteOAFPath(workingPath))
.as[Oaf]
.filter(_.isInstanceOf[Result])
.map(_.asInstanceOf[Result])
.map(ScholixUtils.generateScholixResourceFromResult)
.filter(r => r.getIdentifier != null && r.getIdentifier.size > 0)
.write.mode(SaveMode.Overwrite).save(s"${scholixResourcePath(workingPath)}_update")
.write
.mode(SaveMode.Overwrite)
.save(s"${scholixResourcePath(workingPath)}_update")
val update = spark.read.load(s"${scholixResourcePath(workingPath)}_update").as[ScholixResource]
val native = spark.read.load(s"${scholixResourcePath(workingPath)}_native").as[ScholixResource]
val graph = update.union(native)
val graph = update
.union(native)
.groupByKey(_.getDnetIdentifier)
.reduceGroups((a, b) => if (a != null && a.getDnetIdentifier != null) a else b)
.map(_._2)
graph.write.mode(SaveMode.Overwrite).save(s"${scholixResourcePath(workingPath)}_graph")
}
/**
* This method get and Transform only datacite records with
/** This method gets and transforms only the Datacite records with
* a timestamp greater than or equal to the given timestamp
* @param datacitePath the datacite input Path
* @param timestamp the timestamp
@ -130,31 +143,44 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
* @param vocabularies Vocabularies needed for transformation
*/
def getDataciteUpdate(datacitePath:String, timestamp:Long, workingPath:String, spark:SparkSession,vocabularies: VocabularyGroup): Long = {
def getDataciteUpdate(
datacitePath: String,
timestamp: Long,
workingPath: String,
spark: SparkSession,
vocabularies: VocabularyGroup
): Long = {
import spark.implicits._
val ds = spark.read.load(datacitePath).as[DataciteType]
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
val total = ds.filter(_.timestamp >= timestamp).count()
if (total > 0) {
ds.filter(_.timestamp >= timestamp)
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks = true))
.flatMap(i => fixRelations(i)).filter(i => i != null)
.write.mode(SaveMode.Overwrite).save(dataciteOAFPath(workingPath))
.flatMap(d =>
DataciteToOAFTransformation
.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks = true)
)
.flatMap(i => fixRelations(i))
.filter(i => i != null)
.write
.mode(SaveMode.Overwrite)
.save(dataciteOAFPath(workingPath))
}
total
}
/**
* After added the new ScholixResource, we need to update the scholix Pid Map
/** After adding the new ScholixResource, we need to update the scholix Pid Map
* so that it can be intersected with the new Datacite Relations
*
* @param workingPath The working path under which the new Map is saved
* @param spark the spark session
*/
def generatePidMap(workingPath: String, spark: SparkSession): Unit = {
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
import spark.implicits._
spark.read.load(s"${scholixResourcePath(workingPath)}_graph").as[ScholixResource]
spark.read
.load(s"${scholixResourcePath(workingPath)}_graph")
.as[ScholixResource]
.flatMap(r =>
r.getIdentifier.asScala
.map(i => DHPUtils.generateUnresolvedIdentifier(i.getIdentifier, i.getSchema))
@ -163,11 +189,12 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
.groupByKey(_._1)
.reduceGroups((a, b) => if (a != null && a._2 != null) a else b)
.map(_._2)(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.write.mode(SaveMode.Overwrite).save(pidMapPath(workingPath))
.write
.mode(SaveMode.Overwrite)
.save(pidMapPath(workingPath))
}
/**
* This method resolve the datacite relation and filter the resolved
/** This method resolves the Datacite relations and filters the resolved
* relations
* @param workingPath the working path
* @param spark the spark session
@ -180,7 +207,9 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
val pidMap = spark.read.load(pidMapPath(workingPath)).as[(String, String)]
val unresolvedRelations:Dataset[(String,Relation)] = spark.read.load(dataciteOAFPath(workingPath)).as[Oaf]
val unresolvedRelations: Dataset[(String, Relation)] = spark.read
.load(dataciteOAFPath(workingPath))
.as[Oaf]
.filter(_.isInstanceOf[Relation])
.map(_.asInstanceOf[Relation])
.map { r =>
@ -202,15 +231,12 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
r
})(relationEncoder)
.filter(r => !(r.getSource.startsWith("unresolved") || r.getTarget.startsWith("unresolved")))
.write.mode(SaveMode.Overwrite)
.write
.mode(SaveMode.Overwrite)
.save(resolvedRelationPath(workingPath))
}
/**
* This method generate scholix starting from resolved relation
*
/** This method generates scholix starting from the resolved relations
*
* @param workingPath
* @param spark
@ -220,35 +246,44 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
implicit val relationEncoder: Encoder[Relation] = Encoders.kryo[Relation]
implicit val intermediateEncoder :Encoder[(String,Scholix)] = Encoders.tuple(Encoders.STRING, scholixEncoder)
implicit val intermediateEncoder: Encoder[(String, Scholix)] =
Encoders.tuple(Encoders.STRING, scholixEncoder)
val relations: Dataset[(String, Relation)] = spark.read
.load(resolvedRelationPath(workingPath))
.as[Relation]
.map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relationEncoder))
val relations:Dataset[(String, Relation)] = spark.read.load(resolvedRelationPath(workingPath)).as[Relation].map(r =>(r.getSource,r))(Encoders.tuple(Encoders.STRING, relationEncoder))
val id_summary:Dataset[(String,ScholixResource)] = spark.read.load(s"${scholixResourcePath(workingPath)}_graph").as[ScholixResource].map(r => (r.getDnetIdentifier,r))(Encoders.tuple(Encoders.STRING, scholixResourceEncoder))
val id_summary: Dataset[(String, ScholixResource)] = spark.read
.load(s"${scholixResourcePath(workingPath)}_graph")
.as[ScholixResource]
.map(r => (r.getDnetIdentifier, r))(Encoders.tuple(Encoders.STRING, scholixResourceEncoder))
id_summary.cache()
relations.joinWith(id_summary, relations("_1").equalTo(id_summary("_1")),"inner")
relations
.joinWith(id_summary, relations("_1").equalTo(id_summary("_1")), "inner")
.map(t => (t._1._2.getTarget, ScholixUtils.scholixFromSource(t._1._2, t._2._2)))
.write.mode(SaveMode.Overwrite).save(s"$workingPath/scholix_one_verse")
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/scholix_one_verse")
val source_scholix:Dataset[(String, Scholix)] =spark.read.load(s"$workingPath/scholix_one_verse").as[(String,Scholix)]
val source_scholix: Dataset[(String, Scholix)] =
spark.read.load(s"$workingPath/scholix_one_verse").as[(String, Scholix)]
source_scholix.joinWith(id_summary, source_scholix("_1").equalTo(id_summary("_1")),"inner")
source_scholix
.joinWith(id_summary, source_scholix("_1").equalTo(id_summary("_1")), "inner")
.map(t => {
val target: ScholixResource = t._2._2
val scholix: Scholix = t._1._2
ScholixUtils.generateCompleteScholix(scholix, target)
})(scholixEncoder).write.mode(SaveMode.Overwrite).save(s"$workingPath/scholix")
})(scholixEncoder)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/scholix")
}
/**
* Here all the spark applications runs this method
/** Here all the spark applications run this method
* where the whole logic of the spark node is defined
*/
override def run(): Unit = {
@ -268,7 +303,6 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
require(vocabularies != null)
val updateDS: Boolean = "true".equalsIgnoreCase(parser.get("updateDS"))
log.info(s"updateDS is '$updateDS'")
@ -277,15 +311,18 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
generateScholixResource(s"$sourcePath/provision/summaries", workingPath, spark)
log.info("Retrieve last entities collected From starting from scholix Graph")
lastCollectionDate = retrieveLastCollectedFrom(spark, s"$sourcePath/entities")
}
else {
} else {
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
fs.delete(new Path(s"${scholixResourcePath(workingPath)}_native"), true)
fs.rename(new Path(s"${scholixResourcePath(workingPath)}_graph"), new Path(s"${scholixResourcePath(workingPath)}_native"))
fs.rename(
new Path(s"${scholixResourcePath(workingPath)}_graph"),
new Path(s"${scholixResourcePath(workingPath)}_native")
)
lastCollectionDate = retrieveLastCollectedFrom(spark, dataciteOAFPath(workingPath))
}
val numRecords = getDataciteUpdate(datacitePath, lastCollectionDate, workingPath, spark, vocabularies)
val numRecords =
getDataciteUpdate(datacitePath, lastCollectionDate, workingPath, spark, vocabularies)
if (numRecords > 0) {
addMissingScholixResource(workingPath, spark)
generatePidMap(workingPath, spark)
@ -295,11 +332,14 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L
}
}
object SparkRetrieveDataciteDelta {
val log: Logger = LoggerFactory.getLogger(SparkRetrieveDataciteDelta.getClass)
def main(args: Array[String]): Unit = {
new SparkRetrieveDataciteDelta("/eu/dnetlib/dhp/sx/graph/retrieve_datacite_delta_params.json", args, log).initialize().run()
new SparkRetrieveDataciteDelta(
"/eu/dnetlib/dhp/sx/graph/retrieve_datacite_delta_params.json",
args,
log
).initialize().run()
}
}

View File

@ -1,6 +1,5 @@
package eu.dnetlib.dhp.datacite
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.oaf.Oaf
@ -37,7 +36,6 @@ class DataciteToOAFTest extends AbstractVocabularyTest{
FileUtils.deleteDirectory(workingDir.toFile)
}
@Test
def testDateMapping: Unit = {
val inputDate = "2021-07-14T11:52:54+0000"
@ -45,24 +43,21 @@ class DataciteToOAFTest extends AbstractVocabularyTest{
val dt = ISO8601FORMAT.parse(inputDate)
println(dt.getTime)
}
@Test
def testConvert(): Unit = {
val path = getClass.getResource("/eu/dnetlib/dhp/actionmanager/datacite/dataset").getPath
val conf = new SparkConf()
val spark:SparkSession = SparkSession.builder().config(conf)
val spark: SparkSession = SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master("local[*]")
.getOrCreate()
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
val instance = new GenerateDataciteDatasetSpark(null, null, log)
val targetPath = s"$workingDir/result"
@ -73,30 +68,31 @@ class DataciteToOAFTest extends AbstractVocabularyTest{
val nativeSize = spark.read.load(path).count()
assertEquals(100, nativeSize)
val result: Dataset[Oaf] = spark.read.load(targetPath).as[Oaf]
result.map(s => s.getClass.getSimpleName).groupBy(col("value").alias("class")).agg(count("value").alias("Total")).show(false)
result
.map(s => s.getClass.getSimpleName)
.groupBy(col("value").alias("class"))
.agg(count("value").alias("Total"))
.show(false)
val t = spark.read.load(targetPath).count()
assertTrue(t > 0)
spark.stop()
}
@Test
def testMapping(): Unit = {
val record =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/record.json")).mkString
val record = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/record.json")
)
.mkString
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
val res: List[Oaf] = DataciteToOAFTransformation.generateOAF(record, 0L, 0L, vocabularies, true)
@ -107,8 +103,6 @@ class DataciteToOAFTest extends AbstractVocabularyTest{
})
}
}

View File

@ -22,7 +22,6 @@ import scala.xml.pull.XMLEventReader
@ExtendWith(Array(classOf[MockitoExtension]))
class BioScholixTest extends AbstractVocabularyTest {
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
@ -38,53 +37,55 @@ class BioScholixTest extends AbstractVocabularyTest{
}
object GzFileIterator {
def apply(is: InputStream, encoding: String) = {
new BufferedReaderIterator(
new BufferedReader(
new InputStreamReader(
new GZIPInputStream(
is), encoding)))
new BufferedReader(new InputStreamReader(new GZIPInputStream(is), encoding))
)
}
}
@Test
def testEBIData() = {
val inputXML = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")).mkString
val inputXML = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
.mkString
val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
}
@Test
def testPubmedToOaf(): Unit = {
assertNotNull(vocabularies)
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump")).mkString
val r:List[Oaf] = records.lines.toList.map(s=>mapper.readValue(s, classOf[PMArticle])).map(a => PubMedToOaf.convert(a, vocabularies))
val records: String = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump"))
.mkString
val r: List[Oaf] = records.lines.toList
.map(s => mapper.readValue(s, classOf[PMArticle]))
.map(a => PubMedToOaf.convert(a, vocabularies))
assertEquals(10, r.size)
assertTrue(r.map(p => p.asInstanceOf[Result]).flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid)).exists(p => "0037".equalsIgnoreCase(p)))
assertTrue(
r.map(p => p.asInstanceOf[Result])
.flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid))
.exists(p => "0037".equalsIgnoreCase(p))
)
println(mapper.writeValueAsString(r.head))
}
@Test
def testPDBToOAF(): Unit = {
assertNotNull(vocabularies)
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pdb_dump")).mkString
val records: String = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pdb_dump"))
.mkString
records.lines.foreach(s => assertTrue(s.nonEmpty))
val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.pdbTOOaf(o))
assertTrue(result.nonEmpty)
result.foreach(r => assertNotNull(r))
@ -93,20 +94,19 @@ class BioScholixTest extends AbstractVocabularyTest{
}
@Test
def testUNIprotToOAF(): Unit = {
assertNotNull(vocabularies)
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump")).mkString
val records: String = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump"))
.mkString
records.lines.foreach(s => assertTrue(s.nonEmpty))
val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.uniprotToOAF(o))
assertTrue(result.nonEmpty)
result.foreach(r => assertNotNull(r))
@ -115,7 +115,14 @@ class BioScholixTest extends AbstractVocabularyTest{
}
case class EBILinks(relType:String, date:String, title:String, pmid:String, targetPid:String, targetPidType:String) {}
case class EBILinks(
relType: String,
date: String,
title: String,
pmid: String,
targetPid: String,
targetPidType: String
) {}
def parse_ebi_links(input: String): List[EBILinks] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -135,14 +142,14 @@ class BioScholixTest extends AbstractVocabularyTest{
} yield EBILinks(relation, publicationDate, title, pmid, id, idScheme)
}
@Test
def testCrossrefLinksToOAF(): Unit = {
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/crossref_links")).mkString
val records: String = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/crossref_links"))
.mkString
records.lines.foreach(s => assertTrue(s.nonEmpty))
val result: List[Oaf] = records.lines.map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList
assertNotNull(result)
@ -154,24 +161,30 @@ class BioScholixTest extends AbstractVocabularyTest{
@Test
def testEBILinksToOAF(): Unit = {
val iterator = GzFileIterator(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/ebi_links.gz"), "UTF-8")
val iterator = GzFileIterator(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/ebi_links.gz"),
"UTF-8"
)
val data = iterator.next()
val res = BioDBToOAF.parse_ebi_links(BioDBToOAF.extractEBILinksFromDump(data).links).filter(BioDBToOAF.EBITargetLinksFilter).flatMap(BioDBToOAF.convertEBILinksToOaf)
val res = BioDBToOAF
.parse_ebi_links(BioDBToOAF.extractEBILinksFromDump(data).links)
.filter(BioDBToOAF.EBITargetLinksFilter)
.flatMap(BioDBToOAF.convertEBILinksToOaf)
print(res.length)
println(mapper.writeValueAsString(res.head))
}
@Test
def scholixResolvedToOAF(): Unit = {
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved")).mkString
val records: String = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved")
)
.mkString
records.lines.foreach(s => assertTrue(s.nonEmpty))
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -181,7 +194,6 @@ class BioScholixTest extends AbstractVocabularyTest{
json.extract[ScholixResolved]
}.toList
val result: List[Oaf] = l.map(s => BioDBToOAF.scholixResolvedToOAF(s))
assertTrue(result.nonEmpty)

View File

@ -16,10 +16,22 @@ import java.time.LocalDate
import java.time.format.DateTimeFormatter
import scala.collection.JavaConverters._
case class HostedByItemType(
id: String,
officialname: String,
issn: String,
eissn: String,
lissn: String,
openAccess: Boolean
) {}
case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
case class DoiBoostAffiliation(PaperId:Long, AffiliationId:Long, GridId:Option[String], OfficialPage:Option[String], DisplayName:Option[String]){}
case class DoiBoostAffiliation(
PaperId: Long,
AffiliationId: Long,
GridId: Option[String],
OfficialPage: Option[String],
DisplayName: Option[String]
) {}
object DoiBoostMappingUtil {
@ -43,7 +55,17 @@ object DoiBoostMappingUtil {
val DOI_PREFIX_REGEX = "(^10\\.|\\/10.)"
val DOI_PREFIX = "10."
val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")
val invalidName = List(
",",
"none none",
"none, none",
"none &na;",
"(:null)",
"test test test",
"test test",
"test",
"&na; &na;"
)
def toActionSet(item: Oaf): (String, String) = {
val mapper = new ObjectMapper()
@ -75,7 +97,6 @@ object DoiBoostMappingUtil {
}
def toHostedByItem(input: String): (String, HostedByItemType) = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -84,7 +105,6 @@ object DoiBoostMappingUtil {
(c.keys.head, c.values.head)
}
def toISSNPair(publication: Publication): (String, Publication) = {
val issn = if (publication.getJournal == null) null else publication.getJournal.getIssnPrinted
val eissn = if (publication.getJournal == null) null else publication.getJournal.getIssnOnline
@ -100,26 +120,24 @@ object DoiBoostMappingUtil {
(publication.getId, publication)
}
/** Builds the identifier of a GRID affiliation.
  *
  * The GRID id is lower-cased and trimmed, hashed with `DHPUtils.md5`, and the
  * hash is appended to the fixed prefix `20|grid________::`.
  *
  * @param gridId the GRID identifier
  * @return the prefixed identifier string
  */
def generateGridAffiliationId(gridId: String): String = {
  val normalizedGridId = gridId.toLowerCase().trim()
  val hash = DHPUtils.md5(normalizedGridId)
  s"20|grid________::$hash"
}
def fixResult(result: Dataset): Dataset = {
val instanceType = extractInstance(result)
if (instanceType.isDefined) {
result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
}
result.getInstance().asScala.foreach(i => {
result
.getInstance()
.asScala
.foreach(i => {
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
})
result
}
def decideAccessRight(lic: Field[String], date: String): AccessRight = {
if (lic == null) {
//Default value Unknown
@ -127,7 +145,8 @@ object DoiBoostMappingUtil {
}
val license: String = lic.getValue
//CC licenses
if(license.startsWith("cc") ||
if (
license.startsWith("cc") ||
license.startsWith("http://creativecommons.org/licenses") ||
license.startsWith("https://creativecommons.org/licenses") ||
@ -137,7 +156,8 @@ object DoiBoostMappingUtil {
license.equals("http://pubs.acs.org/page/policy/authorchoice_ccbyncnd_termsofuse.html") ||
//APA (considered OPEN also by Unpaywall)
license.equals("http://www.apa.org/pubs/journals/resources/open-access.aspx")){
license.equals("http://www.apa.org/pubs/journals/resources/open-access.aspx")
) {
val oaq: AccessRight = getOpenAccessQualifier()
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
@ -145,7 +165,11 @@ object DoiBoostMappingUtil {
}
//OUP (BUT ONLY AFTER 12 MONTHS FROM THE PUBLICATION DATE, OTHERWISE THEY ARE EMBARGOED)
if(license.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")){
if (
license.equals(
"https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"
)
) {
val now = java.time.LocalDate.now
try {
@ -154,20 +178,19 @@ object DoiBoostMappingUtil {
val oaq: AccessRight = getOpenAccessQualifier()
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
return oaq
}
else{
} else {
return getEmbargoedAccessQualifier()
}
} catch {
case e: Exception => {
try {
val pub_date = LocalDate.parse(date, DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'"))
val pub_date =
LocalDate.parse(date, DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'"))
if (((now.toEpochDay - pub_date.toEpochDay) / 365.0) > 1) {
val oaq: AccessRight = getOpenAccessQualifier()
oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
return oaq
}
else{
} else {
return getEmbargoedAccessQualifier()
}
} catch {
@ -183,34 +206,56 @@ object DoiBoostMappingUtil {
}
def getOpenAccessQualifier(): AccessRight = {
OafMapperUtils.accessRight(ModelConstants.ACCESS_RIGHT_OPEN,"Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
OafMapperUtils.accessRight(
ModelConstants.ACCESS_RIGHT_OPEN,
"Open Access",
ModelConstants.DNET_ACCESS_MODES,
ModelConstants.DNET_ACCESS_MODES
)
}
def getRestrictedQualifier(): AccessRight = {
OafMapperUtils.accessRight( "RESTRICTED","Restricted",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
OafMapperUtils.accessRight(
"RESTRICTED",
"Restricted",
ModelConstants.DNET_ACCESS_MODES,
ModelConstants.DNET_ACCESS_MODES
)
}
def getUnknownQualifier(): AccessRight = {
OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
OafMapperUtils.accessRight(
ModelConstants.UNKNOWN,
ModelConstants.NOT_AVAILABLE,
ModelConstants.DNET_ACCESS_MODES,
ModelConstants.DNET_ACCESS_MODES
)
}
def getEmbargoedAccessQualifier(): AccessRight = {
OafMapperUtils.accessRight("EMBARGO","Embargo",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
OafMapperUtils.accessRight(
"EMBARGO",
"Embargo",
ModelConstants.DNET_ACCESS_MODES,
ModelConstants.DNET_ACCESS_MODES
)
}
def getClosedAccessQualifier(): AccessRight = {
OafMapperUtils.accessRight("CLOSED","Closed Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
OafMapperUtils.accessRight(
"CLOSED",
"Closed Access",
ModelConstants.DNET_ACCESS_MODES,
ModelConstants.DNET_ACCESS_MODES
)
}
def extractInstance(r: Result): Option[Instance] = {
r.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
r.getInstance()
.asScala
.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
}
def fixPublication(input: ((String, Publication), (String, HostedByItemType))): Publication = {
@ -222,10 +267,16 @@ object DoiBoostMappingUtil {
val instanceType: Option[Instance] = extractInstance(publication)
if (instanceType.isDefined) {
publication.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
publication
.getInstance()
.asScala
.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
}
publication.getInstance().asScala.foreach(i => {
publication
.getInstance()
.asScala
.foreach(i => {
var hb = new KeyValue
if (item != null) {
hb.setValue(item.officialname)
@ -235,8 +286,7 @@ object DoiBoostMappingUtil {
i.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
}
}
else {
} else {
hb = ModelConstants.UNKNOWN_REPOSITORY
}
i.setHostedby(hb)
@ -270,17 +320,22 @@ object DoiBoostMappingUtil {
if (publication.getTitle == null || publication.getTitle.size == 0)
return false
val s = publication.getTitle.asScala.count(p => p.getValue != null
&& p.getValue.nonEmpty && !p.getValue.equalsIgnoreCase("[NO TITLE AVAILABLE]"))
val s = publication.getTitle.asScala.count(p =>
p.getValue != null
&& p.getValue.nonEmpty && !p.getValue.equalsIgnoreCase("[NO TITLE AVAILABLE]")
)
if (s == 0)
return false
// fixes #4360 (test publisher)
val publisher = if (publication.getPublisher != null) publication.getPublisher.getValue else null
val publisher =
if (publication.getPublisher != null) publication.getPublisher.getValue else null
if (publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher.equalsIgnoreCase("CrossRef Test Account"))) {
if (
publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher
.equalsIgnoreCase("CrossRef Test Account"))
) {
return false;
}
@ -288,18 +343,12 @@ object DoiBoostMappingUtil {
if (publication.getAuthor == null || publication.getAuthor.size() == 0)
return false
//filter invalid author
val authors = publication.getAuthor.asScala.map(s => {
if (s.getFullname.nonEmpty) {
s.getFullname
}
else
s"${
s.getName
} ${
s.getSurname
}"
} else
s"${s.getName} ${s.getSurname}"
})
val c = authors.count(isValidAuthorName)
@ -307,13 +356,16 @@ object DoiBoostMappingUtil {
return false
// fixes #4368
if (authors.count(s => s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase(publication.getPublisher.getValue))
if (
authors.count(s => s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase(
publication.getPublisher.getValue
)
)
return false
true
}
def isValidAuthorName(fullName: String): Boolean = {
if (fullName == null || fullName.isEmpty)
return false
@ -322,20 +374,30 @@ object DoiBoostMappingUtil {
true
}
/** Creates a `DataInfo` with the given trust value.
  *
  * The flags deletedbyinference, inferred and invisible are all set to `false`.
  * The provenance action is a qualifier built by `OafMapperUtils.qualifier` from
  * `ModelConstants.SYSIMPORT_ACTIONSET` (passed for the first two arguments) and
  * `ModelConstants.DNET_PROVENANCE_ACTIONS` (passed for the last two); it is set once.
  *
  * @param trust the value stored through `setTrust`
  * @return the populated `DataInfo`
  */
def generateDataInfo(trust: String): DataInfo = {
  val provenanceAction = OafMapperUtils.qualifier(
    ModelConstants.SYSIMPORT_ACTIONSET,
    ModelConstants.SYSIMPORT_ACTIONSET,
    ModelConstants.DNET_PROVENANCE_ACTIONS,
    ModelConstants.DNET_PROVENANCE_ACTIONS
  )
  val di = new DataInfo
  di.setDeletedbyinference(false)
  di.setInferred(false)
  di.setInvisible(false)
  di.setTrust(trust)
  di.setProvenanceaction(provenanceAction)
  di
}
def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String): StructuredProperty = {
def createSP(
value: String,
classId: String,
className: String,
schemeId: String,
schemeName: String
): StructuredProperty = {
val sp = new StructuredProperty
sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName))
sp.setValue(value)
@ -343,9 +405,14 @@ object DoiBoostMappingUtil {
}
def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String, dataInfo: DataInfo): StructuredProperty = {
def createSP(
value: String,
classId: String,
className: String,
schemeId: String,
schemeName: String,
dataInfo: DataInfo
): StructuredProperty = {
val sp = new StructuredProperty
sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName))
sp.setValue(value)
@ -362,9 +429,12 @@ object DoiBoostMappingUtil {
}
def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
def createSP(
value: String,
classId: String,
schemeId: String,
dataInfo: DataInfo
): StructuredProperty = {
val sp = new StructuredProperty
sp.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId))
sp.setValue(value)
@ -382,7 +452,6 @@ object DoiBoostMappingUtil {
}
def createUnpayWallCollectedFrom(): KeyValue = {
val cf = new KeyValue
@ -401,15 +470,11 @@ object DoiBoostMappingUtil {
}
/** Generates the identifier of a DOIBoost result from its DOI.
  *
  * The DOI is lower-cased and hashed with `DHPUtils.md5`; the hash is appended to the
  * fixed "50|" marker followed by `doiBoostNSPREFIX` and `SEPARATOR`.
  *
  * @param oaf the result the identifier is generated for (not read by this method)
  * @param doi the DOI to hash; must not be null
  * @return the identifier string
  */
def generateIdentifier(oaf: Result, doi: String): String = {
  // Locale.ROOT keeps the lower-casing (and therefore the hash) independent of the
  // JVM default locale, e.g. the Turkish locale maps "I" to a dotless "ı".
  val id = DHPUtils.md5(doi.toLowerCase(java.util.Locale.ROOT))
  s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}"
}
def createMAGCollectedFrom(): KeyValue = {
val cf = new KeyValue
@ -424,7 +489,6 @@ object DoiBoostMappingUtil {
tmp.setValue(value)
tmp
}
/** Tells whether a string carries no content.
  *
  * @param x the string to check, may be null
  * @return `true` when `x` is null or is empty after trimming, `false` otherwise
  */
def isEmpty(x: String): Boolean = x == null || x.trim.isEmpty
@ -432,7 +496,10 @@ object DoiBoostMappingUtil {
def normalizeDoi(input: String): String = {
if (input == null)
return null
val replaced = input.replaceAll("(?:\\n|\\r|\\t|\\s)", "").toLowerCase.replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX)
val replaced = input
.replaceAll("(?:\\n|\\r|\\t|\\s)", "")
.toLowerCase
.replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX)
if (isEmpty(replaced))
return null
@ -446,9 +513,6 @@ object DoiBoostMappingUtil {
return ret
}
}

View File

@ -17,22 +17,29 @@ object SparkGenerateDOIBoostActionSet {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json")
)
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
implicit val mapEncoderAS: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)
implicit val mapEncoderAS: Encoder[(String, String)] =
Encoders.tuple(Encoders.STRING, Encoders.STRING)
implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] = Encoders.kryo[AtomicAction[OafDataset]]
implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] =
Encoders.kryo[AtomicAction[OafDataset]]
val dbPublicationPath = parser.get("dbPublicationPath")
val dbDatasetPath = parser.get("dbDatasetPath")
@ -41,35 +48,61 @@ object SparkGenerateDOIBoostActionSet {
val dbOrganizationPath = parser.get("dbOrganizationPath")
val sequenceFilePath = parser.get("sFilePath")
// Action-set pairs for the datasets stored at dbDatasetPath.
// Both conditions of the filter must hold: with "||" a null element would be dereferenced by
// p.getId and a null id would never be rejected. The crossRefRelation filter in this method
// uses "&&" in the same way.
val asDataset = spark.read
  .load(dbDatasetPath)
  .as[OafDataset]
  .filter(p => p != null && p.getId != null)
  .map(d => DoiBoostMappingUtil.fixResult(d))
  .map(d => DoiBoostMappingUtil.toActionSet(d))(
    Encoders.tuple(Encoders.STRING, Encoders.STRING)
  )
// Action-set pairs for the publications stored at dbPublicationPath.
// Both conditions of the filter must hold: with "||" a null element would be dereferenced by
// p.getId and a null id would never be rejected.
val asPublication = spark.read
  .load(dbPublicationPath)
  .as[Publication]
  .filter(p => p != null && p.getId != null)
  .map(d => DoiBoostMappingUtil.toActionSet(d))(
    Encoders.tuple(Encoders.STRING, Encoders.STRING)
  )
val asOrganization = spark.read
.load(dbOrganizationPath)
.as[Organization]
.map(d => DoiBoostMappingUtil.toActionSet(d))(
Encoders.tuple(Encoders.STRING, Encoders.STRING)
)
val asOrganization = spark.read.load(dbOrganizationPath).as[Organization]
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
val asCRelation = spark.read.load(crossRefRelation).as[Relation]
val asCRelation = spark.read
.load(crossRefRelation)
.as[Relation]
.filter(r => r != null && r.getSource != null && r.getTarget != null)
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.map(d => DoiBoostMappingUtil.toActionSet(d))(
Encoders.tuple(Encoders.STRING, Encoders.STRING)
)
val asRelAffiliation = spark.read
.load(dbaffiliationRelationPath)
.as[Relation]
.map(d => DoiBoostMappingUtil.toActionSet(d))(
Encoders.tuple(Encoders.STRING, Encoders.STRING)
)
val asRelAffiliation = spark.read.load(dbaffiliationRelationPath).as[Relation]
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
val d: Dataset[(String, String)] = asDataset.union(asPublication).union(asOrganization).union(asCRelation).union(asRelAffiliation)
d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])
val d: Dataset[(String, String)] = asDataset
.union(asPublication)
.union(asOrganization)
.union(asCRelation)
.union(asRelAffiliation)
d.rdd
.repartition(6000)
.map(s => (new Text(s._1), new Text(s._2)))
.saveAsHadoopFile(
s"$sequenceFilePath",
classOf[Text],
classOf[Text],
classOf[SequenceFileOutputFormat[Text, Text]],
classOf[GzipCodec]
)
}

View File

@ -15,8 +15,8 @@ import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
object SparkGenerateDoiBoost {
object SparkGenerateDoiBoost {
def extractIdGRID(input: String): List[(String, String)] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -35,19 +35,23 @@ object SparkGenerateDoiBoost {
grids.map(g => (id, s"unresolved::grid::${g.toLowerCase}"))(collection.breakOut)
}
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json")
)
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
import spark.implicits._
@ -65,8 +69,7 @@ object SparkGenerateDoiBoost {
a._2.setId(a._1)
return a._2
}
}
else {
} else {
if (a != null && a._2 != null) {
b.mergeFrom(a._2)
b.setId(a._1)
@ -82,8 +85,7 @@ object SparkGenerateDoiBoost {
if (b1 == null) {
if (b2 != null)
return b2
}
else {
} else {
if (b2 != null) {
b1.mergeFrom(b2)
val authors = AuthorMerger.mergeAuthor(b1.getAuthor, b2.getAuthor)
@ -103,17 +105,19 @@ object SparkGenerateDoiBoost {
override def outputEncoder: Encoder[Publication] = Encoders.kryo[Publication]
}
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] =
Encoders.tuple(Encoders.STRING, mapEncoderPub)
implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
logger.info("Phase 2) Join Crossref with UnpayWall")
val crossrefPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p => (p.getId, p))
val uwPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/uwPublication").as[Publication].map(p => (p.getId, p))
val crossrefPublication: Dataset[(String, Publication)] =
spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p => (p.getId, p))
val uwPublication: Dataset[(String, Publication)] =
spark.read.load(s"$workingDirPath/uwPublication").as[Publication].map(p => (p.getId, p))
def applyMerge(item: ((String, Publication), (String, Publication))): Publication = {
val crossrefPub = item._1._2
@ -127,53 +131,95 @@ object SparkGenerateDoiBoost {
crossrefPub
}
crossrefPublication.joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/firstJoin")
crossrefPublication
.joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")), "left")
.map(applyMerge)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingDirPath/firstJoin")
logger.info("Phase 3) Join Result with ORCID")
val fj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p))
val orcidPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p))
fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/secondJoin")
val fj: Dataset[(String, Publication)] =
spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p))
val orcidPublication: Dataset[(String, Publication)] =
spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p))
fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left")
.map(applyMerge)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingDirPath/secondJoin")
logger.info("Phase 4) Join Result with MAG")
val sj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p))
val sj: Dataset[(String, Publication)] =
spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p))
val magPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p))
sj.joinWith(magPublication, sj("_1").equalTo(magPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublication")
val magPublication: Dataset[(String, Publication)] =
spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p))
sj.joinWith(magPublication, sj("_1").equalTo(magPublication("_1")), "left")
.map(applyMerge)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingDirPath/doiBoostPublication")
val doiBoostPublication: Dataset[(String, Publication)] = spark.read
.load(s"$workingDirPath/doiBoostPublication")
.as[Publication]
.filter(p => DoiBoostMappingUtil.filterPublication(p))
.map(DoiBoostMappingUtil.toISSNPair)(tupleForJoinEncoder)
val doiBoostPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublication").as[Publication].filter(p => DoiBoostMappingUtil.filterPublication(p)).map(DoiBoostMappingUtil.toISSNPair)(tupleForJoinEncoder)
val hostedByDataset: Dataset[(String, HostedByItemType)] = spark.createDataset(
spark.sparkContext.textFile(hostedByMapPath).map(DoiBoostMappingUtil.toHostedByItem)
)
val hostedByDataset: Dataset[(String, HostedByItemType)] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath).map(DoiBoostMappingUtil.toHostedByItem))
doiBoostPublication.joinWith(hostedByDataset, doiBoostPublication("_1").equalTo(hostedByDataset("_1")), "left")
doiBoostPublication
.joinWith(hostedByDataset, doiBoostPublication("_1").equalTo(hostedByDataset("_1")), "left")
.map(DoiBoostMappingUtil.fixPublication)
.map(p => (p.getId, p))
.groupByKey(_._1)
.agg(crossrefAggregator.toColumn)
.map(p => p._2)
.write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationFiltered")
.write
.mode(SaveMode.Overwrite)
.save(s"$workingDirPath/doiBoostPublicationFiltered")
val affiliationPath = parser.get("affiliationPath")
val paperAffiliationPath = parser.get("paperAffiliationPath")
val affiliation = spark.read.load(affiliationPath).select(col("AffiliationId"), col("GridId"), col("OfficialPage"), col("DisplayName"))
val paperAffiliation = spark.read.load(paperAffiliationPath).select(col("AffiliationId").alias("affId"), col("PaperId"))
val affiliation = spark.read
.load(affiliationPath)
.select(col("AffiliationId"), col("GridId"), col("OfficialPage"), col("DisplayName"))
val paperAffiliation = spark.read
.load(paperAffiliationPath)
.select(col("AffiliationId").alias("affId"), col("PaperId"))
val a: Dataset[DoiBoostAffiliation] = paperAffiliation
.joinWith(affiliation, paperAffiliation("affId").equalTo(affiliation("AffiliationId")))
.select(col("_1.PaperId"), col("_2.AffiliationId"), col("_2.GridId"), col("_2.OfficialPage"), col("_2.DisplayName")).as[DoiBoostAffiliation]
.select(
col("_1.PaperId"),
col("_2.AffiliationId"),
col("_2.GridId"),
col("_2.OfficialPage"),
col("_2.DisplayName")
)
.as[DoiBoostAffiliation]
val magPubs: Dataset[(String, Publication)] = spark.read
.load(s"$workingDirPath/doiBoostPublicationFiltered")
.as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(
tupleForJoinEncoder
)
.filter(s => s._1 != null)
val magPubs: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublicationFiltered").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(tupleForJoinEncoder).filter(s => s._1 != null)
magPubs.joinWith(a, magPubs("_1").equalTo(a("PaperId"))).flatMap(item => {
magPubs
.joinWith(a, magPubs("_1").equalTo(a("PaperId")))
.flatMap(item => {
val pub: Publication = item._1._2
val affiliation = item._2
val affId: String = if (affiliation.GridId.isDefined) s"unresolved::grid::${affiliation.GridId.get.toLowerCase}" else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString)
val affId: String =
if (affiliation.GridId.isDefined)
s"unresolved::grid::${affiliation.GridId.get.toLowerCase}"
else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString)
val r: Relation = new Relation
r.setSource(pub.getId)
r.setTarget(affId)
@ -191,10 +237,15 @@ object SparkGenerateDoiBoost {
r1.setDataInfo(pub.getDataInfo)
r1.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
List(r, r1)
})(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved")
})(mapEncoderRel)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved")
val unresolvedRels: Dataset[(String, Relation)] = spark.read.load(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved").as[Relation].map(r => {
val unresolvedRels: Dataset[(String, Relation)] = spark.read
.load(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved")
.as[Relation]
.map(r => {
if (r.getSource.startsWith("unresolved"))
(r.getSource, r)
@ -204,9 +255,16 @@ object SparkGenerateDoiBoost {
("resolved", r)
})(Encoders.tuple(Encoders.STRING, mapEncoderRel))
val openaireOrganization: Dataset[(String, String)] = spark.read.text(openaireOrganizationPath).as[String].flatMap(s => extractIdGRID(s)).groupByKey(_._2).reduceGroups((x, y) => if (x != null) x else y).map(_._2)
val openaireOrganization: Dataset[(String, String)] = spark.read
.text(openaireOrganizationPath)
.as[String]
.flatMap(s => extractIdGRID(s))
.groupByKey(_._2)
.reduceGroups((x, y) => if (x != null) x else y)
.map(_._2)
unresolvedRels.joinWith(openaireOrganization, unresolvedRels("_1").equalTo(openaireOrganization("_2")))
unresolvedRels
.joinWith(openaireOrganization, unresolvedRels("_1").equalTo(openaireOrganization("_2")))
.map { x =>
val currentRels = x._1._2
val currentOrgs = x._2
@ -216,9 +274,15 @@ object SparkGenerateDoiBoost {
else
currentRels.setTarget(currentOrgs._1)
currentRels
}.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved")).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation")
}
.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved"))
.write
.mode(SaveMode.Overwrite)
.save(s"$workingDirPath/doiBoostPublicationAffiliation")
magPubs.joinWith(a, magPubs("_1").equalTo(a("PaperId"))).map(item => {
magPubs
.joinWith(a, magPubs("_1").equalTo(a("PaperId")))
.map(item => {
val affiliation = item._2
if (affiliation.GridId.isEmpty) {
val o = new Organization
@ -232,10 +296,13 @@ object SparkGenerateDoiBoost {
o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get))
o.setCountry(ModelConstants.UNKNOWN_COUNTRY)
o
}
else
} else
null
}).filter(o => o != null).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostOrganization")
})
.filter(o => o != null)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingDirPath/doiBoostOrganization")
}
}

View File

@ -22,11 +22,16 @@ case class CrossrefDT(doi: String, json:String, timestamp: Long) {}
/** Affiliation of an author; used as the type of the optional `affiliation` field of `mappingAuthor`.
  *
  * @param name the affiliation name
  */
case class mappingAffiliation(name: String) {}
/** Author entry extracted from the "author" field of the JSON handled by `Crossref2Oaf`.
  *
  * @param given       optional given name
  * @param family      family name
  * @param sequence    optional position marker; the value "first" (case-insensitive) is used
  *                    to order such authors before the others
  * @param ORCID       optional ORCID of the author
  * @param affiliation optional affiliation of the author
  */
case class mappingAuthor(
  given: Option[String],
  family: String,
  sequence: Option[String],
  ORCID: Option[String],
  affiliation: Option[mappingAffiliation]
) {}
/** Funder entry extracted from the "funder" field of the JSON handled by `Crossref2Oaf`.
  *
  * @param name  the funder name
  * @param DOI   optional DOI of the funder; it selects the rule applied in `mappingFunderToRelations`
  * @param award optional list of award strings from which project relations are generated
  */
case class mappingFunder(name: String, DOI: Option[String], award: Option[List[String]]) {}
case object Crossref2Oaf {
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
@ -56,7 +61,6 @@ case object Crossref2Oaf {
"dataset" -> "dataset"
)
val mappingCrossrefSubType = Map(
"book-section" -> "0013 Part of book or chapter of book",
"book" -> "0002 Book",
@ -100,7 +104,6 @@ case object Crossref2Oaf {
val originalIds = new util.ArrayList(tmp.filter(id => id != null).asJava)
result.setOriginalId(originalIds)
// Add DataInfo
result.setDataInfo(generateDataInfo())
@ -114,55 +117,105 @@ case object Crossref2Oaf {
if (publisher != null && publisher.nonEmpty)
result.setPublisher(asField(publisher))
// TITLE
val mainTitles = for {JString(title) <- json \ "title" if title.nonEmpty} yield createSP(title, "main title", ModelConstants.DNET_DATACITE_TITLE)
val originalTitles = for {JString(title) <- json \ "original-title" if title.nonEmpty} yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
val shortTitles = for {JString(title) <- json \ "short-title" if title.nonEmpty} yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
val subtitles = for {JString(title) <- json \ "subtitle" if title.nonEmpty} yield createSP(title, "subtitle", ModelConstants.DNET_DATACITE_TITLE)
val mainTitles =
for { JString(title) <- json \ "title" if title.nonEmpty } yield createSP(
title,
"main title",
ModelConstants.DNET_DATACITE_TITLE
)
val originalTitles = for {
JString(title) <- json \ "original-title" if title.nonEmpty
} yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
val shortTitles = for {
JString(title) <- json \ "short-title" if title.nonEmpty
} yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
val subtitles =
for { JString(title) <- json \ "subtitle" if title.nonEmpty } yield createSP(
title,
"subtitle",
ModelConstants.DNET_DATACITE_TITLE
)
result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava)
// DESCRIPTION
val descriptionList = for {JString(description) <- json \ "abstract"} yield asField(description)
val descriptionList =
for { JString(description) <- json \ "abstract" } yield asField(description)
result.setDescription(descriptionList.asJava)
// Source
val sourceList = for {JString(source) <- json \ "source" if source!= null && source.nonEmpty} yield asField(source)
val sourceList = for {
JString(source) <- json \ "source" if source != null && source.nonEmpty
} yield asField(source)
result.setSource(sourceList.asJava)
//RELEVANT DATE Mapping
val createdDate = generateDate((json \ "created" \ "date-time").extract[String], (json \ "created" \ "date-parts").extract[List[List[Int]]], "created", ModelConstants.DNET_DATACITE_DATE)
val postedDate = generateDate((json \ "posted" \ "date-time").extractOrElse[String](null), (json \ "posted" \ "date-parts").extract[List[List[Int]]], "available", ModelConstants.DNET_DATACITE_DATE)
val acceptedDate = generateDate((json \ "accepted" \ "date-time").extractOrElse[String](null), (json \ "accepted" \ "date-parts").extract[List[List[Int]]], "accepted", ModelConstants.DNET_DATACITE_DATE)
val publishedPrintDate = generateDate((json \ "published-print" \ "date-time").extractOrElse[String](null), (json \ "published-print" \ "date-parts").extract[List[List[Int]]], "published-print", ModelConstants.DNET_DATACITE_DATE)
val publishedOnlineDate = generateDate((json \ "published-online" \ "date-time").extractOrElse[String](null), (json \ "published-online" \ "date-parts").extract[List[List[Int]]], "published-online", ModelConstants.DNET_DATACITE_DATE)
val createdDate = generateDate(
(json \ "created" \ "date-time").extract[String],
(json \ "created" \ "date-parts").extract[List[List[Int]]],
"created",
ModelConstants.DNET_DATACITE_DATE
)
val postedDate = generateDate(
(json \ "posted" \ "date-time").extractOrElse[String](null),
(json \ "posted" \ "date-parts").extract[List[List[Int]]],
"available",
ModelConstants.DNET_DATACITE_DATE
)
val acceptedDate = generateDate(
(json \ "accepted" \ "date-time").extractOrElse[String](null),
(json \ "accepted" \ "date-parts").extract[List[List[Int]]],
"accepted",
ModelConstants.DNET_DATACITE_DATE
)
val publishedPrintDate = generateDate(
(json \ "published-print" \ "date-time").extractOrElse[String](null),
(json \ "published-print" \ "date-parts").extract[List[List[Int]]],
"published-print",
ModelConstants.DNET_DATACITE_DATE
)
val publishedOnlineDate = generateDate(
(json \ "published-online" \ "date-time").extractOrElse[String](null),
(json \ "published-online" \ "date-parts").extract[List[List[Int]]],
"published-online",
ModelConstants.DNET_DATACITE_DATE
)
val issuedDate = extractDate((json \ "issued" \ "date-time").extractOrElse[String](null), (json \ "issued" \ "date-parts").extract[List[List[Int]]])
val issuedDate = extractDate(
(json \ "issued" \ "date-time").extractOrElse[String](null),
(json \ "issued" \ "date-parts").extract[List[List[Int]]]
)
if (StringUtils.isNotBlank(issuedDate)) {
result.setDateofacceptance(asField(issuedDate))
}
else {
} else {
result.setDateofacceptance(asField(createdDate.getValue))
}
result.setRelevantdate(List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate).filter(p => p != null).asJava)
result.setRelevantdate(
List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate)
.filter(p => p != null)
.asJava
)
//Mapping Subject
val subjectList: List[String] = (json \ "subject").extractOrElse[List[String]](List())
if (subjectList.nonEmpty) {
result.setSubject(subjectList.map(s=> createSP(s, "keywords", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava)
result.setSubject(
subjectList.map(s => createSP(s, "keywords", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava
)
}
//Mapping Author
val authorList: List[mappingAuthor] = (json \ "author").extractOrElse[List[mappingAuthor]](List())
val authorList: List[mappingAuthor] =
(json \ "author").extractOrElse[List[mappingAuthor]](List())
// Moves the authors whose sequence is "first" (case-insensitive) to the front.
// sortBy is stable, so authors keep their original relative order within each group.
// A sortWith predicate that only looks at its first argument is not a strict ordering
// (it answers "less than" in both directions for two "first" authors) and is avoided here.
val sorted_list = authorList.sortBy(a => !a.sequence.exists(_.equalsIgnoreCase("first")))
result.setAuthor(sorted_list.zipWithIndex.map{case (a, index) => generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull, index)}.asJava)
result.setAuthor(sorted_list.zipWithIndex.map { case (a, index) =>
generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull, index)
}.asJava)
// Mapping instance
val instance = new Instance()
@ -179,9 +232,9 @@ case object Crossref2Oaf {
instance.setLicense(d._1)
}
}
} else {
instance.setLicense(l.head._1)
}
else{
instance.setLicense(l.head._1)}
}
// Ticket #6281 added pid to Instance
@ -191,18 +244,39 @@ case object Crossref2Oaf {
if (has_review != JNothing) {
instance.setRefereed(
OafMapperUtils.qualifier("0001", "peerReviewed", ModelConstants.DNET_REVIEW_LEVELS, ModelConstants.DNET_REVIEW_LEVELS))
OafMapperUtils.qualifier(
"0001",
"peerReviewed",
ModelConstants.DNET_REVIEW_LEVELS,
ModelConstants.DNET_REVIEW_LEVELS
)
)
}
instance.setAccessright(decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue))
instance.setInstancetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
result.setResourcetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
instance.setAccessright(
decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue)
)
instance.setInstancetype(
OafMapperUtils.qualifier(
cobjCategory.substring(0, 4),
cobjCategory.substring(5),
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
result.setResourcetype(
OafMapperUtils.qualifier(
cobjCategory.substring(0, 4),
cobjCategory.substring(5),
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
)
instance.setCollectedfrom(createCrossrefCollectedFrom())
if (StringUtils.isNotBlank(issuedDate)) {
instance.setDateofacceptance(asField(issuedDate))
}
else {
} else {
instance.setDateofacceptance(asField(createdDate.getValue))
}
val s: List[String] = List("https://doi.org/" + doi)
@ -210,8 +284,7 @@ case object Crossref2Oaf {
// if (links.nonEmpty) {
// instance.setUrl(links.asJava)
// }
if(s.nonEmpty)
{
if (s.nonEmpty) {
instance.setUrl(s.asJava)
}
@ -236,7 +309,6 @@ case object Crossref2Oaf {
result
}
def generateAuhtor(given: String, family: String, orcid: String, index: Int): Author = {
val a = new Author
a.setName(given)
@ -244,7 +316,16 @@ case object Crossref2Oaf {
a.setFullname(s"$given $family")
a.setRank(index + 1)
if (StringUtils.isNotBlank(orcid))
a.setPid(List(createSP(orcid, ModelConstants.ORCID_PENDING, ModelConstants.DNET_PID_TYPES, generateDataInfo())).asJava)
a.setPid(
List(
createSP(
orcid,
ModelConstants.ORCID_PENDING,
ModelConstants.DNET_PID_TYPES,
generateDataInfo()
)
).asJava
)
a
}
@ -255,29 +336,35 @@ case object Crossref2Oaf {
var resultList: List[Oaf] = List()
val objectType = (json \ "type").extractOrElse[String](null)
val objectSubType = (json \ "subtype").extractOrElse[String](null)
if (objectType == null)
return resultList
val result = generateItemFromType(objectType, objectSubType)
if (result == null)
return List()
val cOBJCategory = mappingCrossrefSubType.getOrElse(objectType, mappingCrossrefSubType.getOrElse(objectSubType, "0038 Other literature type"))
val cOBJCategory = mappingCrossrefSubType.getOrElse(
objectType,
mappingCrossrefSubType.getOrElse(objectSubType, "0038 Other literature type")
)
mappingResult(result, json, cOBJCategory)
if (result == null || result.getId == null)
return List()
val funderList: List[mappingFunder] = (json \ "funder").extractOrElse[List[mappingFunder]](List())
val funderList: List[mappingFunder] =
(json \ "funder").extractOrElse[List[mappingFunder]](List())
if (funderList.nonEmpty) {
resultList = resultList ::: mappingFunderToRelations(funderList, result.getId, createCrossrefCollectedFrom(), result.getDataInfo, result.getLastupdatetimestamp)
resultList = resultList ::: mappingFunderToRelations(
funderList,
result.getId,
createCrossrefCollectedFrom(),
result.getDataInfo,
result.getLastupdatetimestamp
)
}
result match {
case publication: Publication => convertPublication(publication, json, cOBJCategory)
case dataset: Dataset => convertDataset(dataset)
@ -287,22 +374,24 @@ case object Crossref2Oaf {
resultList
}
def mappingFunderToRelations(funders: List[mappingFunder], sourceId: String, cf: KeyValue, di: DataInfo, ts: Long): List[Relation] = {
def mappingFunderToRelations(
funders: List[mappingFunder],
sourceId: String,
cf: KeyValue,
di: DataInfo,
ts: Long
): List[Relation] = {
val queue = new mutable.Queue[Relation]
/** Award normaliser passed to generateSimpleRelationFromAward for the "snsf________" prefix:
  * keeps what follows the first "_" and precedes the next "/", and logs the rewrite at debug level.
  */
def snsfRule(award: String): String = {
  val grantCode =
    StringUtils.substringBefore(StringUtils.substringAfter(award, "_"), "/")
  logger.debug(s"From $award to $grantCode")
  grantCode
}
def extractECAward(award: String): String = {
val awardECRegex: Regex = "[0-9]{4,9}".r
if (awardECRegex.findAllIn(award).hasNext)
@ -310,7 +399,6 @@ case object Crossref2Oaf {
null
}
def generateRelation(sourceId: String, targetId: String, relClass: String): Relation = {
val r = new Relation
@ -324,89 +412,111 @@ case object Crossref2Oaf {
r.setLastupdatetimestamp(ts)
r
}
def generateSimpleRelationFromAward(funder: mappingFunder, nsPrefix: String, extractField: String => String): Unit = {
def generateSimpleRelationFromAward(
funder: mappingFunder,
nsPrefix: String,
extractField: String => String
): Unit = {
if (funder.award.isDefined && funder.award.get.nonEmpty)
funder.award.get.map(extractField).filter(a => a!= null && a.nonEmpty).foreach(
award => {
funder.award.get
.map(extractField)
.filter(a => a != null && a.nonEmpty)
.foreach(award => {
val targetId = getProjectId(nsPrefix, DHPUtils.md5(award))
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
}
)
})
}
/** Composes a project identifier of the form `40|<nsPrefix>::<targetId>`. */
def getProjectId(nsPrefix: String, targetId: String): String =
  s"40|$nsPrefix::$targetId"
if (funders != null)
funders.foreach(funder => {
if (funder.DOI.isDefined && funder.DOI.get.nonEmpty) {
funder.DOI.get match {
case "10.13039/100010663" |
"10.13039/100010661" |
"10.13039/501100007601" |
"10.13039/501100000780" |
"10.13039/100010665" => generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
case "10.13039/100011199" |
"10.13039/100004431" |
"10.13039/501100004963" |
"10.13039/501100000780" => generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
case "10.13039/501100000781" => generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
case "10.13039/100010663" | "10.13039/100010661" | "10.13039/501100007601" | "10.13039/501100000780" |
"10.13039/100010665" =>
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
case "10.13039/100000001" => generateSimpleRelationFromAward(funder, "nsf_________", a => a)
case "10.13039/501100001665" => generateSimpleRelationFromAward(funder, "anr_________", a => a)
case "10.13039/501100002341" => generateSimpleRelationFromAward(funder, "aka_________", a => a)
case "10.13039/501100001602" => generateSimpleRelationFromAward(funder, "aka_________", a => a.replace("SFI", ""))
case "10.13039/501100000923" => generateSimpleRelationFromAward(funder, "arc_________", a => a)
case "10.13039/501100000038"=> val targetId = getProjectId("nserc_______" , "1e5e62235d094afd01cd56e65112fc63")
case "10.13039/100011199" | "10.13039/100004431" | "10.13039/501100004963" | "10.13039/501100000780" =>
generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
case "10.13039/501100000781" =>
generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
case "10.13039/100000001" =>
generateSimpleRelationFromAward(funder, "nsf_________", a => a)
case "10.13039/501100001665" =>
generateSimpleRelationFromAward(funder, "anr_________", a => a)
case "10.13039/501100002341" =>
generateSimpleRelationFromAward(funder, "aka_________", a => a)
case "10.13039/501100001602" =>
generateSimpleRelationFromAward(funder, "aka_________", a => a.replace("SFI", ""))
case "10.13039/501100000923" =>
generateSimpleRelationFromAward(funder, "arc_________", a => a)
case "10.13039/501100000038" =>
val targetId = getProjectId("nserc_______", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
case "10.13039/501100000155"=> val targetId = getProjectId("sshrc_______" , "1e5e62235d094afd01cd56e65112fc63")
case "10.13039/501100000155" =>
val targetId = getProjectId("sshrc_______", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
case "10.13039/501100000024"=> val targetId = getProjectId("cihr________" , "1e5e62235d094afd01cd56e65112fc63")
case "10.13039/501100000024" =>
val targetId = getProjectId("cihr________", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
case "10.13039/501100002848" => generateSimpleRelationFromAward(funder, "conicytf____", a => a)
case "10.13039/501100003448" => generateSimpleRelationFromAward(funder, "gsrt________", extractECAward)
case "10.13039/501100010198" => generateSimpleRelationFromAward(funder, "sgov________", a=>a)
case "10.13039/501100004564" => generateSimpleRelationFromAward(funder, "mestd_______", extractECAward)
case "10.13039/501100003407" => generateSimpleRelationFromAward(funder, "miur________", a=>a)
case "10.13039/501100002848" =>
generateSimpleRelationFromAward(funder, "conicytf____", a => a)
case "10.13039/501100003448" =>
generateSimpleRelationFromAward(funder, "gsrt________", extractECAward)
case "10.13039/501100010198" =>
generateSimpleRelationFromAward(funder, "sgov________", a => a)
case "10.13039/501100004564" =>
generateSimpleRelationFromAward(funder, "mestd_______", extractECAward)
case "10.13039/501100003407" =>
generateSimpleRelationFromAward(funder, "miur________", a => a)
val targetId = getProjectId("miur________", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
case "10.13039/501100006588" |
"10.13039/501100004488" => generateSimpleRelationFromAward(funder, "irb_hr______", a=>a.replaceAll("Project No.", "").replaceAll("HRZZ-","") )
case "10.13039/501100006769"=> generateSimpleRelationFromAward(funder, "rsf_________", a=>a)
case "10.13039/501100001711"=> generateSimpleRelationFromAward(funder, "snsf________", snsfRule)
case "10.13039/501100004410"=> generateSimpleRelationFromAward(funder, "tubitakf____", a =>a)
case "10.10.13039/100004440"=> generateSimpleRelationFromAward(funder, "wt__________", a =>a)
case "10.13039/100004440"=> val targetId = getProjectId("wt__________" , "1e5e62235d094afd01cd56e65112fc63")
case "10.13039/501100006588" | "10.13039/501100004488" =>
generateSimpleRelationFromAward(
funder,
"irb_hr______",
a => a.replaceAll("Project No.", "").replaceAll("HRZZ-", "")
)
case "10.13039/501100006769" =>
generateSimpleRelationFromAward(funder, "rsf_________", a => a)
case "10.13039/501100001711" =>
generateSimpleRelationFromAward(funder, "snsf________", snsfRule)
case "10.13039/501100004410" =>
generateSimpleRelationFromAward(funder, "tubitakf____", a => a)
case "10.10.13039/100004440" =>
generateSimpleRelationFromAward(funder, "wt__________", a => a)
case "10.13039/100004440" =>
val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
case _ => logger.debug("no match for " + funder.DOI.get)
}
} else {
funder.name match {
case "European Unions Horizon 2020 research and innovation program" => generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
case "European Unions Horizon 2020 research and innovation program" =>
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
case "European Union's" =>
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
case "The French National Research Agency (ANR)" |
"The French National Research Agency" => generateSimpleRelationFromAward(funder, "anr_________", a => a)
case "CONICYT, Programa de Formación de Capital Humano Avanzado" => generateSimpleRelationFromAward(funder, "conicytf____", extractECAward)
case "Wellcome Trust Masters Fellowship" => val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
case "The French National Research Agency (ANR)" | "The French National Research Agency" =>
generateSimpleRelationFromAward(funder, "anr_________", a => a)
case "CONICYT, Programa de Formación de Capital Humano Avanzado" =>
generateSimpleRelationFromAward(funder, "conicytf____", extractECAward)
case "Wellcome Trust Masters Fellowship" =>
val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
case _ => logger.debug("no match for " + funder.name)
@ -414,8 +524,7 @@ case object Crossref2Oaf {
}
}
}
)
})
queue.toList
}
@ -423,12 +532,10 @@ case object Crossref2Oaf {
// TODO check if there are other info to map into the Dataset
}
def convertPublication(publication: Publication, json: JValue, cobjCategory: String): Unit = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
val containerTitles = for { JString(ct) <- json \ "container-title" } yield ct
//Mapping book
if (cobjCategory.toLowerCase.contains("book")) {
val ISBN = for { JString(isbn) <- json \ "ISBN" } yield isbn
@ -438,14 +545,14 @@ case object Crossref2Oaf {
val l: List[Field[String]] = publication.getSource.asScala.toList
val ll: List[Field[String]] = l ::: List(asField(source))
publication.setSource(ll.asJava)
}
else
} else
publication.setSource(List(asField(source)).asJava)
}
} else {
// Mapping Journal
val issnInfos = for {JArray(issn_types) <- json \ "issn-type"
val issnInfos = for {
JArray(issn_types) <- json \ "issn-type"
JObject(issn_type) <- issn_types
JField("type", JString(tp)) <- issn_type
JField("value", JString(vl)) <- issn_type
@ -494,7 +601,12 @@ case object Crossref2Oaf {
}
def generateDate(dt: String, datePart: List[List[Int]], classId: String, schemeId: String): StructuredProperty = {
def generateDate(
dt: String,
datePart: List[List[Int]],
classId: String,
schemeId: String
): StructuredProperty = {
val dp = extractDate(dt, datePart)
if (StringUtils.isNotBlank(dp))
return createSP(dp, classId, schemeId)

View File

@ -16,7 +16,6 @@ object CrossrefDataset {
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
def to_item(input: String): CrossrefDT = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -29,19 +28,24 @@ object CrossrefDataset {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
CrossrefDataset.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json"
)
)
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(SparkMapDumpIntoOAF.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
import spark.implicits._
val crossrefAggregator = new Aggregator[CrossrefDT, CrossrefDT, CrossrefDT] with Serializable {
override def zero: CrossrefDT = null
@ -52,7 +56,6 @@ object CrossrefDataset {
if (a == null)
return b
if (a.timestamp > b.timestamp) {
return a
}
@ -80,19 +83,24 @@ object CrossrefDataset {
val workingPath: String = parser.get("workingPath")
val main_ds: Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT]
val update =
spark.createDataset(spark.sparkContext.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text])
spark.createDataset(
spark.sparkContext
.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text])
.map(i => CrossrefImporter.decompressBlob(i._2.toString))
.map(i => to_item(i)))
.map(i => to_item(i))
)
main_ds.union(update).groupByKey(_.doi)
main_ds
.union(update)
.groupByKey(_.doi)
.agg(crossrefAggregator.toColumn)
.map(s => s._2)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/crossref_ds_updated")
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/crossref_ds_updated")
}

View File

@ -18,7 +18,6 @@ object GenerateCrossrefDataset {
implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT]
def crossrefElement(meta: String): CrossrefDT = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(meta)
@ -30,13 +29,23 @@ object GenerateCrossrefDataset {
def main(args: Array[String]): Unit = {
val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json")).mkString)
val parser = new ArgumentApplicationParser(
Source
.fromInputStream(
getClass.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json"
)
)
.mkString
)
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
val spark: SparkSession = SparkSession.builder().config(conf)
val spark: SparkSession = SparkSession
.builder()
.config(conf)
.appName(UnpackCrtossrefEntries.getClass.getSimpleName)
.master(master)
.getOrCreate()
@ -44,12 +53,14 @@ object GenerateCrossrefDataset {
import spark.implicits._
val tmp: RDD[String] = sc.textFile(sourcePath, 6000)
spark.createDataset(tmp)
spark
.createDataset(tmp)
.map(entry => crossrefElement(entry))
.write.mode(SaveMode.Overwrite).save(targetPath)
.write
.mode(SaveMode.Overwrite)
.save(targetPath)
// .map(meta => crossrefElement(meta))
// .toDS.as[CrossrefDT]
// .write.mode(SaveMode.Overwrite).save(targetPath)

View File

@ -8,7 +8,6 @@ import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
case class Reference(author: String, firstPage: String) {}
object SparkMapDumpIntoOAF {
@ -19,14 +18,21 @@ object SparkMapDumpIntoOAF {
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkMapDumpIntoOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_crossref_dump_to_oaf_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
SparkMapDumpIntoOAF.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/convert_crossref_dump_to_oaf_params.json"
)
)
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(SparkMapDumpIntoOAF.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
@ -35,19 +41,34 @@ object SparkMapDumpIntoOAF {
val targetPath = parser.get("targetPath")
spark.read.load(parser.get("sourcePath")).as[CrossrefDT]
spark.read
.load(parser.get("sourcePath"))
.as[CrossrefDT]
.flatMap(k => Crossref2Oaf.convert(k.json))
.filter(o => o != null)
.write.mode(SaveMode.Overwrite).save(s"$targetPath/mixObject")
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/mixObject")
val ds: Dataset[Oaf] = spark.read.load(s"$targetPath/mixObject").as[Oaf]
ds.filter(o => o.isInstanceOf[Publication]).map(o => o.asInstanceOf[Publication]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefPublication")
ds.filter(o => o.isInstanceOf[Publication])
.map(o => o.asInstanceOf[Publication])
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/crossrefPublication")
ds.filter(o => o.isInstanceOf[Relation]).map(o => o.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefRelation")
ds.filter(o => o.isInstanceOf[Relation])
.map(o => o.asInstanceOf[Relation])
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/crossrefRelation")
ds.filter(o => o.isInstanceOf[OafDataset]).map(o => o.asInstanceOf[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefDataset")
ds.filter(o => o.isInstanceOf[OafDataset])
.map(o => o.asInstanceOf[OafDataset])
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/crossrefDataset")
}
}

View File

@ -16,7 +16,6 @@ object UnpackCrtossrefEntries {
val log: Logger = LoggerFactory.getLogger(UnpackCrtossrefEntries.getClass)
def extractDump(input: String): List[String] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
@ -24,28 +23,36 @@ object UnpackCrtossrefEntries {
val a = (json \ "items").extract[JArray]
a.arr.map(s => compact(render(s)))
}
def main(args: Array[String]): Unit = {
val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json")).mkString)
val parser = new ArgumentApplicationParser(
Source
.fromInputStream(
getClass.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json"
)
)
.mkString
)
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
val spark: SparkSession = SparkSession.builder().config(conf)
val spark: SparkSession = SparkSession
.builder()
.config(conf)
.appName(UnpackCrtossrefEntries.getClass.getSimpleName)
.master(master)
.getOrCreate()
val sc: SparkContext = spark.sparkContext
sc.wholeTextFiles(sourcePath, 6000).flatMap(d => extractDump(d._2))
sc.wholeTextFiles(sourcePath, 6000)
.flatMap(d => extractDump(d._2))
.saveAsTextFile(targetPath, classOf[GzipCodec])
}
}

View File

@ -1,6 +1,5 @@
package eu.dnetlib.doiboost.mag
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty}
@ -14,45 +13,121 @@ import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.util.matching.Regex
case class MagPapers(PaperId: Long, Rank: Integer, Doi: String,
DocType: String, PaperTitle: String, OriginalTitle: String,
BookTitle: String, Year: Option[Integer], Date: Option[java.sql.Timestamp], Publisher: String,
JournalId: Option[Long], ConferenceSeriesId: Option[Long], ConferenceInstanceId: Option[Long],
Volume: String, Issue: String, FirstPage: String, LastPage: String,
ReferenceCount: Option[Long], CitationCount: Option[Long], EstimatedCitation: Option[Long],
OriginalVenue: String, FamilyId: Option[Long], CreatedDate: java.sql.Timestamp) {}
case class MagPapers(
PaperId: Long,
Rank: Integer,
Doi: String,
DocType: String,
PaperTitle: String,
OriginalTitle: String,
BookTitle: String,
Year: Option[Integer],
Date: Option[java.sql.Timestamp],
Publisher: String,
JournalId: Option[Long],
ConferenceSeriesId: Option[Long],
ConferenceInstanceId: Option[Long],
Volume: String,
Issue: String,
FirstPage: String,
LastPage: String,
ReferenceCount: Option[Long],
CitationCount: Option[Long],
EstimatedCitation: Option[Long],
OriginalVenue: String,
FamilyId: Option[Long],
CreatedDate: java.sql.Timestamp
) {}
case class MagPaperAbstract(PaperId: Long, IndexedAbstract: String) {}
case class MagAuthor(AuthorId: Long, Rank: Option[Int], NormalizedName: Option[String], DisplayName: Option[String], LastKnownAffiliationId: Option[Long], PaperCount: Option[Long], CitationCount: Option[Long], CreatedDate: Option[java.sql.Timestamp]) {}
case class MagAuthor(
AuthorId: Long,
Rank: Option[Int],
NormalizedName: Option[String],
DisplayName: Option[String],
LastKnownAffiliationId: Option[Long],
PaperCount: Option[Long],
CitationCount: Option[Long],
CreatedDate: Option[java.sql.Timestamp]
) {}
case class MagAffiliation(AffiliationId: Long, Rank: Int, NormalizedName: String, DisplayName: String, GridId: String, OfficialPage: String, WikiPage: String, PaperCount: Long, CitationCount: Long, Latitude: Option[Float], Longitude: Option[Float], CreatedDate: java.sql.Timestamp) {}
case class MagPaperAuthorAffiliation(PaperId: Long, AuthorId: Long, AffiliationId: Option[Long], AuthorSequenceNumber: Int, OriginalAuthor: String, OriginalAffiliation: String) {}
case class MagAffiliation(
AffiliationId: Long,
Rank: Int,
NormalizedName: String,
DisplayName: String,
GridId: String,
OfficialPage: String,
WikiPage: String,
PaperCount: Long,
CitationCount: Long,
Latitude: Option[Float],
Longitude: Option[Float],
CreatedDate: java.sql.Timestamp
) {}
case class MagPaperAuthorAffiliation(
PaperId: Long,
AuthorId: Long,
AffiliationId: Option[Long],
AuthorSequenceNumber: Int,
OriginalAuthor: String,
OriginalAffiliation: String
) {}
case class MagAuthorAffiliation(author: MagAuthor, affiliation: String, sequenceNumber: Int)
case class MagPaperWithAuthorList(PaperId: Long, authors: List[MagAuthorAffiliation]) {}
case class MagPaperAuthorDenormalized(PaperId: Long, author: MagAuthor, affiliation:String, sequenceNumber:Int) {}
case class MagPaperAuthorDenormalized(
PaperId: Long,
author: MagAuthor,
affiliation: String,
sequenceNumber: Int
) {}
case class MagPaperUrl(PaperId: Long, SourceType: Option[Int], SourceUrl: Option[String], LanguageCode: Option[String]) {}
case class MagPaperUrl(
PaperId: Long,
SourceType: Option[Int],
SourceUrl: Option[String],
LanguageCode: Option[String]
) {}
case class MagUrlInstance(SourceUrl: String) {}
case class MagUrl(PaperId: Long, instances: List[MagUrlInstance])
case class MagSubject(FieldOfStudyId:Long, DisplayName:String, MainType:Option[String], Score:Float){}
case class MagSubject(
FieldOfStudyId: Long,
DisplayName: String,
MainType: Option[String],
Score: Float
) {}
case class MagFieldOfStudy(PaperId: Long, subjects: List[MagSubject]) {}
case class MagJournal(JournalId: Long, Rank: Option[Int], NormalizedName: Option[String], DisplayName: Option[String], Issn: Option[String], Publisher: Option[String], Webpage: Option[String], PaperCount: Option[Long], CitationCount: Option[Long], CreatedDate: Option[java.sql.Timestamp]) {}
case class MagJournal(
JournalId: Long,
Rank: Option[Int],
NormalizedName: Option[String],
DisplayName: Option[String],
Issn: Option[String],
Publisher: Option[String],
Webpage: Option[String],
PaperCount: Option[Long],
CitationCount: Option[Long],
CreatedDate: Option[java.sql.Timestamp]
) {}
case class MagConferenceInstance(ci:Long, DisplayName:Option[String], Location:Option[String], StartDate:Option[java.sql.Timestamp], EndDate:Option[java.sql.Timestamp], PaperId:Long){}
case class MagConferenceInstance(
ci: Long,
DisplayName: Option[String],
Location: Option[String],
StartDate: Option[java.sql.Timestamp],
EndDate: Option[java.sql.Timestamp],
PaperId: Long
) {}
case object ConversionUtil {
@ -65,7 +140,6 @@ case object ConversionUtil {
null
}
def mergePublication(a: Publication, b: Publication): Publication = {
if ((a != null) && (b != null)) {
a.mergeFrom(b)
@ -74,7 +148,6 @@ case object ConversionUtil {
if (a == null) b else a
}
}
def choiceLatestMagArtitcle(p1: MagPapers, p2: MagPapers): MagPapers = {
@ -93,8 +166,9 @@ case object ConversionUtil {
}
def updatePubsWithDescription(inputItem:((String, Publication), MagPaperAbstract)) : Publication = {
def updatePubsWithDescription(
inputItem: ((String, Publication), MagPaperAbstract)
): Publication = {
val pub = inputItem._1._2
val abst = inputItem._2
if (abst != null) {
@ -104,8 +178,9 @@ case object ConversionUtil {
}
def updatePubsWithConferenceInfo(inputItem:((String, Publication), MagConferenceInstance)) : Publication = {
def updatePubsWithConferenceInfo(
inputItem: ((String, Publication), MagConferenceInstance)
): Publication = {
val publication: Publication = inputItem._1._2
val ci: MagConferenceInstance = inputItem._2
@ -115,9 +190,10 @@ case object ConversionUtil {
if (ci.Location.isDefined)
j.setConferenceplace(ci.Location.get)
j.setName(ci.DisplayName.get)
if (ci.StartDate.isDefined && ci.EndDate.isDefined)
{
j.setConferencedate(s"${ci.StartDate.get.toString.substring(0,10)} - ${ci.EndDate.get.toString.substring(0,10)}")
if (ci.StartDate.isDefined && ci.EndDate.isDefined) {
j.setConferencedate(
s"${ci.StartDate.get.toString.substring(0, 10)} - ${ci.EndDate.get.toString.substring(0, 10)}"
)
}
publication.setJournal(j)
@ -135,16 +211,34 @@ case object ConversionUtil {
val classid = "MAG"
val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap(s => {
val s1 = createSP(s.DisplayName, classid,className, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES)
val s1 = createSP(
s.DisplayName,
classid,
className,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES
)
val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString)
var resList: List[StructuredProperty] = List(s1)
if (s.MainType.isDefined) {
val maintp = s.MainType.get
val s2 = createSP(s.MainType.get, classid,className, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES)
val s2 = createSP(
s.MainType.get,
classid,
className,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES
)
s2.setDataInfo(di)
resList = resList ::: List(s2)
if (maintp.contains(".")) {
val s3 = createSP(maintp.split("\\.").head, classid,className, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES)
val s3 = createSP(
maintp.split("\\.").head,
classid,
className,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES
)
s3.setDataInfo(di)
resList = resList ::: List(s3)
}
@ -156,25 +250,27 @@ case object ConversionUtil {
publication
}
def addInstances(a: (Publication, MagUrl)): Publication = {
val pub = a._1
val urls = a._2
val i = new Instance
if (urls != null) {
val l:List[String] = urls.instances.filter(k=>k.SourceUrl.nonEmpty).map(k=>k.SourceUrl):::List(s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}")
val l: List[String] = urls.instances
.filter(k => k.SourceUrl.nonEmpty)
.map(k => k.SourceUrl) ::: List(
s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}"
)
i.setUrl(l.asJava)
}
else
i.setUrl(List(s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}").asJava)
} else
i.setUrl(
List(
s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}"
).asJava
)
// Ticket #6281 added pid to Instance
i.setPid(pub.getPid)
@ -184,13 +280,13 @@ case object ConversionUtil {
pub
}
/** Returns a MagPaperAbstract with the same PaperId whose IndexedAbstract has been
  * run through convertInvertedIndexString.
  */
def transformPaperAbstract(input: MagPaperAbstract): MagPaperAbstract =
  input.copy(IndexedAbstract = convertInvertedIndexString(input.IndexedAbstract))
def createOAFFromJournalAuthorPaper(inputParams: ((MagPapers, MagJournal), MagPaperWithAuthorList)): Publication = {
def createOAFFromJournalAuthorPaper(
inputParams: ((MagPapers, MagJournal), MagPaperWithAuthorList)
): Publication = {
val paper = inputParams._1._1
val journal = inputParams._1._2
val authors = inputParams._2
@ -206,31 +302,37 @@ case object ConversionUtil {
pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub))
val mainTitles = createSP(paper.PaperTitle, "main title", ModelConstants.DNET_DATACITE_TITLE)
val originalTitles = createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
val originalTitles =
createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
pub.setTitle(List(mainTitles, originalTitles).asJava)
pub.setSource(List(asField(paper.BookTitle)).asJava)
val authorsOAF = authors.authors.map { f: MagAuthorAffiliation =>
val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author
a.setRank(f.sequenceNumber)
if (f.author.DisplayName.isDefined)
a.setFullname(f.author.DisplayName.get)
if (f.affiliation != null)
a.setAffiliation(List(asField(f.affiliation)).asJava)
a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", ModelConstants.DNET_PID_TYPES)).asJava)
a.setPid(
List(
createSP(
s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}",
"URL",
ModelConstants.DNET_PID_TYPES
)
).asJava
)
a
}
pub.setAuthor(authorsOAF.asJava)
if (paper.Date != null && paper.Date.isDefined) {
pub.setDateofacceptance(asField(paper.Date.get.toString.substring(0, 10)))
}
pub.setPublisher(asField(paper.Publisher))
if (journal != null && journal.DisplayName.isDefined) {
val j = new Journal
@ -250,8 +352,9 @@ case object ConversionUtil {
pub
}
def createOAF(inputParams: ((MagPapers, MagPaperWithAuthorList), MagPaperAbstract)): Publication = {
def createOAF(
inputParams: ((MagPapers, MagPaperWithAuthorList), MagPaperAbstract)
): Publication = {
val paper = inputParams._1._1
val authors = inputParams._1._2
@ -268,19 +371,17 @@ case object ConversionUtil {
pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub))
val mainTitles = createSP(paper.PaperTitle, "main title", ModelConstants.DNET_DATACITE_TITLE)
val originalTitles = createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
val originalTitles =
createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE)
pub.setTitle(List(mainTitles, originalTitles).asJava)
pub.setSource(List(asField(paper.BookTitle)).asJava)
if (description != null) {
pub.setDescription(List(asField(description.IndexedAbstract)).asJava)
}
val authorsOAF = authors.authors.map { f: MagAuthorAffiliation =>
val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author
a.setFullname(f.author.DisplayName.get)
@ -288,26 +389,30 @@ case object ConversionUtil {
if (f.affiliation != null)
a.setAffiliation(List(asField(f.affiliation)).asJava)
a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", ModelConstants.DNET_PID_TYPES)).asJava)
a.setPid(
List(
createSP(
s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}",
"URL",
ModelConstants.DNET_PID_TYPES
)
).asJava
)
a
}
// paper.Date is an Option[java.sql.Timestamp]: unwrap it before formatting.
// Calling toString on the Option itself yields "Some(...)" (so the first 10 chars are
// "Some(2020-" rather than a date) and "None".substring(0, 10) throws.
// Same guard as in createOAFFromJournalAuthorPaper.
if (paper.Date != null && paper.Date.isDefined) {
  pub.setDateofacceptance(asField(paper.Date.get.toString.substring(0, 10)))
}
pub.setAuthor(authorsOAF.asJava)
pub
}
def convertInvertedIndexString(json_input: String): String = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(json_input)

View File

@ -8,6 +8,7 @@ import org.apache.spark.sql.{SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
object SparkImportMagIntoDataset {
val datatypedict = Map(
"bool" -> BooleanType,
"int" -> IntegerType,
@ -19,32 +20,232 @@ object SparkImportMagIntoDataset {
"DateTime" -> DateType
)
val stream = Map(
"Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Iso3166Code:string", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
"AuthorExtendedAttributes" -> Tuple2("mag/AuthorExtendedAttributes.txt", Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")),
"Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
"ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")),
"FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")),
"FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")),
"FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"PaperAbstractsInvertedIndex" -> Tuple2("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", "IndexedAbstract:string")),
"PaperAuthorAffiliations" -> Tuple2("mag/PaperAuthorAffiliations.txt", Seq("PaperId:long", "AuthorId:long", "AffiliationId:long?", "AuthorSequenceNumber:uint", "OriginalAuthor:string", "OriginalAffiliation:string")),
"PaperCitationContexts" -> Tuple2("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")),
"PaperExtendedAttributes" -> Tuple2("mag/PaperExtendedAttributes.txt", Seq("PaperId:long", "AttributeType:int", "AttributeValue:string")),
"PaperFieldsOfStudy" -> Tuple2("advanced/PaperFieldsOfStudy.txt", Seq("PaperId:long", "FieldOfStudyId:long", "Score:float")),
"PaperMeSH" -> Tuple2("advanced/PaperMeSH.txt", Seq("PaperId:long", "DescriptorUI:string", "DescriptorName:string", "QualifierUI:string", "QualifierName:string", "IsMajorTopic:bool")),
"PaperRecommendations" -> Tuple2("advanced/PaperRecommendations.txt", Seq("PaperId:long", "RecommendedPaperId:long", "Score:float")),
"PaperReferences" -> Tuple2("mag/PaperReferences.txt", Seq("PaperId:long", "PaperReferenceId:long")),
"PaperResources" -> Tuple2("mag/PaperResources.txt", Seq("PaperId:long", "ResourceType:int", "ResourceUrl:string", "SourceUrl:string", "RelationshipType:int")),
"PaperUrls" -> Tuple2("mag/PaperUrls.txt", Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")),
"Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "OnlineDate:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "FamilyRank:uint?", "DocSubTypes:string", "CreatedDate:DateTime")),
"RelatedFieldOfStudy" -> Tuple2("advanced/RelatedFieldOfStudy.txt", Seq("FieldOfStudyId1:long", "Type1:string", "FieldOfStudyId2:long", "Type2:string", "Rank:float"))
"Affiliations" -> Tuple2(
"mag/Affiliations.txt",
Seq(
"AffiliationId:long",
"Rank:uint",
"NormalizedName:string",
"DisplayName:string",
"GridId:string",
"OfficialPage:string",
"WikiPage:string",
"PaperCount:long",
"PaperFamilyCount:long",
"CitationCount:long",
"Iso3166Code:string",
"Latitude:float?",
"Longitude:float?",
"CreatedDate:DateTime"
)
),
"AuthorExtendedAttributes" -> Tuple2(
"mag/AuthorExtendedAttributes.txt",
Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")
),
"Authors" -> Tuple2(
"mag/Authors.txt",
Seq(
"AuthorId:long",
"Rank:uint",
"NormalizedName:string",
"DisplayName:string",
"LastKnownAffiliationId:long?",
"PaperCount:long",
"PaperFamilyCount:long",
"CitationCount:long",
"CreatedDate:DateTime"
)
),
"ConferenceInstances" -> Tuple2(
"mag/ConferenceInstances.txt",
Seq(
"ConferenceInstanceId:long",
"NormalizedName:string",
"DisplayName:string",
"ConferenceSeriesId:long",
"Location:string",
"OfficialUrl:string",
"StartDate:DateTime?",
"EndDate:DateTime?",
"AbstractRegistrationDate:DateTime?",
"SubmissionDeadlineDate:DateTime?",
"NotificationDueDate:DateTime?",
"FinalVersionDueDate:DateTime?",
"PaperCount:long",
"PaperFamilyCount:long",
"CitationCount:long",
"Latitude:float?",
"Longitude:float?",
"CreatedDate:DateTime"
)
),
"ConferenceSeries" -> Tuple2(
"mag/ConferenceSeries.txt",
Seq(
"ConferenceSeriesId:long",
"Rank:uint",
"NormalizedName:string",
"DisplayName:string",
"PaperCount:long",
"PaperFamilyCount:long",
"CitationCount:long",
"CreatedDate:DateTime"
)
),
"EntityRelatedEntities" -> Tuple2(
"advanced/EntityRelatedEntities.txt",
Seq(
"EntityId:long",
"EntityType:string",
"RelatedEntityId:long",
"RelatedEntityType:string",
"RelatedType:int",
"Score:float"
)
),
"FieldOfStudyChildren" -> Tuple2(
"advanced/FieldOfStudyChildren.txt",
Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")
),
"FieldOfStudyExtendedAttributes" -> Tuple2(
"advanced/FieldOfStudyExtendedAttributes.txt",
Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")
),
"FieldsOfStudy" -> Tuple2(
"advanced/FieldsOfStudy.txt",
Seq(
"FieldOfStudyId:long",
"Rank:uint",
"NormalizedName:string",
"DisplayName:string",
"MainType:string",
"Level:int",
"PaperCount:long",
"PaperFamilyCount:long",
"CitationCount:long",
"CreatedDate:DateTime"
)
),
"Journals" -> Tuple2(
"mag/Journals.txt",
Seq(
"JournalId:long",
"Rank:uint",
"NormalizedName:string",
"DisplayName:string",
"Issn:string",
"Publisher:string",
"Webpage:string",
"PaperCount:long",
"PaperFamilyCount:long",
"CitationCount:long",
"CreatedDate:DateTime"
)
),
"PaperAbstractsInvertedIndex" -> Tuple2(
"nlp/PaperAbstractsInvertedIndex.txt.*",
Seq("PaperId:long", "IndexedAbstract:string")
),
"PaperAuthorAffiliations" -> Tuple2(
"mag/PaperAuthorAffiliations.txt",
Seq(
"PaperId:long",
"AuthorId:long",
"AffiliationId:long?",
"AuthorSequenceNumber:uint",
"OriginalAuthor:string",
"OriginalAffiliation:string"
)
),
"PaperCitationContexts" -> Tuple2(
"nlp/PaperCitationContexts.txt",
Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")
),
"PaperExtendedAttributes" -> Tuple2(
"mag/PaperExtendedAttributes.txt",
Seq("PaperId:long", "AttributeType:int", "AttributeValue:string")
),
"PaperFieldsOfStudy" -> Tuple2(
"advanced/PaperFieldsOfStudy.txt",
Seq("PaperId:long", "FieldOfStudyId:long", "Score:float")
),
"PaperMeSH" -> Tuple2(
"advanced/PaperMeSH.txt",
Seq(
"PaperId:long",
"DescriptorUI:string",
"DescriptorName:string",
"QualifierUI:string",
"QualifierName:string",
"IsMajorTopic:bool"
)
),
"PaperRecommendations" -> Tuple2(
"advanced/PaperRecommendations.txt",
Seq("PaperId:long", "RecommendedPaperId:long", "Score:float")
),
"PaperReferences" -> Tuple2(
"mag/PaperReferences.txt",
Seq("PaperId:long", "PaperReferenceId:long")
),
"PaperResources" -> Tuple2(
"mag/PaperResources.txt",
Seq(
"PaperId:long",
"ResourceType:int",
"ResourceUrl:string",
"SourceUrl:string",
"RelationshipType:int"
)
),
"PaperUrls" -> Tuple2(
"mag/PaperUrls.txt",
Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")
),
"Papers" -> Tuple2(
"mag/Papers.txt",
Seq(
"PaperId:long",
"Rank:uint",
"Doi:string",
"DocType:string",
"PaperTitle:string",
"OriginalTitle:string",
"BookTitle:string",
"Year:int?",
"Date:DateTime?",
"OnlineDate:DateTime?",
"Publisher:string",
"JournalId:long?",
"ConferenceSeriesId:long?",
"ConferenceInstanceId:long?",
"Volume:string",
"Issue:string",
"FirstPage:string",
"LastPage:string",
"ReferenceCount:long",
"CitationCount:long",
"EstimatedCitation:long",
"OriginalVenue:string",
"FamilyId:long?",
"FamilyRank:uint?",
"DocSubTypes:string",
"CreatedDate:DateTime"
)
),
"RelatedFieldOfStudy" -> Tuple2(
"advanced/RelatedFieldOfStudy.txt",
Seq(
"FieldOfStudyId1:long",
"Type1:string",
"FieldOfStudyId2:long",
"Type2:string",
"Rank:float"
)
)
)
def getSchema(streamName: String): StructType = {
var schema = new StructType()
@ -61,19 +262,22 @@ object SparkImportMagIntoDataset {
schema
}
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/convert_mag_to_oaf_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/convert_mag_to_oaf_params.json")
)
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
stream.foreach { case (k, v) =>
val s: StructType = getSchema(k)

View File

@ -9,6 +9,7 @@ import org.apache.spark.sql.functions.{col, collect_list, struct}
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
object SparkProcessMAG {
def getDistinctResults(d: Dataset[MagPapers]): Dataset[MagPapers] = {
@ -17,13 +18,31 @@ object SparkProcessMAG {
.reduceGroups((p1: MagPapers, p2: MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1, p2))
.map(_._2)(Encoders.product[MagPapers])
.map(mp => {
MagPapers(mp.PaperId, mp.Rank, DoiBoostMappingUtil.normalizeDoi(mp.Doi),
mp.DocType, mp.PaperTitle, mp.OriginalTitle,
mp.BookTitle, mp.Year, mp.Date, mp.Publisher: String,
mp.JournalId, mp.ConferenceSeriesId, mp.ConferenceInstanceId,
mp.Volume, mp.Issue, mp.FirstPage, mp.LastPage,
mp.ReferenceCount, mp.CitationCount, mp.EstimatedCitation,
mp.OriginalVenue, mp.FamilyId, mp.CreatedDate)
MagPapers(
mp.PaperId,
mp.Rank,
DoiBoostMappingUtil.normalizeDoi(mp.Doi),
mp.DocType,
mp.PaperTitle,
mp.OriginalTitle,
mp.BookTitle,
mp.Year,
mp.Date,
mp.Publisher: String,
mp.JournalId,
mp.ConferenceSeriesId,
mp.ConferenceInstanceId,
mp.Volume,
mp.Issue,
mp.FirstPage,
mp.LastPage,
mp.ReferenceCount,
mp.CitationCount,
mp.EstimatedCitation,
mp.OriginalVenue,
mp.FamilyId,
mp.CreatedDate
)
})(Encoders.product[MagPapers])
}
@ -31,22 +50,29 @@ object SparkProcessMAG {
val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json")
)
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
val sourcePath = parser.get("sourcePath")
val workingPath = parser.get("workingPath")
val targetPath = parser.get("targetPath")
import spark.implicits._
implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
implicit val mapEncoderPubs: Encoder[Publication] =
org.apache.spark.sql.Encoders.kryo[Publication]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] =
Encoders.tuple(Encoders.STRING, mapEncoderPubs)
logger.info("Phase 1) make uninue DOI in Papers:")
val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers]
@ -58,16 +84,23 @@ object SparkProcessMAG {
logger.info("Phase 0) Enrich Publication with description")
val pa = spark.read.load(s"$sourcePath/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"$workingPath/PaperAbstract")
pa.map(ConversionUtil.transformPaperAbstract)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/PaperAbstract")
logger.info("Phase 3) Group Author by PaperId")
val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]
val affiliation = spark.read.load(s"$sourcePath/Affiliations").as[MagAffiliation]
val paperAuthorAffiliation = spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation]
val paperAuthorAffiliation =
spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation]
paperAuthorAffiliation.joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId")))
.map { case (a: MagPaperAuthorAffiliation, b: MagAuthor) => (a.AffiliationId, MagPaperAuthorDenormalized(a.PaperId, b, null, a.AuthorSequenceNumber)) }
paperAuthorAffiliation
.joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId")))
.map { case (a: MagPaperAuthorAffiliation, b: MagAuthor) =>
(a.AffiliationId, MagPaperAuthorDenormalized(a.PaperId, b, null, a.AuthorSequenceNumber))
}
.joinWith(affiliation, affiliation("AffiliationId").equalTo(col("_1")), "left")
.map(s => {
val mpa = s._1._2
@ -76,79 +109,133 @@ object SparkProcessMAG {
MagPaperAuthorDenormalized(mpa.PaperId, mpa.author, af.DisplayName, mpa.sequenceNumber)
} else
mpa
}).groupBy("PaperId").agg(collect_list(struct($"author", $"affiliation", $"sequenceNumber")).as("authors"))
.write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_1_paper_authors")
})
.groupBy("PaperId")
.agg(collect_list(struct($"author", $"affiliation", $"sequenceNumber")).as("authors"))
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/merge_step_1_paper_authors")
logger.info("Phase 4) create First Version of publication Entity with Paper Journal and Authors")
logger.info(
"Phase 4) create First Version of publication Entity with Paper Journal and Authors"
)
val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal]
val papers = spark.read.load((s"$workingPath/Papers_distinct")).as[MagPapers]
val papers = spark.read.load(s"$workingPath/Papers_distinct").as[MagPapers]
val paperWithAuthors = spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
val paperWithAuthors =
spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
val firstJoin = papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
firstJoin.joinWith(paperWithAuthors, firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")), "left")
val firstJoin =
papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
firstJoin
.joinWith(
paperWithAuthors,
firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")),
"left"
)
.map { a => ConversionUtil.createOAFFromJournalAuthorPaper(a) }
.write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_2")
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/merge_step_2")
var magPubs: Dataset[(String, Publication)] =
spark.read.load(s"$workingPath/merge_step_2").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
spark.read
.load(s"$workingPath/merge_step_2")
.as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
.as[(String, Publication)]
val conference = spark.read
.load(s"$sourcePath/ConferenceInstances")
.select(
$"ConferenceInstanceId".as("ci"),
$"DisplayName",
$"Location",
$"StartDate",
$"EndDate"
)
val conferenceInstance = conference
.joinWith(papers, papers("ConferenceInstanceId").equalTo(conference("ci")))
.select(
$"_1.ci",
$"_1.DisplayName",
$"_1.Location",
$"_1.StartDate",
$"_1.EndDate",
$"_2.PaperId"
)
.as[MagConferenceInstance]
val conference = spark.read.load(s"$sourcePath/ConferenceInstances")
.select($"ConferenceInstanceId".as("ci"), $"DisplayName", $"Location", $"StartDate", $"EndDate")
val conferenceInstance = conference.joinWith(papers, papers("ConferenceInstanceId").equalTo(conference("ci")))
.select($"_1.ci", $"_1.DisplayName", $"_1.Location", $"_1.StartDate", $"_1.EndDate", $"_2.PaperId").as[MagConferenceInstance]
magPubs.joinWith(conferenceInstance, col("_1").equalTo(conferenceInstance("PaperId")), "left")
magPubs
.joinWith(conferenceInstance, col("_1").equalTo(conferenceInstance("PaperId")), "left")
.map(item => ConversionUtil.updatePubsWithConferenceInfo(item))
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/merge_step_3")
val paperAbstract = spark.read.load(s"$workingPath/PaperAbstract").as[MagPaperAbstract]
val paperAbstract = spark.read.load((s"$workingPath/PaperAbstract")).as[MagPaperAbstract]
magPubs = spark.read.load(s"$workingPath/merge_step_3").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
magPubs.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
.map(item => ConversionUtil.updatePubsWithDescription(item)
).write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_4")
magPubs = spark.read
.load(s"$workingPath/merge_step_3")
.as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
.as[(String, Publication)]
magPubs
.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
.map(item => ConversionUtil.updatePubsWithDescription(item))
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/merge_step_4")
logger.info("Phase 7) Enrich Publication with FieldOfStudy")
magPubs = spark.read.load(s"$workingPath/merge_step_4").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
magPubs = spark.read
.load(s"$workingPath/merge_step_4")
.as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
.as[(String, Publication)]
val fos = spark.read.load(s"$sourcePath/FieldsOfStudy").select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")
val fos = spark.read
.load(s"$sourcePath/FieldsOfStudy")
.select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")
val pfos = spark.read.load(s"$sourcePath/PaperFieldsOfStudy")
val paperField = pfos.joinWith(fos, fos("fos").equalTo(pfos("FieldOfStudyId")))
val paperField = pfos
.joinWith(fos, fos("fos").equalTo(pfos("FieldOfStudyId")))
.select($"_1.FieldOfStudyId", $"_2.DisplayName", $"_2.MainType", $"_1.PaperId", $"_1.Score")
.groupBy($"PaperId").agg(collect_list(struct($"FieldOfStudyId", $"DisplayName", $"MainType", $"Score")).as("subjects"))
.groupBy($"PaperId")
.agg(
collect_list(struct($"FieldOfStudyId", $"DisplayName", $"MainType", $"Score"))
.as("subjects")
)
.as[MagFieldOfStudy]
magPubs.joinWith(paperField, col("_1")
.equalTo(paperField("PaperId")), "left")
magPubs
.joinWith(
paperField,
col("_1")
.equalTo(paperField("PaperId")),
"left"
)
.map(item => ConversionUtil.updatePubsWithSubject(item))
.write.mode(SaveMode.Overwrite)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/mag_publication")
spark.read.load(s"$workingPath/mag_publication").as[Publication]
spark.read
.load(s"$workingPath/mag_publication")
.as[Publication]
.filter(p => p.getId != null)
.groupByKey(p => p.getId)
.reduceGroups((a: Publication, b: Publication) => ConversionUtil.mergePublication(a, b))
.map(_._2)
.write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/magPublication")
}
}

View File

@ -15,15 +15,20 @@ import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
/** A DOI together with the ORCID authors attached to it.
  *
  * @param doi     the DOI the authors are grouped under
  * @param authors the authors collected for that DOI
  */
case class ORCIDItem(
  doi: String,
  authors: List[OrcidAuthor]
)
case class OrcidAuthor(oid:String, name:Option[String], surname:Option[String], creditName:Option[String], otherNames:Option[List[String]], errorCode:Option[String]){}
/** Author record parsed from the ORCID dump.
  *
  * Every field except `oid` is optional.
  *
  * @param oid        author identifier; when not blank it is emitted as the author's ORCID pid
  * @param name       optional name
  * @param surname    optional surname
  * @param creditName optional credit name; used as a fallback for the author's full name
  * @param otherNames optional list of alternative names
  * @param errorCode  optional error code (NOTE(review): its origin is not visible here — confirm)
  */
case class OrcidAuthor(
  oid: String,
  name: Option[String],
  surname: Option[String],
  creditName: Option[String],
  otherNames: Option[List[String]],
  errorCode: Option[String]
)
/** Pairs an author identifier with the DOI of a work.
  *
  * @param oid identifier of the author; works are joined to authors on this value
  * @param doi DOI of the work; joined records are grouped by this value
  */
case class OrcidWork(
  oid: String,
  doi: String
)
/** A DOI together with a list of [[ORCIDItem]].
  *
  * NOTE(review): no usage of this class is visible in this part of the file — confirm it is still needed.
  *
  * @param doi     the DOI
  * @param authors the items associated with the DOI
  */
case class ORCIDElement(
  doi: String,
  authors: List[ORCIDItem]
)
object ORCIDToOAF {
val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
val mapper = new ObjectMapper()
@ -51,7 +56,6 @@ object ORCIDToOAF {
} else null
}
/** Tells whether an optional string carries a non-empty value.
  *
  * @param s the optional string to check
  * @return `true` only when `s` is defined and its content is not the empty string
  */
def strValid(s: Option[String]): Boolean =
  // `exists` covers both the "defined" and the "non-empty" check without calling `.get`
  s.exists(_.nonEmpty)
@ -70,7 +74,6 @@ object ORCIDToOAF {
false
}
def extractDOIWorks(input: String): List[OrcidWork] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
@ -97,7 +100,6 @@ object ORCIDToOAF {
(json \ "authorData").extractOrElse[OrcidAuthor](null)
}
def convertTOOAF(input: ORCIDItem): Publication = {
val doi = input.doi
val pub: Publication = new Publication
@ -145,10 +147,18 @@ object ORCIDToOAF {
else if (strValid(o.creditName))
a.setFullname(o.creditName.get)
if (StringUtils.isNotBlank(o.oid))
a.setPid(List(createSP(o.oid, ModelConstants.ORCID, ModelConstants.DNET_PID_TYPES, generateOricPIDDatainfo())).asJava)
a.setPid(
List(
createSP(
o.oid,
ModelConstants.ORCID,
ModelConstants.DNET_PID_TYPES,
generateOricPIDDatainfo()
)
).asJava
)
a
}
}

View File

@ -10,11 +10,11 @@ import org.slf4j.{Logger, LoggerFactory}
object SparkConvertORCIDToOAF {
val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
def run(spark: SparkSession, workingPath: String, targetPath: String): Unit = {
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
import spark.implicits._
val dataset: Dataset[ORCIDItem] = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem]
val dataset: Dataset[ORCIDItem] =
spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem]
logger.info("Converting ORCID to OAF")
dataset.map(o => ORCIDToOAF.convertTOOAF(o)).write.mode(SaveMode.Overwrite).save(targetPath)
@ -22,15 +22,21 @@ object SparkConvertORCIDToOAF {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_orcid_to_oaf_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
SparkConvertORCIDToOAF.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/convert_orcid_to_oaf_params.json"
)
)
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
val workingPath = parser.get("workingPath")
val targetPath = parser.get("targetPath")

View File

@ -17,45 +17,72 @@ object SparkPreprocessORCID {
}
def run(spark: SparkSession, sourcePath: String, workingPath: String): Unit = {
import spark.implicits._
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
val inputRDD: RDD[OrcidAuthor] = spark.sparkContext.textFile(s"$sourcePath/authors").map(s => ORCIDToOAF.convertORCIDAuthor(s)).filter(s => s != null).filter(s => ORCIDToOAF.authorValid(s))
val inputRDD: RDD[OrcidAuthor] = spark.sparkContext
.textFile(s"$sourcePath/authors")
.map(s => ORCIDToOAF.convertORCIDAuthor(s))
.filter(s => s != null)
.filter(s => ORCIDToOAF.authorValid(s))
spark.createDataset(inputRDD).as[OrcidAuthor].write.mode(SaveMode.Overwrite).save(s"$workingPath/author")
spark
.createDataset(inputRDD)
.as[OrcidAuthor]
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/author")
val res = spark.sparkContext.textFile(s"$sourcePath/works").flatMap(s => ORCIDToOAF.extractDOIWorks(s)).filter(s => s != null)
val res = spark.sparkContext
.textFile(s"$sourcePath/works")
.flatMap(s => ORCIDToOAF.extractDOIWorks(s))
.filter(s => s != null)
spark.createDataset(res).as[OrcidWork].write.mode(SaveMode.Overwrite).save(s"$workingPath/works")
spark
.createDataset(res)
.as[OrcidWork]
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/works")
val authors: Dataset[OrcidAuthor] = spark.read.load(s"$workingPath/author").as[OrcidAuthor]
val works: Dataset[OrcidWork] = spark.read.load(s"$workingPath/works").as[OrcidWork]
works.joinWith(authors, authors("oid").equalTo(works("oid")))
works
.joinWith(authors, authors("oid").equalTo(works("oid")))
.map(i => {
val doi = i._1.doi
val author = i._2
(doi, author)
}).groupBy(col("_1").alias("doi"))
.agg(collect_list(col("_2")).alias("authors")).as[ORCIDItem]
})
.groupBy(col("_1").alias("doi"))
.agg(collect_list(col("_2")).alias("authors"))
.as[ORCIDItem]
.map(s => fixORCIDItem(s))
.write.mode(SaveMode.Overwrite).save(s"$workingPath/orcidworksWithAuthor")
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/orcidworksWithAuthor")
}
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/preprocess_orcid_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
SparkConvertORCIDToOAF.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/preprocess_orcid_params.json"
)
)
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
val sourcePath = parser.get("sourcePath")
val workingPath = parser.get("workingPath")

View File

@ -13,28 +13,35 @@ object SparkMapUnpayWallToOAF {
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkMapDumpIntoOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_uw_to_oaf_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
SparkMapDumpIntoOAF.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/convert_uw_to_oaf_params.json"
)
)
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
val inputRDD: RDD[String] = spark.sparkContext.textFile(s"$sourcePath")
logger.info("Converting UnpayWall to OAF")
val d: Dataset[Publication] = spark.createDataset(inputRDD.map(UnpayWallToOAF.convertToOAF).filter(p => p != null)).as[Publication]
val d: Dataset[Publication] = spark
.createDataset(inputRDD.map(UnpayWallToOAF.convertToOAF).filter(p => p != null))
.as[Publication]
d.write.mode(SaveMode.Overwrite).save(targetPath)
}

View File

@ -12,18 +12,22 @@ import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
case class OALocation(evidence:Option[String], host_type:Option[String], is_best:Option[Boolean], license: Option[String], pmh_id:Option[String], updated:Option[String],
url:Option[String], url_for_landing_page:Option[String], url_for_pdf:Option[String], version:Option[String]) {}
/** Open-access location record with all fields optional.
  *
  * NOTE(review): presumably mirrors one location entry of an Unpaywall record — confirm against the parser.
  * Of these fields, only `host_type` is read by the code visible near this definition (in `get_color`).
  *
  * @param evidence             optional evidence string
  * @param host_type            optional host type; checked by `get_color`
  * @param is_best              optional flag
  * @param license              optional license string
  * @param pmh_id               optional identifier string
  * @param updated              optional update marker, kept as a string
  * @param url                  optional URL
  * @param url_for_landing_page optional landing-page URL
  * @param url_for_pdf          optional PDF URL
  * @param version              optional version string
  */
case class OALocation(
  evidence: Option[String],
  host_type: Option[String],
  is_best: Option[Boolean],
  license: Option[String],
  pmh_id: Option[String],
  updated: Option[String],
  url: Option[String],
  url_for_landing_page: Option[String],
  url_for_pdf: Option[String],
  version: Option[String]
)
object UnpayWallToOAF {
val logger: Logger = LoggerFactory.getLogger(getClass)
def get_unpaywall_color(input: String): Option[OpenAccessRoute] = {
if (input == null || input.equalsIgnoreCase("close"))
return None
@ -38,7 +42,11 @@ object UnpayWallToOAF {
}
def get_color(is_oa:Boolean, location: OALocation, journal_is_oa:Boolean):Option[OpenAccessRoute] = {
def get_color(
is_oa: Boolean,
location: OALocation,
journal_is_oa: Boolean
): Option[OpenAccessRoute] = {
if (is_oa) {
if (location.host_type.isDefined) {
{
@ -62,7 +70,6 @@ object UnpayWallToOAF {
None
}
def convertToOAF(input: String): Publication = {
val pub = new Publication
@ -122,7 +129,4 @@ object UnpayWallToOAF {
}
}

View File

@ -9,12 +9,8 @@ class DoiBoostHostedByMapTest {
/** Prints the datasource id generated for a sample `doajarticles` identifier. */
def idDSGeneration(): Unit = {
  val generatedId = DoiBoostMappingUtil.generateDSId("doajarticles::0066-782X")
  println(generatedId)
}
}

View File

@ -13,7 +13,6 @@ class NormalizeDOITest {
}
@Test
def doiFiltered(): Unit = {
val doi = "0.1042/BCJ20160876"
@ -28,7 +27,6 @@ class NormalizeDOITest {
assert(DoiBoostMappingUtil.normalizeDoi(doi) == null)
}
@Test
def doiCleaned(): Unit = {
val doi = "https://doi.org/10.1042/BCJ20160876"

View File

@ -12,20 +12,24 @@ import scala.collection.JavaConverters._
import scala.io.Source
import scala.util.matching.Regex
class CrossrefMappingTest {
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
val mapper = new ObjectMapper()
@Test
def testFunderRelationshipsMapping(): Unit = {
val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")).mkString
val funder_doi = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi")).mkString
val funder_name = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi")).mkString
val template = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")
)
.mkString
val funder_doi = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi"))
.mkString
val funder_name = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi"))
.mkString
for (line <- funder_doi.lines) {
val json = template.replace("%s", line)
@ -43,7 +47,8 @@ class CrossrefMappingTest {
def checkRelation(generatedOAF: List[Oaf]): Unit = {
val rels: List[Relation] = generatedOAF.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]]
val rels: List[Relation] =
generatedOAF.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]]
assertFalse(rels.isEmpty)
rels.foreach(relation => {
val relJson = mapper.writeValueAsString(relation)
@ -59,22 +64,22 @@ class CrossrefMappingTest {
}
/** Prints a starting timestamp value next to the same value increased by a fixed delta. */
@Test
def testSum(): Unit = {
  val start: Long = 1613135645000L
  val increment: Long = 1000000L
  val updated: Long = start + increment
  println(s"updating from value: $start -> $updated")
}
@Test
def testOrcidID(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/orcid_data.json")).mkString
val json = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/orcid_data.json")
)
.mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@ -85,17 +90,18 @@ class CrossrefMappingTest {
val items = resultList.filter(p => p.isInstanceOf[Result])
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
items.foreach(p => println(mapper.writeValueAsString(p)))
}
@Test
def testEmptyTitle(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/empty_title.json")).mkString
val json = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/empty_title.json")
)
.mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@ -106,17 +112,16 @@ class CrossrefMappingTest {
val items = resultList.filter(p => p.isInstanceOf[Result])
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
items.foreach(p => println(mapper.writeValueAsString(p)))
}
@Test
def testPeerReviewed(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/prwTest.json")).mkString
val json = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/prwTest.json"))
.mkString
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
assertNotNull(json)
@ -128,12 +133,8 @@ class CrossrefMappingTest {
val items = resultList.filter(p => p.isInstanceOf[Result])
items.foreach(p => logger.info(mapper.writeValueAsString(p)))
}
def extractECAward(award: String): String = {
@ -143,7 +144,6 @@ class CrossrefMappingTest {
null
}
@Test
def extractECTest(): Unit = {
val s = "FP7/2007-2013"
@ -152,12 +152,13 @@ class CrossrefMappingTest {
println(DHPUtils.md5(awardExtracted))
}
@Test
def testJournalRelation(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/awardTest.json")).mkString
val json = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/awardTest.json"))
.mkString
assertNotNull(json)
assertFalse(json.isEmpty)
@ -165,20 +166,19 @@ class CrossrefMappingTest {
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
assertTrue(resultList.nonEmpty)
val rels:List[Relation] = resultList.filter(p => p.isInstanceOf[Relation]).map(r=> r.asInstanceOf[Relation])
val rels: List[Relation] =
resultList.filter(p => p.isInstanceOf[Relation]).map(r => r.asInstanceOf[Relation])
rels.foreach(s => logger.info(s.getTarget))
assertEquals(rels.size, 6)
}
@Test
def testConvertBookFromCrossRef2Oaf(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/book.json")).mkString
val json = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/book.json"))
.mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@ -199,42 +199,62 @@ class CrossrefMappingTest {
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
assertNotNull(
result.getDataInfo.getProvenanceaction,
"DataInfo/Provenance test not null Failed");
"DataInfo/Provenance test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
"DataInfo/Provenance/classId test not null Failed");
"DataInfo/Provenance/classId test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
"DataInfo/Provenance/className test not null Failed");
"DataInfo/Provenance/className test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
"DataInfo/Provenance/SchemeId test not null Failed");
"DataInfo/Provenance/SchemeId test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
"DataInfo/Provenance/SchemeName test not null Failed");
"DataInfo/Provenance/SchemeName test not null Failed"
);
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
assertFalse(result.getCollectedfrom.isEmpty);
val collectedFromList = result.getCollectedfrom.asScala
assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from assertion")
assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion")
assert(
collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")),
"Wrong collected from assertion"
)
assert(
collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")),
"Wrong collected from assertion"
)
val relevantDates = result.getRelevantdate.asScala
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created")
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")), "Missing relevant date of type published-online")
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")), "Missing relevant date of type published-print")
assert(
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")),
"Missing relevant date of type created"
)
assert(
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")),
"Missing relevant date of type published-online"
)
assert(
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")),
"Missing relevant date of type published-print"
)
val rels = resultList.filter(p => p.isInstanceOf[Relation])
assert(rels.isEmpty)
}
@Test
def testConvertPreprintFromCrossRef2Oaf(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/preprint.json")).mkString
val json = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/preprint.json"))
.mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@ -255,44 +275,70 @@ class CrossrefMappingTest {
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
assertNotNull(
result.getDataInfo.getProvenanceaction,
"DataInfo/Provenance test not null Failed");
"DataInfo/Provenance test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
"DataInfo/Provenance/classId test not null Failed");
"DataInfo/Provenance/classId test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
"DataInfo/Provenance/className test not null Failed");
"DataInfo/Provenance/className test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
"DataInfo/Provenance/SchemeId test not null Failed");
"DataInfo/Provenance/SchemeId test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
"DataInfo/Provenance/SchemeName test not null Failed");
"DataInfo/Provenance/SchemeName test not null Failed"
);
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
assertFalse(result.getCollectedfrom.isEmpty);
val collectedFromList = result.getCollectedfrom.asScala
assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from assertion")
assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion")
assert(
collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")),
"Wrong collected from assertion"
)
assert(
collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")),
"Wrong collected from assertion"
)
val relevantDates = result.getRelevantdate.asScala
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created")
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("available")), "Missing relevant date of type available")
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("accepted")), "Missing relevant date of type accepted")
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")), "Missing relevant date of type published-online")
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")), "Missing relevant date of type published-print")
assert(
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")),
"Missing relevant date of type created"
)
assert(
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("available")),
"Missing relevant date of type available"
)
assert(
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("accepted")),
"Missing relevant date of type accepted"
)
assert(
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")),
"Missing relevant date of type published-online"
)
assert(
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")),
"Missing relevant date of type published-print"
)
val rels = resultList.filter(p => p.isInstanceOf[Relation])
assert(rels.isEmpty)
}
@Test
def testConvertDatasetFromCrossRef2Oaf(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dataset.json")).mkString
val json = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dataset.json"))
.mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@ -313,19 +359,24 @@ class CrossrefMappingTest {
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
assertNotNull(
result.getDataInfo.getProvenanceaction,
"DataInfo/Provenance test not null Failed");
"DataInfo/Provenance test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
"DataInfo/Provenance/classId test not null Failed");
"DataInfo/Provenance/classId test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
"DataInfo/Provenance/className test not null Failed");
"DataInfo/Provenance/className test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
"DataInfo/Provenance/SchemeId test not null Failed");
"DataInfo/Provenance/SchemeId test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
"DataInfo/Provenance/SchemeName test not null Failed");
"DataInfo/Provenance/SchemeName test not null Failed"
);
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
assertFalse(result.getCollectedfrom.isEmpty);
@ -333,7 +384,9 @@ class CrossrefMappingTest {
@Test
def testConvertArticleFromCrossRef2Oaf(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json")).mkString
val json = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json"))
.mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@ -354,32 +407,45 @@ class CrossrefMappingTest {
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
assertNotNull(
result.getDataInfo.getProvenanceaction,
"DataInfo/Provenance test not null Failed");
"DataInfo/Provenance test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
"DataInfo/Provenance/classId test not null Failed");
"DataInfo/Provenance/classId test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
"DataInfo/Provenance/className test not null Failed");
"DataInfo/Provenance/className test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
"DataInfo/Provenance/SchemeId test not null Failed");
"DataInfo/Provenance/SchemeId test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
"DataInfo/Provenance/SchemeName test not null Failed");
"DataInfo/Provenance/SchemeName test not null Failed"
);
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
assertFalse(result.getCollectedfrom.isEmpty);
val collectedFromList = result.getCollectedfrom.asScala
assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from assertion")
assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion")
assert(
collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")),
"Wrong collected from assertion"
)
assert(
collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")),
"Wrong collected from assertion"
)
val relevantDates = result.getRelevantdate.asScala
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created")
assert(
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")),
"Missing relevant date of type created"
)
val rels = resultList.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]]
assertFalse(rels.isEmpty)
@ -393,15 +459,14 @@ class CrossrefMappingTest {
})
}
@Test
def testSetDateOfAcceptanceCrossRef2Oaf(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dump_file.json")).mkString
val json = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dump_file.json"))
.mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@ -421,8 +486,13 @@ class CrossrefMappingTest {
@Test
def testNormalizeDOI(): Unit = {
val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")).mkString
val line :String = "\"funder\": [{\"name\": \"Wellcome Trust Masters Fellowship\",\"award\": [\"090633\"]}],"
val template = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")
)
.mkString
val line: String =
"\"funder\": [{\"name\": \"Wellcome Trust Masters Fellowship\",\"award\": [\"090633\"]}],"
val json = template.replace("%s", line)
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
assertTrue(resultList.nonEmpty)
@ -431,13 +501,17 @@ class CrossrefMappingTest {
result.getPid.asScala.foreach(pid => assertTrue(pid.getQualifier.getClassid.equals("doi")))
assertTrue(result.getPid.size() == 1)
result.getPid.asScala.foreach(pid => assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase())))
result.getPid.asScala.foreach(pid =>
assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase()))
)
}
@Test
def testNormalizeDOI2(): Unit = {
val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json")).mkString
val template = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json"))
.mkString
val resultList: List[Oaf] = Crossref2Oaf.convert(template)
assertTrue(resultList.nonEmpty)
@ -446,14 +520,19 @@ class CrossrefMappingTest {
result.getPid.asScala.foreach(pid => assertTrue(pid.getQualifier.getClassid.equals("doi")))
assertTrue(result.getPid.size() == 1)
result.getPid.asScala.foreach(pid => assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase())))
result.getPid.asScala.foreach(pid =>
assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase()))
)
}
@Test
def testLicenseVorClosed(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_vor.json")).mkString
val json = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_vor.json")
)
.mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@ -462,25 +541,28 @@ class CrossrefMappingTest {
assertTrue(resultList.nonEmpty)
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
println(mapper.writeValueAsString(item))
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://www.springer.com/vor")))
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("CLOSED")))
assertTrue(
item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://www.springer.com/vor"))
)
assertTrue(
item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("CLOSED"))
)
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
}
@Test
def testLicenseOpen(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_open.json")).mkString
val json = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_open.json")
)
.mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@ -489,12 +571,19 @@ class CrossrefMappingTest {
assertTrue(resultList.nonEmpty)
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("http://pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html")))
assertTrue(
item.getInstance().asScala exists (i =>
i.getLicense.getValue.equals(
"http://pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html"
)
)
)
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN")))
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid))
assertTrue(
item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid)
)
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
println(mapper.writeValueAsString(item))
@ -502,8 +591,13 @@ class CrossrefMappingTest {
@Test
def testLicenseEmbargoOpen(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo_open.json")).mkString
val json = Source
.fromInputStream(
getClass.getResourceAsStream(
"/eu/dnetlib/doiboost/crossref/publication_license_embargo_open.json"
)
)
.mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@ -512,12 +606,19 @@ class CrossrefMappingTest {
assertTrue(resultList.nonEmpty)
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")))
assertTrue(
item.getInstance().asScala exists (i =>
i.getLicense.getValue.equals(
"https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"
)
)
)
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN")))
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid))
assertTrue(
item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid)
)
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
println(mapper.writeValueAsString(item))
@ -525,8 +626,13 @@ class CrossrefMappingTest {
@Test
def testLicenseEmbargo(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo.json")).mkString
val json = Source
.fromInputStream(
getClass.getResourceAsStream(
"/eu/dnetlib/doiboost/crossref/publication_license_embargo.json"
)
)
.mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@ -535,22 +641,33 @@ class CrossrefMappingTest {
assertTrue(resultList.nonEmpty)
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")))
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO")))
assertTrue(
item.getInstance().asScala exists (i =>
i.getLicense.getValue.equals(
"https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"
)
)
)
assertTrue(
item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO"))
)
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
println(mapper.writeValueAsString(item))
}
@Test
def testLicenseEmbargoDateTime(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo_datetime.json")).mkString
val json = Source
.fromInputStream(
getClass.getResourceAsStream(
"/eu/dnetlib/doiboost/crossref/publication_license_embargo_datetime.json"
)
)
.mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@ -559,11 +676,18 @@ class CrossrefMappingTest {
assertTrue(resultList.nonEmpty)
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")))
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO")))
assertTrue(
item.getInstance().asScala exists (i =>
i.getLicense.getValue.equals(
"https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"
)
)
)
assertTrue(
item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO"))
)
assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null))
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
println(mapper.writeValueAsString(item))
@ -572,8 +696,11 @@ class CrossrefMappingTest {
@Test
def testMultipleURLs(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/multiple_urls.json")).mkString
val json = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/multiple_urls.json")
)
.mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@ -582,12 +709,14 @@ class CrossrefMappingTest {
assertTrue(resultList.nonEmpty)
val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
assertEquals(1, item.getInstance().size())
assertEquals(1, item.getInstance().get(0).getUrl().size())
assertEquals("https://doi.org/10.1016/j.jas.2019.105013", item.getInstance().get(0).getUrl().get(0))
assertEquals(
"https://doi.org/10.1016/j.jas.2019.105013",
item.getInstance().get(0).getUrl().get(0)
)
//println(mapper.writeValueAsString(item))
}

View File

@ -12,29 +12,21 @@ import org.slf4j.{Logger, LoggerFactory}
import java.sql.Timestamp
import scala.io.Source
class MAGMappingTest {
val logger: Logger = LoggerFactory.getLogger(getClass)
val mapper = new ObjectMapper()
/** Prints the part of a dotted name that precedes the first dot.
  *
  * Uses regular method-call syntax instead of postfix operator notation, which
  * requires `scala.language.postfixOps` to be enabled, and `headOption` so an
  * empty split result prints nothing instead of throwing.
  */
@Test
def testSplitter(): Unit = {
  val s = "sports.team"
  if (s.contains(".")) {
    // The dot is escaped because String.split takes a regular expression.
    s.split("\\.").headOption.foreach(println)
  }
}
@Test
def testDate(): Unit = {
@ -44,11 +36,11 @@ class MAGMappingTest {
}
@Test
def buildInvertedIndexTest(): Unit = {
val json_input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/mag/invertedIndex.json")).mkString
val json_input = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/mag/invertedIndex.json"))
.mkString
val description = ConversionUtil.convertInvertedIndexString(json_input)
assertNotNull(description)
assertTrue(description.nonEmpty)
@ -56,11 +48,10 @@ class MAGMappingTest {
logger.debug(description)
}
@Test
def normalizeDoiTest(): Unit = {
implicit val formats = DefaultFormats
val conf = new SparkConf()
@ -78,7 +69,8 @@ class MAGMappingTest {
val schema = Encoders.product[MagPapers].schema
import spark.implicits._
val magPapers :Dataset[MagPapers] = spark.read.option("multiline",true).schema(schema).json(path).as[MagPapers]
val magPapers: Dataset[MagPapers] =
spark.read.option("multiline", true).schema(schema).json(path).as[MagPapers]
val ret: Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
assertTrue(ret.count == 10)
ret.take(10).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase())))
@ -108,7 +100,8 @@ class MAGMappingTest {
val schema = Encoders.product[MagPapers].schema
import spark.implicits._
val magPapers :Dataset[MagPapers] = spark.read.option("multiline",true).schema(schema).json(path).as[MagPapers]
val magPapers: Dataset[MagPapers] =
spark.read.option("multiline", true).schema(schema).json(path).as[MagPapers]
val ret: Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
assertTrue(ret.count == 8)
ret.take(8).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase())))
@ -116,7 +109,4 @@ class MAGMappingTest {
//ret.take(8).foreach(mp => println(write(mp)))
}
}

View File

@ -20,7 +20,9 @@ class MappingORCIDToOAFTest {
@Test
def testExtractData(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/orcid/dataOutput")).mkString
val json = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/orcid/dataOutput"))
.mkString
assertNotNull(json)
assertFalse(json.isEmpty)
json.lines.foreach(s => {
@ -52,12 +54,8 @@ class MappingORCIDToOAFTest {
val mapper = new ObjectMapper()
val oA = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem].count()
val p: Dataset[Publication] = spark.read.load(targetPath).as[Publication]
assertTrue(oA == p.count())
@ -65,17 +63,16 @@ class MappingORCIDToOAFTest {
spark.close()
}
@Test
def testExtractDat1(): Unit = {
val aList: List[OrcidAuthor] = List(OrcidAuthor("0000-0002-4335-5309", Some("Lucrecia"), Some("Curto"), null, null, null ),
OrcidAuthor("0000-0001-7501-3330", Some("Emilio"), Some("Malchiodi"), null, null, null ), OrcidAuthor("0000-0002-5490-9186", Some("Sofia"), Some("Noli Truant"), null, null, null ))
val aList: List[OrcidAuthor] = List(
OrcidAuthor("0000-0002-4335-5309", Some("Lucrecia"), Some("Curto"), null, null, null),
OrcidAuthor("0000-0001-7501-3330", Some("Emilio"), Some("Malchiodi"), null, null, null),
OrcidAuthor("0000-0002-5490-9186", Some("Sofia"), Some("Noli Truant"), null, null, null)
)
val orcid: ORCIDItem = ORCIDItem("10.1042/BCJ20160876", aList)
@ -85,10 +82,6 @@ class MappingORCIDToOAFTest {
oaf.getPid.toList.foreach(pid => assert(pid.getValue.equals("10.1042/BCJ20160876")))
//println(mapper.writeValueAsString(ORCIDToOAF.convertTOOAF(orcid)))
}
}

View File

@ -14,11 +14,12 @@ class UnpayWallMappingTest {
val logger: Logger = LoggerFactory.getLogger(getClass)
val mapper = new ObjectMapper()
@Test
def testMappingToOAF(): Unit = {
val Ilist = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/uw/input.json")).mkString
val Ilist = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/uw/input.json"))
.mkString
var i: Int = 0
for (line <- Ilist.lines) {
@ -42,13 +43,14 @@ class UnpayWallMappingTest {
i = i + 1
}
val l = Ilist.lines.next()
val item = UnpayWallToOAF.convertToOAF(l)
assertEquals(item.getInstance().get(0).getAccessright.getOpenAccessRoute, OpenAccessRoute.bronze)
assertEquals(
item.getInstance().get(0).getAccessright.getOpenAccessRoute,
OpenAccessRoute.bronze
)
logger.info(mapper.writeValueAsString(item))

View File

@ -4,17 +4,29 @@ import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn}
/** Record used as the value type of `Aggregators.hostedByAggregator`.
  *
  * When two records are merged by that aggregator, the `openAccess` flags are
  * OR-ed together and the remaining fields are copied from one of the two records.
  *
  * NOTE(review): the three ISSN fields presumably hold print / electronic / linking
  * ISSNs — confirm against the code that builds these records.
  */
case class HostedByItemType(
  id: String,
  officialname: String,
  issn: String,
  eissn: String,
  lissn: String,
  openAccess: Boolean
)
/** Plain record of five string fields describing a hosted-by entry.
  *
  * NOTE(review): the meaning of `journal_id`, `provenance` and `id_type` is not
  * visible in this part of the file — confirm against the producers of this type.
  */
case class HostedByInfo(
  id: String,
  officialname: String,
  journal_id: String,
  provenance: String,
  id_type: String
)
object Aggregators {
/** Chooses between two identifiers, preferring the first when it carries the `10|` prefix.
  *
  * @param s1 first candidate; returned when it starts with `"10|"`
  * @param s2 second candidate; returned in every other case
  * @return `s1` if it starts with `"10|"`, otherwise `s2`
  */
def getId(s1: String, s2: String): String =
  if (s1.startsWith("10|")) s1 else s2
@ -25,24 +37,40 @@ object Aggregators {
s2
}
/** Collapses all rows that share the same key into a single row.
  *
  * Rows are grouped on the first tuple element, each group is reduced with
  * `Aggregators.hostedByAggregator`, and the grouping key added by `agg` is
  * dropped so that only the aggregated `(String, HostedByItemType)` pair remains.
  *
  * @param df dataset of `(key, item)` pairs
  * @return one aggregated `(String, HostedByItemType)` pair per distinct key
  */
def explodeHostedByItemType(
  df: Dataset[(String, HostedByItemType)]
): Dataset[(String, HostedByItemType)] = {
  val transformedData: Dataset[(String, HostedByItemType)] = df
    .groupByKey(_._1)(Encoders.STRING)
    .agg(Aggregators.hostedByAggregator)
    // agg yields (groupingKey, aggregate); only the aggregate is kept.
    .map { case (_: String, res: (String, HostedByItemType)) =>
      res
    }(Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]))
  transformedData
}
val hostedByAggregator: TypedColumn[(String, HostedByItemType), (String, HostedByItemType)] = new Aggregator[(String, HostedByItemType), (String, HostedByItemType), (String, HostedByItemType)] {
override def zero: (String, HostedByItemType) = ("", HostedByItemType("","","","","",false))
override def reduce(b: (String, HostedByItemType), a:(String,HostedByItemType)): (String, HostedByItemType) = {
val hostedByAggregator: TypedColumn[(String, HostedByItemType), (String, HostedByItemType)] =
new Aggregator[
(String, HostedByItemType),
(String, HostedByItemType),
(String, HostedByItemType)
] {
override def zero: (String, HostedByItemType) =
("", HostedByItemType("", "", "", "", "", false))
override def reduce(
b: (String, HostedByItemType),
a: (String, HostedByItemType)
): (String, HostedByItemType) = {
return merge(b, a)
}
override def merge(b1: (String, HostedByItemType), b2: (String, HostedByItemType)): (String, HostedByItemType) = {
override def merge(
b1: (String, HostedByItemType),
b2: (String, HostedByItemType)
): (String, HostedByItemType) = {
if (b1 == null) {
return b2
}
@ -50,27 +78,51 @@ object Aggregators {
return b1
}
if (b1._2.id.startsWith("10|")) {
return (b1._1, HostedByItemType(b1._2.id, b1._2.officialname, b1._2.issn, b1._2.eissn, b1._2.lissn, b1._2.openAccess || b2._2.openAccess))
return (
b1._1,
HostedByItemType(
b1._2.id,
b1._2.officialname,
b1._2.issn,
b1._2.eissn,
b1._2.lissn,
b1._2.openAccess || b2._2.openAccess
)
)
}
return (b2._1, HostedByItemType(b2._2.id, b2._2.officialname, b2._2.issn, b2._2.eissn, b2._2.lissn, b1._2.openAccess || b2._2.openAccess))
return (
b2._1,
HostedByItemType(
b2._2.id,
b2._2.officialname,
b2._2.issn,
b2._2.eissn,
b2._2.lissn,
b1._2.openAccess || b2._2.openAccess
)
)
}
override def finish(reduction: (String,HostedByItemType)): (String, HostedByItemType) = reduction
override def bufferEncoder: Encoder[(String,HostedByItemType)] = Encoders.tuple(Encoders.STRING,Encoders.product[HostedByItemType])
override def outputEncoder: Encoder[(String,HostedByItemType)] = Encoders.tuple(Encoders.STRING,Encoders.product[HostedByItemType])
override def finish(reduction: (String, HostedByItemType)): (String, HostedByItemType) =
reduction
override def bufferEncoder: Encoder[(String, HostedByItemType)] =
Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
override def outputEncoder: Encoder[(String, HostedByItemType)] =
Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
}.toColumn
def resultToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] = new Aggregator[EntityInfo, EntityInfo, EntityInfo]{
def resultToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] =
new Aggregator[EntityInfo, EntityInfo, EntityInfo] {
override def zero: EntityInfo = EntityInfo.newInstance("", "", "")
override def reduce(b: EntityInfo, a: EntityInfo): EntityInfo = {
return merge(b, a)
}
override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = {
if (b1 == null) {
return b2
@ -96,19 +148,21 @@ object Aggregators {
val transformedData: Dataset[EntityInfo] = df
.groupByKey(_.getId)(Encoders.STRING)
.agg(Aggregators.resultToSingleIdAggregator)
.map{
case (id:String , res: EntityInfo) => res
.map { case (id: String, res: EntityInfo) =>
res
}(Encoders.bean(classOf[EntityInfo]))
transformedData
}
def datasourceToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] = new Aggregator[EntityInfo, EntityInfo, EntityInfo]{
def datasourceToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] =
new Aggregator[EntityInfo, EntityInfo, EntityInfo] {
override def zero: EntityInfo = EntityInfo.newInstance("", "", "")
override def reduce(b: EntityInfo, a: EntityInfo): EntityInfo = {
return merge(b, a)
}
override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = {
if (b1 == null) {
return b2
@ -128,13 +182,12 @@ object Aggregators {
override def outputEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
}.toColumn
def datasourceToSingleId(df: Dataset[EntityInfo]): Dataset[EntityInfo] = {
val transformedData: Dataset[EntityInfo] = df
.groupByKey(_.getHostedById)(Encoders.STRING)
.agg(Aggregators.datasourceToSingleIdAggregator)
.map{
case (id:String , res: EntityInfo) => res
.map { case (id: String, res: EntityInfo) =>
res
}(Encoders.bean(classOf[EntityInfo]))
transformedData

View File

@ -14,7 +14,8 @@ import org.slf4j.{Logger, LoggerFactory}
object SparkApplyHostedByMapToDatasource {
def applyHBtoDats(join: Dataset[EntityInfo], dats: Dataset[Datasource]): Dataset[Datasource] = {
dats.joinWith(join, dats.col("id").equalTo(join.col("hostedById")), "left")
dats
.joinWith(join, dats.col("id").equalTo(join.col("hostedById")), "left")
.map(t2 => {
val d: Datasource = t2._1
if (t2._2 != null) {
@ -31,14 +32,21 @@ object SparkApplyHostedByMapToDatasource {
val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json"
)
)
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
val graphPath = parser.get("graphPath")
val outputPath = parser.get("outputPath")
@ -51,20 +59,27 @@ object SparkApplyHostedByMapToDatasource {
val mapper = new ObjectMapper()
val dats: Dataset[Datasource] = spark.read.textFile(graphPath + "/datasource")
val dats: Dataset[Datasource] = spark.read
.textFile(graphPath + "/datasource")
.map(r => mapper.readValue(r, classOf[Datasource]))
val pinfo: Dataset[EntityInfo] = Aggregators.datasourceToSingleId(spark.read.textFile(preparedInfoPath)
.map(ei => mapper.readValue(ei, classOf[EntityInfo])))
val pinfo: Dataset[EntityInfo] = Aggregators.datasourceToSingleId(
spark.read
.textFile(preparedInfoPath)
.map(ei => mapper.readValue(ei, classOf[EntityInfo]))
)
applyHBtoDats(pinfo, dats).write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath)
applyHBtoDats(pinfo, dats).write
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath)
spark.read.textFile(outputPath)
spark.read
.textFile(outputPath)
.write
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.text(graphPath + "/datasource")
}
}

View File

@ -16,7 +16,8 @@ import scala.collection.JavaConverters._
object SparkApplyHostedByMapToResult {
def applyHBtoPubs(join: Dataset[EntityInfo], pubs: Dataset[Publication]) = {
pubs.joinWith(join, pubs.col("id").equalTo(join.col("id")), "left")
pubs
.joinWith(join, pubs.col("id").equalTo(join.col("id")), "left")
.map(t2 => {
val p: Publication = t2._1
if (t2._2 != null) {
@ -27,7 +28,14 @@ object SparkApplyHostedByMapToResult {
inst.getHostedby.setKey(ei.getHostedById)
inst.getHostedby.setValue(ei.getName)
if (ei.getOpenAccess) {
inst.setAccessright(OafMapperUtils.accessRight(ModelConstants.ACCESS_RIGHT_OPEN, "Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES))
inst.setAccessright(
OafMapperUtils.accessRight(
ModelConstants.ACCESS_RIGHT_OPEN,
"Open Access",
ModelConstants.DNET_ACCESS_MODES,
ModelConstants.DNET_ACCESS_MODES
)
)
inst.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
p.setBestaccessright(OafMapperUtils.createBestAccessRights(p.getInstance()));
}
@ -40,46 +48,54 @@ object SparkApplyHostedByMapToResult {
/** Entry point: applies the prepared hosted-by information to the graph publications.
  *
  * Reads the publications under `graphPath/publication` and the prepared
  * [[EntityInfo]] records under `preparedInfoPath`, writes the result of
  * `applyHBtoPubs` as gzip-compressed JSON to `outputPath`, and finally copies
  * that output back over `graphPath/publication`.
  *
  * @param args command line arguments, parsed against `hostedby_apply_params.json`
  */
def main(args: Array[String]): Unit = {
  val logger: Logger = LoggerFactory.getLogger(getClass)
  val conf: SparkConf = new SparkConf()

  val paramsJson = IOUtils.toString(
    getClass.getResourceAsStream(
      "/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json"
    )
  )
  val parser = new ArgumentApplicationParser(paramsJson)
  parser.parseArgument(args)

  val sessionBuilder = SparkSession.builder().config(conf).appName(getClass.getSimpleName)
  val spark: SparkSession = sessionBuilder.master(parser.get("master")).getOrCreate()

  val graphPath = parser.get("graphPath")
  val outputPath = parser.get("outputPath")
  val preparedInfoPath = parser.get("preparedInfoPath")
  val publicationPath = s"$graphPath/publication"

  implicit val formats = DefaultFormats
  implicit val mapEncoderPubs: Encoder[Publication] = Encoders.bean(classOf[Publication])
  implicit val mapEncoderEinfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])

  val mapper = new ObjectMapper()

  // Each input line is one JSON-serialized record.
  val publications: Dataset[Publication] =
    spark.read.textFile(publicationPath).map(line => mapper.readValue(line, classOf[Publication]))
  val preparedInfo: Dataset[EntityInfo] =
    spark.read.textFile(preparedInfoPath).map(line => mapper.readValue(line, classOf[EntityInfo]))

  // First pass: write the enriched publications to the output location.
  val enriched = applyHBtoPubs(preparedInfo, publications)
  enriched.write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath)

  // Second pass: read the freshly written output back as text and overwrite the
  // original publication folder with it.
  val written = spark.read.textFile(outputPath)
  written.write.mode(SaveMode.Overwrite).option("compression", "gzip").text(publicationPath)
}
}

View File

@ -19,7 +19,6 @@ object SparkPrepareHostedByInfoToApply {
def getList(id: String, j: Journal, name: String): List[EntityInfo] = {
var lst: List[EntityInfo] = List()
if (j.getIssnLinking != null && !j.getIssnLinking.equals("")) {
lst = EntityInfo.newInstance(id, j.getIssnLinking, name) :: lst
}
@ -37,14 +36,14 @@ object SparkPrepareHostedByInfoToApply {
val mapper = new ObjectMapper()
val dd: Dataset[Publication] = spark.read.textFile(publicationPath)
val dd: Dataset[Publication] = spark.read
.textFile(publicationPath)
.map(r => mapper.readValue(r, classOf[Publication]))
dd.filter(p => p.getJournal != null).flatMap(p => getList(p.getId, p.getJournal, ""))
}
def toEntityInfo(input: String): EntityInfo = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -53,7 +52,6 @@ object SparkPrepareHostedByInfoToApply {
toEntityItem(c.keys.head, c.values.head)
}
def toEntityItem(journal_id: String, hbi: HostedByItemType): EntityInfo = {
EntityInfo.newInstance(hbi.id, journal_id, hbi.officialname, hbi.openAccess)
@ -61,7 +59,9 @@ object SparkPrepareHostedByInfoToApply {
}
def joinResHBM(res: Dataset[EntityInfo], hbm: Dataset[EntityInfo]): Dataset[EntityInfo] = {
Aggregators.resultToSingleId(res.joinWith(hbm, res.col("journalId").equalTo(hbm.col("journalId")), "left")
Aggregators.resultToSingleId(
res
.joinWith(hbm, res.col("journalId").equalTo(hbm.col("journalId")), "left")
.map(t2 => {
val res: EntityInfo = t2._1
if (t2._2 != null) {
@ -71,52 +71,57 @@ object SparkPrepareHostedByInfoToApply {
res.setName(ds.getName)
}
res
}))
})
)
}
/** Entry point: prepares the hosted-by information to be applied to the graph.
  *
  * Loads the hosted-by map from `hostedByMapPath`, derives the result info from
  * `graphPath/publication`, joins the two through `joinResHBM` and writes the
  * outcome as gzip-compressed JSON to `preparedInfoPath`.
  *
  * @param args command line arguments, parsed against `hostedby_prepare_params.json`
  */
def main(args: Array[String]): Unit = {
  val logger: Logger = LoggerFactory.getLogger(getClass)
  val conf: SparkConf = new SparkConf()

  val paramsJson = IOUtils.toString(
    getClass.getResourceAsStream(
      "/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_prepare_params.json"
    )
  )
  val parser = new ArgumentApplicationParser(paramsJson)
  parser.parseArgument(args)

  val sessionBuilder = SparkSession.builder().config(conf).appName(getClass.getSimpleName)
  val spark: SparkSession = sessionBuilder.master(parser.get("master")).getOrCreate()

  val graphPath = parser.get("graphPath")
  val outputPath = parser.get("preparedInfoPath")
  val hostedByMapPath = parser.get("hostedByMapPath")

  implicit val formats = DefaultFormats

  logger.info("Getting the Datasources")

  import spark.implicits._

  // STEP 1: read the hosted-by map and turn every line into an EntityInfo.
  val hostedByMapLines = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath))
  val hostedByInfo: Dataset[EntityInfo] = hostedByMapLines.map(toEntityInfo)

  // STEP 2: create the associations (publication, issn), (publication, eissn),
  // (publication, lissn).
  val resultInfoDataset: Dataset[EntityInfo] =
    prepareResultInfo(spark, s"$graphPath/publication")

  // STEP 3: left join resultInfo with hostedByInfo on journal_id, then reduce all
  // the results sharing the same id to a single entry (one result can be associated
  // with both issn and eissn, and so possibly match the map more than once).
  // The id of the datasource is added to that entry for the next step.
  val prepared = joinResHBM(resultInfoDataset, hostedByInfo)
  prepared.write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath)
}
}

View File

@ -17,9 +17,8 @@ import java.io.PrintWriter
object SparkProduceHostedByMap {
implicit val tupleForJoinEncoder: Encoder[(String, HostedByItemType)] = Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
implicit val tupleForJoinEncoder: Encoder[(String, HostedByItemType)] =
Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
def toHostedByItemType(input: ((HostedByInfo, HostedByInfo), HostedByInfo)): HostedByItemType = {
val openaire: HostedByInfo = input._1._1
@ -28,9 +27,33 @@ object SparkProduceHostedByMap {
val isOpenAccess: Boolean = doaj == null && gold == null
openaire.journal_id match {
case Constants.ISSN => HostedByItemType(openaire.id, openaire.officialname, openaire.journal_id, "", "", isOpenAccess)
case Constants.EISSN => HostedByItemType(openaire.id, openaire.officialname, "", openaire.journal_id, "", isOpenAccess)
case Constants.ISSNL => HostedByItemType(openaire.id, openaire.officialname, "", "", openaire.journal_id, isOpenAccess)
case Constants.ISSN =>
HostedByItemType(
openaire.id,
openaire.officialname,
openaire.journal_id,
"",
"",
isOpenAccess
)
case Constants.EISSN =>
HostedByItemType(
openaire.id,
openaire.officialname,
"",
openaire.journal_id,
"",
isOpenAccess
)
case Constants.ISSNL =>
HostedByItemType(
openaire.id,
openaire.officialname,
"",
"",
openaire.journal_id,
isOpenAccess
)
// catch the default with a variable so you can print it
case whoa => null
@ -46,11 +69,16 @@ object SparkProduceHostedByMap {
Serialization.write(map)
}
def getHostedByItemType(id: String, officialname: String, issn: String, eissn: String, issnl: String, oa: Boolean): HostedByItemType = {
def getHostedByItemType(
id: String,
officialname: String,
issn: String,
eissn: String,
issnl: String,
oa: Boolean
): HostedByItemType = {
if (issn != null) {
if (eissn != null) {
if (issnl != null) {
@ -85,7 +113,14 @@ object SparkProduceHostedByMap {
/** Converts an OpenAIRE [[Datasource]] into a [[HostedByItemType]].
  *
  * When the datasource carries journal information, the item is built through
  * `getHostedByItemType` from the datasource id, its official name and the three
  * journal ISSNs (printed, online, linking), with the open-access flag set to `false`.
  * Otherwise an item with empty string fields (including an empty id) is returned.
  *
  * @param dats the datasource to convert
  * @return the corresponding hosted-by item
  */
def oaToHostedbyItemType(dats: Datasource): HostedByItemType =
  if (dats.getJournal != null) {
    // NOTE(review): getOfficialname is dereferenced without a null check —
    // confirm it is always populated for datasources that have a journal.
    getHostedByItemType(
      dats.getId,
      dats.getOfficialname.getValue,
      dats.getJournal.getIssnPrinted,
      dats.getJournal.getIssnOnline,
      dats.getJournal.getIssnLinking,
      oa = false
    )
  } else {
    HostedByItemType("", "", "", "", "", false)
  }
@ -94,32 +129,41 @@ object SparkProduceHostedByMap {
import spark.implicits._
val mapper = new ObjectMapper()
implicit var encoderD = Encoders.kryo[Datasource]
val dd: Dataset[Datasource] = spark.read.textFile(datasourcePath)
val dd: Dataset[Datasource] = spark.read
.textFile(datasourcePath)
.map(r => mapper.readValue(r, classOf[Datasource]))
dd.map { ddt => oaToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
}
/** Converts a Unibi gold-list entry into a [[HostedByItemType]].
  *
  * The item is built through `getHostedByItemType` with `Constants.UNIBI` as id,
  * the entry title as official name, its ISSN and linking ISSN, an empty eISSN,
  * and the open-access flag set to `true`.
  *
  * @param gold the Unibi gold-list entry
  * @return the corresponding hosted-by item
  */
def goldToHostedbyItemType(gold: UnibiGoldModel): HostedByItemType =
  getHostedByItemType(
    Constants.UNIBI,
    gold.getTitle,
    gold.getIssn,
    "",
    gold.getIssnL,
    oa = true
  )
def goldHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = {
def goldHostedByDataset(
spark: SparkSession,
datasourcePath: String
): Dataset[HostedByItemType] = {
import spark.implicits._
implicit val mapEncoderUnibi: Encoder[UnibiGoldModel] = Encoders.kryo[UnibiGoldModel]
val mapper = new ObjectMapper()
val dd: Dataset[UnibiGoldModel] = spark.read.textFile(datasourcePath)
val dd: Dataset[UnibiGoldModel] = spark.read
.textFile(datasourcePath)
.map(r => mapper.readValue(r, classOf[UnibiGoldModel]))
dd.map { ddt => goldToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
@ -128,17 +172,28 @@ object SparkProduceHostedByMap {
/** Converts a DOAJ entry into a [[HostedByItemType]].
  *
  * The item is built through `getHostedByItemType` with `Constants.DOAJ` as id,
  * the journal title as official name, its ISSN and eISSN, an empty linking ISSN,
  * and the open-access flag set to `true`.
  *
  * @param doaj the DOAJ entry
  * @return the corresponding hosted-by item
  */
def doajToHostedbyItemType(doaj: DOAJModel): HostedByItemType =
  getHostedByItemType(
    Constants.DOAJ,
    doaj.getJournalTitle,
    doaj.getIssn,
    doaj.getEissn,
    "",
    oa = true
  )
def doajHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = {
def doajHostedByDataset(
spark: SparkSession,
datasourcePath: String
): Dataset[HostedByItemType] = {
import spark.implicits._
implicit val mapEncoderDOAJ: Encoder[DOAJModel] = Encoders.kryo[DOAJModel]
val mapper = new ObjectMapper()
val dd: Dataset[DOAJModel] = spark.read.textFile(datasourcePath)
val dd: Dataset[DOAJModel] = spark.read
.textFile(datasourcePath)
.map(r => mapper.readValue(r, classOf[DOAJModel]))
dd.map { ddt => doajToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
@ -159,7 +214,6 @@ object SparkProduceHostedByMap {
lst
}
def writeToHDFS(input: Array[String], outputPath: String, hdfsNameNode: String): Unit = {
val conf = new Configuration()
@ -169,49 +223,51 @@ object SparkProduceHostedByMap {
val writer = new PrintWriter(output)
try {
input.foreach(hbi => writer.println(hbi))
}
finally {
} finally {
writer.close()
}
}
/** Entry point: produces the hosted-by map.
  *
  * Builds the hosted-by items from the OpenAIRE datasources (`datasourcePath`),
  * the Unibi gold list (`workingPath/unibi_gold.json`) and DOAJ
  * (`workingPath/doaj.json`), unions them, aggregates them through
  * `Aggregators.explodeHostedByItemType`, keeps only the entries whose item id
  * starts with `10|`, and saves their serialized form as gzip-compressed text
  * under `outputPath`.
  *
  * @param args command line arguments, parsed against `hostedby_params.json`
  */
def main(args: Array[String]): Unit = {
  val logger: Logger = LoggerFactory.getLogger(getClass)
  val conf: SparkConf = new SparkConf()

  val paramsJson = IOUtils.toString(
    getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_params.json")
  )
  val parser = new ArgumentApplicationParser(paramsJson)
  parser.parseArgument(args)

  val sessionBuilder = SparkSession.builder().config(conf).appName(getClass.getSimpleName)
  val spark: SparkSession = sessionBuilder.master(parser.get("master")).getOrCreate()

  val datasourcePath = parser.get("datasourcePath")
  val workingDirPath = parser.get("workingPath")
  val outputPath = parser.get("outputPath")

  implicit val formats = DefaultFormats

  logger.info("Getting the Datasources")

  // One dataset of hosted-by items per source, in the same order they are unioned.
  val openaireItems = oaHostedByDataset(spark, datasourcePath)
  val goldItems = goldHostedByDataset(spark, s"$workingDirPath/unibi_gold.json")
  val doajItems = doajHostedByDataset(spark, s"$workingDirPath/doaj.json")

  val keyedItems = openaireItems
    .union(goldItems)
    .union(doajItems)
    .flatMap(item => toList(item))

  Aggregators
    .explodeHostedByItemType(keyedItems)
    .filter(entry => entry._2.id.startsWith("10|"))
    .map(entry => toHostedByMap(entry))(Encoders.STRING)
    .rdd
    .saveAsTextFile(outputPath, classOf[GzipCodec])
}
}

View File

@ -20,7 +20,13 @@ object CopyHdfsOafSparkApplication {
def main(args: Array[String]): Unit = {
val log = LoggerFactory.getLogger(getClass)
val conf = new SparkConf()
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/copy_hdfs_oaf_parameters.json")).mkString)
val parser = new ArgumentApplicationParser(
Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/copy_hdfs_oaf_parameters.json")
)
.mkString
)
parser.parseArgument(args)
val spark =
@ -28,7 +34,8 @@ object CopyHdfsOafSparkApplication {
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
val sc: SparkContext = spark.sparkContext
@ -49,19 +56,22 @@ object CopyHdfsOafSparkApplication {
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
val paths = DHPUtils.mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation, true).asScala
val paths =
DHPUtils.mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation, true).asScala
val validPaths: List[String] = paths.filter(p => HdfsSupport.exists(p, sc.hadoopConfiguration)).toList
val validPaths: List[String] =
paths.filter(p => HdfsSupport.exists(p, sc.hadoopConfiguration)).toList
val types = ModelSupport.oafTypes.entrySet
.asScala
val types = ModelSupport.oafTypes.entrySet.asScala
.map(e => Tuple2(e.getKey, e.getValue))
if (validPaths.nonEmpty) {
val oaf = spark.read.textFile(validPaths: _*)
val mapper = new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
val mapper =
new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
types.foreach(t => oaf
types.foreach(t =>
oaf
.filter(o => isOafType(o, t._1))
.map(j => mapper.readValue(j, t._2).asInstanceOf[Oaf])
.map(s => mapper.writeValueAsString(s))(Encoders.STRING)

View File

@ -13,20 +13,32 @@ import org.slf4j.{Logger, LoggerFactory}
object SparkResolveEntities {
val mapper = new ObjectMapper()
val entities = List(EntityType.dataset, EntityType.publication, EntityType.software, EntityType.otherresearchproduct)
val entities = List(
EntityType.dataset,
EntityType.publication,
EntityType.software,
EntityType.otherresearchproduct
)
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/resolution/resolve_entities_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/resolution/resolve_entities_params.json"
)
)
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
val graphBasePath = parser.get("graphBasePath")
log.info(s"graphBasePath -> $graphBasePath")
@ -38,7 +50,6 @@ object SparkResolveEntities {
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
fs.mkdirs(new Path(workingPath))
@ -46,23 +57,30 @@ object SparkResolveEntities {
generateResolvedEntities(spark, workingPath, graphBasePath, targetPath)
}
/** Assigns resolved identifiers to the unresolved results and stores them.
  *
  * Loads the `(String, String)` pairs from `workingPath/relationResolvedPid`,
  * parses every line of `unresolvedPath` into a [[Result]] keyed by its id, and
  * inner-joins the two on the second element of the pair against the result id.
  * Each matching result gets the first element of the pair as its new id and is
  * saved (overwriting) under `workingPath/resolvedEntities`.
  *
  * @param spark          the active Spark session
  * @param workingPath    folder holding `relationResolvedPid` and receiving `resolvedEntities`
  * @param unresolvedPath text input with one JSON-serialized result per line
  */
def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: String) = {
  implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
  import spark.implicits._

  val resolvedPids: Dataset[(String, String)] =
    spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]

  val unresolvedResults: Dataset[Result] = spark.read
    .text(unresolvedPath)
    .as[String]
    .map(line => mapper.readValue(line, classOf[Result]))
  val unresolvedById: Dataset[(String, Result)] =
    unresolvedResults.map(res => (res.getId, res))(Encoders.tuple(Encoders.STRING, resEncoder))

  val sameIdentifier = resolvedPids("_2").equalTo(unresolvedById("_1"))

  val resolved = resolvedPids
    .joinWith(unresolvedById, sameIdentifier, "inner")
    .map { joined =>
      val resolvedResult = joined._2._2
      val dnetId = joined._1._1
      // Replace the id of the result with the first element of the matching pair.
      resolvedResult.setId(dnetId)
      resolvedResult
    }

  resolved.write.mode(SaveMode.Overwrite).save(s"$workingPath/resolvedEntities")
}
def deserializeObject(input: String, entity: EntityType): Result = {
@ -74,18 +92,32 @@ object SparkResolveEntities {
}
}
def generateResolvedEntities(spark: SparkSession, workingPath: String, graphBasePath: String, targetPath: String) = {
def generateResolvedEntities(
spark: SparkSession,
workingPath: String,
graphBasePath: String,
targetPath: String
) = {
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
import spark.implicits._
val re: Dataset[(String, Result)] = spark.read.load(s"$workingPath/resolvedEntities").as[Result].map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
entities.foreach {
e => {
val re: Dataset[(String, Result)] = spark.read
.load(s"$workingPath/resolvedEntities")
.as[Result]
.map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
entities.foreach { e =>
{
val currentEntityDataset: Dataset[(String, Result)] = spark.read.text(s"$graphBasePath/$e").as[String].map(s => deserializeObject(s, e)).map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
val currentEntityDataset: Dataset[(String, Result)] = spark.read
.text(s"$graphBasePath/$e")
.as[String]
.map(s => deserializeObject(s, e))
.map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
currentEntityDataset.joinWith(re, currentEntityDataset("_1").equalTo(re("_1")), "left").map(k => {
currentEntityDataset
.joinWith(re, currentEntityDataset("_1").equalTo(re("_1")), "left")
.map(k => {
val a = k._1
val b = k._2
@ -95,11 +127,14 @@ object SparkResolveEntities {
a._2.mergeFrom(b._2)
a._2
}
}).map(r => mapper.writeValueAsString(r))(Encoders.STRING)
.write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$targetPath/$e")
})
.map(r => mapper.writeValueAsString(r))(Encoders.STRING)
.write
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.text(s"$targetPath/$e")
}
}
}
}

View File

@ -17,18 +17,25 @@ import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}
object SparkResolveRelation {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/resolution/resolve_relations_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/resolution/resolve_relations_params.json"
)
)
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
val graphBasePath = parser.get("graphBasePath")
log.info(s"graphBasePath -> $graphBasePath")
@ -41,7 +48,6 @@ object SparkResolveRelation {
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
import spark.implicits._
//CLEANING TEMPORARY FOLDER
HdfsSupport.remove(workingPath, spark.sparkContext.hadoopConfiguration)
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
@ -51,28 +57,36 @@ object SparkResolveRelation {
val mapper: ObjectMapper = new ObjectMapper()
val rPid: Dataset[(String, String)] = spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]
val rPid: Dataset[(String, String)] =
spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]
val relationDs: Dataset[(String, Relation)] = spark.read.text(s"$graphBasePath/relation").as[String]
.map(s => mapper.readValue(s, classOf[Relation])).as[Relation]
val relationDs: Dataset[(String, Relation)] = spark.read
.text(s"$graphBasePath/relation")
.as[String]
.map(s => mapper.readValue(s, classOf[Relation]))
.as[Relation]
.map(r => (r.getSource.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
relationDs.joinWith(rPid, relationDs("_1").equalTo(rPid("_2")), "left").map {
m =>
relationDs
.joinWith(rPid, relationDs("_1").equalTo(rPid("_2")), "left")
.map { m =>
val sourceResolved = m._2
val currentRelation = m._1._2
if (sourceResolved != null && sourceResolved._1 != null && sourceResolved._1.nonEmpty)
currentRelation.setSource(sourceResolved._1)
currentRelation
}.write
}
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/relationResolvedSource")
val relationSourceResolved: Dataset[(String, Relation)] = spark.read.load(s"$workingPath/relationResolvedSource").as[Relation]
val relationSourceResolved: Dataset[(String, Relation)] = spark.read
.load(s"$workingPath/relationResolvedSource")
.as[Relation]
.map(r => (r.getTarget.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
relationSourceResolved.joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_2")), "left").map {
m =>
relationSourceResolved
.joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_2")), "left")
.map { m =>
val targetResolved = m._2
val currentRelation = m._1._2
if (targetResolved != null && targetResolved._1.nonEmpty)
@ -83,7 +97,9 @@ object SparkResolveRelation {
.mode(SaveMode.Overwrite)
.save(s"$workingPath/relation_resolved")
spark.read.load(s"$workingPath/relation_resolved").as[Relation]
spark.read
.load(s"$workingPath/relation_resolved")
.as[Relation]
.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved"))
.map(r => mapper.writeValueAsString(r))
.write
@ -107,7 +123,6 @@ object SparkResolveRelation {
}
def extractPidsFromRecord(input: String): (String, List[(String, String)]) = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
@ -122,7 +137,6 @@ object SparkResolveRelation {
(id, result)
}
private def isRelation(input: String): Boolean = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -132,20 +146,25 @@ object SparkResolveRelation {
source != null
}
def extractPidResolvedTableFromJsonRDD(spark: SparkSession, graphPath: String, workingPath: String) = {
def extractPidResolvedTableFromJsonRDD(
spark: SparkSession,
graphPath: String,
workingPath: String
) = {
import spark.implicits._
val d: RDD[(String, String)] = spark.sparkContext.textFile(s"$graphPath/*")
val d: RDD[(String, String)] = spark.sparkContext
.textFile(s"$graphPath/*")
.filter(i => !isRelation(i))
.map(i => extractPidsFromRecord(i))
.filter(s => s != null && s._1 != null && s._2 != null && s._2.nonEmpty)
.flatMap { p =>
p._2.map(pid =>
(p._1, DHPUtils.generateUnresolvedIdentifier(pid._1, pid._2))
)
}.filter(r => r._1 != null || r._2 != null)
p._2.map(pid => (p._1, DHPUtils.generateUnresolvedIdentifier(pid._1, pid._2)))
}
.filter(r => r._1 != null || r._2 != null)
spark.createDataset(d)
spark
.createDataset(d)
.groupByKey(_._2)
.reduceGroups((x, y) => if (x._1.startsWith("50|doi") || x._1.startsWith("50|pmid")) x else y)
.map(s => s._2)

View File

@ -7,24 +7,26 @@ import org.apache.spark.sql.SparkSession
/** Spark application that parses its arguments and opens a Spark session.
  *
  * As written, `main` performs no processing after the session is created.
  */
object SparkDataciteToOAF {

  /** Parses the arguments against `datacite_to_df_params.json` and creates the
    * Spark session using the `master` argument.
    *
    * @param args command line arguments
    */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()

    val paramsJson = IOUtils.toString(
      getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/datacite_to_df_params.json")
    )
    val parser = new ArgumentApplicationParser(paramsJson)
    parser.parseArgument(args)

    val sessionBuilder = SparkSession.builder().config(conf).appName(getClass.getSimpleName)
    val spark: SparkSession = sessionBuilder.master(parser.get("master")).getOrCreate()

    // NOTE(review): both values are read but never used afterwards — confirm
    // whether the conversion logic is still to be implemented here.
    val sc = spark.sparkContext
    val inputPath = parser.get("inputPath")
  }
}

View File

@ -11,18 +11,22 @@ import org.slf4j.{Logger, LoggerFactory}
object SparkConvertDatasetToJsonRDD {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")
)
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
@ -33,9 +37,13 @@ object SparkConvertDatasetToJsonRDD {
val mapper = new ObjectMapper()
implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
resultObject.foreach { item =>
spark.read.load(s"$sourcePath/$item").as[Result].map(r => mapper.writeValueAsString(r))(Encoders.STRING).rdd.saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec])
spark.read
.load(s"$sourcePath/$item")
.as[Result]
.map(r => mapper.writeValueAsString(r))(Encoders.STRING)
.rdd
.saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec])
}
}

View File

@ -15,14 +15,19 @@ object SparkConvertObjectToJson {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_object_json_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_object_json_params.json")
)
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
@ -33,12 +38,9 @@ object SparkConvertObjectToJson {
val scholixUpdatePath = parser.get("scholixUpdatePath")
log.info(s"scholixUpdatePath -> $scholixUpdatePath")
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
val mapper = new ObjectMapper
objectType.toLowerCase match {
@ -46,11 +48,18 @@ object SparkConvertObjectToJson {
log.info("Serialize Scholix")
val d: Dataset[Scholix] = spark.read.load(sourcePath).as[Scholix]
val u: Dataset[Scholix] = spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix]
d.union(u).repartition(8000).map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.saveAsTextFile(targetPath, classOf[GzipCodec])
d.union(u)
.repartition(8000)
.map(s => mapper.writeValueAsString(s))(Encoders.STRING)
.rdd
.saveAsTextFile(targetPath, classOf[GzipCodec])
case "summary" =>
log.info("Serialize Summary")
val d: Dataset[ScholixSummary] = spark.read.load(sourcePath).as[ScholixSummary]
d.map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.repartition(1000).saveAsTextFile(targetPath, classOf[GzipCodec])
d.map(s => mapper.writeValueAsString(s))(Encoders.STRING)
.rdd
.repartition(1000)
.saveAsTextFile(targetPath, classOf[GzipCodec])
}
}

View File

@ -7,21 +7,26 @@ import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
object SparkConvertRDDtoDataset {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")
)
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
@ -34,40 +39,76 @@ object SparkConvertRDDtoDataset {
implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
implicit val relationEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct])
implicit val orpEncoder: Encoder[OtherResearchProduct] =
Encoders.kryo(classOf[OtherResearchProduct])
implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software])
log.info("Converting dataset")
val rddDataset =spark.sparkContext.textFile(s"$sourcePath/dataset").map(s => mapper.readValue(s, classOf[OafDataset])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false)
spark.createDataset(rddDataset).as[OafDataset].write.mode(SaveMode.Overwrite).save(s"$entityPath/dataset")
val rddDataset = spark.sparkContext
.textFile(s"$sourcePath/dataset")
.map(s => mapper.readValue(s, classOf[OafDataset]))
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
spark
.createDataset(rddDataset)
.as[OafDataset]
.write
.mode(SaveMode.Overwrite)
.save(s"$entityPath/dataset")
log.info("Converting publication")
val rddPublication =spark.sparkContext.textFile(s"$sourcePath/publication").map(s => mapper.readValue(s, classOf[Publication])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false)
spark.createDataset(rddPublication).as[Publication].write.mode(SaveMode.Overwrite).save(s"$entityPath/publication")
val rddPublication = spark.sparkContext
.textFile(s"$sourcePath/publication")
.map(s => mapper.readValue(s, classOf[Publication]))
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
spark
.createDataset(rddPublication)
.as[Publication]
.write
.mode(SaveMode.Overwrite)
.save(s"$entityPath/publication")
log.info("Converting software")
val rddSoftware =spark.sparkContext.textFile(s"$sourcePath/software").map(s => mapper.readValue(s, classOf[Software])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false)
spark.createDataset(rddSoftware).as[Software].write.mode(SaveMode.Overwrite).save(s"$entityPath/software")
val rddSoftware = spark.sparkContext
.textFile(s"$sourcePath/software")
.map(s => mapper.readValue(s, classOf[Software]))
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
spark
.createDataset(rddSoftware)
.as[Software]
.write
.mode(SaveMode.Overwrite)
.save(s"$entityPath/software")
log.info("Converting otherresearchproduct")
val rddOtherResearchProduct =spark.sparkContext.textFile(s"$sourcePath/otherresearchproduct").map(s => mapper.readValue(s, classOf[OtherResearchProduct])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false)
spark.createDataset(rddOtherResearchProduct).as[OtherResearchProduct].write.mode(SaveMode.Overwrite).save(s"$entityPath/otherresearchproduct")
val rddOtherResearchProduct = spark.sparkContext
.textFile(s"$sourcePath/otherresearchproduct")
.map(s => mapper.readValue(s, classOf[OtherResearchProduct]))
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
spark
.createDataset(rddOtherResearchProduct)
.as[OtherResearchProduct]
.write
.mode(SaveMode.Overwrite)
.save(s"$entityPath/otherresearchproduct")
log.info("Converting Relation")
val relationSemanticFilter = List(
"cites",
"iscitedby",
"merges",
"ismergedin",
"HasAmongTopNSimilarDocuments",
"IsAmongTopNSimilarDocuments"
)
val relationSemanticFilter = List("cites", "iscitedby","merges", "ismergedin", "HasAmongTopNSimilarDocuments","IsAmongTopNSimilarDocuments" )
val rddRelation =spark.sparkContext.textFile(s"$sourcePath/relation")
val rddRelation = spark.sparkContext
.textFile(s"$sourcePath/relation")
.map(s => mapper.readValue(s, classOf[Relation]))
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
.filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
.filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
}
}

View File

@ -13,82 +13,131 @@ object SparkCreateInputGraph {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/extract_entities_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/extract_entities_params.json")
)
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
val resultObject = List(
("publication", classOf[Publication]),
("dataset", classOf[OafDataset]),
("software", classOf[Software]),
("otherResearchProduct", classOf[OtherResearchProduct])
)
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software])
implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct])
implicit val orpEncoder: Encoder[OtherResearchProduct] =
Encoders.kryo(classOf[OtherResearchProduct])
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
val oafDs: Dataset[Oaf] = spark.read.load(s"$sourcePath/*").as[Oaf]
log.info("Extract Publication")
oafDs.filter(o => o.isInstanceOf[Publication]).map(p => p.asInstanceOf[Publication]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/publication")
oafDs
.filter(o => o.isInstanceOf[Publication])
.map(p => p.asInstanceOf[Publication])
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/extracted/publication")
log.info("Extract dataset")
oafDs.filter(o => o.isInstanceOf[OafDataset]).map(p => p.asInstanceOf[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/dataset")
oafDs
.filter(o => o.isInstanceOf[OafDataset])
.map(p => p.asInstanceOf[OafDataset])
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/extracted/dataset")
log.info("Extract software")
oafDs.filter(o => o.isInstanceOf[Software]).map(p => p.asInstanceOf[Software]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/software")
oafDs
.filter(o => o.isInstanceOf[Software])
.map(p => p.asInstanceOf[Software])
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/extracted/software")
log.info("Extract otherResearchProduct")
oafDs.filter(o => o.isInstanceOf[OtherResearchProduct]).map(p => p.asInstanceOf[OtherResearchProduct]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/otherResearchProduct")
oafDs
.filter(o => o.isInstanceOf[OtherResearchProduct])
.map(p => p.asInstanceOf[OtherResearchProduct])
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/extracted/otherResearchProduct")
log.info("Extract Relation")
oafDs.filter(o => o.isInstanceOf[Relation]).map(p => p.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/relation")
oafDs
.filter(o => o.isInstanceOf[Relation])
.map(p => p.asInstanceOf[Relation])
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/extracted/relation")
resultObject.foreach { r =>
log.info(s"Make ${r._1} unique")
makeDatasetUnique(s"$targetPath/extracted/${r._1}", s"$targetPath/preprocess/${r._1}", spark, r._2)
makeDatasetUnique(
s"$targetPath/extracted/${r._1}",
s"$targetPath/preprocess/${r._1}",
spark,
r._2
)
}
}
/** Writes to `targetPath` the elements of `oafDs` that are instances of `clazz`.
  *
  * The runtime class object is used for both the type test and the cast:
  * `T` is erased at runtime, so `isInstanceOf[T]` / `asInstanceOf[T]` would only
  * check against the upper bound `Oaf` and let every record through.
  *
  * @param oafDs      the dataset of generic Oaf entities to select from
  * @param targetPath the output location, overwritten if it already exists
  * @param clazz      the concrete class to keep; also used to build the kryo encoder
  * @param log        logger that receives the "Extract <SimpleName>" message
  */
def extractEntities[T <: Oaf](
  oafDs: Dataset[Oaf],
  targetPath: String,
  clazz: Class[T],
  log: Logger
): Unit = {
  implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz)
  log.info(s"Extract ${clazz.getSimpleName}")
  oafDs
    // Class#isInstance performs the real runtime check (and is false for null).
    .filter(o => clazz.isInstance(o))
    .map(o => clazz.cast(o))
    .write
    .mode(SaveMode.Overwrite)
    .save(targetPath)
}
/** Collapses the records stored at `sourcePath` to one per identifier and
  * overwrites `targetPath` with the result.
  *
  * Records are grouped by `getId`; within a group they are folded pairwise by
  * calling `mergeFrom` on the left element and keeping it.
  *
  * @param sourcePath the location of the dataset to load
  * @param targetPath the output location, overwritten if it already exists
  * @param spark      the active session, also the source of the implicit key encoder
  * @param clazz      the concrete record class, used to build the kryo encoder
  */
def makeDatasetUnique[T <: Result](
  sourcePath: String,
  targetPath: String,
  spark: SparkSession,
  clazz: Class[T]
): Unit = {
  import spark.implicits._
  implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz)

  val records: Dataset[T] = spark.read.load(sourcePath).as[T]

  // One (id, merged record) pair per distinct identifier.
  val mergedById = records
    .groupByKey(record => record.getId)
    .reduceGroups { (left, right) =>
      left.mergeFrom(right)
      left
    }

  mergedById
    .map(pair => pair._2)
    .write
    .mode(SaveMode.Overwrite)
    .save(targetPath)
}

View File

@ -17,14 +17,19 @@ object SparkCreateScholix {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_scholix_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_scholix_params.json")
)
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
val relationPath = parser.get("relationPath")
log.info(s"relationPath -> $relationPath")
@ -33,37 +38,46 @@ object SparkCreateScholix {
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
import spark.implicits._
val relationDS: Dataset[(String, Relation)] = spark.read.load(relationPath).as[Relation]
.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
val relationDS: Dataset[(String, Relation)] = spark.read
.load(relationPath)
.as[Relation]
.filter(r =>
(r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase
.contains("merge")
)
.map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder))
val summaryDS: Dataset[(String, ScholixSummary)] = spark.read.load(summaryPath).as[ScholixSummary]
val summaryDS: Dataset[(String, ScholixSummary)] = spark.read
.load(summaryPath)
.as[ScholixSummary]
.map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, summaryEncoder))
relationDS.joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left")
relationDS
.joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left")
.map { input: ((String, Relation), (String, ScholixSummary)) =>
if (input._1 != null && input._2 != null) {
val rel: Relation = input._1._2
val source: ScholixSummary = input._2._2
(rel.getTarget, ScholixUtils.scholixFromSource(rel, source))
}
else null
} else null
}(Encoders.tuple(Encoders.STRING, scholixEncoder))
.filter(r => r != null)
.write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_from_source")
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/scholix_from_source")
val scholixSource: Dataset[(String, Scholix)] = spark.read.load(s"$targetPath/scholix_from_source").as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder))
val scholixSource: Dataset[(String, Scholix)] = spark.read
.load(s"$targetPath/scholix_from_source")
.as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder))
scholixSource.joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left")
scholixSource
.joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left")
.map { input: ((String, Scholix), (String, ScholixSummary)) =>
if (input._2 == null) {
null
@ -72,40 +86,73 @@ object SparkCreateScholix {
val target: ScholixSummary = input._2._2
ScholixUtils.generateCompleteScholix(s, target)
}
}.filter(s => s != null).write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_one_verse")
}
.filter(s => s != null)
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/scholix_one_verse")
val scholix_o_v: Dataset[Scholix] =
spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix]
val scholix_o_v: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix]
scholix_o_v.flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s))).as[Scholix]
scholix_o_v
.flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s)))
.as[Scholix]
.map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, scholixEncoder))
.groupByKey(_._1)
.agg(ScholixUtils.scholixAggregator.toColumn)
.map(s => s._2)
.write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix")
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/scholix")
val scholix_final: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix").as[Scholix]
val stats: Dataset[(String, String, Long)] = scholix_final.map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType)).groupBy("_1", "_2").agg(count("_1")).as[(String, String, Long)]
val stats: Dataset[(String, String, Long)] = scholix_final
.map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType))
.groupBy("_1", "_2")
.agg(count("_1"))
.as[(String, String, Long)]
stats
.map(s => RelatedEntities(s._1, if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0, if ("publication".equalsIgnoreCase(s._2)) s._3 else 0))
.map(s =>
RelatedEntities(
s._1,
if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0,
if ("publication".equalsIgnoreCase(s._2)) s._3 else 0
)
)
.groupByKey(_.id)
.reduceGroups((a, b) => RelatedEntities(a.id, a.relatedDataset + b.relatedDataset, a.relatedPublication + b.relatedPublication))
.reduceGroups((a, b) =>
RelatedEntities(
a.id,
a.relatedDataset + b.relatedDataset,
a.relatedPublication + b.relatedPublication
)
)
.map(_._2)
.write.mode(SaveMode.Overwrite).save(s"$targetPath/related_entities")
.write
.mode(SaveMode.Overwrite)
.save(s"$targetPath/related_entities")
val relatedEntitiesDS: Dataset[RelatedEntities] = spark.read.load(s"$targetPath/related_entities").as[RelatedEntities].filter(r => r.relatedPublication > 0 || r.relatedDataset > 0)
val relatedEntitiesDS: Dataset[RelatedEntities] = spark.read
.load(s"$targetPath/related_entities")
.as[RelatedEntities]
.filter(r => r.relatedPublication > 0 || r.relatedDataset > 0)
relatedEntitiesDS.joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner").map { i =>
relatedEntitiesDS
.joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner")
.map { i =>
val re = i._1
val sum = i._2._2
sum.setRelatedDatasets(re.relatedDataset)
sum.setRelatedPublications(re.relatedPublication)
sum
}.write.mode(SaveMode.Overwrite).save(s"${summaryPath}_filtered")
}
.write
.mode(SaveMode.Overwrite)
.save(s"${summaryPath}_filtered")
}
}

View File

@ -14,14 +14,19 @@ object SparkCreateSummaryObject {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_summaries_params.json")))
val parser = new ArgumentApplicationParser(
IOUtils.toString(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_summaries_params.json")
)
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
@ -33,10 +38,17 @@ object SparkCreateSummaryObject {
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
val ds: Dataset[Result] = spark.read
.load(s"$sourcePath/*")
.as[Result]
.filter(r => r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false)
val ds: Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Result].filter(r => r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false)
ds.repartition(6000).map(r => ScholixUtils.resultToSummary(r)).filter(s => s != null).write.mode(SaveMode.Overwrite).save(targetPath)
ds.repartition(6000)
.map(r => ScholixUtils.resultToSummary(r))
.filter(s => s != null)
.write
.mode(SaveMode.Overwrite)
.save(targetPath)
}

View File

@ -10,13 +10,23 @@ import java.util.regex.Pattern
import scala.language.postfixOps
import scala.xml.{Elem, Node, XML}
/** Value type returned by `PangaeaUtils.toDataset` and `PangaeaUtils.parseXml`.
  *
  * @param relation in `parseXml` this holds the output of `findDOIInRelation`
  * @param linkage  pairs of (value of the `type` attribute, element text), as
  *                 produced by `extractLinkage`
  */
case class PangaeaDataModel(
  identifier: String,
  title: List[String],
  objectType: List[String],
  creator: List[String],
  publisher: List[String],
  dataCenter: List[String],
  subject: List[String],
  language: String,
  rights: String,
  parent: String,
  relation: List[String],
  linkage: List[(String, String)]
)
object PangaeaUtils {
def toDataset(input: String): PangaeaDataModel = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
@ -26,20 +36,25 @@ object PangaeaUtils {
/** Returns, for every entry of `input` that contains a match of the DOI pattern,
  * the first matching substring; entries without a match are dropped.
  *
  * The result keeps the order of `input` and contains at most one element per entry.
  *
  * @param input the strings to scan
  * @return the first pattern match of each entry that has one
  */
def findDOIInRelation(input: List[String]): List[String] = {
  val pattern = Pattern.compile("\\b(10[.][0-9]{4,}(?:[.][0-9]+)*\\/(?:(?![\"&\\'<>])\\S)+)\\b")
  // Option + flatMap instead of emitting null and filtering it out afterwards.
  input.flatMap { relation =>
    val matcher = pattern.matcher(relation)
    if (matcher.find()) Some(matcher.group(0)) else None
  }
}
/** Returns the text of the first node stored under the attribute named
  * `attribute` on `node`, or `None` when the attribute is absent or empty.
  */
def attributeOpt(attribute: String, node: Node): Option[String] =
  for {
    values <- node.attribute(attribute)
    first  <- values.headOption
  } yield first.text
/** Collects the `linkage` children of `node` that carry a `type` attribute,
  * as pairs of (attribute value, element text), in document order.
  */
def extractLinkage(node: Elem): List[(String, String)] = {
  val linkageNodes = (node \ "linkage").toList
  // Children without a `type` attribute yield None and are dropped by flatMap.
  linkageNodes.flatMap { child =>
    val text = child.text
    attributeOpt("type", child).map(kind => (kind, text))
  }
}
def parseXml(input: String): PangaeaDataModel = {
@ -59,12 +74,24 @@ object PangaeaUtils {
val relationFiltered = findDOIInRelation(relation)
val linkage: List[(String, String)] = extractLinkage(xml)
PangaeaDataModel(identifier,title, pType, creators,publisher, dataCenter, subject, language, rights, parentIdentifier, relationFiltered, linkage)
PangaeaDataModel(
identifier,
title,
pType,
creators,
publisher,
dataCenter,
subject,
language,
rights,
parentIdentifier,
relationFiltered,
linkage
)
}
def getDatasetAggregator(): Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] = new Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel]{
def getDatasetAggregator(): Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] =
new Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] {
override def zero: PangaeaDataModel = null
@ -106,7 +133,4 @@ object PangaeaUtils {
override def outputEncoder: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
}
}

View File

@ -11,20 +11,25 @@ import scala.io.Source
object SparkGeneratePanagaeaDataset {
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/pangaea/pangaea_to_dataset.json")).mkString)
val parser = new ArgumentApplicationParser(
Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/pangaea/pangaea_to_dataset.json")
)
.mkString
)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(SparkGeneratePanagaeaDataset.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
.master(parser.get("master"))
.getOrCreate()
parser.getObjectMap.asScala.foreach(s => logger.info(s"${s._1} -> ${s._2}"))
logger.info("Converting sequential file into Dataset")
@ -34,16 +39,20 @@ object SparkGeneratePanagaeaDataset {
implicit val pangaeaEncoders: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
val inputRDD: RDD[PangaeaDataModel] = sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s))
val inputRDD: RDD[PangaeaDataModel] =
sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s))
spark.createDataset(inputRDD).as[PangaeaDataModel]
spark
.createDataset(inputRDD)
.as[PangaeaDataModel]
.map(s => (s.identifier, s))(Encoders.tuple(Encoders.STRING, pangaeaEncoders))
.groupByKey(_._1)(Encoders.STRING)
.agg(PangaeaUtils.getDatasetAggregator().toColumn)
.map(s => s._2)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset")
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/dataset")
}
}

View File

@ -30,10 +30,10 @@ class TestApply extends java.io.Serializable{
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
implicit val mapEncoderPubInfo: Encoder[Publication] = Encoders.bean(classOf[Publication])
val pub_ds :Dataset[Publication] = spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[Publication]))
val hbm_ds :Dataset[EntityInfo] = spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
val pub_ds: Dataset[Publication] =
spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[Publication]))
val hbm_ds: Dataset[EntityInfo] =
spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
assertEquals(13, pub_ds.count())
@ -41,7 +41,8 @@ class TestApply extends java.io.Serializable{
assertEquals(13, ds.count)
val temp: Dataset[(Publication, Publication)] = pub_ds.joinWith(ds, pub_ds.col("id").equalTo(ds.col("id")), "left")
val temp: Dataset[(Publication, Publication)] =
pub_ds.joinWith(ds, pub_ds.col("id").equalTo(ds.col("id")), "left")
assertEquals(13, temp.count())
temp.foreach(t2 => {
val pb: Publication = t2._1
@ -50,17 +51,36 @@ class TestApply extends java.io.Serializable{
assertEquals(1, pb.getInstance().size())
assertTrue(t2._1.getId.equals(t2._2.getId))
if (pb.getId.equals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9")) {
assertTrue(pa.getInstance().get(0).getHostedby.getKey.equals("10|issn___print::e4b6d6d978f67520f6f37679a98c5735"))
assertTrue(
pa.getInstance()
.get(0)
.getHostedby
.getKey
.equals("10|issn___print::e4b6d6d978f67520f6f37679a98c5735")
)
assertTrue(pa.getInstance().get(0).getHostedby.getValue.equals("Academic Therapy"))
assertTrue(pa.getInstance().get(0).getAccessright.getClassid.equals("OPEN"))
assertTrue(pa.getInstance().get(0).getAccessright.getClassname.equals("Open Access"))
assertTrue(pa.getInstance().get(0).getAccessright.getOpenAccessRoute.equals(OpenAccessRoute.gold))
assertTrue(
pa.getInstance().get(0).getAccessright.getOpenAccessRoute.equals(OpenAccessRoute.gold)
)
assertTrue(pa.getBestaccessright.getClassid.equals("OPEN"))
assertTrue(pa.getBestaccessright.getClassname.equals("Open Access"))
assertTrue(pb.getInstance().get(0).getHostedby.getKey.equals("10|openaire____::0b74b6a356bbf23c245f9ae9a748745c"))
assertTrue(pb.getInstance().get(0).getHostedby.getValue.equals("Revistas de investigación Universidad Nacional Mayor de San Marcos"))
assertTrue(
pb.getInstance()
.get(0)
.getHostedby
.getKey
.equals("10|openaire____::0b74b6a356bbf23c245f9ae9a748745c")
)
assertTrue(
pb.getInstance()
.get(0)
.getHostedby
.getValue
.equals("Revistas de investigación Universidad Nacional Mayor de San Marcos")
)
assertTrue(pb.getInstance().get(0).getAccessright.getClassname.equals("not available"))
assertTrue(pb.getInstance().get(0).getAccessright.getClassid.equals("UNKNOWN"))
assertTrue(pb.getInstance().get(0).getAccessright.getOpenAccessRoute == null)
@ -68,11 +88,41 @@ class TestApply extends java.io.Serializable{
assertTrue(pb.getBestaccessright.getClassname.equals("not available"))
} else {
assertTrue(pa.getInstance().get(0).getHostedby.getKey.equals(pb.getInstance().get(0).getHostedby.getKey))
assertTrue(pa.getInstance().get(0).getHostedby.getValue.equals(pb.getInstance().get(0).getHostedby.getValue))
assertTrue(pa.getInstance().get(0).getAccessright.getClassid.equals(pb.getInstance().get(0).getAccessright.getClassid))
assertTrue(pa.getInstance().get(0).getAccessright.getClassname.equals(pb.getInstance().get(0).getAccessright.getClassname))
assertTrue(pa.getInstance().get(0).getAccessright.getOpenAccessRoute == pb.getInstance().get(0).getAccessright.getOpenAccessRoute)
assertTrue(
pa.getInstance()
.get(0)
.getHostedby
.getKey
.equals(pb.getInstance().get(0).getHostedby.getKey)
)
assertTrue(
pa.getInstance()
.get(0)
.getHostedby
.getValue
.equals(pb.getInstance().get(0).getHostedby.getValue)
)
assertTrue(
pa.getInstance()
.get(0)
.getAccessright
.getClassid
.equals(pb.getInstance().get(0).getAccessright.getClassid)
)
assertTrue(
pa.getInstance()
.get(0)
.getAccessright
.getClassname
.equals(pb.getInstance().get(0).getAccessright.getClassname)
)
assertTrue(
pa.getInstance().get(0).getAccessright.getOpenAccessRoute == pb
.getInstance()
.get(0)
.getAccessright
.getOpenAccessRoute
)
}
})
@ -80,7 +130,6 @@ class TestApply extends java.io.Serializable{
spark.close()
}
@Test
def testApplyOnDatasource(): Unit = {
val conf = new SparkConf()
@ -100,10 +149,11 @@ class TestApply extends java.io.Serializable{
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
implicit val mapEncoderPubInfo: Encoder[Datasource] = Encoders.bean(classOf[Datasource])
val dats_ds :Dataset[Datasource] = spark.read.textFile(dats).map(p => mapper.readValue(p, classOf[Datasource]))
val hbm_ds :Dataset[EntityInfo] = Aggregators.datasourceToSingleId(spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo])))
val dats_ds: Dataset[Datasource] =
spark.read.textFile(dats).map(p => mapper.readValue(p, classOf[Datasource]))
val hbm_ds: Dataset[EntityInfo] = Aggregators.datasourceToSingleId(
spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
)
assertEquals(10, dats_ds.count())
@ -111,7 +161,8 @@ class TestApply extends java.io.Serializable{
assertEquals(10, ds.count)
val temp: Dataset[(Datasource, Datasource)] = dats_ds.joinWith(ds, dats_ds.col("id").equalTo(ds.col("id")), "left")
val temp: Dataset[(Datasource, Datasource)] =
dats_ds.joinWith(ds, dats_ds.col("id").equalTo(ds.col("id")), "left")
assertEquals(10, temp.count())
temp.foreach(t2 => {
val pb: Datasource = t2._1
@ -119,14 +170,23 @@ class TestApply extends java.io.Serializable{
assertTrue(t2._1.getId.equals(t2._2.getId))
if (pb.getId.equals("10|doajarticles::0ab37b7620eb9a73ac95d3ca4320c97d")) {
assertTrue(pa.getOpenairecompatibility().getClassid.equals("hostedBy"))
assertTrue(pa.getOpenairecompatibility().getClassname.equals("collected from a compatible aggregator"))
assertTrue(
pa.getOpenairecompatibility()
.getClassname
.equals("collected from a compatible aggregator")
)
assertTrue(pb.getOpenairecompatibility().getClassid.equals(ModelConstants.UNKNOWN))
} else {
assertTrue(pa.getOpenairecompatibility().getClassid.equals(pb.getOpenairecompatibility.getClassid))
assertTrue(pa.getOpenairecompatibility().getClassname.equals(pb.getOpenairecompatibility.getClassname))
assertTrue(
pa.getOpenairecompatibility().getClassid.equals(pb.getOpenairecompatibility.getClassid)
)
assertTrue(
pa.getOpenairecompatibility()
.getClassname
.equals(pb.getOpenairecompatibility.getClassname)
)
}
})

View File

@ -19,7 +19,6 @@ class TestPrepare extends java.io.Serializable{
write(input)
}
@Test
def testHostedByMaptoEntityInfo(): Unit = {
val conf = new SparkConf()
@ -33,14 +32,14 @@ class TestPrepare extends java.io.Serializable{
.getOrCreate()
val hbm = getClass.getResource("hostedbymap.json").getPath
import spark.implicits._
val mapper: ObjectMapper = new ObjectMapper()
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
val ds :Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hbm)).map(toEntityInfo)
val ds: Dataset[EntityInfo] =
spark.createDataset(spark.sparkContext.textFile(hbm)).map(toEntityInfo)
ds.foreach(e => println(mapper.writeValueAsString(e)))
@ -71,8 +70,14 @@ class TestPrepare extends java.io.Serializable{
assertEquals(2, ds.count)
assertEquals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9", ds.filter(ei => ei.getJournalId.equals("1728-5852")).first().getId)
assertEquals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9", ds.filter(ei => ei.getJournalId.equals("0001-396X")).first().getId)
assertEquals(
"50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9",
ds.filter(ei => ei.getJournalId.equals("1728-5852")).first().getId
)
assertEquals(
"50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9",
ds.filter(ei => ei.getJournalId.equals("0001-396X")).first().getId
)
spark.close()
}
@ -95,8 +100,10 @@ class TestPrepare extends java.io.Serializable{
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
val pub_ds :Dataset[EntityInfo] = spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
val hbm_ds :Dataset[EntityInfo] = spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
val pub_ds: Dataset[EntityInfo] =
spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
val hbm_ds: Dataset[EntityInfo] =
spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
val ds: Dataset[EntityInfo] = joinResHBM(pub_ds, hbm_ds)
@ -131,8 +138,10 @@ class TestPrepare extends java.io.Serializable{
implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
val pub_ds :Dataset[EntityInfo] = spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
val hbm_ds :Dataset[EntityInfo] = spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
val pub_ds: Dataset[EntityInfo] =
spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
val hbm_ds: Dataset[EntityInfo] =
spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
val ds: Dataset[EntityInfo] = joinResHBM(pub_ds, hbm_ds)
@ -150,6 +159,4 @@ class TestPrepare extends java.io.Serializable{
spark.close()
}
}

View File

@ -13,7 +13,6 @@ class TestPreprocess extends java.io.Serializable{
implicit val mapEncoderDats: Encoder[Datasource] = Encoders.kryo[Datasource]
implicit val schema = Encoders.product[HostedByInfo]
def toHBIString(hbi: HostedByItemType): String = {
implicit val formats = DefaultFormats
@ -41,19 +40,30 @@ class TestPreprocess extends java.io.Serializable{
assertEquals(5, ds.filter(hbi => !hbi.eissn.equals("")).count)
assertEquals(0, ds.filter(hbi => !hbi.lissn.equals("")).count)
assertEquals(0, ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count)
assertEquals(
0,
ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count
)
assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365")).count == 1)
assertTrue(ds.filter(hbi => hbi.eissn.equals("2253-900X")).count == 1)
assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.eissn.equals("2253-900X")).count == 1)
assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.officialname.equals("Thémata")).count == 1)
assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.id.equals("10|doajarticles::abbc9265bea9ff62776a1c39785af00c")).count == 1)
assertTrue(
ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.eissn.equals("2253-900X")).count == 1
)
assertTrue(
ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.officialname.equals("Thémata")).count == 1
)
assertTrue(
ds.filter(hbi =>
hbi.issn.equals("0212-8365") && hbi.id
.equals("10|doajarticles::abbc9265bea9ff62776a1c39785af00c")
).count == 1
)
ds.foreach(hbi => assertTrue(hbi.id.startsWith("10|")))
ds.foreach(hbi => println(toHBIString(hbi)))
spark.close()
}
@Test
def readGold(): Unit = {
val conf = new SparkConf()
@ -67,7 +77,6 @@ class TestPreprocess extends java.io.Serializable{
.getOrCreate()
val path = getClass.getResource("unibi_transformed.json").getPath
val ds: Dataset[HostedByItemType] = SparkProduceHostedByMap.goldHostedByDataset(spark, path)
assertEquals(29, ds.count)
@ -76,9 +85,17 @@ class TestPreprocess extends java.io.Serializable{
assertEquals(0, ds.filter(hbi => !hbi.eissn.equals("")).count)
assertEquals(29, ds.filter(hbi => !hbi.lissn.equals("")).count)
assertEquals(0, ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count)
assertEquals(
0,
ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count
)
assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).first().officialname.equals("European journal of sustainable development."))
assertTrue(
ds.filter(hbi => hbi.issn.equals("2239-6101"))
.first()
.officialname
.equals("European journal of sustainable development.")
)
assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).first().lissn.equals("2239-5938"))
assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).count == 1)
ds.foreach(hbi => assertTrue(hbi.id.equals(Constants.UNIBI)))
@ -108,9 +125,17 @@ class TestPreprocess extends java.io.Serializable{
assertEquals(21, ds.filter(hbi => !hbi.eissn.equals("")).count)
assertEquals(0, ds.filter(hbi => !hbi.lissn.equals("")).count)
assertEquals(0, ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count)
assertEquals(
0,
ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count
)
assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).first().officialname.equals("Journal of Space Technology"))
assertTrue(
ds.filter(hbi => hbi.issn.equals("2077-3099"))
.first()
.officialname
.equals("Journal of Space Technology")
)
assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).first().eissn.equals("2411-5029"))
assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).count == 1)
assertTrue(ds.filter(hbi => hbi.eissn.equals("2077-2955")).first().issn.equals(""))
@ -133,20 +158,38 @@ class TestPreprocess extends java.io.Serializable{
.config(conf)
.getOrCreate()
val tmp = SparkProduceHostedByMap.oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath)
.union(SparkProduceHostedByMap.goldHostedByDataset(spark,getClass.getResource("unibi_transformed.json").getPath))
.union(SparkProduceHostedByMap.doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath))
.flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]))
val tmp = SparkProduceHostedByMap
.oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath)
.union(
SparkProduceHostedByMap
.goldHostedByDataset(spark, getClass.getResource("unibi_transformed.json").getPath)
)
.union(
SparkProduceHostedByMap
.doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath)
)
.flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(
Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
)
assertEquals(106, tmp.count)
assertEquals(82, tmp.map(i => i._1)(Encoders.STRING).distinct().count)
val ds :Dataset[(String, HostedByItemType)] = Aggregators.explodeHostedByItemType(SparkProduceHostedByMap.oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath)
.union(SparkProduceHostedByMap.goldHostedByDataset(spark,getClass.getResource("unibi_transformed.json").getPath))
.union(SparkProduceHostedByMap.doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath))
.flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])))
val ds: Dataset[(String, HostedByItemType)] = Aggregators.explodeHostedByItemType(
SparkProduceHostedByMap
.oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath)
.union(
SparkProduceHostedByMap
.goldHostedByDataset(spark, getClass.getResource("unibi_transformed.json").getPath)
)
.union(
SparkProduceHostedByMap
.doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath)
)
.flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(
Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
)
)
assertEquals(82, ds.count)
@ -156,14 +199,13 @@ class TestPreprocess extends java.io.Serializable{
assertTrue(ds.filter(i => i._1.equals("2077-3757")).first()._2.openAccess)
assertEquals(1, ds.filter(i => i._1.equals("2077-3757")).count)
val hbmap : Dataset[String] = ds.filter(hbi => hbi._2.id.startsWith("10|")).map(SparkProduceHostedByMap.toHostedByMap)(Encoders.STRING)
val hbmap: Dataset[String] = ds
.filter(hbi => hbi._2.id.startsWith("10|"))
.map(SparkProduceHostedByMap.toHostedByMap)(Encoders.STRING)
hbmap.foreach(entry => println(entry))
spark.close()
}
}

View File

@ -1,6 +1,5 @@
package eu.dnetlib.dhp.oa.graph.resolution
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.common.EntityType
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
@ -26,65 +25,86 @@ class ResolveEntitiesTest extends Serializable {
var sparkSession: Option[SparkSession] = None
/** One-time fixture setup for the whole test class.
  *
  * Creates a temporary working directory named after this class, starts a
  * SparkSession on the `local[*]` master, and then fills the working directory
  * through `populateDatasets` and `generateUpdates`.
  */
@BeforeAll
def setUp(): Unit = {
workingDir = Files.createTempDirectory(getClass.getSimpleName)
val conf = new SparkConf()
sparkSession = Some(SparkSession
sparkSession = Some(
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master("local[*]").getOrCreate())
.master("local[*]")
.getOrCreate()
)
// The session was assigned just above, so `.get` cannot fail at this point.
populateDatasets(sparkSession.get)
generateUpdates(sparkSession.get)
}
/** One-time fixture teardown for the whole test class.
  *
  * Stops the Spark session (when one was created) and then removes the
  * temporary working directory. Both steps tolerate a partially failed
  * `setUp`: a missing session or an unassigned `workingDir` is skipped instead
  * of raising a secondary exception that would hide the original failure.
  */
@AfterAll
def tearDown(): Unit = {
  // Stop Spark first so nothing is still writing under the working directory;
  // the directory is removed even if stopping the session throws.
  try sparkSession.foreach(_.stop())
  finally Option(workingDir).foreach(dir => FileUtils.deleteDirectory(dir.toFile))
}
def generateUpdates(spark: SparkSession): Unit = {
val template = Source.fromInputStream(this.getClass.getResourceAsStream("updates")).mkString
val pids:List[String] = template.lines.map{id =>
val pids: List[String] = template.lines
.map { id =>
val r = new Result
r.setId(id.toLowerCase.trim)
r.setSubject(List(OafMapperUtils.structuredProperty(FAKE_SUBJECT, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", "fossiFIgo"), null)).asJava)
r.setTitle(List(OafMapperUtils.structuredProperty(FAKE_TITLE, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", "fossiFIgo"), null)).asJava)
r.setSubject(
List(
OafMapperUtils.structuredProperty(
FAKE_SUBJECT,
OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
null
)
).asJava
)
r.setTitle(
List(
OafMapperUtils.structuredProperty(
FAKE_TITLE,
OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
null
)
).asJava
)
r
}.map{r =>
}
.map { r =>
val mapper = new ObjectMapper()
mapper.writeValueAsString(r)}.toList
mapper.writeValueAsString(r)
}
.toList
val sc = spark.sparkContext
println(sc.parallelize(pids).count())
spark.createDataset(sc.parallelize(pids))(Encoders.STRING).write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$workingDir/updates")
spark
.createDataset(sc.parallelize(pids))(Encoders.STRING)
.write
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.text(s"$workingDir/updates")
import spark.implicits._
implicit val resEncoder: Encoder[Result] = Encoders.bean(classOf[Result])
val ds = spark.read.text(s"$workingDir/updates").as[String].map{s => val mapper = new ObjectMapper()
mapper.readValue(s, classOf[Result])}.collect()
val ds = spark.read
.text(s"$workingDir/updates")
.as[String]
.map { s =>
val mapper = new ObjectMapper()
mapper.readValue(s, classOf[Result])
}
.collect()
assertEquals(4, ds.length)
ds.foreach { r => assertNotNull(r.getSubject) }
@ -92,30 +112,36 @@ class ResolveEntitiesTest extends Serializable {
ds.foreach { r => assertNotNull(r.getTitle) }
ds.foreach { r => assertEquals(1, r.getTitle.size()) }
ds.flatMap(r => r.getTitle.asScala.map(t => t.getValue)).foreach(t => assertEquals(FAKE_TITLE,t))
ds.flatMap(r => r.getSubject.asScala.map(t => t.getValue)).foreach(t => assertEquals(FAKE_SUBJECT,t))
ds.flatMap(r => r.getTitle.asScala.map(t => t.getValue))
.foreach(t => assertEquals(FAKE_TITLE, t))
ds.flatMap(r => r.getSubject.asScala.map(t => t.getValue))
.foreach(t => assertEquals(FAKE_SUBJECT, t))
println("generated Updates")
}
/** Writes the input graph used by the tests under the working directory.
  *
  * For every name in `SparkResolveEntities.entities`, the classpath resource
  * with that name is read, split into lines, and written as gzip-compressed
  * text to `$workingDir/graph/<name>`. Afterwards
  * `SparkResolveRelation.extractPidResolvedTableFromJsonRDD` is called with the
  * graph path as input and `$workingDir/work` as the second path.
  *
  * @param spark the session used to create and write the datasets
  */
def populateDatasets(spark: SparkSession): Unit = {
import spark.implicits._
val entities = SparkResolveEntities.entities
entities.foreach{
e =>
entities.foreach { e =>
// The resource is looked up relative to this class, using the entity name as file name.
val template = Source.fromInputStream(this.getClass.getResourceAsStream(s"$e")).mkString
spark.createDataset(spark.sparkContext.parallelize(template.lines.toList)).as[String].write.option("compression", "gzip").text(s"$workingDir/graph/$e")
spark
.createDataset(spark.sparkContext.parallelize(template.lines.toList))
.as[String]
.write
.option("compression", "gzip")
.text(s"$workingDir/graph/$e")
println(s"Created Dataset $e")
}
SparkResolveRelation.extractPidResolvedTableFromJsonRDD(spark, s"$workingDir/graph", s"$workingDir/work")
SparkResolveRelation.extractPidResolvedTableFromJsonRDD(
spark,
s"$workingDir/graph",
s"$workingDir/work"
)
}
@Test
def testResolution(): Unit = {
val spark: SparkSession = sparkSession.get
@ -126,16 +152,15 @@ class ResolveEntitiesTest extends Serializable {
assertEquals(3, ds.count())
ds.collect().foreach{
r =>
ds.collect().foreach { r =>
assertTrue(r.getId.startsWith("50"))
}
}
/** Tells whether at least one property in `l` has a non-null value that equals
  * `exptectedValue`, compared case-insensitively.
  *
  * @param l the structured properties to scan
  * @param exptectedValue the value to look for
  * @return true when a matching property exists, false otherwise
  */
private def structuredPContainsValue(l:java.util.List[StructuredProperty], exptectedValue:String):Boolean = {
private def structuredPContainsValue(
l: java.util.List[StructuredProperty],
exptectedValue: String
): Boolean = {
l.asScala.exists(p => p.getValue != null && p.getValue.equalsIgnoreCase(exptectedValue))
}
@ -146,47 +171,72 @@ class ResolveEntitiesTest extends Serializable {
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
val m = new ObjectMapper()
SparkResolveEntities.resolveEntities(spark, s"$workingDir/work", s"$workingDir/updates")
SparkResolveEntities.generateResolvedEntities(spark,s"$workingDir/work",s"$workingDir/graph", s"$workingDir/target" )
val pubDS:Dataset[Result] = spark.read.text(s"$workingDir/target/publication").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.publication))
val t = pubDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
SparkResolveEntities.generateResolvedEntities(
spark,
s"$workingDir/work",
s"$workingDir/graph",
s"$workingDir/target"
)
val pubDS: Dataset[Result] = spark.read
.text(s"$workingDir/target/publication")
.as[String]
.map(s => SparkResolveEntities.deserializeObject(s, EntityType.publication))
val t = pubDS
.filter(p => p.getTitle != null && p.getSubject != null)
.filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE")))
.count()
var ct = pubDS.count()
var et = pubDS.filter(p => p.getTitle!= null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)).count()
var et = pubDS
.filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty))
.count()
assertEquals(ct, et)
val datDS:Dataset[Result] = spark.read.text(s"$workingDir/target/dataset").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.dataset))
val td = datDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
val datDS: Dataset[Result] = spark.read
.text(s"$workingDir/target/dataset")
.as[String]
.map(s => SparkResolveEntities.deserializeObject(s, EntityType.dataset))
val td = datDS
.filter(p => p.getTitle != null && p.getSubject != null)
.filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE")))
.count()
ct = datDS.count()
et = datDS.filter(p => p.getTitle!= null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)).count()
et = datDS
.filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty))
.count()
assertEquals(ct, et)
val softDS:Dataset[Result] = spark.read.text(s"$workingDir/target/software").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.software))
val ts = softDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
val softDS: Dataset[Result] = spark.read
.text(s"$workingDir/target/software")
.as[String]
.map(s => SparkResolveEntities.deserializeObject(s, EntityType.software))
val ts = softDS
.filter(p => p.getTitle != null && p.getSubject != null)
.filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE")))
.count()
ct = softDS.count()
et = softDS.filter(p => p.getTitle!= null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)).count()
et = softDS
.filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty))
.count()
assertEquals(ct, et)
val orpDS:Dataset[Result] = spark.read.text(s"$workingDir/target/otherresearchproduct").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.otherresearchproduct))
val to = orpDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
val orpDS: Dataset[Result] = spark.read
.text(s"$workingDir/target/otherresearchproduct")
.as[String]
.map(s => SparkResolveEntities.deserializeObject(s, EntityType.otherresearchproduct))
val to = orpDS
.filter(p => p.getTitle != null && p.getSubject != null)
.filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE")))
.count()
ct = orpDS.count()
et = orpDS.filter(p => p.getTitle!= null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)).count()
et = orpDS
.filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty))
.count()
assertEquals(ct, et)
assertEquals(0, t)
assertEquals(2, td)
assertEquals(1, ts)
@ -194,40 +244,35 @@ class ResolveEntitiesTest extends Serializable {
}
/** Merges a publication read from the test resources into a fresh `Result`
  * that carries a single `FAKE_SUBJECT` subject, then prints the merged record
  * as JSON.
  *
  * NOTE(review): the method makes no assertion on the merged record; it only
  * fails if parsing, merging or serialization throws.
  */
@Test
def testMerge(): Unit = {
val r = new Result
r.setSubject(List(OafMapperUtils.structuredProperty(FAKE_SUBJECT, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", "fossiFIgo"), null)).asJava)
r.setSubject(
List(
OafMapperUtils.structuredProperty(
FAKE_SUBJECT,
OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"),
null
)
).asJava
)
val mapper = new ObjectMapper()
// Only the first line of the `publication` resource is deserialized.
val p = mapper.readValue(Source.fromInputStream(this.getClass.getResourceAsStream(s"publication")).mkString.lines.next(), classOf[Publication])
val p = mapper.readValue(
Source
.fromInputStream(this.getClass.getResourceAsStream(s"publication"))
.mkString
.lines
.next(),
classOf[Publication]
)
r.mergeFrom(p)
println(mapper.writeValueAsString(r))
}
}

View File

@ -1,26 +1,20 @@
package eu.dnetlib.dhp.sx.graph
import org.junit.jupiter.api.Test
import java.text.SimpleDateFormat
/** Exercises the parsing of a timestamp that carries a numeric time-zone offset. */
class RetrieveDataciteDeltaTest {

  /** Parses a fixed timestamp with the `yyyy-MM-dd'T'HH:mm:ssZ` pattern and
    * verifies the resulting epoch milliseconds.
    */
  @Test
  def testParsingDate(): Unit = {
    import org.junit.jupiter.api.Assertions.assertEquals

    val inputDate = "2021-12-02T11:17:36+0000"
    val t = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ").parse(inputDate).getTime
    println(t)
    // 2021-12-02T11:17:36 UTC. The explicit +0000 offset in the input makes the
    // parsed instant independent of the JVM default time zone.
    assertEquals(1638443856000L, t)
  }

}

View File

@ -20,7 +20,6 @@ import scala.io.Source
@ExtendWith(Array(classOf[MockitoExtension]))
class ScholixGraphTest extends AbstractVocabularyTest {
val mapper: ObjectMapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
@ -30,11 +29,12 @@ class ScholixGraphTest extends AbstractVocabularyTest{
super.setUpVocabulary()
}
@Test
def testExtractPids(): Unit = {
val input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/scholix/result.json")).mkString
val input = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/scholix/result.json"))
.mkString
val res = SparkResolveRelation.extractPidsFromRecord(input)
assertNotNull(res)
@ -44,11 +44,14 @@ class ScholixGraphTest extends AbstractVocabularyTest{
@Test
def testOAFToSummary(): Unit = {
val inputRelations = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/oaf_to_summary")).mkString
val inputRelations = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/oaf_to_summary"))
.mkString
val items = inputRelations.lines.toList
assertNotNull(items)
items.foreach(i => assertTrue(i.nonEmpty))
val result = items.map(r => mapper.readValue(r, classOf[Result])).map(i => ScholixUtils.resultToSummary(i))
val result =
items.map(r => mapper.readValue(r, classOf[Result])).map(i => ScholixUtils.resultToSummary(i))
assertNotNull(result)
@ -59,12 +62,18 @@ class ScholixGraphTest extends AbstractVocabularyTest{
}
@Test
def testScholixMergeOnSource(): Unit = {
val inputRelations = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/merge_result_scholix")).mkString
val result:List[(Relation,ScholixSummary)] =inputRelations.lines.sliding(2).map(s => (s.head, s(1))).map(p => (mapper.readValue(p._1, classOf[Relation]),mapper.readValue(p._2, classOf[ScholixSummary]) )).toList
val inputRelations = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/merge_result_scholix")
)
.mkString
val result: List[(Relation, ScholixSummary)] = inputRelations.lines
.sliding(2)
.map(s => (s.head, s(1)))
.map(p => (mapper.readValue(p._1, classOf[Relation]), mapper.readValue(p._2, classOf[ScholixSummary])))
.toList
assertNotNull(result)
assertTrue(result.nonEmpty)
result.foreach(r => assertEquals(r._1.getSource, r._2.getId))
@ -72,12 +81,13 @@ class ScholixGraphTest extends AbstractVocabularyTest{
println(mapper.writeValueAsString(scholix.head))
}
@Test
def testScholixRelationshipsClean(): Unit = {
val inputRelations = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/relation_transform.json")).mkString
val inputRelations = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/relation_transform.json")
)
.mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(inputRelations)
@ -89,7 +99,4 @@ class ScholixGraphTest extends AbstractVocabularyTest{
}
}

39
pom.xml
View File

@ -620,6 +620,18 @@
</dependency>
</dependencies>
</plugin>
<!-- Scalafmt Maven plugin, declared in pluginManagement so the version is pinned in one place. -->
<plugin>
<groupId>org.antipathy</groupId>
<artifactId>mvn-scalafmt_2.11</artifactId>
<version>1.0.1640073709.733712b</version>
<dependencies>
<!-- Adds the project's own code-style artifact to the plugin classpath.
     NOTE(review): presumably so the shared scalafmt configuration is available to the plugin; confirm. -->
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-code-style</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
</plugin>
</plugins>
</pluginManagement>
<plugins>
@ -665,6 +677,33 @@
</execution>
</executions>
</plugin>
<!-- Scalafmt Maven plugin: binds the `format` goal to the `validate` phase for
     the main and test Scala source directories of each module. -->
<plugin>
<groupId>org.antipathy</groupId>
<artifactId>mvn-scalafmt_2.11</artifactId>
<configuration>
<!-- NOTE(review): this path is relative (no ${project.basedir} prefix); confirm it
     resolves when a submodule is built on its own. -->
<configLocation>dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf</configLocation>
<!-- Neither main nor test sources are skipped. -->
<skipTestSources>false</skipTestSources>
<skipSources>false</skipSources>
<sourceDirectories>
<param>${project.basedir}/src/main/scala</param>
</sourceDirectories>
<testSourceDirectories>
<param>${project.basedir}/src/test/scala</param>
</testSourceDirectories>
<!-- NOTE(review): with validateOnly=false the format goal presumably rewrites the
     files in place on every build; confirm this is also what CI builds should do. -->
<validateOnly>false</validateOnly>
<onlyChangedFiles>false</onlyChangedFiles>
<!-- NOTE(review): per the plugin documentation a value starting with ": " is run as a
     command whose output names the branch; presumably only relevant when
     onlyChangedFiles is enabled. Confirm against the pinned plugin version. -->
<branch>: git rev-parse --abbrev-ref HEAD</branch>
<useSpecifiedRepositories>false</useSpecifiedRepositories>
</configuration>
<executions>
<execution>
<phase>validate</phase>
<goals>
<goal>format</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>