diff --git a/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf b/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf
new file mode 100644
index 000000000..0b5dbe0b4
--- /dev/null
+++ b/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf
@@ -0,0 +1,21 @@
+style = defaultWithAlign
+
+align.openParenCallSite = false
+align.openParenDefnSite = false
+align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
+continuationIndent.callSite = 2
+continuationIndent.defnSite = 2
+danglingParentheses = true
+indentOperator = spray
+maxColumn = 120
+newlines.alwaysBeforeTopLevelStatements = true
+project.excludeFilters = [".*\\.sbt"]
+rewrite.rules = [AvoidInfix]
+rewrite.rules = [ExpandImportSelectors]
+rewrite.rules = [RedundantBraces]
+rewrite.rules = [RedundantParens]
+rewrite.rules = [SortImports]
+rewrite.rules = [SortModifiers]
+rewrite.rules = [PreferCurlyFors]
+spaces.inImportCurlyBraces = false
+unindentTopLevelOperators = true
\ No newline at end of file
diff --git a/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala
index 6541746b2..f8afe9af4 100644
--- a/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala
+++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala
@@ -2,71 +2,72 @@ package eu.dnetlib.dhp.application
 
 import scala.io.Source
 
-/**
- * This is the main Interface SparkApplication
- * where all the Spark Scala class should inherit
- *
- */
+/** This is the main interface SparkApplication
+  * from which all Spark Scala classes should inherit
+  */
 trait SparkScalaApplication {
-  /**
-   * This is the path in the classpath of the json
-   * describes all the argument needed to run
-   */
+
+  /** This is the path in the classpath of the JSON
+    * that describes all the arguments needed to run
+    */
   val propertyPath: String
 
-  /**
-   * Utility to parse the arguments using the
-   * property json in the classpath identified from
-   * the variable propertyPath
-   *
-   * @param args the list of arguments
-   */
+  /** Utility to parse the arguments using the
+    * property JSON in the classpath identified by
+    * the variable propertyPath
+    *
+    * @param args the list of arguments
+    */
   def parseArguments(args: Array[String]): ArgumentApplicationParser = {
-    val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString)
+    val parser = new ArgumentApplicationParser(
+      Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString
+    )
     parser.parseArgument(args)
     parser
   }
 
-  /**
-   * Here all the spark applications runs this method
-   * where the whole logic of the spark node is defined
-   */
+  /** Every Spark application runs this method,
+    * where the whole logic of the Spark node is defined
+    */
   def run(): Unit
 }
 
-
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.SparkSession
 import org.slf4j.Logger
 
-abstract class AbstractScalaApplication (val propertyPath:String, val args:Array[String], log:Logger) extends SparkScalaApplication {
+abstract class AbstractScalaApplication(
+  val propertyPath: String,
+  val args: Array[String],
+  log: Logger
+) extends SparkScalaApplication {
 
   var parser: ArgumentApplicationParser = null
 
-  var spark:SparkSession = null
+  var spark: SparkSession = null
 
-
-  def initialize():SparkScalaApplication = {
+  def initialize(): SparkScalaApplication = {
     parser = parseArguments(args)
spark = createSparkSession() this } - /** - * Utility for creating a spark session starting from parser - * - * @return a spark Session - */ - private def createSparkSession():SparkSession = { - require(parser!= null) + /** Utility for creating a spark session starting from parser + * + * @return a spark Session + */ + private def createSparkSession(): SparkSession = { + require(parser != null) - val conf:SparkConf = new SparkConf() + val conf: SparkConf = new SparkConf() val master = parser.get("master") log.info(s"Creating Spark session: Master: $master") - SparkSession.builder().config(conf) + SparkSession + .builder() + .config(conf) .appName(getClass.getSimpleName) .master(master) .getOrCreate() } -} \ No newline at end of file +} diff --git a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala index f35af0905..a995016a8 100644 --- a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala +++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala @@ -14,7 +14,6 @@ import scala.io.Source object ScholixUtils extends Serializable { - val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier" val DATE_RELATION_KEY: String = "RelationDate" @@ -24,7 +23,11 @@ object ScholixUtils extends Serializable { case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {} val relations: Map[String, RelationVocabulary] = { - val input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")).mkString + val input = Source + .fromInputStream( + getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json") + ) + .mkString implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) @@ -32,13 +35,14 @@ object ScholixUtils extends Serializable { json.extract[Map[String, RelationVocabulary]] } - def extractRelationDate(relation: Relation): String = { if (relation.getProperties == null || !relation.getProperties.isEmpty) null else { - val date = relation.getProperties.asScala.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey)).map(p => p.getValue) + val date = relation.getProperties.asScala + .find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey)) + .map(p => p.getValue) if (date.isDefined) date.get else @@ -58,78 +62,80 @@ object ScholixUtils extends Serializable { def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = { new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName) - } - - def generateScholixResourceFromResult(r:Result) :ScholixResource = { + def generateScholixResourceFromResult(r: Result): ScholixResource = { generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r)) } + val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = + new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable { + override def zero: RelatedEntities = null - val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable { - override def zero: RelatedEntities = null + override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = { + val relatedDataset = if ("dataset".equalsIgnoreCase(a._2)) a._3 else 0 + val relatedPublication = if ("publication".equalsIgnoreCase(a._2)) a._3 
else 0 - override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = { - val relatedDataset = if ("dataset".equalsIgnoreCase(a._2)) a._3 else 0 - val relatedPublication = if ("publication".equalsIgnoreCase(a._2)) a._3 else 0 - - if (b == null) - RelatedEntities(a._1, relatedDataset, relatedPublication) - else - RelatedEntities(a._1, b.relatedDataset + relatedDataset, b.relatedPublication + relatedPublication) - } - - override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = { - if (b1 != null && b2 != null) - RelatedEntities(b1.id, b1.relatedDataset + b2.relatedDataset, b1.relatedPublication + b2.relatedPublication) - - else if (b1 != null) - b1 - else - b2 - } - - override def finish(reduction: RelatedEntities): RelatedEntities = reduction - - override def bufferEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities]) - - override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities]) - } - - - val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] = new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable { - override def zero: Scholix = null - - - def scholix_complete(s: Scholix): Boolean = { - if (s == null || s.getIdentifier == null) { - false - } else if (s.getSource == null || s.getTarget == null) { - false + if (b == null) + RelatedEntities(a._1, relatedDataset, relatedPublication) + else + RelatedEntities( + a._1, + b.relatedDataset + relatedDataset, + b.relatedPublication + relatedPublication + ) } - else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty) - false - else - true + + override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = { + if (b1 != null && b2 != null) + RelatedEntities( + b1.id, + b1.relatedDataset + b2.relatedDataset, + b1.relatedPublication + b2.relatedPublication + ) + else if (b1 != null) + b1 + else + b2 + } + + override def finish(reduction: RelatedEntities): RelatedEntities = reduction + + override def bufferEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities]) + + override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities]) } - override def reduce(b: Scholix, a: (String, Scholix)): Scholix = { - if (scholix_complete(b)) b else a._2 + val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] = + new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable { + override def zero: Scholix = null + + def scholix_complete(s: Scholix): Boolean = { + if (s == null || s.getIdentifier == null) { + false + } else if (s.getSource == null || s.getTarget == null) { + false + } else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty) + false + else + true + } + + override def reduce(b: Scholix, a: (String, Scholix)): Scholix = { + if (scholix_complete(b)) b else a._2 + } + + override def merge(b1: Scholix, b2: Scholix): Scholix = { + if (scholix_complete(b1)) b1 else b2 + } + + override def finish(reduction: Scholix): Scholix = reduction + + override def bufferEncoder: Encoder[Scholix] = Encoders.kryo[Scholix] + + override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix] } - override def merge(b1: Scholix, b2: Scholix): Scholix = { - if (scholix_complete(b1)) b1 else b2 - } - - override def finish(reduction: Scholix): Scholix = reduction - - override def bufferEncoder: Encoder[Scholix] = Encoders.kryo[Scholix] - - override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix] - } - - def 
createInverseScholixRelation(scholix: Scholix): Scholix = { val s = new Scholix s.setPublicationDate(scholix.getPublicationDate) @@ -138,16 +144,19 @@ object ScholixUtils extends Serializable { s.setRelationship(inverseRelationShip(scholix.getRelationship)) s.setSource(scholix.getTarget) s.setTarget(scholix.getSource) - s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}")) + s.setIdentifier( + DHPUtils.md5( + s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}" + ) + ) s - } def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = { if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) { - val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { - d => new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers) + val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d => + new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers) }(collection.breakOut) l } else List() @@ -155,8 +164,11 @@ object ScholixUtils extends Serializable { def extractCollectedFrom(summary: ScholixSummary): List[ScholixEntityId] = { if (summary.getDatasources != null && !summary.getDatasources.isEmpty) { - val l: List[ScholixEntityId] = summary.getDatasources.asScala.map { - d => new ScholixEntityId(d.getDatasourceName, List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava) + val l: List[ScholixEntityId] = summary.getDatasources.asScala.map { d => + new ScholixEntityId( + d.getDatasourceName, + List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava + ) }(collection.breakOut) l } else List() @@ -165,17 +177,16 @@ object ScholixUtils extends Serializable { def extractCollectedFrom(relation: Relation): List[ScholixEntityId] = { if (relation.getCollectedfrom != null && !relation.getCollectedfrom.isEmpty) { - - val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map { - c => - - new ScholixEntityId(c.getValue, List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava) + val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map { c => + new ScholixEntityId( + c.getValue, + List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava + ) }.toList l } else List() } - def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = { val s = new Scholix s.setPublicationDate(scholix.getPublicationDate) @@ -184,11 +195,14 @@ object ScholixUtils extends Serializable { s.setRelationship(scholix.getRelationship) s.setSource(scholix.getSource) s.setTarget(generateScholixResourceFromSummary(target)) - s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}")) + s.setIdentifier( + DHPUtils.md5( + s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}" + ) + ) s } - def generateCompleteScholix(scholix: Scholix, target: ScholixResource): Scholix = { val s = new Scholix s.setPublicationDate(scholix.getPublicationDate) @@ -197,11 +211,14 @@ object ScholixUtils extends Serializable { s.setRelationship(scholix.getRelationship) s.setSource(scholix.getSource) s.setTarget(target) - s.setIdentifier(DHPUtils.md5(s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}")) + s.setIdentifier( + DHPUtils.md5( + 
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}" + ) + ) s } - def generateScholixResourceFromSummary(summaryObject: ScholixSummary): ScholixResource = { val r = new ScholixResource r.setIdentifier(summaryObject.getLocalIdentifier) @@ -214,7 +231,8 @@ object ScholixUtils extends Serializable { r.setTitle(summaryObject.getTitle.get(0)) if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) { - val l: List[ScholixEntityId] = summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList + val l: List[ScholixEntityId] = + summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList if (l.nonEmpty) r.setCreator(l.asJava) } @@ -222,20 +240,27 @@ object ScholixUtils extends Serializable { if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty) r.setPublicationDate(summaryObject.getDate.get(0)) if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) { - val plist: List[ScholixEntityId] = summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList + val plist: List[ScholixEntityId] = + summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList if (plist.nonEmpty) r.setPublisher(plist.asJava) } - if (summaryObject.getDatasources != null && !summaryObject.getDatasources.isEmpty) { - val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala.map(c => new ScholixCollectedFrom( - new ScholixEntityId(c.getDatasourceName, List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava) - , "collected", "complete" - - )).toList + val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala + .map(c => + new ScholixCollectedFrom( + new ScholixEntityId( + c.getDatasourceName, + List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava + ), + "collected", + "complete" + ) + ) + .toList if (l.nonEmpty) r.setCollectedFrom(l.asJava) @@ -244,9 +269,7 @@ object ScholixUtils extends Serializable { r } - - - def scholixFromSource(relation: Relation, source: ScholixResource):Scholix = { + def scholixFromSource(relation: Relation, source: ScholixResource): Scholix = { if (relation == null || source == null) return null val s = new Scholix @@ -262,7 +285,6 @@ object ScholixUtils extends Serializable { s.setPublicationDate(d) - if (source.getPublisher != null && !source.getPublisher.isEmpty) { s.setPublisher(source.getPublisher) } @@ -270,13 +292,14 @@ object ScholixUtils extends Serializable { val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null) if (semanticRelation == null) return null - s.setRelationship(new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)) + s.setRelationship( + new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse) + ) s.setSource(source) s } - def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = { if (relation == null || source == null) @@ -298,12 +321,10 @@ object ScholixUtils extends Serializable { s.setPublicationDate(d) - if (source.getPublisher != null && !source.getPublisher.isEmpty) { val l: List[ScholixEntityId] = source.getPublisher.asScala - .map { - p => - new ScholixEntityId(p, null) + .map { p => + new ScholixEntityId(p, null) }(collection.breakOut) if (l.nonEmpty) @@ -313,31 +334,37 @@ object ScholixUtils extends Serializable { val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null) if 
(semanticRelation == null) return null - s.setRelationship(new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)) + s.setRelationship( + new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse) + ) s.setSource(generateScholixResourceFromSummary(source)) s } + def findURLForPID( + pidValue: List[StructuredProperty], + urls: List[String] + ): List[(StructuredProperty, String)] = { + pidValue.map { p => + val pv = p.getValue - def findURLForPID(pidValue: List[StructuredProperty], urls: List[String]): List[(StructuredProperty, String)] = { - pidValue.map { - p => - val pv = p.getValue - - val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase)) - (p, r.orNull) + val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase)) + (p, r.orNull) } } - def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = { if (r.getInstance() == null || r.getInstance().isEmpty) return List() - r.getInstance().asScala.filter(i => i.getUrl != null && !i.getUrl.isEmpty) + r.getInstance() + .asScala + .filter(i => i.getUrl != null && !i.getUrl.isEmpty) .filter(i => i.getPid != null && i.getUrl != null) .flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList)) - .map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2)).distinct.toList + .map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2)) + .distinct + .toList } def resultToSummary(r: Result): ScholixSummary = { @@ -371,7 +398,12 @@ object ScholixUtils extends Serializable { s.setAuthor(authors.asJava) } if (r.getInstance() != null) { - val dt: List[String] = r.getInstance().asScala.filter(i => i.getDateofacceptance != null).map(i => i.getDateofacceptance.getValue).toList + val dt: List[String] = r + .getInstance() + .asScala + .filter(i => i.getDateofacceptance != null) + .map(i => i.getDateofacceptance.getValue) + .toList if (dt.nonEmpty) s.setDate(dt.distinct.asJava) } @@ -382,7 +414,9 @@ object ScholixUtils extends Serializable { } if (r.getSubject != null && !r.getSubject.isEmpty) { - val subjects: List[SchemeValue] = r.getSubject.asScala.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue)).toList + val subjects: List[SchemeValue] = r.getSubject.asScala + .map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue)) + .toList if (subjects.nonEmpty) s.setSubject(subjects.asJava) } @@ -391,7 +425,9 @@ object ScholixUtils extends Serializable { s.setPublisher(List(r.getPublisher.getValue).asJava) if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) { - val cf: List[CollectedFromType] = r.getCollectedfrom.asScala.map(c => new CollectedFromType(c.getValue, c.getKey, "complete")).toList + val cf: List[CollectedFromType] = r.getCollectedfrom.asScala + .map(c => new CollectedFromType(c.getValue, c.getKey, "complete")) + .toList if (cf.nonEmpty) s.setDatasources(cf.distinct.asJava) } diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala index 86a28ac10..85f5a3082 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala @@ -7,16 +7,14 @@ import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode} object CollectionUtils { - /** - * This method in pipeline to the transformation 
phase,
-   * generates relations in both verse, typically it should be a phase of flatMap
-   *
-   * @param i input OAF
-   * @return
-   * If the input OAF is an entity -> List(i)
-   * If the input OAF is a relation -> List(relation, inverseRelation)
-   *
-   */
+  /** This method, in the pipeline after the transformation phase,
+    * generates relations in both directions; typically it should be applied as a flatMap.
+    *
+    * @param i input OAF
+    * @return
+    * If the input OAF is an entity -> List(i)
+    * If the input OAF is a relation -> List(relation, inverseRelation)
+    */
   def fixRelations(i: Oaf): List[Oaf] = {
     if (i.isInstanceOf[OafEntity])
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/AbstractRestClient.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/AbstractRestClient.scala
index 6a9b8e3e5..471149b25 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/AbstractRestClient.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/AbstractRestClient.scala
@@ -6,7 +6,6 @@ import org.apache.http.client.methods.{HttpGet, HttpPost, HttpUriRequest}
 import org.apache.http.entity.StringEntity
 import org.apache.http.impl.client.HttpClientBuilder
 
-
 abstract class AbstractRestClient extends Iterator[String] {
 
   var buffer: List[String] = List()
@@ -16,12 +15,10 @@ abstract class AbstractRestClient extends Iterator[String] {
 
   var complete: Boolean = false
 
-
   def extractInfo(input: String): Unit
 
   protected def getBufferData(): Unit
 
-
   def doHTTPGETRequest(url: String): String = {
     val httpGet = new HttpGet(url)
     doHTTPRequest(httpGet)
@@ -43,7 +40,6 @@ abstract class AbstractRestClient extends Iterator[String] {
     buffer.nonEmpty && current_index < buffer.size
   }
 
-
   override def next(): String = {
     val next_item: String = buffer(current_index)
     current_index = current_index + 1
@@ -52,13 +48,14 @@ abstract class AbstractRestClient extends Iterator[String] {
     next_item
   }
 
-
   private def doHTTPRequest[A <: HttpUriRequest](r: A): String = {
     val timeout = 60; // seconds
-    val config = RequestConfig.custom()
+    val config = RequestConfig
+      .custom()
       .setConnectTimeout(timeout * 1000)
       .setConnectionRequestTimeout(timeout * 1000)
-      .setSocketTimeout(timeout * 1000).build()
+      .setSocketTimeout(timeout * 1000)
+      .build()
     val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
     try {
       var tries = 4
@@ -69,8 +66,7 @@ abstract class AbstractRestClient extends Iterator[String] {
         println(s"get response with status${response.getStatusLine.getStatusCode}")
         if (response.getStatusLine.getStatusCode > 400) {
           tries -= 1
-        }
-        else
+        } else
           return IOUtils.toString(response.getEntity.getContent)
       } catch {
         case e: Throwable =>
@@ -87,4 +83,4 @@ abstract class AbstractRestClient extends Iterator[String] {
   }
 
   getBufferData()
-}
\ No newline at end of file
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala
index 7ec44a6ff..d2fd709aa 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala
@@ -3,7 +3,7 @@ package eu.dnetlib.dhp.datacite
 import org.json4s.jackson.JsonMethods.{compact, parse, render}
 import org.json4s.{DefaultFormats, JValue}
 
-class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -1) extends AbstractRestClient {
+class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until: Long = -1) extends AbstractRestClient {
 
   override def extractInfo(input: String): Unit = {
     implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@@ -16,16 +16,18 @@ class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -
     current_index = 0
   }
 
-  def get_url():String ={
-    val to = if (until> 0) s"$until" else "*"
+  def get_url(): String = {
+    val to = if (until > 0) s"$until" else "*"
     s"https://api.datacite.org/dois?page[cursor]=1&page[size]=$blocks&query=updated:[$timestamp%20TO%20$to]"
   }
 
   override def getBufferData(): Unit = {
     if (!complete) {
-      val response = if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get) else doHTTPGETRequest(get_url())
+      val response =
+        if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get)
+        else doHTTPGETRequest(get_url())
       extractInfo(response)
     }
   }
-}
\ No newline at end of file
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala
index 6c5dc8cce..a59779387 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala
@@ -10,24 +10,38 @@ import java.util.Locale
 import java.util.regex.Pattern
 import scala.io.Source
 
-/**
- * This class represent the dataModel of the input Dataset of Datacite
- * @param doi THE DOI
- * @param timestamp timestamp of last update date
- * @param isActive the record is active or deleted
- * @param json the json native records
- */
+/** This class represents the data model of the input Dataset of Datacite
+  * @param doi the DOI
+  * @param timestamp the timestamp of the last update date
+  * @param isActive whether the record is active or deleted
+  * @param json the native JSON record
+  */
 case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
 
 /*
   The following class are utility class used for the mapping from json datacite to OAF Shema
  */
-case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {}
+case class RelatedIdentifierType(
+  relationType: String,
+  relatedIdentifier: String,
+  relatedIdentifierType: String
+) {}
 
-case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
+case class NameIdentifiersType(
+  nameIdentifierScheme: Option[String],
+  schemeUri: Option[String],
+  nameIdentifier: Option[String]
+) {}
 
-case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
+case class CreatorType(
+  nameType: Option[String],
+  nameIdentifiers: Option[List[NameIdentifiersType]],
+  name: Option[String],
+  familyName: Option[String],
+  givenName: Option[String],
+  affiliation: Option[List[String]]
+) {}
 
 case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
 
@@ -35,100 +49,230 @@ case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {
 
 case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
 
-case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle:
Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {} +case class FundingReferenceType( + funderIdentifierType: Option[String], + awardTitle: Option[String], + awardUri: Option[String], + funderName: Option[String], + funderIdentifier: Option[String], + awardNumber: Option[String] +) {} case class DateType(date: Option[String], dateType: Option[String]) {} -case class OAFRelations(relation:String, inverse:String, relType:String) +case class OAFRelations(relation: String, inverse: String, relType: String) - -class DataciteModelConstants extends Serializable { - -} +class DataciteModelConstants extends Serializable {} object DataciteModelConstants { - val REL_TYPE_VALUE:String = "resultResult" + val REL_TYPE_VALUE: String = "resultResult" val DATE_RELATION_KEY = "RelationDate" val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter" val DOI_CLASS = "doi" val SUBJ_CLASS = "keywords" val DATACITE_NAME = "Datacite" val dataInfo: DataInfo = dataciteDataInfo("0.9") - val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME) - val subRelTypeMapping: Map[String,OAFRelations] = Map( - ModelConstants.REFERENCES -> OAFRelations(ModelConstants.REFERENCES, ModelConstants.IS_REFERENCED_BY, ModelConstants.RELATIONSHIP), - ModelConstants.IS_REFERENCED_BY -> OAFRelations(ModelConstants.IS_REFERENCED_BY,ModelConstants.REFERENCES, ModelConstants.RELATIONSHIP), + val DATACITE_COLLECTED_FROM: KeyValue = + OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME) - ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(ModelConstants.IS_SUPPLEMENTED_BY,ModelConstants.IS_SUPPLEMENT_TO,ModelConstants.SUPPLEMENT), - ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(ModelConstants.IS_SUPPLEMENT_TO,ModelConstants.IS_SUPPLEMENTED_BY,ModelConstants.SUPPLEMENT), - - ModelConstants.HAS_PART -> OAFRelations(ModelConstants.HAS_PART,ModelConstants.IS_PART_OF, ModelConstants.PART), - ModelConstants.IS_PART_OF -> OAFRelations(ModelConstants.IS_PART_OF,ModelConstants.HAS_PART, ModelConstants.PART), - - ModelConstants.IS_VERSION_OF-> OAFRelations(ModelConstants.IS_VERSION_OF,ModelConstants.HAS_VERSION,ModelConstants.VERSION), - ModelConstants.HAS_VERSION-> OAFRelations(ModelConstants.HAS_VERSION,ModelConstants.IS_VERSION_OF,ModelConstants.VERSION), - - ModelConstants.IS_IDENTICAL_TO -> OAFRelations(ModelConstants.IS_IDENTICAL_TO,ModelConstants.IS_IDENTICAL_TO, ModelConstants.RELATIONSHIP), - - ModelConstants.IS_CONTINUED_BY -> OAFRelations(ModelConstants.IS_CONTINUED_BY,ModelConstants.CONTINUES, ModelConstants.RELATIONSHIP), - ModelConstants.CONTINUES -> OAFRelations(ModelConstants.CONTINUES,ModelConstants.IS_CONTINUED_BY, ModelConstants.RELATIONSHIP), - - ModelConstants.IS_NEW_VERSION_OF-> OAFRelations(ModelConstants.IS_NEW_VERSION_OF,ModelConstants.IS_PREVIOUS_VERSION_OF, ModelConstants.VERSION), - ModelConstants.IS_PREVIOUS_VERSION_OF ->OAFRelations(ModelConstants.IS_PREVIOUS_VERSION_OF,ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION), - - ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(ModelConstants.IS_DOCUMENTED_BY,ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP), - ModelConstants.DOCUMENTS -> OAFRelations(ModelConstants.DOCUMENTS,ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP), - - ModelConstants.IS_SOURCE_OF -> OAFRelations(ModelConstants.IS_SOURCE_OF,ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION), - ModelConstants.IS_DERIVED_FROM -> 
OAFRelations(ModelConstants.IS_DERIVED_FROM,ModelConstants.IS_SOURCE_OF, ModelConstants.VERSION), - - ModelConstants.CITES -> OAFRelations(ModelConstants.CITES,ModelConstants.IS_CITED_BY, ModelConstants.CITATION), - ModelConstants.IS_CITED_BY -> OAFRelations(ModelConstants.IS_CITED_BY,ModelConstants.CITES, ModelConstants.CITATION), - - ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(ModelConstants.IS_VARIANT_FORM_OF,ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION), - ModelConstants.IS_OBSOLETED_BY -> OAFRelations(ModelConstants.IS_OBSOLETED_BY,ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION), - - ModelConstants.REVIEWS -> OAFRelations(ModelConstants.REVIEWS,ModelConstants.IS_REVIEWED_BY, ModelConstants.REVIEW), - ModelConstants.IS_REVIEWED_BY -> OAFRelations(ModelConstants.IS_REVIEWED_BY,ModelConstants.REVIEWS, ModelConstants.REVIEW), - - ModelConstants.DOCUMENTS -> OAFRelations(ModelConstants.DOCUMENTS,ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP), - ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(ModelConstants.IS_DOCUMENTED_BY,ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP), - - ModelConstants.COMPILES -> OAFRelations(ModelConstants.COMPILES,ModelConstants.IS_COMPILED_BY, ModelConstants.RELATIONSHIP), - ModelConstants.IS_COMPILED_BY -> OAFRelations(ModelConstants.IS_COMPILED_BY,ModelConstants.COMPILES, ModelConstants.RELATIONSHIP) + val subRelTypeMapping: Map[String, OAFRelations] = Map( + ModelConstants.REFERENCES -> OAFRelations( + ModelConstants.REFERENCES, + ModelConstants.IS_REFERENCED_BY, + ModelConstants.RELATIONSHIP + ), + ModelConstants.IS_REFERENCED_BY -> OAFRelations( + ModelConstants.IS_REFERENCED_BY, + ModelConstants.REFERENCES, + ModelConstants.RELATIONSHIP + ), + ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations( + ModelConstants.IS_SUPPLEMENTED_BY, + ModelConstants.IS_SUPPLEMENT_TO, + ModelConstants.SUPPLEMENT + ), + ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations( + ModelConstants.IS_SUPPLEMENT_TO, + ModelConstants.IS_SUPPLEMENTED_BY, + ModelConstants.SUPPLEMENT + ), + ModelConstants.HAS_PART -> OAFRelations( + ModelConstants.HAS_PART, + ModelConstants.IS_PART_OF, + ModelConstants.PART + ), + ModelConstants.IS_PART_OF -> OAFRelations( + ModelConstants.IS_PART_OF, + ModelConstants.HAS_PART, + ModelConstants.PART + ), + ModelConstants.IS_VERSION_OF -> OAFRelations( + ModelConstants.IS_VERSION_OF, + ModelConstants.HAS_VERSION, + ModelConstants.VERSION + ), + ModelConstants.HAS_VERSION -> OAFRelations( + ModelConstants.HAS_VERSION, + ModelConstants.IS_VERSION_OF, + ModelConstants.VERSION + ), + ModelConstants.IS_IDENTICAL_TO -> OAFRelations( + ModelConstants.IS_IDENTICAL_TO, + ModelConstants.IS_IDENTICAL_TO, + ModelConstants.RELATIONSHIP + ), + ModelConstants.IS_CONTINUED_BY -> OAFRelations( + ModelConstants.IS_CONTINUED_BY, + ModelConstants.CONTINUES, + ModelConstants.RELATIONSHIP + ), + ModelConstants.CONTINUES -> OAFRelations( + ModelConstants.CONTINUES, + ModelConstants.IS_CONTINUED_BY, + ModelConstants.RELATIONSHIP + ), + ModelConstants.IS_NEW_VERSION_OF -> OAFRelations( + ModelConstants.IS_NEW_VERSION_OF, + ModelConstants.IS_PREVIOUS_VERSION_OF, + ModelConstants.VERSION + ), + ModelConstants.IS_PREVIOUS_VERSION_OF -> OAFRelations( + ModelConstants.IS_PREVIOUS_VERSION_OF, + ModelConstants.IS_NEW_VERSION_OF, + ModelConstants.VERSION + ), + ModelConstants.IS_DOCUMENTED_BY -> OAFRelations( + ModelConstants.IS_DOCUMENTED_BY, + ModelConstants.DOCUMENTS, + ModelConstants.RELATIONSHIP + ), + ModelConstants.DOCUMENTS -> OAFRelations( + 
ModelConstants.DOCUMENTS, + ModelConstants.IS_DOCUMENTED_BY, + ModelConstants.RELATIONSHIP + ), + ModelConstants.IS_SOURCE_OF -> OAFRelations( + ModelConstants.IS_SOURCE_OF, + ModelConstants.IS_DERIVED_FROM, + ModelConstants.VERSION + ), + ModelConstants.IS_DERIVED_FROM -> OAFRelations( + ModelConstants.IS_DERIVED_FROM, + ModelConstants.IS_SOURCE_OF, + ModelConstants.VERSION + ), + ModelConstants.CITES -> OAFRelations( + ModelConstants.CITES, + ModelConstants.IS_CITED_BY, + ModelConstants.CITATION + ), + ModelConstants.IS_CITED_BY -> OAFRelations( + ModelConstants.IS_CITED_BY, + ModelConstants.CITES, + ModelConstants.CITATION + ), + ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations( + ModelConstants.IS_VARIANT_FORM_OF, + ModelConstants.IS_DERIVED_FROM, + ModelConstants.VERSION + ), + ModelConstants.IS_OBSOLETED_BY -> OAFRelations( + ModelConstants.IS_OBSOLETED_BY, + ModelConstants.IS_NEW_VERSION_OF, + ModelConstants.VERSION + ), + ModelConstants.REVIEWS -> OAFRelations( + ModelConstants.REVIEWS, + ModelConstants.IS_REVIEWED_BY, + ModelConstants.REVIEW + ), + ModelConstants.IS_REVIEWED_BY -> OAFRelations( + ModelConstants.IS_REVIEWED_BY, + ModelConstants.REVIEWS, + ModelConstants.REVIEW + ), + ModelConstants.DOCUMENTS -> OAFRelations( + ModelConstants.DOCUMENTS, + ModelConstants.IS_DOCUMENTED_BY, + ModelConstants.RELATIONSHIP + ), + ModelConstants.IS_DOCUMENTED_BY -> OAFRelations( + ModelConstants.IS_DOCUMENTED_BY, + ModelConstants.DOCUMENTS, + ModelConstants.RELATIONSHIP + ), + ModelConstants.COMPILES -> OAFRelations( + ModelConstants.COMPILES, + ModelConstants.IS_COMPILED_BY, + ModelConstants.RELATIONSHIP + ), + ModelConstants.IS_COMPILED_BY -> OAFRelations( + ModelConstants.IS_COMPILED_BY, + ModelConstants.COMPILES, + ModelConstants.RELATIONSHIP + ) ) - val datacite_filter: List[String] = { val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH) - require(stream!= null) + require(stream != null) Source.fromInputStream(stream).getLines().toList } + def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo( + false, + null, + false, + false, + ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, + trust + ) - def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(false,null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, trust) + val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern( + "[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", + Locale.ENGLISH + ) - val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH) - val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN) + val df_it: DateTimeFormatter = + DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN) val funder_regex: List[(Pattern, String)] = List( - (Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"), - (Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::") - + ( + Pattern.compile( + "(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", + Pattern.MULTILINE | Pattern.CASE_INSENSITIVE + ), + "40|corda__h2020::" + ), + ( + 
Pattern.compile(
+        "(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)",
+        Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
+      ),
+      "40|corda_______::"
+    )
   )
 
   val Date_regex: List[Pattern] = List(
     //Y-M-D
-    Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
+    Pattern.compile(
+      "(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])",
+      Pattern.MULTILINE
+    ),
     //M-D-Y
-    Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
+    Pattern.compile(
+      "((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
+      Pattern.MULTILINE
+    ),
    //D-M-Y
-    Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
+    Pattern.compile(
+      "(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
+      Pattern.MULTILINE
+    ),
     //Y
     Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
   )
-
 }
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala
index a662cf99d..a0b7cd95e 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala
@@ -20,19 +20,16 @@ import java.time.format.DateTimeFormatter
 import java.util.{Date, Locale}
 import scala.collection.JavaConverters._
 
-
 object DataciteToOAFTransformation {
 
   val mapper = new ObjectMapper()
 
-
-  /**
-   * This method should skip record if json contains invalid text
-   * defined in gile datacite_filter
-   *
-   * @param json
-   * @return True if the record should be skipped
-   */
+  /** This method should skip the record if the JSON contains invalid text
+    * defined in the file datacite_filter
+    *
+    * @param json the record in JSON format
+    * @return true if the record should be skipped
+    */
   def skip_record(json: String): Boolean = {
     datacite_filter.exists(f => json.contains(f))
   }
 
@@ -74,35 +71,35 @@
   }
 
-
   def embargo_end(embargo_end_date: String): Boolean = {
     val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
     val td = LocalDate.now()
     td.isAfter(dt)
   }
 
-
   def extract_date(input: String): Option[String] = {
-    val d = Date_regex.map(pattern => {
-      val matcher = pattern.matcher(input)
-      if (matcher.find())
-        matcher.group(0)
-      else
-        null
-    }
-    ).find(s => s != null)
+    val d = Date_regex
+      .map(pattern => {
+        val matcher = pattern.matcher(input)
+        if (matcher.find())
+          matcher.group(0)
+        else
+          null
+      })
+      .find(s
=> s != null) if (d.isDefined) { val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get try { return Some(LocalDate.parse(a_date, df_en).toString) } catch { - case _: Throwable => try { - return Some(LocalDate.parse(a_date, df_it).toString) - } catch { - case _: Throwable => - return None - } + case _: Throwable => + try { + return Some(LocalDate.parse(a_date, df_it).toString) + } catch { + case _: Throwable => + return None + } } } d @@ -118,31 +115,63 @@ object DataciteToOAFTransformation { } } - - def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = { + def getTypeQualifier( + resourceType: String, + resourceTypeGeneral: String, + schemaOrg: String, + vocabularies: VocabularyGroup + ): (Qualifier, Qualifier) = { if (resourceType != null && resourceType.nonEmpty) { - val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType) + val typeQualifier = + vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType) if (typeQualifier != null) - return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid)) + return ( + typeQualifier, + vocabularies.getSynonymAsQualifier( + ModelConstants.DNET_RESULT_TYPOLOGIES, + typeQualifier.getClassid + ) + ) } if (schemaOrg != null && schemaOrg.nonEmpty) { - val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg) + val typeQualifier = + vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg) if (typeQualifier != null) - return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid)) + return ( + typeQualifier, + vocabularies.getSynonymAsQualifier( + ModelConstants.DNET_RESULT_TYPOLOGIES, + typeQualifier.getClassid + ) + ) } if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) { - val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceTypeGeneral) + val typeQualifier = vocabularies.getSynonymAsQualifier( + ModelConstants.DNET_PUBLICATION_RESOURCE, + resourceTypeGeneral + ) if (typeQualifier != null) - return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid)) + return ( + typeQualifier, + vocabularies.getSynonymAsQualifier( + ModelConstants.DNET_RESULT_TYPOLOGIES, + typeQualifier.getClassid + ) + ) } null } - - def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): Result = { - val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies) + def getResult( + resourceType: String, + resourceTypeGeneral: String, + schemaOrg: String, + vocabularies: VocabularyGroup + ): Result = { + val typeQualifiers: (Qualifier, Qualifier) = + getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies) if (typeQualifiers == null) return null val i = new Instance @@ -168,13 +197,12 @@ object DataciteToOAFTransformation { null } - def available_date(input: String): Boolean = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: org.json4s.JValue = parse(input) val l: List[String] = for { - JObject(dates) <- json \\ "dates" + JObject(dates) <- json \\ "dates" JField("dateType", JString(dateTypes)) 
<- dates
     } yield dateTypes
 
@@ -182,18 +210,19 @@
   }
 
-
-  /**
-   * As describe in ticket #6377
-   * when the result come from figshare we need to remove subject
-   * and set Access rights OPEN.
-   *
-   * @param r
-   */
+  /** As described in ticket #6377,
+    * when the result comes from figshare we need to remove the subjects
+    * and set the access rights to OPEN.
+    *
+    * @param r the result to fix in place
+    */
   def fix_figshare(r: Result): Unit = {
 
     if (r.getInstance() != null) {
-      val hosted_by_figshare = r.getInstance().asScala.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
+      val hosted_by_figshare = r
+        .getInstance()
+        .asScala
+        .exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
       if (hosted_by_figshare) {
         r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
         val l: List[StructuredProperty] = List()
@@ -201,10 +230,8 @@
       }
     }
 
-
   }
 
-
   def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
     val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
     s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
@@ -214,7 +241,13 @@
     OafMapperUtils.structuredProperty(dt, q, null)
   }
 
-  def generateRelation(sourceId: String, targetId: String, relClass: String, cf: KeyValue, di: DataInfo): Relation = {
+  def generateRelation(
+    sourceId: String,
+    targetId: String,
+    relClass: String,
+    cf: KeyValue,
+    di: DataInfo
+  ): Relation = {
 
     val r = new Relation
     r.setSource(sourceId)
@@ -226,7 +259,6 @@
     r.setDataInfo(di)
 
     r
-
   }
 
   def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
@@ -238,14 +270,18 @@
       val grantId = m.matcher(awardUri).replaceAll("$2")
       val targetId = s"$p${DHPUtils.md5(grantId)}"
       List(generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo))
-    }
-    else
+    } else
       List()
   }
 
-
-  def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = {
+  def generateOAF(
+    input: String,
+    ts: Long,
+    dateOfCollection: Long,
+    vocabularies: VocabularyGroup,
+    exportLinks: Boolean
+  ): List[Oaf] = {
     if (skip_record(input))
       return List()
 
@@ -253,7 +289,8 @@
     lazy val json = parse(input)
 
     val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
-    val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
+    val resourceTypeGeneral =
+      (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
     val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)
 
     val doi = (json \ "attributes" \ "doi").extract[String]
@@ -265,8 +302,12 @@
     if (result == null)
       return List()
 
-
-    val doi_q = OafMapperUtils.qualifier("doi", "doi", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES)
+    val doi_q = OafMapperUtils.qualifier(
+      "doi",
+      "doi",
+      ModelConstants.DNET_PID_TYPES,
+      ModelConstants.DNET_PID_TYPES
+    )
     val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
     result.setPid(List(pid).asJava)
     result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
@@ -275,48 +316,70 @@
     val d = new Date(dateOfCollection * 1000)
     val ISO8601FORMAT = new
SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US) - result.setDateofcollection(ISO8601FORMAT.format(d)) result.setDateoftransformation(ISO8601FORMAT.format(d)) result.setDataInfo(dataInfo) val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List()) - val authors = creators.zipWithIndex.map { case (c, idx) => val a = new Author a.setFullname(c.name.orNull) a.setName(c.givenName.orNull) a.setSurname(c.familyName.orNull) if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) { - a.setPid(c.nameIdentifiers.get.map(ni => { - val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(ModelConstants.DNET_PID_TYPES, ni.nameIdentifierScheme.get.toLowerCase()) else null - if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) { - OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo) - } - else - null + a.setPid( + c.nameIdentifiers.get + .map(ni => { + val q = + if (ni.nameIdentifierScheme.isDefined) + vocabularies.getTermAsQualifier( + ModelConstants.DNET_PID_TYPES, + ni.nameIdentifierScheme.get.toLowerCase() + ) + else null + if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) { + OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo) + } else + null - } + }) + .asJava ) - .asJava) } if (c.affiliation.isDefined) - a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava) + a.setAffiliation( + c.affiliation.get + .filter(af => af.nonEmpty) + .map(af => OafMapperUtils.field(af, dataInfo)) + .asJava + ) a.setRank(idx + 1) a } - val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List()) - result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => { - if (t.titleType.isEmpty) { - OafMapperUtils.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null) - } else { - OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, ModelConstants.DNET_DATACITE_TITLE, ModelConstants.DNET_DATACITE_TITLE, null) - } - }).asJava) + result.setTitle( + titles + .filter(t => t.title.nonEmpty) + .map(t => { + if (t.titleType.isEmpty) { + OafMapperUtils + .structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null) + } else { + OafMapperUtils.structuredProperty( + t.title.get, + t.titleType.get, + t.titleType.get, + ModelConstants.DNET_DATACITE_TITLE, + ModelConstants.DNET_DATACITE_TITLE, + null + ) + } + }) + .asJava + ) if (authors == null || authors.isEmpty || !authors.exists(a => a != null)) return List() @@ -337,46 +400,81 @@ object DataciteToOAFTransformation { if (a_date.isDefined) { if (doi.startsWith("10.14457")) - result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null)) + result.setEmbargoenddate( + OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null) + ) else result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null)) } if (i_date.isDefined && i_date.get.isDefined) { if (doi.startsWith("10.14457")) { - result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)) - result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)) - } - else { + result.setDateofacceptance( + OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null) + ) + result + .getInstance() + .get(0) + .setDateofacceptance( + OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null) + ) + 
} else { result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null)) result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null)) } - } - else if (publication_year != null) { + } else if (publication_year != null) { if (doi.startsWith("10.14457")) { - result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)) - result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)) + result.setDateofacceptance( + OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null) + ) + result + .getInstance() + .get(0) + .setDateofacceptance( + OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null) + ) } else { result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null)) - result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null)) + result + .getInstance() + .get(0) + .setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null)) } } - - result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined) - .map(d => (extract_date(d.date.get), d.dateType.get)) - .filter(d => d._1.isDefined) - .map(d => (d._1.get, vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase()))) - .filter(d => d._2 != null) - .map(d => generateOAFDate(d._1, d._2)).asJava) + result.setRelevantdate( + dates + .filter(d => d.date.isDefined && d.dateType.isDefined) + .map(d => (extract_date(d.date.get), d.dateType.get)) + .filter(d => d._1.isDefined) + .map(d => + ( + d._1.get, + vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase()) + ) + ) + .filter(d => d._2 != null) + .map(d => generateOAFDate(d._1, d._2)) + .asJava + ) val subjects = (json \\ "subjects").extract[List[SubjectType]] - result.setSubject(subjects.filter(s => s.subject.nonEmpty) - .map(s => - OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null) - ).asJava) - + result.setSubject( + subjects + .filter(s => s.subject.nonEmpty) + .map(s => + OafMapperUtils.structuredProperty( + s.subject.get, + SUBJ_CLASS, + SUBJ_CLASS, + ModelConstants.DNET_SUBJECT_TYPOLOGIES, + ModelConstants.DNET_SUBJECT_TYPOLOGIES, + null + ) + ) + .asJava + ) result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava) @@ -384,66 +482,86 @@ object DataciteToOAFTransformation { result.setDescription( descriptions - .filter(d => d.description.isDefined). 
- map(d => - OafMapperUtils.field(d.description.get, null) - ).filter(s => s != null).asJava) - + .filter(d => d.description.isDefined) + .map(d => OafMapperUtils.field(d.description.get, null)) + .filter(s => s != null) + .asJava + ) val publisher = (json \\ "publisher").extractOrElse[String](null) if (publisher != null) result.setPublisher(OafMapperUtils.field(publisher, null)) - val language: String = (json \\ "language").extractOrElse[String](null) if (language != null) - result.setLanguage(vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language)) - + result.setLanguage( + vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language) + ) val instance = result.getInstance().get(0) val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String] val accessRights: List[String] = for { - JObject(rightsList) <- json \\ "rightsList" + JObject(rightsList) <- json \\ "rightsList" JField("rightsUri", JString(rightsUri)) <- rightsList } yield rightsUri - val aRights: Option[AccessRight] = accessRights.map(r => { - vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r) - }).find(q => q != null).map(q => { - val a = new AccessRight - a.setClassid(q.getClassid) - a.setClassname(q.getClassname) - a.setSchemeid(q.getSchemeid) - a.setSchemename(q.getSchemename) - a - }) + val aRights: Option[AccessRight] = accessRights + .map(r => { + vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r) + }) + .find(q => q != null) + .map(q => { + val a = new AccessRight + a.setClassid(q.getClassid) + a.setClassname(q.getClassname) + a.setSchemeid(q.getSchemeid) + a.setSchemename(q.getSchemename) + a + }) - - val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) + val access_rights_qualifier = + if (aRights.isDefined) aRights.get + else + OafMapperUtils.accessRight( + ModelConstants.UNKNOWN, + ModelConstants.NOT_AVAILABLE, + ModelConstants.DNET_ACCESS_MODES, + ModelConstants.DNET_ACCESS_MODES + ) if (client.isDefined) { - instance.setHostedby(OafMapperUtils.keyValue(generateDSId(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID), ModelConstants.UNKNOWN_REPOSITORY.getValue)) + instance.setHostedby( + OafMapperUtils.keyValue( + generateDSId(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID), + ModelConstants.UNKNOWN_REPOSITORY.getValue + ) + ) instance.setCollectedfrom(DATACITE_COLLECTED_FROM) instance.setUrl(List(s"https://dx.doi.org/$doi").asJava) instance.setAccessright(access_rights_qualifier) instance.setPid(result.getPid) val license = accessRights - .find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*")) + .find(r => + r.startsWith("http") && r.matches( + ".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*" + ) + ) if (license.isDefined) instance.setLicense(OafMapperUtils.field(license.get, null)) } val awardUris: List[String] = for { - JObject(fundingReferences) <- json \\ "fundingReferences" + JObject(fundingReferences) <- json \\ "fundingReferences" JField("awardUri", JString(awardUri)) <- fundingReferences } yield awardUri result.setId(IdentifierFactory.createIdentifier(result)) - var relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != 
null) + var relations: List[Relation] = + awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null) fix_figshare(result) @@ -452,28 +570,35 @@ object DataciteToOAFTransformation { if (exportLinks) { val rels: List[RelatedIdentifierType] = for { - JObject(relIdentifier) <- json \\ "relatedIdentifiers" - JField("relationType", JString(relationType)) <- relIdentifier + JObject(relIdentifier) <- json \\ "relatedIdentifiers" + JField("relationType", JString(relationType)) <- relIdentifier JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier - JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier + JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier } yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType) - relations = relations ::: generateRelations(rels, result.getId, if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null) + relations = relations ::: generateRelations( + rels, + result.getId, + if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null + ) } if (relations != null && relations.nonEmpty) { List(result) ::: relations - } - else + } else List(result) } - private def generateRelations(rels: List[RelatedIdentifierType], id: String, date: String): List[Relation] = { + private def generateRelations( + rels: List[RelatedIdentifierType], + id: String, + date: String + ): List[Relation] = { rels .filter(r => - subRelTypeMapping.contains(r.relationType) && ( - r.relatedIdentifierType.equalsIgnoreCase("doi") || - r.relatedIdentifierType.equalsIgnoreCase("pmid") || - r.relatedIdentifierType.equalsIgnoreCase("arxiv")) + subRelTypeMapping + .contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") || + r.relatedIdentifierType.equalsIgnoreCase("pmid") || + r.relatedIdentifierType.equalsIgnoreCase("arxiv")) ) .map(r => { val rel = new Relation @@ -490,19 +615,19 @@ object DataciteToOAFTransformation { rel.setProperties(List(dateProps).asJava) rel.setSource(id) - rel.setTarget(DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)) + rel.setTarget( + DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType) + ) rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava) rel.getCollectedfrom.asScala.map(c => c.getValue).toList rel }) } - def generateDSId(input: String): String = { val b = StringUtils.substringBefore(input, "::") val a = StringUtils.substringAfter(input, "::") s"10|$b::${DHPUtils.md5(a)}" } - } diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala index a205edcf2..046290969 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala @@ -12,12 +12,12 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory import org.apache.spark.sql.{Encoder, Encoders, SparkSession} import org.slf4j.{Logger, LoggerFactory} +class GenerateDataciteDatasetSpark(propertyPath: String, args: Array[String], log: Logger) + extends AbstractScalaApplication(propertyPath, args, log: Logger) { -class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log:Logger) extends AbstractScalaApplication(propertyPath, args, log:Logger) { - /** - * Here 
all the spark applications runs this method - where the whole logic of the spark node is defined - */ + /** Here all the Spark applications run this method, + * where the whole logic of the Spark node is defined. + */ override def run(): Unit = { val sourcePath = parser.get("sourcePath") @@ -46,49 +46,65 @@ class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log reportTotalSize(targetPath, outputBasePath) } - - /** - * For working with MDStore we need to store in a file on hdfs the size of - * the current dataset - * @param targetPath - * @param outputBasePath - */ - def reportTotalSize( targetPath: String, outputBasePath: String ):Unit = { + /** For working with MDStore we need to store the size of the current + * dataset in a file on HDFS. + * @param targetPath the path of the generated dataset + * @param outputBasePath the MDStore base path where the size file is written + */ + def reportTotalSize(targetPath: String, outputBasePath: String): Unit = { val total_items = spark.read.text(targetPath).count() - writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$total_items", outputBasePath + MDSTORE_SIZE_PATH) + writeHdfsFile( + spark.sparkContext.hadoopConfiguration, + s"$total_items", + outputBasePath + MDSTORE_SIZE_PATH + ) } - /** - * Generate the transformed and cleaned OAF Dataset from the native one - - * @param sourcePath sourcePath of the native Dataset in format JSON/Datacite - * @param exportLinks If true it generates unresolved links - * @param vocabularies vocabularies for cleaning - * @param targetPath the targetPath of the result Dataset - */ - def generateDataciteDataset(sourcePath: String, exportLinks: Boolean, vocabularies: VocabularyGroup, targetPath: String, spark:SparkSession):Unit = { - require(spark!= null) + /** Generate the transformed and cleaned OAF Dataset from the native one. + * + * @param sourcePath the path of the native Dataset, in JSON/Datacite format + * @param exportLinks if true, unresolved links are also generated + * @param vocabularies the vocabularies used for cleaning + * @param targetPath the path of the result Dataset + * @param spark the active SparkSession + */ + def generateDataciteDataset( + sourcePath: String, + exportLinks: Boolean, + vocabularies: VocabularyGroup, + targetPath: String, + spark: SparkSession + ): Unit = { + require(spark != null) import spark.implicits._ implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord] implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] CollectionUtils.saveDataset( - spark.read.load(sourcePath).as[DataciteType] + spark.read + .load(sourcePath) + .as[DataciteType] .filter(d => d.isActive) - .flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks)) + .flatMap(d => + DataciteToOAFTransformation + .generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks) + ) .filter(d => d != null), - targetPath) + targetPath + ) } } - object GenerateDataciteDatasetSpark { val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass) def main(args: Array[String]): Unit = { - new GenerateDataciteDatasetSpark("/eu/dnetlib/dhp/datacite/generate_dataset_params.json", args, log).initialize().run() + new GenerateDataciteDatasetSpark( + "/eu/dnetlib/dhp/datacite/generate_dataset_params.json", + args, + log + ).initialize().run() } } diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/ImportDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/ImportDatacite.scala index 018b4958a..cb021925a 100644 ---
a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/ImportDatacite.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/ImportDatacite.scala @@ -22,7 +22,6 @@ object ImportDatacite { val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass) - def convertAPIStringToDataciteItem(input: String): DataciteType = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: org.json4s.JValue = parse(input) @@ -32,14 +31,26 @@ object ImportDatacite { val timestamp_string = (json \ "attributes" \ "updated").extract[String] val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME) - DataciteType(doi = doi, timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000, isActive = isActive, json = input) + DataciteType( + doi = doi, + timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000, + isActive = isActive, + json = input + ) } - def main(args: Array[String]): Unit = { - val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json")).mkString) + val parser = new ArgumentApplicationParser( + Source + .fromInputStream( + getClass.getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json" + ) + ) + .mkString + ) parser.parseArgument(args) val master = parser.get("master") @@ -60,7 +71,8 @@ object ImportDatacite { val spkipImport = parser.get("skipImport") log.info(s"skipImport is $spkipImport") - val spark: SparkSession = SparkSession.builder() + val spark: SparkSession = SparkSession + .builder() .appName(ImportDatacite.getClass.getSimpleName) .master(master) .getOrCreate() @@ -78,45 +90,48 @@ object ImportDatacite { import spark.implicits._ + val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] = + new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable { - val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] = new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable { + override def zero: DataciteType = null - override def zero: DataciteType = null - - override def reduce(a: DataciteType, b: DataciteType): DataciteType = { - if (b == null) - return a - if (a == null) - return b - if (a.timestamp > b.timestamp) { - return a + override def reduce(a: DataciteType, b: DataciteType): DataciteType = { + if (b == null) + return a + if (a == null) + return b + if (a.timestamp > b.timestamp) { + return a + } + b } - b + + override def merge(a: DataciteType, b: DataciteType): DataciteType = { + reduce(a, b) + } + + override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]] + + override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]] + + override def finish(reduction: DataciteType): DataciteType = reduction } - override def merge(a: DataciteType, b: DataciteType): DataciteType = { - reduce(a, b) - } - - override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]] - - override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]] - - override def finish(reduction: DataciteType): DataciteType = reduction - } - val dump: Dataset[DataciteType] = spark.read.load(dataciteDump).as[DataciteType] val ts = dump.select(max("timestamp")).first().getLong(0) println(s"last Timestamp is $ts") - val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf, bs) + val cnt = + if 
("true".equalsIgnoreCase(spkipImport)) 1 + else writeSequenceFile(hdfsTargetPath, ts, conf, bs) println(s"Imported from Datacite API $cnt documents") if (cnt > 0) { - val inputRdd: RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[Int], classOf[Text]) + val inputRdd: RDD[DataciteType] = sc + .sequenceFile(targetPath, classOf[Int], classOf[Text]) .map(s => s._2.toString) .map(s => convertAPIStringToDataciteItem(s)) spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset") @@ -129,7 +144,9 @@ object ImportDatacite { .agg(dataciteAggregator.toColumn) .map(s => s._2) .repartition(4000) - .write.mode(SaveMode.Overwrite).save(s"${dataciteDump}_updated") + .write + .mode(SaveMode.Overwrite) + .save(s"${dataciteDump}_updated") val fs = FileSystem.get(sc.hadoopConfiguration) fs.delete(new Path(s"$dataciteDump"), true) @@ -137,14 +154,24 @@ object ImportDatacite { } } - private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs: Int): Long = { + private def writeSequenceFile( + hdfsTargetPath: Path, + timestamp: Long, + conf: Configuration, + bs: Int + ): Long = { var from: Long = timestamp * 1000 val delta: Long = 100000000L var client: DataciteAPIImporter = null val now: Long = System.currentTimeMillis() var i = 0 try { - val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text])) + val writer = SequenceFile.createWriter( + conf, + SequenceFile.Writer.file(hdfsTargetPath), + SequenceFile.Writer.keyClass(classOf[IntWritable]), + SequenceFile.Writer.valueClass(classOf[Text]) + ) try { var start: Long = System.currentTimeMillis while (from < now) { @@ -153,16 +180,16 @@ object ImportDatacite { val key: IntWritable = new IntWritable(i) val value: Text = new Text while (client.hasNext) { - key.set({ + key.set { i += 1; i - 1 - }) + } value.set(client.next()) writer.append(key, value) writer.hflush() if (i % 1000 == 0) { end = System.currentTimeMillis - val time = (end - start) / 1000.0F + val time = (end - start) / 1000.0f println(s"Imported $i in $time seconds") start = System.currentTimeMillis } @@ -174,8 +201,7 @@ object ImportDatacite { case e: Throwable => println("Error", e) } finally if (writer != null) writer.close() - } - catch { + } catch { case e: Throwable => log.error("Error", e) } diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala index d46e5423d..3e61edf02 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala @@ -17,7 +17,13 @@ object SparkDownloadUpdateDatacite { def main(args: Array[String]): Unit = { val conf = new SparkConf - val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")).mkString) + val parser = new ArgumentApplicationParser( + Source + .fromInputStream( + getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json") + ) + .mkString + ) parser.parseArgument(args) val master = parser.get("master") val sourcePath = parser.get("sourcePath") @@ -26,8 +32,9 @@ object SparkDownloadUpdateDatacite { val hdfsuri = 
parser.get("namenode") log.info(s"namenode is $hdfsuri") - - val spark: SparkSession = SparkSession.builder().config(conf) + val spark: SparkSession = SparkSession + .builder() + .config(conf) .appName(getClass.getSimpleName) .master(master) .getOrCreate() @@ -37,13 +44,18 @@ object SparkDownloadUpdateDatacite { import spark.implicits._ - - val maxDate: String = spark.read.load(workingPath).as[Oaf].filter(s => s.isInstanceOf[Result]).map(r => r.asInstanceOf[Result].getDateofcollection).select(max("value")).first().getString(0) + val maxDate: String = spark.read + .load(workingPath) + .as[Oaf] + .filter(s => s.isInstanceOf[Result]) + .map(r => r.asInstanceOf[Result].getDateofcollection) + .select(max("value")) + .first() + .getString(0) val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US) val string_to_date = ISO8601FORMAT.parse(maxDate) val ts = string_to_date.getTime - } } diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala index 853b24862..ffdab1799 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala @@ -12,39 +12,81 @@ object BioDBToOAF { case class EBILinkItem(id: Long, links: String) {} - case class EBILinks(relType: String, date: String, title: String, pmid: String, targetPid: String, targetPidType: String, targetUrl: String) {} + case class EBILinks( + relType: String, + date: String, + title: String, + pmid: String, + targetPid: String, + targetPidType: String, + targetUrl: String + ) {} case class UniprotDate(date: String, date_info: String) {} - case class ScholixResolved(pid: String, pidType: String, typology: String, tilte: List[String], datasource: List[String], date: List[String], authors: List[String]) {} + case class ScholixResolved( + pid: String, + pidType: String, + typology: String, + tilte: List[String], + datasource: List[String], + date: List[String], + authors: List[String] + ) {} - val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9") + val DATA_INFO: DataInfo = OafMapperUtils.dataInfo( + false, + null, + false, + false, + ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, + "0.9" + ) val SUBJ_CLASS = "Keywords" val DATE_RELATION_KEY = "RelationDate" val resolvedURL: Map[String, String] = Map( - "genbank" -> "https://www.ncbi.nlm.nih.gov/nuccore/", - "ncbi-n" -> "https://www.ncbi.nlm.nih.gov/nuccore/", - "ncbi-wgs" -> "https://www.ncbi.nlm.nih.gov/nuccore/", - "ncbi-p" -> "https://www.ncbi.nlm.nih.gov/protein/", - "ena" -> "https://www.ebi.ac.uk/ena/browser/view/", + "genbank" -> "https://www.ncbi.nlm.nih.gov/nuccore/", + "ncbi-n" -> "https://www.ncbi.nlm.nih.gov/nuccore/", + "ncbi-wgs" -> "https://www.ncbi.nlm.nih.gov/nuccore/", + "ncbi-p" -> "https://www.ncbi.nlm.nih.gov/protein/", + "ena" -> "https://www.ebi.ac.uk/ena/browser/view/", "clinicaltrials.gov" -> "https://clinicaltrials.gov/ct2/show/", - "onim" -> "https://omim.org/entry/", - "refseq" -> "https://www.ncbi.nlm.nih.gov/nuccore/", - "geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=" + "onim" -> "https://omim.org/entry/", + "refseq" -> "https://www.ncbi.nlm.nih.gov/nuccore/", + "geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=" ) - val collectedFromMap: Map[String, KeyValue] = { - val PDBCollectedFrom: KeyValue = 
OafMapperUtils.keyValue("10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2", "Protein Data Bank") - val enaCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::c2a591f440598b63d854556beaf01591", "European Nucleotide Archive") - val ncbiCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6", "NCBI Nucleotide") - val UNIPROTCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::296e1abaf1302897a6838d3588cd0310", "UniProtKB/Swiss-Prot") - val ElsevierCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier") - val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature") - val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::83e60e09c222f206c725385f53d7e567c", "EMBL-EBIs Protein Data Bank in Europe (PDBe)") - val pubmedCollectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central") + val PDBCollectedFrom: KeyValue = OafMapperUtils.keyValue( + "10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2", + "Protein Data Bank" + ) + val enaCollectedFrom: KeyValue = OafMapperUtils.keyValue( + "10|re3data_____::c2a591f440598b63d854556beaf01591", + "European Nucleotide Archive" + ) + val ncbiCollectedFrom: KeyValue = OafMapperUtils.keyValue( + "10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6", + "NCBI Nucleotide" + ) + val UNIPROTCollectedFrom: KeyValue = OafMapperUtils.keyValue( + "10|re3data_____::296e1abaf1302897a6838d3588cd0310", + "UniProtKB/Swiss-Prot" + ) + val ElsevierCollectedFrom: KeyValue = + OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier") + val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue( + "10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", + "Springer Nature" + ) + val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue( + "10|opendoar____::83e60e09c222f206c725385f53d7e567c", + "EMBL-EBIs Protein Data Bank in Europe (PDBe)" + ) + val pubmedCollectedFrom: KeyValue = + OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central") UNIPROTCollectedFrom.setDataInfo(DATA_INFO) PDBCollectedFrom.setDataInfo(DATA_INFO) @@ -56,14 +98,14 @@ object BioDBToOAF { springerNatureCollectedFrom.setDataInfo(DATA_INFO) Map( - "uniprot" -> UNIPROTCollectedFrom, - "pdb" -> PDBCollectedFrom, - "elsevier" -> ElsevierCollectedFrom, - "ebi" -> EBICollectedFrom, - "Springer Nature" -> springerNatureCollectedFrom, - "NCBI Nucleotide" -> ncbiCollectedFrom, + "uniprot" -> UNIPROTCollectedFrom, + "pdb" -> PDBCollectedFrom, + "elsevier" -> ElsevierCollectedFrom, + "ebi" -> EBICollectedFrom, + "Springer Nature" -> springerNatureCollectedFrom, + "NCBI Nucleotide" -> ncbiCollectedFrom, "European Nucleotide Archive" -> enaCollectedFrom, - "Europe PMC" -> pubmedCollectedFrom + "Europe PMC" -> pubmedCollectedFrom ) } @@ -80,18 +122,32 @@ object BioDBToOAF { val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String]) - createRelation(target_pid, target_pid_type, generate_unresolved_id(source_pid, source_pid_type), collectedFromMap("elsevier"), "relationship", relation_semantic, date) + createRelation( + target_pid, + target_pid_type, + generate_unresolved_id(source_pid, source_pid_type), + collectedFromMap("elsevier"), + "relationship", + relation_semantic, + date + ) } - def scholixResolvedToOAF(input: ScholixResolved): Oaf = { 
val d = new Dataset d.setPid( List( - OafMapperUtils.structuredProperty(input.pid.toLowerCase, input.pidType.toLowerCase, input.pidType.toLowerCase, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO) + OafMapperUtils.structuredProperty( + input.pid.toLowerCase, + input.pidType.toLowerCase, + input.pidType.toLowerCase, + ModelConstants.DNET_PID_TYPES, + ModelConstants.DNET_PID_TYPES, + DATA_INFO + ) ).asJava ) @@ -101,7 +157,15 @@ object BioDBToOAF { d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.pid.toLowerCase}", true)) if (input.tilte != null && input.tilte.nonEmpty) - d.setTitle(List(OafMapperUtils.structuredProperty(input.tilte.head, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava) + d.setTitle( + List( + OafMapperUtils.structuredProperty( + input.tilte.head, + ModelConstants.MAIN_TITLE_QUALIFIER, + DATA_INFO + ) + ).asJava + ) d.setOriginalId(List(input.pid).asJava) val i = new Instance @@ -113,9 +177,23 @@ object BioDBToOAF { } if (input.pidType.equalsIgnoreCase("clinicaltrials.gov")) - i.setInstancetype(OafMapperUtils.qualifier("0037", "Clinical Trial", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) + i.setInstancetype( + OafMapperUtils.qualifier( + "0037", + "Clinical Trial", + ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE + ) + ) else - i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) + i.setInstancetype( + OafMapperUtils.qualifier( + "0046", + "Bioentity", + ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE + ) + ) if (input.datasource == null || input.datasource.isEmpty) return null @@ -141,7 +219,6 @@ object BioDBToOAF { d } - def uniprotToOAF(input: String): List[Oaf] = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json = parse(input) @@ -151,7 +228,14 @@ object BioDBToOAF { d.setPid( List( - OafMapperUtils.structuredProperty(pid, "uniprot", "uniprot", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO) + OafMapperUtils.structuredProperty( + pid, + "uniprot", + "uniprot", + ModelConstants.DNET_PID_TYPES, + ModelConstants.DNET_PID_TYPES, + DATA_INFO + ) ).asJava ) @@ -162,32 +246,52 @@ object BioDBToOAF { val title: String = (json \ "title").extractOrElse[String](null) if (title != null) - d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava) + d.setTitle( + List( + OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO) + ).asJava + ) d.setOriginalId(List(pid).asJava) val i = new Instance i.setPid(d.getPid) i.setUrl(List(s"https://www.uniprot.org/uniprot/$pid").asJava) - i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) + i.setInstancetype( + OafMapperUtils.qualifier( + "0046", + "Bioentity", + ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE + ) + ) i.setCollectedfrom(collectedFromMap("uniprot")) d.setInstance(List(i).asJava) val dates: List[UniprotDate] = for { - JObject(dateOBJ) <- json \ "dates" - JField("date", JString(date)) <- dateOBJ + JObject(dateOBJ) <- json \ "dates" + JField("date", JString(date)) <- dateOBJ JField("date_info", JString(date_info)) <- dateOBJ } yield UniprotDate(GraphCleaningFunctions.cleanDate(date), 
date_info) val subjects: List[String] = (json \\ "subjects").extractOrElse[List[String]](null) - if (subjects != null) { d.setSubject( - subjects.map(s => - OafMapperUtils.structuredProperty(s, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null) - ).asJava) + subjects + .map(s => + OafMapperUtils.structuredProperty( + s, + SUBJ_CLASS, + SUBJ_CLASS, + ModelConstants.DNET_SUBJECT_TYPOLOGIES, + ModelConstants.DNET_SUBJECT_TYPOLOGIES, + null + ) + ) + .asJava + ) } var i_date: Option[UniprotDate] = None @@ -197,45 +301,73 @@ object BioDBToOAF { i.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO)) d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO)) } - val relevant_dates: List[StructuredProperty] = dates.filter(d => !d.date_info.contains("entry version")) - .map(date => OafMapperUtils.structuredProperty(date.date, ModelConstants.UNKNOWN, ModelConstants.UNKNOWN, ModelConstants.DNET_DATACITE_DATE, ModelConstants.DNET_DATACITE_DATE, DATA_INFO)) + val relevant_dates: List[StructuredProperty] = dates + .filter(d => !d.date_info.contains("entry version")) + .map(date => + OafMapperUtils.structuredProperty( + date.date, + ModelConstants.UNKNOWN, + ModelConstants.UNKNOWN, + ModelConstants.DNET_DATACITE_DATE, + ModelConstants.DNET_DATACITE_DATE, + DATA_INFO + ) + ) if (relevant_dates != null && relevant_dates.nonEmpty) d.setRelevantdate(relevant_dates.asJava) d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO)) } - val references_pmid: List[String] = for { - JObject(reference) <- json \ "references" + JObject(reference) <- json \ "references" JField("PubMed", JString(pid)) <- reference } yield pid val references_doi: List[String] = for { - JObject(reference) <- json \ "references" + JObject(reference) <- json \ "references" JField(" DOI", JString(pid)) <- reference } yield pid - if (references_pmid != null && references_pmid.nonEmpty) { - val rel = createRelation(references_pmid.head, "pmid", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null) + val rel = createRelation( + references_pmid.head, + "pmid", + d.getId, + collectedFromMap("uniprot"), + ModelConstants.RELATIONSHIP, + ModelConstants.IS_RELATED_TO, + if (i_date.isDefined) i_date.get.date else null + ) rel.getCollectedfrom List(d, rel) - } - else if (references_doi != null && references_doi.nonEmpty) { - val rel = createRelation(references_doi.head, "doi", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null) + } else if (references_doi != null && references_doi.nonEmpty) { + val rel = createRelation( + references_doi.head, + "doi", + d.getId, + collectedFromMap("uniprot"), + ModelConstants.RELATIONSHIP, + ModelConstants.IS_RELATED_TO, + if (i_date.isDefined) i_date.get.date else null + ) List(d, rel) - } - else + } else List(d) } - def generate_unresolved_id(pid: String, pidType: String): String = { s"unresolved::$pid::$pidType" } - - def createRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, subRelType: String, relClass: String, date: String): Relation = { + def createRelation( + pid: String, + pidType: String, + sourceId: String, + collectedFrom: KeyValue, + subRelType: String, + relClass: String, + date: String + ): Relation = { val rel = new Relation rel.setCollectedfrom(List(collectedFromMap("pdb")).asJava) @@ 
-248,7 +380,6 @@ object BioDBToOAF { rel.setSource(sourceId) rel.setTarget(s"unresolved::$pid::$pidType") - val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date) rel.setProperties(List(dateProps).asJava) @@ -259,12 +390,24 @@ object BioDBToOAF { } - - def createSupplementaryRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, date: String): Relation = { - createRelation(pid, pidType, sourceId, collectedFrom, ModelConstants.SUPPLEMENT, ModelConstants.IS_SUPPLEMENT_TO, date) + def createSupplementaryRelation( + pid: String, + pidType: String, + sourceId: String, + collectedFrom: KeyValue, + date: String + ): Relation = { + createRelation( + pid, + pidType, + sourceId, + collectedFrom, + ModelConstants.SUPPLEMENT, + ModelConstants.IS_SUPPLEMENT_TO, + date + ) } - def pdbTOOaf(input: String): List[Oaf] = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json = parse(input) @@ -277,7 +420,14 @@ object BioDBToOAF { d.setPid( List( - OafMapperUtils.structuredProperty(pdb, "pdb", "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO) + OafMapperUtils.structuredProperty( + pdb, + "pdb", + "Protein Data Bank Identifier", + ModelConstants.DNET_PID_TYPES, + ModelConstants.DNET_PID_TYPES, + DATA_INFO + ) ).asJava ) @@ -290,13 +440,16 @@ object BioDBToOAF { if (title == null) return List() - d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava) + d.setTitle( + List( + OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO) + ).asJava + ) val authors: List[String] = (json \ "authors").extractOrElse[List[String]](null) if (authors != null) { val convertedAuthors = authors.zipWithIndex.map { a => - val res = new Author res.setFullname(a._1) res.setRank(a._2 + 1) @@ -310,7 +463,14 @@ object BioDBToOAF { i.setPid(d.getPid) i.setUrl(List(s"https://www.rcsb.org/structure/$pdb").asJava) - i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) + i.setInstancetype( + OafMapperUtils.qualifier( + "0046", + "Bioentity", + ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE + ) + ) i.setCollectedfrom(collectedFromMap("pdb")) d.setInstance(List(i).asJava) @@ -323,7 +483,6 @@ object BioDBToOAF { List(d) } - def extractEBILinksFromDump(input: String): EBILinkItem = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json = parse(input) @@ -333,49 +492,70 @@ object BioDBToOAF { EBILinkItem(pmid.toLong, compact(render(links))) } - def EBITargetLinksFilter(input: EBILinks): Boolean = { - input.targetPidType.equalsIgnoreCase("ena") || input.targetPidType.equalsIgnoreCase("pdb") || input.targetPidType.equalsIgnoreCase("uniprot") + input.targetPidType.equalsIgnoreCase("ena") || input.targetPidType.equalsIgnoreCase( + "pdb" + ) || input.targetPidType.equalsIgnoreCase("uniprot") } - def parse_ebi_links(input: String): List[EBILinks] = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json = parse(input) val pmid = (json \ "request" \ "id").extract[String] for { - JObject(link) <- json \\ "Link" - JField("Target", JObject(target)) <- link - JField("RelationshipType", JObject(relType)) <- link - JField("Name", JString(relation)) <- relType + JObject(link) <- json \\ "Link" + JField("Target", JObject(target)) <- 
link + JField("RelationshipType", JObject(relType)) <- link + JField("Name", JString(relation)) <- relType JField("PublicationDate", JString(publicationDate)) <- link - JField("Title", JString(title)) <- target - JField("Identifier", JObject(identifier)) <- target - JField("IDScheme", JString(idScheme)) <- identifier - JField("IDURL", JString(idUrl)) <- identifier - JField("ID", JString(id)) <- identifier + JField("Title", JString(title)) <- target + JField("Identifier", JObject(identifier)) <- target + JField("IDScheme", JString(idScheme)) <- identifier + JField("IDURL", JString(idUrl)) <- identifier + JField("ID", JString(id)) <- identifier - } yield EBILinks(relation, GraphCleaningFunctions.cleanDate(publicationDate), title, pmid, id, idScheme, idUrl) + } yield EBILinks( + relation, + GraphCleaningFunctions.cleanDate(publicationDate), + title, + pmid, + id, + idScheme, + idUrl + ) } - def convertEBILinksToOaf(input: EBILinks): List[Oaf] = { val d = new Dataset d.setCollectedfrom(List(collectedFromMap("ebi")).asJava) d.setDataInfo(DATA_INFO) - d.setTitle(List(OafMapperUtils.structuredProperty(input.title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava) + d.setTitle( + List( + OafMapperUtils.structuredProperty( + input.title, + ModelConstants.MAIN_TITLE_QUALIFIER, + DATA_INFO + ) + ).asJava + ) val nsPrefix = input.targetPidType.toLowerCase.padTo(12, '_') d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.targetPid.toLowerCase}", true)) d.setOriginalId(List(input.targetPid.toLowerCase).asJava) - d.setPid( List( - OafMapperUtils.structuredProperty(input.targetPid.toLowerCase, input.targetPidType.toLowerCase, "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO) + OafMapperUtils.structuredProperty( + input.targetPid.toLowerCase, + input.targetPidType.toLowerCase, + "Protein Data Bank Identifier", + ModelConstants.DNET_PID_TYPES, + ModelConstants.DNET_PID_TYPES, + DATA_INFO + ) ).asJava ) @@ -383,13 +563,35 @@ object BioDBToOAF { i.setPid(d.getPid) i.setUrl(List(input.targetUrl).asJava) - i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) + i.setInstancetype( + OafMapperUtils.qualifier( + "0046", + "Bioentity", + ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE + ) + ) i.setCollectedfrom(collectedFromMap("ebi")) d.setInstance(List(i).asJava) - i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO)) - d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO)) + i.setDateofacceptance( + OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO) + ) + d.setDateofacceptance( + OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO) + ) - List(d, createRelation(input.pmid, "pmid", d.getId, collectedFromMap("ebi"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, GraphCleaningFunctions.cleanDate(input.date))) + List( + d, + createRelation( + input.pmid, + "pmid", + d.getId, + collectedFromMap("ebi"), + ModelConstants.RELATIONSHIP, + ModelConstants.IS_RELATED_TO, + GraphCleaningFunctions.cleanDate(input.date) + ) + ) } } diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala index 
27caa8f36..96075b4f3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala @@ -14,7 +14,11 @@ object SparkTransformBioDatabaseToOAF { def main(args: Array[String]): Unit = { val conf: SparkConf = new SparkConf() val log: Logger = LoggerFactory.getLogger(getClass) - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json") + ) + ) parser.parseArgument(args) val database: String = parser.get("database") log.info("database: {}", database) @@ -29,20 +33,33 @@ object SparkTransformBioDatabaseToOAF { .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() + .master(parser.get("master")) + .getOrCreate() val sc = spark.sparkContext implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) import spark.implicits._ database.toUpperCase() match { case "UNIPROT" => - CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))), targetPath) + CollectionUtils.saveDataset( + spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))), + targetPath + ) case "PDB" => - CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))), targetPath) + CollectionUtils.saveDataset( + spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))), + targetPath + ) case "SCHOLIX" => - CollectionUtils.saveDataset(spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)), targetPath) + CollectionUtils.saveDataset( + spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)), + targetPath + ) case "CROSSREF_LINKS" => - CollectionUtils.saveDataset(spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))), targetPath) + CollectionUtils.saveDataset( + spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))), + targetPath + ) } } diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala index 0fea4ff7f..9c55ec7be 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala @@ -24,31 +24,37 @@ import scala.xml.pull.XMLEventReader object SparkCreateBaselineDataFrame { - def requestBaseLineUpdatePage(maxFile: String): List[(String, String)] = { val data = requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/") - val result = data.lines.filter(l => l.startsWith("<a href")).map { l => - val end = l.lastIndexOf("\">") - val start = l.indexOf("<a href=\"") - - if (start >= 0 && end > start) - l.substring(start + 9, end - start) - else - "" - }.filter(s => s.endsWith(".gz")).filter(s => s > maxFile).map(s => (s, s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s")).toList + val result = data.lines + .filter(l => l.startsWith("<a href")) + .map { l => + val end = l.lastIndexOf("\">") + val start = l.indexOf("<a href=\"") + + if (start >= 0 && end > start) + l.substring(start + 9, end - start) + else + "" + } + .filter(s => s.endsWith(".gz")) + .filter(s => s > maxFile) +
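requestBaseLineUpdatePage above scrapes the NCBI updatefiles HTML listing with plain indexOf/substring arithmetic rather than an HTML parser. A small, self-contained sketch of that extraction on one synthetic listing line; like the code above, it relies on the anchor tag starting the line, so that start == 0 and the end index end - start equals end:

object ListingScrapeSketch {

  // Mirrors the substring logic of requestBaseLineUpdatePage for a single line.
  def extract(l: String): String = {
    val end = l.lastIndexOf("\">")
    val start = l.indexOf("<a href=\"")
    if (start >= 0 && end > start) l.substring(start + 9, end - start) else ""
  }

  def main(args: Array[String]): Unit = {
    val line = "<a href=\"pubmed22n1100.xml.gz\">pubmed22n1100.xml.gz</a>  2021-12-17 10:23  25M"
    println(extract(line)) // prints pubmed22n1100.xml.gz
  }
}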
.map(s => (s, s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s")) + .toList result } - def downloadBaselinePart(url: String): InputStream = { val r = new HttpGet(url) val timeout = 60; // seconds - val config = RequestConfig.custom() + val config = RequestConfig + .custom() .setConnectTimeout(timeout * 1000) .setConnectionRequestTimeout(timeout * 1000) - .setSocketTimeout(timeout * 1000).build() + .setSocketTimeout(timeout * 1000) + .build() val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build() val response = client.execute(r) println(s"get response with status${response.getStatusLine.getStatusCode}") @@ -59,10 +65,12 @@ object SparkCreateBaselineDataFrame { def requestPage(url: String): String = { val r = new HttpGet(url) val timeout = 60; // seconds - val config = RequestConfig.custom() + val config = RequestConfig + .custom() .setConnectTimeout(timeout * 1000) .setConnectionRequestTimeout(timeout * 1000) - .setSocketTimeout(timeout * 1000).build() + .setSocketTimeout(timeout * 1000) + .build() val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build() try { var tries = 4 @@ -73,8 +81,7 @@ object SparkCreateBaselineDataFrame { println(s"get response with status${response.getStatusLine.getStatusCode}") if (response.getStatusLine.getStatusCode > 400) { tries -= 1 - } - else + } else return IOUtils.toString(response.getEntity.getContent) } catch { case e: Throwable => @@ -90,10 +97,8 @@ object SparkCreateBaselineDataFrame { } } - def downloadBaseLineUpdate(baselinePath: String, hdfsServerUri: String): Unit = { - val conf = new Configuration conf.set("fs.defaultFS", hdfsServerUri) val fs = FileSystem.get(conf) @@ -122,31 +127,36 @@ object SparkCreateBaselineDataFrame { } + val pmArticleAggregator: Aggregator[(String, PMArticle), PMArticle, PMArticle] = + new Aggregator[(String, PMArticle), PMArticle, PMArticle] with Serializable { + override def zero: PMArticle = new PMArticle - val pmArticleAggregator: Aggregator[(String, PMArticle), PMArticle, PMArticle] = new Aggregator[(String, PMArticle), PMArticle, PMArticle] with Serializable { - override def zero: PMArticle = new PMArticle + override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle = { + if (b != null && b.getPmid != null) b else a._2 + } - override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle = { - if (b != null && b.getPmid != null) b else a._2 + override def merge(b1: PMArticle, b2: PMArticle): PMArticle = { + if (b1 != null && b1.getPmid != null) b1 else b2 + + } + + override def finish(reduction: PMArticle): PMArticle = reduction + + override def bufferEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle] + + override def outputEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle] } - override def merge(b1: PMArticle, b2: PMArticle): PMArticle = { - if (b1 != null && b1.getPmid != null) b1 else b2 - - } - - override def finish(reduction: PMArticle): PMArticle = reduction - - override def bufferEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle] - - override def outputEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle] - } - - def main(args: Array[String]): Unit = { val conf: SparkConf = new SparkConf() val log: Logger = LoggerFactory.getLogger(getClass) - val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEBILinksToOaf.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + SparkEBILinksToOaf.getClass.getResourceAsStream( + 
"/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json" + ) + ) + ) parser.parseArgument(args) val isLookupUrl: String = parser.get("isLookupUrl") log.info("isLookupUrl: {}", isLookupUrl) @@ -162,7 +172,6 @@ object SparkCreateBaselineDataFrame { val skipUpdate = parser.get("skipUpdate") log.info("skipUpdate: {}", skipUpdate) - val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl) val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService) val spark: SparkSession = @@ -170,7 +179,8 @@ object SparkCreateBaselineDataFrame { .builder() .config(conf) .appName(SparkEBILinksToOaf.getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() + .master(parser.get("master")) + .getOrCreate() val sc = spark.sparkContext import spark.implicits._ @@ -183,20 +193,30 @@ object SparkCreateBaselineDataFrame { if (!"true".equalsIgnoreCase(skipUpdate)) { downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri) val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000) - val ds: Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i => { - val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes())) - new PMParser(xml) - })) - ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder)).groupByKey(_._1) + val ds: Dataset[PMArticle] = spark.createDataset( + k.filter(i => i._1.endsWith(".gz")) + .flatMap(i => { + val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes())) + new PMParser(xml) + }) + ) + ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder)) + .groupByKey(_._1) .agg(pmArticleAggregator.toColumn) - .map(p => p._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset") + .map(p => p._2) + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/baseline_dataset") } val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle] - CollectionUtils.saveDataset(exported_dataset - .map(a => PubMedToOaf.convert(a, vocabularies)).as[Oaf] - .filter(p => p != null), - targetPath) + CollectionUtils.saveDataset( + exported_dataset + .map(a => PubMedToOaf.convert(a, vocabularies)) + .as[Oaf] + .filter(p => p != null), + targetPath + ) } } diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala index 18e39387f..44e9e22ea 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala @@ -25,10 +25,12 @@ object SparkDownloadEBILinks { def requestPage(url: String): String = { val r = new HttpGet(url) val timeout = 60; // seconds - val config = RequestConfig.custom() + val config = RequestConfig + .custom() .setConnectTimeout(timeout * 1000) .setConnectionRequestTimeout(timeout * 1000) - .setSocketTimeout(timeout * 1000).build() + .setSocketTimeout(timeout * 1000) + .build() val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build() try { var tries = 4 @@ -39,8 +41,7 @@ object SparkDownloadEBILinks { println(s"get response with status${response.getStatusLine.getStatusCode}") if (response.getStatusLine.getStatusCode > 400) { tries -= 1 - } - else + } else return IOUtils.toString(response.getEntity.getContent) } catch { case e: Throwable => @@ -66,14 +67,19 @@ object SparkDownloadEBILinks { val log: Logger = 
LoggerFactory.getLogger(getClass) val MAX_ITEM_PER_PARTITION = 20000 val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json") + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(SparkEBILinksToOaf.getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() + .master(parser.get("master")) + .getOrCreate() import spark.implicits._ @@ -87,22 +93,40 @@ object SparkDownloadEBILinks { log.info(s"workingPath -> $workingPath") log.info("Getting max pubmedId where the links have already requested") - val links: Dataset[EBILinkItem] = spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem] + val links: Dataset[EBILinkItem] = + spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem] val lastPMIDRequested = links.map(l => l.id).select(max("value")).first.getLong(0) log.info("Retrieving PMID to request links") val pubmed = spark.read.load(s"$sourcePath/baseline_dataset").as[PMArticle] - pubmed.map(p => p.getPmid.toLong).where(s"value > $lastPMIDRequested").write.mode(SaveMode.Overwrite).save(s"$workingPath/id_to_request") + pubmed + .map(p => p.getPmid.toLong) + .where(s"value > $lastPMIDRequested") + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/id_to_request") val pmidToReq: Dataset[Long] = spark.read.load(s"$workingPath/id_to_request").as[Long] val total = pmidToReq.count() - spark.createDataset(pmidToReq.rdd.repartition((total / MAX_ITEM_PER_PARTITION).toInt).map(pmid => createEBILinks(pmid)).filter(l => l != null)).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_update") + spark + .createDataset( + pmidToReq.rdd + .repartition((total / MAX_ITEM_PER_PARTITION).toInt) + .map(pmid => createEBILinks(pmid)) + .filter(l => l != null) + ) + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/links_update") - val updates: Dataset[EBILinkItem] = spark.read.load(s"$workingPath/links_update").as[EBILinkItem] + val updates: Dataset[EBILinkItem] = + spark.read.load(s"$workingPath/links_update").as[EBILinkItem] - links.union(updates).groupByKey(_.id) + links + .union(updates) + .groupByKey(_.id) .reduceGroups { (x, y) => if (x == null || x.links == null) y @@ -112,6 +136,10 @@ object SparkDownloadEBILinks { x else y - }.map(_._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_final") + } + .map(_._2) + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/links_final") } } diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala index cd03f004d..7cb6153ff 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala @@ -15,15 +15,19 @@ object SparkEBILinksToOaf { def main(args: Array[String]): Unit = { val log: Logger = LoggerFactory.getLogger(getClass) val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + 
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json") + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(SparkEBILinksToOaf.getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() - + .master(parser.get("master")) + .getOrCreate() import spark.implicits._ val sourcePath = parser.get("sourcePath") @@ -32,11 +36,17 @@ object SparkEBILinksToOaf { log.info(s"targetPath -> $targetPath") implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) - val ebLinks: Dataset[EBILinkItem] = spark.read.load(sourcePath).as[EBILinkItem].filter(l => l.links != null && l.links.startsWith("{")) + val ebLinks: Dataset[EBILinkItem] = spark.read + .load(sourcePath) + .as[EBILinkItem] + .filter(l => l.links != null && l.links.startsWith("{")) - CollectionUtils.saveDataset(ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links)) - .filter(p => BioDBToOAF.EBITargetLinksFilter(p)) - .flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)), - targetPath) + CollectionUtils.saveDataset( + ebLinks + .flatMap(j => BioDBToOAF.parse_ebi_links(j.links)) + .filter(p => BioDBToOAF.EBITargetLinksFilter(p)) + .flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)), + targetPath + ) } } diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala index c6d5fdf74..49a271641 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala @@ -3,16 +3,13 @@ package eu.dnetlib.dhp.sx.bio.pubmed import scala.xml.MetaData import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader} +/** @param xml + */ +class PMParser(xml: XMLEventReader) extends Iterator[PMArticle] { -/** - * - * @param xml - */ -class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] { + var currentArticle: PMArticle = generateNextArticle() - var currentArticle:PMArticle = generateNextArticle() - - override def hasNext: Boolean = currentArticle!= null + override def hasNext: Boolean = currentArticle != null override def next(): PMArticle = { val tmp = currentArticle @@ -20,33 +17,30 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] { tmp } - def extractAttributes(attrs:MetaData, key:String):String = { + def extractAttributes(attrs: MetaData, key: String): String = { val res = attrs.get(key) if (res.isDefined) { - val s =res.get + val s = res.get if (s != null && s.nonEmpty) s.head.text else null - } - else null + } else null } - - def validate_Date(year:String, month:String, day:String):String = { + def validate_Date(year: String, month: String, day: String): String = { try { f"${year.toInt}-${month.toInt}%02d-${day.toInt}%02d" } catch { - case _: Throwable =>null + case _: Throwable => null } } - def generateNextArticle():PMArticle = { + def generateNextArticle(): PMArticle = { - - var currentSubject:PMSubject = null + var currentSubject: PMSubject = null var currentAuthor: PMAuthor = null var currentJournal: PMJournal = null var currentGrant: PMGrant = null @@ -54,12 +48,7 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] { var currentYear = "0" var currentMonth = "01" var currentDay = "01" - var currentArticleType:String = null - - - - - + var currentArticleType: String = null while (xml.hasNext) { xml.next match { @@ -68,64 +57,67 @@ class 
PMParser(xml:XMLEventReader) extends Iterator[PMArticle] { label match { case "PubmedArticle" => currentArticle = new PMArticle - case "Author" => currentAuthor = new PMAuthor - case "Journal" => currentJournal = new PMJournal - case "Grant" => currentGrant = new PMGrant + case "Author" => currentAuthor = new PMAuthor + case "Journal" => currentJournal = new PMJournal + case "Grant" => currentGrant = new PMGrant case "PublicationType" | "DescriptorName" => currentSubject = new PMSubject currentSubject.setMeshId(extractAttributes(attrs, "UI")) - case "ArticleId" => currentArticleType = extractAttributes(attrs,"IdType") - case _ => + case "ArticleId" => currentArticleType = extractAttributes(attrs, "IdType") + case _ => } case EvElemEnd(_, label) => label match { case "PubmedArticle" => return currentArticle - case "Author" => currentArticle.getAuthors.add(currentAuthor) - case "Journal" => currentArticle.setJournal(currentJournal) - case "Grant" => currentArticle.getGrants.add(currentGrant) - case "PubMedPubDate" => if (currentArticle.getDate== null) - currentArticle.setDate(validate_Date(currentYear,currentMonth,currentDay)) - case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay") - case "DescriptorName" => currentArticle.getSubjects.add(currentSubject) - case "PublicationType" =>currentArticle.getPublicationTypes.add(currentSubject) - case _ => + case "Author" => currentArticle.getAuthors.add(currentAuthor) + case "Journal" => currentArticle.setJournal(currentJournal) + case "Grant" => currentArticle.getGrants.add(currentGrant) + case "PubMedPubDate" => + if (currentArticle.getDate == null) + currentArticle.setDate(validate_Date(currentYear, currentMonth, currentDay)) + case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay") + case "DescriptorName" => currentArticle.getSubjects.add(currentSubject) + case "PublicationType" => currentArticle.getPublicationTypes.add(currentSubject) + case _ => } case EvText(text) => - if (currNode!= null && text.trim.nonEmpty) + if (currNode != null && text.trim.nonEmpty) currNode match { case "ArticleTitle" => { - if (currentArticle.getTitle==null) + if (currentArticle.getTitle == null) currentArticle.setTitle(text.trim) else currentArticle.setTitle(currentArticle.getTitle + text.trim) } case "AbstractText" => { - if (currentArticle.getDescription==null) + if (currentArticle.getDescription == null) currentArticle.setDescription(text.trim) else currentArticle.setDescription(currentArticle.getDescription + text.trim) } case "PMID" => currentArticle.setPmid(text.trim) - case "ArticleId" => if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim) - case "Language" => currentArticle.setLanguage(text.trim) - case "ISSN" => currentJournal.setIssn(text.trim) - case "GrantID" => currentGrant.setGrantID(text.trim) - case "Agency" => currentGrant.setAgency(text.trim) - case "Country" => if (currentGrant != null) currentGrant.setCountry(text.trim) - case "Year" => currentYear = text.trim - case "Month" => currentMonth = text.trim - case "Day" => currentDay = text.trim - case "Volume" => currentJournal.setVolume( text.trim) - case "Issue" => currentJournal.setIssue (text.trim) + case "ArticleId" => + if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim) + case "Language" => currentArticle.setLanguage(text.trim) + case "ISSN" => currentJournal.setIssn(text.trim) + case "GrantID" => currentGrant.setGrantID(text.trim) + case "Agency" => 
currentGrant.setAgency(text.trim) + case "Country" => if (currentGrant != null) currentGrant.setCountry(text.trim) + case "Year" => currentYear = text.trim + case "Month" => currentMonth = text.trim + case "Day" => currentDay = text.trim + case "Volume" => currentJournal.setVolume(text.trim) + case "Issue" => currentJournal.setIssue(text.trim) case "PublicationType" | "DescriptorName" => currentSubject.setValue(text.trim) case "LastName" => { if (currentAuthor != null) currentAuthor.setLastName(text.trim) } - case "ForeName" => if (currentAuthor != null) - currentAuthor.setForeName(text.trim) + case "ForeName" => + if (currentAuthor != null) + currentAuthor.setForeName(text.trim) case "Title" => - if (currentJournal.getTitle==null) + if (currentJournal.getTitle == null) currentJournal.setTitle(text.trim) else currentJournal.setTitle(currentJournal.getTitle + text.trim) @@ -139,8 +131,3 @@ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] { null } } - - - - - diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala index 65717adff..92ad22c57 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala @@ -9,31 +9,38 @@ import collection.JavaConverters._ import java.util.regex.Pattern /** - * - */ + */ object PubMedToOaf { val SUBJ_CLASS = "keywords" + val urlMap = Map( "pmid" -> "https://pubmed.ncbi.nlm.nih.gov/", - "doi" -> "https://dx.doi.org/" + "doi" -> "https://dx.doi.org/" ) - val dataInfo: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9") - val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central") + val dataInfo: DataInfo = OafMapperUtils.dataInfo( + false, + null, + false, + false, + ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, + "0.9" + ) - /** - * Cleaning the DOI Applying regex in order to - * remove doi starting with URL - * - * @param doi input DOI - * @return cleaned DOI - */ + val collectedFrom: KeyValue = + OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central") + + /** Clean the DOI, applying a regex in order to + * discard DOIs starting with a URL + * + * @param doi the input DOI + * @return the cleaned DOI, or null when it does not match + */ def cleanDoi(doi: String): String = { val regex = "^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$" - val pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE) val matcher = pattern.matcher(doi) @@ -43,33 +50,34 @@ object PubMedToOaf { null } - /** - * - * Create an instance of class extends Result - * starting from OAF instanceType value - * - * @param cobjQualifier OAF instance type - * @param vocabularies All dnet vocabularies - * @return the correct instance - */ + /** Create an instance of a class extending Result, + * starting from the OAF instanceType value + * + * @param cobjQualifier OAF instance type + * @param vocabularies all the dnet vocabularies + * @return the correct instance + */ def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = { - val result_typologies = getVocabularyTerm(ModelConstants.DNET_RESULT_TYPOLOGIES, vocabularies, cobjQualifier.getClassid) + val result_typologies = getVocabularyTerm( + ModelConstants.DNET_RESULT_TYPOLOGIES, + vocabularies, + cobjQualifier.getClassid
+    )
     result_typologies.getClassid match {
-      case "dataset" => new Dataset
+      case "dataset"     => new Dataset
       case "publication" => new Publication
-      case "other" => new OtherResearchProduct
-      case "software" => new Software
-      case _ => null
+      case "other"       => new OtherResearchProduct
+      case "software"    => new Software
+      case _             => null
     }
   }
 
-  /**
-    * Mapping the Pubmedjournal info into the OAF Journale
-    *
-    * @param j the pubmedJournal
-    * @return the OAF Journal
-    */
+  /** Maps the PubMed journal info onto the OAF Journal
+    *
+    * @param j the PubMed journal
+    * @return the OAF Journal
+    */
   def mapJournal(j: PMJournal): Journal = {
     if (j == null)
       return null
@@ -83,40 +91,47 @@ object PubMedToOaf {
     journal.setIss(j.getIssue)
     journal
-
   }
 
-  /**
-    *
-    * Find vocabulary term into synonyms and term in the vocabulary
-    *
-    * @param vocabularyName the input vocabulary name
-    * @param vocabularies all the vocabularies
-    * @param term the term to find
-    * @return the cleaned term value
-    */
-  def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = {
+  /** Finds a term in the vocabulary, preferring the synonym match
+    * over the direct term match
+    *
+    * @param vocabularyName the input vocabulary name
+    * @param vocabularies all the vocabularies
+    * @param term the term to find
+    * @return the cleaned term value
+    */
+  def getVocabularyTerm(
+    vocabularyName: String,
+    vocabularies: VocabularyGroup,
+    term: String
+  ): Qualifier = {
     val a = vocabularies.getSynonymAsQualifier(vocabularyName, term)
     val b = vocabularies.getTermAsQualifier(vocabularyName, term)
     if (a == null) b else a
   }
 
-
-  /**
-    * Map the Pubmed Article into the OAF instance
-    *
-    * @param article the pubmed articles
-    * @param vocabularies the vocabularies
-    * @return The OAF instance if the mapping did not fail
-    */
+  /** Maps the PubMed article onto an OAF instance
+    *
+    * @param article the PubMed article
+    * @param vocabularies the vocabularies
+    * @return the OAF instance, or null when the mapping fails
+    */
   def convert(article: PMArticle, vocabularies: VocabularyGroup): Oaf = {
 
     if (article.getPublicationTypes == null)
       return null
 
-    // MAP PMID into pid with classid = classname = pmid
-    val pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
+    val pidList: List[StructuredProperty] = List(
+      OafMapperUtils.structuredProperty(
+        article.getPmid,
+        PidType.pmid.toString,
+        PidType.pmid.toString,
+        ModelConstants.DNET_PID_TYPES,
+        ModelConstants.DNET_PID_TYPES,
+        dataInfo
+      )
+    )
     if (pidList == null)
       return null
@@ -125,7 +140,14 @@ object PubMedToOaf {
     if (article.getDoi != null) {
       val normalizedPid = cleanDoi(article.getDoi)
       if (normalizedPid != null)
-        alternateIdentifier = OafMapperUtils.structuredProperty(normalizedPid, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)
+        alternateIdentifier = OafMapperUtils.structuredProperty(
+          normalizedPid,
+          PidType.doi.toString,
+          PidType.doi.toString,
+          ModelConstants.DNET_PID_TYPES,
+          ModelConstants.DNET_PID_TYPES,
+          dataInfo
+        )
     }
 
     // INSTANCE MAPPING
@@ -133,10 +155,12 @@
     // If the article contains the typology Journal Article then we apply this type
     //else We have to find a terms that match the vocabulary otherwise we discard it
-    val ja = article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue))
+
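// As the comment above notes, an explicit "Journal Article" publication type
// wins; the vocabulary lookup is only the fallback, and records matching
// neither path are discarded (convert returns null).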
val ja = + article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue)) val pubmedInstance = new Instance if (ja.isDefined) { - val cojbCategory = getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue) + val cojbCategory = + getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue) pubmedInstance.setInstancetype(cojbCategory) } else { val i_type = article.getPublicationTypes.asScala @@ -155,7 +179,9 @@ object PubMedToOaf { if (alternateIdentifier != null) pubmedInstance.setAlternateIdentifier(List(alternateIdentifier).asJava) result.setInstance(List(pubmedInstance).asJava) - pubmedInstance.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection.breakOut) + pubmedInstance.getPid.asScala + .filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)) + .map(p => p.getValue)(collection.breakOut) //CREATE URL From pmid val urlLists: List[String] = pidList .map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue)) @@ -165,7 +191,9 @@ object PubMedToOaf { pubmedInstance.setUrl(urlLists.asJava) //ASSIGN DateofAcceptance - pubmedInstance.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)) + pubmedInstance.setDateofacceptance( + OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo) + ) //ASSIGN COLLECTEDFROM pubmedInstance.setCollectedfrom(collectedFrom) result.setPid(pidList.asJava) @@ -173,7 +201,6 @@ object PubMedToOaf { //END INSTANCE MAPPING //-------------------------------------------------------------------------------------- - // JOURNAL MAPPING //-------------------------------------------------------------------------------------- if (article.getJournal != null && result.isInstanceOf[Publication]) @@ -182,31 +209,48 @@ object PubMedToOaf { //END JOURNAL MAPPING //-------------------------------------------------------------------------------------- - // RESULT MAPPING //-------------------------------------------------------------------------------------- - result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)) + result.setDateofacceptance( + OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo) + ) if (article.getTitle == null || article.getTitle.isEmpty) return null - result.setTitle(List(OafMapperUtils.structuredProperty(article.getTitle, ModelConstants.MAIN_TITLE_QUALIFIER, dataInfo)).asJava) + result.setTitle( + List( + OafMapperUtils.structuredProperty( + article.getTitle, + ModelConstants.MAIN_TITLE_QUALIFIER, + dataInfo + ) + ).asJava + ) if (article.getDescription != null && article.getDescription.nonEmpty) result.setDescription(List(OafMapperUtils.field(article.getDescription, dataInfo)).asJava) if (article.getLanguage != null) { - val term = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, article.getLanguage) + val term = + vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, article.getLanguage) if (term != null) result.setLanguage(term) } - - val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s => OafMapperUtils.structuredProperty(s.getValue, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, dataInfo))(collection.breakOut) + val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s => + OafMapperUtils.structuredProperty( + 
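// Each PubMed subject below becomes a StructuredProperty classified under
// SUBJ_CLASS ("keywords") in the DNET subject typologies: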
s.getValue, + SUBJ_CLASS, + SUBJ_CLASS, + ModelConstants.DNET_SUBJECT_TYPOLOGIES, + ModelConstants.DNET_SUBJECT_TYPOLOGIES, + dataInfo + ) + )(collection.breakOut) if (subjects != null) result.setSubject(subjects.asJava) - val authors: List[Author] = article.getAuthors.asScala.zipWithIndex.map { case (a, index) => val author = new Author() author.setName(a.getForeName) @@ -216,15 +260,12 @@ object PubMedToOaf { author }(collection.breakOut) - if (authors != null && authors.nonEmpty) result.setAuthor(authors.asJava) result.setOriginalId(pidList.map(s => s.getValue).asJava) - result.setId(article.getPmid) - // END RESULT MAPPING //-------------------------------------------------------------------------------------- val id = IdentifierFactory.createIdentifier(result) @@ -234,5 +275,4 @@ object PubMedToOaf { result } - } diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkRetrieveDataciteDelta.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkRetrieveDataciteDelta.scala index 45a6cfc89..2618d466a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkRetrieveDataciteDelta.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkRetrieveDataciteDelta.scala @@ -17,7 +17,8 @@ import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ import java.text.SimpleDateFormat -class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:Logger) extends AbstractScalaApplication(propertyPath, args, log:Logger) { +class SparkRetrieveDataciteDelta(propertyPath: String, args: Array[String], log: Logger) + extends AbstractScalaApplication(propertyPath, args, log: Logger) { val ISO_DATE_PATTERN = "yyyy-MM-dd'T'HH:mm:ssZ" val simpleFormatter = new SimpleDateFormat(ISO_DATE_PATTERN) @@ -25,162 +26,190 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L val SCHOLIX_RESOURCE_PATH_NAME = "scholixResource" val DATACITE_OAF_PATH_NAME = "dataciteOAFUpdate" val PID_MAP_PATH_NAME = "pidMap" - val RESOLVED_REL_PATH_NAME ="resolvedRelation" + val RESOLVED_REL_PATH_NAME = "resolvedRelation" val SCHOLIX_PATH_NAME = "scholix" + def scholixResourcePath(workingPath: String) = s"$workingPath/$SCHOLIX_RESOURCE_PATH_NAME" + def dataciteOAFPath(workingPath: String) = s"$workingPath/$DATACITE_OAF_PATH_NAME" + def pidMapPath(workingPath: String) = s"$workingPath/$PID_MAP_PATH_NAME" + def resolvedRelationPath(workingPath: String) = s"$workingPath/$RESOLVED_REL_PATH_NAME" + def scholixPath(workingPath: String) = s"$workingPath/$SCHOLIX_PATH_NAME" - def scholixResourcePath(workingPath:String) = s"$workingPath/$SCHOLIX_RESOURCE_PATH_NAME" - def dataciteOAFPath(workingPath:String) = s"$workingPath/$DATACITE_OAF_PATH_NAME" - def pidMapPath(workingPath:String) = s"$workingPath/$PID_MAP_PATH_NAME" - def resolvedRelationPath(workingPath:String) = s"$workingPath/$RESOLVED_REL_PATH_NAME" - def scholixPath(workingPath:String) = s"$workingPath/$SCHOLIX_PATH_NAME" - - - /** - * Utility to parse Date in ISO8601 to epochMillis - * @param inputDate The String represents an input date in ISO8601 - * @return The relative epochMillis of parsed date - */ - def ISO8601toEpochMillis(inputDate:String):Long = { + /** Utility to parse Date in ISO8601 to epochMillis + * @param inputDate The String represents an input date in ISO8601 + * @return The relative epochMillis of parsed date + */ + def ISO8601toEpochMillis(inputDate: String): Long = { 
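// Illustrative example: parsing "2021-07-14T11:52:54+0000" with the
// ISO_DATE_PATTERN above yields its epoch milliseconds; DataciteToOAFTest
// .testDateMapping (later in this diff) exercises exactly this pattern.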
simpleFormatter.parse(inputDate).getTime
   }
 
-
-  /**
-    * This method tries to retrieve the last collection date from all datacite
-    * records in HDFS.
-    * This method should be called before indexing scholexplorer to retrieve
-    * the delta of Datacite record to download, since from the generation of
-    * raw graph to the generation of Scholexplorer sometimes it takes 20 days
-    * @param spark
-    * @param entitiesPath
-    * @return the last collection date from the current scholexplorer Graph of the datacite records
-    */
-  def retrieveLastCollectedFrom(spark:SparkSession, entitiesPath:String):Long = {
+  /** Tries to retrieve the last collection date from all Datacite
+    * records in HDFS.
+    * It should be called before indexing Scholexplorer, to determine the delta
+    * of Datacite records to download, since the span between generating the
+    * raw graph and generating Scholexplorer can reach 20 days
+    * @param spark
+    * @param entitiesPath
+    * @return the last collection date found in the current Scholexplorer graph of Datacite records
+    */
+  def retrieveLastCollectedFrom(spark: SparkSession, entitiesPath: String): Long = {
     log.info("Retrieve last entities collected From")
 
-    implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf]
-    implicit val resultEncoder:Encoder[Result] = Encoders.kryo[Result]
+    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
+    implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
     import spark.implicits._
 
-    val entitiesDS = spark.read.load(s"$entitiesPath/*").as[Oaf].filter(o =>o.isInstanceOf[Result]).map(r => r.asInstanceOf[Result])
+    val entitiesDS = spark.read
+      .load(s"$entitiesPath/*")
+      .as[Oaf]
+      .filter(o => o.isInstanceOf[Result])
+      .map(r => r.asInstanceOf[Result])
 
-    val date = entitiesDS.filter(r => r.getDateofcollection!= null).map(_.getDateofcollection).select(max("value")).first.getString(0)
+    val date = entitiesDS
+      .filter(r => r.getDateofcollection != null)
+      .map(_.getDateofcollection)
+      .select(max("value"))
+      .first
+      .getString(0)
 
     ISO8601toEpochMillis(date) / 1000
   }
 
-
-  /**
-    * The method of update Datacite relationships on Scholexplorer
-    * needs some utilities data structures
-    * One is the scholixResource DS that stores all the nodes in the Scholix Graph
-    * in format ScholixResource
-    * @param summaryPath the path of the summary in Scholix
-    * @param workingPath the working path
-    * @param spark the spark session
-    */
-  def generateScholixResource(summaryPath:String, workingPath: String, spark:SparkSession) :Unit = {
-    implicit val summaryEncoder:Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
-    implicit val scholixResourceEncoder:Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
+  /** Updating the Datacite relationships on Scholexplorer
+    * requires some utility data structures.
+    * One is the scholixResource Dataset, which stores all the nodes of the
+    * Scholix graph in the ScholixResource format
+    * @param summaryPath the path of the summary in Scholix
+    * @param workingPath the working path
+    * @param spark the spark session
+    */
+  def generateScholixResource(
+    summaryPath: String,
+    workingPath: String,
+    spark: SparkSession
+  ): Unit = {
+    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
+    implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
 
     log.info("Convert All summary to ScholixResource")
-    spark.read.load(summaryPath).as[ScholixSummary]
+    spark.read
+      .load(summaryPath)
+      .as[ScholixSummary]
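// Each ScholixSummary is mapped one-to-one to a ScholixResource; entries
// without at least one identifier are dropped before the "_native" snapshot
// is written (see the filter below).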
.map(ScholixUtils.generateScholixResourceFromSummary)(scholixResourceEncoder) - .filter(r => r.getIdentifier!= null && r.getIdentifier.size>0) - .write.mode(SaveMode.Overwrite).save(s"${scholixResourcePath(workingPath)}_native") + .filter(r => r.getIdentifier != null && r.getIdentifier.size > 0) + .write + .mode(SaveMode.Overwrite) + .save(s"${scholixResourcePath(workingPath)}_native") } - /** - * This method convert the new Datacite Resource into Scholix Resource - * Needed to fill the source and the type of Scholix Relationships - * @param workingPath the Working Path - * @param spark The spark Session - */ - def addMissingScholixResource(workingPath:String, spark:SparkSession ) :Unit = { - implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf] - implicit val scholixResourceEncoder:Encoder[ScholixResource] = Encoders.kryo[ScholixResource] - implicit val resultEncoder:Encoder[Result] = Encoders.kryo[Result] + /** This method convert the new Datacite Resource into Scholix Resource + * Needed to fill the source and the type of Scholix Relationships + * @param workingPath the Working Path + * @param spark The spark Session + */ + def addMissingScholixResource(workingPath: String, spark: SparkSession): Unit = { + implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource] + implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result] import spark.implicits._ - spark.read.load(dataciteOAFPath(workingPath)).as[Oaf] + spark.read + .load(dataciteOAFPath(workingPath)) + .as[Oaf] .filter(_.isInstanceOf[Result]) .map(_.asInstanceOf[Result]) .map(ScholixUtils.generateScholixResourceFromResult) - .filter(r => r.getIdentifier!= null && r.getIdentifier.size>0) - .write.mode(SaveMode.Overwrite).save(s"${scholixResourcePath(workingPath)}_update") + .filter(r => r.getIdentifier != null && r.getIdentifier.size > 0) + .write + .mode(SaveMode.Overwrite) + .save(s"${scholixResourcePath(workingPath)}_update") val update = spark.read.load(s"${scholixResourcePath(workingPath)}_update").as[ScholixResource] val native = spark.read.load(s"${scholixResourcePath(workingPath)}_native").as[ScholixResource] - val graph = update.union(native) + val graph = update + .union(native) .groupByKey(_.getDnetIdentifier) - .reduceGroups((a,b) => if (a!= null && a.getDnetIdentifier!= null) a else b) + .reduceGroups((a, b) => if (a != null && a.getDnetIdentifier != null) a else b) .map(_._2) graph.write.mode(SaveMode.Overwrite).save(s"${scholixResourcePath(workingPath)}_graph") } + /** This method get and Transform only datacite records with + * timestamp greater than timestamp + * @param datacitePath the datacite input Path + * @param timestamp the timestamp + * @param workingPath the working path where save the generated Dataset + * @param spark SparkSession + * @param vocabularies Vocabularies needed for transformation + */ - /** - * This method get and Transform only datacite records with - * timestamp greater than timestamp - * @param datacitePath the datacite input Path - * @param timestamp the timestamp - * @param workingPath the working path where save the generated Dataset - * @param spark SparkSession - * @param vocabularies Vocabularies needed for transformation - */ - - def getDataciteUpdate(datacitePath:String, timestamp:Long, workingPath:String, spark:SparkSession,vocabularies: VocabularyGroup): Long = { + def getDataciteUpdate( + datacitePath: String, + timestamp: Long, + workingPath: String, + spark: 
SparkSession,
+    vocabularies: VocabularyGroup
+  ): Long = {
     import spark.implicits._
     val ds = spark.read.load(datacitePath).as[DataciteType]
-    implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf]
-    val total = ds.filter(_.timestamp>=timestamp).count()
-    if (total >0) {
+    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
+    val total = ds.filter(_.timestamp >= timestamp).count()
+    if (total > 0) {
       ds.filter(_.timestamp >= timestamp)
-        .flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks = true))
-        .flatMap(i => fixRelations(i)).filter(i => i != null)
-        .write.mode(SaveMode.Overwrite).save(dataciteOAFPath(workingPath))
+        .flatMap(d =>
+          DataciteToOAFTransformation
+            .generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks = true)
+        )
+        .flatMap(i => fixRelations(i))
+        .filter(i => i != null)
+        .write
+        .mode(SaveMode.Overwrite)
+        .save(dataciteOAFPath(workingPath))
     }
     total
   }
 
-  /**
-    * After added the new ScholixResource, we need to update the scholix Pid Map
-    * to intersected with the new Datacite Relations
-
-    * @param workingPath The working Path starting from save the new Map
-    * @param spark the spark session
-    */
-  def generatePidMap(workingPath:String, spark:SparkSession ) :Unit = {
-    implicit val scholixResourceEncoder:Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
+  /** After the new ScholixResources have been added, the Scholix pid map must
+    * be updated and intersected with the new Datacite relations
+    *
+    * @param workingPath the working path under which the new map is saved
+    * @param spark the spark session
+    */
+  def generatePidMap(workingPath: String, spark: SparkSession): Unit = {
+    implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
     import spark.implicits._
-    spark.read.load(s"${scholixResourcePath(workingPath)}_graph").as[ScholixResource]
-      .flatMap(r=>
-        r.getIdentifier.asScala
-          .map(i =>DHPUtils.generateUnresolvedIdentifier(i.getIdentifier, i.getSchema))
-          .map(t =>(t, r.getDnetIdentifier))
-      )(Encoders.tuple(Encoders.STRING, Encoders.STRING))
+    spark.read
+      .load(s"${scholixResourcePath(workingPath)}_graph")
+      .as[ScholixResource]
+      .flatMap(r =>
+        r.getIdentifier.asScala
+          .map(i => DHPUtils.generateUnresolvedIdentifier(i.getIdentifier, i.getSchema))
+          .map(t => (t, r.getDnetIdentifier))
+      )(Encoders.tuple(Encoders.STRING, Encoders.STRING))
      .groupByKey(_._1)
-      .reduceGroups((a,b) => if (a!= null && a._2!= null) a else b)
+      .reduceGroups((a, b) => if (a != null && a._2 != null) a else b)
      .map(_._2)(Encoders.tuple(Encoders.STRING, Encoders.STRING))
-      .write.mode(SaveMode.Overwrite).save(pidMapPath(workingPath))
+      .write
+      .mode(SaveMode.Overwrite)
+      .save(pidMapPath(workingPath))
   }
 
-  /**
-    * This method resolve the datacite relation and filter the resolved
-    * relation
-    * @param workingPath the working path
-    * @param spark the spark session
-    */
+  /** Resolves the Datacite relations and keeps only those whose source and
+    * target were both resolved
+    * @param workingPath the working path
+    * @param spark the spark session
+    */
-  def resolveUpdateRelation(workingPath:String, spark:SparkSession) :Unit = {
-    implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf]
-    implicit val relationEncoder:Encoder[Relation] = Encoders.kryo[Relation]
+  def resolveUpdateRelation(workingPath: String, spark: SparkSession): Unit = {
+    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
+    implicit val relationEncoder: Encoder[Relation] = Encoders.kryo[Relation]
     import
spark.implicits._ - val pidMap = spark.read.load(pidMapPath(workingPath)).as[(String,String)] + val pidMap = spark.read.load(pidMapPath(workingPath)).as[(String, String)] - val unresolvedRelations:Dataset[(String,Relation)] = spark.read.load(dataciteOAFPath(workingPath)).as[Oaf] + val unresolvedRelations: Dataset[(String, Relation)] = spark.read + .load(dataciteOAFPath(workingPath)) + .as[Oaf] .filter(_.isInstanceOf[Relation]) .map(_.asInstanceOf[Relation]) .map { r => @@ -193,7 +222,7 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L unresolvedRelations .joinWith(pidMap, unresolvedRelations("_1").equalTo(pidMap("_1"))) .map(t => { - val r =t._1._2 + val r = t._1._2 val resolvedIdentifier = t._2._2 if (r.getSource.startsWith("unresolved")) r.setSource(resolvedIdentifier) @@ -201,56 +230,62 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L r.setTarget(resolvedIdentifier) r })(relationEncoder) - .filter(r => !(r.getSource.startsWith("unresolved") || r.getTarget.startsWith("unresolved") )) - .write.mode(SaveMode.Overwrite) + .filter(r => !(r.getSource.startsWith("unresolved") || r.getTarget.startsWith("unresolved"))) + .write + .mode(SaveMode.Overwrite) .save(resolvedRelationPath(workingPath)) } + /** This method generate scholix starting from resolved relation + * + * @param workingPath + * @param spark + */ + def generateScholixUpdate(workingPath: String, spark: SparkSession): Unit = { + implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix] + implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource] + implicit val relationEncoder: Encoder[Relation] = Encoders.kryo[Relation] + implicit val intermediateEncoder: Encoder[(String, Scholix)] = + Encoders.tuple(Encoders.STRING, scholixEncoder) + val relations: Dataset[(String, Relation)] = spark.read + .load(resolvedRelationPath(workingPath)) + .as[Relation] + .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relationEncoder)) - /** - * This method generate scholix starting from resolved relation - * - * - * @param workingPath - * @param spark - */ - def generateScholixUpdate(workingPath:String, spark:SparkSession) :Unit = { - implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf] - implicit val scholixEncoder:Encoder[Scholix] = Encoders.kryo[Scholix] - implicit val scholixResourceEncoder:Encoder[ScholixResource] = Encoders.kryo[ScholixResource] - implicit val relationEncoder:Encoder[Relation] = Encoders.kryo[Relation] - implicit val intermediateEncoder :Encoder[(String,Scholix)] = Encoders.tuple(Encoders.STRING, scholixEncoder) - - - val relations:Dataset[(String, Relation)] = spark.read.load(resolvedRelationPath(workingPath)).as[Relation].map(r =>(r.getSource,r))(Encoders.tuple(Encoders.STRING, relationEncoder)) - - val id_summary:Dataset[(String,ScholixResource)] = spark.read.load(s"${scholixResourcePath(workingPath)}_graph").as[ScholixResource].map(r => (r.getDnetIdentifier,r))(Encoders.tuple(Encoders.STRING, scholixResourceEncoder)) + val id_summary: Dataset[(String, ScholixResource)] = spark.read + .load(s"${scholixResourcePath(workingPath)}_graph") + .as[ScholixResource] + .map(r => (r.getDnetIdentifier, r))(Encoders.tuple(Encoders.STRING, scholixResourceEncoder)) id_summary.cache() - relations.joinWith(id_summary, relations("_1").equalTo(id_summary("_1")),"inner") - .map(t => (t._1._2.getTarget,ScholixUtils.scholixFromSource(t._1._2, t._2._2))) - 
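// Scholix generation happens in two joins: relations joined with the resource
// graph on the source id produce the "one verse" half below, which is then
// joined again on the target id to obtain the complete Scholix records.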
.write.mode(SaveMode.Overwrite).save(s"$workingPath/scholix_one_verse") + relations + .joinWith(id_summary, relations("_1").equalTo(id_summary("_1")), "inner") + .map(t => (t._1._2.getTarget, ScholixUtils.scholixFromSource(t._1._2, t._2._2))) + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/scholix_one_verse") - val source_scholix:Dataset[(String, Scholix)] =spark.read.load(s"$workingPath/scholix_one_verse").as[(String,Scholix)] + val source_scholix: Dataset[(String, Scholix)] = + spark.read.load(s"$workingPath/scholix_one_verse").as[(String, Scholix)] - source_scholix.joinWith(id_summary, source_scholix("_1").equalTo(id_summary("_1")),"inner") + source_scholix + .joinWith(id_summary, source_scholix("_1").equalTo(id_summary("_1")), "inner") .map(t => { - val target:ScholixResource =t._2._2 - val scholix:Scholix = t._1._2 - ScholixUtils.generateCompleteScholix(scholix,target) - })(scholixEncoder).write.mode(SaveMode.Overwrite).save(s"$workingPath/scholix") + val target: ScholixResource = t._2._2 + val scholix: Scholix = t._1._2 + ScholixUtils.generateCompleteScholix(scholix, target) + })(scholixEncoder) + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/scholix") } - - - - - /** - * Here all the spark applications runs this method - * where the whole logic of the spark node is defined - */ + /** Here all the spark applications runs this method + * where the whole logic of the spark node is defined + */ override def run(): Unit = { val sourcePath = parser.get("sourcePath") log.info(s"SourcePath is '$sourcePath'") @@ -258,7 +293,7 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L val datacitePath = parser.get("datacitePath") log.info(s"DatacitePath is '$datacitePath'") - val workingPath = parser.get("workingSupportPath") + val workingPath = parser.get("workingSupportPath") log.info(s"workingPath is '$workingPath'") val isLookupUrl: String = parser.get("isLookupUrl") @@ -268,38 +303,43 @@ class SparkRetrieveDataciteDelta (propertyPath:String, args:Array[String], log:L val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService) require(vocabularies != null) - - val updateDS:Boolean = "true".equalsIgnoreCase(parser.get("updateDS")) + val updateDS: Boolean = "true".equalsIgnoreCase(parser.get("updateDS")) log.info(s"updateDS is '$updateDS'") var lastCollectionDate = 0L if (updateDS) { generateScholixResource(s"$sourcePath/provision/summaries", workingPath, spark) log.info("Retrieve last entities collected From starting from scholix Graph") - lastCollectionDate = retrieveLastCollectedFrom(spark, s"$sourcePath/entities") - } - else { + lastCollectionDate = retrieveLastCollectedFrom(spark, s"$sourcePath/entities") + } else { val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) fs.delete(new Path(s"${scholixResourcePath(workingPath)}_native"), true) - fs.rename(new Path(s"${scholixResourcePath(workingPath)}_graph"), new Path(s"${scholixResourcePath(workingPath)}_native")) - lastCollectionDate = retrieveLastCollectedFrom(spark, dataciteOAFPath(workingPath)) + fs.rename( + new Path(s"${scholixResourcePath(workingPath)}_graph"), + new Path(s"${scholixResourcePath(workingPath)}_native") + ) + lastCollectionDate = retrieveLastCollectedFrom(spark, dataciteOAFPath(workingPath)) } - val numRecords = getDataciteUpdate(datacitePath, lastCollectionDate, workingPath, spark, vocabularies) - if (numRecords>0) { - addMissingScholixResource(workingPath,spark) + val numRecords = + getDataciteUpdate(datacitePath, lastCollectionDate, workingPath, 
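// Orchestration sketch of run(): either rebuild the ScholixResource graph from
// the summaries (updateDS = true) or promote the previous "_graph" snapshot to
// "_native"; in both cases lastCollectionDate bounds the Datacite delta fetched
// by getDataciteUpdate: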
spark, vocabularies) + if (numRecords > 0) { + addMissingScholixResource(workingPath, spark) generatePidMap(workingPath, spark) - resolveUpdateRelation(workingPath,spark) + resolveUpdateRelation(workingPath, spark) generateScholixUpdate(workingPath, spark) } } } - object SparkRetrieveDataciteDelta { val log: Logger = LoggerFactory.getLogger(SparkRetrieveDataciteDelta.getClass) def main(args: Array[String]): Unit = { - new SparkRetrieveDataciteDelta("/eu/dnetlib/dhp/sx/graph/retrieve_datacite_delta_params.json", args, log).initialize().run() + new SparkRetrieveDataciteDelta( + "/eu/dnetlib/dhp/sx/graph/retrieve_datacite_delta_params.json", + args, + log + ).initialize().run() } } diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala index 5bb6ba67d..ca1dbc665 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala @@ -1,6 +1,5 @@ package eu.dnetlib.dhp.datacite - import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature} import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest import eu.dnetlib.dhp.schema.oaf.Oaf @@ -20,95 +19,90 @@ import java.util.Locale import scala.io.Source @ExtendWith(Array(classOf[MockitoExtension])) -class DataciteToOAFTest extends AbstractVocabularyTest{ +class DataciteToOAFTest extends AbstractVocabularyTest { - private var workingDir:Path = null + private var workingDir: Path = null val log: Logger = LoggerFactory.getLogger(getClass) @BeforeEach - def setUp() :Unit = { + def setUp(): Unit = { - workingDir= Files.createTempDirectory(getClass.getSimpleName) + workingDir = Files.createTempDirectory(getClass.getSimpleName) super.setUpVocabulary() } @AfterEach - def tearDown() :Unit = { + def tearDown(): Unit = { FileUtils.deleteDirectory(workingDir.toFile) } - @Test - def testDateMapping:Unit = { + def testDateMapping: Unit = { val inputDate = "2021-07-14T11:52:54+0000" val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US) val dt = ISO8601FORMAT.parse(inputDate) println(dt.getTime) - } - @Test def testConvert(): Unit = { - val path = getClass.getResource("/eu/dnetlib/dhp/actionmanager/datacite/dataset").getPath val conf = new SparkConf() - val spark:SparkSession = SparkSession.builder().config(conf) + val spark: SparkSession = SparkSession + .builder() + .config(conf) .appName(getClass.getSimpleName) .master("local[*]") .getOrCreate() - - - implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] val instance = new GenerateDataciteDatasetSpark(null, null, log) val targetPath = s"$workingDir/result" - instance.generateDataciteDataset(path, exportLinks = true, vocabularies,targetPath, spark) + instance.generateDataciteDataset(path, exportLinks = true, vocabularies, targetPath, spark) import spark.implicits._ - val nativeSize =spark.read.load(path).count() - + val nativeSize = spark.read.load(path).count() assertEquals(100, nativeSize) - val result:Dataset[Oaf] = spark.read.load(targetPath).as[Oaf] + val result: Dataset[Oaf] = spark.read.load(targetPath).as[Oaf] - - result.map(s => s.getClass.getSimpleName).groupBy(col("value").alias("class")).agg(count("value").alias("Total")).show(false) + result + .map(s => s.getClass.getSimpleName) + .groupBy(col("value").alias("class")) + 
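// The aggregation below only reports, per Oaf subclass name, how many records
// the transformation produced out of the 100 native Datacite test records.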
.agg(count("value").alias("Total")) + .show(false) val t = spark.read.load(targetPath).count() - assertTrue(t >0) - + assertTrue(t > 0) spark.stop() - - - } - @Test - def testMapping() :Unit = { - val record =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/record.json")).mkString + def testMapping(): Unit = { + val record = Source + .fromInputStream( + getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/record.json") + ) + .mkString val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT) - val res:List[Oaf] =DataciteToOAFTransformation.generateOAF(record, 0L,0L, vocabularies, true ) + val res: List[Oaf] = DataciteToOAFTransformation.generateOAF(record, 0L, 0L, vocabularies, true) res.foreach(r => { - println (mapper.writeValueAsString(r)) + println(mapper.writeValueAsString(r)) println("----------------------------") }) - - } -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala index 893a6e628..ea742a04a 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala @@ -20,14 +20,13 @@ import scala.io.Source import scala.xml.pull.XMLEventReader @ExtendWith(Array(classOf[MockitoExtension])) -class BioScholixTest extends AbstractVocabularyTest{ - +class BioScholixTest extends AbstractVocabularyTest { val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT) - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES,false) + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) @BeforeEach - def setUp() :Unit = { + def setUp(): Unit = { super.setUpVocabulary() } @@ -38,52 +37,54 @@ class BioScholixTest extends AbstractVocabularyTest{ } object GzFileIterator { + def apply(is: InputStream, encoding: String) = { new BufferedReaderIterator( - new BufferedReader( - new InputStreamReader( - new GZIPInputStream( - is), encoding))) + new BufferedReader(new InputStreamReader(new GZIPInputStream(is), encoding)) + ) } } - - - @Test def testEBIData() = { - val inputXML = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")).mkString + val inputXML = Source + .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) + .mkString val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes())) - new PMParser(xml).foreach(s =>println(mapper.writeValueAsString(s))) + new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s))) } - @Test def testPubmedToOaf(): Unit = { assertNotNull(vocabularies) assertTrue(vocabularies.vocabularyExists("dnet:publication_resource")) - val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump")).mkString - val r:List[Oaf] = records.lines.toList.map(s=>mapper.readValue(s, classOf[PMArticle])).map(a => PubMedToOaf.convert(a, vocabularies)) + val records: String = Source + .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump")) + .mkString + val r: List[Oaf] = records.lines.toList + .map(s => mapper.readValue(s, classOf[PMArticle])) + .map(a => PubMedToOaf.convert(a, vocabularies)) assertEquals(10, r.size) - assertTrue(r.map(p => p.asInstanceOf[Result]).flatMap(p => 
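// The (reformatted) assertion below verifies that at least one mapped instance
// carries the instancetype classid "0037", resolved through the loaded
// dnet:publication_resource vocabulary.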
p.getInstance().asScala.map(i => i.getInstancetype.getClassid)).exists(p => "0037".equalsIgnoreCase(p))) + assertTrue( + r.map(p => p.asInstanceOf[Result]) + .flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid)) + .exists(p => "0037".equalsIgnoreCase(p)) + ) println(mapper.writeValueAsString(r.head)) - - } - @Test - def testPDBToOAF():Unit = { + def testPDBToOAF(): Unit = { assertNotNull(vocabularies) assertTrue(vocabularies.vocabularyExists("dnet:publication_resource")) - val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pdb_dump")).mkString + val records: String = Source + .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pdb_dump")) + .mkString records.lines.foreach(s => assertTrue(s.nonEmpty)) - val result:List[Oaf]= records.lines.toList.flatMap(o => BioDBToOAF.pdbTOOaf(o)) - - + val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.pdbTOOaf(o)) assertTrue(result.nonEmpty) result.foreach(r => assertNotNull(r)) @@ -93,19 +94,18 @@ class BioScholixTest extends AbstractVocabularyTest{ } - @Test - def testUNIprotToOAF():Unit = { + def testUNIprotToOAF(): Unit = { assertNotNull(vocabularies) assertTrue(vocabularies.vocabularyExists("dnet:publication_resource")) - val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump")).mkString + val records: String = Source + .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump")) + .mkString records.lines.foreach(s => assertTrue(s.nonEmpty)) - val result:List[Oaf]= records.lines.toList.flatMap(o => BioDBToOAF.uniprotToOAF(o)) - - + val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.uniprotToOAF(o)) assertTrue(result.nonEmpty) result.foreach(r => assertNotNull(r)) @@ -115,35 +115,42 @@ class BioScholixTest extends AbstractVocabularyTest{ } - case class EBILinks(relType:String, date:String, title:String, pmid:String, targetPid:String, targetPidType:String) {} + case class EBILinks( + relType: String, + date: String, + title: String, + pmid: String, + targetPid: String, + targetPidType: String + ) {} - def parse_ebi_links(input:String):List[EBILinks] ={ + def parse_ebi_links(input: String): List[EBILinks] = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json = parse(input) - val pmid = (json \ "publication" \"pmid").extract[String] + val pmid = (json \ "publication" \ "pmid").extract[String] for { - JObject(link) <- json \\ "Link" - JField("Target",JObject(target)) <- link - JField("RelationshipType",JObject(relType)) <- link - JField("Name", JString(relation)) <- relType - JField("PublicationDate",JString(publicationDate)) <- link - JField("Title", JString(title)) <- target - JField("Identifier",JObject(identifier)) <- target - JField("IDScheme", JString(idScheme)) <- identifier - JField("ID", JString(id)) <- identifier + JObject(link) <- json \\ "Link" + JField("Target", JObject(target)) <- link + JField("RelationshipType", JObject(relType)) <- link + JField("Name", JString(relation)) <- relType + JField("PublicationDate", JString(publicationDate)) <- link + JField("Title", JString(title)) <- target + JField("Identifier", JObject(identifier)) <- target + JField("IDScheme", JString(idScheme)) <- identifier + JField("ID", JString(id)) <- identifier } yield EBILinks(relation, publicationDate, title, pmid, id, idScheme) } - @Test - def testCrossrefLinksToOAF():Unit = { + def 
testCrossrefLinksToOAF(): Unit = { - val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/crossref_links")).mkString + val records: String = Source + .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/crossref_links")) + .mkString records.lines.foreach(s => assertTrue(s.nonEmpty)) - - val result:List[Oaf] =records.lines.map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList + val result: List[Oaf] = records.lines.map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList assertNotNull(result) assertTrue(result.nonEmpty) @@ -153,36 +160,41 @@ class BioScholixTest extends AbstractVocabularyTest{ } @Test - def testEBILinksToOAF():Unit = { - val iterator = GzFileIterator(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/ebi_links.gz"), "UTF-8") + def testEBILinksToOAF(): Unit = { + val iterator = GzFileIterator( + getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/ebi_links.gz"), + "UTF-8" + ) val data = iterator.next() - val res = BioDBToOAF.parse_ebi_links(BioDBToOAF.extractEBILinksFromDump(data).links).filter(BioDBToOAF.EBITargetLinksFilter).flatMap(BioDBToOAF.convertEBILinksToOaf) + val res = BioDBToOAF + .parse_ebi_links(BioDBToOAF.extractEBILinksFromDump(data).links) + .filter(BioDBToOAF.EBITargetLinksFilter) + .flatMap(BioDBToOAF.convertEBILinksToOaf) print(res.length) - println(mapper.writeValueAsString(res.head)) } - - - @Test - def scholixResolvedToOAF():Unit ={ + def scholixResolvedToOAF(): Unit = { - val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved")).mkString + val records: String = Source + .fromInputStream( + getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved") + ) + .mkString records.lines.foreach(s => assertTrue(s.nonEmpty)) implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats - val l:List[ScholixResolved] = records.lines.map{input => + val l: List[ScholixResolved] = records.lines.map { input => lazy val json = parse(input) json.extract[ScholixResolved] }.toList - - val result:List[Oaf] = l.map(s => BioDBToOAF.scholixResolvedToOAF(s)) + val result: List[Oaf] = l.map(s => BioDBToOAF.scholixResolvedToOAF(s)) assertTrue(result.nonEmpty) } diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala index 3822f40b5..20471973a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala @@ -16,10 +16,22 @@ import java.time.LocalDate import java.time.format.DateTimeFormatter import scala.collection.JavaConverters._ +case class HostedByItemType( + id: String, + officialname: String, + issn: String, + eissn: String, + lissn: String, + openAccess: Boolean +) {} -case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {} - -case class DoiBoostAffiliation(PaperId:Long, AffiliationId:Long, GridId:Option[String], OfficialPage:Option[String], DisplayName:Option[String]){} +case class DoiBoostAffiliation( + PaperId: Long, + AffiliationId: Long, + GridId: Option[String], + OfficialPage: Option[String], + DisplayName: Option[String] +) {} object DoiBoostMappingUtil { @@ -43,9 +55,19 @@ object DoiBoostMappingUtil { val DOI_PREFIX_REGEX = "(^10\\.|\\/10.)" val DOI_PREFIX = 
"10." - val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;") + val invalidName = List( + ",", + "none none", + "none, none", + "none &na;", + "(:null)", + "test test test", + "test test", + "test", + "&na; &na;" + ) - def toActionSet(item:Oaf) :(String, String) = { + def toActionSet(item: Oaf): (String, String) = { val mapper = new ObjectMapper() item match { @@ -75,59 +97,56 @@ object DoiBoostMappingUtil { } - - def toHostedByItem(input:String): (String, HostedByItemType) = { + def toHostedByItem(input: String): (String, HostedByItemType) = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) - val c :Map[String,HostedByItemType] = json.extract[Map[String, HostedByItemType]] + val c: Map[String, HostedByItemType] = json.extract[Map[String, HostedByItemType]] (c.keys.head, c.values.head) } - - def toISSNPair(publication: Publication) : (String, Publication) = { + def toISSNPair(publication: Publication): (String, Publication) = { val issn = if (publication.getJournal == null) null else publication.getJournal.getIssnPrinted - val eissn =if (publication.getJournal == null) null else publication.getJournal.getIssnOnline - val lissn =if (publication.getJournal == null) null else publication.getJournal.getIssnLinking + val eissn = if (publication.getJournal == null) null else publication.getJournal.getIssnOnline + val lissn = if (publication.getJournal == null) null else publication.getJournal.getIssnLinking - if (issn!= null && issn.nonEmpty) + if (issn != null && issn.nonEmpty) (issn, publication) - else if(eissn!= null && eissn.nonEmpty) + else if (eissn != null && eissn.nonEmpty) (eissn, publication) - else if(lissn!= null && lissn.nonEmpty) + else if (lissn != null && lissn.nonEmpty) (lissn, publication) else (publication.getId, publication) } - - - - def generateGridAffiliationId(gridId:String) :String = { + def generateGridAffiliationId(gridId: String): String = { s"20|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}" } - - def fixResult(result: Dataset) :Dataset = { + def fixResult(result: Dataset): Dataset = { val instanceType = extractInstance(result) if (instanceType.isDefined) { result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype)) } - result.getInstance().asScala.foreach(i => { - i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY) - }) + result + .getInstance() + .asScala + .foreach(i => { + i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY) + }) result } - - def decideAccessRight(lic : Field[String], date:String) : AccessRight = { - if(lic == null){ + def decideAccessRight(lic: Field[String], date: String): AccessRight = { + if (lic == null) { //Default value Unknown return getUnknownQualifier() } - val license : String = lic.getValue + val license: String = lic.getValue //CC licenses - if(license.startsWith("cc") || + if ( + license.startsWith("cc") || license.startsWith("http://creativecommons.org/licenses") || license.startsWith("https://creativecommons.org/licenses") || @@ -137,40 +156,44 @@ object DoiBoostMappingUtil { license.equals("http://pubs.acs.org/page/policy/authorchoice_ccbyncnd_termsofuse.html") || //APA (considered OPEN also by Unpaywall) - license.equals("http://www.apa.org/pubs/journals/resources/open-access.aspx")){ + license.equals("http://www.apa.org/pubs/journals/resources/open-access.aspx") + ) { - val oaq : AccessRight = getOpenAccessQualifier() + val oaq: AccessRight 
= getOpenAccessQualifier() oaq.setOpenAccessRoute(OpenAccessRoute.hybrid) return oaq } //OUP (BUT ONLY AFTER 12 MONTHS FROM THE PUBLICATION DATE, OTHERWISE THEY ARE EMBARGOED) - if(license.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model")){ + if ( + license.equals( + "https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model" + ) + ) { val now = java.time.LocalDate.now - try{ + try { val pub_date = LocalDate.parse(date, DateTimeFormatter.ofPattern("yyyy-MM-dd")) - if (((now.toEpochDay - pub_date.toEpochDay)/365.0) > 1){ - val oaq : AccessRight = getOpenAccessQualifier() + if (((now.toEpochDay - pub_date.toEpochDay) / 365.0) > 1) { + val oaq: AccessRight = getOpenAccessQualifier() oaq.setOpenAccessRoute(OpenAccessRoute.hybrid) return oaq - } - else{ + } else { return getEmbargoedAccessQualifier() } - }catch { + } catch { case e: Exception => { - try{ - val pub_date = LocalDate.parse(date, DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'")) - if (((now.toEpochDay - pub_date.toEpochDay)/365.0) > 1){ - val oaq : AccessRight = getOpenAccessQualifier() - oaq.setOpenAccessRoute(OpenAccessRoute.hybrid) - return oaq - } - else{ - return getEmbargoedAccessQualifier() - } - }catch{ + try { + val pub_date = + LocalDate.parse(date, DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'")) + if (((now.toEpochDay - pub_date.toEpochDay) / 365.0) > 1) { + val oaq: AccessRight = getOpenAccessQualifier() + oaq.setOpenAccessRoute(OpenAccessRoute.hybrid) + return oaq + } else { + return getEmbargoedAccessQualifier() + } + } catch { case ex: Exception => return getClosedAccessQualifier() } } @@ -183,64 +206,91 @@ object DoiBoostMappingUtil { } + def getOpenAccessQualifier(): AccessRight = { - - def getOpenAccessQualifier():AccessRight = { - - OafMapperUtils.accessRight(ModelConstants.ACCESS_RIGHT_OPEN,"Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) + OafMapperUtils.accessRight( + ModelConstants.ACCESS_RIGHT_OPEN, + "Open Access", + ModelConstants.DNET_ACCESS_MODES, + ModelConstants.DNET_ACCESS_MODES + ) } - def getRestrictedQualifier():AccessRight = { - OafMapperUtils.accessRight( "RESTRICTED","Restricted",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) + def getRestrictedQualifier(): AccessRight = { + OafMapperUtils.accessRight( + "RESTRICTED", + "Restricted", + ModelConstants.DNET_ACCESS_MODES, + ModelConstants.DNET_ACCESS_MODES + ) } - - def getUnknownQualifier():AccessRight = { - OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) + def getUnknownQualifier(): AccessRight = { + OafMapperUtils.accessRight( + ModelConstants.UNKNOWN, + ModelConstants.NOT_AVAILABLE, + ModelConstants.DNET_ACCESS_MODES, + ModelConstants.DNET_ACCESS_MODES + ) } - - def getEmbargoedAccessQualifier():AccessRight = { - OafMapperUtils.accessRight("EMBARGO","Embargo",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) + def getEmbargoedAccessQualifier(): AccessRight = { + OafMapperUtils.accessRight( + "EMBARGO", + "Embargo", + ModelConstants.DNET_ACCESS_MODES, + ModelConstants.DNET_ACCESS_MODES + ) } - def getClosedAccessQualifier():AccessRight = { - OafMapperUtils.accessRight("CLOSED","Closed Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) + def getClosedAccessQualifier(): AccessRight = { + OafMapperUtils.accessRight( + "CLOSED", + 
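// These helpers build the DNET access-right qualifiers used by decideAccessRight
// above: CC-style licenses map to OPEN with a hybrid route, the OUP license is
// OPEN only 12+ months after publication (EMBARGO before that, CLOSED when both
// date formats fail to parse), and a missing license yields UNKNOWN.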
"Closed Access", + ModelConstants.DNET_ACCESS_MODES, + ModelConstants.DNET_ACCESS_MODES + ) } - - def extractInstance(r:Result):Option[Instance] = { - r.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty) + def extractInstance(r: Result): Option[Instance] = { + r.getInstance() + .asScala + .find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty) } - def fixPublication(input:((String,Publication), (String,HostedByItemType))): Publication = { + def fixPublication(input: ((String, Publication), (String, HostedByItemType))): Publication = { val publication = input._1._2 val item = if (input._2 != null) input._2._2 else null - val instanceType:Option[Instance] = extractInstance(publication) + val instanceType: Option[Instance] = extractInstance(publication) if (instanceType.isDefined) { - publication.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype)) + publication + .getInstance() + .asScala + .foreach(i => i.setInstancetype(instanceType.get.getInstancetype)) } - publication.getInstance().asScala.foreach(i => { - var hb = new KeyValue - if (item != null) { - hb.setValue(item.officialname) - hb.setKey(item.id) - if (item.openAccess) { - i.setAccessright(getOpenAccessQualifier()) - i.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold) - } + publication + .getInstance() + .asScala + .foreach(i => { + var hb = new KeyValue + if (item != null) { + hb.setValue(item.officialname) + hb.setKey(item.id) + if (item.openAccess) { + i.setAccessright(getOpenAccessQualifier()) + i.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold) + } - } - else { - hb = ModelConstants.UNKNOWN_REPOSITORY - } - i.setHostedby(hb) - }) + } else { + hb = ModelConstants.UNKNOWN_REPOSITORY + } + i.setHostedby(hb) + }) publication.setBestaccessright(OafMapperUtils.createBestAccessRights(publication.getInstance())) @@ -270,17 +320,22 @@ object DoiBoostMappingUtil { if (publication.getTitle == null || publication.getTitle.size == 0) return false - - val s = publication.getTitle.asScala.count(p => p.getValue != null - && p.getValue.nonEmpty && !p.getValue.equalsIgnoreCase("[NO TITLE AVAILABLE]")) + val s = publication.getTitle.asScala.count(p => + p.getValue != null + && p.getValue.nonEmpty && !p.getValue.equalsIgnoreCase("[NO TITLE AVAILABLE]") + ) if (s == 0) return false // fixes #4360 (test publisher) - val publisher = if (publication.getPublisher != null) publication.getPublisher.getValue else null + val publisher = + if (publication.getPublisher != null) publication.getPublisher.getValue else null - if (publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher.equalsIgnoreCase("CrossRef Test Account"))) { + if ( + publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher + .equalsIgnoreCase("CrossRef Test Account")) + ) { return false; } @@ -288,18 +343,12 @@ object DoiBoostMappingUtil { if (publication.getAuthor == null || publication.getAuthor.size() == 0) return false - //filter invalid author val authors = publication.getAuthor.asScala.map(s => { if (s.getFullname.nonEmpty) { s.getFullname - } - else - s"${ - s.getName - } ${ - s.getSurname - }" + } else + s"${s.getName} ${s.getSurname}" }) val c = authors.count(isValidAuthorName) @@ -307,13 +356,16 @@ object DoiBoostMappingUtil { return false // fixes #4368 - if (authors.count(s => s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase(publication.getPublisher.getValue)) + if ( + authors.count(s => 
s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase( + publication.getPublisher.getValue + ) + ) return false true } - def isValidAuthorName(fullName: String): Boolean = { if (fullName == null || fullName.isEmpty) return false @@ -322,32 +374,47 @@ object DoiBoostMappingUtil { true } - def generateDataInfo(trust: String): DataInfo = { val di = new DataInfo di.setDeletedbyinference(false) di.setInferred(false) di.setInvisible(false) di.setTrust(trust) - di.setProvenanceaction(OafMapperUtils.qualifier(ModelConstants.SYSIMPORT_ACTIONSET,ModelConstants.SYSIMPORT_ACTIONSET, ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS)) + di.setProvenanceaction( + OafMapperUtils.qualifier( + ModelConstants.SYSIMPORT_ACTIONSET, + ModelConstants.SYSIMPORT_ACTIONSET, + ModelConstants.DNET_PROVENANCE_ACTIONS, + ModelConstants.DNET_PROVENANCE_ACTIONS + ) + ) di } - - - def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String): StructuredProperty = { + def createSP( + value: String, + classId: String, + className: String, + schemeId: String, + schemeName: String + ): StructuredProperty = { val sp = new StructuredProperty - sp.setQualifier(OafMapperUtils.qualifier(classId,className, schemeId, schemeName)) + sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName)) sp.setValue(value) sp } - - - def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String, dataInfo: DataInfo): StructuredProperty = { + def createSP( + value: String, + classId: String, + className: String, + schemeId: String, + schemeName: String, + dataInfo: DataInfo + ): StructuredProperty = { val sp = new StructuredProperty - sp.setQualifier(OafMapperUtils.qualifier(classId,className, schemeId, schemeName)) + sp.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName)) sp.setValue(value) sp.setDataInfo(dataInfo) sp @@ -356,17 +423,20 @@ object DoiBoostMappingUtil { def createSP(value: String, classId: String, schemeId: String): StructuredProperty = { val sp = new StructuredProperty - sp.setQualifier(OafMapperUtils.qualifier(classId,classId, schemeId, schemeId)) + sp.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId)) sp.setValue(value) sp } - - - def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = { + def createSP( + value: String, + classId: String, + schemeId: String, + dataInfo: DataInfo + ): StructuredProperty = { val sp = new StructuredProperty - sp.setQualifier(OafMapperUtils.qualifier(classId,classId, schemeId, schemeId)) + sp.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId)) sp.setValue(value) sp.setDataInfo(dataInfo) sp @@ -382,7 +452,6 @@ object DoiBoostMappingUtil { } - def createUnpayWallCollectedFrom(): KeyValue = { val cf = new KeyValue @@ -401,15 +470,11 @@ object DoiBoostMappingUtil { } - - def generateIdentifier (oaf: Result, doi: String): String = { - val id = DHPUtils.md5 (doi.toLowerCase) + def generateIdentifier(oaf: Result, doi: String): String = { + val id = DHPUtils.md5(doi.toLowerCase) s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}" } - - - def createMAGCollectedFrom(): KeyValue = { val cf = new KeyValue @@ -424,19 +489,21 @@ object DoiBoostMappingUtil { tmp.setValue(value) tmp - } def isEmpty(x: String) = x == null || x.trim.isEmpty - def normalizeDoi(input : String) :String ={ - if(input == null) + def normalizeDoi(input: String): String = { + if 
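// Worked example (illustrative): normalizeDoi(" https://doi.org/10.1000/ABC ")
// strips whitespace, lowercases, rewrites the "/10." prefix and returns
// "10.1000/abc"; null input or input without any "10." segment returns null,
// starting with the guard below: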
(input == null) return null - val replaced = input.replaceAll("(?:\\n|\\r|\\t|\\s)", "").toLowerCase.replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX) - if (isEmpty(replaced)) + val replaced = input + .replaceAll("(?:\\n|\\r|\\t|\\s)", "") + .toLowerCase + .replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX) + if (isEmpty(replaced)) return null - if(replaced.indexOf("10.") < 0) + if (replaced.indexOf("10.") < 0) return null val ret = replaced.substring(replaced.indexOf("10.")) @@ -446,9 +513,6 @@ object DoiBoostMappingUtil { return ret - } - - } diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala index f13900abe..b6152526d 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala @@ -17,22 +17,29 @@ object SparkGenerateDOIBoostActionSet { def main(args: Array[String]): Unit = { val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json") + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() + .master(parser.get("master")) + .getOrCreate() implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication] implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization] implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset] implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation] - implicit val mapEncoderAS: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING) + implicit val mapEncoderAS: Encoder[(String, String)] = + Encoders.tuple(Encoders.STRING, Encoders.STRING) - implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] = Encoders.kryo[AtomicAction[OafDataset]] + implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] = + Encoders.kryo[AtomicAction[OafDataset]] val dbPublicationPath = parser.get("dbPublicationPath") val dbDatasetPath = parser.get("dbDatasetPath") @@ -41,35 +48,61 @@ object SparkGenerateDOIBoostActionSet { val dbOrganizationPath = parser.get("dbOrganizationPath") val sequenceFilePath = parser.get("sFilePath") - val asDataset = spark.read.load(dbDatasetPath).as[OafDataset] + val asDataset = spark.read + .load(dbDatasetPath) + .as[OafDataset] .filter(p => p != null || p.getId != null) .map(d => DoiBoostMappingUtil.fixResult(d)) - .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) + .map(d => DoiBoostMappingUtil.toActionSet(d))( + Encoders.tuple(Encoders.STRING, Encoders.STRING) + ) - - val asPublication = spark.read.load(dbPublicationPath).as[Publication] + val asPublication = spark.read + .load(dbPublicationPath) + .as[Publication] .filter(p => p != null || p.getId != null) - .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) + .map(d => DoiBoostMappingUtil.toActionSet(d))( + Encoders.tuple(Encoders.STRING, Encoders.STRING) + ) + val asOrganization = spark.read + 
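// Every loaded entity or relation dataset is serialized by
// DoiBoostMappingUtil.toActionSet into (String, String) pairs; the five
// datasets are then unioned and written as a single GZip SequenceFile below.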
.load(dbOrganizationPath) + .as[Organization] + .map(d => DoiBoostMappingUtil.toActionSet(d))( + Encoders.tuple(Encoders.STRING, Encoders.STRING) + ) - val asOrganization = spark.read.load(dbOrganizationPath).as[Organization] - .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) - - - val asCRelation = spark.read.load(crossRefRelation).as[Relation] + val asCRelation = spark.read + .load(crossRefRelation) + .as[Relation] .filter(r => r != null && r.getSource != null && r.getTarget != null) - .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) + .map(d => DoiBoostMappingUtil.toActionSet(d))( + Encoders.tuple(Encoders.STRING, Encoders.STRING) + ) + val asRelAffiliation = spark.read + .load(dbaffiliationRelationPath) + .as[Relation] + .map(d => DoiBoostMappingUtil.toActionSet(d))( + Encoders.tuple(Encoders.STRING, Encoders.STRING) + ) - val asRelAffiliation = spark.read.load(dbaffiliationRelationPath).as[Relation] - .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) - - - val d: Dataset[(String, String)] = asDataset.union(asPublication).union(asOrganization).union(asCRelation).union(asRelAffiliation) - - - d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec]) + val d: Dataset[(String, String)] = asDataset + .union(asPublication) + .union(asOrganization) + .union(asCRelation) + .union(asRelAffiliation) + d.rdd + .repartition(6000) + .map(s => (new Text(s._1), new Text(s._2))) + .saveAsHadoopFile( + s"$sequenceFilePath", + classOf[Text], + classOf[Text], + classOf[SequenceFileOutputFormat[Text, Text]], + classOf[GzipCodec] + ) } diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala index 91fe56cba..9323c994c 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala @@ -15,8 +15,8 @@ import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.jackson.JsonMethods.parse import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ -object SparkGenerateDoiBoost { +object SparkGenerateDoiBoost { def extractIdGRID(input: String): List[(String, String)] = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats @@ -26,28 +26,32 @@ object SparkGenerateDoiBoost { val grids: List[String] = for { - JObject(pid) <- json \ "pid" + JObject(pid) <- json \ "pid" JField("qualifier", JObject(qualifier)) <- pid - JField("classid", JString(classid)) <- qualifier - JField("value", JString(vl)) <- pid + JField("classid", JString(classid)) <- qualifier + JField("value", JString(vl)) <- pid if classid == "GRID" } yield vl grids.map(g => (id, s"unresolved::grid::${g.toLowerCase}"))(collection.breakOut) } - def main(args: Array[String]): Unit = { val logger: Logger = LoggerFactory.getLogger(getClass) val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + 
getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json") + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() + .master(parser.get("master")) + .getOrCreate() import spark.implicits._ @@ -65,8 +69,7 @@ object SparkGenerateDoiBoost { a._2.setId(a._1) return a._2 } - } - else { + } else { if (a != null && a._2 != null) { b.mergeFrom(a._2) b.setId(a._1) @@ -82,8 +85,7 @@ object SparkGenerateDoiBoost { if (b1 == null) { if (b2 != null) return b2 - } - else { + } else { if (b2 != null) { b1.mergeFrom(b2) val authors = AuthorMerger.mergeAuthor(b1.getAuthor, b2.getAuthor) @@ -103,17 +105,19 @@ object SparkGenerateDoiBoost { override def outputEncoder: Encoder[Publication] = Encoders.kryo[Publication] } - implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication] implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization] implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset] - implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub) + implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = + Encoders.tuple(Encoders.STRING, mapEncoderPub) implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation] logger.info("Phase 2) Join Crossref with UnpayWall") - val crossrefPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p => (p.getId, p)) - val uwPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/uwPublication").as[Publication].map(p => (p.getId, p)) + val crossrefPublication: Dataset[(String, Publication)] = + spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p => (p.getId, p)) + val uwPublication: Dataset[(String, Publication)] = + spark.read.load(s"$workingDirPath/uwPublication").as[Publication].map(p => (p.getId, p)) def applyMerge(item: ((String, Publication), (String, Publication))): Publication = { val crossrefPub = item._1._2 @@ -127,86 +131,140 @@ object SparkGenerateDoiBoost { crossrefPub } - crossrefPublication.joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/firstJoin") + crossrefPublication + .joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")), "left") + .map(applyMerge) + .write + .mode(SaveMode.Overwrite) + .save(s"$workingDirPath/firstJoin") logger.info("Phase 3) Join Result with ORCID") - val fj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p)) - val orcidPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p)) - fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/secondJoin") + val fj: Dataset[(String, Publication)] = + spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p)) + val orcidPublication: Dataset[(String, Publication)] = + spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p)) + fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left") + .map(applyMerge) + .write + .mode(SaveMode.Overwrite) + 
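
Phases 2, 3 and 4 in this file all reuse one shape: key both datasets by publication id, left-join, and merge the right record into the left one when present. A minimal plain-Scala sketch of that shape (Pub stands in for Publication; the field-level behavior of mergeFrom is an assumption here, with left-hand fields winning):

// Sketch of the Phase 2-4 left-join-then-merge pattern on plain maps.
object JoinMergeSketch {

  // Stand-in for Publication; assume mergeFrom fills gaps, left side wins.
  case class Pub(id: String, fields: Map[String, String]) {
    def mergeFrom(other: Pub): Pub = Pub(id, other.fields ++ fields)
  }

  def leftJoinMerge(left: Map[String, Pub], right: Map[String, Pub]): Map[String, Pub] =
    left.map { case (id, l) =>
      id -> right.get(id).map(l.mergeFrom).getOrElse(l) // unmatched rows pass through
    }

  def main(args: Array[String]): Unit = {
    val crossref = Map("doi1" -> Pub("doi1", Map("title" -> "T")))
    val unpaywall = Map("doi1" -> Pub("doi1", Map("oa" -> "gold")))
    // The Crossref record keeps its title and gains the open-access field.
    println(leftJoinMerge(crossref, unpaywall)("doi1").fields)
  }
}
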
.save(s"$workingDirPath/secondJoin") logger.info("Phase 4) Join Result with MAG") - val sj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p)) + val sj: Dataset[(String, Publication)] = + spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p)) - val magPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p)) - sj.joinWith(magPublication, sj("_1").equalTo(magPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublication") + val magPublication: Dataset[(String, Publication)] = + spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p)) + sj.joinWith(magPublication, sj("_1").equalTo(magPublication("_1")), "left") + .map(applyMerge) + .write + .mode(SaveMode.Overwrite) + .save(s"$workingDirPath/doiBoostPublication") + val doiBoostPublication: Dataset[(String, Publication)] = spark.read + .load(s"$workingDirPath/doiBoostPublication") + .as[Publication] + .filter(p => DoiBoostMappingUtil.filterPublication(p)) + .map(DoiBoostMappingUtil.toISSNPair)(tupleForJoinEncoder) - val doiBoostPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublication").as[Publication].filter(p => DoiBoostMappingUtil.filterPublication(p)).map(DoiBoostMappingUtil.toISSNPair)(tupleForJoinEncoder) + val hostedByDataset: Dataset[(String, HostedByItemType)] = spark.createDataset( + spark.sparkContext.textFile(hostedByMapPath).map(DoiBoostMappingUtil.toHostedByItem) + ) - val hostedByDataset: Dataset[(String, HostedByItemType)] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath).map(DoiBoostMappingUtil.toHostedByItem)) - - - doiBoostPublication.joinWith(hostedByDataset, doiBoostPublication("_1").equalTo(hostedByDataset("_1")), "left") + doiBoostPublication + .joinWith(hostedByDataset, doiBoostPublication("_1").equalTo(hostedByDataset("_1")), "left") .map(DoiBoostMappingUtil.fixPublication) .map(p => (p.getId, p)) .groupByKey(_._1) .agg(crossrefAggregator.toColumn) .map(p => p._2) - .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationFiltered") + .write + .mode(SaveMode.Overwrite) + .save(s"$workingDirPath/doiBoostPublicationFiltered") val affiliationPath = parser.get("affiliationPath") val paperAffiliationPath = parser.get("paperAffiliationPath") - val affiliation = spark.read.load(affiliationPath).select(col("AffiliationId"), col("GridId"), col("OfficialPage"), col("DisplayName")) - - val paperAffiliation = spark.read.load(paperAffiliationPath).select(col("AffiliationId").alias("affId"), col("PaperId")) + val affiliation = spark.read + .load(affiliationPath) + .select(col("AffiliationId"), col("GridId"), col("OfficialPage"), col("DisplayName")) + val paperAffiliation = spark.read + .load(paperAffiliationPath) + .select(col("AffiliationId").alias("affId"), col("PaperId")) val a: Dataset[DoiBoostAffiliation] = paperAffiliation .joinWith(affiliation, paperAffiliation("affId").equalTo(affiliation("AffiliationId"))) - .select(col("_1.PaperId"), col("_2.AffiliationId"), col("_2.GridId"), col("_2.OfficialPage"), col("_2.DisplayName")).as[DoiBoostAffiliation] + .select( + col("_1.PaperId"), + col("_2.AffiliationId"), + col("_2.GridId"), + col("_2.OfficialPage"), + col("_2.DisplayName") + ) + .as[DoiBoostAffiliation] + val magPubs: Dataset[(String, Publication)] = spark.read + 
.load(s"$workingDirPath/doiBoostPublicationFiltered") + .as[Publication] + .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))( + tupleForJoinEncoder + ) + .filter(s => s._1 != null) - val magPubs: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublicationFiltered").as[Publication] - .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(tupleForJoinEncoder).filter(s => s._1 != null) + magPubs + .joinWith(a, magPubs("_1").equalTo(a("PaperId"))) + .flatMap(item => { + val pub: Publication = item._1._2 + val affiliation = item._2 + val affId: String = + if (affiliation.GridId.isDefined) + s"unresolved::grid::${affiliation.GridId.get.toLowerCase}" + else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString) + val r: Relation = new Relation + r.setSource(pub.getId) + r.setTarget(affId) + r.setRelType(ModelConstants.RESULT_ORGANIZATION) + r.setRelClass(ModelConstants.HAS_AUTHOR_INSTITUTION) + r.setSubRelType(ModelConstants.AFFILIATION) + r.setDataInfo(pub.getDataInfo) + r.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava) + val r1: Relation = new Relation + r1.setTarget(pub.getId) + r1.setSource(affId) + r1.setRelType(ModelConstants.RESULT_ORGANIZATION) + r1.setRelClass(ModelConstants.IS_AUTHOR_INSTITUTION_OF) + r1.setSubRelType(ModelConstants.AFFILIATION) + r1.setDataInfo(pub.getDataInfo) + r1.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava) + List(r, r1) + })(mapEncoderRel) + .write + .mode(SaveMode.Overwrite) + .save(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved") + val unresolvedRels: Dataset[(String, Relation)] = spark.read + .load(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved") + .as[Relation] + .map(r => { - magPubs.joinWith(a, magPubs("_1").equalTo(a("PaperId"))).flatMap(item => { - val pub: Publication = item._1._2 - val affiliation = item._2 - val affId: String = if (affiliation.GridId.isDefined) s"unresolved::grid::${affiliation.GridId.get.toLowerCase}" else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString) - val r: Relation = new Relation - r.setSource(pub.getId) - r.setTarget(affId) - r.setRelType(ModelConstants.RESULT_ORGANIZATION) - r.setRelClass(ModelConstants.HAS_AUTHOR_INSTITUTION) - r.setSubRelType(ModelConstants.AFFILIATION) - r.setDataInfo(pub.getDataInfo) - r.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava) - val r1: Relation = new Relation - r1.setTarget(pub.getId) - r1.setSource(affId) - r1.setRelType(ModelConstants.RESULT_ORGANIZATION) - r1.setRelClass(ModelConstants.IS_AUTHOR_INSTITUTION_OF) - r1.setSubRelType(ModelConstants.AFFILIATION) - r1.setDataInfo(pub.getDataInfo) - r1.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava) - List(r, r1) - })(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved") + if (r.getSource.startsWith("unresolved")) + (r.getSource, r) + else if (r.getTarget.startsWith("unresolved")) + (r.getTarget, r) + else + ("resolved", r) + })(Encoders.tuple(Encoders.STRING, mapEncoderRel)) + val openaireOrganization: Dataset[(String, String)] = spark.read + .text(openaireOrganizationPath) + .as[String] + .flatMap(s => extractIdGRID(s)) + .groupByKey(_._2) + .reduceGroups((x, y) => if (x != null) x else y) + .map(_._2) - val unresolvedRels: Dataset[(String, Relation)] = 
spark.read.load(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved").as[Relation].map(r => { - - if (r.getSource.startsWith("unresolved")) - (r.getSource, r) - else if (r.getTarget.startsWith("unresolved")) - (r.getTarget, r) - else - ("resolved", r) - })(Encoders.tuple(Encoders.STRING, mapEncoderRel)) - - val openaireOrganization: Dataset[(String, String)] = spark.read.text(openaireOrganizationPath).as[String].flatMap(s => extractIdGRID(s)).groupByKey(_._2).reduceGroups((x, y) => if (x != null) x else y).map(_._2) - - unresolvedRels.joinWith(openaireOrganization, unresolvedRels("_1").equalTo(openaireOrganization("_2"))) + unresolvedRels + .joinWith(openaireOrganization, unresolvedRels("_1").equalTo(openaireOrganization("_2"))) .map { x => val currentRels = x._1._2 val currentOrgs = x._2 @@ -216,26 +274,35 @@ object SparkGenerateDoiBoost { else currentRels.setTarget(currentOrgs._1) currentRels - }.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved")).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation") - - magPubs.joinWith(a, magPubs("_1").equalTo(a("PaperId"))).map(item => { - val affiliation = item._2 - if (affiliation.GridId.isEmpty) { - val o = new Organization - o.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava) - o.setDataInfo(DoiBoostMappingUtil.generateDataInfo()) - o.setId(DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString)) - o.setOriginalId(List(affiliation.AffiliationId.toString).asJava) - if (affiliation.DisplayName.nonEmpty) - o.setLegalname(DoiBoostMappingUtil.asField(affiliation.DisplayName.get)) - if (affiliation.OfficialPage.isDefined) - o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get)) - o.setCountry(ModelConstants.UNKNOWN_COUNTRY) - o } - else - null - }).filter(o => o != null).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostOrganization") + .filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved")) + .write + .mode(SaveMode.Overwrite) + .save(s"$workingDirPath/doiBoostPublicationAffiliation") + + magPubs + .joinWith(a, magPubs("_1").equalTo(a("PaperId"))) + .map(item => { + val affiliation = item._2 + if (affiliation.GridId.isEmpty) { + val o = new Organization + o.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava) + o.setDataInfo(DoiBoostMappingUtil.generateDataInfo()) + o.setId(DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString)) + o.setOriginalId(List(affiliation.AffiliationId.toString).asJava) + if (affiliation.DisplayName.nonEmpty) + o.setLegalname(DoiBoostMappingUtil.asField(affiliation.DisplayName.get)) + if (affiliation.OfficialPage.isDefined) + o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get)) + o.setCountry(ModelConstants.UNKNOWN_COUNTRY) + o + } else + null + }) + .filter(o => o != null) + .write + .mode(SaveMode.Overwrite) + .save(s"$workingDirPath/doiBoostOrganization") } } diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index edca4a180..0cb08ea94 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -18,70 +18,74 @@ import scala.collection.JavaConverters._ import 
scala.collection.mutable import scala.util.matching.Regex -case class CrossrefDT(doi: String, json:String, timestamp: Long) {} +case class CrossrefDT(doi: String, json: String, timestamp: Long) {} case class mappingAffiliation(name: String) {} -case class mappingAuthor(given: Option[String], family: String, sequence:Option[String], ORCID: Option[String], affiliation: Option[mappingAffiliation]) {} +case class mappingAuthor( + given: Option[String], + family: String, + sequence: Option[String], + ORCID: Option[String], + affiliation: Option[mappingAffiliation] +) {} case class mappingFunder(name: String, DOI: Option[String], award: Option[List[String]]) {} - case object Crossref2Oaf { val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass) val mappingCrossrefType = Map( - "book-section" -> "publication", - "book" -> "publication", - "book-chapter" -> "publication", - "book-part" -> "publication", - "book-series" -> "publication", - "book-set" -> "publication", - "book-track" -> "publication", - "edited-book" -> "publication", - "reference-book" -> "publication", - "monograph" -> "publication", - "journal-article" -> "publication", - "dissertation" -> "publication", - "other" -> "publication", - "peer-review" -> "publication", - "proceedings" -> "publication", + "book-section" -> "publication", + "book" -> "publication", + "book-chapter" -> "publication", + "book-part" -> "publication", + "book-series" -> "publication", + "book-set" -> "publication", + "book-track" -> "publication", + "edited-book" -> "publication", + "reference-book" -> "publication", + "monograph" -> "publication", + "journal-article" -> "publication", + "dissertation" -> "publication", + "other" -> "publication", + "peer-review" -> "publication", + "proceedings" -> "publication", "proceedings-article" -> "publication", - "reference-entry" -> "publication", - "report" -> "publication", - "report-series" -> "publication", - "standard" -> "publication", - "standard-series" -> "publication", - "posted-content" -> "publication", - "dataset" -> "dataset" + "reference-entry" -> "publication", + "report" -> "publication", + "report-series" -> "publication", + "standard" -> "publication", + "standard-series" -> "publication", + "posted-content" -> "publication", + "dataset" -> "dataset" ) - val mappingCrossrefSubType = Map( - "book-section" -> "0013 Part of book or chapter of book", - "book" -> "0002 Book", - "book-chapter" -> "0013 Part of book or chapter of book", - "book-part" -> "0013 Part of book or chapter of book", - "book-series" -> "0002 Book", - "book-set" -> "0002 Book", - "book-track" -> "0002 Book", - "edited-book" -> "0002 Book", - "reference-book" -> "0002 Book", - "monograph" -> "0002 Book", - "journal-article" -> "0001 Article", - "dissertation" -> "0044 Thesis", - "other" -> "0038 Other literature type", - "peer-review" -> "0015 Review", - "proceedings" -> "0004 Conference object", + "book-section" -> "0013 Part of book or chapter of book", + "book" -> "0002 Book", + "book-chapter" -> "0013 Part of book or chapter of book", + "book-part" -> "0013 Part of book or chapter of book", + "book-series" -> "0002 Book", + "book-set" -> "0002 Book", + "book-track" -> "0002 Book", + "edited-book" -> "0002 Book", + "reference-book" -> "0002 Book", + "monograph" -> "0002 Book", + "journal-article" -> "0001 Article", + "dissertation" -> "0044 Thesis", + "other" -> "0038 Other literature type", + "peer-review" -> "0015 Review", + "proceedings" -> "0004 Conference object", "proceedings-article" -> "0004 Conference 
object", - "reference-entry" -> "0013 Part of book or chapter of book", - "report" -> "0017 Report", - "report-series" -> "0017 Report", - "standard" -> "0038 Other literature type", - "standard-series" -> "0038 Other literature type", - "dataset" -> "0021 Dataset", - "preprint" -> "0016 Preprint", - "report" -> "0017 Report" + "reference-entry" -> "0013 Part of book or chapter of book", + "report" -> "0017 Report", + "report-series" -> "0017 Report", + "standard" -> "0038 Other literature type", + "standard-series" -> "0038 Other literature type", + "dataset" -> "0021 Dataset", + "preprint" -> "0016 Preprint", + "report" -> "0017 Report" ) def mappingResult(result: Result, json: JValue, cobjCategory: String): Result = { @@ -100,7 +104,6 @@ case object Crossref2Oaf { val originalIds = new util.ArrayList(tmp.filter(id => id != null).asJava) result.setOriginalId(originalIds) - // Add DataInfo result.setDataInfo(generateDataInfo()) @@ -111,98 +114,169 @@ case object Crossref2Oaf { // Publisher ( Name of work's publisher mapped into Result/Publisher) val publisher = (json \ "publisher").extractOrElse[String](null) - if (publisher!= null && publisher.nonEmpty) + if (publisher != null && publisher.nonEmpty) result.setPublisher(asField(publisher)) - // TITLE - val mainTitles = for {JString(title) <- json \ "title" if title.nonEmpty} yield createSP(title, "main title", ModelConstants.DNET_DATACITE_TITLE) - val originalTitles = for {JString(title) <- json \ "original-title" if title.nonEmpty} yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE) - val shortTitles = for {JString(title) <- json \ "short-title" if title.nonEmpty} yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE) - val subtitles = for {JString(title) <- json \ "subtitle" if title.nonEmpty} yield createSP(title, "subtitle", ModelConstants.DNET_DATACITE_TITLE) + val mainTitles = + for { JString(title) <- json \ "title" if title.nonEmpty } yield createSP( + title, + "main title", + ModelConstants.DNET_DATACITE_TITLE + ) + val originalTitles = for { + JString(title) <- json \ "original-title" if title.nonEmpty + } yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE) + val shortTitles = for { + JString(title) <- json \ "short-title" if title.nonEmpty + } yield createSP(title, "alternative title", ModelConstants.DNET_DATACITE_TITLE) + val subtitles = + for { JString(title) <- json \ "subtitle" if title.nonEmpty } yield createSP( + title, + "subtitle", + ModelConstants.DNET_DATACITE_TITLE + ) result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava) // DESCRIPTION - val descriptionList = for {JString(description) <- json \ "abstract"} yield asField(description) + val descriptionList = + for { JString(description) <- json \ "abstract" } yield asField(description) result.setDescription(descriptionList.asJava) // Source - val sourceList = for {JString(source) <- json \ "source" if source!= null && source.nonEmpty} yield asField(source) + val sourceList = for { + JString(source) <- json \ "source" if source != null && source.nonEmpty + } yield asField(source) result.setSource(sourceList.asJava) //RELEVANT DATE Mapping - val createdDate = generateDate((json \ "created" \ "date-time").extract[String], (json \ "created" \ "date-parts").extract[List[List[Int]]], "created", ModelConstants.DNET_DATACITE_DATE) - val postedDate = generateDate((json \ "posted" \ "date-time").extractOrElse[String](null), (json \ "posted" \ 
"date-parts").extract[List[List[Int]]], "available", ModelConstants.DNET_DATACITE_DATE) - val acceptedDate = generateDate((json \ "accepted" \ "date-time").extractOrElse[String](null), (json \ "accepted" \ "date-parts").extract[List[List[Int]]], "accepted", ModelConstants.DNET_DATACITE_DATE) - val publishedPrintDate = generateDate((json \ "published-print" \ "date-time").extractOrElse[String](null), (json \ "published-print" \ "date-parts").extract[List[List[Int]]], "published-print", ModelConstants.DNET_DATACITE_DATE) - val publishedOnlineDate = generateDate((json \ "published-online" \ "date-time").extractOrElse[String](null), (json \ "published-online" \ "date-parts").extract[List[List[Int]]], "published-online", ModelConstants.DNET_DATACITE_DATE) + val createdDate = generateDate( + (json \ "created" \ "date-time").extract[String], + (json \ "created" \ "date-parts").extract[List[List[Int]]], + "created", + ModelConstants.DNET_DATACITE_DATE + ) + val postedDate = generateDate( + (json \ "posted" \ "date-time").extractOrElse[String](null), + (json \ "posted" \ "date-parts").extract[List[List[Int]]], + "available", + ModelConstants.DNET_DATACITE_DATE + ) + val acceptedDate = generateDate( + (json \ "accepted" \ "date-time").extractOrElse[String](null), + (json \ "accepted" \ "date-parts").extract[List[List[Int]]], + "accepted", + ModelConstants.DNET_DATACITE_DATE + ) + val publishedPrintDate = generateDate( + (json \ "published-print" \ "date-time").extractOrElse[String](null), + (json \ "published-print" \ "date-parts").extract[List[List[Int]]], + "published-print", + ModelConstants.DNET_DATACITE_DATE + ) + val publishedOnlineDate = generateDate( + (json \ "published-online" \ "date-time").extractOrElse[String](null), + (json \ "published-online" \ "date-parts").extract[List[List[Int]]], + "published-online", + ModelConstants.DNET_DATACITE_DATE + ) - val issuedDate = extractDate((json \ "issued" \ "date-time").extractOrElse[String](null), (json \ "issued" \ "date-parts").extract[List[List[Int]]]) + val issuedDate = extractDate( + (json \ "issued" \ "date-time").extractOrElse[String](null), + (json \ "issued" \ "date-parts").extract[List[List[Int]]] + ) if (StringUtils.isNotBlank(issuedDate)) { result.setDateofacceptance(asField(issuedDate)) - } - else { + } else { result.setDateofacceptance(asField(createdDate.getValue)) } - result.setRelevantdate(List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate).filter(p => p != null).asJava) + result.setRelevantdate( + List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate) + .filter(p => p != null) + .asJava + ) //Mapping Subject - val subjectList:List[String] = (json \ "subject").extractOrElse[List[String]](List()) + val subjectList: List[String] = (json \ "subject").extractOrElse[List[String]](List()) if (subjectList.nonEmpty) { - result.setSubject(subjectList.map(s=> createSP(s, "keywords", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava) + result.setSubject( + subjectList.map(s => createSP(s, "keywords", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava + ) } - - //Mapping Author - val authorList: List[mappingAuthor] = (json \ "author").extractOrElse[List[mappingAuthor]](List()) + val authorList: List[mappingAuthor] = + (json \ "author").extractOrElse[List[mappingAuthor]](List()) + val sorted_list = authorList.sortWith((a: mappingAuthor, b: mappingAuthor) => + a.sequence.isDefined && a.sequence.get.equalsIgnoreCase("first") + ) - - val sorted_list = authorList.sortWith((a:mappingAuthor, 
b:mappingAuthor) => a.sequence.isDefined && a.sequence.get.equalsIgnoreCase("first")) - - result.setAuthor(sorted_list.zipWithIndex.map{case (a, index) => generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull, index)}.asJava) + result.setAuthor(sorted_list.zipWithIndex.map { case (a, index) => + generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull, index) + }.asJava) // Mapping instance val instance = new Instance() val license = for { - JObject(license) <- json \ "license" - JField("URL", JString(lic)) <- license + JObject(license) <- json \ "license" + JField("URL", JString(lic)) <- license JField("content-version", JString(content_version)) <- license } yield (asField(lic), content_version) val l = license.filter(d => StringUtils.isNotBlank(d._1.getValue)) - if (l.nonEmpty){ - if (l exists (d => d._2.equals("vor"))){ - for(d <- l){ - if (d._2.equals("vor")){ + if (l.nonEmpty) { + if (l exists (d => d._2.equals("vor"))) { + for (d <- l) { + if (d._2.equals("vor")) { instance.setLicense(d._1) } } + } else { + instance.setLicense(l.head._1) } - else{ - instance.setLicense(l.head._1)} } // Ticket #6281 added pid to Instance instance.setPid(result.getPid) - val has_review = json \ "relation" \"has-review" \ "id" + val has_review = json \ "relation" \ "has-review" \ "id" - if(has_review != JNothing) { + if (has_review != JNothing) { instance.setRefereed( - OafMapperUtils.qualifier("0001", "peerReviewed", ModelConstants.DNET_REVIEW_LEVELS, ModelConstants.DNET_REVIEW_LEVELS)) + OafMapperUtils.qualifier( + "0001", + "peerReviewed", + ModelConstants.DNET_REVIEW_LEVELS, + ModelConstants.DNET_REVIEW_LEVELS + ) + ) } - instance.setAccessright(decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue)) - instance.setInstancetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) - result.setResourcetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) + instance.setAccessright( + decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue) + ) + instance.setInstancetype( + OafMapperUtils.qualifier( + cobjCategory.substring(0, 4), + cobjCategory.substring(5), + ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE + ) + ) + result.setResourcetype( + OafMapperUtils.qualifier( + cobjCategory.substring(0, 4), + cobjCategory.substring(5), + ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE + ) + ) instance.setCollectedfrom(createCrossrefCollectedFrom()) if (StringUtils.isNotBlank(issuedDate)) { instance.setDateofacceptance(asField(issuedDate)) - } - else { + } else { instance.setDateofacceptance(asField(createdDate.getValue)) } val s: List[String] = List("https://doi.org/" + doi) @@ -210,10 +284,9 @@ case object Crossref2Oaf { // if (links.nonEmpty) { // instance.setUrl(links.asJava) // } - if(s.nonEmpty) - { - instance.setUrl(s.asJava) - } + if (s.nonEmpty) { + instance.setUrl(s.asJava) + } result.setInstance(List(instance).asJava) @@ -236,15 +309,23 @@ case object Crossref2Oaf { result } - - def generateAuhtor(given: String, family: String, orcid: String, index:Int): Author = { + def generateAuhtor(given: String, family: String, orcid: String, index: Int): Author = { val a = new Author a.setName(given) a.setSurname(family) a.setFullname(s"$given $family") - 
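
One caveat worth recording about the sorted_list above: the predicate handed to sortWith ignores its second argument, so it is not a strict ordering, and with more than one author marked sequence == "first" the sort contract is violated and the resulting order is unspecified. A stable equivalent, assuming authorList as bound above, would be:

// Stable alternative to the sortWith above: key authors on whether their
// sequence field says "first", then rely on sortBy's stability.
val sorted_list = authorList.sortBy(a =>
  if (a.sequence.exists(_.equalsIgnoreCase("first"))) 0 else 1
)
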
a.setRank(index+1) + a.setRank(index + 1) if (StringUtils.isNotBlank(orcid)) - a.setPid(List(createSP(orcid, ModelConstants.ORCID_PENDING, ModelConstants.DNET_PID_TYPES, generateDataInfo())).asJava) + a.setPid( + List( + createSP( + orcid, + ModelConstants.ORCID_PENDING, + ModelConstants.DNET_PID_TYPES, + generateDataInfo() + ) + ).asJava + ) a } @@ -255,54 +336,62 @@ case object Crossref2Oaf { var resultList: List[Oaf] = List() - val objectType = (json \ "type").extractOrElse[String](null) val objectSubType = (json \ "subtype").extractOrElse[String](null) if (objectType == null) return resultList - val result = generateItemFromType(objectType, objectSubType) if (result == null) return List() - val cOBJCategory = mappingCrossrefSubType.getOrElse(objectType, mappingCrossrefSubType.getOrElse(objectSubType, "0038 Other literature type")) + val cOBJCategory = mappingCrossrefSubType.getOrElse( + objectType, + mappingCrossrefSubType.getOrElse(objectSubType, "0038 Other literature type") + ) mappingResult(result, json, cOBJCategory) if (result == null || result.getId == null) return List() - - val funderList: List[mappingFunder] = (json \ "funder").extractOrElse[List[mappingFunder]](List()) + val funderList: List[mappingFunder] = + (json \ "funder").extractOrElse[List[mappingFunder]](List()) if (funderList.nonEmpty) { - resultList = resultList ::: mappingFunderToRelations(funderList, result.getId, createCrossrefCollectedFrom(), result.getDataInfo, result.getLastupdatetimestamp) + resultList = resultList ::: mappingFunderToRelations( + funderList, + result.getId, + createCrossrefCollectedFrom(), + result.getDataInfo, + result.getLastupdatetimestamp + ) } - result match { case publication: Publication => convertPublication(publication, json, cOBJCategory) - case dataset: Dataset => convertDataset(dataset) + case dataset: Dataset => convertDataset(dataset) } resultList = resultList ::: List(result) resultList } - - def mappingFunderToRelations(funders: List[mappingFunder], sourceId: String, cf: KeyValue, di: DataInfo, ts: Long): List[Relation] = { + def mappingFunderToRelations( + funders: List[mappingFunder], + sourceId: String, + cf: KeyValue, + di: DataInfo, + ts: Long + ): List[Relation] = { val queue = new mutable.Queue[Relation] - - def snsfRule(award:String): String = { - val tmp1 = StringUtils.substringAfter(award,"_") - val tmp2 = StringUtils.substringBefore(tmp1,"/") + def snsfRule(award: String): String = { + val tmp1 = StringUtils.substringAfter(award, "_") + val tmp2 = StringUtils.substringBefore(tmp1, "/") logger.debug(s"From $award to $tmp2") tmp2 - } - def extractECAward(award: String): String = { val awardECRegex: Regex = "[0-9]{4,9}".r if (awardECRegex.findAllIn(award).hasNext) @@ -310,8 +399,7 @@ case object Crossref2Oaf { null } - - def generateRelation(sourceId:String, targetId:String, relClass:String) :Relation = { + def generateRelation(sourceId: String, targetId: String, relClass: String): Relation = { val r = new Relation r.setSource(sourceId) @@ -324,98 +412,119 @@ case object Crossref2Oaf { r.setLastupdatetimestamp(ts) r - } - - def generateSimpleRelationFromAward(funder: mappingFunder, nsPrefix: String, extractField: String => String): Unit = { + def generateSimpleRelationFromAward( + funder: mappingFunder, + nsPrefix: String, + extractField: String => String + ): Unit = { if (funder.award.isDefined && funder.award.get.nonEmpty) - funder.award.get.map(extractField).filter(a => a!= null && a.nonEmpty).foreach( - award => { + funder.award.get + .map(extractField) + 
.filter(a => a != null && a.nonEmpty) + .foreach(award => { val targetId = getProjectId(nsPrefix, DHPUtils.md5(award)) - queue += generateRelation(sourceId, targetId , ModelConstants.IS_PRODUCED_BY) - queue += generateRelation(targetId , sourceId, ModelConstants.PRODUCES) - } - ) + queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) + queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) + }) } - def getProjectId (nsPrefix:String, targetId:String):String = { + def getProjectId(nsPrefix: String, targetId: String): String = { s"40|$nsPrefix::$targetId" } - if (funders != null) - funders.foreach(funder => { - if (funder.DOI.isDefined && funder.DOI.get.nonEmpty) { - funder.DOI.get match { - case "10.13039/100010663" | - "10.13039/100010661" | - "10.13039/501100007601" | - "10.13039/501100000780" | - "10.13039/100010665" => generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward) - case "10.13039/100011199" | - "10.13039/100004431" | - "10.13039/501100004963" | - "10.13039/501100000780" => generateSimpleRelationFromAward(funder, "corda_______", extractECAward) - case "10.13039/501100000781" => generateSimpleRelationFromAward(funder, "corda_______", extractECAward) - generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward) - case "10.13039/100000001" => generateSimpleRelationFromAward(funder, "nsf_________", a => a) - case "10.13039/501100001665" => generateSimpleRelationFromAward(funder, "anr_________", a => a) - case "10.13039/501100002341" => generateSimpleRelationFromAward(funder, "aka_________", a => a) - case "10.13039/501100001602" => generateSimpleRelationFromAward(funder, "aka_________", a => a.replace("SFI", "")) - case "10.13039/501100000923" => generateSimpleRelationFromAward(funder, "arc_________", a => a) - case "10.13039/501100000038"=> val targetId = getProjectId("nserc_______" , "1e5e62235d094afd01cd56e65112fc63") - queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) - queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) - case "10.13039/501100000155"=> val targetId = getProjectId("sshrc_______" , "1e5e62235d094afd01cd56e65112fc63") - queue += generateRelation(sourceId,targetId, ModelConstants.IS_PRODUCED_BY) - queue += generateRelation(targetId,sourceId, ModelConstants.PRODUCES) - case "10.13039/501100000024"=> val targetId = getProjectId("cihr________" , "1e5e62235d094afd01cd56e65112fc63") - queue += generateRelation(sourceId,targetId, ModelConstants.IS_PRODUCED_BY) - queue += generateRelation(targetId,sourceId, ModelConstants.PRODUCES) - case "10.13039/501100002848" => generateSimpleRelationFromAward(funder, "conicytf____", a => a) - case "10.13039/501100003448" => generateSimpleRelationFromAward(funder, "gsrt________", extractECAward) - case "10.13039/501100010198" => generateSimpleRelationFromAward(funder, "sgov________", a=>a) - case "10.13039/501100004564" => generateSimpleRelationFromAward(funder, "mestd_______", extractECAward) - case "10.13039/501100003407" => generateSimpleRelationFromAward(funder, "miur________", a=>a) - val targetId = getProjectId("miur________" , "1e5e62235d094afd01cd56e65112fc63") - queue += generateRelation(sourceId,targetId, ModelConstants.IS_PRODUCED_BY) - queue += generateRelation(targetId,sourceId, ModelConstants.PRODUCES) - case "10.13039/501100006588" | - "10.13039/501100004488" => generateSimpleRelationFromAward(funder, "irb_hr______", a=>a.replaceAll("Project No.", "").replaceAll("HRZZ-","") ) - case "10.13039/501100006769"=> 
generateSimpleRelationFromAward(funder, "rsf_________", a=>a) - case "10.13039/501100001711"=> generateSimpleRelationFromAward(funder, "snsf________", snsfRule) - case "10.13039/501100004410"=> generateSimpleRelationFromAward(funder, "tubitakf____", a =>a) - case "10.10.13039/100004440"=> generateSimpleRelationFromAward(funder, "wt__________", a =>a) - case "10.13039/100004440"=> val targetId = getProjectId("wt__________" , "1e5e62235d094afd01cd56e65112fc63") - queue += generateRelation(sourceId,targetId, ModelConstants.IS_PRODUCED_BY) - queue += generateRelation(targetId,sourceId, ModelConstants.PRODUCES) + funders.foreach(funder => { + if (funder.DOI.isDefined && funder.DOI.get.nonEmpty) { + funder.DOI.get match { + case "10.13039/100010663" | "10.13039/100010661" | "10.13039/501100007601" | "10.13039/501100000780" | + "10.13039/100010665" => + generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward) + case "10.13039/100011199" | "10.13039/100004431" | "10.13039/501100004963" | "10.13039/501100000780" => + generateSimpleRelationFromAward(funder, "corda_______", extractECAward) + case "10.13039/501100000781" => + generateSimpleRelationFromAward(funder, "corda_______", extractECAward) + generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward) + case "10.13039/100000001" => + generateSimpleRelationFromAward(funder, "nsf_________", a => a) + case "10.13039/501100001665" => + generateSimpleRelationFromAward(funder, "anr_________", a => a) + case "10.13039/501100002341" => + generateSimpleRelationFromAward(funder, "aka_________", a => a) + case "10.13039/501100001602" => + generateSimpleRelationFromAward(funder, "aka_________", a => a.replace("SFI", "")) + case "10.13039/501100000923" => + generateSimpleRelationFromAward(funder, "arc_________", a => a) + case "10.13039/501100000038" => + val targetId = getProjectId("nserc_______", "1e5e62235d094afd01cd56e65112fc63") + queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) + queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) + case "10.13039/501100000155" => + val targetId = getProjectId("sshrc_______", "1e5e62235d094afd01cd56e65112fc63") + queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) + queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) + case "10.13039/501100000024" => + val targetId = getProjectId("cihr________", "1e5e62235d094afd01cd56e65112fc63") + queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) + queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) + case "10.13039/501100002848" => + generateSimpleRelationFromAward(funder, "conicytf____", a => a) + case "10.13039/501100003448" => + generateSimpleRelationFromAward(funder, "gsrt________", extractECAward) + case "10.13039/501100010198" => + generateSimpleRelationFromAward(funder, "sgov________", a => a) + case "10.13039/501100004564" => + generateSimpleRelationFromAward(funder, "mestd_______", extractECAward) + case "10.13039/501100003407" => + generateSimpleRelationFromAward(funder, "miur________", a => a) + val targetId = getProjectId("miur________", "1e5e62235d094afd01cd56e65112fc63") + queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) + queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) + case "10.13039/501100006588" | "10.13039/501100004488" => + generateSimpleRelationFromAward( + funder, + "irb_hr______", + a => a.replaceAll("Project No.", "").replaceAll("HRZZ-", "") + ) + 
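
To make the dispatch in this match concrete: every generateSimpleRelationFromAward case reduces to extracting an award code, hashing it into a project id under the funder's namespace prefix, and queueing a produced-by/produces pair. A runnable sketch, with md5 assumed to behave like DHPUtils.md5 (lowercase hex digest):

import java.security.MessageDigest

// Sketch of the award-to-project-relation pipeline used by the cases above.
object AwardRelationSketch {

  // Assumed equivalent of DHPUtils.md5: lowercase hex MD5 of the input.
  def md5(s: String): String =
    MessageDigest.getInstance("MD5").digest(s.getBytes("UTF-8")).map("%02x".format(_)).mkString

  // As in extractECAward above: the first 4-9 digit run is the grant number.
  def extractECAward(award: String): Option[String] = "[0-9]{4,9}".r.findFirstIn(award)

  def getProjectId(nsPrefix: String, targetId: String): String = s"40|$nsPrefix::$targetId"

  def main(args: Array[String]): Unit = {
    extractECAward("Grant Agreement 633172").foreach { code =>
      val projectId = getProjectId("corda__h2020", md5(code))
      // The real code enqueues two mirror relations per award:
      //   result --isProducedBy--> project and project --produces--> result
      println(projectId) // 40|corda__h2020::<md5("633172")>
    }
  }
}
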
case "10.13039/501100006769" => + generateSimpleRelationFromAward(funder, "rsf_________", a => a) + case "10.13039/501100001711" => + generateSimpleRelationFromAward(funder, "snsf________", snsfRule) + case "10.13039/501100004410" => + generateSimpleRelationFromAward(funder, "tubitakf____", a => a) + case "10.10.13039/100004440" => + generateSimpleRelationFromAward(funder, "wt__________", a => a) + case "10.13039/100004440" => + val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63") + queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) + queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) - case _ => logger.debug("no match for "+funder.DOI.get ) + case _ => logger.debug("no match for " + funder.DOI.get) + } + } else { + funder.name match { + case "European Union’s Horizon 2020 research and innovation program" => + generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward) + case "European Union's" => + generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward) + generateSimpleRelationFromAward(funder, "corda_______", extractECAward) + case "The French National Research Agency (ANR)" | "The French National Research Agency" => + generateSimpleRelationFromAward(funder, "anr_________", a => a) + case "CONICYT, Programa de Formación de Capital Humano Avanzado" => + generateSimpleRelationFromAward(funder, "conicytf____", extractECAward) + case "Wellcome Trust Masters Fellowship" => + val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63") + queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) + queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) + case _ => logger.debug("no match for " + funder.name) + + } } - - } else { - funder.name match { - case "European Union’s Horizon 2020 research and innovation program" => generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward) - case "European Union's" => - generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward) - generateSimpleRelationFromAward(funder, "corda_______", extractECAward) - case "The French National Research Agency (ANR)" | - "The French National Research Agency" => generateSimpleRelationFromAward(funder, "anr_________", a => a) - case "CONICYT, Programa de Formación de Capital Humano Avanzado" => generateSimpleRelationFromAward(funder, "conicytf____", extractECAward) - case "Wellcome Trust Masters Fellowship" => val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63") - queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY ) - queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES ) - case _ => logger.debug("no match for "+funder.name ) - - } - } - - } - ) + }) queue.toList } @@ -423,33 +532,31 @@ case object Crossref2Oaf { // TODO check if there are other info to map into the Dataset } - def convertPublication(publication: Publication, json: JValue, cobjCategory: String): Unit = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats - val containerTitles = for {JString(ct) <- json \ "container-title"} yield ct - + val containerTitles = for { JString(ct) <- json \ "container-title" } yield ct //Mapping book if (cobjCategory.toLowerCase.contains("book")) { - val ISBN = for {JString(isbn) <- json \ "ISBN"} yield isbn + val ISBN = for { JString(isbn) <- json \ "ISBN" } yield isbn if (ISBN.nonEmpty && containerTitles.nonEmpty) { val source = s"${containerTitles.head} 
ISBN: ${ISBN.head}" if (publication.getSource != null) { val l: List[Field[String]] = publication.getSource.asScala.toList val ll: List[Field[String]] = l ::: List(asField(source)) publication.setSource(ll.asJava) - } - else + } else publication.setSource(List(asField(source)).asJava) } } else { // Mapping Journal - val issnInfos = for {JArray(issn_types) <- json \ "issn-type" - JObject(issn_type) <- issn_types - JField("type", JString(tp)) <- issn_type - JField("value", JString(vl)) <- issn_type - } yield Tuple2(tp, vl) + val issnInfos = for { + JArray(issn_types) <- json \ "issn-type" + JObject(issn_type) <- issn_types + JField("type", JString(tp)) <- issn_type + JField("value", JString(vl)) <- issn_type + } yield Tuple2(tp, vl) val volume = (json \ "volume").extractOrElse[String](null) if (containerTitles.nonEmpty) { @@ -460,7 +567,7 @@ case object Crossref2Oaf { issnInfos.foreach(tp => { tp._1 match { case "electronic" => journal.setIssnOnline(tp._2) - case "print" => journal.setIssnPrinted(tp._2) + case "print" => journal.setIssnPrinted(tp._2) } }) } @@ -494,7 +601,12 @@ case object Crossref2Oaf { } - def generateDate(dt: String, datePart: List[List[Int]], classId: String, schemeId: String): StructuredProperty = { + def generateDate( + dt: String, + datePart: List[List[Int]], + classId: String, + schemeId: String + ): StructuredProperty = { val dp = extractDate(dt, datePart) if (StringUtils.isNotBlank(dp)) return createSP(dp, classId, schemeId) diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala index 6a1c701af..c6e4706d7 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala @@ -16,7 +16,6 @@ object CrossrefDataset { val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass) - def to_item(input: String): CrossrefDT = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats @@ -29,19 +28,24 @@ object CrossrefDataset { def main(args: Array[String]): Unit = { - val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + CrossrefDataset.getClass.getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json" + ) + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(SparkMapDumpIntoOAF.getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() + .master(parser.get("master")) + .getOrCreate() import spark.implicits._ - val crossrefAggregator = new Aggregator[CrossrefDT, CrossrefDT, CrossrefDT] with Serializable { override def zero: CrossrefDT = null @@ -52,7 +56,6 @@ object CrossrefDataset { if (a == null) return b - if (a.timestamp > b.timestamp) { return a } @@ -80,19 +83,24 @@ object CrossrefDataset { val workingPath: String = parser.get("workingPath") - val main_ds: Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT] - val update = - spark.createDataset(spark.sparkContext.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text]) - .map(i => CrossrefImporter.decompressBlob(i._2.toString)) - .map(i => to_item(i))) + 
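
The Aggregator above implements last-write-wins by timestamp over records grouped by DOI. The same semantics can be sketched in plain Scala (CrossrefDT redeclared locally so the snippet stands alone):

// Plain-Scala equivalent of the keep-newest-per-DOI aggregation above.
object KeepNewestSketch {

  case class CrossrefDT(doi: String, json: String, timestamp: Long)

  def keepNewest(records: Seq[CrossrefDT]): Map[String, CrossrefDT] =
    records.groupBy(_.doi).map { case (doi, group) => doi -> group.maxBy(_.timestamp) }

  def main(args: Array[String]): Unit = {
    val merged = keepNewest(
      Seq(
        CrossrefDT("10.1/x", "{\"snapshot\":\"old\"}", 1546300800L),
        CrossrefDT("10.1/x", "{\"snapshot\":\"new\"}", 1609459200L)
      )
    )
    println(merged("10.1/x").json) // the newer snapshot wins
  }
}

On a typed Dataset the same reduction is expressible as groupByKey(_.doi).reduceGroups((a, b) => if (a.timestamp > b.timestamp) a else b), which is what the Aggregator amounts to.
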
spark.createDataset( + spark.sparkContext + .sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text]) + .map(i => CrossrefImporter.decompressBlob(i._2.toString)) + .map(i => to_item(i)) + ) - main_ds.union(update).groupByKey(_.doi) + main_ds + .union(update) + .groupByKey(_.doi) .agg(crossrefAggregator.toColumn) .map(s => s._2) - .write.mode(SaveMode.Overwrite).save(s"$workingPath/crossref_ds_updated") + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/crossref_ds_updated") } diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala index 6d03abc25..df185910e 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala @@ -18,7 +18,6 @@ object GenerateCrossrefDataset { implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT] - def crossrefElement(meta: String): CrossrefDT = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(meta) @@ -30,13 +29,23 @@ object GenerateCrossrefDataset { def main(args: Array[String]): Unit = { val conf = new SparkConf - val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json")).mkString) + val parser = new ArgumentApplicationParser( + Source + .fromInputStream( + getClass.getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json" + ) + ) + .mkString + ) parser.parseArgument(args) val master = parser.get("master") val sourcePath = parser.get("sourcePath") val targetPath = parser.get("targetPath") - val spark: SparkSession = SparkSession.builder().config(conf) + val spark: SparkSession = SparkSession + .builder() + .config(conf) .appName(UnpackCrtossrefEntries.getClass.getSimpleName) .master(master) .getOrCreate() @@ -44,12 +53,14 @@ object GenerateCrossrefDataset { import spark.implicits._ - val tmp: RDD[String] = sc.textFile(sourcePath, 6000) - spark.createDataset(tmp) + spark + .createDataset(tmp) .map(entry => crossrefElement(entry)) - .write.mode(SaveMode.Overwrite).save(targetPath) + .write + .mode(SaveMode.Overwrite) + .save(targetPath) // .map(meta => crossrefElement(meta)) // .toDS.as[CrossrefDT] // .write.mode(SaveMode.Overwrite).save(targetPath) diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala index fa55b9fb9..96923f000 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala @@ -8,7 +8,6 @@ import org.apache.spark.SparkConf import org.apache.spark.sql._ import org.slf4j.{Logger, LoggerFactory} - case class Reference(author: String, firstPage: String) {} object SparkMapDumpIntoOAF { @@ -19,14 +18,21 @@ object SparkMapDumpIntoOAF { val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass) val conf: SparkConf = new SparkConf() - val parser = new 
ArgumentApplicationParser(IOUtils.toString(SparkMapDumpIntoOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_crossref_dump_to_oaf_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + SparkMapDumpIntoOAF.getClass.getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/convert_crossref_dump_to_oaf_params.json" + ) + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(SparkMapDumpIntoOAF.getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() + .master(parser.get("master")) + .getOrCreate() implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication] @@ -35,19 +41,34 @@ object SparkMapDumpIntoOAF { val targetPath = parser.get("targetPath") - spark.read.load(parser.get("sourcePath")).as[CrossrefDT] + spark.read + .load(parser.get("sourcePath")) + .as[CrossrefDT] .flatMap(k => Crossref2Oaf.convert(k.json)) .filter(o => o != null) - .write.mode(SaveMode.Overwrite).save(s"$targetPath/mixObject") + .write + .mode(SaveMode.Overwrite) + .save(s"$targetPath/mixObject") - val ds:Dataset[Oaf] = spark.read.load(s"$targetPath/mixObject").as[Oaf] + val ds: Dataset[Oaf] = spark.read.load(s"$targetPath/mixObject").as[Oaf] - ds.filter(o => o.isInstanceOf[Publication]).map(o => o.asInstanceOf[Publication]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefPublication") + ds.filter(o => o.isInstanceOf[Publication]) + .map(o => o.asInstanceOf[Publication]) + .write + .mode(SaveMode.Overwrite) + .save(s"$targetPath/crossrefPublication") - ds.filter(o => o.isInstanceOf[Relation]).map(o => o.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefRelation") + ds.filter(o => o.isInstanceOf[Relation]) + .map(o => o.asInstanceOf[Relation]) + .write + .mode(SaveMode.Overwrite) + .save(s"$targetPath/crossrefRelation") - ds.filter(o => o.isInstanceOf[OafDataset]).map(o => o.asInstanceOf[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefDataset") + ds.filter(o => o.isInstanceOf[OafDataset]) + .map(o => o.asInstanceOf[OafDataset]) + .write + .mode(SaveMode.Overwrite) + .save(s"$targetPath/crossrefDataset") } - } diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala index 191c4587e..3fea9695c 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala @@ -16,7 +16,6 @@ object UnpackCrtossrefEntries { val log: Logger = LoggerFactory.getLogger(UnpackCrtossrefEntries.getClass) - def extractDump(input: String): List[String] = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) @@ -24,28 +23,36 @@ object UnpackCrtossrefEntries { val a = (json \ "items").extract[JArray] a.arr.map(s => compact(render(s))) - } - def main(args: Array[String]): Unit = { val conf = new SparkConf - val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json")).mkString) + val parser = new ArgumentApplicationParser( + Source + .fromInputStream( + getClass.getResourceAsStream( + 
"/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json" + ) + ) + .mkString + ) parser.parseArgument(args) val master = parser.get("master") val sourcePath = parser.get("sourcePath") val targetPath = parser.get("targetPath") - val spark: SparkSession = SparkSession.builder().config(conf) + val spark: SparkSession = SparkSession + .builder() + .config(conf) .appName(UnpackCrtossrefEntries.getClass.getSimpleName) .master(master) .getOrCreate() val sc: SparkContext = spark.sparkContext - sc.wholeTextFiles(sourcePath, 6000).flatMap(d => extractDump(d._2)) + sc.wholeTextFiles(sourcePath, 6000) + .flatMap(d => extractDump(d._2)) .saveAsTextFile(targetPath, classOf[GzipCodec]) - } } diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala index 0a6fa00f0..18ba864ce 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala @@ -1,6 +1,5 @@ package eu.dnetlib.doiboost.mag - import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty} @@ -14,59 +13,134 @@ import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.matching.Regex - -case class MagPapers(PaperId: Long, Rank: Integer, Doi: String, - DocType: String, PaperTitle: String, OriginalTitle: String, - BookTitle: String, Year: Option[Integer], Date: Option[java.sql.Timestamp], Publisher: String, - JournalId: Option[Long], ConferenceSeriesId: Option[Long], ConferenceInstanceId: Option[Long], - Volume: String, Issue: String, FirstPage: String, LastPage: String, - ReferenceCount: Option[Long], CitationCount: Option[Long], EstimatedCitation: Option[Long], - OriginalVenue: String, FamilyId: Option[Long], CreatedDate: java.sql.Timestamp) {} - +case class MagPapers( + PaperId: Long, + Rank: Integer, + Doi: String, + DocType: String, + PaperTitle: String, + OriginalTitle: String, + BookTitle: String, + Year: Option[Integer], + Date: Option[java.sql.Timestamp], + Publisher: String, + JournalId: Option[Long], + ConferenceSeriesId: Option[Long], + ConferenceInstanceId: Option[Long], + Volume: String, + Issue: String, + FirstPage: String, + LastPage: String, + ReferenceCount: Option[Long], + CitationCount: Option[Long], + EstimatedCitation: Option[Long], + OriginalVenue: String, + FamilyId: Option[Long], + CreatedDate: java.sql.Timestamp +) {} case class MagPaperAbstract(PaperId: Long, IndexedAbstract: String) {} -case class MagAuthor(AuthorId: Long, Rank: Option[Int], NormalizedName: Option[String], DisplayName: Option[String], LastKnownAffiliationId: Option[Long], PaperCount: Option[Long], CitationCount: Option[Long], CreatedDate: Option[java.sql.Timestamp]) {} +case class MagAuthor( + AuthorId: Long, + Rank: Option[Int], + NormalizedName: Option[String], + DisplayName: Option[String], + LastKnownAffiliationId: Option[Long], + PaperCount: Option[Long], + CitationCount: Option[Long], + CreatedDate: Option[java.sql.Timestamp] +) {} -case class MagAffiliation(AffiliationId: Long, Rank: Int, NormalizedName: String, DisplayName: String, GridId: String, OfficialPage: String, WikiPage: String, PaperCount: Long, CitationCount: Long, Latitude: Option[Float], Longitude: Option[Float], CreatedDate: java.sql.Timestamp) {} +case 
class MagAffiliation( + AffiliationId: Long, + Rank: Int, + NormalizedName: String, + DisplayName: String, + GridId: String, + OfficialPage: String, + WikiPage: String, + PaperCount: Long, + CitationCount: Long, + Latitude: Option[Float], + Longitude: Option[Float], + CreatedDate: java.sql.Timestamp +) {} -case class MagPaperAuthorAffiliation(PaperId: Long, AuthorId: Long, AffiliationId: Option[Long], AuthorSequenceNumber: Int, OriginalAuthor: String, OriginalAffiliation: String) {} +case class MagPaperAuthorAffiliation( + PaperId: Long, + AuthorId: Long, + AffiliationId: Option[Long], + AuthorSequenceNumber: Int, + OriginalAuthor: String, + OriginalAffiliation: String +) {} - -case class MagAuthorAffiliation(author: MagAuthor, affiliation:String, sequenceNumber:Int) +case class MagAuthorAffiliation(author: MagAuthor, affiliation: String, sequenceNumber: Int) case class MagPaperWithAuthorList(PaperId: Long, authors: List[MagAuthorAffiliation]) {} -case class MagPaperAuthorDenormalized(PaperId: Long, author: MagAuthor, affiliation:String, sequenceNumber:Int) {} +case class MagPaperAuthorDenormalized( + PaperId: Long, + author: MagAuthor, + affiliation: String, + sequenceNumber: Int +) {} -case class MagPaperUrl(PaperId: Long, SourceType: Option[Int], SourceUrl: Option[String], LanguageCode: Option[String]) {} +case class MagPaperUrl( + PaperId: Long, + SourceType: Option[Int], + SourceUrl: Option[String], + LanguageCode: Option[String] +) {} -case class MagUrlInstance(SourceUrl:String){} +case class MagUrlInstance(SourceUrl: String) {} case class MagUrl(PaperId: Long, instances: List[MagUrlInstance]) -case class MagSubject(FieldOfStudyId:Long, DisplayName:String, MainType:Option[String], Score:Float){} +case class MagSubject( + FieldOfStudyId: Long, + DisplayName: String, + MainType: Option[String], + Score: Float +) {} -case class MagFieldOfStudy(PaperId:Long, subjects:List[MagSubject]) {} +case class MagFieldOfStudy(PaperId: Long, subjects: List[MagSubject]) {} -case class MagJournal(JournalId: Long, Rank: Option[Int], NormalizedName: Option[String], DisplayName: Option[String], Issn: Option[String], Publisher: Option[String], Webpage: Option[String], PaperCount: Option[Long], CitationCount: Option[Long], CreatedDate: Option[java.sql.Timestamp]) {} +case class MagJournal( + JournalId: Long, + Rank: Option[Int], + NormalizedName: Option[String], + DisplayName: Option[String], + Issn: Option[String], + Publisher: Option[String], + Webpage: Option[String], + PaperCount: Option[Long], + CitationCount: Option[Long], + CreatedDate: Option[java.sql.Timestamp] +) {} - -case class MagConferenceInstance(ci:Long, DisplayName:Option[String], Location:Option[String], StartDate:Option[java.sql.Timestamp], EndDate:Option[java.sql.Timestamp], PaperId:Long){} +case class MagConferenceInstance( + ci: Long, + DisplayName: Option[String], + Location: Option[String], + StartDate: Option[java.sql.Timestamp], + EndDate: Option[java.sql.Timestamp], + PaperId: Long +) {} case object ConversionUtil { - def extractMagIdentifier(pids:mutable.Buffer[String]) :String ={ + def extractMagIdentifier(pids: mutable.Buffer[String]): String = { val magIDRegex: Regex = "^[0-9]+$".r - val s =pids.filter(p=> magIDRegex.findAllIn(p).hasNext) + val s = pids.filter(p => magIDRegex.findAllIn(p).hasNext) if (s.nonEmpty) return s.head null } - - def mergePublication(a: Publication, b:Publication) : Publication = { + def mergePublication(a: Publication, b: Publication): Publication = { if ((a != null) && (b != null)) { a.mergeFrom(b) 
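/* mergeFrom enriches a in place with the fields of b; the updated a is returned as the merged record */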
a @@ -74,10 +148,9 @@ case object ConversionUtil { if (a == null) b else a } - } - def choiceLatestMagArtitcle(p1: MagPapers, p2:MagPapers) :MagPapers = { + def choiceLatestMagArtitcle(p1: MagPapers, p2: MagPapers): MagPapers = { var r = if (p1 == null) p2 else p1 if (p1 != null && p2 != null) { if (p1.CreatedDate != null && p2.CreatedDate != null) { @@ -93,8 +166,9 @@ case object ConversionUtil { } - - def updatePubsWithDescription(inputItem:((String, Publication), MagPaperAbstract)) : Publication = { + def updatePubsWithDescription( + inputItem: ((String, Publication), MagPaperAbstract) + ): Publication = { val pub = inputItem._1._2 val abst = inputItem._2 if (abst != null) { @@ -104,20 +178,22 @@ case object ConversionUtil { } + def updatePubsWithConferenceInfo( + inputItem: ((String, Publication), MagConferenceInstance) + ): Publication = { + val publication: Publication = inputItem._1._2 + val ci: MagConferenceInstance = inputItem._2 - def updatePubsWithConferenceInfo(inputItem:((String, Publication), MagConferenceInstance)) : Publication = { - val publication:Publication= inputItem._1._2 - val ci:MagConferenceInstance = inputItem._2 + if (ci != null) { - if (ci!= null){ - - val j:Journal = new Journal + val j: Journal = new Journal if (ci.Location.isDefined) j.setConferenceplace(ci.Location.get) j.setName(ci.DisplayName.get) - if (ci.StartDate.isDefined && ci.EndDate.isDefined) - { - j.setConferencedate(s"${ci.StartDate.get.toString.substring(0,10)} - ${ci.EndDate.get.toString.substring(0,10)}") + if (ci.StartDate.isDefined && ci.EndDate.isDefined) { + j.setConferencedate( + s"${ci.StartDate.get.toString.substring(0, 10)} - ${ci.EndDate.get.toString.substring(0, 10)}" + ) } publication.setJournal(j) @@ -125,7 +201,7 @@ case object ConversionUtil { publication } - def updatePubsWithSubject(item:((String, Publication), MagFieldOfStudy)) : Publication = { + def updatePubsWithSubject(item: ((String, Publication), MagFieldOfStudy)): Publication = { val publication = item._1._2 val fieldOfStudy = item._2 @@ -135,16 +211,34 @@ case object ConversionUtil { val classid = "MAG" val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap(s => { - val s1 = createSP(s.DisplayName, classid,className, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES) + val s1 = createSP( + s.DisplayName, + classid, + className, + ModelConstants.DNET_SUBJECT_TYPOLOGIES, + ModelConstants.DNET_SUBJECT_TYPOLOGIES + ) val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString) var resList: List[StructuredProperty] = List(s1) if (s.MainType.isDefined) { val maintp = s.MainType.get - val s2 = createSP(s.MainType.get, classid,className, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES) + val s2 = createSP( + s.MainType.get, + classid, + className, + ModelConstants.DNET_SUBJECT_TYPOLOGIES, + ModelConstants.DNET_SUBJECT_TYPOLOGIES + ) s2.setDataInfo(di) resList = resList ::: List(s2) if (maintp.contains(".")) { - val s3 = createSP(maintp.split("\\.").head, classid,className, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES) + val s3 = createSP( + maintp.split("\\.").head, + classid, + className, + ModelConstants.DNET_SUBJECT_TYPOLOGIES, + ModelConstants.DNET_SUBJECT_TYPOLOGIES + ) s3.setDataInfo(di) resList = resList ::: List(s3) } @@ -156,25 +250,27 @@ case object ConversionUtil { publication } - - def addInstances(a: (Publication, MagUrl)): Publication = { val pub = a._1 val urls = a._2 - - val i = new Instance + if (urls != 
null) { - if (urls!= null) { - - val l:List[String] = urls.instances.filter(k=>k.SourceUrl.nonEmpty).map(k=>k.SourceUrl):::List(s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}") + val l: List[String] = urls.instances + .filter(k => k.SourceUrl.nonEmpty) + .map(k => k.SourceUrl) ::: List( + s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}" + ) i.setUrl(l.asJava) - } - else - i.setUrl(List(s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}").asJava) + } else + i.setUrl( + List( + s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}" + ).asJava + ) // Ticket #6281 added pid to Instance i.setPid(pub.getPid) @@ -184,13 +280,13 @@ case object ConversionUtil { pub } - def transformPaperAbstract(input: MagPaperAbstract): MagPaperAbstract = { MagPaperAbstract(input.PaperId, convertInvertedIndexString(input.IndexedAbstract)) } - - def createOAFFromJournalAuthorPaper(inputParams: ((MagPapers, MagJournal), MagPaperWithAuthorList)): Publication = { + def createOAFFromJournalAuthorPaper( + inputParams: ((MagPapers, MagJournal), MagPaperWithAuthorList) + ): Publication = { val paper = inputParams._1._1 val journal = inputParams._1._2 val authors = inputParams._2 @@ -206,31 +302,37 @@ case object ConversionUtil { pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub)) val mainTitles = createSP(paper.PaperTitle, "main title", ModelConstants.DNET_DATACITE_TITLE) - val originalTitles = createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE) + val originalTitles = + createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE) pub.setTitle(List(mainTitles, originalTitles).asJava) pub.setSource(List(asField(paper.BookTitle)).asJava) val authorsOAF = authors.authors.map { f: MagAuthorAffiliation => - val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author a.setRank(f.sequenceNumber) if (f.author.DisplayName.isDefined) a.setFullname(f.author.DisplayName.get) - if(f.affiliation!= null) + if (f.affiliation != null) a.setAffiliation(List(asField(f.affiliation)).asJava) - a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", ModelConstants.DNET_PID_TYPES)).asJava) + a.setPid( + List( + createSP( + s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", + "URL", + ModelConstants.DNET_PID_TYPES + ) + ).asJava + ) a } pub.setAuthor(authorsOAF.asJava) - if (paper.Date != null && paper.Date.isDefined) { - pub.setDateofacceptance(asField(paper.Date.get.toString.substring(0,10))) + pub.setDateofacceptance(asField(paper.Date.get.toString.substring(0, 10))) } pub.setPublisher(asField(paper.Publisher)) - if (journal != null && journal.DisplayName.isDefined) { val j = new Journal @@ -250,8 +352,9 @@ case object ConversionUtil { pub } - - def createOAF(inputParams: ((MagPapers, MagPaperWithAuthorList), MagPaperAbstract)): Publication = { + def createOAF( + inputParams: ((MagPapers, MagPaperWithAuthorList), MagPaperAbstract) + ): Publication = { val paper = inputParams._1._1 val authors = inputParams._1._2 @@ -268,46 +371,48 @@ case object ConversionUtil { pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub)) val mainTitles = createSP(paper.PaperTitle, "main title", ModelConstants.DNET_DATACITE_TITLE) - val originalTitles = createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE) + val 
originalTitles = + createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE) pub.setTitle(List(mainTitles, originalTitles).asJava) pub.setSource(List(asField(paper.BookTitle)).asJava) - if (description != null) { pub.setDescription(List(asField(description.IndexedAbstract)).asJava) } - val authorsOAF = authors.authors.map { f: MagAuthorAffiliation => - val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author a.setFullname(f.author.DisplayName.get) - if(f.affiliation!= null) + if (f.affiliation != null) a.setAffiliation(List(asField(f.affiliation)).asJava) - - a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", ModelConstants.DNET_PID_TYPES)).asJava) + a.setPid( + List( + createSP( + s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", + "URL", + ModelConstants.DNET_PID_TYPES + ) + ).asJava + ) a } - if (paper.Date != null) { - pub.setDateofacceptance(asField(paper.Date.toString.substring(0,10))) + pub.setDateofacceptance(asField(paper.Date.toString.substring(0, 10))) } pub.setAuthor(authorsOAF.asJava) - pub } - def convertInvertedIndexString(json_input: String): String = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(json_input) @@ -317,13 +422,13 @@ case object ConversionUtil { val iid = (json \ "InvertedIndex").extract[Map[String, List[Int]]] - for {(k: String, v: List[Int]) <- iid} { + for { (k: String, v: List[Int]) <- iid } { v.foreach(item => res(item) = k) } - (0 until idl).foreach(i => { - if (res(i) == null) - res(i) = "" - }) + (0 until idl).foreach(i => { + if (res(i) == null) + res(i) = "" + }) return res.mkString(" ") } "" diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala index 039c935f3..316bd91ac 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala @@ -8,44 +8,245 @@ import org.apache.spark.sql.{SaveMode, SparkSession} import org.slf4j.{Logger, LoggerFactory} object SparkImportMagIntoDataset { + val datatypedict = Map( - "bool" -> BooleanType, - "int" -> IntegerType, - "uint" -> IntegerType, - "long" -> LongType, - "ulong" -> LongType, - "float" -> FloatType, - "string" -> StringType, + "bool" -> BooleanType, + "int" -> IntegerType, + "uint" -> IntegerType, + "long" -> LongType, + "ulong" -> LongType, + "float" -> FloatType, + "string" -> StringType, "DateTime" -> DateType ) - val stream = Map( - "Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Iso3166Code:string", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")), - "AuthorExtendedAttributes" -> Tuple2("mag/AuthorExtendedAttributes.txt", Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")), - "Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), - "ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", 
Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")), - "ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), - "EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")), - "FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")), - "FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")), - "FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), - "Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), - "PaperAbstractsInvertedIndex" -> Tuple2("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", "IndexedAbstract:string")), - "PaperAuthorAffiliations" -> Tuple2("mag/PaperAuthorAffiliations.txt", Seq("PaperId:long", "AuthorId:long", "AffiliationId:long?", "AuthorSequenceNumber:uint", "OriginalAuthor:string", "OriginalAffiliation:string")), - "PaperCitationContexts" -> Tuple2("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")), - "PaperExtendedAttributes" -> Tuple2("mag/PaperExtendedAttributes.txt", Seq("PaperId:long", "AttributeType:int", "AttributeValue:string")), - "PaperFieldsOfStudy" -> Tuple2("advanced/PaperFieldsOfStudy.txt", Seq("PaperId:long", "FieldOfStudyId:long", "Score:float")), - "PaperMeSH" -> Tuple2("advanced/PaperMeSH.txt", Seq("PaperId:long", "DescriptorUI:string", "DescriptorName:string", "QualifierUI:string", "QualifierName:string", "IsMajorTopic:bool")), - "PaperRecommendations" -> Tuple2("advanced/PaperRecommendations.txt", Seq("PaperId:long", "RecommendedPaperId:long", "Score:float")), - "PaperReferences" -> Tuple2("mag/PaperReferences.txt", Seq("PaperId:long", "PaperReferenceId:long")), - "PaperResources" -> Tuple2("mag/PaperResources.txt", Seq("PaperId:long", "ResourceType:int", "ResourceUrl:string", "SourceUrl:string", "RelationshipType:int")), - "PaperUrls" -> Tuple2("mag/PaperUrls.txt", Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")), - "Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "OnlineDate:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", 
"Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "FamilyRank:uint?", "DocSubTypes:string", "CreatedDate:DateTime")), - "RelatedFieldOfStudy" -> Tuple2("advanced/RelatedFieldOfStudy.txt", Seq("FieldOfStudyId1:long", "Type1:string", "FieldOfStudyId2:long", "Type2:string", "Rank:float")) + "Affiliations" -> Tuple2( + "mag/Affiliations.txt", + Seq( + "AffiliationId:long", + "Rank:uint", + "NormalizedName:string", + "DisplayName:string", + "GridId:string", + "OfficialPage:string", + "WikiPage:string", + "PaperCount:long", + "PaperFamilyCount:long", + "CitationCount:long", + "Iso3166Code:string", + "Latitude:float?", + "Longitude:float?", + "CreatedDate:DateTime" + ) + ), + "AuthorExtendedAttributes" -> Tuple2( + "mag/AuthorExtendedAttributes.txt", + Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string") + ), + "Authors" -> Tuple2( + "mag/Authors.txt", + Seq( + "AuthorId:long", + "Rank:uint", + "NormalizedName:string", + "DisplayName:string", + "LastKnownAffiliationId:long?", + "PaperCount:long", + "PaperFamilyCount:long", + "CitationCount:long", + "CreatedDate:DateTime" + ) + ), + "ConferenceInstances" -> Tuple2( + "mag/ConferenceInstances.txt", + Seq( + "ConferenceInstanceId:long", + "NormalizedName:string", + "DisplayName:string", + "ConferenceSeriesId:long", + "Location:string", + "OfficialUrl:string", + "StartDate:DateTime?", + "EndDate:DateTime?", + "AbstractRegistrationDate:DateTime?", + "SubmissionDeadlineDate:DateTime?", + "NotificationDueDate:DateTime?", + "FinalVersionDueDate:DateTime?", + "PaperCount:long", + "PaperFamilyCount:long", + "CitationCount:long", + "Latitude:float?", + "Longitude:float?", + "CreatedDate:DateTime" + ) + ), + "ConferenceSeries" -> Tuple2( + "mag/ConferenceSeries.txt", + Seq( + "ConferenceSeriesId:long", + "Rank:uint", + "NormalizedName:string", + "DisplayName:string", + "PaperCount:long", + "PaperFamilyCount:long", + "CitationCount:long", + "CreatedDate:DateTime" + ) + ), + "EntityRelatedEntities" -> Tuple2( + "advanced/EntityRelatedEntities.txt", + Seq( + "EntityId:long", + "EntityType:string", + "RelatedEntityId:long", + "RelatedEntityType:string", + "RelatedType:int", + "Score:float" + ) + ), + "FieldOfStudyChildren" -> Tuple2( + "advanced/FieldOfStudyChildren.txt", + Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long") + ), + "FieldOfStudyExtendedAttributes" -> Tuple2( + "advanced/FieldOfStudyExtendedAttributes.txt", + Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string") + ), + "FieldsOfStudy" -> Tuple2( + "advanced/FieldsOfStudy.txt", + Seq( + "FieldOfStudyId:long", + "Rank:uint", + "NormalizedName:string", + "DisplayName:string", + "MainType:string", + "Level:int", + "PaperCount:long", + "PaperFamilyCount:long", + "CitationCount:long", + "CreatedDate:DateTime" + ) + ), + "Journals" -> Tuple2( + "mag/Journals.txt", + Seq( + "JournalId:long", + "Rank:uint", + "NormalizedName:string", + "DisplayName:string", + "Issn:string", + "Publisher:string", + "Webpage:string", + "PaperCount:long", + "PaperFamilyCount:long", + "CitationCount:long", + "CreatedDate:DateTime" + ) + ), + "PaperAbstractsInvertedIndex" -> Tuple2( + "nlp/PaperAbstractsInvertedIndex.txt.*", + Seq("PaperId:long", "IndexedAbstract:string") + ), + "PaperAuthorAffiliations" -> Tuple2( + "mag/PaperAuthorAffiliations.txt", + Seq( + "PaperId:long", + "AuthorId:long", + "AffiliationId:long?", + "AuthorSequenceNumber:uint", + 
"OriginalAuthor:string", + "OriginalAffiliation:string" + ) + ), + "PaperCitationContexts" -> Tuple2( + "nlp/PaperCitationContexts.txt", + Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string") + ), + "PaperExtendedAttributes" -> Tuple2( + "mag/PaperExtendedAttributes.txt", + Seq("PaperId:long", "AttributeType:int", "AttributeValue:string") + ), + "PaperFieldsOfStudy" -> Tuple2( + "advanced/PaperFieldsOfStudy.txt", + Seq("PaperId:long", "FieldOfStudyId:long", "Score:float") + ), + "PaperMeSH" -> Tuple2( + "advanced/PaperMeSH.txt", + Seq( + "PaperId:long", + "DescriptorUI:string", + "DescriptorName:string", + "QualifierUI:string", + "QualifierName:string", + "IsMajorTopic:bool" + ) + ), + "PaperRecommendations" -> Tuple2( + "advanced/PaperRecommendations.txt", + Seq("PaperId:long", "RecommendedPaperId:long", "Score:float") + ), + "PaperReferences" -> Tuple2( + "mag/PaperReferences.txt", + Seq("PaperId:long", "PaperReferenceId:long") + ), + "PaperResources" -> Tuple2( + "mag/PaperResources.txt", + Seq( + "PaperId:long", + "ResourceType:int", + "ResourceUrl:string", + "SourceUrl:string", + "RelationshipType:int" + ) + ), + "PaperUrls" -> Tuple2( + "mag/PaperUrls.txt", + Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string") + ), + "Papers" -> Tuple2( + "mag/Papers.txt", + Seq( + "PaperId:long", + "Rank:uint", + "Doi:string", + "DocType:string", + "PaperTitle:string", + "OriginalTitle:string", + "BookTitle:string", + "Year:int?", + "Date:DateTime?", + "OnlineDate:DateTime?", + "Publisher:string", + "JournalId:long?", + "ConferenceSeriesId:long?", + "ConferenceInstanceId:long?", + "Volume:string", + "Issue:string", + "FirstPage:string", + "LastPage:string", + "ReferenceCount:long", + "CitationCount:long", + "EstimatedCitation:long", + "OriginalVenue:string", + "FamilyId:long?", + "FamilyRank:uint?", + "DocSubTypes:string", + "CreatedDate:DateTime" + ) + ), + "RelatedFieldOfStudy" -> Tuple2( + "advanced/RelatedFieldOfStudy.txt", + Seq( + "FieldOfStudyId1:long", + "Type1:string", + "FieldOfStudyId2:long", + "Type2:string", + "Rank:float" + ) + ) ) - def getSchema(streamName: String): StructType = { var schema = new StructType() val d: Seq[String] = stream(streamName)._2 @@ -61,19 +262,22 @@ object SparkImportMagIntoDataset { schema } - def main(args: Array[String]): Unit = { val logger: Logger = LoggerFactory.getLogger(getClass) val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/convert_mag_to_oaf_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/convert_mag_to_oaf_params.json") + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() - + .master(parser.get("master")) + .getOrCreate() stream.foreach { case (k, v) => val s: StructType = getSchema(k) diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala index 41e95baa1..eae669853 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala @@ -9,6 +9,7 @@ import org.apache.spark.sql.functions.{col, collect_list, struct} 
import org.apache.spark.sql._ import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ + object SparkProcessMAG { def getDistinctResults(d: Dataset[MagPapers]): Dataset[MagPapers] = { @@ -17,13 +18,31 @@ object SparkProcessMAG { .reduceGroups((p1: MagPapers, p2: MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1, p2)) .map(_._2)(Encoders.product[MagPapers]) .map(mp => { - MagPapers(mp.PaperId, mp.Rank, DoiBoostMappingUtil.normalizeDoi(mp.Doi), - mp.DocType, mp.PaperTitle, mp.OriginalTitle, - mp.BookTitle, mp.Year, mp.Date, mp.Publisher: String, - mp.JournalId, mp.ConferenceSeriesId, mp.ConferenceInstanceId, - mp.Volume, mp.Issue, mp.FirstPage, mp.LastPage, - mp.ReferenceCount, mp.CitationCount, mp.EstimatedCitation, - mp.OriginalVenue, mp.FamilyId, mp.CreatedDate) + MagPapers( + mp.PaperId, + mp.Rank, + DoiBoostMappingUtil.normalizeDoi(mp.Doi), + mp.DocType, + mp.PaperTitle, + mp.OriginalTitle, + mp.BookTitle, + mp.Year, + mp.Date, + mp.Publisher: String, + mp.JournalId, + mp.ConferenceSeriesId, + mp.ConferenceInstanceId, + mp.Volume, + mp.Issue, + mp.FirstPage, + mp.LastPage, + mp.ReferenceCount, + mp.CitationCount, + mp.EstimatedCitation, + mp.OriginalVenue, + mp.FamilyId, + mp.CreatedDate + ) })(Encoders.product[MagPapers]) } @@ -31,22 +50,29 @@ object SparkProcessMAG { val logger: Logger = LoggerFactory.getLogger(getClass) val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json") + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() + .master(parser.get("master")) + .getOrCreate() val sourcePath = parser.get("sourcePath") val workingPath = parser.get("workingPath") val targetPath = parser.get("targetPath") import spark.implicits._ - implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication] - implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs) + implicit val mapEncoderPubs: Encoder[Publication] = + org.apache.spark.sql.Encoders.kryo[Publication] + implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = + Encoders.tuple(Encoders.STRING, mapEncoderPubs) logger.info("Phase 1) make unique DOI in Papers:") val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers] @@ -58,16 +84,23 @@ object SparkProcessMAG { logger.info("Phase 0) Enrich Publication with description") val pa = spark.read.load(s"$sourcePath/PaperAbstractsInvertedIndex").as[MagPaperAbstract] - pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"$workingPath/PaperAbstract") + pa.map(ConversionUtil.transformPaperAbstract) + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/PaperAbstract") logger.info("Phase 3) Group Author by PaperId") val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor] val affiliation = spark.read.load(s"$sourcePath/Affiliations").as[MagAffiliation] - val paperAuthorAffiliation = spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation] + val paperAuthorAffiliation = + spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation] -
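/* Phase 3 join: pair each (paper, author) affiliation row with its author record, left-join the affiliation to resolve its display name, then collect the authors of each paper into a single list */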
paperAuthorAffiliation.joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId"))) - .map { case (a: MagPaperAuthorAffiliation, b: MagAuthor) => (a.AffiliationId, MagPaperAuthorDenormalized(a.PaperId, b, null, a.AuthorSequenceNumber)) } + paperAuthorAffiliation + .joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId"))) + .map { case (a: MagPaperAuthorAffiliation, b: MagAuthor) => + (a.AffiliationId, MagPaperAuthorDenormalized(a.PaperId, b, null, a.AuthorSequenceNumber)) + } .joinWith(affiliation, affiliation("AffiliationId").equalTo(col("_1")), "left") .map(s => { val mpa = s._1._2 @@ -76,79 +109,133 @@ object SparkProcessMAG { MagPaperAuthorDenormalized(mpa.PaperId, mpa.author, af.DisplayName, mpa.sequenceNumber) } else mpa - }).groupBy("PaperId").agg(collect_list(struct($"author", $"affiliation", $"sequenceNumber")).as("authors")) - .write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_1_paper_authors") + }) + .groupBy("PaperId") + .agg(collect_list(struct($"author", $"affiliation", $"sequenceNumber")).as("authors")) + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/merge_step_1_paper_authors") - logger.info("Phase 4) create First Version of publication Entity with Paper Journal and Authors") + logger.info( + "Phase 4) create First Version of publication Entity with Paper Journal and Authors" + ) val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal] - val papers = spark.read.load((s"$workingPath/Papers_distinct")).as[MagPapers] + val papers = spark.read.load(s"$workingPath/Papers_distinct").as[MagPapers] - val paperWithAuthors = spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList] + val paperWithAuthors = + spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList] - val firstJoin = papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left") - firstJoin.joinWith(paperWithAuthors, firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")), "left") + val firstJoin = + papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left") + firstJoin + .joinWith( + paperWithAuthors, + firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")), + "left" + ) .map { a => ConversionUtil.createOAFFromJournalAuthorPaper(a) } - .write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_2") - + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/merge_step_2") var magPubs: Dataset[(String, Publication)] = - spark.read.load(s"$workingPath/merge_step_2").as[Publication] - .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)] + spark.read + .load(s"$workingPath/merge_step_2") + .as[Publication] + .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)) + .as[(String, Publication)] + val conference = spark.read + .load(s"$sourcePath/ConferenceInstances") + .select( + $"ConferenceInstanceId".as("ci"), + $"DisplayName", + $"Location", + $"StartDate", + $"EndDate" + ) + val conferenceInstance = conference + .joinWith(papers, papers("ConferenceInstanceId").equalTo(conference("ci"))) + .select( + $"_1.ci", + $"_1.DisplayName", + $"_1.Location", + $"_1.StartDate", + $"_1.EndDate", + $"_2.PaperId" + ) + .as[MagConferenceInstance] - val conference = spark.read.load(s"$sourcePath/ConferenceInstances") - .select($"ConferenceInstanceId".as("ci"), $"DisplayName", $"Location", $"StartDate", $"EndDate") - val conferenceInstance = 
conference.joinWith(papers, papers("ConferenceInstanceId").equalTo(conference("ci"))) - .select($"_1.ci", $"_1.DisplayName", $"_1.Location", $"_1.StartDate", $"_1.EndDate", $"_2.PaperId").as[MagConferenceInstance] - - - magPubs.joinWith(conferenceInstance, col("_1").equalTo(conferenceInstance("PaperId")), "left") + magPubs + .joinWith(conferenceInstance, col("_1").equalTo(conferenceInstance("PaperId")), "left") .map(item => ConversionUtil.updatePubsWithConferenceInfo(item)) .write .mode(SaveMode.Overwrite) .save(s"$workingPath/merge_step_3") + val paperAbstract = spark.read.load(s"$workingPath/PaperAbstract").as[MagPaperAbstract] - val paperAbstract = spark.read.load((s"$workingPath/PaperAbstract")).as[MagPaperAbstract] - - - magPubs = spark.read.load(s"$workingPath/merge_step_3").as[Publication] - .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)] - - magPubs.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left") - .map(item => ConversionUtil.updatePubsWithDescription(item) - ).write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_4") + magPubs = spark.read + .load(s"$workingPath/merge_step_3") + .as[Publication] + .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)) + .as[(String, Publication)] + magPubs + .joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left") + .map(item => ConversionUtil.updatePubsWithDescription(item)) + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/merge_step_4") logger.info("Phase 7) Enrich Publication with FieldOfStudy") - magPubs = spark.read.load(s"$workingPath/merge_step_4").as[Publication] - .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)] + magPubs = spark.read + .load(s"$workingPath/merge_step_4") + .as[Publication] + .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)) + .as[(String, Publication)] - val fos = spark.read.load(s"$sourcePath/FieldsOfStudy").select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType") + val fos = spark.read + .load(s"$sourcePath/FieldsOfStudy") + .select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType") val pfos = spark.read.load(s"$sourcePath/PaperFieldsOfStudy") - val paperField = pfos.joinWith(fos, fos("fos").equalTo(pfos("FieldOfStudyId"))) + val paperField = pfos + .joinWith(fos, fos("fos").equalTo(pfos("FieldOfStudyId"))) .select($"_1.FieldOfStudyId", $"_2.DisplayName", $"_2.MainType", $"_1.PaperId", $"_1.Score") - .groupBy($"PaperId").agg(collect_list(struct($"FieldOfStudyId", $"DisplayName", $"MainType", $"Score")).as("subjects")) + .groupBy($"PaperId") + .agg( + collect_list(struct($"FieldOfStudyId", $"DisplayName", $"MainType", $"Score")) + .as("subjects") + ) .as[MagFieldOfStudy] - magPubs.joinWith(paperField, col("_1") - .equalTo(paperField("PaperId")), "left") + magPubs + .joinWith( + paperField, + col("_1") + .equalTo(paperField("PaperId")), + "left" + ) .map(item => ConversionUtil.updatePubsWithSubject(item)) - .write.mode(SaveMode.Overwrite) + .write + .mode(SaveMode.Overwrite) .save(s"$workingPath/mag_publication") - spark.read.load(s"$workingPath/mag_publication").as[Publication] + spark.read + .load(s"$workingPath/mag_publication") + .as[Publication] .filter(p => p.getId != null) .groupByKey(p => p.getId) .reduceGroups((a: Publication, b: Publication) => ConversionUtil.mergePublication(a, b)) .map(_._2) - .write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication") - + 
.write + .mode(SaveMode.Overwrite) + .save(s"$targetPath/magPublication") } } diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala index 11031f9ca..7c58afc09 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala @@ -15,15 +15,20 @@ import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ +case class ORCIDItem(doi: String, authors: List[OrcidAuthor]) {} -case class ORCIDItem(doi:String, authors:List[OrcidAuthor]){} -case class OrcidAuthor(oid:String, name:Option[String], surname:Option[String], creditName:Option[String], otherNames:Option[List[String]], errorCode:Option[String]){} -case class OrcidWork(oid:String, doi:String) +case class OrcidAuthor( + oid: String, + name: Option[String], + surname: Option[String], + creditName: Option[String], + otherNames: Option[List[String]], + errorCode: Option[String] +) {} +case class OrcidWork(oid: String, doi: String) +case class ORCIDElement(doi: String, authors: List[ORCIDItem]) {} - - -case class ORCIDElement(doi:String, authors:List[ORCIDItem]) {} object ORCIDToOAF { val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass) val mapper = new ObjectMapper() @@ -41,7 +46,7 @@ object ORCIDToOAF { def extractValueFromInputString(input: String): (String, String) = { val i = input.indexOf('[') - if (i <5) { + if (i < 5) { return null } val orcidList = input.substring(i, input.length - 1) @@ -51,17 +56,16 @@ object ORCIDToOAF { } else null } - - def strValid(s:Option[String]) : Boolean = { + def strValid(s: Option[String]): Boolean = { s.isDefined && s.get.nonEmpty } - def authorValid(author:OrcidAuthor): Boolean ={ + def authorValid(author: OrcidAuthor): Boolean = { if (strValid(author.name) && strValid(author.surname)) { return true } if (strValid(author.surname)) { - return true + return true } if (strValid(author.creditName)) { return true @@ -70,37 +74,35 @@ object ORCIDToOAF { false } - - def extractDOIWorks(input:String): List[OrcidWork] = { + def extractDOIWorks(input: String): List[OrcidWork] = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) - val oid = (json \ "workDetail" \"oid").extractOrElse[String](null) + val oid = (json \ "workDetail" \ "oid").extractOrElse[String](null) if (oid == null) return List() - val doi:List[(String, String)] = for { - JObject(extIds) <- json \ "workDetail" \"extIds" + val doi: List[(String, String)] = for { + JObject(extIds) <- json \ "workDetail" \ "extIds" JField("type", JString(typeValue)) <- extIds - JField("value", JString(value)) <- extIds + JField("value", JString(value)) <- extIds if "doi".equalsIgnoreCase(typeValue) } yield (typeValue, DoiBoostMappingUtil.normalizeDoi(value)) if (doi.nonEmpty) { - return doi.map(l =>OrcidWork(oid, l._2)) + return doi.map(l => OrcidWork(oid, l._2)) } List() } - def convertORCIDAuthor(input:String): OrcidAuthor = { + def convertORCIDAuthor(input: String): OrcidAuthor = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) - (json \"authorData" ).extractOrElse[OrcidAuthor](null) - } + (json \ "authorData").extractOrElse[OrcidAuthor](null) + } - - def convertTOOAF(input:ORCIDItem) :Publication = { + def convertTOOAF(input: ORCIDItem): 
Publication = { val doi = input.doi - val pub:Publication = new Publication + val pub: Publication = new Publication pub.setPid(List(createSP(doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava) pub.setDataInfo(generateDataInfo()) @@ -108,9 +110,9 @@ object ORCIDToOAF { if (pub.getId == null) return null - try{ + try { - val l:List[Author]= input.authors.map(a=> { + val l: List[Author] = input.authors.map(a => { generateAuthor(a) })(collection.breakOut) @@ -125,30 +127,38 @@ object ORCIDToOAF { } } - def generateOricPIDDatainfo():DataInfo = { - val di =DoiBoostMappingUtil.generateDataInfo("0.91") + def generateOricPIDDatainfo(): DataInfo = { + val di = DoiBoostMappingUtil.generateDataInfo("0.91") di.getProvenanceaction.setClassid(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY) di.getProvenanceaction.setClassname(ModelConstants.HARVESTED) di } - def generateAuthor(o : OrcidAuthor): Author = { + def generateAuthor(o: OrcidAuthor): Author = { val a = new Author if (strValid(o.name)) { - a.setName(o.name.get.capitalize) + a.setName(o.name.get.capitalize) } if (strValid(o.surname)) { a.setSurname(o.surname.get.capitalize) } - if(strValid(o.name) && strValid(o.surname)) + if (strValid(o.name) && strValid(o.surname)) a.setFullname(s"${o.name.get.capitalize} ${o.surname.get.capitalize}") else if (strValid(o.creditName)) a.setFullname(o.creditName.get) if (StringUtils.isNotBlank(o.oid)) - a.setPid(List(createSP(o.oid, ModelConstants.ORCID, ModelConstants.DNET_PID_TYPES, generateOricPIDDatainfo())).asJava) + a.setPid( + List( + createSP( + o.oid, + ModelConstants.ORCID, + ModelConstants.DNET_PID_TYPES, + generateOricPIDDatainfo() + ) + ).asJava + ) a } - -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala index 1b189e296..95a1f5a19 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala @@ -10,11 +10,11 @@ import org.slf4j.{Logger, LoggerFactory} object SparkConvertORCIDToOAF { val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass) - def run(spark: SparkSession, workingPath: String, targetPath: String): Unit = { implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication] import spark.implicits._ - val dataset: Dataset[ORCIDItem] = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem] + val dataset: Dataset[ORCIDItem] = + spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem] logger.info("Converting ORCID to OAF") dataset.map(o => ORCIDToOAF.convertTOOAF(o)).write.mode(SaveMode.Overwrite).save(targetPath) @@ -22,15 +22,21 @@ object SparkConvertORCIDToOAF { def main(args: Array[String]): Unit = { val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_orcid_to_oaf_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + SparkConvertORCIDToOAF.getClass.getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/convert_orcid_to_oaf_params.json" + ) + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() - + 
.master(parser.get("master")) + .getOrCreate() val workingPath = parser.get("workingPath") val targetPath = parser.get("targetPath") diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala index 153be5dd1..7b6408417 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala @@ -17,45 +17,72 @@ object SparkPreprocessORCID { } - def run(spark: SparkSession, sourcePath: String, workingPath: String): Unit = { import spark.implicits._ implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication] - val inputRDD: RDD[OrcidAuthor] = spark.sparkContext.textFile(s"$sourcePath/authors").map(s => ORCIDToOAF.convertORCIDAuthor(s)).filter(s => s != null).filter(s => ORCIDToOAF.authorValid(s)) + val inputRDD: RDD[OrcidAuthor] = spark.sparkContext + .textFile(s"$sourcePath/authors") + .map(s => ORCIDToOAF.convertORCIDAuthor(s)) + .filter(s => s != null) + .filter(s => ORCIDToOAF.authorValid(s)) - spark.createDataset(inputRDD).as[OrcidAuthor].write.mode(SaveMode.Overwrite).save(s"$workingPath/author") + spark + .createDataset(inputRDD) + .as[OrcidAuthor] + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/author") - val res = spark.sparkContext.textFile(s"$sourcePath/works").flatMap(s => ORCIDToOAF.extractDOIWorks(s)).filter(s => s != null) + val res = spark.sparkContext + .textFile(s"$sourcePath/works") + .flatMap(s => ORCIDToOAF.extractDOIWorks(s)) + .filter(s => s != null) - spark.createDataset(res).as[OrcidWork].write.mode(SaveMode.Overwrite).save(s"$workingPath/works") + spark + .createDataset(res) + .as[OrcidWork] + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/works") val authors: Dataset[OrcidAuthor] = spark.read.load(s"$workingPath/author").as[OrcidAuthor] val works: Dataset[OrcidWork] = spark.read.load(s"$workingPath/works").as[OrcidWork] - works.joinWith(authors, authors("oid").equalTo(works("oid"))) + works + .joinWith(authors, authors("oid").equalTo(works("oid"))) .map(i => { val doi = i._1.doi val author = i._2 (doi, author) - }).groupBy(col("_1").alias("doi")) - .agg(collect_list(col("_2")).alias("authors")).as[ORCIDItem] + }) + .groupBy(col("_1").alias("doi")) + .agg(collect_list(col("_2")).alias("authors")) + .as[ORCIDItem] .map(s => fixORCIDItem(s)) - .write.mode(SaveMode.Overwrite).save(s"$workingPath/orcidworksWithAuthor") + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/orcidworksWithAuthor") } def main(args: Array[String]): Unit = { val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/preprocess_orcid_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + SparkConvertORCIDToOAF.getClass.getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/preprocess_orcid_params.json" + ) + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() - + .master(parser.get("master")) + .getOrCreate() val sourcePath = parser.get("sourcePath") val workingPath = parser.get("workingPath") diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala 
b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala index 70290018d..9f7f9d18f 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala @@ -13,28 +13,35 @@ object SparkMapUnpayWallToOAF { def main(args: Array[String]): Unit = { - val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass) val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(SparkMapDumpIntoOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_uw_to_oaf_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + SparkMapDumpIntoOAF.getClass.getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/convert_uw_to_oaf_params.json" + ) + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() + .master(parser.get("master")) + .getOrCreate() implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication] - val sourcePath = parser.get("sourcePath") val targetPath = parser.get("targetPath") val inputRDD: RDD[String] = spark.sparkContext.textFile(s"$sourcePath") logger.info("Converting UnpayWall to OAF") - val d: Dataset[Publication] = spark.createDataset(inputRDD.map(UnpayWallToOAF.convertToOAF).filter(p => p != null)).as[Publication] + val d: Dataset[Publication] = spark + .createDataset(inputRDD.map(UnpayWallToOAF.convertToOAF).filter(p => p != null)) + .as[Publication] d.write.mode(SaveMode.Overwrite).save(targetPath) } diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala index bf5694965..bbdc80b1d 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala @@ -12,33 +12,41 @@ import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ - - -case class OALocation(evidence:Option[String], host_type:Option[String], is_best:Option[Boolean], license: Option[String], pmh_id:Option[String], updated:Option[String], - url:Option[String], url_for_landing_page:Option[String], url_for_pdf:Option[String], version:Option[String]) {} - - - +case class OALocation( + evidence: Option[String], + host_type: Option[String], + is_best: Option[Boolean], + license: Option[String], + pmh_id: Option[String], + updated: Option[String], + url: Option[String], + url_for_landing_page: Option[String], + url_for_pdf: Option[String], + version: Option[String] +) {} object UnpayWallToOAF { val logger: Logger = LoggerFactory.getLogger(getClass) - - def get_unpaywall_color(input:String):Option[OpenAccessRoute] = { - if(input == null || input.equalsIgnoreCase("close")) + def get_unpaywall_color(input: String): Option[OpenAccessRoute] = { + if (input == null || input.equalsIgnoreCase("close")) return None - if(input.equalsIgnoreCase("green")) + if (input.equalsIgnoreCase("green")) return Some(OpenAccessRoute.green) - if(input.equalsIgnoreCase("bronze")) + if (input.equalsIgnoreCase("bronze")) return Some(OpenAccessRoute.bronze) - if(input.equalsIgnoreCase("hybrid")) + if (input.equalsIgnoreCase("hybrid")) return Some(OpenAccessRoute.hybrid) else return 
Some(OpenAccessRoute.gold) } - def get_color(is_oa:Boolean, location: OALocation, journal_is_oa:Boolean):Option[OpenAccessRoute] = { + def get_color( + is_oa: Boolean, + location: OALocation, + journal_is_oa: Boolean + ): Option[OpenAccessRoute] = { if (is_oa) { if (location.host_type.isDefined) { { @@ -62,23 +70,22 @@ object UnpayWallToOAF { None } - - def convertToOAF(input:String):Publication = { + def convertToOAF(input: String): Publication = { val pub = new Publication implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) - val doi = DoiBoostMappingUtil.normalizeDoi((json \"doi").extract[String]) + val doi = DoiBoostMappingUtil.normalizeDoi((json \ "doi").extract[String]) - if(doi == null) + if (doi == null) return null - val is_oa = (json\ "is_oa").extract[Boolean] + val is_oa = (json \ "is_oa").extract[Boolean] - val journal_is_oa= (json\ "journal_is_oa").extract[Boolean] + val journal_is_oa = (json \ "journal_is_oa").extract[Boolean] - val oaLocation:OALocation = (json \ "best_oa_location").extractOrElse[OALocation](null) + val oaLocation: OALocation = (json \ "best_oa_location").extractOrElse[OALocation](null) val colour = get_unpaywall_color((json \ "oa_status").extractOrElse[String](null)) @@ -88,9 +95,9 @@ object UnpayWallToOAF { if (!is_oa) return null - if(oaLocation== null || oaLocation.url.isEmpty) - return null - val i :Instance= new Instance() + if (oaLocation == null || oaLocation.url.isEmpty) + return null + val i: Instance = new Instance() i.setCollectedfrom(createUnpayWallCollectedFrom()) // i.setAccessright(getOpenAccessQualifier()) @@ -122,7 +129,4 @@ object UnpayWallToOAF { } - - - } diff --git a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/DoiBoostHostedByMapTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/DoiBoostHostedByMapTest.scala index 41730ade0..61d2eef29 100644 --- a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/DoiBoostHostedByMapTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/DoiBoostHostedByMapTest.scala @@ -6,15 +6,11 @@ import org.junit.jupiter.api.Test class DoiBoostHostedByMapTest { @Test - def idDSGeneration():Unit = { - val s ="doajarticles::0066-782X" - - + def idDSGeneration(): Unit = { + val s = "doajarticles::0066-782X" println(DoiBoostMappingUtil.generateDSId(s)) - } - } diff --git a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala index a9a841ee9..391d45b10 100644 --- a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala @@ -6,41 +6,39 @@ import org.junit.jupiter.api.Test class NormalizeDOITest { @Test - def doiDSLowerCase():Unit = { - val doi ="10.1042/BCJ20160876" + def doiDSLowerCase(): Unit = { + val doi = "10.1042/BCJ20160876" assert(DoiBoostMappingUtil.normalizeDoi(doi).equals(doi.toLowerCase())) } - @Test - def doiFiltered():Unit = { + def doiFiltered(): Unit = { val doi = "0.1042/BCJ20160876" assert(DoiBoostMappingUtil.normalizeDoi(doi) == null) } @Test - def doiFiltered2():Unit = { + def doiFiltered2(): Unit = { val doi = "https://doi.org/0.1042/BCJ20160876" assert(DoiBoostMappingUtil.normalizeDoi(doi) == null) } - @Test - def doiCleaned():Unit = { + def doiCleaned(): Unit = { val doi = 
"https://doi.org/10.1042/BCJ20160876" assert(DoiBoostMappingUtil.normalizeDoi(doi).equals("10.1042/BCJ20160876".toLowerCase())) } @Test - def doiCleaned1():Unit = { + def doiCleaned1(): Unit = { val doi = "https://doi.org/10.1042/ BCJ20160876" assert(DoiBoostMappingUtil.normalizeDoi(doi).equals("10.1042/BCJ20160876".toLowerCase())) } -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/crossref/CrossrefMappingTest.scala index 71dbf27be..8124a5aae 100644 --- a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/crossref/CrossrefMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/crossref/CrossrefMappingTest.scala @@ -12,20 +12,24 @@ import scala.collection.JavaConverters._ import scala.io.Source import scala.util.matching.Regex - class CrossrefMappingTest { val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass) val mapper = new ObjectMapper() - - @Test def testFunderRelationshipsMapping(): Unit = { - val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")).mkString - val funder_doi = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi")).mkString - val funder_name = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi")).mkString - + val template = Source + .fromInputStream( + getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json") + ) + .mkString + val funder_doi = Source + .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi")) + .mkString + val funder_name = Source + .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi")) + .mkString for (line <- funder_doi.lines) { val json = template.replace("%s", line) @@ -43,7 +47,8 @@ class CrossrefMappingTest { def checkRelation(generatedOAF: List[Oaf]): Unit = { - val rels: List[Relation] = generatedOAF.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]] + val rels: List[Relation] = + generatedOAF.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]] assertFalse(rels.isEmpty) rels.foreach(relation => { val relJson = mapper.writeValueAsString(relation) @@ -59,22 +64,22 @@ class CrossrefMappingTest { } - @Test - def testSum() :Unit = { - val from:Long = 1613135645000L - val delta:Long = 1000000L - - - println(s"updating from value: $from -> ${from+delta}") + def testSum(): Unit = { + val from: Long = 1613135645000L + val delta: Long = 1000000L + println(s"updating from value: $from -> ${from + delta}") } @Test - def testOrcidID() :Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/orcid_data.json")).mkString - + def testOrcidID(): Unit = { + val json = Source + .fromInputStream( + getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/orcid_data.json") + ) + .mkString assertNotNull(json) assertFalse(json.isEmpty); @@ -85,17 +90,18 @@ class CrossrefMappingTest { val items = resultList.filter(p => p.isInstanceOf[Result]) - mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) items.foreach(p => println(mapper.writeValueAsString(p))) - } @Test - def testEmptyTitle() :Unit = { - val json = 
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/empty_title.json")).mkString - + def testEmptyTitle(): Unit = { + val json = Source + .fromInputStream( + getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/empty_title.json") + ) + .mkString assertNotNull(json) assertFalse(json.isEmpty); @@ -106,17 +112,16 @@ class CrossrefMappingTest { val items = resultList.filter(p => p.isInstanceOf[Result]) - mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) items.foreach(p => println(mapper.writeValueAsString(p))) - } - @Test def testPeerReviewed(): Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/prwTest.json")).mkString + val json = Source + .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/prwTest.json")) + .mkString mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) assertNotNull(json) @@ -128,12 +133,8 @@ class CrossrefMappingTest { val items = resultList.filter(p => p.isInstanceOf[Result]) - items.foreach(p => logger.info(mapper.writeValueAsString(p))) - - - } def extractECAward(award: String): String = { @@ -143,21 +144,21 @@ class CrossrefMappingTest { null } - @Test def extractECTest(): Unit = { - val s = "FP7/2007-2013" + val s = "FP7/2007-2013" val awardExtracted = extractECAward(s) println(awardExtracted) println(DHPUtils.md5(awardExtracted)) - } @Test def testJournalRelation(): Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/awardTest.json")).mkString + val json = Source + .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/awardTest.json")) + .mkString assertNotNull(json) assertFalse(json.isEmpty) @@ -165,20 +166,19 @@ class CrossrefMappingTest { val resultList: List[Oaf] = Crossref2Oaf.convert(json) assertTrue(resultList.nonEmpty) - val rels:List[Relation] = resultList.filter(p => p.isInstanceOf[Relation]).map(r=> r.asInstanceOf[Relation]) - - + val rels: List[Relation] = + resultList.filter(p => p.isInstanceOf[Relation]).map(r => r.asInstanceOf[Relation]) rels.foreach(s => logger.info(s.getTarget)) - assertEquals(rels.size, 6 ) - + assertEquals(rels.size, 6) } - @Test def testConvertBookFromCrossRef2Oaf(): Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/book.json")).mkString + val json = Source + .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/book.json")) + .mkString assertNotNull(json) assertFalse(json.isEmpty); @@ -199,42 +199,62 @@ class CrossrefMappingTest { assertNotNull(result.getDataInfo, "Datainfo test not null Failed"); assertNotNull( result.getDataInfo.getProvenanceaction, - "DataInfo/Provenance test not null Failed"); + "DataInfo/Provenance test not null Failed" + ); assertFalse( result.getDataInfo.getProvenanceaction.getClassid.isEmpty, - "DataInfo/Provenance/classId test not null Failed"); + "DataInfo/Provenance/classId test not null Failed" + ); assertFalse( result.getDataInfo.getProvenanceaction.getClassname.isEmpty, - "DataInfo/Provenance/className test not null Failed"); + "DataInfo/Provenance/className test not null Failed" + ); assertFalse( result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty, - "DataInfo/Provenance/SchemeId test not null Failed"); + "DataInfo/Provenance/SchemeId test not null Failed" + ); assertFalse( result.getDataInfo.getProvenanceaction.getSchemename.isEmpty, - "DataInfo/Provenance/SchemeName 
test not null Failed"); + "DataInfo/Provenance/SchemeName test not null Failed" + ); assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed"); assertFalse(result.getCollectedfrom.isEmpty); val collectedFromList = result.getCollectedfrom.asScala - assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from assertion") - - assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion") + assert( + collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), + "Wrong collected from assertion" + ) + assert( + collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), + "Wrong collected from assertion" + ) val relevantDates = result.getRelevantdate.asScala - assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created") - assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")), "Missing relevant date of type published-online") - assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")), "Missing relevant date of type published-print") + assert( + relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), + "Missing relevant date of type created" + ) + assert( + relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")), + "Missing relevant date of type published-online" + ) + assert( + relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")), + "Missing relevant date of type published-print" + ) val rels = resultList.filter(p => p.isInstanceOf[Relation]) assert(rels.isEmpty) } - @Test def testConvertPreprintFromCrossRef2Oaf(): Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/preprint.json")).mkString + val json = Source + .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/preprint.json")) + .mkString assertNotNull(json) assertFalse(json.isEmpty); @@ -255,44 +275,70 @@ class CrossrefMappingTest { assertNotNull(result.getDataInfo, "Datainfo test not null Failed"); assertNotNull( result.getDataInfo.getProvenanceaction, - "DataInfo/Provenance test not null Failed"); + "DataInfo/Provenance test not null Failed" + ); assertFalse( result.getDataInfo.getProvenanceaction.getClassid.isEmpty, - "DataInfo/Provenance/classId test not null Failed"); + "DataInfo/Provenance/classId test not null Failed" + ); assertFalse( result.getDataInfo.getProvenanceaction.getClassname.isEmpty, - "DataInfo/Provenance/className test not null Failed"); + "DataInfo/Provenance/className test not null Failed" + ); assertFalse( result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty, - "DataInfo/Provenance/SchemeId test not null Failed"); + "DataInfo/Provenance/SchemeId test not null Failed" + ); assertFalse( result.getDataInfo.getProvenanceaction.getSchemename.isEmpty, - "DataInfo/Provenance/SchemeName test not null Failed"); + "DataInfo/Provenance/SchemeName test not null Failed" + ); assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed"); assertFalse(result.getCollectedfrom.isEmpty); val collectedFromList = result.getCollectedfrom.asScala - assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from 
assertion") - - assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion") + assert( + collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), + "Wrong collected from assertion" + ) + assert( + collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), + "Wrong collected from assertion" + ) val relevantDates = result.getRelevantdate.asScala - assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created") - assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("available")), "Missing relevant date of type available") - assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("accepted")), "Missing relevant date of type accepted") - assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")), "Missing relevant date of type published-online") - assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")), "Missing relevant date of type published-print") + assert( + relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), + "Missing relevant date of type created" + ) + assert( + relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("available")), + "Missing relevant date of type available" + ) + assert( + relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("accepted")), + "Missing relevant date of type accepted" + ) + assert( + relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-online")), + "Missing relevant date of type published-online" + ) + assert( + relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("published-print")), + "Missing relevant date of type published-print" + ) val rels = resultList.filter(p => p.isInstanceOf[Relation]) assert(rels.isEmpty) } - @Test def testConvertDatasetFromCrossRef2Oaf(): Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dataset.json")).mkString + val json = Source + .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dataset.json")) + .mkString assertNotNull(json) assertFalse(json.isEmpty); @@ -313,19 +359,24 @@ class CrossrefMappingTest { assertNotNull(result.getDataInfo, "Datainfo test not null Failed"); assertNotNull( result.getDataInfo.getProvenanceaction, - "DataInfo/Provenance test not null Failed"); + "DataInfo/Provenance test not null Failed" + ); assertFalse( result.getDataInfo.getProvenanceaction.getClassid.isEmpty, - "DataInfo/Provenance/classId test not null Failed"); + "DataInfo/Provenance/classId test not null Failed" + ); assertFalse( result.getDataInfo.getProvenanceaction.getClassname.isEmpty, - "DataInfo/Provenance/className test not null Failed"); + "DataInfo/Provenance/className test not null Failed" + ); assertFalse( result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty, - "DataInfo/Provenance/SchemeId test not null Failed"); + "DataInfo/Provenance/SchemeId test not null Failed" + ); assertFalse( result.getDataInfo.getProvenanceaction.getSchemename.isEmpty, - "DataInfo/Provenance/SchemeName test not null Failed"); + "DataInfo/Provenance/SchemeName test not null Failed" + ); assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed"); assertFalse(result.getCollectedfrom.isEmpty); @@ -333,7 +384,9 @@ class CrossrefMappingTest { @Test def 
testConvertArticleFromCrossRef2Oaf(): Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json")).mkString + val json = Source + .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json")) + .mkString assertNotNull(json) assertFalse(json.isEmpty); @@ -354,32 +407,45 @@ class CrossrefMappingTest { assertNotNull(result.getDataInfo, "Datainfo test not null Failed"); assertNotNull( result.getDataInfo.getProvenanceaction, - "DataInfo/Provenance test not null Failed"); + "DataInfo/Provenance test not null Failed" + ); assertFalse( result.getDataInfo.getProvenanceaction.getClassid.isEmpty, - "DataInfo/Provenance/classId test not null Failed"); + "DataInfo/Provenance/classId test not null Failed" + ); assertFalse( result.getDataInfo.getProvenanceaction.getClassname.isEmpty, - "DataInfo/Provenance/className test not null Failed"); + "DataInfo/Provenance/className test not null Failed" + ); assertFalse( result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty, - "DataInfo/Provenance/SchemeId test not null Failed"); + "DataInfo/Provenance/SchemeId test not null Failed" + ); assertFalse( result.getDataInfo.getProvenanceaction.getSchemename.isEmpty, - "DataInfo/Provenance/SchemeName test not null Failed"); + "DataInfo/Provenance/SchemeName test not null Failed" + ); assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed"); assertFalse(result.getCollectedfrom.isEmpty); val collectedFromList = result.getCollectedfrom.asScala - assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from assertion") - - assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion") + assert( + collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), + "Wrong collected from assertion" + ) + assert( + collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), + "Wrong collected from assertion" + ) val relevantDates = result.getRelevantdate.asScala - assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created") + assert( + relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), + "Missing relevant date of type created" + ) val rels = resultList.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]] assertFalse(rels.isEmpty) @@ -393,15 +459,14 @@ class CrossrefMappingTest { }) - } - - @Test def testSetDateOfAcceptanceCrossRef2Oaf(): Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dump_file.json")).mkString + val json = Source + .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dump_file.json")) + .mkString assertNotNull(json) assertFalse(json.isEmpty); @@ -421,8 +486,13 @@ class CrossrefMappingTest { @Test def testNormalizeDOI(): Unit = { - val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")).mkString - val line :String = "\"funder\": [{\"name\": \"Wellcome Trust Masters Fellowship\",\"award\": [\"090633\"]}]," + val template = Source + .fromInputStream( + getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json") + ) + .mkString + val line: String = + "\"funder\": [{\"name\": \"Wellcome Trust Masters Fellowship\",\"award\": 
[\"090633\"]}]," val json = template.replace("%s", line) val resultList: List[Oaf] = Crossref2Oaf.convert(json) assertTrue(resultList.nonEmpty) @@ -431,13 +501,17 @@ class CrossrefMappingTest { result.getPid.asScala.foreach(pid => assertTrue(pid.getQualifier.getClassid.equals("doi"))) assertTrue(result.getPid.size() == 1) - result.getPid.asScala.foreach(pid => assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase()))) + result.getPid.asScala.foreach(pid => + assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase())) + ) } @Test def testNormalizeDOI2(): Unit = { - val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json")).mkString + val template = Source + .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json")) + .mkString val resultList: List[Oaf] = Crossref2Oaf.convert(template) assertTrue(resultList.nonEmpty) @@ -446,14 +520,19 @@ class CrossrefMappingTest { result.getPid.asScala.foreach(pid => assertTrue(pid.getQualifier.getClassid.equals("doi"))) assertTrue(result.getPid.size() == 1) - result.getPid.asScala.foreach(pid => assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase()))) + result.getPid.asScala.foreach(pid => + assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase())) + ) } @Test - def testLicenseVorClosed() :Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_vor.json")).mkString - + def testLicenseVorClosed(): Unit = { + val json = Source + .fromInputStream( + getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_vor.json") + ) + .mkString assertNotNull(json) assertFalse(json.isEmpty); @@ -462,25 +541,28 @@ class CrossrefMappingTest { assertTrue(resultList.nonEmpty) - - val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result] + val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result] mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) println(mapper.writeValueAsString(item)) - assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://www.springer.com/vor"))) - assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("CLOSED"))) + assertTrue( + item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://www.springer.com/vor")) + ) + assertTrue( + item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("CLOSED")) + ) assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null)) - - - } @Test - def testLicenseOpen() :Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_open.json")).mkString - + def testLicenseOpen(): Unit = { + val json = Source + .fromInputStream( + getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_open.json") + ) + .mkString assertNotNull(json) assertFalse(json.isEmpty); @@ -489,21 +571,33 @@ class CrossrefMappingTest { assertTrue(resultList.nonEmpty) + val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result] - val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result] - - assertTrue(item.getInstance().asScala exists (i => 
i.getLicense.getValue.equals("http://pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html"))) + assertTrue( + item.getInstance().asScala exists (i => + i.getLicense.getValue.equals( + "http://pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html" + ) + ) + ) assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN"))) - assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid)) + assertTrue( + item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid) + ) mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) println(mapper.writeValueAsString(item)) } @Test - def testLicenseEmbargoOpen() :Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo_open.json")).mkString - + def testLicenseEmbargoOpen(): Unit = { + val json = Source + .fromInputStream( + getClass.getResourceAsStream( + "/eu/dnetlib/doiboost/crossref/publication_license_embargo_open.json" + ) + ) + .mkString assertNotNull(json) assertFalse(json.isEmpty); @@ -512,21 +606,33 @@ class CrossrefMappingTest { assertTrue(resultList.nonEmpty) + val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result] - val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result] - - assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"))) + assertTrue( + item.getInstance().asScala exists (i => + i.getLicense.getValue.equals( + "https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model" + ) + ) + ) assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("OPEN"))) - assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid)) + assertTrue( + item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == OpenAccessRoute.hybrid) + ) mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) println(mapper.writeValueAsString(item)) } @Test - def testLicenseEmbargo() :Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo.json")).mkString - + def testLicenseEmbargo(): Unit = { + val json = Source + .fromInputStream( + getClass.getResourceAsStream( + "/eu/dnetlib/doiboost/crossref/publication_license_embargo.json" + ) + ) + .mkString assertNotNull(json) assertFalse(json.isEmpty); @@ -535,35 +641,18 @@ class CrossrefMappingTest { assertTrue(resultList.nonEmpty) + val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result] - val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result] - - assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"))) - assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO"))) - assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null)) - mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) - println(mapper.writeValueAsString(item)) - - } - - - @Test - def 
testLicenseEmbargoDateTime() :Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo_datetime.json")).mkString - - - assertNotNull(json) - assertFalse(json.isEmpty); - - val resultList: List[Oaf] = Crossref2Oaf.convert(json) - - assertTrue(resultList.nonEmpty) - - - val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result] - - assertTrue(item.getInstance().asScala exists (i => i.getLicense.getValue.equals("https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model"))) - assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO"))) + assertTrue( + item.getInstance().asScala exists (i => + i.getLicense.getValue.equals( + "https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model" + ) + ) + ) + assertTrue( + item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO")) + ) assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null)) mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) println(mapper.writeValueAsString(item)) @@ -571,9 +660,14 @@ class CrossrefMappingTest { } @Test - def testMultipleURLs() :Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/multiple_urls.json")).mkString - + def testLicenseEmbargoDateTime(): Unit = { + val json = Source + .fromInputStream( + getClass.getResourceAsStream( + "/eu/dnetlib/doiboost/crossref/publication_license_embargo_datetime.json" + ) + ) + .mkString assertNotNull(json) assertFalse(json.isEmpty); @@ -582,12 +676,47 @@ class CrossrefMappingTest { assertTrue(resultList.nonEmpty) + val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result] - val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result] + assertTrue( + item.getInstance().asScala exists (i => + i.getLicense.getValue.equals( + "https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model" + ) + ) + ) + assertTrue( + item.getInstance().asScala exists (i => i.getAccessright.getClassid.equals("EMBARGO")) + ) + assertTrue(item.getInstance().asScala exists (i => i.getAccessright.getOpenAccessRoute == null)) + mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) + println(mapper.writeValueAsString(item)) + + } + + @Test + def testMultipleURLs(): Unit = { + val json = Source + .fromInputStream( + getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/multiple_urls.json") + ) + .mkString + + assertNotNull(json) + assertFalse(json.isEmpty); + + val resultList: List[Oaf] = Crossref2Oaf.convert(json) + + assertTrue(resultList.nonEmpty) + + val item: Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result] assertEquals(1, item.getInstance().size()) assertEquals(1, item.getInstance().get(0).getUrl().size()) - assertEquals("https://doi.org/10.1016/j.jas.2019.105013", item.getInstance().get(0).getUrl().get(0)) + assertEquals( + "https://doi.org/10.1016/j.jas.2019.105013", + item.getInstance().get(0).getUrl().get(0) + ) //println(mapper.writeValueAsString(item)) } diff --git a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/mag/MAGMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/mag/MAGMappingTest.scala index 
611f3b323..882c0d8a0 100644 --- a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/mag/MAGMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/mag/MAGMappingTest.scala @@ -12,43 +12,35 @@ import org.slf4j.{Logger, LoggerFactory} import java.sql.Timestamp import scala.io.Source - - class MAGMappingTest { val logger: Logger = LoggerFactory.getLogger(getClass) val mapper = new ObjectMapper() - - - @Test - def testSplitter():Unit = { + def testSplitter(): Unit = { val s = "sports.team" - if (s.contains(".")) { - println(s.split("\\.")head) + println(s.split("\\.") head) } } - - @Test - def testDate() :Unit = { + def testDate(): Unit = { - val p:Timestamp = Timestamp.valueOf("2011-10-02 00:00:00") + val p: Timestamp = Timestamp.valueOf("2011-10-02 00:00:00") - println(p.toString.substring(0,10)) + println(p.toString.substring(0, 10)) } - - @Test def buildInvertedIndexTest(): Unit = { - val json_input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/mag/invertedIndex.json")).mkString + val json_input = Source + .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/mag/invertedIndex.json")) + .mkString val description = ConversionUtil.convertInvertedIndexString(json_input) assertNotNull(description) assertTrue(description.nonEmpty) @@ -56,10 +48,9 @@ class MAGMappingTest { logger.debug(description) } + @Test - def normalizeDoiTest():Unit = { - - + def normalizeDoiTest(): Unit = { implicit val formats = DefaultFormats @@ -78,8 +69,9 @@ class MAGMappingTest { val schema = Encoders.product[MagPapers].schema import spark.implicits._ - val magPapers :Dataset[MagPapers] = spark.read.option("multiline",true).schema(schema).json(path).as[MagPapers] - val ret :Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers) + val magPapers: Dataset[MagPapers] = + spark.read.option("multiline", true).schema(schema).json(path).as[MagPapers] + val ret: Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers) assertTrue(ret.count == 10) ret.take(10).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase()))) @@ -87,7 +79,7 @@ class MAGMappingTest { } @Test - def normalizeDoiTest2():Unit = { + def normalizeDoiTest2(): Unit = { import org.json4s.DefaultFormats @@ -108,15 +100,13 @@ class MAGMappingTest { val schema = Encoders.product[MagPapers].schema import spark.implicits._ - val magPapers :Dataset[MagPapers] = spark.read.option("multiline",true).schema(schema).json(path).as[MagPapers] - val ret :Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers) + val magPapers: Dataset[MagPapers] = + spark.read.option("multiline", true).schema(schema).json(path).as[MagPapers] + val ret: Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers) assertTrue(ret.count == 8) ret.take(8).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase()))) spark.close() //ret.take(8).foreach(mp => println(write(mp))) } - } - - diff --git a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/orcid/MappingORCIDToOAFTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/orcid/MappingORCIDToOAFTest.scala index 7c8f01f81..e5bf1bd5f 100644 --- a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/orcid/MappingORCIDToOAFTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/orcid/MappingORCIDToOAFTest.scala @@ -19,8 +19,10 @@ class MappingORCIDToOAFTest { val mapper = new ObjectMapper() @Test - def testExtractData():Unit ={ - val 
json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/orcid/dataOutput")).mkString + def testExtractData(): Unit = { + val json = Source + .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/orcid/dataOutput")) + .mkString assertNotNull(json) assertFalse(json.isEmpty) json.lines.foreach(s => { @@ -29,10 +31,10 @@ class MappingORCIDToOAFTest { } @Test - def testOAFConvert(@TempDir testDir: Path):Unit ={ - val sourcePath:String = getClass.getResource("/eu/dnetlib/doiboost/orcid/datasets").getPath - val targetPath: String =s"${testDir.toString}/output/orcidPublication" - val workingPath =s"${testDir.toString}/wp/" + def testOAFConvert(@TempDir testDir: Path): Unit = { + val sourcePath: String = getClass.getResource("/eu/dnetlib/doiboost/orcid/datasets").getPath + val targetPath: String = s"${testDir.toString}/output/orcidPublication" + val workingPath = s"${testDir.toString}/wp/" val conf = new SparkConf() conf.setMaster("local[*]") @@ -46,18 +48,14 @@ class MappingORCIDToOAFTest { implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication] import spark.implicits._ - SparkPreprocessORCID.run( spark,sourcePath, workingPath) + SparkPreprocessORCID.run(spark, sourcePath, workingPath) - SparkConvertORCIDToOAF.run(spark, workingPath,targetPath) + SparkConvertORCIDToOAF.run(spark, workingPath, targetPath) val mapper = new ObjectMapper() - - val oA = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem].count() - - val p: Dataset[Publication] = spark.read.load(targetPath).as[Publication] assertTrue(oA == p.count()) @@ -65,19 +63,18 @@ class MappingORCIDToOAFTest { spark.close() - } - @Test - def testExtractDat1():Unit ={ + def testExtractDat1(): Unit = { + val aList: List[OrcidAuthor] = List( + OrcidAuthor("0000-0002-4335-5309", Some("Lucrecia"), Some("Curto"), null, null, null), + OrcidAuthor("0000-0001-7501-3330", Some("Emilio"), Some("Malchiodi"), null, null, null), + OrcidAuthor("0000-0002-5490-9186", Some("Sofia"), Some("Noli Truant"), null, null, null) + ) - - val aList: List[OrcidAuthor] = List(OrcidAuthor("0000-0002-4335-5309", Some("Lucrecia"), Some("Curto"), null, null, null ), - OrcidAuthor("0000-0001-7501-3330", Some("Emilio"), Some("Malchiodi"), null, null, null ), OrcidAuthor("0000-0002-5490-9186", Some("Sofia"), Some("Noli Truant"), null, null, null )) - - val orcid:ORCIDItem = ORCIDItem("10.1042/BCJ20160876", aList) + val orcid: ORCIDItem = ORCIDItem("10.1042/BCJ20160876", aList) val oaf = ORCIDToOAF.convertTOOAF(orcid) assert(oaf.getPid.size() == 1) @@ -85,10 +82,6 @@ class MappingORCIDToOAFTest { oaf.getPid.toList.foreach(pid => assert(pid.getValue.equals("10.1042/BCJ20160876"))) //println(mapper.writeValueAsString(ORCIDToOAF.convertTOOAF(orcid))) - } - - - } diff --git a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/uw/UnpayWallMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/uw/UnpayWallMappingTest.scala index 6671758b2..542faa8ad 100644 --- a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/uw/UnpayWallMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/uw/UnpayWallMappingTest.scala @@ -14,41 +14,43 @@ class UnpayWallMappingTest { val logger: Logger = LoggerFactory.getLogger(getClass) val mapper = new ObjectMapper() - @Test - def testMappingToOAF():Unit ={ + def testMappingToOAF(): Unit = { - val Ilist = 
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/uw/input.json")).mkString + val Ilist = Source + .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/uw/input.json")) + .mkString - var i:Int = 0 - for (line <-Ilist.lines) { + var i: Int = 0 + for (line <- Ilist.lines) { val p = UnpayWallToOAF.convertToOAF(line) - if(p!= null) { - assertTrue(p.getInstance().size()==1) - if (i== 0){ + if (p != null) { + assertTrue(p.getInstance().size() == 1) + if (i == 0) { assertTrue(p.getPid.get(0).getValue.equals("10.1038/2211089b0")) } - if (i== 1){ + if (i == 1) { assertTrue(p.getPid.get(0).getValue.equals("10.1021/acs.bioconjchem.8b00058.s001")) } - if (i== 2){ + if (i == 2) { assertTrue(p.getPid.get(0).getValue.equals("10.1021/acs.bioconjchem.8b00086.s001")) } logger.info(s"ID : ${p.getId}") } assertNotNull(line) assertTrue(line.nonEmpty) - i = i+1 + i = i + 1 } - - - val l = Ilist.lines.next() + val l = Ilist.lines.next() val item = UnpayWallToOAF.convertToOAF(l) - assertEquals(item.getInstance().get(0).getAccessright.getOpenAccessRoute, OpenAccessRoute.bronze) + assertEquals( + item.getInstance().get(0).getAccessright.getOpenAccessRoute, + OpenAccessRoute.bronze + ) logger.info(mapper.writeValueAsString(item)) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala index ad4e1c96e..c5a2b4024 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala @@ -4,137 +4,190 @@ import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn} +case class HostedByItemType( + id: String, + officialname: String, + issn: String, + eissn: String, + lissn: String, + openAccess: Boolean +) {} -case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {} -case class HostedByInfo(id: String, officialname: String, journal_id: String, provenance : String, id_type: String) {} +case class HostedByInfo( + id: String, + officialname: String, + journal_id: String, + provenance: String, + id_type: String +) {} object Aggregators { - - - def getId(s1:String, s2:String) : String = { - if (s1.startsWith("10|")){ - return s1} - s2 - } - - def getValue(s1:String, s2:String) : String = { - if(!s1.equals("")){ + def getId(s1: String, s2: String): String = { + if (s1.startsWith("10|")) { return s1 } s2 } + def getValue(s1: String, s2: String): String = { + if (!s1.equals("")) { + return s1 + } + s2 + } - def explodeHostedByItemType(df: Dataset[(String, HostedByItemType)]): Dataset[(String, HostedByItemType)] = { - val transformedData : Dataset[(String, HostedByItemType)] = df + def explodeHostedByItemType( + df: Dataset[(String, HostedByItemType)] + ): Dataset[(String, HostedByItemType)] = { + val transformedData: Dataset[(String, HostedByItemType)] = df .groupByKey(_._1)(Encoders.STRING) .agg(Aggregators.hostedByAggregator) - .map{ - case (id:String , res:(String, HostedByItemType)) => res + .map { case (id: String, res: (String, HostedByItemType)) => + res }(Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])) transformedData } - val hostedByAggregator: TypedColumn[(String, 
HostedByItemType), (String, HostedByItemType)] = new Aggregator[(String, HostedByItemType), (String, HostedByItemType), (String, HostedByItemType)] { - override def zero: (String, HostedByItemType) = ("", HostedByItemType("","","","","",false)) - override def reduce(b: (String, HostedByItemType), a:(String,HostedByItemType)): (String, HostedByItemType) = { - return merge(b, a) - } - override def merge(b1: (String, HostedByItemType), b2: (String, HostedByItemType)): (String, HostedByItemType) = { - if (b1 == null){ - return b2 + val hostedByAggregator: TypedColumn[(String, HostedByItemType), (String, HostedByItemType)] = + new Aggregator[ + (String, HostedByItemType), + (String, HostedByItemType), + (String, HostedByItemType) + ] { + + override def zero: (String, HostedByItemType) = + ("", HostedByItemType("", "", "", "", "", false)) + + override def reduce( + b: (String, HostedByItemType), + a: (String, HostedByItemType) + ): (String, HostedByItemType) = { + return merge(b, a) } - if(b2 == null){ - return b1 - } - if(b1._2.id.startsWith("10|")){ - return (b1._1, HostedByItemType(b1._2.id, b1._2.officialname, b1._2.issn, b1._2.eissn, b1._2.lissn, b1._2.openAccess || b2._2.openAccess)) + + override def merge( + b1: (String, HostedByItemType), + b2: (String, HostedByItemType) + ): (String, HostedByItemType) = { + if (b1 == null) { + return b2 + } + if (b2 == null) { + return b1 + } + if (b1._2.id.startsWith("10|")) { + return ( + b1._1, + HostedByItemType( + b1._2.id, + b1._2.officialname, + b1._2.issn, + b1._2.eissn, + b1._2.lissn, + b1._2.openAccess || b2._2.openAccess + ) + ) + + } + return ( + b2._1, + HostedByItemType( + b2._2.id, + b2._2.officialname, + b2._2.issn, + b2._2.eissn, + b2._2.lissn, + b1._2.openAccess || b2._2.openAccess + ) + ) } - return (b2._1, HostedByItemType(b2._2.id, b2._2.officialname, b2._2.issn, b2._2.eissn, b2._2.lissn, b1._2.openAccess || b2._2.openAccess)) - } - override def finish(reduction: (String,HostedByItemType)): (String, HostedByItemType) = reduction - override def bufferEncoder: Encoder[(String,HostedByItemType)] = Encoders.tuple(Encoders.STRING,Encoders.product[HostedByItemType]) + override def finish(reduction: (String, HostedByItemType)): (String, HostedByItemType) = + reduction - override def outputEncoder: Encoder[(String,HostedByItemType)] = Encoders.tuple(Encoders.STRING,Encoders.product[HostedByItemType]) - }.toColumn + override def bufferEncoder: Encoder[(String, HostedByItemType)] = + Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]) + override def outputEncoder: Encoder[(String, HostedByItemType)] = + Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]) + }.toColumn + def resultToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] = + new Aggregator[EntityInfo, EntityInfo, EntityInfo] { + override def zero: EntityInfo = EntityInfo.newInstance("", "", "") - - def resultToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] = new Aggregator[EntityInfo, EntityInfo, EntityInfo]{ - override def zero: EntityInfo = EntityInfo.newInstance("","","") - - override def reduce(b: EntityInfo, a:EntityInfo): EntityInfo = { - return merge(b, a) - } - override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = { - if (b1 == null){ - return b2 + override def reduce(b: EntityInfo, a: EntityInfo): EntityInfo = { + return merge(b, a) } - if(b2 == null){ - return b1 + + override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = { + if (b1 == null) { + return b2 + } + if (b2 == null) { + return b1 + } + if 
(!b1.getHostedById.equals("")) { + b1.setOpenAccess(b1.getOpenAccess || b2.getOpenAccess) + return b1 + } + b2.setOpenAccess(b1.getOpenAccess || b2.getOpenAccess) + b2 + } - if(!b1.getHostedById.equals("")){ - b1.setOpenAccess(b1.getOpenAccess || b2.getOpenAccess) - return b1 - } - b2.setOpenAccess(b1.getOpenAccess || b2.getOpenAccess) - b2 + override def finish(reduction: EntityInfo): EntityInfo = reduction + override def bufferEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo]) - } - override def finish(reduction: EntityInfo): EntityInfo = reduction - override def bufferEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo]) + override def outputEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo]) + }.toColumn - override def outputEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo]) - }.toColumn - - def resultToSingleId(df:Dataset[EntityInfo]): Dataset[EntityInfo] = { - val transformedData : Dataset[EntityInfo] = df + def resultToSingleId(df: Dataset[EntityInfo]): Dataset[EntityInfo] = { + val transformedData: Dataset[EntityInfo] = df .groupByKey(_.getId)(Encoders.STRING) .agg(Aggregators.resultToSingleIdAggregator) - .map{ - case (id:String , res: EntityInfo) => res + .map { case (id: String, res: EntityInfo) => + res }(Encoders.bean(classOf[EntityInfo])) transformedData } - def datasourceToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] = new Aggregator[EntityInfo, EntityInfo, EntityInfo]{ - override def zero: EntityInfo = EntityInfo.newInstance("","","") + def datasourceToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] = + new Aggregator[EntityInfo, EntityInfo, EntityInfo] { + override def zero: EntityInfo = EntityInfo.newInstance("", "", "") - override def reduce(b: EntityInfo, a:EntityInfo): EntityInfo = { - return merge(b, a) - } - override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = { - if (b1 == null){ - return b2 + override def reduce(b: EntityInfo, a: EntityInfo): EntityInfo = { + return merge(b, a) } - if(b2 == null){ - return b1 + + override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = { + if (b1 == null) { + return b2 + } + if (b2 == null) { + return b1 + } + if (!b1.getHostedById.equals("")) { + return b1 + } + b2 + } - if(!b1.getHostedById.equals("")){ - return b1 - } - b2 + override def finish(reduction: EntityInfo): EntityInfo = reduction + override def bufferEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo]) - } - override def finish(reduction: EntityInfo): EntityInfo = reduction - override def bufferEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo]) + override def outputEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo]) + }.toColumn - override def outputEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo]) - }.toColumn - - - def datasourceToSingleId(df:Dataset[EntityInfo]): Dataset[EntityInfo] = { - val transformedData : Dataset[EntityInfo] = df + def datasourceToSingleId(df: Dataset[EntityInfo]): Dataset[EntityInfo] = { + val transformedData: Dataset[EntityInfo] = df .groupByKey(_.getHostedById)(Encoders.STRING) .agg(Aggregators.datasourceToSingleIdAggregator) - .map{ - case (id:String , res: EntityInfo) => res + .map { case (id: String, res: EntityInfo) => + res }(Encoders.bean(classOf[EntityInfo])) transformedData diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala 
b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala index 38af3eee4..80c672929 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala @@ -14,7 +14,8 @@ import org.slf4j.{Logger, LoggerFactory} object SparkApplyHostedByMapToDatasource { def applyHBtoDats(join: Dataset[EntityInfo], dats: Dataset[Datasource]): Dataset[Datasource] = { - dats.joinWith(join, dats.col("id").equalTo(join.col("hostedById")), "left") + dats + .joinWith(join, dats.col("id").equalTo(join.col("hostedById")), "left") .map(t2 => { val d: Datasource = t2._1 if (t2._2 != null) { @@ -31,14 +32,21 @@ object SparkApplyHostedByMapToDatasource { val logger: Logger = LoggerFactory.getLogger(getClass) val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json" + ) + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() + .master(parser.get("master")) + .getOrCreate() val graphPath = parser.get("graphPath") val outputPath = parser.get("outputPath") @@ -51,20 +59,27 @@ object SparkApplyHostedByMapToDatasource { val mapper = new ObjectMapper() - val dats: Dataset[Datasource] = spark.read.textFile(graphPath + "/datasource") + val dats: Dataset[Datasource] = spark.read + .textFile(graphPath + "/datasource") .map(r => mapper.readValue(r, classOf[Datasource])) - val pinfo: Dataset[EntityInfo] = Aggregators.datasourceToSingleId(spark.read.textFile(preparedInfoPath) - .map(ei => mapper.readValue(ei, classOf[EntityInfo]))) + val pinfo: Dataset[EntityInfo] = Aggregators.datasourceToSingleId( + spark.read + .textFile(preparedInfoPath) + .map(ei => mapper.readValue(ei, classOf[EntityInfo])) + ) - applyHBtoDats(pinfo, dats).write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath) + applyHBtoDats(pinfo, dats).write + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath) - spark.read.textFile(outputPath) + spark.read + .textFile(outputPath) .write .mode(SaveMode.Overwrite) .option("compression", "gzip") .text(graphPath + "/datasource") } - } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala index 204325982..a900fc241 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala @@ -16,7 +16,8 @@ import scala.collection.JavaConverters._ object SparkApplyHostedByMapToResult { def applyHBtoPubs(join: Dataset[EntityInfo], pubs: Dataset[Publication]) = { - pubs.joinWith(join, pubs.col("id").equalTo(join.col("id")), "left") + pubs + .joinWith(join, pubs.col("id").equalTo(join.col("id")), "left") .map(t2 => { val p: Publication 
= t2._1 if (t2._2 != null) { @@ -27,7 +28,14 @@ object SparkApplyHostedByMapToResult { inst.getHostedby.setKey(ei.getHostedById) inst.getHostedby.setValue(ei.getName) if (ei.getOpenAccess) { - inst.setAccessright(OafMapperUtils.accessRight(ModelConstants.ACCESS_RIGHT_OPEN, "Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)) + inst.setAccessright( + OafMapperUtils.accessRight( + ModelConstants.ACCESS_RIGHT_OPEN, + "Open Access", + ModelConstants.DNET_ACCESS_MODES, + ModelConstants.DNET_ACCESS_MODES + ) + ) inst.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold) p.setBestaccessright(OafMapperUtils.createBestAccessRights(p.getInstance())); } @@ -40,46 +48,54 @@ object SparkApplyHostedByMapToResult { def main(args: Array[String]): Unit = { - val logger: Logger = LoggerFactory.getLogger(getClass) val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_apply_params.json" + ) + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() - + .master(parser.get("master")) + .getOrCreate() val graphPath = parser.get("graphPath") val outputPath = parser.get("outputPath") val preparedInfoPath = parser.get("preparedInfoPath") - implicit val formats = DefaultFormats - implicit val mapEncoderPubs: Encoder[Publication] = Encoders.bean(classOf[Publication]) implicit val mapEncoderEinfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo]) val mapper = new ObjectMapper() - val pubs: Dataset[Publication] = spark.read.textFile(graphPath + "/publication") + val pubs: Dataset[Publication] = spark.read + .textFile(graphPath + "/publication") .map(r => mapper.readValue(r, classOf[Publication])) - val pinfo: Dataset[EntityInfo] = spark.read.textFile(preparedInfoPath) + val pinfo: Dataset[EntityInfo] = spark.read + .textFile(preparedInfoPath) .map(ei => mapper.readValue(ei, classOf[EntityInfo])) - applyHBtoPubs(pinfo, pubs).write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath) + applyHBtoPubs(pinfo, pubs).write + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath) - spark.read.textFile(outputPath) + spark.read + .textFile(outputPath) .write .mode(SaveMode.Overwrite) .option("compression", "gzip") .text(graphPath + "/publication") } - } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala index 87e203e4b..34798b147 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala @@ -19,7 +19,6 @@ object SparkPrepareHostedByInfoToApply { def getList(id: String, j: Journal, name: String): List[EntityInfo] = { var lst: List[EntityInfo] = List() - if (j.getIssnLinking != null && !j.getIssnLinking.equals("")) { lst = EntityInfo.newInstance(id, j.getIssnLinking, name) :: lst } @@ -37,14 +36,14 @@ object SparkPrepareHostedByInfoToApply { val 
mapper = new ObjectMapper() - val dd: Dataset[Publication] = spark.read.textFile(publicationPath) + val dd: Dataset[Publication] = spark.read + .textFile(publicationPath) .map(r => mapper.readValue(r, classOf[Publication])) dd.filter(p => p.getJournal != null).flatMap(p => getList(p.getId, p.getJournal, "")) } - def toEntityInfo(input: String): EntityInfo = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats @@ -53,7 +52,6 @@ object SparkPrepareHostedByInfoToApply { toEntityItem(c.keys.head, c.values.head) } - def toEntityItem(journal_id: String, hbi: HostedByItemType): EntityInfo = { EntityInfo.newInstance(hbi.id, journal_id, hbi.officialname, hbi.openAccess) @@ -61,62 +59,69 @@ } def joinResHBM(res: Dataset[EntityInfo], hbm: Dataset[EntityInfo]): Dataset[EntityInfo] = { - Aggregators.resultToSingleId(res.joinWith(hbm, res.col("journalId").equalTo(hbm.col("journalId")), "left") - .map(t2 => { - val res: EntityInfo = t2._1 - if (t2._2 != null) { - val ds = t2._2 - res.setHostedById(ds.getId) - res.setOpenAccess(ds.getOpenAccess) - res.setName(ds.getName) - } - res - })) + Aggregators.resultToSingleId( + res + .joinWith(hbm, res.col("journalId").equalTo(hbm.col("journalId")), "left") + .map(t2 => { + val res: EntityInfo = t2._1 + if (t2._2 != null) { + val ds = t2._2 + res.setHostedById(ds.getId) + res.setOpenAccess(ds.getOpenAccess) + res.setName(ds.getName) + } + res + }) + ) } def main(args: Array[String]): Unit = { - val logger: Logger = LoggerFactory.getLogger(getClass) val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_prepare_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_prepare_params.json" + ) + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() - + .master(parser.get("master")) + .getOrCreate() val graphPath = parser.get("graphPath") val outputPath = parser.get("preparedInfoPath") val hostedByMapPath = parser.get("hostedByMapPath") - implicit val formats = DefaultFormats - logger.info("Getting the Datasources") import spark.implicits._ - //STEP1: read the hostedbymap and transform it into EntityInfo - val hostedByInfo: Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo) + val hostedByInfo: Dataset[EntityInfo] = + spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo) //STEP2: create association (publication, issn), (publication, eissn), (publication, lissn) - val resultInfoDataset: Dataset[EntityInfo] = prepareResultInfo(spark, graphPath + "/publication") + val resultInfoDataset: Dataset[EntityInfo] = + prepareResultInfo(spark, graphPath + "/publication") //STEP3: left join resultInfo with hostedByInfo on journal_id. 
Reduction of all the results with the same id in just //one entry (one result could be associated with issn and eissn and so possibly match more than once against the map) //to this entry we add the id of the datasource for the next step - joinResHBM(resultInfoDataset, hostedByInfo) - .write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath) - + joinResHBM(resultInfoDataset, hostedByInfo).write + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath) } - } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala index 6dfe35623..8d8965866 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala @@ -17,9 +17,8 @@ import java.io.PrintWriter object SparkProduceHostedByMap { - - implicit val tupleForJoinEncoder: Encoder[(String, HostedByItemType)] = Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]) - + implicit val tupleForJoinEncoder: Encoder[(String, HostedByItemType)] = + Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]) def toHostedByItemType(input: ((HostedByInfo, HostedByInfo), HostedByInfo)): HostedByItemType = { val openaire: HostedByInfo = input._1._1 @@ -28,9 +27,33 @@ object SparkProduceHostedByMap { val isOpenAccess: Boolean = doaj == null && gold == null openaire.journal_id match { - case Constants.ISSN => HostedByItemType(openaire.id, openaire.officialname, openaire.journal_id, "", "", isOpenAccess) - case Constants.EISSN => HostedByItemType(openaire.id, openaire.officialname, "", openaire.journal_id, "", isOpenAccess) - case Constants.ISSNL => HostedByItemType(openaire.id, openaire.officialname, "", "", openaire.journal_id, isOpenAccess) + case Constants.ISSN => + HostedByItemType( + openaire.id, + openaire.officialname, + openaire.journal_id, + "", + "", + isOpenAccess + ) + case Constants.EISSN => + HostedByItemType( + openaire.id, + openaire.officialname, + "", + openaire.journal_id, + "", + isOpenAccess + ) + case Constants.ISSNL => + HostedByItemType( + openaire.id, + openaire.officialname, + "", + "", + openaire.journal_id, + isOpenAccess + ) // catch the default with a variable so you can print it case whoa => null @@ -46,11 +69,16 @@ object SparkProduceHostedByMap { Serialization.write(map) - } - - def getHostedByItemType(id: String, officialname: String, issn: String, eissn: String, issnl: String, oa: Boolean): HostedByItemType = { + def getHostedByItemType( + id: String, + officialname: String, + issn: String, + eissn: String, + issnl: String, + oa: Boolean + ): HostedByItemType = { if (issn != null) { if (eissn != null) { if (issnl != null) { @@ -85,7 +113,14 @@ object SparkProduceHostedByMap { def oaToHostedbyItemType(dats: Datasource): HostedByItemType = { if (dats.getJournal != null) { - return getHostedByItemType(dats.getId, dats.getOfficialname.getValue, dats.getJournal.getIssnPrinted, dats.getJournal.getIssnOnline, dats.getJournal.getIssnLinking, false) + return getHostedByItemType( + dats.getId, + dats.getOfficialname.getValue, + dats.getJournal.getIssnPrinted, + dats.getJournal.getIssnOnline, + dats.getJournal.getIssnLinking, + false + ) } HostedByItemType("", "", "", "", "", false) } @@ -94,32 +129,41 @@ object 
SparkProduceHostedByMap { import spark.implicits._ - val mapper = new ObjectMapper() implicit var encoderD = Encoders.kryo[Datasource] - val dd: Dataset[Datasource] = spark.read.textFile(datasourcePath) + val dd: Dataset[Datasource] = spark.read + .textFile(datasourcePath) .map(r => mapper.readValue(r, classOf[Datasource])) dd.map { ddt => oaToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals(""))) } - def goldToHostedbyItemType(gold: UnibiGoldModel): HostedByItemType = { - return getHostedByItemType(Constants.UNIBI, gold.getTitle, gold.getIssn, "", gold.getIssnL, true) + return getHostedByItemType( + Constants.UNIBI, + gold.getTitle, + gold.getIssn, + "", + gold.getIssnL, + true + ) } - - def goldHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = { + def goldHostedByDataset( + spark: SparkSession, + datasourcePath: String + ): Dataset[HostedByItemType] = { import spark.implicits._ implicit val mapEncoderUnibi: Encoder[UnibiGoldModel] = Encoders.kryo[UnibiGoldModel] val mapper = new ObjectMapper() - val dd: Dataset[UnibiGoldModel] = spark.read.textFile(datasourcePath) + val dd: Dataset[UnibiGoldModel] = spark.read + .textFile(datasourcePath) .map(r => mapper.readValue(r, classOf[UnibiGoldModel])) dd.map { ddt => goldToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals(""))) @@ -128,17 +172,28 @@ object SparkProduceHostedByMap { def doajToHostedbyItemType(doaj: DOAJModel): HostedByItemType = { - return getHostedByItemType(Constants.DOAJ, doaj.getJournalTitle, doaj.getIssn, doaj.getEissn, "", true) + return getHostedByItemType( + Constants.DOAJ, + doaj.getJournalTitle, + doaj.getIssn, + doaj.getEissn, + "", + true + ) } - def doajHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = { + def doajHostedByDataset( + spark: SparkSession, + datasourcePath: String + ): Dataset[HostedByItemType] = { import spark.implicits._ implicit val mapEncoderDOAJ: Encoder[DOAJModel] = Encoders.kryo[DOAJModel] val mapper = new ObjectMapper() - val dd: Dataset[DOAJModel] = spark.read.textFile(datasourcePath) + val dd: Dataset[DOAJModel] = spark.read + .textFile(datasourcePath) .map(r => mapper.readValue(r, classOf[DOAJModel])) dd.map { ddt => doajToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals(""))) @@ -159,7 +214,6 @@ object SparkProduceHostedByMap { lst } - def writeToHDFS(input: Array[String], outputPath: String, hdfsNameNode: String): Unit = { val conf = new Configuration() @@ -169,49 +223,51 @@ object SparkProduceHostedByMap { val writer = new PrintWriter(output) try { input.foreach(hbi => writer.println(hbi)) - } - finally { + } finally { writer.close() } } - def main(args: Array[String]): Unit = { val logger: Logger = LoggerFactory.getLogger(getClass) val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/hostedbymap/hostedby_params.json") + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() - + .master(parser.get("master")) + .getOrCreate() val datasourcePath = parser.get("datasourcePath") val workingDirPath = parser.get("workingPath") val outputPath = parser.get("outputPath") - implicit val formats = DefaultFormats - logger.info("Getting the 
Datasources") - - Aggregators.explodeHostedByItemType(oaHostedByDataset(spark, datasourcePath) - .union(goldHostedByDataset(spark, workingDirPath + "/unibi_gold.json")) - .union(doajHostedByDataset(spark, workingDirPath + "/doaj.json")) - .flatMap(hbi => toList(hbi))).filter(hbi => hbi._2.id.startsWith("10|")) + Aggregators + .explodeHostedByItemType( + oaHostedByDataset(spark, datasourcePath) + .union(goldHostedByDataset(spark, workingDirPath + "/unibi_gold.json")) + .union(doajHostedByDataset(spark, workingDirPath + "/doaj.json")) + .flatMap(hbi => toList(hbi)) + ) + .filter(hbi => hbi._2.id.startsWith("10|")) .map(hbi => toHostedByMap(hbi))(Encoders.STRING) - .rdd.saveAsTextFile(outputPath, classOf[GzipCodec]) - + .rdd + .saveAsTextFile(outputPath, classOf[GzipCodec]) } - } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala index fa13f477c..533948289 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala @@ -20,7 +20,13 @@ object CopyHdfsOafSparkApplication { def main(args: Array[String]): Unit = { val log = LoggerFactory.getLogger(getClass) val conf = new SparkConf() - val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/copy_hdfs_oaf_parameters.json")).mkString) + val parser = new ArgumentApplicationParser( + Source + .fromInputStream( + getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/copy_hdfs_oaf_parameters.json") + ) + .mkString + ) parser.parseArgument(args) val spark = @@ -28,7 +34,8 @@ object CopyHdfsOafSparkApplication { .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() + .master(parser.get("master")) + .getOrCreate() val sc: SparkContext = spark.sparkContext @@ -49,19 +56,22 @@ object CopyHdfsOafSparkApplication { implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] - val paths = DHPUtils.mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation, true).asScala + val paths = + DHPUtils.mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation, true).asScala - val validPaths: List[String] = paths.filter(p => HdfsSupport.exists(p, sc.hadoopConfiguration)).toList + val validPaths: List[String] = + paths.filter(p => HdfsSupport.exists(p, sc.hadoopConfiguration)).toList - val types = ModelSupport.oafTypes.entrySet - .asScala + val types = ModelSupport.oafTypes.entrySet.asScala .map(e => Tuple2(e.getKey, e.getValue)) if (validPaths.nonEmpty) { val oaf = spark.read.textFile(validPaths: _*) - val mapper = new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) + val mapper = + new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) - types.foreach(t => oaf + types.foreach(t => + oaf .filter(o => isOafType(o, t._1)) .map(j => mapper.readValue(j, t._2).asInstanceOf[Oaf]) .map(s => mapper.writeValueAsString(s))(Encoders.STRING) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala index 8e15063c2..f5a13e72b 100644 --- 
a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.oa.graph.resolution import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.schema.common.EntityType -import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset,_} +import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _} import org.apache.commons.io.IOUtils import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkConf @@ -13,20 +13,32 @@ import org.slf4j.{Logger, LoggerFactory} object SparkResolveEntities { val mapper = new ObjectMapper() - val entities = List(EntityType.dataset, EntityType.publication, EntityType.software, EntityType.otherresearchproduct) + + val entities = List( + EntityType.dataset, + EntityType.publication, + EntityType.software, + EntityType.otherresearchproduct + ) def main(args: Array[String]): Unit = { val log: Logger = LoggerFactory.getLogger(getClass) val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/resolution/resolve_entities_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/resolution/resolve_entities_params.json" + ) + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() - + .master(parser.get("master")) + .getOrCreate() val graphBasePath = parser.get("graphBasePath") log.info(s"graphBasePath -> $graphBasePath") @@ -38,7 +50,6 @@ object SparkResolveEntities { val targetPath = parser.get("targetPath") log.info(s"targetPath -> $targetPath") - val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) fs.mkdirs(new Path(workingPath)) @@ -46,60 +57,84 @@ object SparkResolveEntities { generateResolvedEntities(spark, workingPath, graphBasePath, targetPath) } - def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: String) = { implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) import spark.implicits._ - val rPid: Dataset[(String, String)] = spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)] - val up: Dataset[(String, Result)] = spark.read.text(unresolvedPath).as[String].map(s => mapper.readValue(s, classOf[Result])).map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder)) + val rPid: Dataset[(String, String)] = + spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)] + val up: Dataset[(String, Result)] = spark.read + .text(unresolvedPath) + .as[String] + .map(s => mapper.readValue(s, classOf[Result])) + .map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder)) - rPid.joinWith(up, rPid("_2").equalTo(up("_1")), "inner").map { - r => + rPid + .joinWith(up, rPid("_2").equalTo(up("_1")), "inner") + .map { r => val result = r._2._2 val dnetId = r._1._1 result.setId(dnetId) result - }.write.mode(SaveMode.Overwrite).save(s"$workingPath/resolvedEntities") + } + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/resolvedEntities") } - def deserializeObject(input: String, entity: EntityType): Result = { entity match { - case EntityType.publication => mapper.readValue(input, 
classOf[Publication]) - case EntityType.dataset => mapper.readValue(input, classOf[OafDataset]) - case EntityType.software => mapper.readValue(input, classOf[Software]) + case EntityType.publication => mapper.readValue(input, classOf[Publication]) + case EntityType.dataset => mapper.readValue(input, classOf[OafDataset]) + case EntityType.software => mapper.readValue(input, classOf[Software]) case EntityType.otherresearchproduct => mapper.readValue(input, classOf[OtherResearchProduct]) } } - def generateResolvedEntities(spark: SparkSession, workingPath: String, graphBasePath: String, targetPath: String) = { + def generateResolvedEntities( + spark: SparkSession, + workingPath: String, + graphBasePath: String, + targetPath: String + ) = { implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) import spark.implicits._ - val re: Dataset[(String, Result)] = spark.read.load(s"$workingPath/resolvedEntities").as[Result].map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder)) - entities.foreach { - e => { + val re: Dataset[(String, Result)] = spark.read + .load(s"$workingPath/resolvedEntities") + .as[Result] + .map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder)) + entities.foreach { e => + { - val currentEntityDataset: Dataset[(String, Result)] = spark.read.text(s"$graphBasePath/$e").as[String].map(s => deserializeObject(s, e)).map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder)) + val currentEntityDataset: Dataset[(String, Result)] = spark.read + .text(s"$graphBasePath/$e") + .as[String] + .map(s => deserializeObject(s, e)) + .map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder)) - currentEntityDataset.joinWith(re, currentEntityDataset("_1").equalTo(re("_1")), "left").map(k => { + currentEntityDataset + .joinWith(re, currentEntityDataset("_1").equalTo(re("_1")), "left") + .map(k => { - val a = k._1 - val b = k._2 - if (b == null) - a._2 - else { - a._2.mergeFrom(b._2) - a._2 - } - }).map(r => mapper.writeValueAsString(r))(Encoders.STRING) - .write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$targetPath/$e") + val a = k._1 + val b = k._2 + if (b == null) + a._2 + else { + a._2.mergeFrom(b._2) + a._2 + } + }) + .map(r => mapper.writeValueAsString(r))(Encoders.STRING) + .write + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .text(s"$targetPath/$e") } - } } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala index 80c09940f..2567a30a6 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala @@ -17,18 +17,25 @@ import org.json4s.jackson.JsonMethods.parse import org.slf4j.{Logger, LoggerFactory} object SparkResolveRelation { + def main(args: Array[String]): Unit = { val log: Logger = LoggerFactory.getLogger(getClass) val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/resolution/resolve_relations_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/resolution/resolve_relations_params.json" + ) + ) + ) parser.parseArgument(args) val spark: SparkSession = 
SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() - + .master(parser.get("master")) + .getOrCreate() val graphBasePath = parser.get("graphBasePath") log.info(s"graphBasePath -> $graphBasePath") @@ -41,7 +48,6 @@ object SparkResolveRelation { implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) import spark.implicits._ - //CLEANING TEMPORARY FOLDER HdfsSupport.remove(workingPath, spark.sparkContext.hadoopConfiguration) val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) @@ -51,39 +57,49 @@ object SparkResolveRelation { val mapper: ObjectMapper = new ObjectMapper() - val rPid: Dataset[(String, String)] = spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)] + val rPid: Dataset[(String, String)] = + spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)] - val relationDs: Dataset[(String, Relation)] = spark.read.text(s"$graphBasePath/relation").as[String] - .map(s => mapper.readValue(s, classOf[Relation])).as[Relation] + val relationDs: Dataset[(String, Relation)] = spark.read + .text(s"$graphBasePath/relation") + .as[String] + .map(s => mapper.readValue(s, classOf[Relation])) + .as[Relation] .map(r => (r.getSource.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder)) - relationDs.joinWith(rPid, relationDs("_1").equalTo(rPid("_2")), "left").map { - m => + relationDs + .joinWith(rPid, relationDs("_1").equalTo(rPid("_2")), "left") + .map { m => val sourceResolved = m._2 val currentRelation = m._1._2 if (sourceResolved != null && sourceResolved._1 != null && sourceResolved._1.nonEmpty) currentRelation.setSource(sourceResolved._1) currentRelation - }.write + } + .write .mode(SaveMode.Overwrite) .save(s"$workingPath/relationResolvedSource") - - val relationSourceResolved: Dataset[(String, Relation)] = spark.read.load(s"$workingPath/relationResolvedSource").as[Relation] + val relationSourceResolved: Dataset[(String, Relation)] = spark.read + .load(s"$workingPath/relationResolvedSource") + .as[Relation] .map(r => (r.getTarget.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder)) - relationSourceResolved.joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_2")), "left").map { - m => + relationSourceResolved + .joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_2")), "left") + .map { m => val targetResolved = m._2 val currentRelation = m._1._2 - if (targetResolved != null && targetResolved._1.nonEmpty) + if (targetResolved != null && targetResolved._1 != null && targetResolved._1.nonEmpty) currentRelation.setTarget(targetResolved._1) currentRelation - } + } .write .mode(SaveMode.Overwrite) .save(s"$workingPath/relation_resolved") - spark.read.load(s"$workingPath/relation_resolved").as[Relation] + spark.read + .load(s"$workingPath/relation_resolved") + .as[Relation] .filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved")) .map(r => mapper.writeValueAsString(r)) .write @@ -96,33 +112,31 @@ object SparkResolveRelation { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) val result: List[(String, String)] = for { - JObject(iObj) <- json \ "instance" - JField("collectedfrom", JObject(cf)) <- iObj + JObject(iObj) <- json \ "instance" + JField("collectedfrom", JObject(cf)) <- iObj JField("instancetype", JObject(instancetype)) <- iObj - JField("value", JString(collectedFrom)) <- cf - JField("classname", JString(classname)) + JField("value", JString(collectedFrom)) <- cf + JField("classname", JString(classname))
<- instancetype } yield (classname, collectedFrom) result } - def extractPidsFromRecord(input: String): (String, List[(String, String)]) = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) val id: String = (json \ "id").extract[String] val result: List[(String, String)] = for { - JObject(pids) <- json \\ "instance" \ "pid" - JField("value", JString(pidValue)) <- pids + JObject(pids) <- json \\ "instance" \ "pid" + JField("value", JString(pidValue)) <- pids JField("qualifier", JObject(qualifier)) <- pids - JField("classid", JString(pidType)) <- qualifier + JField("classid", JString(pidType)) <- qualifier } yield (pidValue, pidType) (id, result) } - private def isRelation(input: String): Boolean = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats @@ -132,20 +146,25 @@ object SparkResolveRelation { source != null } - def extractPidResolvedTableFromJsonRDD(spark: SparkSession, graphPath: String, workingPath: String) = { + def extractPidResolvedTableFromJsonRDD( + spark: SparkSession, + graphPath: String, + workingPath: String + ) = { import spark.implicits._ - val d: RDD[(String, String)] = spark.sparkContext.textFile(s"$graphPath/*") + val d: RDD[(String, String)] = spark.sparkContext + .textFile(s"$graphPath/*") .filter(i => !isRelation(i)) .map(i => extractPidsFromRecord(i)) .filter(s => s != null && s._1 != null && s._2 != null && s._2.nonEmpty) .flatMap { p => - p._2.map(pid => - (p._1, DHPUtils.generateUnresolvedIdentifier(pid._1, pid._2)) - ) - }.filter(r => r._1 != null || r._2 != null) + p._2.map(pid => (p._1, DHPUtils.generateUnresolvedIdentifier(pid._1, pid._2))) + } + .filter(r => r._1 != null && r._2 != null) - spark.createDataset(d) + spark + .createDataset(d) .groupByKey(_._2) .reduceGroups((x, y) => if (x._1.startsWith("50|doi") || x._1.startsWith("50|pmid")) x else y) .map(s => s._2) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala index 9df3b41bd..79b1c22cd 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala @@ -7,24 +7,26 @@ import org.apache.spark.sql.SparkSession object SparkDataciteToOAF { - def main(args: Array[String]): Unit = { val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/datacite_to_df_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/datacite_to_df_params.json") + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() - + .master(parser.get("master")) + .getOrCreate() val sc = spark.sparkContext val inputPath = parser.get("inputPath") - } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala index 9d16cf907..fb90531c5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala +++
b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala @@ -11,18 +11,22 @@ import org.slf4j.{Logger, LoggerFactory} object SparkConvertDatasetToJsonRDD { - def main(args: Array[String]): Unit = { val log: Logger = LoggerFactory.getLogger(getClass) val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json") + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() + .master(parser.get("master")) + .getOrCreate() val sourcePath = parser.get("sourcePath") log.info(s"sourcePath -> $sourcePath") @@ -33,9 +37,13 @@ object SparkConvertDatasetToJsonRDD { val mapper = new ObjectMapper() implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) - resultObject.foreach { item => - spark.read.load(s"$sourcePath/$item").as[Result].map(r => mapper.writeValueAsString(r))(Encoders.STRING).rdd.saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec]) + spark.read + .load(s"$sourcePath/$item") + .as[Result] + .map(r => mapper.writeValueAsString(r))(Encoders.STRING) + .rdd + .saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec]) } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala index 0c54de7c8..bfa07eb69 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala @@ -15,14 +15,19 @@ object SparkConvertObjectToJson { def main(args: Array[String]): Unit = { val log: Logger = LoggerFactory.getLogger(getClass) val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_object_json_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_object_json_params.json") + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() + .master(parser.get("master")) + .getOrCreate() val sourcePath = parser.get("sourcePath") log.info(s"sourcePath -> $sourcePath") @@ -33,24 +38,28 @@ object SparkConvertObjectToJson { val scholixUpdatePath = parser.get("scholixUpdatePath") log.info(s"scholixUpdatePath -> $scholixUpdatePath") - - implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix] implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] - val mapper = new ObjectMapper objectType.toLowerCase match { case "scholix" => log.info("Serialize Scholix") val d: Dataset[Scholix] = spark.read.load(sourcePath).as[Scholix] - val u :Dataset[Scholix]= spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix] - d.union(u).repartition(8000).map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.saveAsTextFile(targetPath, classOf[GzipCodec]) + val u: Dataset[Scholix] 
= spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix] + d.union(u) + .repartition(8000) + .map(s => mapper.writeValueAsString(s))(Encoders.STRING) + .rdd + .saveAsTextFile(targetPath, classOf[GzipCodec]) case "summary" => log.info("Serialize Summary") val d: Dataset[ScholixSummary] = spark.read.load(sourcePath).as[ScholixSummary] - d.map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.repartition(1000).saveAsTextFile(targetPath, classOf[GzipCodec]) + d.map(s => mapper.writeValueAsString(s))(Encoders.STRING) + .rdd + .repartition(1000) + .saveAsTextFile(targetPath, classOf[GzipCodec]) } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala index 2115df1fd..f13c14da5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala @@ -7,21 +7,26 @@ import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.slf4j.{Logger, LoggerFactory} + object SparkConvertRDDtoDataset { def main(args: Array[String]): Unit = { - val log: Logger = LoggerFactory.getLogger(getClass) val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json") + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() + .master(parser.get("master")) + .getOrCreate() val sourcePath = parser.get("sourcePath") log.info(s"sourcePath -> $sourcePath") @@ -31,43 +36,79 @@ object SparkConvertRDDtoDataset { val entityPath = s"$t/entities" val relPath = s"$t/relation" val mapper = new ObjectMapper() - implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset]) - implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication]) - implicit val relationEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) - implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct]) - implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software]) - + implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset]) + implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication]) + implicit val relationEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) + implicit val orpEncoder: Encoder[OtherResearchProduct] = + Encoders.kryo(classOf[OtherResearchProduct]) + implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software]) log.info("Converting dataset") - val rddDataset =spark.sparkContext.textFile(s"$sourcePath/dataset").map(s => mapper.readValue(s, classOf[OafDataset])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false) - spark.createDataset(rddDataset).as[OafDataset].write.mode(SaveMode.Overwrite).save(s"$entityPath/dataset") - + val rddDataset = spark.sparkContext + .textFile(s"$sourcePath/dataset") + .map(s 
=> mapper.readValue(s, classOf[OafDataset])) + .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false) + spark + .createDataset(rddDataset) + .as[OafDataset] + .write + .mode(SaveMode.Overwrite) + .save(s"$entityPath/dataset") log.info("Converting publication") - val rddPublication =spark.sparkContext.textFile(s"$sourcePath/publication").map(s => mapper.readValue(s, classOf[Publication])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false) - spark.createDataset(rddPublication).as[Publication].write.mode(SaveMode.Overwrite).save(s"$entityPath/publication") + val rddPublication = spark.sparkContext + .textFile(s"$sourcePath/publication") + .map(s => mapper.readValue(s, classOf[Publication])) + .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false) + spark + .createDataset(rddPublication) + .as[Publication] + .write + .mode(SaveMode.Overwrite) + .save(s"$entityPath/publication") log.info("Converting software") - val rddSoftware =spark.sparkContext.textFile(s"$sourcePath/software").map(s => mapper.readValue(s, classOf[Software])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false) - spark.createDataset(rddSoftware).as[Software].write.mode(SaveMode.Overwrite).save(s"$entityPath/software") + val rddSoftware = spark.sparkContext + .textFile(s"$sourcePath/software") + .map(s => mapper.readValue(s, classOf[Software])) + .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false) + spark + .createDataset(rddSoftware) + .as[Software] + .write + .mode(SaveMode.Overwrite) + .save(s"$entityPath/software") log.info("Converting otherresearchproduct") - val rddOtherResearchProduct =spark.sparkContext.textFile(s"$sourcePath/otherresearchproduct").map(s => mapper.readValue(s, classOf[OtherResearchProduct])).filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false) - spark.createDataset(rddOtherResearchProduct).as[OtherResearchProduct].write.mode(SaveMode.Overwrite).save(s"$entityPath/otherresearchproduct") - + val rddOtherResearchProduct = spark.sparkContext + .textFile(s"$sourcePath/otherresearchproduct") + .map(s => mapper.readValue(s, classOf[OtherResearchProduct])) + .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false) + spark + .createDataset(rddOtherResearchProduct) + .as[OtherResearchProduct] + .write + .mode(SaveMode.Overwrite) + .save(s"$entityPath/otherresearchproduct") log.info("Converting Relation") + val relationSemanticFilter = List( + "cites", + "iscitedby", + "merges", + "ismergedin", + "HasAmongTopNSimilarDocuments", + "IsAmongTopNSimilarDocuments" + ) - val relationSemanticFilter = List("cites", "iscitedby","merges", "ismergedin", "HasAmongTopNSimilarDocuments","IsAmongTopNSimilarDocuments" ) - - val rddRelation =spark.sparkContext.textFile(s"$sourcePath/relation") + val rddRelation = spark.sparkContext + .textFile(s"$sourcePath/relation") .map(s => mapper.readValue(s, classOf[Relation])) - .filter(r=> r.getDataInfo!= null && r.getDataInfo.getDeletedbyinference == false) - .filter(r=> r.getSource.startsWith("50") && r.getTarget.startsWith("50")) + .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false) + .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50")) .filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass))) spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath") - } } diff --git 
a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala index ed88cfaa6..9d57e5869 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala @@ -1,7 +1,7 @@ package eu.dnetlib.dhp.sx.graph import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset,_} +import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _} import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.sql._ @@ -13,82 +13,131 @@ object SparkCreateInputGraph { val log: Logger = LoggerFactory.getLogger(getClass) val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/extract_entities_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/extract_entities_params.json") + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() - + .master(parser.get("master")) + .getOrCreate() val resultObject = List( ("publication", classOf[Publication]), ("dataset", classOf[OafDataset]), ("software", classOf[Software]), ("otherResearchProduct", classOf[OtherResearchProduct]) - ) implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication]) implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset]) implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software]) - implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct]) + implicit val orpEncoder: Encoder[OtherResearchProduct] = + Encoders.kryo(classOf[OtherResearchProduct]) implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) - val sourcePath = parser.get("sourcePath") log.info(s"sourcePath -> $sourcePath") val targetPath = parser.get("targetPath") log.info(s"targetPath -> $targetPath") - val oafDs: Dataset[Oaf] = spark.read.load(s"$sourcePath/*").as[Oaf] - log.info("Extract Publication") - oafDs.filter(o => o.isInstanceOf[Publication]).map(p => p.asInstanceOf[Publication]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/publication") + oafDs + .filter(o => o.isInstanceOf[Publication]) + .map(p => p.asInstanceOf[Publication]) + .write + .mode(SaveMode.Overwrite) + .save(s"$targetPath/extracted/publication") log.info("Extract dataset") - oafDs.filter(o => o.isInstanceOf[OafDataset]).map(p => p.asInstanceOf[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/dataset") + oafDs + .filter(o => o.isInstanceOf[OafDataset]) + .map(p => p.asInstanceOf[OafDataset]) + .write + .mode(SaveMode.Overwrite) + .save(s"$targetPath/extracted/dataset") log.info("Extract software") - oafDs.filter(o => o.isInstanceOf[Software]).map(p => p.asInstanceOf[Software]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/software") + oafDs + .filter(o => o.isInstanceOf[Software]) + .map(p => p.asInstanceOf[Software]) + .write + .mode(SaveMode.Overwrite) + 
.save(s"$targetPath/extracted/software") log.info("Extract otherResearchProduct") - oafDs.filter(o => o.isInstanceOf[OtherResearchProduct]).map(p => p.asInstanceOf[OtherResearchProduct]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/otherResearchProduct") + oafDs + .filter(o => o.isInstanceOf[OtherResearchProduct]) + .map(p => p.asInstanceOf[OtherResearchProduct]) + .write + .mode(SaveMode.Overwrite) + .save(s"$targetPath/extracted/otherResearchProduct") log.info("Extract Relation") - oafDs.filter(o => o.isInstanceOf[Relation]).map(p => p.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$targetPath/extracted/relation") + oafDs + .filter(o => o.isInstanceOf[Relation]) + .map(p => p.asInstanceOf[Relation]) + .write + .mode(SaveMode.Overwrite) + .save(s"$targetPath/extracted/relation") resultObject.foreach { r => log.info(s"Make ${r._1} unique") - makeDatasetUnique(s"$targetPath/extracted/${r._1}", s"$targetPath/preprocess/${r._1}", spark, r._2) + makeDatasetUnique( + s"$targetPath/extracted/${r._1}", + s"$targetPath/preprocess/${r._1}", + spark, + r._2 + ) } } - - def extractEntities[T <: Oaf](oafDs: Dataset[Oaf], targetPath: String, clazz: Class[T], log: Logger): Unit = { + def extractEntities[T <: Oaf]( + oafDs: Dataset[Oaf], + targetPath: String, + clazz: Class[T], + log: Logger + ): Unit = { implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz) log.info(s"Extract ${clazz.getSimpleName}") - oafDs.filter(o => o.isInstanceOf[T]).map(p => p.asInstanceOf[T]).write.mode(SaveMode.Overwrite).save(targetPath) + // the type parameter is erased at runtime, so filter and cast through the provided Class token + oafDs + .filter(o => clazz.isInstance(o)) + .map(p => clazz.cast(p)) + .write + .mode(SaveMode.Overwrite) + .save(targetPath) } - - def makeDatasetUnique[T <: Result](sourcePath: String, targetPath: String, spark: SparkSession, clazz: Class[T]): Unit = { + def makeDatasetUnique[T <: Result]( + sourcePath: String, + targetPath: String, + spark: SparkSession, + clazz: Class[T] + ): Unit = { import spark.implicits._ implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz) val ds: Dataset[T] = spark.read.load(sourcePath).as[T] - ds.groupByKey(_.getId).reduceGroups { (x, y) => - x.mergeFrom(y) - x - }.map(_._2).write.mode(SaveMode.Overwrite).save(targetPath) + ds.groupByKey(_.getId) + .reduceGroups { (x, y) => + x.mergeFrom(y) + x + } + .map(_._2) + .write + .mode(SaveMode.Overwrite) + .save(targetPath) } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala index 9930c57af..af19b9698 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala @@ -17,14 +17,19 @@ object SparkCreateScholix { def main(args: Array[String]): Unit = { val log: Logger = LoggerFactory.getLogger(getClass) val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_scholix_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_scholix_params.json") + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() + .master(parser.get("master")) + .getOrCreate() val relationPath =
parser.get("relationPath") log.info(s"relationPath -> $relationPath") @@ -33,37 +38,46 @@ object SparkCreateScholix { val targetPath = parser.get("targetPath") log.info(s"targetPath -> $targetPath") - implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation] implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix] import spark.implicits._ - - val relationDS: Dataset[(String, Relation)] = spark.read.load(relationPath).as[Relation] - .filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge")) + val relationDS: Dataset[(String, Relation)] = spark.read + .load(relationPath) + .as[Relation] + .filter(r => + (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase + .contains("merge") + ) .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder)) - val summaryDS: Dataset[(String, ScholixSummary)] = spark.read.load(summaryPath).as[ScholixSummary] + val summaryDS: Dataset[(String, ScholixSummary)] = spark.read + .load(summaryPath) + .as[ScholixSummary] .map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, summaryEncoder)) - - relationDS.joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left") + relationDS + .joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left") .map { input: ((String, Relation), (String, ScholixSummary)) => if (input._1 != null && input._2 != null) { val rel: Relation = input._1._2 val source: ScholixSummary = input._2._2 (rel.getTarget, ScholixUtils.scholixFromSource(rel, source)) - } - else null + } else null }(Encoders.tuple(Encoders.STRING, scholixEncoder)) .filter(r => r != null) - .write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_from_source") + .write + .mode(SaveMode.Overwrite) + .save(s"$targetPath/scholix_from_source") - val scholixSource: Dataset[(String, Scholix)] = spark.read.load(s"$targetPath/scholix_from_source").as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder)) + val scholixSource: Dataset[(String, Scholix)] = spark.read + .load(s"$targetPath/scholix_from_source") + .as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder)) - scholixSource.joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left") + scholixSource + .joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left") .map { input: ((String, Scholix), (String, ScholixSummary)) => if (input._2 == null) { null @@ -72,40 +86,73 @@ object SparkCreateScholix { val target: ScholixSummary = input._2._2 ScholixUtils.generateCompleteScholix(s, target) } - }.filter(s => s != null).write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_one_verse") + } + .filter(s => s != null) + .write + .mode(SaveMode.Overwrite) + .save(s"$targetPath/scholix_one_verse") + val scholix_o_v: Dataset[Scholix] = + spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix] - val scholix_o_v: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix] - - scholix_o_v.flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s))).as[Scholix] + scholix_o_v + .flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s))) + .as[Scholix] .map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, scholixEncoder)) .groupByKey(_._1) .agg(ScholixUtils.scholixAggregator.toColumn) .map(s => s._2) - .write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix") + .write 
+ .mode(SaveMode.Overwrite) + .save(s"$targetPath/scholix") val scholix_final: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix").as[Scholix] - val stats: Dataset[(String, String, Long)] = scholix_final.map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType)).groupBy("_1", "_2").agg(count("_1")).as[(String, String, Long)] - + val stats: Dataset[(String, String, Long)] = scholix_final + .map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType)) + .groupBy("_1", "_2") + .agg(count("_1")) + .as[(String, String, Long)] stats - .map(s => RelatedEntities(s._1, if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0, if ("publication".equalsIgnoreCase(s._2)) s._3 else 0)) + .map(s => + RelatedEntities( + s._1, + if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0, + if ("publication".equalsIgnoreCase(s._2)) s._3 else 0 + ) + ) .groupByKey(_.id) - .reduceGroups((a, b) => RelatedEntities(a.id, a.relatedDataset + b.relatedDataset, a.relatedPublication + b.relatedPublication)) + .reduceGroups((a, b) => + RelatedEntities( + a.id, + a.relatedDataset + b.relatedDataset, + a.relatedPublication + b.relatedPublication + ) + ) .map(_._2) - .write.mode(SaveMode.Overwrite).save(s"$targetPath/related_entities") + .write + .mode(SaveMode.Overwrite) + .save(s"$targetPath/related_entities") - val relatedEntitiesDS: Dataset[RelatedEntities] = spark.read.load(s"$targetPath/related_entities").as[RelatedEntities].filter(r => r.relatedPublication > 0 || r.relatedDataset > 0) + val relatedEntitiesDS: Dataset[RelatedEntities] = spark.read + .load(s"$targetPath/related_entities") + .as[RelatedEntities] + .filter(r => r.relatedPublication > 0 || r.relatedDataset > 0) - relatedEntitiesDS.joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner").map { i => - val re = i._1 - val sum = i._2._2 + relatedEntitiesDS + .joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner") + .map { i => + val re = i._1 + val sum = i._2._2 - sum.setRelatedDatasets(re.relatedDataset) - sum.setRelatedPublications(re.relatedPublication) - sum - }.write.mode(SaveMode.Overwrite).save(s"${summaryPath}_filtered") + sum.setRelatedDatasets(re.relatedDataset) + sum.setRelatedPublications(re.relatedPublication) + sum + } + .write + .mode(SaveMode.Overwrite) + .save(s"${summaryPath}_filtered") } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala index 4274cae5a..6d489e8cb 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala @@ -14,14 +14,19 @@ object SparkCreateSummaryObject { def main(args: Array[String]): Unit = { val log: Logger = LoggerFactory.getLogger(getClass) val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_summaries_params.json"))) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_summaries_params.json") + ) + ) parser.parseArgument(args) val spark: SparkSession = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() + .master(parser.get("master")) + .getOrCreate() val sourcePath = 
parser.get("sourcePath") log.info(s"sourcePath -> $sourcePath") @@ -33,10 +38,17 @@ object SparkCreateSummaryObject { implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] + val ds: Dataset[Result] = spark.read + .load(s"$sourcePath/*") + .as[Result] + .filter(r => r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) - val ds: Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Result].filter(r => r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) - - ds.repartition(6000).map(r => ScholixUtils.resultToSummary(r)).filter(s => s != null).write.mode(SaveMode.Overwrite).save(targetPath) + ds.repartition(6000) + .map(r => ScholixUtils.resultToSummary(r)) + .filter(s => s != null) + .write + .mode(SaveMode.Overwrite) + .save(targetPath) } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala index c70397d04..23f4da6c7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala @@ -10,61 +10,88 @@ import java.util.regex.Pattern import scala.language.postfixOps import scala.xml.{Elem, Node, XML} -case class PangaeaDataModel(identifier:String, title:List[String], objectType:List[String], creator:List[String], - publisher:List[String], dataCenter :List[String],subject :List[String], language:String, - rights:String, parent:String,relation :List[String],linkage:List[(String,String)] ) {} +case class PangaeaDataModel( + identifier: String, + title: List[String], + objectType: List[String], + creator: List[String], + publisher: List[String], + dataCenter: List[String], + subject: List[String], + language: String, + rights: String, + parent: String, + relation: List[String], + linkage: List[(String, String)] +) {} object PangaeaUtils { - - def toDataset(input:String):PangaeaDataModel = { + def toDataset(input: String): PangaeaDataModel = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) - val xml= (json \ "xml").extract[String] + val xml = (json \ "xml").extract[String] parseXml(xml) } - def findDOIInRelation( input:List[String]):List[String] = { + def findDOIInRelation(input: List[String]): List[String] = { val pattern = Pattern.compile("\\b(10[.][0-9]{4,}(?:[.][0-9]+)*\\/(?:(?![\"&\\'<>])\\S)+)\\b") - input.map(i => { - val matcher = pattern.matcher(i) - if (matcher.find()) - matcher.group(0) - else - null - }).filter(i => i!= null) + input + .map(i => { + val matcher = pattern.matcher(i) + if (matcher.find()) + matcher.group(0) + else + null + }) + .filter(i => i != null) } - def attributeOpt(attribute: String, node:Node): Option[String] = + def attributeOpt(attribute: String, node: Node): Option[String] = node.attribute(attribute) flatMap (_.headOption) map (_.text) - def extractLinkage(node:Elem):List[(String, String)] = { - (node \ "linkage").map(n =>(attributeOpt("type",n), n.text)).filter(t => t._1.isDefined).map(t=> (t._1.get, t._2))(collection.breakOut) + def extractLinkage(node: Elem): List[(String, String)] = { + (node \ "linkage") + .map(n => (attributeOpt("type", n), n.text)) + .filter(t => t._1.isDefined) + .map(t => (t._1.get, t._2))(collection.breakOut) } - def parseXml(input:String):PangaeaDataModel = { + def parseXml(input: String): 
PangaeaDataModel = { val xml = XML.loadString(input) val identifier = (xml \ "identifier").text - val title :List[String] = (xml \ "title").map(n => n.text)(collection.breakOut) - val pType :List[String] = (xml \ "type").map(n => n.text)(collection.breakOut) - val creators:List[String] = (xml \ "creator").map(n => n.text)(collection.breakOut) - val publisher :List[String] = (xml \ "publisher").map(n => n.text)(collection.breakOut) - val dataCenter :List[String] = (xml \ "dataCenter").map(n => n.text)(collection.breakOut) - val subject :List[String] = (xml \ "subject").map(n => n.text)(collection.breakOut) - val language= (xml \ "language").text - val rights= (xml \ "rights").text - val parentIdentifier= (xml \ "parentIdentifier").text - val relation :List[String] = (xml \ "relation").map(n => n.text)(collection.breakOut) + val title: List[String] = (xml \ "title").map(n => n.text)(collection.breakOut) + val pType: List[String] = (xml \ "type").map(n => n.text)(collection.breakOut) + val creators: List[String] = (xml \ "creator").map(n => n.text)(collection.breakOut) + val publisher: List[String] = (xml \ "publisher").map(n => n.text)(collection.breakOut) + val dataCenter: List[String] = (xml \ "dataCenter").map(n => n.text)(collection.breakOut) + val subject: List[String] = (xml \ "subject").map(n => n.text)(collection.breakOut) + val language = (xml \ "language").text + val rights = (xml \ "rights").text + val parentIdentifier = (xml \ "parentIdentifier").text + val relation: List[String] = (xml \ "relation").map(n => n.text)(collection.breakOut) val relationFiltered = findDOIInRelation(relation) - val linkage:List[(String,String)] = extractLinkage(xml) + val linkage: List[(String, String)] = extractLinkage(xml) - PangaeaDataModel(identifier,title, pType, creators,publisher, dataCenter, subject, language, rights, parentIdentifier, relationFiltered, linkage) + PangaeaDataModel( + identifier, + title, + pType, + creators, + publisher, + dataCenter, + subject, + language, + rights, + parentIdentifier, + relationFiltered, + linkage + ) } - - def getDatasetAggregator(): Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] = new Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel]{ - + def getDatasetAggregator(): Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] = + new Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] { override def zero: PangaeaDataModel = null @@ -77,7 +104,7 @@ object PangaeaUtils { else { if (b.title != null && b.title.nonEmpty) b - else + else a._2 } @@ -106,7 +133,4 @@ object PangaeaUtils { override def outputEncoder: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel] } - - - -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala index 2717b7b80..8ff8a8b1a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala @@ -11,20 +11,25 @@ import scala.io.Source object SparkGeneratePanagaeaDataset { - def main(args: Array[String]): Unit = { val logger: Logger = LoggerFactory.getLogger(getClass) val conf: SparkConf = new SparkConf() - val parser = new 
ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/pangaea/pangaea_to_dataset.json")).mkString) + val parser = new ArgumentApplicationParser( + Source + .fromInputStream( + getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/pangaea/pangaea_to_dataset.json") + ) + .mkString + ) parser.parseArgument(args) - val spark: SparkSession = SparkSession .builder() .config(conf) .appName(SparkGeneratePanagaeaDataset.getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() + .master(parser.get("master")) + .getOrCreate() parser.getObjectMap.asScala.foreach(s => logger.info(s"${s._1} -> ${s._2}")) logger.info("Converting sequential file into Dataset") @@ -34,16 +39,20 @@ object SparkGeneratePanagaeaDataset { implicit val pangaeaEncoders: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel] - val inputRDD: RDD[PangaeaDataModel] = sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s)) + val inputRDD: RDD[PangaeaDataModel] = + sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s)) - spark.createDataset(inputRDD).as[PangaeaDataModel] + spark + .createDataset(inputRDD) + .as[PangaeaDataModel] .map(s => (s.identifier, s))(Encoders.tuple(Encoders.STRING, pangaeaEncoders)) .groupByKey(_._1)(Encoders.STRING) .agg(PangaeaUtils.getDatasetAggregator().toColumn) .map(s => s._2) - .write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset") + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/dataset") } - } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala index 4613d5636..7e41e993f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala @@ -9,10 +9,10 @@ import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue} import org.junit.jupiter.api.Test -class TestApply extends java.io.Serializable{ +class TestApply extends java.io.Serializable { @Test - def testApplyOnResult (): Unit = { + def testApplyOnResult(): Unit = { val conf = new SparkConf() conf.setMaster("local[*]") conf.set("spark.driver.host", "localhost") @@ -25,54 +25,104 @@ class TestApply extends java.io.Serializable{ val pub = getClass.getResource("publication.json").getPath val hbm = getClass.getResource("preparedInfo.json").getPath - val mapper:ObjectMapper = new ObjectMapper() + val mapper: ObjectMapper = new ObjectMapper() implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo]) implicit val mapEncoderPubInfo: Encoder[Publication] = Encoders.bean(classOf[Publication]) - - val pub_ds :Dataset[Publication] = spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[Publication])) - val hbm_ds :Dataset[EntityInfo] = spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo])) - + val pub_ds: Dataset[Publication] = + spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[Publication])) + val hbm_ds: Dataset[EntityInfo] = + spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo])) assertEquals(13, pub_ds.count()) - val ds:Dataset[Publication] = SparkApplyHostedByMapToResult.applyHBtoPubs(hbm_ds, pub_ds) + val ds: Dataset[Publication] = 
SparkApplyHostedByMapToResult.applyHBtoPubs(hbm_ds, pub_ds) - assertEquals(13, ds.count) + assertEquals(13, ds.count) - val temp: Dataset[(Publication, Publication)] = pub_ds.joinWith(ds, pub_ds.col("id").equalTo(ds.col("id")), "left") + val temp: Dataset[(Publication, Publication)] = + pub_ds.joinWith(ds, pub_ds.col("id").equalTo(ds.col("id")), "left") assertEquals(13, temp.count()) temp.foreach(t2 => { - val pb : Publication = t2._1 - val pa : Publication = t2._2 + val pb: Publication = t2._1 + val pa: Publication = t2._2 assertEquals(1, pa.getInstance().size()) assertEquals(1, pb.getInstance().size()) assertTrue(t2._1.getId.equals(t2._2.getId)) - if(pb.getId.equals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9")){ - assertTrue(pa.getInstance().get(0).getHostedby.getKey.equals("10|issn___print::e4b6d6d978f67520f6f37679a98c5735")) + if (pb.getId.equals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9")) { + assertTrue( + pa.getInstance() + .get(0) + .getHostedby + .getKey + .equals("10|issn___print::e4b6d6d978f67520f6f37679a98c5735") + ) assertTrue(pa.getInstance().get(0).getHostedby.getValue.equals("Academic Therapy")) assertTrue(pa.getInstance().get(0).getAccessright.getClassid.equals("OPEN")) assertTrue(pa.getInstance().get(0).getAccessright.getClassname.equals("Open Access")) - assertTrue(pa.getInstance().get(0).getAccessright.getOpenAccessRoute.equals(OpenAccessRoute.gold)) + assertTrue( + pa.getInstance().get(0).getAccessright.getOpenAccessRoute.equals(OpenAccessRoute.gold) + ) assertTrue(pa.getBestaccessright.getClassid.equals("OPEN")) assertTrue(pa.getBestaccessright.getClassname.equals("Open Access")) - - assertTrue(pb.getInstance().get(0).getHostedby.getKey.equals("10|openaire____::0b74b6a356bbf23c245f9ae9a748745c")) - assertTrue(pb.getInstance().get(0).getHostedby.getValue.equals("Revistas de investigación Universidad Nacional Mayor de San Marcos")) + assertTrue( + pb.getInstance() + .get(0) + .getHostedby + .getKey + .equals("10|openaire____::0b74b6a356bbf23c245f9ae9a748745c") + ) + assertTrue( + pb.getInstance() + .get(0) + .getHostedby + .getValue + .equals("Revistas de investigación Universidad Nacional Mayor de San Marcos") + ) assertTrue(pb.getInstance().get(0).getAccessright.getClassname.equals("not available")) assertTrue(pb.getInstance().get(0).getAccessright.getClassid.equals("UNKNOWN")) assertTrue(pb.getInstance().get(0).getAccessright.getOpenAccessRoute == null) assertTrue(pb.getBestaccessright.getClassid.equals("UNKNOWN")) assertTrue(pb.getBestaccessright.getClassname.equals("not available")) - }else{ - assertTrue(pa.getInstance().get(0).getHostedby.getKey.equals(pb.getInstance().get(0).getHostedby.getKey)) - assertTrue(pa.getInstance().get(0).getHostedby.getValue.equals(pb.getInstance().get(0).getHostedby.getValue)) - assertTrue(pa.getInstance().get(0).getAccessright.getClassid.equals(pb.getInstance().get(0).getAccessright.getClassid)) - assertTrue(pa.getInstance().get(0).getAccessright.getClassname.equals(pb.getInstance().get(0).getAccessright.getClassname)) - assertTrue(pa.getInstance().get(0).getAccessright.getOpenAccessRoute == pb.getInstance().get(0).getAccessright.getOpenAccessRoute) + } else { + assertTrue( + pa.getInstance() + .get(0) + .getHostedby + .getKey + .equals(pb.getInstance().get(0).getHostedby.getKey) + ) + assertTrue( + pa.getInstance() + .get(0) + .getHostedby + .getValue + .equals(pb.getInstance().get(0).getHostedby.getValue) + ) + assertTrue( + pa.getInstance() + .get(0) + .getAccessright + .getClassid + 
.equals(pb.getInstance().get(0).getAccessright.getClassid) + ) + assertTrue( + pa.getInstance() + .get(0) + .getAccessright + .getClassname + .equals(pb.getInstance().get(0).getAccessright.getClassname) + ) + assertTrue( + pa.getInstance().get(0).getAccessright.getOpenAccessRoute == pb + .getInstance() + .get(0) + .getAccessright + .getOpenAccessRoute + ) } }) @@ -80,9 +130,8 @@ class TestApply extends java.io.Serializable{ spark.close() } - @Test - def testApplyOnDatasource():Unit = { + def testApplyOnDatasource(): Unit = { val conf = new SparkConf() conf.setMaster("local[*]") conf.set("spark.driver.host", "localhost") @@ -95,38 +144,49 @@ class TestApply extends java.io.Serializable{ val dats = getClass.getResource("datasource.json").getPath val hbm = getClass.getResource("preparedInfo2.json").getPath - val mapper:ObjectMapper = new ObjectMapper() + val mapper: ObjectMapper = new ObjectMapper() implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo]) implicit val mapEncoderPubInfo: Encoder[Datasource] = Encoders.bean(classOf[Datasource]) - - val dats_ds :Dataset[Datasource] = spark.read.textFile(dats).map(p => mapper.readValue(p, classOf[Datasource])) - val hbm_ds :Dataset[EntityInfo] = Aggregators.datasourceToSingleId(spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))) - + val dats_ds: Dataset[Datasource] = + spark.read.textFile(dats).map(p => mapper.readValue(p, classOf[Datasource])) + val hbm_ds: Dataset[EntityInfo] = Aggregators.datasourceToSingleId( + spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo])) + ) assertEquals(10, dats_ds.count()) - val ds:Dataset[Datasource] = SparkApplyHostedByMapToDatasource.applyHBtoDats(hbm_ds, dats_ds) + val ds: Dataset[Datasource] = SparkApplyHostedByMapToDatasource.applyHBtoDats(hbm_ds, dats_ds) - assertEquals(10, ds.count) + assertEquals(10, ds.count) - val temp: Dataset[(Datasource, Datasource)] = dats_ds.joinWith(ds, dats_ds.col("id").equalTo(ds.col("id")), "left") + val temp: Dataset[(Datasource, Datasource)] = + dats_ds.joinWith(ds, dats_ds.col("id").equalTo(ds.col("id")), "left") assertEquals(10, temp.count()) temp.foreach(t2 => { - val pb : Datasource = t2._1 - val pa : Datasource = t2._2 + val pb: Datasource = t2._1 + val pa: Datasource = t2._2 assertTrue(t2._1.getId.equals(t2._2.getId)) - if(pb.getId.equals("10|doajarticles::0ab37b7620eb9a73ac95d3ca4320c97d")) { + if (pb.getId.equals("10|doajarticles::0ab37b7620eb9a73ac95d3ca4320c97d")) { assertTrue(pa.getOpenairecompatibility().getClassid.equals("hostedBy")) - assertTrue(pa.getOpenairecompatibility().getClassname.equals("collected from a compatible aggregator")) + assertTrue( + pa.getOpenairecompatibility() + .getClassname + .equals("collected from a compatible aggregator") + ) assertTrue(pb.getOpenairecompatibility().getClassid.equals(ModelConstants.UNKNOWN)) - } else { - assertTrue(pa.getOpenairecompatibility().getClassid.equals(pb.getOpenairecompatibility.getClassid)) - assertTrue(pa.getOpenairecompatibility().getClassname.equals(pb.getOpenairecompatibility.getClassname)) + assertTrue( + pa.getOpenairecompatibility().getClassid.equals(pb.getOpenairecompatibility.getClassid) + ) + assertTrue( + pa.getOpenairecompatibility() + .getClassname + .equals(pb.getOpenairecompatibility.getClassname) + ) } }) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala 
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala
index 7abce547f..5fc29e3b0 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala
@@ -9,9 +9,9 @@ import org.json4s.DefaultFormats
 import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue}
 import org.junit.jupiter.api.Test

-class TestPrepare extends java.io.Serializable{
+class TestPrepare extends java.io.Serializable {

-  def getString(input:HostedByItemType):String = {
+  def getString(input: HostedByItemType): String = {
     import org.json4s.jackson.Serialization.write
     implicit val formats = DefaultFormats
@@ -19,9 +19,8 @@ class TestPrepare extends java.io.Serializable{
     write(input)
   }

-
   @Test
-  def testHostedByMaptoEntityInfo() : Unit = {
+  def testHostedByMaptoEntityInfo(): Unit = {
     val conf = new SparkConf()
     conf.setMaster("local[*]")
     conf.set("spark.driver.host", "localhost")
@@ -33,23 +32,23 @@ class TestPrepare extends java.io.Serializable{
       .getOrCreate()
     val hbm = getClass.getResource("hostedbymap.json").getPath

-
     import spark.implicits._

-    val mapper:ObjectMapper = new ObjectMapper()
+    val mapper: ObjectMapper = new ObjectMapper()

     implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])

-    val ds :Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hbm)).map(toEntityInfo)
+    val ds: Dataset[EntityInfo] =
+      spark.createDataset(spark.sparkContext.textFile(hbm)).map(toEntityInfo)

     ds.foreach(e => println(mapper.writeValueAsString(e)))
-    assertEquals(20, ds.count)
+    assertEquals(20, ds.count)

     spark.close()
   }

   @Test
-  def testPublicationtoEntityInfo() : Unit = {
+  def testPublicationtoEntityInfo(): Unit = {
     val conf = new SparkConf()
     conf.setMaster("local[*]")
     conf.set("spark.driver.host", "localhost")
@@ -61,24 +60,30 @@ class TestPrepare extends java.io.Serializable{
       .getOrCreate()
     val path = getClass.getResource("publication.json").getPath

-    val mapper:ObjectMapper = new ObjectMapper()
+    val mapper: ObjectMapper = new ObjectMapper()

     implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])

-    val ds :Dataset[EntityInfo] = prepareResultInfo(spark, path)
+    val ds: Dataset[EntityInfo] = prepareResultInfo(spark, path)

     ds.foreach(e => println(mapper.writeValueAsString(e)))
-    assertEquals(2, ds.count)
+    assertEquals(2, ds.count)

-    assertEquals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9", ds.filter(ei => ei.getJournalId.equals("1728-5852")).first().getId)
-    assertEquals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9", ds.filter(ei => ei.getJournalId.equals("0001-396X")).first().getId)
+    assertEquals(
+      "50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9",
+      ds.filter(ei => ei.getJournalId.equals("1728-5852")).first().getId
+    )
+    assertEquals(
+      "50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9",
+      ds.filter(ei => ei.getJournalId.equals("0001-396X")).first().getId
+    )

     spark.close()
   }

   @Test
-  def testJoinResHBM (): Unit = {
+  def testJoinResHBM(): Unit = {
     val conf = new SparkConf()
     conf.setMaster("local[*]")
     conf.set("spark.driver.host", "localhost")
@@ -91,18 +96,20 @@ class TestPrepare extends java.io.Serializable{
     val pub = getClass.getResource("iteminfofrompublication").getPath
     val hbm = getClass.getResource("iteminfofromhostedbymap.json").getPath

-    val mapper:ObjectMapper = new ObjectMapper()
+    val mapper: ObjectMapper = new ObjectMapper()

     implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])

-    val pub_ds :Dataset[EntityInfo] = spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
-    val hbm_ds :Dataset[EntityInfo] = spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
+    val pub_ds: Dataset[EntityInfo] =
+      spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
+    val hbm_ds: Dataset[EntityInfo] =
+      spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))

     val ds: Dataset[EntityInfo] = joinResHBM(pub_ds, hbm_ds)

-    assertEquals(1, ds.count)
+    assertEquals(1, ds.count)

-    val ei:EntityInfo = ds.first()
+    val ei: EntityInfo = ds.first()

     assertEquals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9", ei.getId)
     assertEquals("10|issn___print::e4b6d6d978f67520f6f37679a98c5735", ei.getHostedById)
@@ -114,7 +121,7 @@ class TestPrepare extends java.io.Serializable{
   }

   @Test
-  def testJoinResHBM2 (): Unit = {
+  def testJoinResHBM2(): Unit = {
     val conf = new SparkConf()
     conf.setMaster("local[*]")
     conf.set("spark.driver.host", "localhost")
@@ -127,18 +134,20 @@ class TestPrepare extends java.io.Serializable{
     val pub = getClass.getResource("iteminfofrompublication2").getPath
     val hbm = getClass.getResource("iteminfofromhostedbymap2.json").getPath

-    val mapper:ObjectMapper = new ObjectMapper()
+    val mapper: ObjectMapper = new ObjectMapper()

     implicit val mapEncoderDSInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])

-    val pub_ds :Dataset[EntityInfo] = spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
-    val hbm_ds :Dataset[EntityInfo] = spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))
+    val pub_ds: Dataset[EntityInfo] =
+      spark.read.textFile(pub).map(p => mapper.readValue(p, classOf[EntityInfo]))
+    val hbm_ds: Dataset[EntityInfo] =
+      spark.read.textFile(hbm).map(p => mapper.readValue(p, classOf[EntityInfo]))

     val ds: Dataset[EntityInfo] = joinResHBM(pub_ds, hbm_ds)

-    assertEquals(1, ds.count)
+    assertEquals(1, ds.count)

-    val ei:EntityInfo = ds.first()
+    val ei: EntityInfo = ds.first()

     assertEquals("50|4dc99724cf04::ed1ba83e1add6ce292433729acd8b0d9", ei.getId)
     assertEquals("10|issn___print::e4b6d6d978f67520f6f37679a98c5735", ei.getHostedById)
@@ -150,6 +159,4 @@ class TestPrepare extends java.io.Serializable{
     spark.close()
   }

-
-
 }
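A pattern that repeats throughout TestPrepare (and the other hostedbymap suites) is deserializing a file of newline-delimited JSON into a typed Dataset with Jackson plus a bean Encoder. A reduced sketch of the idiom, with a hypothetical Info bean standing in for the project's EntityInfo:

  import com.fasterxml.jackson.databind.ObjectMapper
  import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}

  import scala.beans.BeanProperty

  // Encoders.bean requires a JavaBean-style class: a no-arg constructor plus
  // getters/setters, which @BeanProperty generates.
  class Info extends Serializable {
    @BeanProperty var id: String = _
    @BeanProperty var journalId: String = _
  }

  object JsonDatasetSketch {

    def main(args: Array[String]): Unit = {
      val spark = SparkSession.builder().master("local[*]").appName("json-ds-sketch").getOrCreate()
      implicit val infoEncoder: Encoder[Info] = Encoders.bean(classOf[Info])

      // ObjectMapper is Serializable, so it can be captured by the map closure below.
      val mapper = new ObjectMapper()

      // One JSON object per line, like the *.json fixtures referenced above.
      val ds: Dataset[Info] =
        spark.read.textFile("/tmp/info.json").map(s => mapper.readValue(s, classOf[Info]))

      ds.filter(i => i.getJournalId != null).show()
      spark.stop()
    }
  }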
conf.set("spark.driver.host", "localhost") @@ -33,29 +32,40 @@ class TestPreprocess extends java.io.Serializable{ .getOrCreate() val path = getClass.getResource("datasource.json").getPath - val ds :Dataset[HostedByItemType]= SparkProduceHostedByMap.oaHostedByDataset(spark, path) + val ds: Dataset[HostedByItemType] = SparkProduceHostedByMap.oaHostedByDataset(spark, path) - assertEquals(9, ds.count) + assertEquals(9, ds.count) assertEquals(8, ds.filter(hbi => !hbi.issn.equals("")).count) assertEquals(5, ds.filter(hbi => !hbi.eissn.equals("")).count) assertEquals(0, ds.filter(hbi => !hbi.lissn.equals("")).count) - assertEquals(0, ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count) + assertEquals( + 0, + ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count + ) assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365")).count == 1) assertTrue(ds.filter(hbi => hbi.eissn.equals("2253-900X")).count == 1) - assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.eissn.equals("2253-900X")).count == 1) - assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.officialname.equals("Thémata")).count == 1) - assertTrue(ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.id.equals("10|doajarticles::abbc9265bea9ff62776a1c39785af00c")).count == 1) + assertTrue( + ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.eissn.equals("2253-900X")).count == 1 + ) + assertTrue( + ds.filter(hbi => hbi.issn.equals("0212-8365") && hbi.officialname.equals("Thémata")).count == 1 + ) + assertTrue( + ds.filter(hbi => + hbi.issn.equals("0212-8365") && hbi.id + .equals("10|doajarticles::abbc9265bea9ff62776a1c39785af00c") + ).count == 1 + ) ds.foreach(hbi => assertTrue(hbi.id.startsWith("10|"))) ds.foreach(hbi => println(toHBIString(hbi))) spark.close() } - @Test - def readGold():Unit = { + def readGold(): Unit = { val conf = new SparkConf() conf.setMaster("local[*]") conf.set("spark.driver.host", "localhost") @@ -67,8 +77,7 @@ class TestPreprocess extends java.io.Serializable{ .getOrCreate() val path = getClass.getResource("unibi_transformed.json").getPath - - val ds :Dataset[HostedByItemType]= SparkProduceHostedByMap.goldHostedByDataset(spark, path) + val ds: Dataset[HostedByItemType] = SparkProduceHostedByMap.goldHostedByDataset(spark, path) assertEquals(29, ds.count) @@ -76,9 +85,17 @@ class TestPreprocess extends java.io.Serializable{ assertEquals(0, ds.filter(hbi => !hbi.eissn.equals("")).count) assertEquals(29, ds.filter(hbi => !hbi.lissn.equals("")).count) - assertEquals(0, ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count) + assertEquals( + 0, + ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count + ) - assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).first().officialname.equals("European journal of sustainable development.")) + assertTrue( + ds.filter(hbi => hbi.issn.equals("2239-6101")) + .first() + .officialname + .equals("European journal of sustainable development.") + ) assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).first().lissn.equals("2239-5938")) assertTrue(ds.filter(hbi => hbi.issn.equals("2239-6101")).count == 1) ds.foreach(hbi => assertTrue(hbi.id.equals(Constants.UNIBI))) @@ -88,7 +105,7 @@ class TestPreprocess extends java.io.Serializable{ } @Test - def readDoaj():Unit = { + def readDoaj(): Unit = { val conf = new SparkConf() conf.setMaster("local[*]") conf.set("spark.driver.host", "localhost") @@ -100,7 
+117,7 @@ class TestPreprocess extends java.io.Serializable{ .getOrCreate() val path = getClass.getResource("doaj_transformed.json").getPath - val ds :Dataset[HostedByItemType]= SparkProduceHostedByMap.doajHostedByDataset(spark, path) + val ds: Dataset[HostedByItemType] = SparkProduceHostedByMap.doajHostedByDataset(spark, path) assertEquals(25, ds.count) @@ -108,9 +125,17 @@ class TestPreprocess extends java.io.Serializable{ assertEquals(21, ds.filter(hbi => !hbi.eissn.equals("")).count) assertEquals(0, ds.filter(hbi => !hbi.lissn.equals("")).count) - assertEquals(0, ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count) + assertEquals( + 0, + ds.filter(hbi => hbi.issn.equals("") && hbi.eissn.equals("") && hbi.lissn.equals("")).count + ) - assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).first().officialname.equals("Journal of Space Technology")) + assertTrue( + ds.filter(hbi => hbi.issn.equals("2077-3099")) + .first() + .officialname + .equals("Journal of Space Technology") + ) assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).first().eissn.equals("2411-5029")) assertTrue(ds.filter(hbi => hbi.issn.equals("2077-3099")).count == 1) assertTrue(ds.filter(hbi => hbi.eissn.equals("2077-2955")).first().issn.equals("")) @@ -121,7 +146,7 @@ class TestPreprocess extends java.io.Serializable{ } @Test - def testAggregator() : Unit = { + def testAggregator(): Unit = { val conf = new SparkConf() conf.setMaster("local[*]") @@ -133,22 +158,40 @@ class TestPreprocess extends java.io.Serializable{ .config(conf) .getOrCreate() - - val tmp = SparkProduceHostedByMap.oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath) - .union(SparkProduceHostedByMap.goldHostedByDataset(spark,getClass.getResource("unibi_transformed.json").getPath)) - .union(SparkProduceHostedByMap.doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath)) - .flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])) + val tmp = SparkProduceHostedByMap + .oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath) + .union( + SparkProduceHostedByMap + .goldHostedByDataset(spark, getClass.getResource("unibi_transformed.json").getPath) + ) + .union( + SparkProduceHostedByMap + .doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath) + ) + .flatMap(hbi => SparkProduceHostedByMap.toList(hbi))( + Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]) + ) assertEquals(106, tmp.count) assertEquals(82, tmp.map(i => i._1)(Encoders.STRING).distinct().count) + val ds: Dataset[(String, HostedByItemType)] = Aggregators.explodeHostedByItemType( + SparkProduceHostedByMap + .oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath) + .union( + SparkProduceHostedByMap + .goldHostedByDataset(spark, getClass.getResource("unibi_transformed.json").getPath) + ) + .union( + SparkProduceHostedByMap + .doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath) + ) + .flatMap(hbi => SparkProduceHostedByMap.toList(hbi))( + Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]) + ) + ) - val ds :Dataset[(String, HostedByItemType)] = Aggregators.explodeHostedByItemType(SparkProduceHostedByMap.oaHostedByDataset(spark, getClass.getResource("datasource.json").getPath) - .union(SparkProduceHostedByMap.goldHostedByDataset(spark,getClass.getResource("unibi_transformed.json").getPath)) - 
.union(SparkProduceHostedByMap.doajHostedByDataset(spark, getClass.getResource("doaj_transformed.json").getPath)) - .flatMap(hbi => SparkProduceHostedByMap.toList(hbi))(Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]))) - - assertEquals(82, ds.count) + assertEquals(82, ds.count) assertEquals(13, ds.filter(i => i._2.id.startsWith("10|")).count) @@ -156,14 +199,13 @@ class TestPreprocess extends java.io.Serializable{ assertTrue(ds.filter(i => i._1.equals("2077-3757")).first()._2.openAccess) assertEquals(1, ds.filter(i => i._1.equals("2077-3757")).count) - val hbmap : Dataset[String] = ds.filter(hbi => hbi._2.id.startsWith("10|")).map(SparkProduceHostedByMap.toHostedByMap)(Encoders.STRING) + val hbmap: Dataset[String] = ds + .filter(hbi => hbi._2.id.startsWith("10|")) + .map(SparkProduceHostedByMap.toHostedByMap)(Encoders.STRING) hbmap.foreach(entry => println(entry)) spark.close() } - - - } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala index c22243f94..c8e41743f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala @@ -1,6 +1,5 @@ package eu.dnetlib.dhp.oa.graph.resolution - import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.schema.common.EntityType import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils @@ -19,174 +18,225 @@ import scala.io.Source @TestInstance(Lifecycle.PER_CLASS) class ResolveEntitiesTest extends Serializable { - var workingDir:Path = null + var workingDir: Path = null val FAKE_TITLE = "FAKETITLE" val FAKE_SUBJECT = "FAKESUBJECT" - var sparkSession:Option[SparkSession] = None - + var sparkSession: Option[SparkSession] = None @BeforeAll - def setUp() :Unit = { + def setUp(): Unit = { workingDir = Files.createTempDirectory(getClass.getSimpleName) val conf = new SparkConf() - sparkSession = Some(SparkSession - .builder() - .config(conf) - .appName(getClass.getSimpleName) - .master("local[*]").getOrCreate()) + sparkSession = Some( + SparkSession + .builder() + .config(conf) + .appName(getClass.getSimpleName) + .master("local[*]") + .getOrCreate() + ) populateDatasets(sparkSession.get) generateUpdates(sparkSession.get) } - @AfterAll - def tearDown():Unit = { + def tearDown(): Unit = { FileUtils.deleteDirectory(workingDir.toFile) sparkSession.get.stop() - } - - def generateUpdates(spark:SparkSession):Unit = { + def generateUpdates(spark: SparkSession): Unit = { val template = Source.fromInputStream(this.getClass.getResourceAsStream("updates")).mkString + val pids: List[String] = template.lines + .map { id => + val r = new Result + r.setId(id.toLowerCase.trim) + r.setSubject( + List( + OafMapperUtils.structuredProperty( + FAKE_SUBJECT, + OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"), + null + ) + ).asJava + ) + r.setTitle( + List( + OafMapperUtils.structuredProperty( + FAKE_TITLE, + OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"), + null + ) + ).asJava + ) + r + } + .map { r => + val mapper = new ObjectMapper() - val pids:List[String] = template.lines.map{id => - val r = new Result - r.setId(id.toLowerCase.trim) - r.setSubject(List(OafMapperUtils.structuredProperty(FAKE_SUBJECT, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", 
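All of the hostedbymap suites above share the same local-mode harness: an explicit SparkConf with master local[*], spark.driver.host pinned to localhost (which keeps the driver reachable when the machine's hostname does not resolve cleanly), one session per test, and an explicit close. A condensed sketch of that harness as a reusable loan function; the helper name is hypothetical:

  import org.apache.spark.SparkConf
  import org.apache.spark.sql.SparkSession

  object LocalSparkHarnessSketch {

    // Loan pattern: build the session the way the tests above do, run the body, always close.
    def withLocalSpark[T](appName: String)(body: SparkSession => T): T = {
      val conf = new SparkConf()
      conf.setMaster("local[*]")
      conf.set("spark.driver.host", "localhost")
      val spark = SparkSession.builder().appName(appName).config(conf).getOrCreate()
      try body(spark)
      finally spark.close() // mirrors the explicit spark.close() calls in the tests
    }

    def main(args: Array[String]): Unit =
      withLocalSpark("harness-sketch") { spark =>
        import spark.implicits._
        assert(List(1, 2, 3).toDS().count() == 3)
      }
  }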
"fossiFIgo"), null)).asJava) - r.setTitle(List(OafMapperUtils.structuredProperty(FAKE_TITLE, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", "fossiFIgo"), null)).asJava) - r - }.map{r => - val mapper = new ObjectMapper() + mapper.writeValueAsString(r) + } + .toList - mapper.writeValueAsString(r)}.toList - - - val sc =spark.sparkContext + val sc = spark.sparkContext println(sc.parallelize(pids).count()) - spark.createDataset(sc.parallelize(pids))(Encoders.STRING).write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$workingDir/updates") - - - - + spark + .createDataset(sc.parallelize(pids))(Encoders.STRING) + .write + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .text(s"$workingDir/updates") import spark.implicits._ implicit val resEncoder: Encoder[Result] = Encoders.bean(classOf[Result]) - val ds = spark.read.text(s"$workingDir/updates").as[String].map{s => val mapper = new ObjectMapper() - mapper.readValue(s, classOf[Result])}.collect() - - - + val ds = spark.read + .text(s"$workingDir/updates") + .as[String] + .map { s => + val mapper = new ObjectMapper() + mapper.readValue(s, classOf[Result]) + } + .collect() assertEquals(4, ds.length) - ds.foreach{r => assertNotNull(r.getSubject)} - ds.foreach{r => assertEquals(1,r.getSubject.size())} - ds.foreach{r => assertNotNull(r.getTitle)} - ds.foreach{r => assertEquals(1,r.getTitle.size())} + ds.foreach { r => assertNotNull(r.getSubject) } + ds.foreach { r => assertEquals(1, r.getSubject.size()) } + ds.foreach { r => assertNotNull(r.getTitle) } + ds.foreach { r => assertEquals(1, r.getTitle.size()) } - - - ds.flatMap(r => r.getTitle.asScala.map(t => t.getValue)).foreach(t => assertEquals(FAKE_TITLE,t)) - ds.flatMap(r => r.getSubject.asScala.map(t => t.getValue)).foreach(t => assertEquals(FAKE_SUBJECT,t)) + ds.flatMap(r => r.getTitle.asScala.map(t => t.getValue)) + .foreach(t => assertEquals(FAKE_TITLE, t)) + ds.flatMap(r => r.getSubject.asScala.map(t => t.getValue)) + .foreach(t => assertEquals(FAKE_SUBJECT, t)) println("generated Updates") } - - def populateDatasets(spark:SparkSession):Unit = { + def populateDatasets(spark: SparkSession): Unit = { import spark.implicits._ - val entities =SparkResolveEntities.entities + val entities = SparkResolveEntities.entities - entities.foreach{ - e => - val template = Source.fromInputStream(this.getClass.getResourceAsStream(s"$e")).mkString - spark.createDataset(spark.sparkContext.parallelize(template.lines.toList)).as[String].write.option("compression", "gzip").text(s"$workingDir/graph/$e") - println(s"Created Dataset $e") + entities.foreach { e => + val template = Source.fromInputStream(this.getClass.getResourceAsStream(s"$e")).mkString + spark + .createDataset(spark.sparkContext.parallelize(template.lines.toList)) + .as[String] + .write + .option("compression", "gzip") + .text(s"$workingDir/graph/$e") + println(s"Created Dataset $e") } - SparkResolveRelation.extractPidResolvedTableFromJsonRDD(spark, s"$workingDir/graph", s"$workingDir/work") + SparkResolveRelation.extractPidResolvedTableFromJsonRDD( + spark, + s"$workingDir/graph", + s"$workingDir/work" + ) } - @Test - def testResolution():Unit = { - val spark:SparkSession = sparkSession.get + def testResolution(): Unit = { + val spark: SparkSession = sparkSession.get implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) - SparkResolveEntities.resolveEntities(spark,s"$workingDir/work", s"$workingDir/updates" ) + SparkResolveEntities.resolveEntities(spark, s"$workingDir/work", 
s"$workingDir/updates") val ds = spark.read.load(s"$workingDir/work/resolvedEntities").as[Result] assertEquals(3, ds.count()) - ds.collect().foreach{ - r => + ds.collect().foreach { r => assertTrue(r.getId.startsWith("50")) } } - - - - private def structuredPContainsValue(l:java.util.List[StructuredProperty], exptectedValue:String):Boolean = { - l.asScala.exists(p =>p.getValue!= null && p.getValue.equalsIgnoreCase(exptectedValue)) + private def structuredPContainsValue( + l: java.util.List[StructuredProperty], + exptectedValue: String + ): Boolean = { + l.asScala.exists(p => p.getValue != null && p.getValue.equalsIgnoreCase(exptectedValue)) } @Test - def testUpdate():Unit = { - val spark:SparkSession = sparkSession.get + def testUpdate(): Unit = { + val spark: SparkSession = sparkSession.get import spark.implicits._ implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) val m = new ObjectMapper() - SparkResolveEntities.resolveEntities(spark,s"$workingDir/work", s"$workingDir/updates" ) - SparkResolveEntities.generateResolvedEntities(spark,s"$workingDir/work",s"$workingDir/graph", s"$workingDir/target" ) - - - - val pubDS:Dataset[Result] = spark.read.text(s"$workingDir/target/publication").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.publication)) - val t = pubDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count() + SparkResolveEntities.resolveEntities(spark, s"$workingDir/work", s"$workingDir/updates") + SparkResolveEntities.generateResolvedEntities( + spark, + s"$workingDir/work", + s"$workingDir/graph", + s"$workingDir/target" + ) + val pubDS: Dataset[Result] = spark.read + .text(s"$workingDir/target/publication") + .as[String] + .map(s => SparkResolveEntities.deserializeObject(s, EntityType.publication)) + val t = pubDS + .filter(p => p.getTitle != null && p.getSubject != null) + .filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))) + .count() var ct = pubDS.count() - var et = pubDS.filter(p => p.getTitle!= null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)).count() + var et = pubDS + .filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)) + .count() assertEquals(ct, et) - - - val datDS:Dataset[Result] = spark.read.text(s"$workingDir/target/dataset").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.dataset)) - val td = datDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count() + val datDS: Dataset[Result] = spark.read + .text(s"$workingDir/target/dataset") + .as[String] + .map(s => SparkResolveEntities.deserializeObject(s, EntityType.dataset)) + val td = datDS + .filter(p => p.getTitle != null && p.getSubject != null) + .filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))) + .count() ct = datDS.count() - et = datDS.filter(p => p.getTitle!= null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)).count() + et = datDS + .filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)) + .count() assertEquals(ct, et) - - val softDS:Dataset[Result] = spark.read.text(s"$workingDir/target/software").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.software)) - val ts = softDS.filter(p => 
p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count() + val softDS: Dataset[Result] = spark.read + .text(s"$workingDir/target/software") + .as[String] + .map(s => SparkResolveEntities.deserializeObject(s, EntityType.software)) + val ts = softDS + .filter(p => p.getTitle != null && p.getSubject != null) + .filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))) + .count() ct = softDS.count() - et = softDS.filter(p => p.getTitle!= null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)).count() + et = softDS + .filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)) + .count() assertEquals(ct, et) - - val orpDS:Dataset[Result] = spark.read.text(s"$workingDir/target/otherresearchproduct").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.otherresearchproduct)) - val to = orpDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count() - + val orpDS: Dataset[Result] = spark.read + .text(s"$workingDir/target/otherresearchproduct") + .as[String] + .map(s => SparkResolveEntities.deserializeObject(s, EntityType.otherresearchproduct)) + val to = orpDS + .filter(p => p.getTitle != null && p.getSubject != null) + .filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))) + .count() ct = orpDS.count() - et = orpDS.filter(p => p.getTitle!= null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)).count() + et = orpDS + .filter(p => p.getTitle != null && p.getTitle.asScala.forall(t => t.getValue != null && t.getValue.nonEmpty)) + .count() assertEquals(ct, et) - - - - assertEquals(0, t) assertEquals(2, td) assertEquals(1, ts) @@ -194,40 +244,35 @@ class ResolveEntitiesTest extends Serializable { } - - - - @Test - def testMerge():Unit = { + def testMerge(): Unit = { val r = new Result - r.setSubject(List(OafMapperUtils.structuredProperty(FAKE_SUBJECT, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", "fossiFIgo"), null)).asJava) + r.setSubject( + List( + OafMapperUtils.structuredProperty( + FAKE_SUBJECT, + OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"), + null + ) + ).asJava + ) val mapper = new ObjectMapper() - val p = mapper.readValue(Source.fromInputStream(this.getClass.getResourceAsStream(s"publication")).mkString.lines.next(), classOf[Publication]) - + val p = mapper.readValue( + Source + .fromInputStream(this.getClass.getResourceAsStream(s"publication")) + .mkString + .lines + .next(), + classOf[Publication] + ) r.mergeFrom(p) - println(mapper.writeValueAsString(r)) - - - - - - - } - - - - - - - } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/RetrieveDataciteDeltaTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/RetrieveDataciteDeltaTest.scala index c277b0aa1..80ea9d59c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/RetrieveDataciteDeltaTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/RetrieveDataciteDeltaTest.scala @@ -1,26 +1,20 @@ package eu.dnetlib.dhp.sx.graph + import org.junit.jupiter.api.Test import java.text.SimpleDateFormat - - class RetrieveDataciteDeltaTest { @Test def testParsingDate(): Unit = { - val inputDate = "2021-12-02T11:17:36+0000" val t = new 
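ResolveEntitiesTest also shows the per-class fixture lifecycle: with Lifecycle.PER_CLASS, JUnit 5 allows @BeforeAll/@AfterAll on instance methods, so the temp working directory and the shared SparkSession are built once for the whole suite and torn down once. A skeletal sketch of that shape, with the test body reduced to a placeholder:

  import java.nio.file.{Files, Path}

  import org.apache.commons.io.FileUtils
  import org.apache.spark.sql.SparkSession
  import org.junit.jupiter.api.TestInstance.Lifecycle
  import org.junit.jupiter.api.{AfterAll, BeforeAll, Test, TestInstance}

  @TestInstance(Lifecycle.PER_CLASS)
  class PerClassFixtureSketch {

    var workingDir: Path = null
    var spark: SparkSession = null

    @BeforeAll
    def setUp(): Unit = {
      workingDir = Files.createTempDirectory(getClass.getSimpleName)
      spark = SparkSession.builder().master("local[*]").appName(getClass.getSimpleName).getOrCreate()
    }

    @Test
    def testFixtureIsAvailable(): Unit =
      assert(Files.exists(workingDir))

    @AfterAll
    def tearDown(): Unit = {
      FileUtils.deleteDirectory(workingDir.toFile) // same cleanup as the test above
      spark.stop()
    }
  }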
SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ").parse(inputDate).getTime - println(t) - - } - } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala index 04b1f9ecd..e92f36896 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala @@ -18,37 +18,40 @@ import scala.collection.JavaConverters._ import scala.io.Source @ExtendWith(Array(classOf[MockitoExtension])) -class ScholixGraphTest extends AbstractVocabularyTest{ - +class ScholixGraphTest extends AbstractVocabularyTest { val mapper: ObjectMapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT) - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES,false) + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) @BeforeEach - def setUp() :Unit = { + def setUp(): Unit = { super.setUpVocabulary() } - @Test - def testExtractPids():Unit = { + def testExtractPids(): Unit = { - val input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/scholix/result.json")).mkString - val res =SparkResolveRelation.extractPidsFromRecord(input) + val input = Source + .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/scholix/result.json")) + .mkString + val res = SparkResolveRelation.extractPidsFromRecord(input) assertNotNull(res) - assertEquals(1,res._2.size) + assertEquals(1, res._2.size) } @Test - def testOAFToSummary():Unit= { - val inputRelations = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/oaf_to_summary")).mkString + def testOAFToSummary(): Unit = { + val inputRelations = Source + .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/oaf_to_summary")) + .mkString val items = inputRelations.lines.toList assertNotNull(items) - items.foreach(i =>assertTrue(i.nonEmpty)) - val result = items.map(r => mapper.readValue(r, classOf[Result])).map(i => ScholixUtils.resultToSummary(i)) + items.foreach(i => assertTrue(i.nonEmpty)) + val result = + items.map(r => mapper.readValue(r, classOf[Result])).map(i => ScholixUtils.resultToSummary(i)) assertNotNull(result) @@ -59,37 +62,41 @@ class ScholixGraphTest extends AbstractVocabularyTest{ } - - @Test - def testScholixMergeOnSource():Unit = { - val inputRelations = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/merge_result_scholix")).mkString - val result:List[(Relation,ScholixSummary)] =inputRelations.lines.sliding(2).map(s => (s.head, s(1))).map(p => (mapper.readValue(p._1, classOf[Relation]),mapper.readValue(p._2, classOf[ScholixSummary]) )).toList + def testScholixMergeOnSource(): Unit = { + val inputRelations = Source + .fromInputStream( + getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/merge_result_scholix") + ) + .mkString + val result: List[(Relation, ScholixSummary)] = inputRelations.lines + .sliding(2) + .map(s => (s.head, s(1))) + .map(p => (mapper.readValue(p._1, classOf[Relation]), mapper.readValue(p._2, classOf[ScholixSummary]))) + .toList assertNotNull(result) assertTrue(result.nonEmpty) result.foreach(r => assertEquals(r._1.getSource, r._2.getId)) - val scholix:List[Scholix] = result.map(r => ScholixUtils.scholixFromSource(r._1, r._2)) + val scholix: List[Scholix] = result.map(r => 
diff --git a/pom.xml b/pom.xml
index ed7b8a2ca..b68671aec 100644
--- a/pom.xml
+++ b/pom.xml
@@ -620,6 +620,18 @@
                         </configuration>
                     </plugin>
+                    <plugin>
+                        <groupId>org.antipathy</groupId>
+                        <artifactId>mvn-scalafmt_2.11</artifactId>
+                        <version>1.0.1640073709.733712b</version>
+                        <dependencies>
+                            <dependency>
+                                <groupId>eu.dnetlib.dhp</groupId>
+                                <artifactId>dhp-code-style</artifactId>
+                                <version>${project.version}</version>
+                            </dependency>
+                        </dependencies>
+                    </plugin>
                 </plugins>
             </pluginManagement>
         </build>
@@ -665,6 +677,33 @@
                     </execution>
                 </executions>
             </plugin>
+            <plugin>
+                <groupId>org.antipathy</groupId>
+                <artifactId>mvn-scalafmt_2.11</artifactId>
+                <configuration>
+                    <configLocation>dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf</configLocation>
+                    <skipTestSources>false</skipTestSources>
+                    <skipSources>false</skipSources>
+                    <sourceDirectories>
+                        <param>${project.basedir}/src/main/scala</param>
+                    </sourceDirectories>
+                    <testSourceDirectories>
+                        <param>${project.basedir}/src/test/scala</param>
+                    </testSourceDirectories>
+                    <validateOnly>false</validateOnly>
+                    <onlyChangedFiles>false</onlyChangedFiles>
+                    <branch>: git rev-parse --abbrev-ref HEAD</branch>
+                    <useSpecifiedRepositories>false</useSpecifiedRepositories>
+                </configuration>
+                <executions>
+                    <execution>
+                        <phase>validate</phase>
+                        <goals>
+                            <goal>format</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
             <plugin>
                 <groupId>org.apache.maven.plugins</groupId>
                 <artifactId>maven-release-plugin</artifactId>
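With the mvn-scalafmt format goal bound to the validate phase and onlyChangedFiles disabled, the rewrapping seen throughout this patch is reapplied on every build rather than once by hand; the plugin reads the scalafmt.conf shipped in the dhp-code-style artifact. An illustrative before/after pair for the dominant change pattern, long call chains wrapped at the configured column limit; the names are hypothetical and the "after" shape is what the formatter is expected to produce, not captured output:

  import org.apache.spark.sql.{Encoders, SparkSession}

  object FormattingEffectSketch {

    // Before: the whole pipeline on one dense line.
    def before(spark: SparkSession, path: String): Long =
      spark.read.textFile(path).filter(s => s.nonEmpty).map(s => s.trim)(Encoders.STRING).count()

    // After: one chained call per line once the statement exceeds the limit.
    def after(spark: SparkSession, path: String): Long =
      spark.read
        .textFile(path)
        .filter(s => s.nonEmpty)
        .map(s => s.trim)(Encoders.STRING)
        .count()
  }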