diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/application/SparkScalaApplication.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala similarity index 100% rename from dhp-common/src/main/java/eu/dnetlib/dhp/application/SparkScalaApplication.scala rename to dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala deleted file mode 100644 index 7a87861db7..0000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala +++ /dev/null @@ -1,69 +0,0 @@ -package eu.dnetlib.dhp.actionmanager.scholix - -import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result} -import org.apache.spark.SparkConf -import org.apache.spark.sql._ -import org.slf4j.{Logger, LoggerFactory} - -import scala.io.Source - -object SparkCreateActionset { - - def main(args: Array[String]): Unit = { - val log: Logger = LoggerFactory.getLogger(getClass) - val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/actionset/generate_actionset.json")).mkString) - parser.parseArgument(args) - - - val spark: SparkSession = - SparkSession - .builder() - .config(conf) - .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() - - - val sourcePath = parser.get("sourcePath") - log.info(s"sourcePath -> $sourcePath") - - val targetPath = parser.get("targetPath") - log.info(s"targetPath -> $targetPath") - - val workingDirFolder = parser.get("workingDirFolder") - log.info(s"workingDirFolder -> $workingDirFolder") - - implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf] - implicit val resultEncoders: Encoder[Result] = Encoders.kryo[Result] - implicit val relationEncoders: Encoder[Relation] = Encoders.kryo[Relation] - - import spark.implicits._ - - val relation = spark.read.load(s"$sourcePath/relation").as[Relation] - - relation.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge")) - .flatMap(r => List(r.getSource, r.getTarget)).distinct().write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/id_relation") - - - val idRelation = spark.read.load(s"$workingDirFolder/id_relation").as[String] - - log.info("extract source and target Identifier involved in relations") - - - log.info("save relation filtered") - - relation.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge")) - .write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/actionSetOaf") - - log.info("saving entities") - - val entities: Dataset[(String, Result)] = spark.read.load(s"$sourcePath/entities/*").as[Result].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING, resultEncoders)) - - entities - .joinWith(idRelation, entities("_1").equalTo(idRelation("value"))) - .map(p => p._1._2) - .write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf") - } - -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala deleted file mode 100644 index 1df7ea3fb1..0000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala +++ /dev/null @@ -1,86 +0,0 @@ -package eu.dnetlib.dhp.actionmanager.scholix - -import com.fasterxml.jackson.databind.ObjectMapper -import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.action.AtomicAction -import eu.dnetlib.dhp.schema.oaf.{Oaf, Dataset => OafDataset,Publication, Software, OtherResearchProduct, Relation} -import org.apache.hadoop.io.Text -import org.apache.hadoop.io.compress.GzipCodec -import org.apache.hadoop.mapred.SequenceFileOutputFormat -import org.apache.spark.SparkConf -import org.apache.spark.sql.{Encoder, Encoders, SparkSession} -import org.slf4j.{Logger, LoggerFactory} - -import scala.io.Source - -object SparkSaveActionSet { - - - def toActionSet(item: Oaf): (String, String) = { - val mapper = new ObjectMapper() - - item match { - case dataset: OafDataset => - val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset] - a.setClazz(classOf[OafDataset]) - a.setPayload(dataset) - (dataset.getClass.getCanonicalName, mapper.writeValueAsString(a)) - case publication: Publication => - val a: AtomicAction[Publication] = new AtomicAction[Publication] - a.setClazz(classOf[Publication]) - a.setPayload(publication) - (publication.getClass.getCanonicalName, mapper.writeValueAsString(a)) - case software: Software => - val a: AtomicAction[Software] = new AtomicAction[Software] - a.setClazz(classOf[Software]) - a.setPayload(software) - (software.getClass.getCanonicalName, mapper.writeValueAsString(a)) - case orp: OtherResearchProduct => - val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct] - a.setClazz(classOf[OtherResearchProduct]) - a.setPayload(orp) - (orp.getClass.getCanonicalName, mapper.writeValueAsString(a)) - - case relation: Relation => - val a: AtomicAction[Relation] = new AtomicAction[Relation] - a.setClazz(classOf[Relation]) - a.setPayload(relation) - (relation.getClass.getCanonicalName, mapper.writeValueAsString(a)) - case _ => - null - } - - } - - def main(args: Array[String]): Unit = { - val log: Logger = LoggerFactory.getLogger(getClass) - val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/actionset/save_actionset.json")).mkString) - parser.parseArgument(args) - - - val spark: SparkSession = - SparkSession - .builder() - .config(conf) - .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() - - - val sourcePath = parser.get("sourcePath") - log.info(s"sourcePath -> $sourcePath") - - val targetPath = parser.get("targetPath") - log.info(s"targetPath -> $targetPath") - - implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf] - implicit val tEncoder: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING) - - spark.read.load(sourcePath).as[Oaf] - .map(o => toActionSet(o)) - .filter(o => o != null) - .rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec]) - - } - -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/AbstractRestClient.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/AbstractRestClient.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/AbstractRestClient.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/AbstractRestClient.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala index 0685a04a3d..6c5dc8cce1 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala @@ -1,8 +1,8 @@ package eu.dnetlib.dhp.datacite import eu.dnetlib.dhp.schema.common.ModelConstants -import eu.dnetlib.dhp.schema.oaf.{DataInfo, KeyValue} import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils +import eu.dnetlib.dhp.schema.oaf.{DataInfo, KeyValue} import java.io.InputStream import java.time.format.DateTimeFormatter diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala similarity index 93% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala index 6b4deef0ad..a662cf99d3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala @@ -6,7 +6,7 @@ import eu.dnetlib.dhp.datacite.DataciteModelConstants._ import eu.dnetlib.dhp.schema.action.AtomicAction import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils} -import eu.dnetlib.dhp.schema.oaf.{AccessRight, Author, DataInfo, Instance, KeyValue, Oaf, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset} +import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _} import eu.dnetlib.dhp.utils.DHPUtils import org.apache.commons.lang3.StringUtils import org.json4s.DefaultFormats @@ -29,6 +29,7 @@ object DataciteToOAFTransformation { /** * This method should skip record if json contains invalid text * defined in gile datacite_filter + * * @param json * @return True if the record should be skipped */ @@ -107,9 +108,9 @@ object DataciteToOAFTransformation { d } - def fix_thai_date(input:String, format:String) :String = { + def fix_thai_date(input: String, format: String): String = { try { - val a_date = LocalDate.parse(input,DateTimeFormatter.ofPattern(format)) + val a_date = LocalDate.parse(input, DateTimeFormatter.ofPattern(format)) val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth) LocalDate.from(d).toString } catch { @@ -236,7 +237,7 @@ object DataciteToOAFTransformation { val p = match_pattern.get._2 val grantId = m.matcher(awardUri).replaceAll("$2") val targetId = s"$p${DHPUtils.md5(grantId)}" - List( generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo) ) + List(generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo)) } else List() @@ -335,15 +336,15 @@ object DataciteToOAFTransformation { .map(d => d.get) if (a_date.isDefined) { - if(doi.startsWith("10.14457")) - result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get,"[yyyy-MM-dd]"), null)) + if (doi.startsWith("10.14457")) + result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null)) else result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null)) } if (i_date.isDefined && i_date.get.isDefined) { - if(doi.startsWith("10.14457")) { - result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null)) - result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null)) + if (doi.startsWith("10.14457")) { + result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)) + result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)) } else { result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null)) @@ -351,9 +352,9 @@ object DataciteToOAFTransformation { } } else if (publication_year != null) { - if(doi.startsWith("10.14457")) { - result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null)) - result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null)) + if (doi.startsWith("10.14457")) { + result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)) + result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)) } else { result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null)) @@ -457,7 +458,7 @@ object DataciteToOAFTransformation { JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier } yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType) - relations = relations ::: generateRelations(rels,result.getId, if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null) + relations = relations ::: generateRelations(rels, result.getId, if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null) } if (relations != null && relations.nonEmpty) { List(result) ::: relations @@ -466,7 +467,7 @@ object DataciteToOAFTransformation { List(result) } - private def generateRelations(rels: List[RelatedIdentifierType], id:String, date:String):List[Relation] = { + private def generateRelations(rels: List[RelatedIdentifierType], id: String, date: String): List[Relation] = { rels .filter(r => subRelTypeMapping.contains(r.relationType) && ( @@ -484,12 +485,12 @@ object DataciteToOAFTransformation { rel.setSubRelType(subRelType) rel.setRelClass(r.relationType) - val dateProps:KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date) + val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date) rel.setProperties(List(dateProps).asJava) rel.setSource(id) - rel.setTarget(DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier,r.relatedIdentifierType)) + rel.setTarget(DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)) rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava) rel.getCollectedfrom.asScala.map(c => c.getValue).toList rel @@ -504,4 +505,4 @@ object DataciteToOAFTransformation { } -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/ImportDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/ImportDatacite.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/ImportDatacite.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/ImportDatacite.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala similarity index 99% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala index 70dcc0184c..853b248628 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala @@ -7,6 +7,7 @@ import org.json4s.DefaultFormats import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.jackson.JsonMethods.{compact, parse, render} import collection.JavaConverters._ + object BioDBToOAF { case class EBILinkItem(id: Long, links: String) {} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala similarity index 73% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala index 8ae8285e3f..fcceacd445 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala @@ -1,9 +1,9 @@ package eu.dnetlib.dhp.sx.bio import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.Oaf -import BioDBToOAF.ScholixResolved import eu.dnetlib.dhp.collection.CollectionUtils +import eu.dnetlib.dhp.schema.oaf.Oaf +import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} @@ -36,13 +36,13 @@ object SparkTransformBioDatabaseToOAF { import spark.implicits._ database.toUpperCase() match { case "UNIPROT" => - spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) + spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) case "PDB" => - spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) + spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) case "SCHOLIX" => - spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) + spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) case "CROSSREF_LINKS" => - spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) + spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala similarity index 98% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala index 17d21f19c6..660a26a6c8 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.sx.bio.ebi import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.schema.oaf.Result -import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf} +import eu.dnetlib.dhp.sx.bio.pubmed._ import eu.dnetlib.dhp.utils.ISLookupClientFactory import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala similarity index 98% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala index eab6b1dc6a..18e39387fe 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala @@ -1,9 +1,8 @@ package eu.dnetlib.dhp.sx.bio.ebi import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal} import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem -import eu.dnetlib.dhp.sx.bio.pubmed.PMJournal +import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal} import org.apache.commons.io.IOUtils import org.apache.http.client.config.RequestConfig import org.apache.http.client.methods.HttpGet diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala similarity index 93% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala index 8da617ca07..12af4824b4 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala @@ -1,11 +1,10 @@ package eu.dnetlib.dhp.sx.bio.ebi import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.collection.CollectionUtils import eu.dnetlib.dhp.schema.oaf.Oaf import eu.dnetlib.dhp.sx.bio.BioDBToOAF import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem -import BioDBToOAF.EBILinkItem -import eu.dnetlib.dhp.collection.CollectionUtils import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.sql._ @@ -38,7 +37,7 @@ object SparkEBILinksToOaf { ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links)) .filter(p => BioDBToOAF.EBITargetLinksFilter(p)) .flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)) - .flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null) + .flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null) .write.mode(SaveMode.Overwrite).save(targetPath) } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala similarity index 96% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala index ecef322020..4a93a7cb4f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala @@ -4,7 +4,7 @@ import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType} import eu.dnetlib.dhp.schema.oaf._ -import scala.collection.JavaConverters._ +import collection.JavaConverters._ import java.util.regex.Pattern @@ -22,10 +22,10 @@ object PubMedToOaf { val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central") - /** * Cleaning the DOI Applying regex in order to * remove doi starting with URL + * * @param doi input DOI * @return cleaned DOI */ @@ -49,7 +49,7 @@ object PubMedToOaf { * starting from OAF instanceType value * * @param cobjQualifier OAF instance type - * @param vocabularies All dnet vocabularies + * @param vocabularies All dnet vocabularies * @return the correct instance */ def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = { @@ -65,7 +65,7 @@ object PubMedToOaf { } /** - * Mapping the Pubmedjournal info into the OAF Journale + * Mapping the Pubmedjournal info into the OAF Journale * * @param j the pubmedJournal * @return the OAF Journal @@ -91,9 +91,8 @@ object PubMedToOaf { * Find vocabulary term into synonyms and term in the vocabulary * * @param vocabularyName the input vocabulary name - * @param vocabularies all the vocabularies - * @param term the term to find - * + * @param vocabularies all the vocabularies + * @param term the term to find * @return the cleaned term value */ def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = { @@ -104,10 +103,9 @@ object PubMedToOaf { /** - * Map the Pubmed Article into the OAF instance + * Map the Pubmed Article into the OAF instance * - * - * @param article the pubmed articles + * @param article the pubmed articles * @param vocabularies the vocabularies * @return The OAF instance if the mapping did not fail */ @@ -185,7 +183,6 @@ object PubMedToOaf { //-------------------------------------------------------------------------------------- - // RESULT MAPPING //-------------------------------------------------------------------------------------- result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)) diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateOpenCitationsASTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateOpenCitationsASTest.java index 5a04dcefe8..5153c412fd 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateOpenCitationsASTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateOpenCitationsASTest.java @@ -89,13 +89,13 @@ public class CreateOpenCitationsASTest { "-inputPath", inputPath, "-outputPath", - workingDir.toString() + "/actionSet" + workingDir.toString() + "/actionSet1" }); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); JavaRDD tmp = sc - .sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) + .sequenceFile(workingDir.toString() + "/actionSet1", Text.class, Text.class) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(aa -> ((Relation) aa.getPayload())); @@ -121,13 +121,13 @@ public class CreateOpenCitationsASTest { "-inputPath", inputPath, "-outputPath", - workingDir.toString() + "/actionSet" + workingDir.toString() + "/actionSet2" }); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); JavaRDD tmp = sc - .sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) + .sequenceFile(workingDir.toString() + "/actionSet2", Text.class, Text.class) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(aa -> ((Relation) aa.getPayload())); @@ -153,13 +153,13 @@ public class CreateOpenCitationsASTest { "-inputPath", inputPath, "-outputPath", - workingDir.toString() + "/actionSet" + workingDir.toString() + "/actionSet3" }); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); JavaRDD tmp = sc - .sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) + .sequenceFile(workingDir.toString() + "/actionSet3", Text.class, Text.class) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(aa -> ((Relation) aa.getPayload())); @@ -186,13 +186,13 @@ public class CreateOpenCitationsASTest { "-inputPath", inputPath, "-outputPath", - workingDir.toString() + "/actionSet" + workingDir.toString() + "/actionSet4" }); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); JavaRDD tmp = sc - .sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) + .sequenceFile(workingDir.toString() + "/actionSet4", Text.class, Text.class) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(aa -> ((Relation) aa.getPayload())); @@ -226,13 +226,13 @@ public class CreateOpenCitationsASTest { "-inputPath", inputPath, "-outputPath", - workingDir.toString() + "/actionSet" + workingDir.toString() + "/actionSet5" }); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); JavaRDD tmp = sc - .sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) + .sequenceFile(workingDir.toString() + "/actionSet5", Text.class, Text.class) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(aa -> ((Relation) aa.getPayload())); @@ -261,13 +261,13 @@ public class CreateOpenCitationsASTest { "-inputPath", inputPath, "-outputPath", - workingDir.toString() + "/actionSet" + workingDir.toString() + "/actionSet6" }); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); JavaRDD tmp = sc - .sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) + .sequenceFile(workingDir.toString() + "/actionSet6", Text.class, Text.class) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(aa -> ((Relation) aa.getPayload())); @@ -306,13 +306,13 @@ public class CreateOpenCitationsASTest { "-inputPath", inputPath, "-outputPath", - workingDir.toString() + "/actionSet" + workingDir.toString() + "/actionSet7" }); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); JavaRDD tmp = sc - .sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) + .sequenceFile(workingDir.toString() + "/actionSet7", Text.class, Text.class) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(aa -> ((Relation) aa.getPayload())); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala similarity index 99% rename from dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala rename to dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala index 8c8b4b5bfe..5bb6ba67d1 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala @@ -8,6 +8,7 @@ import org.apache.commons.io.FileUtils import org.apache.spark.SparkConf import org.apache.spark.sql.functions.{col, count} import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} +import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.extension.ExtendWith import org.junit.jupiter.api.{AfterEach, BeforeEach, Test} import org.mockito.junit.jupiter.MockitoExtension @@ -17,7 +18,6 @@ import java.nio.file.{Files, Path} import java.text.SimpleDateFormat import java.util.Locale import scala.io.Source -import org.junit.jupiter.api.Assertions._ @ExtendWith(Array(classOf[MockitoExtension])) class DataciteToOAFTest extends AbstractVocabularyTest{ diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala rename to dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala similarity index 97% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala index efd5d24976..3822f40b5e 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala @@ -1,21 +1,19 @@ package eu.dnetlib.doiboost -import java.time.LocalDate -import java.time.format.DateTimeFormatter - +import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.schema.action.AtomicAction -import eu.dnetlib.dhp.schema.oaf.{AccessRight, DataInfo, Dataset, Field, Instance, KeyValue, Oaf, OpenAccessRoute, Organization, Publication, Qualifier, Relation, Result, StructuredProperty} +import eu.dnetlib.dhp.schema.common.ModelConstants +import eu.dnetlib.dhp.schema.oaf._ +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils import eu.dnetlib.dhp.utils.DHPUtils import org.apache.commons.lang3.StringUtils -import com.fasterxml.jackson.databind.ObjectMapper -import eu.dnetlib.dhp.schema.common.ModelConstants -import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils -import eu.dnetlib.doiboost.DoiBoostMappingUtil.{getClosedAccessQualifier, getEmbargoedAccessQualifier, getUnknownQualifier} import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse import org.slf4j.{Logger, LoggerFactory} +import java.time.LocalDate +import java.time.format.DateTimeFormatter import scala.collection.JavaConverters._ diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala similarity index 63% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala index 3bfca0859b..f13900abe4 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala @@ -8,11 +8,12 @@ import org.apache.hadoop.io.Text import org.apache.hadoop.io.compress.GzipCodec import org.apache.hadoop.mapred.SequenceFileOutputFormat import org.apache.spark.SparkConf -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} import org.slf4j.{Logger, LoggerFactory} object SparkGenerateDOIBoostActionSet { val logger: Logger = LoggerFactory.getLogger(getClass) + def main(args: Array[String]): Unit = { val conf: SparkConf = new SparkConf() @@ -33,53 +34,41 @@ object SparkGenerateDOIBoostActionSet { implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] = Encoders.kryo[AtomicAction[OafDataset]] - val dbPublicationPath = parser.get("dbPublicationPath") - val dbDatasetPath = parser.get("dbDatasetPath") - val crossRefRelation = parser.get("crossRefRelation") - val dbaffiliationRelationPath = parser.get("dbaffiliationRelationPath") - val dbOrganizationPath = parser.get("dbOrganizationPath") - val sequenceFilePath = parser.get("sFilePath") + val dbPublicationPath = parser.get("dbPublicationPath") + val dbDatasetPath = parser.get("dbDatasetPath") + val crossRefRelation = parser.get("crossRefRelation") + val dbaffiliationRelationPath = parser.get("dbaffiliationRelationPath") + val dbOrganizationPath = parser.get("dbOrganizationPath") + val sequenceFilePath = parser.get("sFilePath") val asDataset = spark.read.load(dbDatasetPath).as[OafDataset] .filter(p => p != null || p.getId != null) - .map(d =>DoiBoostMappingUtil.fixResult(d)) - .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) + .map(d => DoiBoostMappingUtil.fixResult(d)) + .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) - val asPublication =spark.read.load(dbPublicationPath).as[Publication] + val asPublication = spark.read.load(dbPublicationPath).as[Publication] .filter(p => p != null || p.getId != null) - .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) + .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) val asOrganization = spark.read.load(dbOrganizationPath).as[Organization] - .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) - + .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) val asCRelation = spark.read.load(crossRefRelation).as[Relation] - .filter(r => r!= null && r.getSource != null && r.getTarget != null) - .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) + .filter(r => r != null && r.getSource != null && r.getTarget != null) + .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) val asRelAffiliation = spark.read.load(dbaffiliationRelationPath).as[Relation] - .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) - - - + .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) val d: Dataset[(String, String)] = asDataset.union(asPublication).union(asOrganization).union(asCRelation).union(asRelAffiliation) - - d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec]) - - - - - - - + d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec]) } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala similarity index 76% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala index 501073e74e..91fe56cbae 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala @@ -9,28 +9,26 @@ import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.functions.col -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} -import org.slf4j.{Logger, LoggerFactory} - -import scala.collection.JavaConverters._ +import org.apache.spark.sql._ import org.json4s.DefaultFormats -import org.json4s.JsonAST.{JField, JObject, JString,JArray} +import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.jackson.JsonMethods.parse - +import org.slf4j.{Logger, LoggerFactory} +import scala.collection.JavaConverters._ object SparkGenerateDoiBoost { - def extractIdGRID(input:String):List[(String,String)] = { + def extractIdGRID(input: String): List[(String, String)] = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: org.json4s.JValue = parse(input) - val id:String = (json \ "id").extract[String] + val id: String = (json \ "id").extract[String] - val grids:List[String] = for { + val grids: List[String] = for { JObject(pid) <- json \ "pid" JField("qualifier", JObject(qualifier)) <- pid - JField("classid", JString(classid)) <-qualifier + JField("classid", JString(classid)) <- qualifier JField("value", JString(vl)) <- pid if classid == "GRID" } yield vl @@ -38,7 +36,6 @@ object SparkGenerateDoiBoost { } - def main(args: Array[String]): Unit = { val logger: Logger = LoggerFactory.getLogger(getClass) @@ -73,7 +70,7 @@ object SparkGenerateDoiBoost { if (a != null && a._2 != null) { b.mergeFrom(a._2) b.setId(a._1) - val authors =AuthorMerger.mergeAuthor(b.getAuthor, a._2.getAuthor) + val authors = AuthorMerger.mergeAuthor(b.getAuthor, a._2.getAuthor) b.setAuthor(authors) return b } @@ -87,11 +84,11 @@ object SparkGenerateDoiBoost { return b2 } else { - if (b2 != null ) { + if (b2 != null) { b1.mergeFrom(b2) - val authors =AuthorMerger.mergeAuthor(b1.getAuthor, b2.getAuthor) + val authors = AuthorMerger.mergeAuthor(b1.getAuthor, b2.getAuthor) b1.setAuthor(authors) - if (b2.getId!= null && b2.getId.nonEmpty) + if (b2.getId != null && b2.getId.nonEmpty) b1.setId(b2.getId) return b1 } @@ -118,10 +115,9 @@ object SparkGenerateDoiBoost { val crossrefPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p => (p.getId, p)) val uwPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/uwPublication").as[Publication].map(p => (p.getId, p)) - def applyMerge(item:((String, Publication), (String, Publication))) : Publication = - { + def applyMerge(item: ((String, Publication), (String, Publication))): Publication = { val crossrefPub = item._1._2 - if (item._2!= null) { + if (item._2 != null) { val otherPub = item._2._2 if (otherPub != null) { crossrefPub.mergeFrom(otherPub) @@ -130,6 +126,7 @@ object SparkGenerateDoiBoost { } crossrefPub } + crossrefPublication.joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/firstJoin") logger.info("Phase 3) Join Result with ORCID") val fj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p)) @@ -143,9 +140,9 @@ object SparkGenerateDoiBoost { sj.joinWith(magPublication, sj("_1").equalTo(magPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublication") - val doiBoostPublication: Dataset[(String,Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublication").as[Publication].filter(p=>DoiBoostMappingUtil.filterPublication(p)).map(DoiBoostMappingUtil.toISSNPair)(tupleForJoinEncoder) + val doiBoostPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublication").as[Publication].filter(p => DoiBoostMappingUtil.filterPublication(p)).map(DoiBoostMappingUtil.toISSNPair)(tupleForJoinEncoder) - val hostedByDataset : Dataset[(String, HostedByItemType)] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath).map(DoiBoostMappingUtil.toHostedByItem)) + val hostedByDataset: Dataset[(String, HostedByItemType)] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath).map(DoiBoostMappingUtil.toHostedByItem)) doiBoostPublication.joinWith(hostedByDataset, doiBoostPublication("_1").equalTo(hostedByDataset("_1")), "left") @@ -164,21 +161,20 @@ object SparkGenerateDoiBoost { val paperAffiliation = spark.read.load(paperAffiliationPath).select(col("AffiliationId").alias("affId"), col("PaperId")) - val a:Dataset[DoiBoostAffiliation] = paperAffiliation + val a: Dataset[DoiBoostAffiliation] = paperAffiliation .joinWith(affiliation, paperAffiliation("affId").equalTo(affiliation("AffiliationId"))) .select(col("_1.PaperId"), col("_2.AffiliationId"), col("_2.GridId"), col("_2.OfficialPage"), col("_2.DisplayName")).as[DoiBoostAffiliation] - - val magPubs:Dataset[(String,Publication)]= spark.read.load(s"$workingDirPath/doiBoostPublicationFiltered").as[Publication] - .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(tupleForJoinEncoder).filter(s =>s._1!= null ) + val magPubs: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublicationFiltered").as[Publication] + .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(tupleForJoinEncoder).filter(s => s._1 != null) - magPubs.joinWith(a,magPubs("_1").equalTo(a("PaperId"))).flatMap(item => { - val pub:Publication = item._1._2 + magPubs.joinWith(a, magPubs("_1").equalTo(a("PaperId"))).flatMap(item => { + val pub: Publication = item._1._2 val affiliation = item._2 - val affId:String = if (affiliation.GridId.isDefined) s"unresolved::grid::${affiliation.GridId.get.toLowerCase}" else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString) - val r:Relation = new Relation + val affId: String = if (affiliation.GridId.isDefined) s"unresolved::grid::${affiliation.GridId.get.toLowerCase}" else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString) + val r: Relation = new Relation r.setSource(pub.getId) r.setTarget(affId) r.setRelType(ModelConstants.RESULT_ORGANIZATION) @@ -186,7 +182,7 @@ object SparkGenerateDoiBoost { r.setSubRelType(ModelConstants.AFFILIATION) r.setDataInfo(pub.getDataInfo) r.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava) - val r1:Relation = new Relation + val r1: Relation = new Relation r1.setTarget(pub.getId) r1.setSource(affId) r1.setRelType(ModelConstants.RESULT_ORGANIZATION) @@ -198,33 +194,31 @@ object SparkGenerateDoiBoost { })(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved") - - - val unresolvedRels:Dataset[(String, Relation)] = spark.read.load(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved").as[Relation].map(r => { + val unresolvedRels: Dataset[(String, Relation)] = spark.read.load(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved").as[Relation].map(r => { if (r.getSource.startsWith("unresolved")) (r.getSource, r) else if (r.getTarget.startsWith("unresolved")) - (r.getTarget,r) - else + (r.getTarget, r) + else ("resolved", r) })(Encoders.tuple(Encoders.STRING, mapEncoderRel)) - val openaireOrganization:Dataset[(String,String)] = spark.read.text(openaireOrganizationPath).as[String].flatMap(s => extractIdGRID(s)).groupByKey(_._2).reduceGroups((x,y) => if (x != null) x else y ).map(_._2) + val openaireOrganization: Dataset[(String, String)] = spark.read.text(openaireOrganizationPath).as[String].flatMap(s => extractIdGRID(s)).groupByKey(_._2).reduceGroups((x, y) => if (x != null) x else y).map(_._2) - unresolvedRels.joinWith(openaireOrganization,unresolvedRels("_1").equalTo(openaireOrganization("_2"))) + unresolvedRels.joinWith(openaireOrganization, unresolvedRels("_1").equalTo(openaireOrganization("_2"))) .map { x => val currentRels = x._1._2 val currentOrgs = x._2 - if (currentOrgs!= null) - if(currentRels.getSource.startsWith("unresolved")) + if (currentOrgs != null) + if (currentRels.getSource.startsWith("unresolved")) currentRels.setSource(currentOrgs._1) else currentRels.setTarget(currentOrgs._1) - currentRels - }.filter(r=> !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved")).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation") + currentRels + }.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved")).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation") - magPubs.joinWith(a,magPubs("_1").equalTo(a("PaperId"))).map( item => { + magPubs.joinWith(a, magPubs("_1").equalTo(a("PaperId"))).map(item => { val affiliation = item._2 if (affiliation.GridId.isEmpty) { val o = new Organization @@ -241,7 +235,7 @@ object SparkGenerateDoiBoost { } else null - }).filter(o=> o!=null).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostOrganization") - } + }).filter(o => o != null).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostOrganization") + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala similarity index 99% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 1b1c850ba3..edca4a1802 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -4,20 +4,19 @@ import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf._ import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils} import eu.dnetlib.dhp.utils.DHPUtils -import eu.dnetlib.doiboost.DoiBoostMappingUtil.{decideAccessRight, _} +import eu.dnetlib.doiboost.DoiBoostMappingUtil +import eu.dnetlib.doiboost.DoiBoostMappingUtil._ import org.apache.commons.lang.StringUtils import org.json4s import org.json4s.DefaultFormats -import org.json4s.JsonAST.{JValue, _} +import org.json4s.JsonAST._ import org.json4s.jackson.JsonMethods._ import org.slf4j.{Logger, LoggerFactory} +import java.util import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.matching.Regex -import java.util - -import eu.dnetlib.doiboost.DoiBoostMappingUtil case class CrossrefDT(doi: String, json:String, timestamp: Long) {} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala similarity index 77% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala index 159b817c7f..6a1c701aff 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala @@ -6,7 +6,7 @@ import org.apache.commons.io.IOUtils import org.apache.hadoop.io.{IntWritable, Text} import org.apache.spark.SparkConf import org.apache.spark.sql.expressions.Aggregator -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession} import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse @@ -17,12 +17,12 @@ object CrossrefDataset { val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass) - def to_item(input:String):CrossrefDT = { + def to_item(input: String): CrossrefDT = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) - val ts:Long = (json \ "indexed" \ "timestamp").extract[Long] - val doi:String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String]) + val ts: Long = (json \ "indexed" \ "timestamp").extract[Long] + val doi: String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String]) CrossrefDT(doi, input, ts) } @@ -30,7 +30,6 @@ object CrossrefDataset { def main(args: Array[String]): Unit = { - val conf: SparkConf = new SparkConf() val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json"))) parser.parseArgument(args) @@ -54,7 +53,7 @@ object CrossrefDataset { return b - if(a.timestamp >b.timestamp) { + if (a.timestamp > b.timestamp) { return a } b @@ -66,7 +65,7 @@ object CrossrefDataset { if (a == null) return b - if(a.timestamp >b.timestamp) { + if (a.timestamp > b.timestamp) { return a } b @@ -79,20 +78,20 @@ object CrossrefDataset { override def finish(reduction: CrossrefDT): CrossrefDT = reduction } - val workingPath:String = parser.get("workingPath") + val workingPath: String = parser.get("workingPath") - val main_ds:Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT] + val main_ds: Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT] val update = - spark.createDataset(spark.sparkContext.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text]) - .map(i =>CrossrefImporter.decompressBlob(i._2.toString)) - .map(i =>to_item(i))) + spark.createDataset(spark.sparkContext.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text]) + .map(i => CrossrefImporter.decompressBlob(i._2.toString)) + .map(i => to_item(i))) main_ds.union(update).groupByKey(_.doi) .agg(crossrefAggregator.toColumn) - .map(s=>s._2) + .map(s => s._2) .write.mode(SaveMode.Overwrite).save(s"$workingPath/crossref_ds_updated") } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala similarity index 73% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala index 526ff7b3ad..6d03abc259 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala @@ -2,17 +2,12 @@ package eu.dnetlib.doiboost.crossref import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.doiboost.DoiBoostMappingUtil -import eu.dnetlib.doiboost.crossref.CrossrefDataset.to_item -import eu.dnetlib.doiboost.crossref.UnpackCrtossrefEntries.getClass -import org.apache.hadoop.io.{IntWritable, Text} -import org.apache.hadoop.io.compress.GzipCodec import org.apache.spark.rdd.RDD -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.{SparkConf, SparkContext} import org.json4s import org.json4s.DefaultFormats -import org.json4s.JsonAST.JArray -import org.json4s.jackson.JsonMethods.{compact, parse, render} +import org.json4s.jackson.JsonMethods.parse import org.slf4j.{Logger, LoggerFactory} import scala.io.Source @@ -24,11 +19,10 @@ object GenerateCrossrefDataset { implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT] - def crossrefElement(meta: String): CrossrefDT = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(meta) - val doi:String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String]) + val doi: String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String]) val timestamp: Long = (json \ "indexed" \ "timestamp").extract[Long] CrossrefDT(doi, meta, timestamp) @@ -51,14 +45,14 @@ object GenerateCrossrefDataset { import spark.implicits._ - val tmp : RDD[String] = sc.textFile(sourcePath,6000) + val tmp: RDD[String] = sc.textFile(sourcePath, 6000) spark.createDataset(tmp) .map(entry => crossrefElement(entry)) .write.mode(SaveMode.Overwrite).save(targetPath) -// .map(meta => crossrefElement(meta)) -// .toDS.as[CrossrefDT] -// .write.mode(SaveMode.Overwrite).save(targetPath) + // .map(meta => crossrefElement(meta)) + // .toDS.as[CrossrefDT] + // .write.mode(SaveMode.Overwrite).save(targetPath) } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala similarity index 96% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala index c65916610c..fa55b9fb9a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala @@ -4,10 +4,8 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.schema.oaf import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Dataset => OafDataset} import org.apache.commons.io.IOUtils - import org.apache.spark.SparkConf - -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql._ import org.slf4j.{Logger, LoggerFactory} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala similarity index 88% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala index 95ecb568bd..191c4587ec 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala @@ -2,8 +2,8 @@ package eu.dnetlib.doiboost.crossref import eu.dnetlib.dhp.application.ArgumentApplicationParser import org.apache.hadoop.io.compress.GzipCodec +import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.json4s import org.json4s.DefaultFormats import org.json4s.JsonAST.JArray @@ -17,9 +17,7 @@ object UnpackCrtossrefEntries { val log: Logger = LoggerFactory.getLogger(UnpackCrtossrefEntries.getClass) - - - def extractDump(input:String):List[String] = { + def extractDump(input: String): List[String] = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) @@ -30,7 +28,6 @@ object UnpackCrtossrefEntries { } - def main(args: Array[String]): Unit = { val conf = new SparkConf val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json")).mkString) @@ -45,7 +42,7 @@ object UnpackCrtossrefEntries { .getOrCreate() val sc: SparkContext = spark.sparkContext - sc.wholeTextFiles(sourcePath,6000).flatMap(d =>extractDump(d._2)) + sc.wholeTextFiles(sourcePath, 6000).flatMap(d => extractDump(d._2)) .saveAsTextFile(targetPath, classOf[GzipCodec]) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala similarity index 100% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala index fd96290242..0a6fa00f04 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala @@ -5,10 +5,10 @@ import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty} import eu.dnetlib.doiboost.DoiBoostMappingUtil +import eu.dnetlib.doiboost.DoiBoostMappingUtil._ import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse -import eu.dnetlib.doiboost.DoiBoostMappingUtil._ import scala.collection.JavaConverters._ import scala.collection.mutable diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala similarity index 98% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala index a68d0bb2d4..d25a4893f5 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala @@ -3,8 +3,8 @@ package eu.dnetlib.doiboost.mag import eu.dnetlib.dhp.application.ArgumentApplicationParser import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.sql.{SaveMode, SparkSession} import org.apache.spark.sql.types._ +import org.apache.spark.sql.{SaveMode, SparkSession} import org.slf4j.{Logger, LoggerFactory} object SparkImportMagIntoDataset { @@ -24,13 +24,13 @@ object SparkImportMagIntoDataset { "Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Iso3166Code:string", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")), "AuthorExtendedAttributes" -> Tuple2("mag/AuthorExtendedAttributes.txt", Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")), "Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), - "ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")), + "ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")), "ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), "EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")), "FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")), "FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")), "FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), - "Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "CreatedDate:DateTime")), + "Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), "PaperAbstractsInvertedIndex" -> Tuple2("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", "IndexedAbstract:string")), "PaperAuthorAffiliations" -> Tuple2("mag/PaperAuthorAffiliations.txt", Seq("PaperId:long", "AuthorId:long", "AffiliationId:long?", "AuthorSequenceNumber:uint", "OriginalAuthor:string", "OriginalAffiliation:string")), "PaperCitationContexts" -> Tuple2("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")), @@ -75,7 +75,6 @@ object SparkImportMagIntoDataset { .master(parser.get("master")).getOrCreate() - stream.foreach { case (k, v) => val s: StructType = getSchema(k) val df = spark.read diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala similarity index 90% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala index 0ea4b66a4b..41e95baa1c 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala @@ -5,22 +5,19 @@ import eu.dnetlib.dhp.schema.oaf.Publication import eu.dnetlib.doiboost.DoiBoostMappingUtil import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.functions._ +import org.apache.spark.sql.functions.{col, collect_list, struct} import org.apache.spark.sql._ import org.slf4j.{Logger, LoggerFactory} - import scala.collection.JavaConverters._ - object SparkProcessMAG { - def getDistinctResults (d:Dataset[MagPapers]):Dataset[MagPapers]={ + def getDistinctResults(d: Dataset[MagPapers]): Dataset[MagPapers] = { d.where(col("Doi").isNotNull) .groupByKey(mp => DoiBoostMappingUtil.normalizeDoi(mp.Doi))(Encoders.STRING) - .reduceGroups((p1:MagPapers,p2:MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1,p2)) + .reduceGroups((p1: MagPapers, p2: MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1, p2)) .map(_._2)(Encoders.product[MagPapers]) .map(mp => { - new MagPapers(mp.PaperId, mp.Rank, DoiBoostMappingUtil.normalizeDoi(mp.Doi), + MagPapers(mp.PaperId, mp.Rank, DoiBoostMappingUtil.normalizeDoi(mp.Doi), mp.DocType, mp.PaperTitle, mp.OriginalTitle, mp.BookTitle, mp.Year, mp.Date, mp.Publisher: String, mp.JournalId, mp.ConferenceSeriesId, mp.ConferenceInstanceId, @@ -98,13 +95,13 @@ object SparkProcessMAG { var magPubs: Dataset[(String, Publication)] = spark.read.load(s"$workingPath/merge_step_2").as[Publication] - .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)] + .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)] val conference = spark.read.load(s"$sourcePath/ConferenceInstances") - .select($"ConferenceInstanceId".as("ci"), $"DisplayName", $"Location", $"StartDate",$"EndDate" ) + .select($"ConferenceInstanceId".as("ci"), $"DisplayName", $"Location", $"StartDate", $"EndDate") val conferenceInstance = conference.joinWith(papers, papers("ConferenceInstanceId").equalTo(conference("ci"))) - .select($"_1.ci", $"_1.DisplayName", $"_1.Location", $"_1.StartDate",$"_1.EndDate", $"_2.PaperId").as[MagConferenceInstance] + .select($"_1.ci", $"_1.DisplayName", $"_1.Location", $"_1.StartDate", $"_1.EndDate", $"_2.PaperId").as[MagConferenceInstance] magPubs.joinWith(conferenceInstance, col("_1").equalTo(conferenceInstance("PaperId")), "left") @@ -122,7 +119,7 @@ object SparkProcessMAG { magPubs.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left") .map(item => ConversionUtil.updatePubsWithDescription(item) - ).write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_4") + ).write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_4") logger.info("Phase 7) Enrich Publication with FieldOfStudy") @@ -148,11 +145,10 @@ object SparkProcessMAG { spark.read.load(s"$workingPath/mag_publication").as[Publication] .filter(p => p.getId != null) .groupByKey(p => p.getId) - .reduceGroups((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b)) + .reduceGroups((a: Publication, b: Publication) => ConversionUtil.mergePublication(a, b)) .map(_._2) .write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication") - } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala similarity index 98% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala index 1cd3f70283..11031f9ca4 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala @@ -4,17 +4,16 @@ import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication} -import eu.dnetlib.dhp.schema.orcid.{AuthorData, OrcidDOI} import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.DoiBoostMappingUtil.{createSP, generateDataInfo} import org.apache.commons.lang.StringUtils -import org.slf4j.{Logger, LoggerFactory} - -import scala.collection.JavaConverters._ import org.json4s import org.json4s.DefaultFormats import org.json4s.JsonAST._ import org.json4s.jackson.JsonMethods._ +import org.slf4j.{Logger, LoggerFactory} + +import scala.collection.JavaConverters._ case class ORCIDItem(doi:String, authors:List[OrcidAuthor]){} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala similarity index 84% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala index fa4a93e008..1b189e2969 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala @@ -11,10 +11,10 @@ object SparkConvertORCIDToOAF { val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass) - def run(spark:SparkSession, workingPath:String, targetPath:String) :Unit = { + def run(spark: SparkSession, workingPath: String, targetPath: String): Unit = { implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication] import spark.implicits._ - val dataset: Dataset[ORCIDItem] =spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem] + val dataset: Dataset[ORCIDItem] = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem] logger.info("Converting ORCID to OAF") dataset.map(o => ORCIDToOAF.convertTOOAF(o)).write.mode(SaveMode.Overwrite).save(targetPath) @@ -35,8 +35,8 @@ object SparkConvertORCIDToOAF { val workingPath = parser.get("workingPath") val targetPath = parser.get("targetPath") - run(spark,workingPath, targetPath) + run(spark, workingPath, targetPath) } -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala similarity index 67% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala index 31f3319122..153be5dd13 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala @@ -1,48 +1,45 @@ package eu.dnetlib.doiboost.orcid -import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper} import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.oa.merge.AuthorMerger import eu.dnetlib.dhp.schema.oaf.Publication -import eu.dnetlib.dhp.schema.orcid.OrcidDOI import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql.functions.{col, collect_list} +import org.apache.spark.sql._ import org.slf4j.{Logger, LoggerFactory} object SparkPreprocessORCID { val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass) - def fixORCIDItem(item :ORCIDItem):ORCIDItem = { - ORCIDItem(item.doi, item.authors.groupBy(_.oid).map(_._2.head).toList) + def fixORCIDItem(item: ORCIDItem): ORCIDItem = { + ORCIDItem(item.doi, item.authors.groupBy(_.oid).map(_._2.head).toList) } - def run(spark:SparkSession,sourcePath:String,workingPath:String):Unit = { + def run(spark: SparkSession, sourcePath: String, workingPath: String): Unit = { import spark.implicits._ implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication] - val inputRDD:RDD[OrcidAuthor] = spark.sparkContext.textFile(s"$sourcePath/authors").map(s => ORCIDToOAF.convertORCIDAuthor(s)).filter(s => s!= null).filter(s => ORCIDToOAF.authorValid(s)) + val inputRDD: RDD[OrcidAuthor] = spark.sparkContext.textFile(s"$sourcePath/authors").map(s => ORCIDToOAF.convertORCIDAuthor(s)).filter(s => s != null).filter(s => ORCIDToOAF.authorValid(s)) spark.createDataset(inputRDD).as[OrcidAuthor].write.mode(SaveMode.Overwrite).save(s"$workingPath/author") - val res = spark.sparkContext.textFile(s"$sourcePath/works").flatMap(s => ORCIDToOAF.extractDOIWorks(s)).filter(s => s!= null) + val res = spark.sparkContext.textFile(s"$sourcePath/works").flatMap(s => ORCIDToOAF.extractDOIWorks(s)).filter(s => s != null) spark.createDataset(res).as[OrcidWork].write.mode(SaveMode.Overwrite).save(s"$workingPath/works") - val authors :Dataset[OrcidAuthor] = spark.read.load(s"$workingPath/author").as[OrcidAuthor] + val authors: Dataset[OrcidAuthor] = spark.read.load(s"$workingPath/author").as[OrcidAuthor] - val works :Dataset[OrcidWork] = spark.read.load(s"$workingPath/works").as[OrcidWork] + val works: Dataset[OrcidWork] = spark.read.load(s"$workingPath/works").as[OrcidWork] works.joinWith(authors, authors("oid").equalTo(works("oid"))) - .map(i =>{ + .map(i => { val doi = i._1.doi val author = i._2 - (doi, author) - }).groupBy(col("_1").alias("doi")) + (doi, author) + }).groupBy(col("_1").alias("doi")) .agg(collect_list(col("_2")).alias("authors")).as[ORCIDItem] .map(s => fixORCIDItem(s)) .write.mode(SaveMode.Overwrite).save(s"$workingPath/orcidworksWithAuthor") @@ -67,4 +64,4 @@ object SparkPreprocessORCID { } -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala similarity index 80% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala index 4530926f10..70290018d9 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala @@ -1,16 +1,14 @@ package eu.dnetlib.doiboost.uw import eu.dnetlib.dhp.application.ArgumentApplicationParser - import eu.dnetlib.dhp.schema.oaf.Publication import eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql._ import org.slf4j.{Logger, LoggerFactory} - object SparkMapUnpayWallToOAF { def main(args: Array[String]): Unit = { @@ -32,11 +30,11 @@ object SparkMapUnpayWallToOAF { val sourcePath = parser.get("sourcePath") val targetPath = parser.get("targetPath") - val inputRDD:RDD[String] = spark.sparkContext.textFile(s"$sourcePath") + val inputRDD: RDD[String] = spark.sparkContext.textFile(s"$sourcePath") logger.info("Converting UnpayWall to OAF") - val d:Dataset[Publication] = spark.createDataset(inputRDD.map(UnpayWallToOAF.convertToOAF).filter(p=>p!=null)).as[Publication] + val d: Dataset[Publication] = spark.createDataset(inputRDD.map(UnpayWallToOAF.convertToOAF).filter(p => p != null)).as[Publication] d.write.mode(SaveMode.Overwrite).save(targetPath) } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala similarity index 98% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala index c8324cde16..bf56949658 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala @@ -4,14 +4,13 @@ import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory import eu.dnetlib.dhp.schema.oaf.{AccessRight, Instance, OpenAccessRoute, Publication} import eu.dnetlib.doiboost.DoiBoostMappingUtil +import eu.dnetlib.doiboost.DoiBoostMappingUtil._ import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ -import eu.dnetlib.doiboost.DoiBoostMappingUtil._ -import eu.dnetlib.doiboost.uw.UnpayWallToOAF.get_unpaywall_color diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/DoiBoostHostedByMapTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/DoiBoostHostedByMapTest.scala deleted file mode 100644 index 4912648bea..0000000000 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/DoiBoostHostedByMapTest.scala +++ /dev/null @@ -1,70 +0,0 @@ -package eu.dnetlib.dhp.doiboost - -import eu.dnetlib.dhp.schema.oaf.{Publication, Dataset => OafDataset} -import eu.dnetlib.doiboost.{DoiBoostMappingUtil, HostedByItemType} -import eu.dnetlib.doiboost.SparkGenerateDoiBoost.getClass -import eu.dnetlib.doiboost.mag.ConversionUtil -import eu.dnetlib.doiboost.orcid.ORCIDElement -import org.apache.spark.SparkConf -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} -import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig} -import org.junit.jupiter.api.Test - -import scala.io.Source - -class DoiBoostHostedByMapTest { - - - -// @Test -// def testMerge():Unit = { -// val conf: SparkConf = new SparkConf() -// val spark: SparkSession = -// SparkSession -// .builder() -// .config(conf) -// .appName(getClass.getSimpleName) -// .master("local[*]").getOrCreate() -// -// -// -// implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication] -// implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset] -// implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub) -// -// -// import spark.implicits._ -// val dataset:RDD[String]= spark.sparkContext.textFile("/home/sandro/Downloads/hbMap.gz") -// -// -// val hbMap:Dataset[(String, HostedByItemType)] =spark.createDataset(dataset.map(DoiBoostMappingUtil.toHostedByItem)) -// -// -// hbMap.show() -// -// -// -// -// -// -// -// -// -// -// } - - - @Test - def idDSGeneration():Unit = { - val s ="doajarticles::0066-782X" - - - - println(DoiBoostMappingUtil.generateDSId(s)) - - - } - - -} diff --git a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/DoiBoostHostedByMapTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/DoiBoostHostedByMapTest.scala new file mode 100644 index 0000000000..41730ade03 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/DoiBoostHostedByMapTest.scala @@ -0,0 +1,20 @@ +package eu.dnetlib.dhp.doiboost + +import eu.dnetlib.doiboost.DoiBoostMappingUtil +import org.junit.jupiter.api.Test + +class DoiBoostHostedByMapTest { + + @Test + def idDSGeneration():Unit = { + val s ="doajarticles::0066-782X" + + + + println(DoiBoostMappingUtil.generateDSId(s)) + + + } + + +} diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala similarity index 100% rename from dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala rename to dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/crossref/CrossrefMappingTest.scala similarity index 88% rename from dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala rename to dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/crossref/CrossrefMappingTest.scala index 5ef92cfa47..71dbf27bea 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/crossref/CrossrefMappingTest.scala @@ -1,7 +1,8 @@ -package eu.dnetlib.doiboost.crossref +package eu.dnetlib.dhp.doiboost.crossref import eu.dnetlib.dhp.schema.oaf._ import eu.dnetlib.dhp.utils.DHPUtils +import eu.dnetlib.doiboost.crossref.Crossref2Oaf import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig} import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.Test @@ -21,9 +22,9 @@ class CrossrefMappingTest { @Test def testFunderRelationshipsMapping(): Unit = { - val template = Source.fromInputStream(getClass.getResourceAsStream("article_funder_template.json")).mkString - val funder_doi = Source.fromInputStream(getClass.getResourceAsStream("funder_doi")).mkString - val funder_name = Source.fromInputStream(getClass.getResourceAsStream("funder_doi")).mkString + val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")).mkString + val funder_doi = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi")).mkString + val funder_name = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi")).mkString for (line <- funder_doi.lines) { @@ -72,7 +73,7 @@ class CrossrefMappingTest { @Test def testOrcidID() :Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("orcid_data.json")).mkString + val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/orcid_data.json")).mkString assertNotNull(json) @@ -93,7 +94,7 @@ class CrossrefMappingTest { @Test def testEmptyTitle() :Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("empty_title.json")).mkString + val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/empty_title.json")).mkString assertNotNull(json) @@ -115,7 +116,7 @@ class CrossrefMappingTest { @Test def testPeerReviewed(): Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("prwTest.json")).mkString + val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/prwTest.json")).mkString mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) assertNotNull(json) @@ -156,7 +157,7 @@ class CrossrefMappingTest { @Test def testJournalRelation(): Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("awardTest.json")).mkString + val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/awardTest.json")).mkString assertNotNull(json) assertFalse(json.isEmpty) @@ -177,7 +178,7 @@ class CrossrefMappingTest { @Test def testConvertBookFromCrossRef2Oaf(): Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("book.json")).mkString + val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/book.json")).mkString assertNotNull(json) assertFalse(json.isEmpty); @@ -233,7 +234,7 @@ class CrossrefMappingTest { @Test def testConvertPreprintFromCrossRef2Oaf(): Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("preprint.json")).mkString + val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/preprint.json")).mkString assertNotNull(json) assertFalse(json.isEmpty); @@ -291,7 +292,7 @@ class CrossrefMappingTest { @Test def testConvertDatasetFromCrossRef2Oaf(): Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("dataset.json")).mkString + val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dataset.json")).mkString assertNotNull(json) assertFalse(json.isEmpty); @@ -332,7 +333,7 @@ class CrossrefMappingTest { @Test def testConvertArticleFromCrossRef2Oaf(): Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("article.json")).mkString + val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json")).mkString assertNotNull(json) assertFalse(json.isEmpty); @@ -400,7 +401,7 @@ class CrossrefMappingTest { @Test def testSetDateOfAcceptanceCrossRef2Oaf(): Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("dump_file.json")).mkString + val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dump_file.json")).mkString assertNotNull(json) assertFalse(json.isEmpty); @@ -415,55 +416,12 @@ class CrossrefMappingTest { assert(items.size == 1) val result: Result = items.head.asInstanceOf[Publication] assertNotNull(result) - logger.info(mapper.writeValueAsString(result)); - -// assertNotNull(result.getDataInfo, "Datainfo test not null Failed"); -// assertNotNull( -// result.getDataInfo.getProvenanceaction, -// "DataInfo/Provenance test not null Failed"); -// assertFalse( -// result.getDataInfo.getProvenanceaction.getClassid.isEmpty, -// "DataInfo/Provenance/classId test not null Failed"); -// assertFalse( -// result.getDataInfo.getProvenanceaction.getClassname.isEmpty, -// "DataInfo/Provenance/className test not null Failed"); -// assertFalse( -// result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty, -// "DataInfo/Provenance/SchemeId test not null Failed"); -// assertFalse( -// result.getDataInfo.getProvenanceaction.getSchemename.isEmpty, -// "DataInfo/Provenance/SchemeName test not null Failed"); -// -// assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed"); -// assertFalse(result.getCollectedfrom.isEmpty); -// -// val collectedFromList = result.getCollectedfrom.asScala -// assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from assertion") -// -// assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion") -// -// -// val relevantDates = result.getRelevantdate.asScala -// -// assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created") -// -// val rels = resultList.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]] -// assertFalse(rels.isEmpty) -// rels.foreach(relation => { -// assertNotNull(relation) -// assertFalse(relation.getSource.isEmpty) -// assertFalse(relation.getTarget.isEmpty) -// assertFalse(relation.getRelClass.isEmpty) -// assertFalse(relation.getRelType.isEmpty) -// assertFalse(relation.getSubRelType.isEmpty) -// -// }) } @Test def testNormalizeDOI(): Unit = { - val template = Source.fromInputStream(getClass.getResourceAsStream("article_funder_template.json")).mkString + val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")).mkString val line :String = "\"funder\": [{\"name\": \"Wellcome Trust Masters Fellowship\",\"award\": [\"090633\"]}]," val json = template.replace("%s", line) val resultList: List[Oaf] = Crossref2Oaf.convert(json) @@ -479,7 +437,7 @@ class CrossrefMappingTest { @Test def testNormalizeDOI2(): Unit = { - val template = Source.fromInputStream(getClass.getResourceAsStream("article.json")).mkString + val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json")).mkString val resultList: List[Oaf] = Crossref2Oaf.convert(template) assertTrue(resultList.nonEmpty) @@ -494,7 +452,7 @@ class CrossrefMappingTest { @Test def testLicenseVorClosed() :Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("publication_license_vor.json")).mkString + val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_vor.json")).mkString assertNotNull(json) @@ -521,7 +479,7 @@ class CrossrefMappingTest { @Test def testLicenseOpen() :Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("publication_license_open.json")).mkString + val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_open.json")).mkString assertNotNull(json) @@ -544,7 +502,7 @@ class CrossrefMappingTest { @Test def testLicenseEmbargoOpen() :Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("publication_license_embargo_open.json")).mkString + val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo_open.json")).mkString assertNotNull(json) @@ -567,7 +525,7 @@ class CrossrefMappingTest { @Test def testLicenseEmbargo() :Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("publication_license_embargo.json")).mkString + val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo.json")).mkString assertNotNull(json) @@ -591,7 +549,7 @@ class CrossrefMappingTest { @Test def testLicenseEmbargoDateTime() :Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("publication_license_embargo_datetime.json")).mkString + val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo_datetime.json")).mkString assertNotNull(json) @@ -614,7 +572,7 @@ class CrossrefMappingTest { @Test def testMultipleURLs() :Unit = { - val json = Source.fromInputStream(getClass.getResourceAsStream("multiple_urls.json")).mkString + val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/multiple_urls.json")).mkString assertNotNull(json) diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/mag/MAGMappingTest.scala similarity index 88% rename from dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala rename to dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/mag/MAGMappingTest.scala index 46d4ec08d6..611f3b3238 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/mag/MAGMappingTest.scala @@ -1,11 +1,12 @@ -package eu.dnetlib.doiboost.mag +package eu.dnetlib.dhp.doiboost.mag +import eu.dnetlib.doiboost.mag.{ConversionUtil, MagPapers, SparkProcessMAG} import org.apache.spark.SparkConf import org.apache.spark.sql.{Dataset, SparkSession} import org.codehaus.jackson.map.ObjectMapper +import org.json4s.DefaultFormats import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.Test -import org.json4s.DefaultFormats import org.slf4j.{Logger, LoggerFactory} import java.sql.Timestamp @@ -47,7 +48,7 @@ class MAGMappingTest { @Test def buildInvertedIndexTest(): Unit = { - val json_input = Source.fromInputStream(getClass.getResourceAsStream("invertedIndex.json")).mkString + val json_input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/mag/invertedIndex.json")).mkString val description = ConversionUtil.convertInvertedIndexString(json_input) assertNotNull(description) assertTrue(description.nonEmpty) @@ -71,7 +72,7 @@ class MAGMappingTest { .appName(getClass.getSimpleName) .config(conf) .getOrCreate() - val path = getClass.getResource("magPapers.json").getPath + val path = getClass.getResource("/eu/dnetlib/doiboost/mag/magPapers.json").getPath import org.apache.spark.sql.Encoders val schema = Encoders.product[MagPapers].schema @@ -101,7 +102,7 @@ class MAGMappingTest { .appName(getClass.getSimpleName) .config(conf) .getOrCreate() - val path = getClass.getResource("duplicatedMagPapers.json").getPath + val path = getClass.getResource("/eu/dnetlib/doiboost/mag/duplicatedMagPapers.json").getPath import org.apache.spark.sql.Encoders val schema = Encoders.product[MagPapers].schema diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/orcid/MappingORCIDToOAFTest.scala similarity index 95% rename from dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala rename to dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/orcid/MappingORCIDToOAFTest.scala index b484dc0878..7c8f01f81b 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/orcid/MappingORCIDToOAFTest.scala @@ -1,7 +1,8 @@ -package eu.dnetlib.doiboost.orcid +package eu.dnetlib.dhp.doiboost.orcid import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.schema.oaf.Publication +import eu.dnetlib.doiboost.orcid._ import org.apache.spark.SparkConf import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} import org.junit.jupiter.api.Assertions._ @@ -10,9 +11,8 @@ import org.junit.jupiter.api.io.TempDir import org.slf4j.{Logger, LoggerFactory} import java.nio.file.Path -import scala.io.Source - import scala.collection.JavaConversions._ +import scala.io.Source class MappingORCIDToOAFTest { val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass) @@ -20,7 +20,7 @@ class MappingORCIDToOAFTest { @Test def testExtractData():Unit ={ - val json = Source.fromInputStream(getClass.getResourceAsStream("dataOutput")).mkString + val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/orcid/dataOutput")).mkString assertNotNull(json) assertFalse(json.isEmpty) json.lines.foreach(s => { diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/uw/UnpayWallMappingTest.scala similarity index 91% rename from dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala rename to dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/uw/UnpayWallMappingTest.scala index fa696fffc7..6671758b2a 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/uw/UnpayWallMappingTest.scala @@ -1,13 +1,13 @@ -package eu.dnetlib.doiboost.uw - +package eu.dnetlib.dhp.doiboost.uw import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.schema.oaf.OpenAccessRoute +import eu.dnetlib.doiboost.uw.UnpayWallToOAF +import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.Test +import org.slf4j.{Logger, LoggerFactory} import scala.io.Source -import org.junit.jupiter.api.Assertions._ -import org.slf4j.{Logger, LoggerFactory} class UnpayWallMappingTest { @@ -18,7 +18,7 @@ class UnpayWallMappingTest { @Test def testMappingToOAF():Unit ={ - val Ilist = Source.fromInputStream(getClass.getResourceAsStream("input.json")).mkString + val Ilist = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/uw/input.json")).mkString var i:Int = 0 for (line <-Ilist.lines) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala index ce383292c1..ad4e1c96ea 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala @@ -1,8 +1,8 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo -import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn} import org.apache.spark.sql.expressions.Aggregator +import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn} case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala similarity index 81% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala index 1b18ba3ae4..38af3eee4e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala @@ -2,13 +2,12 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.oa.graph.hostedbymap.SparkApplyHostedByMapToResult.{applyHBtoPubs, getClass} import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo import eu.dnetlib.dhp.schema.common.ModelConstants -import eu.dnetlib.dhp.schema.oaf.{Datasource, Publication} +import eu.dnetlib.dhp.schema.oaf.Datasource import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql._ import org.json4s.DefaultFormats import org.slf4j.{Logger, LoggerFactory} @@ -52,18 +51,18 @@ object SparkApplyHostedByMapToDatasource { val mapper = new ObjectMapper() - val dats : Dataset[Datasource] = spark.read.textFile(graphPath + "/datasource") + val dats: Dataset[Datasource] = spark.read.textFile(graphPath + "/datasource") .map(r => mapper.readValue(r, classOf[Datasource])) - val pinfo : Dataset[EntityInfo] = Aggregators.datasourceToSingleId( spark.read.textFile(preparedInfoPath) + val pinfo: Dataset[EntityInfo] = Aggregators.datasourceToSingleId(spark.read.textFile(preparedInfoPath) .map(ei => mapper.readValue(ei, classOf[EntityInfo]))) - applyHBtoDats(pinfo, dats).write.mode(SaveMode.Overwrite).option("compression","gzip").json(outputPath) + applyHBtoDats(pinfo, dats).write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath) spark.read.textFile(outputPath) .write .mode(SaveMode.Overwrite) - .option("compression","gzip") + .option("compression", "gzip") .text(graphPath + "/datasource") } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala similarity index 83% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala index 0e047d0169..2043259822 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala @@ -5,16 +5,14 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils -import eu.dnetlib.dhp.schema.oaf.{Datasource, Instance, OpenAccessRoute, Publication} +import eu.dnetlib.dhp.schema.oaf.{Instance, OpenAccessRoute, Publication} import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql._ import org.json4s.DefaultFormats import org.slf4j.{Logger, LoggerFactory} - import scala.collection.JavaConverters._ - object SparkApplyHostedByMapToResult { def applyHBtoPubs(join: Dataset[EntityInfo], pubs: Dataset[Publication]) = { @@ -25,7 +23,7 @@ object SparkApplyHostedByMapToResult { val ei: EntityInfo = t2._2 val i = p.getInstance().asScala if (i.size == 1) { - val inst: Instance = i(0) + val inst: Instance = i.head inst.getHostedby.setKey(ei.getHostedById) inst.getHostedby.setValue(ei.getName) if (ei.getOpenAccess) { @@ -39,6 +37,7 @@ object SparkApplyHostedByMapToResult { p })(Encoders.bean(classOf[Publication])) } + def main(args: Array[String]): Unit = { @@ -67,18 +66,18 @@ object SparkApplyHostedByMapToResult { implicit val mapEncoderEinfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo]) val mapper = new ObjectMapper() - val pubs : Dataset[Publication] = spark.read.textFile(graphPath + "/publication") + val pubs: Dataset[Publication] = spark.read.textFile(graphPath + "/publication") .map(r => mapper.readValue(r, classOf[Publication])) - val pinfo : Dataset[EntityInfo] = spark.read.textFile(preparedInfoPath) - .map(ei => mapper.readValue(ei, classOf[EntityInfo])) + val pinfo: Dataset[EntityInfo] = spark.read.textFile(preparedInfoPath) + .map(ei => mapper.readValue(ei, classOf[EntityInfo])) - applyHBtoPubs(pinfo, pubs).write.mode(SaveMode.Overwrite).option("compression","gzip").json(outputPath) + applyHBtoPubs(pinfo, pubs).write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath) spark.read.textFile(outputPath) .write .mode(SaveMode.Overwrite) - .option("compression","gzip") + .option("compression", "gzip") .text(graphPath + "/publication") } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala similarity index 74% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala index b7a7d352f2..87e203e4b3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala @@ -3,61 +3,58 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo - import eu.dnetlib.dhp.schema.oaf.{Journal, Publication} import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql._ import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse import org.slf4j.{Logger, LoggerFactory} - - object SparkPrepareHostedByInfoToApply { implicit val mapEncoderPInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo]) - def getList(id: String, j: Journal, name: String ) : List[EntityInfo] = { - var lst:List[EntityInfo] = List() + def getList(id: String, j: Journal, name: String): List[EntityInfo] = { + var lst: List[EntityInfo] = List() - if (j.getIssnLinking != null && !j.getIssnLinking.equals("")){ + if (j.getIssnLinking != null && !j.getIssnLinking.equals("")) { lst = EntityInfo.newInstance(id, j.getIssnLinking, name) :: lst } - if (j.getIssnOnline != null && !j.getIssnOnline.equals("")){ + if (j.getIssnOnline != null && !j.getIssnOnline.equals("")) { lst = EntityInfo.newInstance(id, j.getIssnOnline, name) :: lst } - if (j.getIssnPrinted != null && !j.getIssnPrinted.equals("")){ + if (j.getIssnPrinted != null && !j.getIssnPrinted.equals("")) { lst = EntityInfo.newInstance(id, j.getIssnPrinted, name) :: lst } lst } - def prepareResultInfo(spark:SparkSession, publicationPath:String) : Dataset[EntityInfo] = { + def prepareResultInfo(spark: SparkSession, publicationPath: String): Dataset[EntityInfo] = { implicit val mapEncoderPubs: Encoder[Publication] = Encoders.bean(classOf[Publication]) val mapper = new ObjectMapper() - val dd : Dataset[Publication] = spark.read.textFile(publicationPath) + val dd: Dataset[Publication] = spark.read.textFile(publicationPath) .map(r => mapper.readValue(r, classOf[Publication])) - dd.filter(p => p.getJournal != null ).flatMap(p => getList(p.getId, p.getJournal, "")) + dd.filter(p => p.getJournal != null).flatMap(p => getList(p.getId, p.getJournal, "")) } - def toEntityInfo(input:String): EntityInfo = { + def toEntityInfo(input: String): EntityInfo = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) - val c :Map[String,HostedByItemType] = json.extract[Map[String, HostedByItemType]] + val c: Map[String, HostedByItemType] = json.extract[Map[String, HostedByItemType]] toEntityItem(c.keys.head, c.values.head) } - def toEntityItem(journal_id: String , hbi: HostedByItemType): EntityInfo = { + def toEntityItem(journal_id: String, hbi: HostedByItemType): EntityInfo = { EntityInfo.newInstance(hbi.id, journal_id, hbi.officialname, hbi.openAccess) @@ -67,7 +64,7 @@ object SparkPrepareHostedByInfoToApply { Aggregators.resultToSingleId(res.joinWith(hbm, res.col("journalId").equalTo(hbm.col("journalId")), "left") .map(t2 => { val res: EntityInfo = t2._1 - if(t2._2 != null ){ + if (t2._2 != null) { val ds = t2._2 res.setHostedById(ds.getId) res.setOpenAccess(ds.getOpenAccess) @@ -107,10 +104,10 @@ object SparkPrepareHostedByInfoToApply { //STEP1: read the hostedbymap and transform it in EntityInfo - val hostedByInfo:Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo) + val hostedByInfo: Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo) - //STEP2: create association (publication, issn), (publication, eissn), (publication, lissn) - val resultInfoDataset:Dataset[EntityInfo] = prepareResultInfo(spark, graphPath + "/publication") + //STEP2: create association (publication, issn), (publication, eissn), (publication, lissn) + val resultInfoDataset: Dataset[EntityInfo] = prepareResultInfo(spark, graphPath + "/publication") //STEP3: left join resultInfo with hostedByInfo on journal_id. Reduction of all the results with the same id in just //one entry (one result could be associated to issn and eissn and so possivly matching more than once against the map) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala similarity index 61% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala index 1ee1d5d1ab..6dfe356239 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala @@ -1,41 +1,39 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap +import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.oa.graph.hostedbymap.model.{DOAJModel, UnibiGoldModel} import eu.dnetlib.dhp.schema.oaf.Datasource import org.apache.commons.io.IOUtils +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hadoop.io.compress.GzipCodec import org.apache.spark.SparkConf -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} import org.json4s.DefaultFormats import org.slf4j.{Logger, LoggerFactory} -import com.fasterxml.jackson.databind.ObjectMapper -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.FileSystem -import org.apache.hadoop.fs.Path + import java.io.PrintWriter -import org.apache.hadoop.io.compress.GzipCodec - - object SparkProduceHostedByMap { implicit val tupleForJoinEncoder: Encoder[(String, HostedByItemType)] = Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]) - def toHostedByItemType(input: ((HostedByInfo, HostedByInfo), HostedByInfo)) : HostedByItemType = { + def toHostedByItemType(input: ((HostedByInfo, HostedByInfo), HostedByInfo)): HostedByItemType = { val openaire: HostedByInfo = input._1._1 val doaj: HostedByInfo = input._1._2 val gold: HostedByInfo = input._2 val isOpenAccess: Boolean = doaj == null && gold == null openaire.journal_id match { - case Constants.ISSN => HostedByItemType(openaire.id, openaire.officialname, openaire.journal_id, "", "", isOpenAccess) - case Constants.EISSN => HostedByItemType(openaire.id, openaire.officialname, "", openaire.journal_id, "", isOpenAccess) - case Constants.ISSNL => HostedByItemType(openaire.id, openaire.officialname, "", "", openaire.journal_id, isOpenAccess) + case Constants.ISSN => HostedByItemType(openaire.id, openaire.officialname, openaire.journal_id, "", "", isOpenAccess) + case Constants.EISSN => HostedByItemType(openaire.id, openaire.officialname, "", openaire.journal_id, "", isOpenAccess) + case Constants.ISSNL => HostedByItemType(openaire.id, openaire.officialname, "", "", openaire.journal_id, isOpenAccess) // catch the default with a variable so you can print it - case whoa => null + case whoa => null } } @@ -44,7 +42,7 @@ object SparkProduceHostedByMap { implicit val formats = org.json4s.DefaultFormats - val map: Map [String, HostedByItemType] = Map (input._1 -> input._2 ) + val map: Map[String, HostedByItemType] = Map(input._1 -> input._2) Serialization.write(map) @@ -52,34 +50,33 @@ object SparkProduceHostedByMap { } - - def getHostedByItemType(id:String, officialname: String, issn:String, eissn:String, issnl:String, oa:Boolean): HostedByItemType = { - if(issn != null){ - if(eissn != null){ - if(issnl != null){ - HostedByItemType(id, officialname, issn, eissn, issnl , oa) - }else{ - HostedByItemType(id, officialname, issn, eissn, "" , oa) + def getHostedByItemType(id: String, officialname: String, issn: String, eissn: String, issnl: String, oa: Boolean): HostedByItemType = { + if (issn != null) { + if (eissn != null) { + if (issnl != null) { + HostedByItemType(id, officialname, issn, eissn, issnl, oa) + } else { + HostedByItemType(id, officialname, issn, eissn, "", oa) } - }else{ - if(issnl != null){ - HostedByItemType(id, officialname, issn, "", issnl , oa) - }else{ - HostedByItemType(id, officialname, issn, "", "" , oa) + } else { + if (issnl != null) { + HostedByItemType(id, officialname, issn, "", issnl, oa) + } else { + HostedByItemType(id, officialname, issn, "", "", oa) } } - }else{ - if(eissn != null){ - if(issnl != null){ - HostedByItemType(id, officialname, "", eissn, issnl , oa) - }else{ - HostedByItemType(id, officialname, "", eissn, "" , oa) + } else { + if (eissn != null) { + if (issnl != null) { + HostedByItemType(id, officialname, "", eissn, issnl, oa) + } else { + HostedByItemType(id, officialname, "", eissn, "", oa) } - }else{ - if(issnl != null){ - HostedByItemType(id, officialname, "", "", issnl , oa) - }else{ - HostedByItemType("", "", "", "", "" , oa) + } else { + if (issnl != null) { + HostedByItemType(id, officialname, "", "", issnl, oa) + } else { + HostedByItemType("", "", "", "", "", oa) } } } @@ -90,10 +87,10 @@ object SparkProduceHostedByMap { return getHostedByItemType(dats.getId, dats.getOfficialname.getValue, dats.getJournal.getIssnPrinted, dats.getJournal.getIssnOnline, dats.getJournal.getIssnLinking, false) } - HostedByItemType("","","","","",false) + HostedByItemType("", "", "", "", "", false) } - def oaHostedByDataset(spark:SparkSession, datasourcePath : String) : Dataset[HostedByItemType] = { + def oaHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = { import spark.implicits._ @@ -102,10 +99,10 @@ object SparkProduceHostedByMap { implicit var encoderD = Encoders.kryo[Datasource] - val dd : Dataset[Datasource] = spark.read.textFile(datasourcePath) + val dd: Dataset[Datasource] = spark.read.textFile(datasourcePath) .map(r => mapper.readValue(r, classOf[Datasource])) - dd.map{ddt => oaToHostedbyItemType(ddt)}.filter(hb => !(hb.id.equals(""))) + dd.map { ddt => oaToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals(""))) } @@ -115,17 +112,17 @@ object SparkProduceHostedByMap { } - def goldHostedByDataset(spark:SparkSession, datasourcePath:String) : Dataset[HostedByItemType] = { + def goldHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = { import spark.implicits._ implicit val mapEncoderUnibi: Encoder[UnibiGoldModel] = Encoders.kryo[UnibiGoldModel] val mapper = new ObjectMapper() - val dd : Dataset[UnibiGoldModel] = spark.read.textFile(datasourcePath) + val dd: Dataset[UnibiGoldModel] = spark.read.textFile(datasourcePath) .map(r => mapper.readValue(r, classOf[UnibiGoldModel])) - dd.map{ddt => goldToHostedbyItemType(ddt)}.filter(hb => !(hb.id.equals(""))) + dd.map { ddt => goldToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals(""))) } @@ -134,41 +131,40 @@ object SparkProduceHostedByMap { return getHostedByItemType(Constants.DOAJ, doaj.getJournalTitle, doaj.getIssn, doaj.getEissn, "", true) } - def doajHostedByDataset(spark:SparkSession, datasourcePath:String) : Dataset[HostedByItemType] = { + def doajHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = { import spark.implicits._ implicit val mapEncoderDOAJ: Encoder[DOAJModel] = Encoders.kryo[DOAJModel] val mapper = new ObjectMapper() - val dd : Dataset[DOAJModel] = spark.read.textFile(datasourcePath) + val dd: Dataset[DOAJModel] = spark.read.textFile(datasourcePath) .map(r => mapper.readValue(r, classOf[DOAJModel])) - dd.map{ddt => doajToHostedbyItemType(ddt)}.filter(hb => !(hb.id.equals(""))) + dd.map { ddt => doajToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals(""))) } def toList(input: HostedByItemType): List[(String, HostedByItemType)] = { - var lst : List[(String, HostedByItemType)] = List() - if(!input.issn.equals("")){ + var lst: List[(String, HostedByItemType)] = List() + if (!input.issn.equals("")) { lst = (input.issn, input) :: lst } - if(!input.eissn.equals("")){ + if (!input.eissn.equals("")) { lst = (input.eissn, input) :: lst } - if(!input.lissn.equals("")){ + if (!input.lissn.equals("")) { lst = (input.lissn, input) :: lst } lst } - - def writeToHDFS(input: Array[String], outputPath: String, hdfsNameNode : String):Unit = { + def writeToHDFS(input: Array[String], outputPath: String, hdfsNameNode: String): Unit = { val conf = new Configuration() conf.set("fs.defaultFS", hdfsNameNode) - val fs= FileSystem.get(conf) + val fs = FileSystem.get(conf) val output = fs.create(new Path(outputPath)) val writer = new PrintWriter(output) try { @@ -182,7 +178,6 @@ object SparkProduceHostedByMap { } - def main(args: Array[String]): Unit = { val logger: Logger = LoggerFactory.getLogger(getClass) @@ -213,7 +208,7 @@ object SparkProduceHostedByMap { .union(doajHostedByDataset(spark, workingDirPath + "/doaj.json")) .flatMap(hbi => toList(hbi))).filter(hbi => hbi._2.id.startsWith("10|")) .map(hbi => toHostedByMap(hbi))(Encoders.STRING) - .rdd.saveAsTextFile(outputPath , classOf[GzipCodec]) + .rdd.saveAsTextFile(outputPath, classOf[GzipCodec]) } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala similarity index 88% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala index c7ad1890de..91d2bafe3d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala @@ -4,17 +4,11 @@ import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.common.HdfsSupport import eu.dnetlib.dhp.schema.common.ModelSupport -import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo import eu.dnetlib.dhp.schema.oaf.Oaf import eu.dnetlib.dhp.utils.DHPUtils -import org.apache.commons.io.IOUtils -import org.apache.commons.lang3.StringUtils -import org.apache.http.client.methods.HttpGet -import org.apache.http.impl.client.HttpClients import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.{SparkConf, SparkContext} import org.slf4j.LoggerFactory - import scala.collection.JavaConverters._ import scala.io.Source @@ -59,7 +53,7 @@ object CopyHdfsOafSparkApplication { if (validPaths.nonEmpty) { val oaf = spark.read.load(validPaths: _*).as[Oaf] val mapper = new ObjectMapper() - val l =ModelSupport.oafTypes.entrySet.asScala.map(e => e.getKey).toList + val l = ModelSupport.oafTypes.entrySet.asScala.map(e => e.getKey).toList l.foreach( e => oaf.filter(o => o.getClass.getSimpleName.equalsIgnoreCase(e)) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala similarity index 95% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala index be217c5c3f..8e15063c28 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala @@ -2,9 +2,8 @@ package eu.dnetlib.dhp.oa.graph.resolution import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.common.HdfsSupport import eu.dnetlib.dhp.schema.common.EntityType -import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset} +import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset,_} import org.apache.commons.io.IOUtils import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkConf @@ -75,7 +74,7 @@ object SparkResolveEntities { } } - def generateResolvedEntities(spark: SparkSession, workingPath: String, graphBasePath: String, targetPath:String) = { + def generateResolvedEntities(spark: SparkSession, workingPath: String, graphBasePath: String, targetPath: String) = { implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) import spark.implicits._ diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala similarity index 99% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala index a194f26946..80c09940f0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.oa.graph.resolution import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.common.HdfsSupport -import eu.dnetlib.dhp.schema.oaf.{Relation, Result} +import eu.dnetlib.dhp.schema.oaf.Relation import eu.dnetlib.dhp.utils.DHPUtils import org.apache.commons.io.IOUtils import org.apache.hadoop.fs.{FileSystem, Path} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala similarity index 96% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala index 9e905d806b..9df3b41bdc 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala @@ -18,7 +18,6 @@ object SparkDataciteToOAF { .config(conf) .appName(getClass.getSimpleName) .master(parser.get("master")).getOrCreate() - import spark.implicits._ val sc = spark.sparkContext diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala similarity index 69% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala index 3ee0c7dd6a..9d16cf907e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala @@ -2,7 +2,7 @@ package eu.dnetlib.dhp.sx.graph import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.{Oaf, OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset} +import eu.dnetlib.dhp.schema.oaf.Result import org.apache.commons.io.IOUtils import org.apache.hadoop.io.compress.GzipCodec import org.apache.spark.SparkConf @@ -29,13 +29,13 @@ object SparkConvertDatasetToJsonRDD { val targetPath = parser.get("targetPath") log.info(s"targetPath -> $targetPath") - val resultObject = List("publication","dataset","software", "otherResearchProduct") + val resultObject = List("publication", "dataset", "software", "otherResearchProduct") val mapper = new ObjectMapper() - implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) + implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) - resultObject.foreach{item => - spark.read.load(s"$sourcePath/$item").as[Result].map(r=> mapper.writeValueAsString(r))(Encoders.STRING).rdd.saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec]) + resultObject.foreach { item => + spark.read.load(s"$sourcePath/$item").as[Result].map(r => mapper.writeValueAsString(r))(Encoders.STRING).rdd.saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec]) } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala similarity index 83% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala index 846ac37af2..cc1b97fd6e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala @@ -5,10 +5,10 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.schema.sx.scholix.Scholix import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary import org.apache.commons.io.IOUtils +import org.apache.hadoop.io.compress.GzipCodec import org.apache.spark.SparkConf import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} import org.slf4j.{Logger, LoggerFactory} -import org.apache.hadoop.io.compress._ object SparkConvertObjectToJson { @@ -32,8 +32,8 @@ object SparkConvertObjectToJson { log.info(s"objectType -> $objectType") - implicit val scholixEncoder :Encoder[Scholix]= Encoders.kryo[Scholix] - implicit val summaryEncoder :Encoder[ScholixSummary]= Encoders.kryo[ScholixSummary] + implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix] + implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] val mapper = new ObjectMapper @@ -42,11 +42,11 @@ object SparkConvertObjectToJson { case "scholix" => log.info("Serialize Scholix") val d: Dataset[Scholix] = spark.read.load(sourcePath).as[Scholix] - d.map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.repartition(6000).saveAsTextFile(targetPath, classOf[GzipCodec]) + d.map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.repartition(6000).saveAsTextFile(targetPath, classOf[GzipCodec]) case "summary" => log.info("Serialize Summary") val d: Dataset[ScholixSummary] = spark.read.load(sourcePath).as[ScholixSummary] - d.map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.repartition(1000).saveAsTextFile(targetPath, classOf[GzipCodec]) + d.map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.repartition(1000).saveAsTextFile(targetPath, classOf[GzipCodec]) } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala similarity index 62% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala index 4b82fe6455..23f039c706 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala @@ -2,11 +2,12 @@ package eu.dnetlib.dhp.sx.graph import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset} +import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Software,Dataset => OafDataset} import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.slf4j.{Logger, LoggerFactory} + object SparkConvertRDDtoDataset { def main(args: Array[String]): Unit = { @@ -31,39 +32,39 @@ object SparkConvertRDDtoDataset { val entityPath = s"$t/entities" val relPath = s"$t/relation" val mapper = new ObjectMapper() - implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset]) - implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication]) - implicit val relationEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) - implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct]) - implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software]) + implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset]) + implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication]) + implicit val relationEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) + implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct]) + implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software]) log.info("Converting dataset") - val rddDataset =spark.sparkContext.textFile(s"$sourcePath/dataset").map(s => mapper.readValue(s, classOf[OafDataset])) + val rddDataset = spark.sparkContext.textFile(s"$sourcePath/dataset").map(s => mapper.readValue(s, classOf[OafDataset])) spark.createDataset(rddDataset).as[OafDataset].write.mode(SaveMode.Overwrite).save(s"$entityPath/dataset") log.info("Converting publication") - val rddPublication =spark.sparkContext.textFile(s"$sourcePath/publication").map(s => mapper.readValue(s, classOf[Publication])) + val rddPublication = spark.sparkContext.textFile(s"$sourcePath/publication").map(s => mapper.readValue(s, classOf[Publication])) spark.createDataset(rddPublication).as[Publication].write.mode(SaveMode.Overwrite).save(s"$entityPath/publication") log.info("Converting software") - val rddSoftware =spark.sparkContext.textFile(s"$sourcePath/software").map(s => mapper.readValue(s, classOf[Software])) + val rddSoftware = spark.sparkContext.textFile(s"$sourcePath/software").map(s => mapper.readValue(s, classOf[Software])) spark.createDataset(rddSoftware).as[Software].write.mode(SaveMode.Overwrite).save(s"$entityPath/software") log.info("Converting otherresearchproduct") - val rddOtherResearchProduct =spark.sparkContext.textFile(s"$sourcePath/otherresearchproduct").map(s => mapper.readValue(s, classOf[OtherResearchProduct])) + val rddOtherResearchProduct = spark.sparkContext.textFile(s"$sourcePath/otherresearchproduct").map(s => mapper.readValue(s, classOf[OtherResearchProduct])) spark.createDataset(rddOtherResearchProduct).as[OtherResearchProduct].write.mode(SaveMode.Overwrite).save(s"$entityPath/otherresearchproduct") log.info("Converting Relation") - val relationSemanticFilter = List("cites", "iscitedby","merges", "ismergedin") + val relationSemanticFilter = List("cites", "iscitedby", "merges", "ismergedin") - val rddRelation =spark.sparkContext.textFile(s"$sourcePath/relation") + val rddRelation = spark.sparkContext.textFile(s"$sourcePath/relation") .map(s => mapper.readValue(s, classOf[Relation])) - .filter(r=> r.getSource.startsWith("50") && r.getTarget.startsWith("50")) + .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50")) .filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass))) spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath") diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala similarity index 76% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala index 350b00c5ea..ed88cfaa64 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala @@ -1,14 +1,12 @@ package eu.dnetlib.dhp.sx.graph import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.{Oaf, OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset} +import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset,_} import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql._ import org.slf4j.{Logger, LoggerFactory} - - object SparkCreateInputGraph { def main(args: Array[String]): Unit = { @@ -33,7 +31,7 @@ object SparkCreateInputGraph { ) - implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) + implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication]) implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset]) implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software]) @@ -41,16 +39,13 @@ object SparkCreateInputGraph { implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) - - - val sourcePath = parser.get("sourcePath") log.info(s"sourcePath -> $sourcePath") val targetPath = parser.get("targetPath") log.info(s"targetPath -> $targetPath") - val oafDs:Dataset[Oaf] = spark.read.load(s"$sourcePath/*").as[Oaf] + val oafDs: Dataset[Oaf] = spark.read.load(s"$sourcePath/*").as[Oaf] log.info("Extract Publication") @@ -70,27 +65,27 @@ object SparkCreateInputGraph { resultObject.foreach { r => log.info(s"Make ${r._1} unique") - makeDatasetUnique(s"$targetPath/extracted/${r._1}",s"$targetPath/preprocess/${r._1}",spark, r._2) + makeDatasetUnique(s"$targetPath/extracted/${r._1}", s"$targetPath/preprocess/${r._1}", spark, r._2) } } - def extractEntities[T <: Oaf ](oafDs:Dataset[Oaf], targetPath:String, clazz:Class[T], log:Logger) :Unit = { + def extractEntities[T <: Oaf](oafDs: Dataset[Oaf], targetPath: String, clazz: Class[T], log: Logger): Unit = { - implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz) + implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz) log.info(s"Extract ${clazz.getSimpleName}") oafDs.filter(o => o.isInstanceOf[T]).map(p => p.asInstanceOf[T]).write.mode(SaveMode.Overwrite).save(targetPath) } - def makeDatasetUnique[T <: Result ](sourcePath:String, targetPath:String, spark:SparkSession, clazz:Class[T]) :Unit = { + def makeDatasetUnique[T <: Result](sourcePath: String, targetPath: String, spark: SparkSession, clazz: Class[T]): Unit = { import spark.implicits._ - implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz) + implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz) - val ds:Dataset[T] = spark.read.load(sourcePath).as[T] + val ds: Dataset[T] = spark.read.load(sourcePath).as[T] - ds.groupByKey(_.getId).reduceGroups{(x,y) => + ds.groupByKey(_.getId).reduceGroups { (x, y) => x.mergeFrom(y) x }.map(_._2).write.mode(SaveMode.Overwrite).save(targetPath) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala similarity index 76% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala index e4fcd27824..9930c57af9 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala @@ -9,7 +9,7 @@ import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils.RelatedEntities import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.sql.functions.count -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql._ import org.slf4j.{Logger, LoggerFactory} object SparkCreateScholix { @@ -42,7 +42,7 @@ object SparkCreateScholix { val relationDS: Dataset[(String, Relation)] = spark.read.load(relationPath).as[Relation] - .filter(r => (r.getDataInfo== null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge")) + .filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge")) .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder)) val summaryDS: Dataset[(String, ScholixSummary)] = spark.read.load(summaryPath).as[ScholixSummary] @@ -51,54 +51,54 @@ object SparkCreateScholix { relationDS.joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left") .map { input: ((String, Relation), (String, ScholixSummary)) => - if (input._1!= null && input._2!= null) { + if (input._1 != null && input._2 != null) { val rel: Relation = input._1._2 val source: ScholixSummary = input._2._2 (rel.getTarget, ScholixUtils.scholixFromSource(rel, source)) } - else null + else null }(Encoders.tuple(Encoders.STRING, scholixEncoder)) - .filter(r => r!= null) + .filter(r => r != null) .write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_from_source") val scholixSource: Dataset[(String, Scholix)] = spark.read.load(s"$targetPath/scholix_from_source").as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder)) scholixSource.joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left") .map { input: ((String, Scholix), (String, ScholixSummary)) => - if (input._2== null) { + if (input._2 == null) { null } else { val s: Scholix = input._1._2 val target: ScholixSummary = input._2._2 ScholixUtils.generateCompleteScholix(s, target) } - }.filter(s => s!= null).write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_one_verse") + }.filter(s => s != null).write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_one_verse") val scholix_o_v: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix] scholix_o_v.flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s))).as[Scholix] - .map(s=> (s.getIdentifier,s))(Encoders.tuple(Encoders.STRING, scholixEncoder)) + .map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, scholixEncoder)) .groupByKey(_._1) .agg(ScholixUtils.scholixAggregator.toColumn) .map(s => s._2) .write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix") - val scholix_final:Dataset[Scholix] = spark.read.load(s"$targetPath/scholix").as[Scholix] + val scholix_final: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix").as[Scholix] - val stats:Dataset[(String,String,Long)]= scholix_final.map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType)).groupBy("_1", "_2").agg(count("_1")).as[(String,String,Long)] + val stats: Dataset[(String, String, Long)] = scholix_final.map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType)).groupBy("_1", "_2").agg(count("_1")).as[(String, String, Long)] stats - .map(s => RelatedEntities(s._1, if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0, if ("publication".equalsIgnoreCase(s._2)) s._3 else 0 )) + .map(s => RelatedEntities(s._1, if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0, if ("publication".equalsIgnoreCase(s._2)) s._3 else 0)) .groupByKey(_.id) - .reduceGroups((a, b) => RelatedEntities(a.id, a.relatedDataset+b.relatedDataset, a.relatedPublication+b.relatedPublication)) + .reduceGroups((a, b) => RelatedEntities(a.id, a.relatedDataset + b.relatedDataset, a.relatedPublication + b.relatedPublication)) .map(_._2) .write.mode(SaveMode.Overwrite).save(s"$targetPath/related_entities") - val relatedEntitiesDS:Dataset[RelatedEntities] = spark.read.load(s"$targetPath/related_entities").as[RelatedEntities].filter(r => r.relatedPublication>0 || r.relatedDataset > 0) + val relatedEntitiesDS: Dataset[RelatedEntities] = spark.read.load(s"$targetPath/related_entities").as[RelatedEntities].filter(r => r.relatedPublication > 0 || r.relatedDataset > 0) - relatedEntitiesDS.joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner").map{i => + relatedEntitiesDS.joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner").map { i => val re = i._1 val sum = i._2._2 diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala similarity index 68% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala index 0970375f5c..4274cae5a6 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala @@ -6,7 +6,7 @@ import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql._ import org.slf4j.{Logger, LoggerFactory} object SparkCreateSummaryObject { @@ -28,15 +28,15 @@ object SparkCreateSummaryObject { val targetPath = parser.get("targetPath") log.info(s"targetPath -> $targetPath") - implicit val resultEncoder:Encoder[Result] = Encoders.kryo[Result] - implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result] + implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] - implicit val summaryEncoder:Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] + implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] - val ds:Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Result].filter(r=>r.getDataInfo== null || r.getDataInfo.getDeletedbyinference== false) + val ds: Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Result].filter(r => r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) - ds.repartition(6000).map(r => ScholixUtils.resultToSummary(r)).filter(s => s!= null).write.mode(SaveMode.Overwrite).save(targetPath) + ds.repartition(6000).map(r => ScholixUtils.resultToSummary(r)).filter(s => s != null).write.mode(SaveMode.Overwrite).save(targetPath) } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala similarity index 99% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala index 193512474e..c70397d042 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala @@ -5,6 +5,7 @@ import org.apache.spark.sql.{Encoder, Encoders} import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse + import java.util.regex.Pattern import scala.language.postfixOps import scala.xml.{Elem, Node, XML} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala similarity index 83% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala index 79c75d6df7..2717b7b804 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala @@ -2,11 +2,11 @@ package eu.dnetlib.dhp.sx.graph.pangaea import eu.dnetlib.dhp.application.ArgumentApplicationParser import org.apache.spark.rdd.RDD -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.{SparkConf, SparkContext} import org.slf4j.{Logger, LoggerFactory} - import scala.collection.JavaConverters._ + import scala.io.Source object SparkGeneratePanagaeaDataset { @@ -28,17 +28,17 @@ object SparkGeneratePanagaeaDataset { parser.getObjectMap.asScala.foreach(s => logger.info(s"${s._1} -> ${s._2}")) logger.info("Converting sequential file into Dataset") - val sc:SparkContext = spark.sparkContext + val sc: SparkContext = spark.sparkContext - val workingPath:String = parser.get("workingPath") + val workingPath: String = parser.get("workingPath") implicit val pangaeaEncoders: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel] - val inputRDD:RDD[PangaeaDataModel] = sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s)) + val inputRDD: RDD[PangaeaDataModel] = sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s)) spark.createDataset(inputRDD).as[PangaeaDataModel] - .map(s => (s.identifier,s))(Encoders.tuple(Encoders.STRING, pangaeaEncoders)) - .groupByKey(_._1)(Encoders.STRING) + .map(s => (s.identifier, s))(Encoders.tuple(Encoders.STRING, pangaeaEncoders)) + .groupByKey(_._1)(Encoders.STRING) .agg(PangaeaUtils.getDatasetAggregator().toColumn) .map(s => s._2) .write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset") @@ -46,7 +46,4 @@ object SparkGeneratePanagaeaDataset { } - - - } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala similarity index 61% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala index 93c554e048..bf81a26d46 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala @@ -1,6 +1,5 @@ package eu.dnetlib.dhp.sx.graph.scholix - import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Result, StructuredProperty} import eu.dnetlib.dhp.schema.sx.scholix._ import eu.dnetlib.dhp.schema.sx.summary.{CollectedFromType, SchemeValue, ScholixSummary, Typology} @@ -10,23 +9,22 @@ import org.apache.spark.sql.{Encoder, Encoders} import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse - import scala.collection.JavaConverters._ import scala.io.Source -import scala.language.postfixOps object ScholixUtils { val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier" - val DATE_RELATION_KEY:String = "RelationDate" - case class RelationVocabulary(original:String, inverse:String){} + val DATE_RELATION_KEY: String = "RelationDate" - case class RelatedEntities(id:String, relatedDataset:Long, relatedPublication:Long){} + case class RelationVocabulary(original: String, inverse: String) {} - val relations:Map[String, RelationVocabulary] = { - val input =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/relations.json")).mkString + case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {} + + val relations: Map[String, RelationVocabulary] = { + val input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/relations.json")).mkString implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) @@ -35,12 +33,12 @@ object ScholixUtils { } - def extractRelationDate(relation: Relation):String = { + def extractRelationDate(relation: Relation): String = { - if (relation.getProperties== null || !relation.getProperties.isEmpty) + if (relation.getProperties == null || !relation.getProperties.isEmpty) null else { - val date =relation.getProperties.asScala.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey)).map(p => p.getValue) + val date = relation.getProperties.asScala.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey)).map(p => p.getValue) if (date.isDefined) date.get else @@ -48,9 +46,9 @@ object ScholixUtils { } } - def extractRelationDate(summary: ScholixSummary):String = { + def extractRelationDate(summary: ScholixSummary): String = { - if(summary.getDate== null || summary.getDate.isEmpty) + if (summary.getDate == null || summary.getDate.isEmpty) null else { summary.getDate.get(0) @@ -59,15 +57,14 @@ object ScholixUtils { } - def inverseRelationShip(rel:ScholixRelationship):ScholixRelationship = { + def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = { new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName) } - - val statsAggregator:Aggregator[(String,String, Long), RelatedEntities, RelatedEntities] = new Aggregator[(String,String, Long), RelatedEntities, RelatedEntities] with Serializable { + val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable { override def zero: RelatedEntities = null override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = { @@ -78,17 +75,16 @@ object ScholixUtils { if (b == null) RelatedEntities(a._1, relatedDataset, relatedPublication) else - RelatedEntities(a._1,b.relatedDataset+ relatedDataset, b.relatedPublication+ relatedPublication ) + RelatedEntities(a._1, b.relatedDataset + relatedDataset, b.relatedPublication + relatedPublication) } override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = { - if (b1!= null && b2!= null) - RelatedEntities(b1.id, b1.relatedDataset+ b2.relatedDataset, b1.relatedPublication+ b2.relatedPublication) + if (b1 != null && b2 != null) + RelatedEntities(b1.id, b1.relatedDataset + b2.relatedDataset, b1.relatedPublication + b2.relatedPublication) + else if (b1 != null) + b1 else - if (b1!= null) - b1 - else b2 } @@ -104,12 +100,12 @@ object ScholixUtils { override def zero: Scholix = null - def scholix_complete(s:Scholix):Boolean ={ - if (s== null || s.getIdentifier==null) { + def scholix_complete(s: Scholix): Boolean = { + if (s == null || s.getIdentifier == null) { false } else if (s.getSource == null || s.getTarget == null) { - false - } + false + } else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty) false else @@ -121,7 +117,7 @@ object ScholixUtils { } override def merge(b1: Scholix, b2: Scholix): Scholix = { - if (scholix_complete(b1)) b1 else b2 + if (scholix_complete(b1)) b1 else b2 } override def finish(reduction: Scholix): Scholix = reduction @@ -132,7 +128,7 @@ object ScholixUtils { } - def createInverseScholixRelation(scholix: Scholix):Scholix = { + def createInverseScholixRelation(scholix: Scholix): Scholix = { val s = new Scholix s.setPublicationDate(scholix.getPublicationDate) s.setPublisher(scholix.getPublisher) @@ -144,34 +140,33 @@ object ScholixUtils { s - } - def extractCollectedFrom(summary:ScholixSummary): List[ScholixEntityId] = { - if (summary.getDatasources!= null && !summary.getDatasources.isEmpty) { - val l: List[ScholixEntityId] = summary.getDatasources.asScala.map{ + def extractCollectedFrom(summary: ScholixSummary): List[ScholixEntityId] = { + if (summary.getDatasources != null && !summary.getDatasources.isEmpty) { + val l: List[ScholixEntityId] = summary.getDatasources.asScala.map { d => new ScholixEntityId(d.getDatasourceName, List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava) }(collection.breakOut) - l + l } else List() } - def extractCollectedFrom(relation: Relation) : List[ScholixEntityId] = { + def extractCollectedFrom(relation: Relation): List[ScholixEntityId] = { if (relation.getCollectedfrom != null && !relation.getCollectedfrom.isEmpty) { val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map { c => - new ScholixEntityId(c.getValue, List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA,null)).asJava) + new ScholixEntityId(c.getValue, List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava) }(collection breakOut) l } else List() } - def generateCompleteScholix(scholix: Scholix, target:ScholixSummary): Scholix = { + def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = { val s = new Scholix s.setPublicationDate(scholix.getPublicationDate) s.setPublisher(scholix.getPublisher) @@ -192,29 +187,28 @@ object ScholixUtils { r.setObjectType(summaryObject.getTypology.toString) r.setObjectSubType(summaryObject.getSubType) - if (summaryObject.getTitle!= null && !summaryObject.getTitle.isEmpty) - r.setTitle(summaryObject.getTitle.get(0)) + if (summaryObject.getTitle != null && !summaryObject.getTitle.isEmpty) + r.setTitle(summaryObject.getTitle.get(0)) - if (summaryObject.getAuthor!= null && !summaryObject.getAuthor.isEmpty){ - val l:List[ScholixEntityId] = summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a,null)).toList + if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) { + val l: List[ScholixEntityId] = summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList if (l.nonEmpty) r.setCreator(l.asJava) } - if (summaryObject.getDate!= null && !summaryObject.getDate.isEmpty) + if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty) r.setPublicationDate(summaryObject.getDate.get(0)) - if (summaryObject.getPublisher!= null && !summaryObject.getPublisher.isEmpty) - { - val plist:List[ScholixEntityId] =summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList + if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) { + val plist: List[ScholixEntityId] = summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList if (plist.nonEmpty) r.setPublisher(plist.asJava) } - if (summaryObject.getDatasources!= null && !summaryObject.getDatasources.isEmpty) { + if (summaryObject.getDatasources != null && !summaryObject.getDatasources.isEmpty) { - val l:List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala.map(c => new ScholixCollectedFrom( + val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala.map(c => new ScholixCollectedFrom( new ScholixEntityId(c.getDatasourceName, List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava) , "collected", "complete" @@ -228,12 +222,9 @@ object ScholixUtils { } + def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = { - - - def scholixFromSource(relation:Relation, source:ScholixSummary):Scholix = { - - if (relation== null || source== null) + if (relation == null || source == null) return null val s = new Scholix @@ -253,9 +244,9 @@ object ScholixUtils { s.setPublicationDate(d) - if (source.getPublisher!= null && !source.getPublisher.isEmpty) { + if (source.getPublisher != null && !source.getPublisher.isEmpty) { val l: List[ScholixEntityId] = source.getPublisher.asScala - .map{ + .map { p => new ScholixEntityId(p, null) }(collection.breakOut) @@ -265,7 +256,7 @@ object ScholixUtils { } val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null) - if (semanticRelation== null) + if (semanticRelation == null) return null s.setRelationship(new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)) s.setSource(generateScholixResourceFromSummary(source)) @@ -274,8 +265,8 @@ object ScholixUtils { } - def findURLForPID(pidValue:List[StructuredProperty], urls:List[String]):List[(StructuredProperty, String)] = { - pidValue.map{ + def findURLForPID(pidValue: List[StructuredProperty], urls: List[String]): List[(StructuredProperty, String)] = { + pidValue.map { p => val pv = p.getValue @@ -285,67 +276,67 @@ object ScholixUtils { } - def extractTypedIdentifierFromInstance(r:Result):List[ScholixIdentifier] = { + def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = { if (r.getInstance() == null || r.getInstance().isEmpty) return List() - r.getInstance().asScala.filter(i => i.getUrl!= null && !i.getUrl.isEmpty) - .filter(i => i.getPid!= null && i.getUrl != null) + r.getInstance().asScala.filter(i => i.getUrl != null && !i.getUrl.isEmpty) + .filter(i => i.getPid != null && i.getUrl != null) .flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList)) .map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2)).distinct.toList } - def resultToSummary(r:Result):ScholixSummary = { + def resultToSummary(r: Result): ScholixSummary = { val s = new ScholixSummary s.setId(r.getId) if (r.getPid == null || r.getPid.isEmpty) return null - val persistentIdentifiers:List[ScholixIdentifier] = extractTypedIdentifierFromInstance(r) + val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(r) if (persistentIdentifiers.isEmpty) return null s.setLocalIdentifier(persistentIdentifiers.asJava) - if (r.isInstanceOf[Publication] ) + if (r.isInstanceOf[Publication]) s.setTypology(Typology.publication) else s.setTypology(Typology.dataset) s.setSubType(r.getInstance().get(0).getInstancetype.getClassname) - if (r.getTitle!= null && r.getTitle.asScala.nonEmpty) { - val titles:List[String] =r.getTitle.asScala.map(t => t.getValue)(collection breakOut) + if (r.getTitle != null && r.getTitle.asScala.nonEmpty) { + val titles: List[String] = r.getTitle.asScala.map(t => t.getValue)(collection breakOut) if (titles.nonEmpty) s.setTitle(titles.asJava) else - return null + return null } - if(r.getAuthor!= null && !r.getAuthor.isEmpty) { - val authors:List[String] = r.getAuthor.asScala.map(a=> a.getFullname)(collection breakOut) + if (r.getAuthor != null && !r.getAuthor.isEmpty) { + val authors: List[String] = r.getAuthor.asScala.map(a => a.getFullname)(collection breakOut) if (authors nonEmpty) s.setAuthor(authors.asJava) } if (r.getInstance() != null) { - val dt:List[String] = r.getInstance().asScala.filter(i => i.getDateofacceptance != null).map(i => i.getDateofacceptance.getValue)(collection.breakOut) + val dt: List[String] = r.getInstance().asScala.filter(i => i.getDateofacceptance != null).map(i => i.getDateofacceptance.getValue)(collection.breakOut) if (dt.nonEmpty) s.setDate(dt.distinct.asJava) } - if (r.getDescription!= null && !r.getDescription.isEmpty) { - val d = r.getDescription.asScala.find(f => f!= null && f.getValue!=null) + if (r.getDescription != null && !r.getDescription.isEmpty) { + val d = r.getDescription.asScala.find(f => f != null && f.getValue != null) if (d.isDefined) s.setDescription(d.get.getValue) } - if (r.getSubject!= null && !r.getSubject.isEmpty) { - val subjects:List[SchemeValue] =r.getSubject.asScala.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))(collection breakOut) + if (r.getSubject != null && !r.getSubject.isEmpty) { + val subjects: List[SchemeValue] = r.getSubject.asScala.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))(collection breakOut) if (subjects.nonEmpty) s.setSubject(subjects.asJava) } - if (r.getPublisher!= null) + if (r.getPublisher != null) s.setPublisher(List(r.getPublisher.getValue).asJava) - if (r.getCollectedfrom!= null && !r.getCollectedfrom.isEmpty) { - val cf:List[CollectedFromType] = r.getCollectedfrom.asScala.map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))(collection breakOut) + if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) { + val cf: List[CollectedFromType] = r.getCollectedfrom.asScala.map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))(collection breakOut) if (cf.nonEmpty) s.setDatasources(cf.distinct.asJava) } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/DumpJobTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/DumpJobTest.java index 602aaf6e66..07f6638965 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/DumpJobTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/DumpJobTest.java @@ -171,24 +171,6 @@ public class DumpJobTest { GraphResult gr = verificationDataset.first(); - Assertions.assertEquals(2, gr.getMeasures().size()); - Assertions - .assertTrue( - gr - .getMeasures() - .stream() - .anyMatch( - m -> m.getKey().equals("influence") - && m.getValue().equals("1.62759106106e-08"))); - Assertions - .assertTrue( - gr - .getMeasures() - .stream() - .anyMatch( - m -> m.getKey().equals("popularity") - && m.getValue().equals("0.22519296"))); - Assertions.assertEquals(6, gr.getAuthor().size()); Assertions .assertTrue( diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/addProjectInfo/publication_extendedmodel b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/addProjectInfo/publication_extendedmodel index 6b146405a9..b56b30fa52 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/addProjectInfo/publication_extendedmodel +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/addProjectInfo/publication_extendedmodel @@ -1,2 +1,2 @@ -{"measures":[{"key":"influence","value":"1.62759106106e-08"},{"key":"popularity","value":"0.22519296"}],"author":[{"fullname":"Nikolaidou,Charitini","name":"Charitini","surname":"Nikolaidou","rank":1,"pid":null},{"fullname":"Votsi,Nefta","name":"Nefta","surname":"Votsi","rank":2,"pid":{"id":{"scheme":"orcid","value":"0000-0001-6651-1178"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}}},{"fullname":"Sgardelis,Steanos","name":"Steanos","surname":"Sgardelis","rank":3,"pid":{"id":{"scheme":"orcid_pending","value":"0000-0001-6651-1178"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}}},{"fullname":"Halley,John","name":"John","surname":"Halley","rank":4,"pid":null},{"fullname":"Pantis,John","name":"John","surname":"Pantis","rank":5,"pid":{"id":{"scheme":"orcid","value":"0000-0001-6651-1178"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}}},{"fullname":"Tsiafouli,Maria","name":"Maria","surname":"Tsiafouli","rank":6,"pid":{"id":{"scheme":"orcid_pending","value":"0000-0001-6651-1178"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}}}],"type":"publication","language":{"code":"eng","label":"English"},"country":[{"code":"IT","label":"Italy","provenance":null}],"subjects":[{"subject":{"scheme":"ACM","value":"Ecosystem Services hotspots"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Natura 2000"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Quiet Protected Areas"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Biodiversity"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Agriculture"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Elevation"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Slope"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Ecosystem Service trade-offs and synergies"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":" cultural services"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"provisioning services"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"regulating services"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"supporting services"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}}],"maintitle":"Ecosystem Service capacity is higher in areas of multiple designation types","subtitle":null,"description":["The implementation of the Ecosystem Service (ES) concept into practice might be a challenging task as it has to take into account previous “traditional” policies and approaches that have evaluated nature and biodiversity differently. Among them the Habitat (92/43/EC) and Bird Directives (79/409/EC), the Water Framework Directive (2000/60/EC), and the Noise Directive (2002/49/EC) have led to the evaluation/designation of areas in Europe with different criteria. In this study our goal was to understand how the ES capacity of an area is related to its designation and if areas with multiple designations have higher capacity in providing ES. We selected four catchments in Greece with a great variety of characteristics covering over 25% of the national territory. Inside the catchments we assessed the ES capacity (following the methodology of Burkhard et al. 2009) of areas designated as Natura 2000 sites, Quiet areas and Wetlands or Water bodies and found those areas that have multiple designations. Data were analyzed by GLM to reveal differences regarding the ES capacity among the different types of areas. We also investigated by PCA synergies and trade-offs among different kinds of ES and tested for correlations among landscape properties, such as elevation, aspect and slope and the ES potential. Our results show that areas with different types or multiple designations have a different capacity in providing ES. Areas of one designation type (Protected or Quiet Areas) had in general intermediate scores in most ES but scores were higher compared to areas with no designation, which displayed stronger capacity in provisioning services. Among Protected Areas and Quiet Areas the latter scored better in general. Areas that combined both designation types (Protected and Quiet Areas) showed the highest capacity in 13 out of 29 ES, that were mostly linked with natural and forest ecosystems. We found significant synergies among most regulating, supporting and cultural ES which in turn display trade-offs with provisioning services. The different ES are spatially related and display strong correlation with landscape properties, such as elevation and slope. We suggest that the designation status of an area can be used as an alternative tool for environmental policy, indicating the capacity for ES provision. Multiple designations of areas can be used as proxies for locating ES “hotspots”. This integration of “traditional” evaluation and designation and the “newer” ES concept forms a time- and cost-effective way to be adopted by stakeholders and policy-makers in order to start complying with new standards and demands for nature conservation and environmental management."],"publicationdate":"2017-01-01","publisher":"Pensoft Publishers","embargoenddate":null,"source":["One Ecosystem 2: e13718"],"format":["text/html"],"contributor":[],"coverage":[],"bestaccessright":{"code":"c_abf2","label":"OPEN","scheme":"http://vocabularies.coar-repositories.org/documentation/access_rights/","openAccessRoute":null},"container":{"name":"One Ecosystem","issnPrinted":"","issnOnline":"2367-8194","issnLinking":"","ep":"","iss":"","sp":"","vol":"","edition":"","conferenceplace":null,"conferencedate":null},"documentationUrl":null,"codeRepositoryUrl":null,"programmingLanguage":null,"contactperson":null,"contactgroup":null,"tool":null,"size":null,"version":null,"geolocation":null,"id":"50|pensoft_____::00ea4a1cd53806a97d62ea6bf268f2a2","originalId":["50|pensoft_____::00ea4a1cd53806a97d62ea6bf268f2a2","10.3897/oneeco.2.e13718"],"pid":[{"scheme":"doi","value":"10.1016/j.triboint.2014.05.004"}],"dateofcollection":"2020-03-23T00:20:51.392Z","lastupdatetimestamp":1628257970612,"projects":null,"context":[{"code":"dh-ch","label":"Digital Humanities and Cultural Heritage","provenance":[{"provenance":"Inferred by OpenAIRE","trust":"0.9"}]}],"collectedfrom":[{"key":"10|openaire____::fdc7e0400d8c1634cdaf8051dbae23db","value":"Pensoft"}],"instance":[{"pid":[],"alternateIdentifier":[{"scheme":"doi","value":"10.3897/oneeco.2.e13718"}],"license":null,"accessright":{"code":"c_abf2","label":"OPEN","scheme":"http://vocabularies.coar-repositories.org/documentation/access_rights/","openAccessRoute":"green"},"type":"Article","url":["https://doi.org/10.3897/oneeco.2.e13718","https://oneecosystem.pensoft.net/article/13718/"],"articleprocessingcharge":null,"publicationdate":"2017-01-01","refereed":"peerReviewed","hostedby":{"key":"10|openaire____::e707e544b9a5bd23fc27fbfa65eb60dd","value":"One Ecosystem"},"collectedfrom":{"key":"10|openaire____::fdc7e0400d8c1634cdaf8051dbae23db","value":"Pensoft"}}]} -{"measures":[{"key":"influence","value":"1.62759106106e-08"},{"key":"popularity","value":"0.22519296"}],"author":[{"fullname":"Nikolaidou,Charitini","name":"Charitini","surname":"Nikolaidou","rank":1,"pid":null},{"fullname":"Votsi,Nefta","name":"Nefta","surname":"Votsi","rank":2,"pid":{"id":{"scheme":"orcid","value":"0000-0001-6651-1178"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}}},{"fullname":"Sgardelis,Steanos","name":"Steanos","surname":"Sgardelis","rank":3,"pid":{"id":{"scheme":"orcid_pending","value":"0000-0001-6651-1178"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}}},{"fullname":"Halley,John","name":"John","surname":"Halley","rank":4,"pid":null},{"fullname":"Pantis,John","name":"John","surname":"Pantis","rank":5,"pid":{"id":{"scheme":"orcid","value":"0000-0001-6651-1178"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}}},{"fullname":"Tsiafouli,Maria","name":"Maria","surname":"Tsiafouli","rank":6,"pid":{"id":{"scheme":"orcid_pending","value":"0000-0001-6651-1178"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}}}],"type":"publication","language":{"code":"eng","label":"English"},"country":[{"code":"IT","label":"Italy","provenance":null}],"subjects":[{"subject":{"scheme":"ACM","value":"Ecosystem Services hotspots"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Natura 2000"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Quiet Protected Areas"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Biodiversity"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Agriculture"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Elevation"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Slope"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Ecosystem Service trade-offs and synergies"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":" cultural services"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"provisioning services"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"regulating services"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"supporting services"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}}],"maintitle":"Ecosystem Service capacity is higher in areas of multiple designation types","subtitle":null,"description":["The implementation of the Ecosystem Service (ES) concept into practice might be a challenging task as it has to take into account previous “traditional” policies and approaches that have evaluated nature and biodiversity differently. Among them the Habitat (92/43/EC) and Bird Directives (79/409/EC), the Water Framework Directive (2000/60/EC), and the Noise Directive (2002/49/EC) have led to the evaluation/designation of areas in Europe with different criteria. In this study our goal was to understand how the ES capacity of an area is related to its designation and if areas with multiple designations have higher capacity in providing ES. We selected four catchments in Greece with a great variety of characteristics covering over 25% of the national territory. Inside the catchments we assessed the ES capacity (following the methodology of Burkhard et al. 2009) of areas designated as Natura 2000 sites, Quiet areas and Wetlands or Water bodies and found those areas that have multiple designations. Data were analyzed by GLM to reveal differences regarding the ES capacity among the different types of areas. We also investigated by PCA synergies and trade-offs among different kinds of ES and tested for correlations among landscape properties, such as elevation, aspect and slope and the ES potential. Our results show that areas with different types or multiple designations have a different capacity in providing ES. Areas of one designation type (Protected or Quiet Areas) had in general intermediate scores in most ES but scores were higher compared to areas with no designation, which displayed stronger capacity in provisioning services. Among Protected Areas and Quiet Areas the latter scored better in general. Areas that combined both designation types (Protected and Quiet Areas) showed the highest capacity in 13 out of 29 ES, that were mostly linked with natural and forest ecosystems. We found significant synergies among most regulating, supporting and cultural ES which in turn display trade-offs with provisioning services. The different ES are spatially related and display strong correlation with landscape properties, such as elevation and slope. We suggest that the designation status of an area can be used as an alternative tool for environmental policy, indicating the capacity for ES provision. Multiple designations of areas can be used as proxies for locating ES “hotspots”. This integration of “traditional” evaluation and designation and the “newer” ES concept forms a time- and cost-effective way to be adopted by stakeholders and policy-makers in order to start complying with new standards and demands for nature conservation and environmental management."],"publicationdate":"2017-01-01","publisher":"Pensoft Publishers","embargoenddate":null,"source":["One Ecosystem 2: e13718"],"format":["text/html"],"contributor":[],"coverage":[],"bestaccessright":{"code":"c_abf2","label":"OPEN","scheme":"http://vocabularies.coar-repositories.org/documentation/access_rights/","openAccessRoute":null},"container":{"name":"One Ecosystem","issnPrinted":"","issnOnline":"2367-8194","issnLinking":"","ep":"","iss":"","sp":"","vol":"","edition":"","conferenceplace":null,"conferencedate":null},"documentationUrl":null,"codeRepositoryUrl":null,"programmingLanguage":null,"contactperson":null,"contactgroup":null,"tool":null,"size":null,"version":null,"geolocation":null,"id":"50|fakeoft_____::00ea4a1cd53806a97d62ea6bf268f2a2","originalId":["50|pensoft_____::00ea4a1cd53806a97d62ea6bf268f2a2","10.3897/oneeco.2.e13718"],"pid":[{"scheme":"doi","value":"10.1016/j.triboint.2014.05.004"}],"dateofcollection":"2020-03-23T00:20:51.392Z","lastupdatetimestamp":1628257970612,"projects":null,"context":[{"code":"dh-ch","label":"Digital Humanities and Cultural Heritage","provenance":[{"provenance":"Inferred by OpenAIRE","trust":"0.9"}]}],"collectedfrom":[{"key":"10|openaire____::fdc7e0400d8c1634cdaf8051dbae23db","value":"Pensoft"}],"instance":[{"pid":[],"alternateIdentifier":[{"scheme":"doi","value":"10.3897/oneeco.2.e13718"}],"license":null,"accessright":{"code":"c_abf2","label":"OPEN","scheme":"http://vocabularies.coar-repositories.org/documentation/access_rights/","openAccessRoute":"green"},"type":"Article","url":["https://doi.org/10.3897/oneeco.2.e13718","https://oneecosystem.pensoft.net/article/13718/"],"articleprocessingcharge":null,"publicationdate":"2017-01-01","refereed":"peerReviewed","hostedby":{"key":"10|openaire____::e707e544b9a5bd23fc27fbfa65eb60dd","value":"One Ecosystem"},"collectedfrom":{"key":"10|openaire____::fdc7e0400d8c1634cdaf8051dbae23db","value":"Pensoft"}}]} \ No newline at end of file +{"author":[{"fullname":"Nikolaidou,Charitini","name":"Charitini","surname":"Nikolaidou","rank":1,"pid":null},{"fullname":"Votsi,Nefta","name":"Nefta","surname":"Votsi","rank":2,"pid":{"id":{"scheme":"orcid","value":"0000-0001-6651-1178"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}}},{"fullname":"Sgardelis,Steanos","name":"Steanos","surname":"Sgardelis","rank":3,"pid":{"id":{"scheme":"orcid_pending","value":"0000-0001-6651-1178"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}}},{"fullname":"Halley,John","name":"John","surname":"Halley","rank":4,"pid":null},{"fullname":"Pantis,John","name":"John","surname":"Pantis","rank":5,"pid":{"id":{"scheme":"orcid","value":"0000-0001-6651-1178"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}}},{"fullname":"Tsiafouli,Maria","name":"Maria","surname":"Tsiafouli","rank":6,"pid":{"id":{"scheme":"orcid_pending","value":"0000-0001-6651-1178"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}}}],"type":"publication","language":{"code":"eng","label":"English"},"country":[{"code":"IT","label":"Italy","provenance":null}],"subjects":[{"subject":{"scheme":"ACM","value":"Ecosystem Services hotspots"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Natura 2000"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Quiet Protected Areas"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Biodiversity"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Agriculture"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Elevation"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Slope"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Ecosystem Service trade-offs and synergies"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":" cultural services"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"provisioning services"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"regulating services"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"supporting services"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}}],"maintitle":"Ecosystem Service capacity is higher in areas of multiple designation types","subtitle":null,"description":["The implementation of the Ecosystem Service (ES) concept into practice might be a challenging task as it has to take into account previous “traditional” policies and approaches that have evaluated nature and biodiversity differently. Among them the Habitat (92/43/EC) and Bird Directives (79/409/EC), the Water Framework Directive (2000/60/EC), and the Noise Directive (2002/49/EC) have led to the evaluation/designation of areas in Europe with different criteria. In this study our goal was to understand how the ES capacity of an area is related to its designation and if areas with multiple designations have higher capacity in providing ES. We selected four catchments in Greece with a great variety of characteristics covering over 25% of the national territory. Inside the catchments we assessed the ES capacity (following the methodology of Burkhard et al. 2009) of areas designated as Natura 2000 sites, Quiet areas and Wetlands or Water bodies and found those areas that have multiple designations. Data were analyzed by GLM to reveal differences regarding the ES capacity among the different types of areas. We also investigated by PCA synergies and trade-offs among different kinds of ES and tested for correlations among landscape properties, such as elevation, aspect and slope and the ES potential. Our results show that areas with different types or multiple designations have a different capacity in providing ES. Areas of one designation type (Protected or Quiet Areas) had in general intermediate scores in most ES but scores were higher compared to areas with no designation, which displayed stronger capacity in provisioning services. Among Protected Areas and Quiet Areas the latter scored better in general. Areas that combined both designation types (Protected and Quiet Areas) showed the highest capacity in 13 out of 29 ES, that were mostly linked with natural and forest ecosystems. We found significant synergies among most regulating, supporting and cultural ES which in turn display trade-offs with provisioning services. The different ES are spatially related and display strong correlation with landscape properties, such as elevation and slope. We suggest that the designation status of an area can be used as an alternative tool for environmental policy, indicating the capacity for ES provision. Multiple designations of areas can be used as proxies for locating ES “hotspots”. This integration of “traditional” evaluation and designation and the “newer” ES concept forms a time- and cost-effective way to be adopted by stakeholders and policy-makers in order to start complying with new standards and demands for nature conservation and environmental management."],"publicationdate":"2017-01-01","publisher":"Pensoft Publishers","embargoenddate":null,"source":["One Ecosystem 2: e13718"],"format":["text/html"],"contributor":[],"coverage":[],"bestaccessright":{"code":"c_abf2","label":"OPEN","scheme":"http://vocabularies.coar-repositories.org/documentation/access_rights/"},"container":{"name":"One Ecosystem","issnPrinted":"","issnOnline":"2367-8194","issnLinking":"","ep":"","iss":"","sp":"","vol":"","edition":"","conferenceplace":null,"conferencedate":null},"documentationUrl":null,"codeRepositoryUrl":null,"programmingLanguage":null,"contactperson":null,"contactgroup":null,"tool":null,"size":null,"version":null,"geolocation":null,"id":"50|pensoft_____::00ea4a1cd53806a97d62ea6bf268f2a2","originalId":["50|pensoft_____::00ea4a1cd53806a97d62ea6bf268f2a2","10.3897/oneeco.2.e13718"],"pid":[{"scheme":"doi","value":"10.1016/j.triboint.2014.05.004"}],"dateofcollection":"2020-03-23T00:20:51.392Z","lastupdatetimestamp":1628257970612,"projects":null,"context":[{"code":"dh-ch","label":"Digital Humanities and Cultural Heritage","provenance":[{"provenance":"Inferred by OpenAIRE","trust":"0.9"}]}],"collectedfrom":[{"key":"10|openaire____::fdc7e0400d8c1634cdaf8051dbae23db","value":"Pensoft"}],"instance":[{"pid":[],"alternateIdentifier":[{"scheme":"doi","value":"10.3897/oneeco.2.e13718"}],"license":null,"accessright":{"code":"c_abf2","label":"OPEN","scheme":"http://vocabularies.coar-repositories.org/documentation/access_rights/","openAccessRoute":"green"},"type":"Article","url":["https://doi.org/10.3897/oneeco.2.e13718","https://oneecosystem.pensoft.net/article/13718/"],"articleprocessingcharge":null,"publicationdate":"2017-01-01","refereed":"peerReviewed","hostedby":{"key":"10|openaire____::e707e544b9a5bd23fc27fbfa65eb60dd","value":"One Ecosystem"},"collectedfrom":{"key":"10|openaire____::fdc7e0400d8c1634cdaf8051dbae23db","value":"Pensoft"}}]} +{"author":[{"fullname":"Nikolaidou,Charitini","name":"Charitini","surname":"Nikolaidou","rank":1,"pid":null},{"fullname":"Votsi,Nefta","name":"Nefta","surname":"Votsi","rank":2,"pid":{"id":{"scheme":"orcid","value":"0000-0001-6651-1178"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}}},{"fullname":"Sgardelis,Steanos","name":"Steanos","surname":"Sgardelis","rank":3,"pid":{"id":{"scheme":"orcid_pending","value":"0000-0001-6651-1178"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}}},{"fullname":"Halley,John","name":"John","surname":"Halley","rank":4,"pid":null},{"fullname":"Pantis,John","name":"John","surname":"Pantis","rank":5,"pid":{"id":{"scheme":"orcid","value":"0000-0001-6651-1178"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}}},{"fullname":"Tsiafouli,Maria","name":"Maria","surname":"Tsiafouli","rank":6,"pid":{"id":{"scheme":"orcid_pending","value":"0000-0001-6651-1178"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}}}],"type":"publication","language":{"code":"eng","label":"English"},"country":[{"code":"IT","label":"Italy","provenance":null}],"subjects":[{"subject":{"scheme":"ACM","value":"Ecosystem Services hotspots"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Natura 2000"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Quiet Protected Areas"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Biodiversity"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Agriculture"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Elevation"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Slope"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"Ecosystem Service trade-offs and synergies"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":" cultural services"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"provisioning services"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"regulating services"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}},{"subject":{"scheme":"","value":"supporting services"},"provenance":{"provenance":"sysimport:crosswalk:repository","trust":"0.9"}}],"maintitle":"Ecosystem Service capacity is higher in areas of multiple designation types","subtitle":null,"description":["The implementation of the Ecosystem Service (ES) concept into practice might be a challenging task as it has to take into account previous “traditional” policies and approaches that have evaluated nature and biodiversity differently. Among them the Habitat (92/43/EC) and Bird Directives (79/409/EC), the Water Framework Directive (2000/60/EC), and the Noise Directive (2002/49/EC) have led to the evaluation/designation of areas in Europe with different criteria. In this study our goal was to understand how the ES capacity of an area is related to its designation and if areas with multiple designations have higher capacity in providing ES. We selected four catchments in Greece with a great variety of characteristics covering over 25% of the national territory. Inside the catchments we assessed the ES capacity (following the methodology of Burkhard et al. 2009) of areas designated as Natura 2000 sites, Quiet areas and Wetlands or Water bodies and found those areas that have multiple designations. Data were analyzed by GLM to reveal differences regarding the ES capacity among the different types of areas. We also investigated by PCA synergies and trade-offs among different kinds of ES and tested for correlations among landscape properties, such as elevation, aspect and slope and the ES potential. Our results show that areas with different types or multiple designations have a different capacity in providing ES. Areas of one designation type (Protected or Quiet Areas) had in general intermediate scores in most ES but scores were higher compared to areas with no designation, which displayed stronger capacity in provisioning services. Among Protected Areas and Quiet Areas the latter scored better in general. Areas that combined both designation types (Protected and Quiet Areas) showed the highest capacity in 13 out of 29 ES, that were mostly linked with natural and forest ecosystems. We found significant synergies among most regulating, supporting and cultural ES which in turn display trade-offs with provisioning services. The different ES are spatially related and display strong correlation with landscape properties, such as elevation and slope. We suggest that the designation status of an area can be used as an alternative tool for environmental policy, indicating the capacity for ES provision. Multiple designations of areas can be used as proxies for locating ES “hotspots”. This integration of “traditional” evaluation and designation and the “newer” ES concept forms a time- and cost-effective way to be adopted by stakeholders and policy-makers in order to start complying with new standards and demands for nature conservation and environmental management."],"publicationdate":"2017-01-01","publisher":"Pensoft Publishers","embargoenddate":null,"source":["One Ecosystem 2: e13718"],"format":["text/html"],"contributor":[],"coverage":[],"bestaccessright":{"code":"c_abf2","label":"OPEN","scheme":"http://vocabularies.coar-repositories.org/documentation/access_rights/"},"container":{"name":"One Ecosystem","issnPrinted":"","issnOnline":"2367-8194","issnLinking":"","ep":"","iss":"","sp":"","vol":"","edition":"","conferenceplace":null,"conferencedate":null},"documentationUrl":null,"codeRepositoryUrl":null,"programmingLanguage":null,"contactperson":null,"contactgroup":null,"tool":null,"size":null,"version":null,"geolocation":null,"id":"50|fakeoft_____::00ea4a1cd53806a97d62ea6bf268f2a2","originalId":["50|pensoft_____::00ea4a1cd53806a97d62ea6bf268f2a2","10.3897/oneeco.2.e13718"],"pid":[{"scheme":"doi","value":"10.1016/j.triboint.2014.05.004"}],"dateofcollection":"2020-03-23T00:20:51.392Z","lastupdatetimestamp":1628257970612,"projects":null,"context":[{"code":"dh-ch","label":"Digital Humanities and Cultural Heritage","provenance":[{"provenance":"Inferred by OpenAIRE","trust":"0.9"}]}],"collectedfrom":[{"key":"10|openaire____::fdc7e0400d8c1634cdaf8051dbae23db","value":"Pensoft"}],"instance":[{"pid":[],"alternateIdentifier":[{"scheme":"doi","value":"10.3897/oneeco.2.e13718"}],"license":null,"accessright":{"code":"c_abf2","label":"OPEN","scheme":"http://vocabularies.coar-repositories.org/documentation/access_rights/","openAccessRoute":"green"},"type":"Article","url":["https://doi.org/10.3897/oneeco.2.e13718","https://oneecosystem.pensoft.net/article/13718/"],"articleprocessingcharge":null,"publicationdate":"2017-01-01","refereed":"peerReviewed","hostedby":{"key":"10|openaire____::e707e544b9a5bd23fc27fbfa65eb60dd","value":"One Ecosystem"},"collectedfrom":{"key":"10|openaire____::fdc7e0400d8c1634cdaf8051dbae23db","value":"Pensoft"}}]} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala rename to dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala similarity index 96% rename from dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala rename to dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala index a3a753a8ab..7abce547f8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala @@ -3,13 +3,9 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.oa.graph.hostedbymap.SparkPrepareHostedByInfoToApply.{joinResHBM, prepareResultInfo, toEntityInfo} import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo -import eu.dnetlib.dhp.schema.oaf.{Datasource, OpenAccessRoute, Publication} -import javax.management.openmbean.OpenMBeanAttributeInfo import org.apache.spark.SparkConf import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} -import org.json4s import org.json4s.DefaultFormats -import eu.dnetlib.dhp.schema.common.ModelConstants import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue} import org.junit.jupiter.api.Test diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala similarity index 98% rename from dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala rename to dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala index 5b00e9b6f1..0922f2e196 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala @@ -4,10 +4,9 @@ import eu.dnetlib.dhp.schema.oaf.Datasource import org.apache.spark.SparkConf import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} import org.json4s.DefaultFormats -import org.junit.jupiter.api.Assertions.{assertNotNull, assertTrue} -import org.junit.jupiter.api.Test -import org.junit.jupiter.api.Assertions._ import org.json4s.jackson.Serialization.write +import org.junit.jupiter.api.Assertions._ +import org.junit.jupiter.api.Test class TestPreprocess extends java.io.Serializable{ diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala rename to dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala similarity index 99% rename from dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala rename to dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala index bd7e4fd094..04b1f9ecde 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala @@ -37,7 +37,8 @@ class ScholixGraphTest extends AbstractVocabularyTest{ val input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/scholix/result.json")).mkString val res =SparkResolveRelation.extractPidsFromRecord(input) assertNotNull(res) - assertTrue(res._2.size == 2) + + assertEquals(1,res._2.size) }