From 50cc21d92e09805934c5b73060bd10803b35a77e Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 29 Jun 2021 18:35:28 +0200 Subject: [PATCH 01/26] Added method to normalize doi values (lower case, remove all preceeding 10., filtering out doi not starting with 10.) --- .../doiboost/DoiBoostMappingUtil.scala | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala index a6101c07e..1baf55b89 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala @@ -38,6 +38,9 @@ object DoiBoostMappingUtil { val OPENAIRE_PREFIX = "openaire____" val SEPARATOR = "::" + val DOI_PREFIX_REGEX = "(^10\\.|\\/10.)" + val DOI_PREFIX = "10." + val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;") def toActionSet(item:Oaf) :(String, String) = { @@ -352,5 +355,26 @@ object DoiBoostMappingUtil { } + def isEmpty(x: String) = x == null || x.trim.isEmpty + + def normalizeDoi(input : String) :String ={ + val replaced = input.replaceAll("(?:\\n|\\r|\\t|\\s)", "").toLowerCase.replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX) + if (isEmpty(replaced)) + return null + + if(replaced.indexOf("10.") < 0) + return null + + val ret = replaced.substring(replaced.indexOf("10.")) + + if (!ret.startsWith(DOI_PREFIX)) + return null + + return ret + + + } + + } From 8b8ffe82dcf81fa5858b049773064023360d1fd5 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 29 Jun 2021 18:41:39 +0200 Subject: [PATCH 02/26] added step of normalization for the doi --- .../main/java/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala index a7f97aaf8..cc758bcae 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala @@ -3,6 +3,7 @@ package eu.dnetlib.doiboost.uw import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory import eu.dnetlib.dhp.schema.oaf.{AccessRight, Instance, OpenAccessRoute, Publication} +import eu.dnetlib.doiboost.DoiBoostMappingUtil import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse @@ -53,7 +54,10 @@ object UnpayWallToOAF { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) - val doi = (json \"doi").extract[String] + val doi = DoiBoostMappingUtil.normalizeDoi((json \"doi").extract[String]) + + if(doi == null) + return null val is_oa = (json\ "is_oa").extract[Boolean] From 06074ea7d3d1d62a7a7e0b751cb3794fcbd4937e Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 29 Jun 2021 18:46:08 +0200 Subject: [PATCH 03/26] added normalization step to the doi --- .../src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala 
b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala index 0b5edfb19..0f6dc4885 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala @@ -83,8 +83,9 @@ object ORCIDToOAF { JObject(extIds) <- json \ "workDetail" \"extIds" JField("type", JString(typeValue)) <- extIds JField("value", JString(value)) <- extIds - if "doi".equalsIgnoreCase(typeValue) - } yield (typeValue, value) + normalized_value: String = DoiBoostMappingUtil.normalizeDoi(value) + if "doi".equalsIgnoreCase(typeValue) && normalized_value != null + } yield (typeValue, normalized_value) if (doi.nonEmpty) { return doi.map(l =>OrcidWork(oid, l._2)) } From a74de1cda251c83519166a824815f11cd3b4b197 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 29 Jun 2021 18:51:11 +0200 Subject: [PATCH 04/26] added normalization step to the doi --- .../doiboost/mag/SparkProcessMAG.scala | 28 +++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala index 173e33360..ecb389af8 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala @@ -2,6 +2,7 @@ package eu.dnetlib.doiboost.mag import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.schema.oaf.Publication +import eu.dnetlib.doiboost.DoiBoostMappingUtil import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD @@ -12,6 +13,23 @@ import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ object SparkProcessMAG { + + def getDistinctResults (d:Dataset[MagPapers]):Dataset[MagPapers]={ + d.where(col("Doi").isNotNull) + .groupByKey(mp => DoiBoostMappingUtil.normalizeDoi(mp.Doi))(Encoders.STRING) + .reduceGroups((p1:MagPapers,p2:MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1,p2)) + .map(_._2)(Encoders.product[MagPapers]) + .map(mp => { + new MagPapers(mp.PaperId, mp.Rank, DoiBoostMappingUtil.normalizeDoi(mp.Doi), + mp.DocType, mp.PaperTitle, mp.OriginalTitle, + mp.BookTitle, mp.Year, mp.Date, mp.Publisher: String, + mp.JournalId, mp.ConferenceSeriesId, mp.ConferenceInstanceId, + mp.Volume, mp.Issue, mp.FirstPage, mp.LastPage, + mp.ReferenceCount, mp.CitationCount, mp.EstimatedCitation, + mp.OriginalVenue, mp.FamilyId, mp.CreatedDate) + })(Encoders.product[MagPapers]) + } + def main(args: Array[String]): Unit = { val logger: Logger = LoggerFactory.getLogger(getClass) @@ -33,17 +51,11 @@ object SparkProcessMAG { implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication] implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs) - logger.info("Phase 1) make uninque DOI in Papers:") + logger.info("Phase 1) make uninue DOI in Papers:") val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers] // Filtering Papers with DOI, and since for the same DOI we have multiple version of item with different PapersId we get the last one - val result: RDD[MagPapers] = d.where(col("Doi").isNotNull) - .rdd - .map{ p: MagPapers => Tuple2(p.Doi, p) } - .reduceByKey((p1:MagPapers,p2:MagPapers) => 
ConversionUtil.choiceLatestMagArtitcle(p1,p2)) - .map(_._2) - - val distinctPaper: Dataset[MagPapers] = spark.createDataset(result) + val distinctPaper: Dataset[MagPapers] = getDistinctResults(d) distinctPaper.write.mode(SaveMode.Overwrite).save(s"$workingPath/Papers_distinct") From 801763a0fa854ba5bb74d6b4c2c958089f2d8f70 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 29 Jun 2021 19:07:23 +0200 Subject: [PATCH 05/26] there is no more the need to lower case the doi since it is done in the first step. Also changed the creation of the id by using the factory --- .../eu/dnetlib/doiboost/mag/MagDataModel.scala | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala index e8c283a7a..fd9629024 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala @@ -196,8 +196,8 @@ case object ConversionUtil { val authors = inputParams._2 val pub = new Publication - pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", ModelConstants.DNET_PID_TYPES)).asJava) - pub.setOriginalId(List(paper.PaperId.toString, paper.Doi.toLowerCase).asJava) + pub.setPid(List(createSP(paper.Doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava) + pub.setOriginalId(List(paper.PaperId.toString, paper.Doi).asJava) //IMPORTANT //The old method result.setId(generateIdentifier(result, doi)) @@ -258,11 +258,14 @@ case object ConversionUtil { val description = inputParams._2 val pub = new Publication - pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", ModelConstants.DNET_PID_TYPES)).asJava) - pub.setOriginalId(List(paper.PaperId.toString, paper.Doi.toLowerCase).asJava) + pub.setPid(List(createSP(paper.Doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava) + pub.setOriginalId(List(paper.PaperId.toString, paper.Doi).asJava) - //Set identifier as 50 | doiboost____::md5(DOI) - pub.setId(generateIdentifier(pub, paper.Doi.toLowerCase)) + //IMPORTANT + //The old method result.setId(generateIdentifier(result, doi)) + //will be replaced using IdentifierFactory + + pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub)) val mainTitles = createSP(paper.PaperTitle, "main title", ModelConstants.DNET_DATACITE_TITLE) val originalTitles = createSP(paper.OriginalTitle, "alternative title", ModelConstants.DNET_DATACITE_TITLE) From cf758f4f91654745d08bc5dce7cd47ecd1334260 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 30 Jun 2021 10:03:15 +0200 Subject: [PATCH 06/26] added normalization step for the doi --- .../java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala | 6 ++++-- .../java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala | 3 ++- .../dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 3f6a26c46..15a321431 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -16,9 +16,10 @@ import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.matching.Regex import 
eu.dnetlib.dhp.schema.scholexplorer.OafUtils - import java.util +import eu.dnetlib.doiboost.DoiBoostMappingUtil + case class CrossrefDT(doi: String, json:String, timestamp: Long) {} case class mappingAffiliation(name: String) {} @@ -89,7 +90,7 @@ case object Crossref2Oaf { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats //MAPPING Crossref DOI into PID - val doi: String = (json \ "DOI").extract[String] + val doi: String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String]) result.setPid(List(createSP(doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava) //MAPPING Crossref DOI into OriginalId @@ -101,6 +102,7 @@ case object Crossref2Oaf { val originalIds = new util.ArrayList(tmp.filter(id => id != null).asJava) result.setOriginalId(originalIds) + // Add DataInfo result.setDataInfo(generateDataInfo()) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala index 4a39a2987..159b817c7 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala @@ -1,6 +1,7 @@ package eu.dnetlib.doiboost.crossref import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.doiboost.DoiBoostMappingUtil import org.apache.commons.io.IOUtils import org.apache.hadoop.io.{IntWritable, Text} import org.apache.spark.SparkConf @@ -21,7 +22,7 @@ object CrossrefDataset { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) val ts:Long = (json \ "indexed" \ "timestamp").extract[Long] - val doi:String = (json \ "DOI").extract[String] + val doi:String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String]) CrossrefDT(doi, input, ts) } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala index b11e2d8de..526ff7b3a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala @@ -1,6 +1,7 @@ package eu.dnetlib.doiboost.crossref import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.crossref.CrossrefDataset.to_item import eu.dnetlib.doiboost.crossref.UnpackCrtossrefEntries.getClass import org.apache.hadoop.io.{IntWritable, Text} @@ -27,7 +28,7 @@ object GenerateCrossrefDataset { def crossrefElement(meta: String): CrossrefDT = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(meta) - val doi:String = (json \ "DOI").extract[String] + val doi:String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String]) val timestamp: Long = (json \ "indexed" \ "timestamp").extract[Long] CrossrefDT(doi, meta, timestamp) From 1299bfb35708bd00088255bc9984b6dc66764bec Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 30 Jun 2021 12:53:27 +0200 Subject: [PATCH 07/26] Added class to test the normalization of doi --- .../dhp/doiboost/NormalizeDoiTest.scala | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 
dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala new file mode 100644 index 000000000..a9a841ee9 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala @@ -0,0 +1,46 @@ +package eu.dnetlib.dhp.doiboost + +import eu.dnetlib.doiboost.DoiBoostMappingUtil +import org.junit.jupiter.api.Test + +class NormalizeDOITest { + + @Test + def doiDSLowerCase():Unit = { + val doi ="10.1042/BCJ20160876" + + assert(DoiBoostMappingUtil.normalizeDoi(doi).equals(doi.toLowerCase())) + + } + + + @Test + def doiFiltered():Unit = { + val doi = "0.1042/BCJ20160876" + + assert(DoiBoostMappingUtil.normalizeDoi(doi) == null) + } + + @Test + def doiFiltered2():Unit = { + val doi = "https://doi.org/0.1042/BCJ20160876" + + assert(DoiBoostMappingUtil.normalizeDoi(doi) == null) + } + + + @Test + def doiCleaned():Unit = { + val doi = "https://doi.org/10.1042/BCJ20160876" + + assert(DoiBoostMappingUtil.normalizeDoi(doi).equals("10.1042/BCJ20160876".toLowerCase())) + } + + @Test + def doiCleaned1():Unit = { + val doi = "https://doi.org/10.1042/ BCJ20160876" + + assert(DoiBoostMappingUtil.normalizeDoi(doi).equals("10.1042/BCJ20160876".toLowerCase())) + } + +} \ No newline at end of file From 1503ccbbb5f2da6e79b00ed5bc974b8020bc1f87 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 30 Jun 2021 12:55:37 +0200 Subject: [PATCH 08/26] added tests for the normalization of the dois --- .../crossref/CrossrefMappingTest.scala | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala index cb543b4d7..0fa34d88e 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala @@ -461,5 +461,37 @@ class CrossrefMappingTest { // }) } + @Test + def testNormalizeDOI(): Unit = { + val template = Source.fromInputStream(getClass.getResourceAsStream("article_funder_template.json")).mkString + val line :String = "\"funder\": [{\"name\": \"Wellcome Trust Masters Fellowship\",\"award\": [\"090633\"]}]," + val json = template.replace("%s", line) + val resultList: List[Oaf] = Crossref2Oaf.convert(json) + assertTrue(resultList.nonEmpty) + val items = resultList.filter(p => p.isInstanceOf[Publication]) + val result: Result = items.head.asInstanceOf[Publication] + + result.getPid.asScala.foreach(pid => assertTrue(pid.getQualifier.getClassid.equals("doi"))) + assertTrue(result.getPid.size() == 1) + result.getPid.asScala.foreach(pid => assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase()))) + + } + + @Test + def testNormalizeDOI2(): Unit = { + val template = Source.fromInputStream(getClass.getResourceAsStream("article.json")).mkString + + val resultList: List[Oaf] = Crossref2Oaf.convert(template) + assertTrue(resultList.nonEmpty) + val items = resultList.filter(p => p.isInstanceOf[Publication]) + val result: Result = items.head.asInstanceOf[Publication] + + result.getPid.asScala.foreach(pid => assertTrue(pid.getQualifier.getClassid.equals("doi"))) + assertTrue(result.getPid.size() == 1) + 
result.getPid.asScala.foreach(pid => assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase()))) + + } + + } From e487b5544c9c81ea348441a8e4f4456bb1c5a800 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 30 Jun 2021 12:57:11 +0200 Subject: [PATCH 09/26] added tests for the normalization of the dois --- .../dnetlib/doiboost/mag/MAGMappingTest.scala | 51 ++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala index 88b1669f4..7eb50665e 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala @@ -4,7 +4,7 @@ import java.sql.Timestamp import eu.dnetlib.dhp.schema.oaf.Publication import org.apache.htrace.fasterxml.jackson.databind.SerializationFeature -import org.apache.spark.SparkConf +import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.api.java.function.MapFunction import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig} @@ -62,6 +62,55 @@ class MAGMappingTest { logger.debug(description) } + @Test + def normalizeDoiTest():Unit = { + + import org.json4s.jackson.Serialization.write + import org.json4s.DefaultFormats + + implicit val formats = DefaultFormats + + val conf = new SparkConf().setAppName("test").setMaster("local[2]") + val sc = new SparkContext(conf) + val spark = SparkSession.builder.config(sc.getConf).getOrCreate() + val path = getClass.getResource("magPapers.json").getPath + + import org.apache.spark.sql.Encoders + val schema = Encoders.product[MagPapers].schema + + import spark.implicits._ + val magPapers :Dataset[MagPapers] = spark.read.option("multiline",true).schema(schema).json(path).as[MagPapers] + val ret :Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers) + assertTrue(ret.count == 10) + ret.take(10).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase()))) + + spark.close() + } + + @Test + def normalizeDoiTest2():Unit = { + + import org.json4s.jackson.Serialization.write + import org.json4s.DefaultFormats + + implicit val formats = DefaultFormats + + val conf = new SparkConf().setAppName("test").setMaster("local[2]") + val sc = new SparkContext(conf) + val spark = SparkSession.builder.config(sc.getConf).getOrCreate() + val path = getClass.getResource("duplicatedMagPapers.json").getPath + + import org.apache.spark.sql.Encoders + val schema = Encoders.product[MagPapers].schema + + import spark.implicits._ + val magPapers :Dataset[MagPapers] = spark.read.option("multiline",true).schema(schema).json(path).as[MagPapers] + val ret :Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers) + assertTrue(ret.count == 8) + ret.take(8).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase()))) + spark.close() + //ret.take(8).foreach(mp => println(write(mp))) + } } From 149f85ddf59f1e13faf2ca63b1bdfd3609b40459 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 30 Jun 2021 13:00:52 +0200 Subject: [PATCH 10/26] added tests for the normalization of the dois --- .../orcid/MappingORCIDToOAFTest.scala | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala 
b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala index cdda3b2af..7628fb853 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala @@ -12,6 +12,8 @@ import org.slf4j.{Logger, LoggerFactory} import java.nio.file.Path import scala.io.Source +import scala.collection.JavaConversions._ + class MappingORCIDToOAFTest { val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass) val mapper = new ObjectMapper() @@ -63,9 +65,26 @@ class MappingORCIDToOAFTest { } + @Test + def testExtractDat1():Unit ={ + val aList: List[OrcidAuthor] = List(OrcidAuthor("0000-0002-4335-5309", Some("Lucrecia"), Some("Curto"), null, null, null ), + OrcidAuthor("0000-0001-7501-3330", Some("Emilio"), Some("Malchiodi"), null, null, null ), OrcidAuthor("0000-0002-5490-9186", Some("Sofia"), Some("Noli Truant"), null, null, null )) + + val orcid:ORCIDItem = ORCIDItem("10.1042/BCJ20160876", aList) + + val oaf = ORCIDToOAF.convertTOOAF(orcid) + assert(oaf.getPid.size() == 1) + oaf.getPid.toList.foreach(pid => assert(pid.getQualifier.getClassid.equals("doi"))) + oaf.getPid.toList.foreach(pid => assert(pid.getValue.equals("10.1042/BCJ20160876".toLowerCase()))) + //println(mapper.writeValueAsString(ORCIDToOAF.convertTOOAF(orcid))) + + + } + + } From f8eec0ca9a8a9bf761bfe73c63889d42c52d47ff Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 30 Jun 2021 13:19:54 +0200 Subject: [PATCH 11/26] added resource to test the normalization of doi during the import of MAG --- .../eu/dnetlib/doiboost/mag/duplicatedMagPapers.json | 10 ++++++++++ .../resources/eu/dnetlib/doiboost/mag/magPapers.json | 10 ++++++++++ 2 files changed, 20 insertions(+) create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/mag/duplicatedMagPapers.json create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/mag/magPapers.json diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/mag/duplicatedMagPapers.json b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/mag/duplicatedMagPapers.json new file mode 100644 index 000000000..6b591a592 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/mag/duplicatedMagPapers.json @@ -0,0 +1,10 @@ +[{"PaperId":2866429360,"Rank":1,"Doi":"10.5465/AMBPP.2018.12619SYMPOSIUM","DocType":null,"PaperTitle":"new directions in research on conflict dynamics","OriginalTitle":"New Directions in Research on Conflict Dynamics","BookTitle":null,"Year":2018,"Date":"2018-07-09T00:00:00Z","Publisher":"Academy of Management Briarcliff Manor, NY 10510","JournalId":null,"Volume":"2018","Issue":"1","FirstPage":"12619","LastPage":null,"ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"Academy of Management Proceedings","CreatedDate":"2018-07-19T00:00:00Z"}, + {"PaperId":2871494677,"Rank":2,"Doi":"10.1007/978-981-10-8971-8_33","DocType":null,"PaperTitle":"wild flame detection using weight adaptive particle filter from monocular video","OriginalTitle":"Wild Flame Detection Using Weight Adaptive Particle Filter from Monocular Video","BookTitle":null,"Year":2019,"Date":"2019-01-01T00:00:00Z","Publisher":"Springer, 
Singapore","JournalId":null,"Volume":null,"Issue":null,"FirstPage":"357","LastPage":"365","ReferenceCount":14,"CitationCount":1,"EstimatedCitation":1,"OriginalVenue":null,"CreatedDate":"2018-07-19T00:00:00Z"}, + {"PaperId":2883520096,"Rank":3,"Doi":"10.5465/AMBPP .2018.12619SYMPOSIUM","DocType":"Journal","PaperTitle":"elaboracion de un corpus cacografico desde la disponibilidad lexica en estudiantes sevillanos un analisis para la ensenanza de la lengua","OriginalTitle":"Elaboración de un corpus cacográfico desde la disponibilidad léxica en estudiantes sevillanos. Un análisis para la enseñanza de la lengua","BookTitle":null,"Year":2018,"Date":"2018-07-13T00:00:00Z","Publisher":"Poli papers","JournalId":2738339871,"Volume":"13","Issue":"1","FirstPage":"119","LastPage":"131","ReferenceCount":28,"CitationCount":2,"EstimatedCitation":2,"OriginalVenue":"Revista de Lingüística y Lenguas Aplicadas","CreatedDate":"2018-08-03T00:00:00Z"}, + {"PaperId":2883800636,"Rank":4,"Doi":"10.1007/978-3-319-92513-4_4","DocType":null,"PaperTitle":"cognitive advantage of bilingualism and its criticisms","OriginalTitle":"Cognitive Advantage of Bilingualism and Its Criticisms","BookTitle":null,"Year":2018,"Date":"2018-01-01T00:00:00Z","Publisher":"Springer, Cham","JournalId":null,"Volume":null,"Issue":null,"FirstPage":"67","LastPage":"89","ReferenceCount":74,"CitationCount":1,"EstimatedCitation":1,"OriginalVenue":null,"CreatedDate":"2018-08-03T00:00:00Z"}, + {"PaperId":2885023064,"Rank":5,"Doi":"10.1097/NNA.0000000000000647","DocType":"Journal","PaperTitle":"enhancing and advancing shared governance through a targeted decision making redesign","OriginalTitle":"Enhancing and Advancing Shared Governance Through a Targeted Decision-Making Redesign.","BookTitle":null,"Year":2018,"Date":"2018-09-01T00:00:00Z","Publisher":"J Nurs Adm","JournalId":194945867,"Volume":"48","Issue":"9","FirstPage":"445","LastPage":"451","ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"Journal of Nursing Administration","CreatedDate":"2018-08-22T00:00:00Z"}, + {"PaperId":2885607541,"Rank":1,"Doi":"10.1007/S10465-018-9283-7","DocType":"Journal","PaperTitle":"dance movement therapists attitudes and actions regarding lgbtqi and gender nonconforming communities","OriginalTitle":"Dance/Movement Therapists’ Attitudes and Actions Regarding LGBTQI and Gender Nonconforming Communities","BookTitle":null,"Year":2018,"Date":"2018-08-07T00:00:00Z","Publisher":"Springer US","JournalId":104993962,"Volume":"40","Issue":"2","FirstPage":"202","LastPage":"223","ReferenceCount":40,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"American Journal of Dance Therapy","CreatedDate":"2018-08-22T00:00:00Z"}, + {"PaperId":2886182429,"Rank":2,"Doi":"10.13039/501100003329","DocType":null,"PaperTitle":"caracteres de adaptacion en judia comun phaseolus vulgaris l aproximacion genetica e identificacion de qtls","OriginalTitle":"Caracteres de adaptación en judía común (Phaseolus vulgaris L.): aproximación genética e identificación de QTLs","BookTitle":null,"Year":2017,"Date":"2017-06-15T00:00:00Z","Publisher":"CSIC - Misión Biológica de Galicia (MBG)","JournalId":null,"Volume":null,"Issue":null,"FirstPage":null,"LastPage":null,"ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":null,"CreatedDate":"2018-08-22T00:00:00Z"}, + {"PaperId":2887149460,"Rank":3,"Doi":"10.1093/FEMSLE/FNY192","DocType":"Journal","PaperTitle":"small extracellular particles with big potential for horizontal gene transfer membrane vesicles and gene 
transfer agents","OriginalTitle":"Small extracellular particles with big potential for horizontal gene transfer: membrane vesicles and gene transfer agents.","BookTitle":null,"Year":2018,"Date":"2018-10-01T00:00:00Z","Publisher":"Narnia","JournalId":34954451,"Volume":"365","Issue":"19","FirstPage":null,"LastPage":null,"ReferenceCount":124,"CitationCount":13,"EstimatedCitation":13,"OriginalVenue":"Fems Microbiology Letters","CreatedDate":"2018-08-22T00:00:00Z"}, + {"PaperId":2887446149,"Rank":4,"Doi":"10.5465/ambpp.2018.12619symposium","DocType":"Journal","PaperTitle":"notes from the field toxigenic vibrio cholerae o141 in a traveler to florida nebraska 2017","OriginalTitle":"Notes from the Field: Toxigenic Vibrio cholerae O141 in a Traveler to Florida — Nebraska, 2017","BookTitle":null,"Year":2018,"Date":"2018-08-03T00:00:00Z","Publisher":"Centers for Disease Control MMWR Office","JournalId":183158886,"Volume":"67","Issue":"30","FirstPage":"838","LastPage":"839","ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"Morbidity and Mortality Weekly Report","CreatedDate":"2018-08-22T00:00:00Z"}, + {"PaperId":2889180499,"Rank":5,"Doi":"10.1007/S10924-018-1299-Z","DocType":"Journal","PaperTitle":"hybrid adsorbent materials obtained by the combination of poly ethylene alt maleic anhydride with lignin and lignosulfonate","OriginalTitle":"Hybrid Adsorbent Materials Obtained by the Combination of Poly(ethylene-alt-maleic anhydride) with Lignin and Lignosulfonate","BookTitle":null,"Year":2018,"Date":"2018-08-30T00:00:00Z","Publisher":"Springer US","JournalId":193665811,"Volume":"26","Issue":"11","FirstPage":"4293","LastPage":"4302","ReferenceCount":29,"CitationCount":5,"EstimatedCitation":5,"OriginalVenue":"Journal of Polymers and The Environment","CreatedDate":"2018-09-07T00:00:00Z"}] \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/mag/magPapers.json b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/mag/magPapers.json new file mode 100644 index 000000000..738f5a96c --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/mag/magPapers.json @@ -0,0 +1,10 @@ +[{"PaperId":2866429360,"Rank":1,"Doi":"10.5465/AMBPP.2018.12619SYMPOSIUM","DocType":null,"PaperTitle":"new directions in research on conflict dynamics","OriginalTitle":"New Directions in Research on Conflict Dynamics","BookTitle":null,"Year":2018,"Date":"2018-07-09T00:00:00Z","Publisher":"Academy of Management Briarcliff Manor, NY 10510","JournalId":null,"Volume":"2018","Issue":"1","FirstPage":"12619","LastPage":null,"ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"Academy of Management Proceedings","CreatedDate":"2018-07-19T00:00:00Z"}, +{"PaperId":2871494677,"Rank":2,"Doi":"10.1007/978-981-10-8971-8_33","DocType":null,"PaperTitle":"wild flame detection using weight adaptive particle filter from monocular video","OriginalTitle":"Wild Flame Detection Using Weight Adaptive Particle Filter from Monocular Video","BookTitle":null,"Year":2019,"Date":"2019-01-01T00:00:00Z","Publisher":"Springer, Singapore","JournalId":null,"Volume":null,"Issue":null,"FirstPage":"357","LastPage":"365","ReferenceCount":14,"CitationCount":1,"EstimatedCitation":1,"OriginalVenue":null,"CreatedDate":"2018-07-19T00:00:00Z"}, +{"PaperId":2883520096,"Rank":3,"Doi":"10.4995/RLYLA.2018.9176","DocType":"Journal","PaperTitle":"elaboracion de un corpus cacografico desde la disponibilidad lexica en estudiantes sevillanos un 
analisis para la ensenanza de la lengua","OriginalTitle":"Elaboración de un corpus cacográfico desde la disponibilidad léxica en estudiantes sevillanos. Un análisis para la enseñanza de la lengua","BookTitle":null,"Year":2018,"Date":"2018-07-13T00:00:00Z","Publisher":"Poli papers","JournalId":2738339871,"Volume":"13","Issue":"1","FirstPage":"119","LastPage":"131","ReferenceCount":28,"CitationCount":2,"EstimatedCitation":2,"OriginalVenue":"Revista de Lingüística y Lenguas Aplicadas","CreatedDate":"2018-08-03T00:00:00Z"}, +{"PaperId":2883800636,"Rank":4,"Doi":"10.1007/978-3-319-92513-4_4","DocType":null,"PaperTitle":"cognitive advantage of bilingualism and its criticisms","OriginalTitle":"Cognitive Advantage of Bilingualism and Its Criticisms","BookTitle":null,"Year":2018,"Date":"2018-01-01T00:00:00Z","Publisher":"Springer, Cham","JournalId":null,"Volume":null,"Issue":null,"FirstPage":"67","LastPage":"89","ReferenceCount":74,"CitationCount":1,"EstimatedCitation":1,"OriginalVenue":null,"CreatedDate":"2018-08-03T00:00:00Z"}, +{"PaperId":2885023064,"Rank":5,"Doi":"10.1097/NNA.0000000000000647","DocType":"Journal","PaperTitle":"enhancing and advancing shared governance through a targeted decision making redesign","OriginalTitle":"Enhancing and Advancing Shared Governance Through a Targeted Decision-Making Redesign.","BookTitle":null,"Year":2018,"Date":"2018-09-01T00:00:00Z","Publisher":"J Nurs Adm","JournalId":194945867,"Volume":"48","Issue":"9","FirstPage":"445","LastPage":"451","ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"Journal of Nursing Administration","CreatedDate":"2018-08-22T00:00:00Z"}, +{"PaperId":2885607541,"Rank":1,"Doi":"10.1007/S10465-018-9283-7","DocType":"Journal","PaperTitle":"dance movement therapists attitudes and actions regarding lgbtqi and gender nonconforming communities","OriginalTitle":"Dance/Movement Therapists’ Attitudes and Actions Regarding LGBTQI and Gender Nonconforming Communities","BookTitle":null,"Year":2018,"Date":"2018-08-07T00:00:00Z","Publisher":"Springer US","JournalId":104993962,"Volume":"40","Issue":"2","FirstPage":"202","LastPage":"223","ReferenceCount":40,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"American Journal of Dance Therapy","CreatedDate":"2018-08-22T00:00:00Z"}, +{"PaperId":2886182429,"Rank":2,"Doi":"10.13039/501100003329","DocType":null,"PaperTitle":"caracteres de adaptacion en judia comun phaseolus vulgaris l aproximacion genetica e identificacion de qtls","OriginalTitle":"Caracteres de adaptación en judía común (Phaseolus vulgaris L.): aproximación genética e identificación de QTLs","BookTitle":null,"Year":2017,"Date":"2017-06-15T00:00:00Z","Publisher":"CSIC - Misión Biológica de Galicia (MBG)","JournalId":null,"Volume":null,"Issue":null,"FirstPage":null,"LastPage":null,"ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":null,"CreatedDate":"2018-08-22T00:00:00Z"}, +{"PaperId":2887149460,"Rank":3,"Doi":"10.1093/FEMSLE/FNY192","DocType":"Journal","PaperTitle":"small extracellular particles with big potential for horizontal gene transfer membrane vesicles and gene transfer agents","OriginalTitle":"Small extracellular particles with big potential for horizontal gene transfer: membrane vesicles and gene transfer agents.","BookTitle":null,"Year":2018,"Date":"2018-10-01T00:00:00Z","Publisher":"Narnia","JournalId":34954451,"Volume":"365","Issue":"19","FirstPage":null,"LastPage":null,"ReferenceCount":124,"CitationCount":13,"EstimatedCitation":13,"OriginalVenue":"Fems Microbiology 
Letters","CreatedDate":"2018-08-22T00:00:00Z"}, +{"PaperId":2887446149,"Rank":4,"Doi":"10.15585/MMWR.MM6730A7","DocType":"Journal","PaperTitle":"notes from the field toxigenic vibrio cholerae o141 in a traveler to florida nebraska 2017","OriginalTitle":"Notes from the Field: Toxigenic Vibrio cholerae O141 in a Traveler to Florida — Nebraska, 2017","BookTitle":null,"Year":2018,"Date":"2018-08-03T00:00:00Z","Publisher":"Centers for Disease Control MMWR Office","JournalId":183158886,"Volume":"67","Issue":"30","FirstPage":"838","LastPage":"839","ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"Morbidity and Mortality Weekly Report","CreatedDate":"2018-08-22T00:00:00Z"}, +{"PaperId":2889180499,"Rank":5,"Doi":"10.1007/S10924-018-1299-Z","DocType":"Journal","PaperTitle":"hybrid adsorbent materials obtained by the combination of poly ethylene alt maleic anhydride with lignin and lignosulfonate","OriginalTitle":"Hybrid Adsorbent Materials Obtained by the Combination of Poly(ethylene-alt-maleic anhydride) with Lignin and Lignosulfonate","BookTitle":null,"Year":2018,"Date":"2018-08-30T00:00:00Z","Publisher":"Springer US","JournalId":193665811,"Volume":"26","Issue":"11","FirstPage":"4293","LastPage":"4302","ReferenceCount":29,"CitationCount":5,"EstimatedCitation":5,"OriginalVenue":"Journal of Polymers and The Environment","CreatedDate":"2018-09-07T00:00:00Z"}] \ No newline at end of file From 03767ea8e6cbf5f987ef46f678f8dc0d1b857eec Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 30 Jun 2021 13:21:24 +0200 Subject: [PATCH 12/26] slight modification of the resource to accomodate also doi normalization tests --- .../test/resources/eu/dnetlib/doiboost/crossref/article.json | 2 +- .../eu/dnetlib/doiboost/crossref/article_funder_template.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/article.json b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/article.json index 69424d0ad..5bdf9b3f3 100644 --- a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/article.json +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/article.json @@ -1,5 +1,5 @@ { - "DOI": "10.26850/1678-4618eqj.v35.1.2010.p41-46", + "DOI": " 10.26850/1678-4618eqj.v35.1.2010.p41-46", "issued": { "date-parts": [ [ diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/article_funder_template.json b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/article_funder_template.json index 1a49109ec..5ab0544d1 100644 --- a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/article_funder_template.json +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/article_funder_template.json @@ -1,5 +1,5 @@ { - "DOI": "10.26850/1678-4618eqj.v35.1.2010.p41-46", + "DOI": "10.26850/1678-4618EQJ.v35.1.2010.p41-46", "issued": { "date-parts": [ [ From 86f47afcc7736ed9c7eaf63aa52df159fd39d9ea Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 30 Jun 2021 14:36:49 +0200 Subject: [PATCH 13/26] slight modification of the resource to accomodate also doi normalization tests --- .../src/test/resources/eu/dnetlib/doiboost/uw/input.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/uw/input.json 
b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/uw/input.json index 33d4dbc3c..68b22b1d6 100644 --- a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/uw/input.json +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/uw/input.json @@ -1,6 +1,6 @@ -{"doi": "10.1038/2211089b0", "year": 1969, "genre": "journal-article", "is_oa": true, "title": "Planning: Trees in Danger", "doi_url": "https://doi.org/10.1038/2211089b0", "updated": "2020-02-06T13:51:15.164623", "oa_status": "bronze", "publisher": "Springer Nature", "z_authors": [{"name": "Our Planning Correspondent"}], "is_paratext": false, "journal_name": "Nature", "oa_locations": [{"url": "http://www.nature.com/articles/2211089b0.pdf", "pmh_id": null, "is_best": true, "license": null, "updated": "2018-07-11T09:19:40.598930", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "http://www.nature.com/articles/2211089b0.pdf", "url_for_landing_page": "https://doi.org/10.1038/2211089b0", "repository_institution": null}], "data_standard": 2, "journal_is_oa": false, "journal_issns": "0028-0836,1476-4687", "journal_issn_l": "0028-0836", "published_date": "1969-03-01", "best_oa_location": {"url": "http://www.nature.com/articles/2211089b0.pdf", "pmh_id": null, "is_best": true, "license": null, "updated": "2018-07-11T09:19:40.598930", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "http://www.nature.com/articles/2211089b0.pdf", "url_for_landing_page": "https://doi.org/10.1038/2211089b0", "repository_institution": null}, "journal_is_in_doaj": false, "has_repository_copy": false} -{"doi": "10.1021/acs.bioconjchem.8b00058.s001", "year": null, "genre": "component", "is_oa": true, "title": "Engineering Reversible CellCell Interactions with Lipid Anchored Prosthetic Receptors", "doi_url": "https://doi.org/10.1021/acs.bioconjchem.8b00058.s001", "updated": "2020-04-04T21:15:41.966773", "oa_status": "bronze", "publisher": "American Chemical Society (ACS)", "z_authors": null, "is_paratext": false, "journal_name": null, "oa_locations": [{"url": "https://doi.org/10.1021/acs.bioconjchem.8b00058.s001", "pmh_id": null, "is_best": true, "license": null, "updated": "2020-04-04T21:13:39.352965", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "https://doi.org/10.1021/acs.bioconjchem.8b00058.s001", "url_for_landing_page": null, "repository_institution": null}], "data_standard": 2, "journal_is_oa": false, "journal_issns": null, "journal_issn_l": null, "published_date": null, "best_oa_location": {"url": "https://doi.org/10.1021/acs.bioconjchem.8b00058.s001", "pmh_id": null, "is_best": true, "license": null, "updated": "2020-04-04T21:13:39.352965", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "https://doi.org/10.1021/acs.bioconjchem.8b00058.s001", "url_for_landing_page": null, "repository_institution": null}, "journal_is_in_doaj": false, "has_repository_copy": false} -{"doi": "10.1021/acs.bioconjchem.8b00086.s001", "year": null, "genre": "component", "is_oa": true, "title": "Rapid, Stoichiometric, Site-Specific Modification of Aldehyde-Containing Proteins Using a Tandem Knoevenagel-Intra Michael Addition Reaction", "doi_url": "https://doi.org/10.1021/acs.bioconjchem.8b00086.s001", "updated": 
"2020-04-04T21:24:50.688286", "oa_status": "bronze", "publisher": "American Chemical Society (ACS)", "z_authors": null, "is_paratext": false, "journal_name": null, "oa_locations": [{"url": "https://doi.org/10.1021/acs.bioconjchem.8b00086.s001", "pmh_id": null, "is_best": true, "license": null, "updated": "2020-04-04T21:22:19.694440", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "https://doi.org/10.1021/acs.bioconjchem.8b00086.s001", "url_for_landing_page": null, "repository_institution": null}], "data_standard": 2, "journal_is_oa": false, "journal_issns": null, "journal_issn_l": null, "published_date": null, "best_oa_location": {"url": "https://doi.org/10.1021/acs.bioconjchem.8b00086.s001", "pmh_id": null, "is_best": true, "license": null, "updated": "2020-04-04T21:22:19.694440", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "https://doi.org/10.1021/acs.bioconjchem.8b00086.s001", "url_for_landing_page": null, "repository_institution": null}, "journal_is_in_doaj": false, "has_repository_copy": false} +{"doi": "10.1038/221 1089b0", "year": 1969, "genre": "journal-article", "is_oa": true, "title": "Planning: Trees in Danger", "doi_url": "https://doi.org/10.1038/2211089b0", "updated": "2020-02-06T13:51:15.164623", "oa_status": "bronze", "publisher": "Springer Nature", "z_authors": [{"name": "Our Planning Correspondent"}], "is_paratext": false, "journal_name": "Nature", "oa_locations": [{"url": "http://www.nature.com/articles/2211089b0.pdf", "pmh_id": null, "is_best": true, "license": null, "updated": "2018-07-11T09:19:40.598930", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "http://www.nature.com/articles/2211089b0.pdf", "url_for_landing_page": "https://doi.org/10.1038/2211089b0", "repository_institution": null}], "data_standard": 2, "journal_is_oa": false, "journal_issns": "0028-0836,1476-4687", "journal_issn_l": "0028-0836", "published_date": "1969-03-01", "best_oa_location": {"url": "http://www.nature.com/articles/2211089b0.pdf", "pmh_id": null, "is_best": true, "license": null, "updated": "2018-07-11T09:19:40.598930", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "http://www.nature.com/articles/2211089b0.pdf", "url_for_landing_page": "https://doi.org/10.1038/2211089b0", "repository_institution": null}, "journal_is_in_doaj": false, "has_repository_copy": false} +{"doi": "10.1021/acs.bioconjchem.8b00058. 
s001", "year": null, "genre": "component", "is_oa": true, "title": "Engineering Reversible CellCell Interactions with Lipid Anchored Prosthetic Receptors", "doi_url": "https://doi.org/10.1021/acs.bioconjchem.8b00058.s001", "updated": "2020-04-04T21:15:41.966773", "oa_status": "bronze", "publisher": "American Chemical Society (ACS)", "z_authors": null, "is_paratext": false, "journal_name": null, "oa_locations": [{"url": "https://doi.org/10.1021/acs.bioconjchem.8b00058.s001", "pmh_id": null, "is_best": true, "license": null, "updated": "2020-04-04T21:13:39.352965", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "https://doi.org/10.1021/acs.bioconjchem.8b00058.s001", "url_for_landing_page": null, "repository_institution": null}], "data_standard": 2, "journal_is_oa": false, "journal_issns": null, "journal_issn_l": null, "published_date": null, "best_oa_location": {"url": "https://doi.org/10.1021/acs.bioconjchem.8b00058.s001", "pmh_id": null, "is_best": true, "license": null, "updated": "2020-04-04T21:13:39.352965", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "https://doi.org/10.1021/acs.bioconjchem.8b00058.s001", "url_for_landing_page": null, "repository_institution": null}, "journal_is_in_doaj": false, "has_repository_copy": false} +{"doi": "10.1021/acs.bioconjCHEM.8b00086.s001", "year": null, "genre": "component", "is_oa": true, "title": "Rapid, Stoichiometric, Site-Specific Modification of Aldehyde-Containing Proteins Using a Tandem Knoevenagel-Intra Michael Addition Reaction", "doi_url": "https://doi.org/10.1021/acs.bioconjchem.8b00086.s001", "updated": "2020-04-04T21:24:50.688286", "oa_status": "bronze", "publisher": "American Chemical Society (ACS)", "z_authors": null, "is_paratext": false, "journal_name": null, "oa_locations": [{"url": "https://doi.org/10.1021/acs.bioconjchem.8b00086.s001", "pmh_id": null, "is_best": true, "license": null, "updated": "2020-04-04T21:22:19.694440", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "https://doi.org/10.1021/acs.bioconjchem.8b00086.s001", "url_for_landing_page": null, "repository_institution": null}], "data_standard": 2, "journal_is_oa": false, "journal_issns": null, "journal_issn_l": null, "published_date": null, "best_oa_location": {"url": "https://doi.org/10.1021/acs.bioconjchem.8b00086.s001", "pmh_id": null, "is_best": true, "license": null, "updated": "2020-04-04T21:22:19.694440", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "https://doi.org/10.1021/acs.bioconjchem.8b00086.s001", "url_for_landing_page": null, "repository_institution": null}, "journal_is_in_doaj": false, "has_repository_copy": false} {"doi": "10.1192/bjp.89.375.270", "year": 1943, "genre": "journal-article", "is_oa": false, "title": "Unusual Pituitary Activity in a Case of Anorexia Nervosa", "doi_url": "https://doi.org/10.1192/bjp.89.375.270", "updated": "2020-03-09T08:54:12.827623", "oa_status": "closed", "publisher": "Royal College of Psychiatrists", "z_authors": [{"given": "M.", "family": "Reiss", "sequence": "first"}], "is_paratext": false, "journal_name": "Journal of Mental Science", "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": "0368-315X,2514-9946", "journal_issn_l": "0368-315X", "published_date": 
"1943-04-01", "best_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false} {"doi": "10.1016/s0167-7012(99)00056-1", "year": 1999, "genre": "journal-article", "is_oa": false, "title": "Development of radiographic and microscopic techniques for the characterization of bacterial transport in intact sediment cores from Oyster, Virginia", "doi_url": "https://doi.org/10.1016/s0167-7012(99)00056-1", "updated": "2020-04-05T11:15:40.634599", "oa_status": "closed", "publisher": "Elsevier BV", "z_authors": [{"given": "Hailiang", "family": "Dong", "sequence": "first"}, {"given": "Tullis C.", "family": "Onstott", "sequence": "additional"}, {"given": "Mary F.", "family": "DeFlaun", "sequence": "additional"}, {"given": "Mark E.", "family": "Fuller", "sequence": "additional"}, {"given": "Kathleen M.", "family": "Gillespie", "sequence": "additional"}, {"given": "James K.", "family": "Fredrickson", "sequence": "additional"}], "is_paratext": false, "journal_name": "Journal of Microbiological Methods", "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": "0167-7012", "journal_issn_l": "0167-7012", "published_date": "1999-08-01", "best_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false} {"doi": "10.1086/mp.1905.2.issue-3", "year": 1905, "genre": "journal-issue", "is_oa": false, "title": null, "doi_url": "https://doi.org/10.1086/mp.1905.2.issue-3", "updated": "2020-02-07T15:51:44.560109", "oa_status": "closed", "publisher": "University of Chicago Press", "z_authors": null, "is_paratext": false, "journal_name": "Modern Philology", "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": "0026-8232,1545-6951", "journal_issn_l": "0026-8232", "published_date": "1905-01-01", "best_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false} @@ -38,7 +38,7 @@ {"doi": "10.1016/s1067-991x(03)70006-6", "year": 2003, "genre": "journal-article", "is_oa": false, "title": "Use of the autolaunch method of dispatching a helicopter", "doi_url": "https://doi.org/10.1016/s1067-991x(03)70006-6", "updated": "2020-03-12T07:24:35.659404", "oa_status": "closed", "publisher": "Elsevier BV", "z_authors": [{"given": "Kathleen S.", "family": "Berns", "sequence": "first"}, {"given": "Jeffery J.", "family": "Caniglia", "sequence": "additional"}, {"given": "Daniel G.", "family": "Hankins", "sequence": "additional"}, {"given": "Scott P.", "family": "Zietlow", "sequence": "additional"}], "is_paratext": false, "journal_name": "Air Medical Journal", "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": "1067-991X", "journal_issn_l": "1067-991X", "published_date": "2003-05-01", "best_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false} {"doi": "10.1016/j.clinimag.2015.12.002", "year": 2016, "genre": "journal-article", "is_oa": false, "title": "Imaging findings, diagnosis, and clinical outcomes in patients with mycotic aneurysms: single center experience", "doi_url": "https://doi.org/10.1016/j.clinimag.2015.12.002", "updated": "2020-03-12T17:56:16.049536", "oa_status": "closed", "publisher": "Elsevier BV", "z_authors": [{"given": "Amy R.", "family": "Deipolyi", "sequence": "first"}, {"given": "Alexander", "family": "Bailin", "sequence": "additional"}, {"given": "Ali", "family": "Khademhosseini", "sequence": "additional"}, {"ORCID": "http://orcid.org/0000-0003-4984-1778", "given": "Rahmi", "family": "Oklu", "sequence": "additional", "authenticated-orcid": false}], 
"is_paratext": false, "journal_name": "Clinical Imaging", "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": "0899-7071", "journal_issn_l": "0899-7071", "published_date": "2016-05-01", "best_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false} {"doi": "10.1016/j.biocel.2013.05.012", "year": 2013, "genre": "journal-article", "is_oa": false, "title": "MAVS-mediated host cell defense is inhibited by Borna disease virus", "doi_url": "https://doi.org/10.1016/j.biocel.2013.05.012", "updated": "2020-03-09T20:49:25.975316", "oa_status": "closed", "publisher": "Elsevier BV", "z_authors": [{"given": "Yujun", "family": "Li", "sequence": "first"}, {"given": "Wuqi", "family": "Song", "sequence": "additional"}, {"given": "Jing", "family": "Wu", "sequence": "additional"}, {"given": "Qingmeng", "family": "Zhang", "sequence": "additional"}, {"given": "Junming", "family": "He", "sequence": "additional"}, {"given": "Aimei", "family": "Li", "sequence": "additional"}, {"given": "Jun", "family": "Qian", "sequence": "additional"}, {"given": "Aixia", "family": "Zhai", "sequence": "additional"}, {"given": "Yunlong", "family": "Hu", "sequence": "additional"}, {"given": "Wenping", "family": "Kao", "sequence": "additional"}, {"given": "Lanlan", "family": "Wei", "sequence": "additional"}, {"given": "Fengmin", "family": "Zhang", "sequence": "additional"}, {"given": "Dakang", "family": "Xu", "sequence": "additional"}], "is_paratext": false, "journal_name": "The International Journal of Biochemistry & Cell Biology", "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": "1357-2725", "journal_issn_l": "1357-2725", "published_date": "2013-08-01", "best_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false} -{"doi": "10.1021/acsami.8b01074.s004", "year": null, "genre": "component", "is_oa": false, "title": "Solution Coating of Pharmaceutical Nanothin Films and Multilayer Nanocomposites with Controlled Morphology and Polymorphism", "doi_url": "https://doi.org/10.1021/acsami.8b01074.s004", "updated": "2020-04-04T21:02:07.815195", "oa_status": "closed", "publisher": "American Chemical Society (ACS)", "z_authors": null, "is_paratext": false, "journal_name": null, "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": null, "journal_issn_l": null, "published_date": null, "best_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false} +{"doi": "10.1021/acsami.8b01074 .s004", "year": null, "genre": "component", "is_oa": false, "title": "Solution Coating of Pharmaceutical Nanothin Films and Multilayer Nanocomposites with Controlled Morphology and Polymorphism", "doi_url": "https://doi.org/10.1021/acsami.8b01074.s004", "updated": "2020-04-04T21:02:07.815195", "oa_status": "closed", "publisher": "American Chemical Society (ACS)", "z_authors": null, "is_paratext": false, "journal_name": null, "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": null, "journal_issn_l": null, "published_date": null, "best_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false} {"doi": "10.1093/nar/18.18.5552", "year": 1990, "genre": "journal-article", "is_oa": true, "title": "Nucleotide sequence of LTR-gag region of Rous sarcoma virus adapted to semi-permissive host", "doi_url": "https://doi.org/10.1093/nar/18.18.5552", "updated": "2020-02-07T07:59:06.754183", "oa_status": "green", "publisher": "Oxford University Press (OUP)", "z_authors": [{"given": 
"Vladimir I.", "family": "Kashuba", "sequence": "first"}, {"given": "Serge V.", "family": "Zubak", "sequence": "additional"}, {"given": "Vadim M.", "family": "Kavsan", "sequence": "additional"}, {"given": "Alla V.", "family": "Rynditch", "sequence": "additional"}, {"given": "Ivo", "family": "Hlozanek", "sequence": "additional"}], "is_paratext": false, "journal_name": "Nucleic Acids Research", "oa_locations": [{"url": "http://europepmc.org/articles/pmc332244?pdf=render", "pmh_id": "oai:pubmedcentral.nih.gov:332244", "is_best": true, "license": null, "updated": "2017-10-22T11:38:23.025497", "version": "publishedVersion", "evidence": "oa repository (via OAI-PMH doi match)", "host_type": "repository", "endpoint_id": "pubmedcentral.nih.gov", "url_for_pdf": "http://europepmc.org/articles/pmc332244?pdf=render", "url_for_landing_page": "http://europepmc.org/articles/pmc332244", "repository_institution": "pubmedcentral.nih.gov"}, {"url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC332244", "pmh_id": null, "is_best": false, "license": null, "updated": "2020-04-24T18:18:02.810779", "version": "publishedVersion", "evidence": "oa repository (via pmcid lookup)", "host_type": "repository", "endpoint_id": null, "url_for_pdf": null, "url_for_landing_page": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC332244", "repository_institution": null}], "data_standard": 2, "journal_is_oa": false, "journal_issns": "0305-1048,1362-4962", "journal_issn_l": "0305-1048", "published_date": "1990-01-01", "best_oa_location": {"url": "http://europepmc.org/articles/pmc332244?pdf=render", "pmh_id": "oai:pubmedcentral.nih.gov:332244", "is_best": true, "license": null, "updated": "2017-10-22T11:38:23.025497", "version": "publishedVersion", "evidence": "oa repository (via OAI-PMH doi match)", "host_type": "repository", "endpoint_id": "pubmedcentral.nih.gov", "url_for_pdf": "http://europepmc.org/articles/pmc332244?pdf=render", "url_for_landing_page": "http://europepmc.org/articles/pmc332244", "repository_institution": "pubmedcentral.nih.gov"}, "journal_is_in_doaj": false, "has_repository_copy": true} {"doi": "10.1021/acsami.8b01294.s001", "year": null, "genre": "component", "is_oa": true, "title": "Highly Elastic Biodegradable Single-Network Hydrogel for Cell Printing", "doi_url": "https://doi.org/10.1021/acsami.8b01294.s001", "updated": "2020-04-04T22:12:53.813586", "oa_status": "bronze", "publisher": "American Chemical Society (ACS)", "z_authors": null, "is_paratext": false, "journal_name": null, "oa_locations": [{"url": "https://doi.org/10.1021/acsami.8b01294.s001", "pmh_id": null, "is_best": true, "license": null, "updated": "2020-04-04T22:11:06.757648", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "https://doi.org/10.1021/acsami.8b01294.s001", "url_for_landing_page": null, "repository_institution": null}, {"url": "http://europepmc.org/articles/pmc5876623?pdf=render", "pmh_id": "oai:pubmedcentral.nih.gov:5876623", "is_best": false, "license": "acs-specific: authorchoice/editors choice usage agreement", "updated": "2020-02-19T13:50:59.876849", "version": "publishedVersion", "evidence": "oa repository (via OAI-PMH title match)", "host_type": "repository", "endpoint_id": "ac9de7698155b820de7", "url_for_pdf": "http://europepmc.org/articles/pmc5876623?pdf=render", "url_for_landing_page": "http://europepmc.org/articles/pmc5876623", "repository_institution": "National Institutes of Health (USA) - US National Library of Medicine"}], "data_standard": 
2, "journal_is_oa": false, "journal_issns": null, "journal_issn_l": null, "published_date": null, "best_oa_location": {"url": "https://doi.org/10.1021/acsami.8b01294.s001", "pmh_id": null, "is_best": true, "license": null, "updated": "2020-04-04T22:11:06.757648", "version": "publishedVersion", "evidence": "open (via free pdf)", "host_type": "publisher", "endpoint_id": null, "url_for_pdf": "https://doi.org/10.1021/acsami.8b01294.s001", "url_for_landing_page": null, "repository_institution": null}, "journal_is_in_doaj": false, "has_repository_copy": true} {"doi": "10.1097/scs.0b013e3181ef67ba", "year": 2010, "genre": "journal-article", "is_oa": false, "title": "Anomaly of the Internal Carotid Artery Detected During Tonsillectomy", "doi_url": "https://doi.org/10.1097/scs.0b013e3181ef67ba", "updated": "2020-02-10T19:05:26.462040", "oa_status": "closed", "publisher": "Ovid Technologies (Wolters Kluwer Health)", "z_authors": [{"given": "Serdar", "family": "Ceylan", "sequence": "first"}, {"given": "Serkan", "family": "Salman", "sequence": "additional"}, {"given": "Fatih", "family": "Bora", "sequence": "additional"}], "is_paratext": false, "journal_name": "Journal of Craniofacial Surgery", "oa_locations": [], "data_standard": 2, "journal_is_oa": false, "journal_issns": "1049-2275", "journal_issn_l": "1049-2275", "published_date": "2010-09-01", "best_oa_location": null, "journal_is_in_doaj": false, "has_repository_copy": false} From bc3434764361747df2fb35ed68ac4dc75d058ced Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 30 Jun 2021 14:37:08 +0200 Subject: [PATCH 14/26] added assertions to verify doi normalization --- .../dnetlib/doiboost/uw/UnpayWallMappingTest.scala | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala index 6688fc616..fa696fffc 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala @@ -20,16 +20,26 @@ class UnpayWallMappingTest { val Ilist = Source.fromInputStream(getClass.getResourceAsStream("input.json")).mkString - + var i:Int = 0 for (line <-Ilist.lines) { val p = UnpayWallToOAF.convertToOAF(line) if(p!= null) { assertTrue(p.getInstance().size()==1) + if (i== 0){ + assertTrue(p.getPid.get(0).getValue.equals("10.1038/2211089b0")) + } + if (i== 1){ + assertTrue(p.getPid.get(0).getValue.equals("10.1021/acs.bioconjchem.8b00058.s001")) + } + if (i== 2){ + assertTrue(p.getPid.get(0).getValue.equals("10.1021/acs.bioconjchem.8b00086.s001")) + } logger.info(s"ID : ${p.getId}") } assertNotNull(line) assertTrue(line.nonEmpty) + i = i+1 } @@ -39,7 +49,9 @@ class UnpayWallMappingTest { val item = UnpayWallToOAF.convertToOAF(l) assertEquals(item.getInstance().get(0).getAccessright.getOpenAccessRoute, OpenAccessRoute.bronze) + logger.info(mapper.writeValueAsString(item)) + } } From 89e6f466822b2897d1a48444accbc1afae5ce13d Mon Sep 17 00:00:00 2001 From: antleb Date: Mon, 5 Jul 2021 12:00:00 +0300 Subject: [PATCH 15/26] using organization ids instead of names in monitor db creation --- .../oozie_app/scripts/step20-createMonitorDB.sql | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index e01ee1e90..7442b7c10 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -17,11 +17,11 @@ create table TARGET.result as union all select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) union all - select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on p.id=rp.project join SOURCE.project_organizations po on po.id=p.id join SOURCE.organization o on o.id=po.organization where rp.id=r.id and o.name in ( - 'GEORG-AUGUST-UNIVERSITAT GOTTINGEN STIFTUNG OFFENTLICHEN RECHTS', - 'ATHINA-EREVNITIKO KENTRO KAINOTOMIAS STIS TECHNOLOGIES TIS PLIROFORIAS, TON EPIKOINONION KAI TIS GNOSIS', - 'Consiglio Nazionale delle Ricerche', - 'Universidade do Minho') )) foo; + select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on p.id=rp.project join SOURCE.project_organizations po on po.id=p.id where rp.id=r.id and po.organization in ( + 'openorgs____::759d59f05d77188faee99b7493b46805', + 'openorgs____::b84450f9864182c67b8611b5593f4250', + 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', + 'openorgs____::eadc8da90a546e98c03f896661a2e4d4') )) foo; compute stats TARGET.result; create table TARGET.result_citations as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); @@ -130,4 +130,4 @@ compute stats TARGET.result_denorm; alter table TARGET.result_denorm rename to TARGET.result; drop table TARGET.res_tmp; ---- done! \ No newline at end of file +--- done! From 0892cad4e86cab1fdbdcf9a48bd2b26669cce8a1 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 5 Jul 2021 16:21:42 +0200 Subject: [PATCH 16/26] the normalization of the content of value was not visible outside the block. 
Moved doi normalization operation while returning value --- .../main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala index 0f6dc4885..1cd3f7028 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala @@ -83,9 +83,8 @@ object ORCIDToOAF { JObject(extIds) <- json \ "workDetail" \"extIds" JField("type", JString(typeValue)) <- extIds JField("value", JString(value)) <- extIds - normalized_value: String = DoiBoostMappingUtil.normalizeDoi(value) - if "doi".equalsIgnoreCase(typeValue) && normalized_value != null - } yield (typeValue, normalized_value) + if "doi".equalsIgnoreCase(typeValue) + } yield (typeValue, DoiBoostMappingUtil.normalizeDoi(value)) if (doi.nonEmpty) { return doi.map(l =>OrcidWork(oid, l._2)) } @@ -103,7 +102,7 @@ object ORCIDToOAF { def convertTOOAF(input:ORCIDItem) :Publication = { val doi = input.doi val pub:Publication = new Publication - pub.setPid(List(createSP(doi.toLowerCase, "doi", ModelConstants.DNET_PID_TYPES)).asJava) + pub.setPid(List(createSP(doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava) pub.setDataInfo(generateDataInfo()) pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub)) From 7177c252616dd1e48aafb85ebfb5dff229d72444 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 5 Jul 2021 16:22:38 +0200 Subject: [PATCH 17/26] added check for null value during doi normalization --- .../src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala index 1baf55b89..12e4ac379 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala @@ -358,6 +358,8 @@ object DoiBoostMappingUtil { def isEmpty(x: String) = x == null || x.trim.isEmpty def normalizeDoi(input : String) :String ={ + if(input == null) + return null val replaced = input.replaceAll("(?:\\n|\\r|\\t|\\s)", "").toLowerCase.replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX) if (isEmpty(replaced)) return null From 70ded407bbefbc3c777fc82697c08a7254a64df8 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 5 Jul 2021 18:04:30 +0200 Subject: [PATCH 18/26] HttpClient used in metadata collection retries also on 404 --- .../eu/dnetlib/dhp/collection/HttpConnector2.java | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpConnector2.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpConnector2.java index a84b26955..a61e2032c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpConnector2.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpConnector2.java @@ -131,18 +131,9 @@ public class HttpConnector2 { } return attemptDownload(newUrl, retryNumber + 1, report); } - if (is4xx(urlConn.getResponseCode())) { - // CLIENT ERROR, DO NOT RETRY - report - .put( - REPORT_PREFIX + 
urlConn.getResponseCode(), - String - .format( - "%s error: %s", requestUrl, urlConn.getResponseMessage())); - throw new CollectorException("4xx error: request will not be repeated. " + report); - } - if (is5xx(urlConn.getResponseCode())) { + if (is4xx(urlConn.getResponseCode()) || is5xx(urlConn.getResponseCode())) { switch (urlConn.getResponseCode()) { + case HttpURLConnection.HTTP_NOT_FOUND: case HttpURLConnection.HTTP_BAD_GATEWAY: case HttpURLConnection.HTTP_UNAVAILABLE: case HttpURLConnection.HTTP_GATEWAY_TIMEOUT: From 35318027108717ede95b934cd68d691dbc4e20a9 Mon Sep 17 00:00:00 2001 From: Andreas Czerniak Date: Tue, 6 Jul 2021 11:30:56 +0200 Subject: [PATCH 19/26] to solve the scala SI-3623 --- dhp-workflows/dhp-graph-mapper/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index 135516e3e..19febb9ed 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -35,7 +35,7 @@ -Xmax-classfile-name - 140 + 200 ${scala.version} From f580cb77e101489ca3ba33ab40ea1fa6f3a84b0d Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 6 Jul 2021 21:11:11 +0200 Subject: [PATCH 20/26] added mapping for claim relation 'resultResult_publicationDataset_isRelatedTo' (present on BETA) --- .../dhp/oa/graph/raw/MigrateDbEntitiesApplication.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index 1a55499c1..8f1a97984 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -529,6 +529,15 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i r2.setSubRelType(OUTCOME); r2.setRelClass(IS_PRODUCED_BY); break; + case "resultResult_publicationDataset_isRelatedTo": + r1.setRelClass(RESULT_RESULT); + r1.setSubRelType(PUBLICATION_DATASET); + r1.setRelClass(IS_RELATED_TO); + + r2.setRelClass(RESULT_RESULT); + r2.setSubRelType(PUBLICATION_DATASET); + r2.setRelClass(IS_RELATED_TO); + break; default: throw new IllegalArgumentException("claim semantics not managed: " + semantics); } From ebf3f47a02a0cfe8cb302a2816c054e01d7884c3 Mon Sep 17 00:00:00 2001 From: Andreas Czerniak Date: Wed, 7 Jul 2021 09:29:49 +0200 Subject: [PATCH 21/26] from&until more OAI2.0 compl., adding tfs --- .../plugin/oai/OaiCollectorPlugin.java | 4 +- .../collection/plugin/oai/OaiIterator.java | 4 +- ...c_cleaning_OPENAIREplus_compliant_hal_orig | 143 +++++ ..._cleaning_OpenAIREplus_compliant_doaj_orig | 140 +++++ .../xslt_cleaning_datarepo_datacite_orig.xsl | 0 ...e_datacite_ExchangeLandingpagePid_orig.xsl | 0 .../original/xslt_nlm2oaf_journal.fi_orig.xsl | 492 +++++++++++++++++ .../original/xslt_nlm2oaf_us-pmc_orig.xsl | 437 ++++++++++++++++ .../scripts/xslt_nlm2oaf_journal.fi.xsl | 493 ++++++++++++++++++ .../transform/scripts/xslt_nlm2oaf_us-pmc.xsl | 373 +++++++++++++ 10 files changed, 2082 insertions(+), 4 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/original/dc_cleaning_OPENAIREplus_compliant_hal_orig create mode 100644 
dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/original/dc_cleaning_OpenAIREplus_compliant_doaj_orig rename dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/{ => original}/xslt_cleaning_datarepo_datacite_orig.xsl (100%) rename dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/{ => original}/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid_orig.xsl (100%) create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/original/xslt_nlm2oaf_journal.fi_orig.xsl create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/original/xslt_nlm2oaf_us-pmc_orig.xsl create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_nlm2oaf_journal.fi.xsl create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_nlm2oaf_us-pmc.xsl diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java index 4600562ca..67fd352a3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java @@ -62,11 +62,11 @@ public class OaiCollectorPlugin implements CollectorPlugin { throw new CollectorException("Param 'mdFormat' is null or empty"); } - if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { + if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}") && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z")) { throw new CollectorException("Invalid date (YYYY-MM-DD): " + fromDate); } - if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { + if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}") && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z")) { throw new CollectorException("Invalid date (YYYY-MM-DD): " + untilDate); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index 12d313108..c044e02db 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -107,10 +107,10 @@ public class OaiIterator implements Iterator { if (set != null && !set.isEmpty()) { url += "&set=" + URLEncoder.encode(set, "UTF-8"); } - if (fromDate != null && fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { + if (fromDate != null && (fromDate.matches("\\d{4}-\\d{2}-\\d{2}") || fromDate.matches("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z"))) { url += "&from=" + URLEncoder.encode(fromDate, "UTF-8"); } - if (untilDate != null && untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { + if (untilDate != null && (untilDate.matches("\\d{4}-\\d{2}-\\d{2}") || untilDate.matches("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z"))) { url += "&until=" + URLEncoder.encode(untilDate, "UTF-8"); } log.info("Start harvesting using url: " + url); diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/original/dc_cleaning_OPENAIREplus_compliant_hal_orig 
b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/original/dc_cleaning_OPENAIREplus_compliant_hal_orig new file mode 100644 index 000000000..d4eb71dc4 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/original/dc_cleaning_OPENAIREplus_compliant_hal_orig @@ -0,0 +1,143 @@ +// from PROD 2021-07-06 , tf script of HAL with around 3mill. records +declare_script "dc_cleaning_OpenAIREplus_compliant_hal"; +declare_ns oaf = "http://namespace.openaire.eu/oaf"; +declare_ns dri = "http://www.driver-repository.eu/namespace/dri"; +declare_ns dr = "http://www.driver-repository.eu/namespace/dr"; +declare_ns dc = "http://purl.org/dc/elements/1.1/"; +declare_ns prov = "http://www.openarchives.org/OAI/2.0/provenance"; +declare_ns oai = "http://www.openarchives.org/OAI/2.0/"; +declare_ns xs = "http://www.w3.org/2001/XMLSchema"; +$var0 = "''"; +$varFP7 = "'corda_______::'"; +$varH2020 = "'corda__h2020::'"; +$varDummy = "''"; +static $varDatasourceid = getValue(PROFILEFIELD, [xpath:"concat('collection(&apos;/db/DRIVER/RepositoryServiceResources&apos;)//RESOURCE_PROFILE[.//EXTRA_FIELDS/FIELD[key=&quot;NamespacePrefix&quot;][value=&quot;', //oaf:datasourceprefix, '&quot;]]')", xpath:"//EXTRA_FIELDS/FIELD[key='OpenAireDataSourceId']/value"]); +static $varRepoid = xpath:"//dri:repositoryId"; +static $varOfficialname = getValue(PROFILEFIELD, [xpath:"concat('collection(&apos;/db/DRIVER/RepositoryServiceResources&apos;)//RESOURCE_PROFILE[.//EXTRA_FIELDS/FIELD[key=&quot;NamespacePrefix&quot;][value=&quot;', //oaf:datasourceprefix, '&quot;]]')", xpath:"//CONFIGURATION/OFFICIAL_NAME"]); +dri:objIdentifier = xpath:"//dri:objIdentifier"; +dri:repositoryId = $varRepoid; +dri:recordIdentifier = xpath:"//dri:recordIdentifier"; +// +// communities - deactivated until received green light from DARIAH to mark community on prod also +// $varCommunity = xpath:"//*[local-name()='setSpec'][starts-with(., 'collection:DARIAH')]/'dariah'"; +// concept should not appear with empty attribute id, i.e when there is no community - ugly, but seems to work (oaf:datasourceprefix = just any field available in all records) +// oaf:concept = set(xpath:"//oaf:datasourceprefix[string-length($varCommunity) gt 0]/''", @id = $varCommunity;); +// +// apply xpath:"//dc:contributor[starts-with(., 'European Project')]" if xpath:"string-length(replace(., '.*(\d{6,6}).*', '$1')) = 6" oaf:projectid = xpath:"concat($var1, replace(., '.*(\d{6,6}).*', '$1'))"; else $varDummy = "''"; +apply xpath:"//dc:creator" if xpath:"string-length(.) > 0 and normalize-space(.) != ','" dc:creator = xpath:"normalize-space(.)"; else $varDummy = "''"; +if xpath:"//dc:title[string-length(.)> 0]" $varDummy = "''"; else dc:coverage = skipRecord(); +dc:title = xpath:"//dc:title[string-length(.) > 0]/normalize-space(.)"; +apply xpath:"//dc:subject" if xpath:"string-length(.) > 0" dc:subject = xpath:"normalize-space(.)"; else $varDummy = "''"; +apply xpath:"//dc:publisher" if xpath:"string-length(.) > 0" dc:publisher = xpath:"normalize-space(.)"; else $varDummy = "''"; +apply xpath:"//dc:source" if xpath:"string-length(.) 
> 0" dc:source = xpath:"normalize-space(.)"; else $varDummy = "''"; +dc:contributor = xpath:"//dc:contributor"; +// dc:description = xpath:"//dc:description/normalize-space(.)"; +//dc:description = xpath:"string-join(//dc:description/normalize-space(.), concat('; ',codepoints-to-string(10)))"; +dc:description = xpath:"string-join(//dc:description/normalize-space(.), '; ')"; +dc:format = xpath:"//dc:format"; +$varHttpTest = "''"; +oaf:fulltext = xpath:"//dc:identifier[starts-with(., 'http') and (ends-with(., 'document') or ends-with(., 'pdf'))]"; +//if xpath:"//dc:identifier[starts-with(., 'http') and (ends-with(., 'document') or ends-with(., 'pdf'))] or //dc:relation[starts-with(lower-case(normalize-space(.)), 'info:eu-repo/grantagreement')] or //dc:rights[starts-with(lower-case(normalize-space(.)), 'open') or contains(lower-case(normalize-space(.)), 'openaccess')] or //dc:accessRights[contains(lower-case(normalize-space(.)), 'openaccess')]" $var0 = "''"; else dc:coverage = skipRecord(); +if xpath:"//dc:identifier[starts-with(., 'http')]" $var0 = "''"; else dc:coverage = skipRecord(); +apply xpath:"//dc:identifier" if xpath:"starts-with(normalize-space(.), 'http')" dc:identifier = xpath:"normalize-space(.)"; else dr:CobjIdentifier = xpath:"normalize-space(.)"; +dr:dateOfCollection = xpath:"//dri:dateOfCollection"; +static dr:dateOfTransformation = xpath:"current-dateTime()"; +dc:type = xpath:"//dc:type"; +dc:format = xpath:"//dc:format"; +dc:date = xpath:"//dc:date"; +dc:language = Convert(xpath:"//dc:language", Languages); +$varDateAccepted = Convert(xpath:"descendant-or-self::dc:date", DateISO8601, "yyyy-MM-dd", "min()"); +if xpath:"starts-with($varDateAccepted, '0')" oaf:dateAccepted = $varDummy; else oaf:dateAccepted = $varDateAccepted; +$varEmbargoEnd = xpath:"//dc:date[matches(normalize-space(.), '(.*)(info:eu-repo/date/embargoEnd/)(\d\d\d\d-\d\d-\d\d)', 'i')][contains(lower-case(.), 'info:eu-repo')]/replace(normalize-space(.), '(.*)(info:eu-repo/date/embargoEnd/)(\d\d\d\d-\d\d-\d\d)', '$3', 'i')"; +oaf:embargoenddate = $varEmbargoEnd; +// FP7 +oaf:projectid = xpath:"distinct-values(//dc:relation[matches(normalize-space(.), '(.*)(info:eu-repo/grantagreement[/]+ec/fp7/)(\d\d\d\d\d\d)(.*)', 'i')][year-from-date(xs:date(max(($varDateAccepted, '0001-01-01')))) gt 2006][contains(lower-case(.), 'info:eu-repo')]/concat($varFP7, replace(normalize-space(.), '(.*)(info:eu-repo/grantagreement[/]+ec/fp7/)(\d\d\d\d\d\d)(.*)', '$3', 'i')))"; +// H2020 +oaf:projectid = xpath:"distinct-values(//dc:relation[matches(normalize-space(.), '(.*)(info:eu-repo/grantagreement[/]+ec/h2020/)(\d\d\d\d\d\d)(.*)', 'i')][year-from-date(xs:date(max(($varDateAccepted, '0001-01-01')))) gt 2012][contains(lower-case(.), 'info:eu-repo')]/concat($varH2020, replace(normalize-space(.), '(.*)(info:eu-repo/grantagreement[/]+ec/h2020/)(\d\d\d\d\d\d)(.*)', '$3', 'i')))"; +// H2020 workaround for HAL +oaf:projectid = xpath:"distinct-values(//dc:relation[matches(normalize-space(.), '(.*)(info:eu-repo/grantagreement//)(\d\d\d\d\d\d)(.*)', 'i')][//dc:contributor[contains(lower-case(.), 'h2020')]][year-from-date(xs:date(max(($varDateAccepted, '0001-01-01')))) gt 2012][contains(lower-case(.), 'info:eu-repo')]/concat($varH2020, replace(normalize-space(.), '(.*)(info:eu-repo/grantagreement//)(\d\d\d\d\d\d)(.*)', '$3', 'i')))"; +dc:relation = xpath:"//dc:relation"; +//comment-js-09-10-2012 apply xpath:"//dc:rights" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/semantics')" dc:rights = empty; else dc:rights = 
xpath:"normalize-space(.)"; +// +oaf:collectedDatasourceid = xpath:"$varDatasourceid"; +// +//if xpath:"//dc:type[1]/lower-case(.) = 'text'" dr:CobjCategory = Convert(xpath:"reverse(//dc:type) | //oai:setSpec", TextTypologies); else dr:CobjCategory = Convert(xpath:"//dc:type | //oai:setSpec", TextTypologies); +$varCobjCategoryReverse = Convert(xpath:"insert-before(reverse(//dc:type) , 0, reverse(//oai:setSpec))", TextTypologies); +$varSuperTypeReverse = Convert(xpath:"normalize-space($varCobjCategoryReverse)", SuperTypes); +dr:CobjCategory = set(xpath:"//oaf:datasourceprefix[//dc:type[1]/lower-case(.) = ('text', 'info:eu-repo/semantics/other', 'other') or //oaf:datasourceprefix/lower-case(.) = 'openedition_']/$varCobjCategoryReverse", @type = $varSuperTypeReverse;); +$varCobjCategoryStraight = Convert(xpath:"insert-before(//dc:type , 100, //oai:setSpec)", TextTypologies); +$varSuperTypeStraight = Convert(xpath:"normalize-space($varCobjCategoryStraight)", SuperTypes); +dr:CobjCategory = set(xpath:"//oaf:datasourceprefix[not(//dc:type[1]/lower-case(.) = ('text', 'info:eu-repo/semantics/other', 'other'))]/$varCobjCategoryStraight", @type = $varSuperTypeStraight;); +// +// review level +// oaf:refereed = Convert(xpath:"//dc:description", ReviewLevels); +$varRefereedConvt = Convert(xpath:"(//dc:type, //oai:setSpec, //dc:description)", ReviewLevels); +$varRefereedDesct = xpath:"(//dc:description[matches(lower-case(.), '.*(this\s*book|it)\s*constitutes\s*the\s*(thoroughly\s*)?refereed') or matches(lower-case(.), '.*peer[\.\-_/\s\(\)]?review\s*under\s*responsibility\s*of.*')]/'0001', //dc:description[matches(., '^version\s*(préliminaire.*|0$)')]/'0002')"; +$varRefereedIdntf = xpath:"(//*[string(node-name(.)) = 'dc:identifier' and matches(lower-case(.), '(^|.*[\.\-_/\s\(\)])pre[\.\-_/\s\(\)]?prints?([\.\-_/\s\(\)].*)?$')][count(//dc:identifier) = 1]/'0002', //*[string(node-name(.)) = 'dc:identifier' and matches(lower-case(.), '(^|.*[\.\-_/\s\(\)])refereed([\.\-_/\s\(\)\d].*)?$')]/'0001', //*[string(node-name(.)) = 'dc:identifier' and contains(lower-case(.), '-peer-reviewed-article-')]/'0001')"; +$varRefereed = xpath:"($varRefereedConvt, $varRefereedIdntf, $varRefereedDesct)"; +if xpath:"count(index-of($varRefereed, '0001')) >0" oaf:refereed = xpath:"'0001'"; else $varDummy= "''"; +if xpath:"count(index-of($varRefereed, '0002')) >0 and count(index-of($varRefereed, '0001')) = 0" oaf:refereed = xpath:"'0002'"; else $varDummy= "''"; +// +apply xpath:"//dc:rights" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/semantics') and (xs:date( max( ($varEmbargoEnd, '0001-01-01') ) ) gt current-date())" oaf:accessrights = Convert(xpath:"normalize-space(.)", AccessRights); else dc:rights = xpath:"."; +// apply xpath:"//dc:rights" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/semantics') " oaf:accessrights = Convert(xpath:"normalize-space(.)", AccessRights); else dc:rights = xpath:"."; +//2021-06-01 ; acz ; next line to avoid to be OPEN as default, set to UNKNOWN , 2021-07-05 acz +//if xpath:"//dc:rights[starts-with(normalize-space(.), 'info:eu-repo/semantics') and not(xs:date( max( ($varEmbargoEnd, '0001-01-01') ) ) lt current-date())]" $var0 = "''"; else oaf:accessrights = "UNKNOWN"; +oaf:license = xpath:"//dc:rights[starts-with(., 'http') or matches(., '^CC[- ]BY([- ](NC([- ](ND|SA))?|ND|SA))([- ]\d(\.\d)?)?$', 'i')]"; +// +static oaf:collectedFrom = set("''", @name = $varOfficialname; , @id = $varDatasourceid;); +static oaf:hostedBy = set("''", @name = $varOfficialname; , @id = 
$varDatasourceid;); +// +//$varId = identifierExtract('["//dc:identifier", "//dc:relation"]' , xpath:"./*[local-name()='record']" , '(10[.][0-9]{4,}[^\s"/<>]*/[^\s"<>]+)'); +$varIdDoi = identifierExtract('["//dc:identifier[starts-with(., \"info:\") or starts-with(., \"urn:\") or starts-with(., \"doi:\") or starts-with(., \"DOI:\") or starts-with(., \"Doi:\") or starts-with(., \"doi \") or starts-with(., \"DOI \") or starts-with(., \"Doi \") or starts-with(., \"10.\") or ((starts-with(., \"http\")) and contains(., \"doi.org/10.\"))]", "//dc:relation[starts-with(., \"info:eu-repo/semantics/altIdentifier/doi/10.\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/DOI/10.\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/Doi/10.\") or ((starts-with(., \"info:eu-repo/semantics/altIdentifier/url/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/purl/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/urn/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/doi/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/DOI/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/Doi/http\")) and contains(., \"doi.org/10.\"))]"]' , xpath:"./*[local-name()='record']" , '(10[.][0-9]{4,}[^\s"/<>]*/[^\s"<>]+)'); + +$varIdHdl = identifierExtract('["//dc:identifier[starts-with(., \"HDL:\") and not(starts-with(., \"HDL: http\"))][not(contains(., \"123456789\"))]", "//dc:relation[starts-with(normalize-space(.), \"info:eu-repo/semantics/altIdentifier/hdl/\") or (starts-with(normalize-space(.), \"info:eu-repo/semantics/altIdentifier/url/\") and contains(., \"://hdl.handle.net/\"))]"]' , xpath:"./*[local-name()='record']" , '(?!(info:hdl:|://hdl.handle.net/|info:eu-repo/semantics/altIdentifier/hdl/))(\d.*)'); + + +$varIdIsbn = xpath:"(//dc:identifier, //dc:source)[starts-with(lower-case(.), 'isbn') or starts-with(., '978') or starts-with(., '979')][(matches(., '(isbn[:\s]*)?97[89]-\d+-\d+-\d+-\d+$', 'i') and string-length(concat('97', substring-after(., '97'))) = 17) or matches(., '(isbn[:\s]*)?97[89]\d{10}$', 'i')]/replace(., 'isbn[:\s]*', '', 'i'), //dc:relation[starts-with(lower-case(.), 'info:eu-repo/semantics/altidentifier/isbn/')][(matches(., 'info:eu-repo/semantics/altIdentifier/isbn/97[89]-\d+-\d+-\d+-\d+$', 'i') and string-length(.) 
= 59) or matches(., 'info:eu-repo/semantics/altidentifier/isbn/97[89]\d{10}$', 'i')]/substring-after(lower-case(.), 'info:eu-repo/semantics/altidentifier/isbn/')"; + +$varIdBibc = identifierExtract('["//dc:identifier[starts-with(., \"BibCode:\") or starts-with(., \"BIBCODE:\") or (starts-with(., \"http:\") and contains(., \"bibcode=\"))]"]' , xpath:"./*[local-name()='record']" , '(^(BibCode:|BIBCODE:|http).*$)'); + +$varIdPtnt = identifierExtract('["//dc:identifier[starts-with(., \"Patent N°:\")]"]' , xpath:"./*[local-name()='record']" , '(^Patent N°:.*$)'); + +$varPmId = identifierExtract('["//dc:identifier[starts-with(normalize-space(.), \"PUBMED:\")]"]' , xpath:"./*[local-name()='record']" , '(?!PUBMED: )(\d+)'); + +$varIdPmc = identifierExtract('["//dc:identifier[starts-with(., \"PUBMEDCENTRAL:\") or (starts-with(., \"http\") and contains(., \"://www.ncbi.nlm.nih.gov/pmc/articles/PMC\"))]", "//dc:relation[starts-with(., \"info:eu-repo/semantics/altIdentifier/pmid/PMC\") or ((starts-with(., \"info:eu-repo/semantics/altIdentifier/url/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/purl/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/urn/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/pmid/http\")) and contains(., \"://www.ncbi.nlm.nih.gov/pmc/articles/PMC\"))]"]' , xpath:"./*[local-name()='record']" , '(PMC\d+)'); + +//$varIdHal = identifierExtract('["//dc:identifier[starts-with(., \"ads-\") or starts-with(., \"anses-\") or starts-with(., \"artxibo-\") or starts-with(., \"bioemco-\") or starts-with(., \"cea-\") or starts-with(., \"cel-\") or starts-with(., \"cirad-\") or starts-with(., \"edutice-\") or starts-with(., \"emse-\") or starts-with(., \"EMSE-\") or starts-with(., \"ensl-\") or starts-with(., \"hal-\") or starts-with(., \"HAL-\") or starts-with(., \"halsde-\") or starts-with(., \"halshs-\") or starts-with(., \"hprints-\") or starts-with(., \"in2p3-\") or starts-with(., \"ineris-\") or starts-with(., \"inria-\") or starts-with(., \"Inria-\") or starts-with(., \"inserm-\") or starts-with(., \"insu-\") or starts-with(., \"INSU-\") or starts-with(., \"ird-\") or starts-with(., \"irsn-\") or starts-with(., \"jpa-\") or starts-with(., \"lirmm-\") or starts-with(., \"medihal-\") or starts-with(., \"meteo-\") or starts-with(., \"mnhn-\") or starts-with(., \"obspm-\") or starts-with(., \"pastel-\") or starts-with(., \"pasteur-\") or starts-with(., \"Pasteur-\") or starts-with(., \"peer-\") or starts-with(., \"ssa-\") or starts-with(., \"tel-\") or starts-with(., \"ujm-\") or starts-with(., \"ijn_\") or starts-with(., \"sic_\") or (starts-with(., \"http\") and (contains(., \"://hal.archives-ouvertes.fr/hal\") or contains(., \"://halshs.archives-ouvertes.fr/hal\") or contains(., \"://halsde.archives-ouvertes.fr/hal\") or contains(., \"://medihal.archives-ouvertes.fr/hal\")))]", "//dc:relation[((starts-with(., \"info:eu-repo/semantics/altIdentifier/url/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/purl/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/urn/http\")) and (contains(., \"://hal.archives-ouvertes.fr/hal\") or contains(., \"://halshs.archives-ouvertes.fr/hal\") or contains(., \"://halsde.archives-ouvertes.fr/hal\")))]"]' , xpath:"./*[local-name()='record']" , '((ads|anses|artxibo|bioemco|cea|cel|cirad|edutice|emse|EMSE|ensl|hal|HAL|halsde|halshs|hprints|in2p3|ineris|inria|Inria|inserm|insu|INSU|ird|irsn|jpa|lirmm|medihal|meteo|mnhn|obspm|pastel|pasteur|Pasteur|peer|ssa|tel|ujm)-|(ijn|sic)_).*'); 
+$varIdHal = identifierExtract('["//*[local-name() = \"recordIdentifier\"]"]' , xpath:"./*[local-name()='record']" , '(oai:HAL:.*)'); + +$varIdArxv = identifierExtract('["//dc:identifier[((starts-with(., \"http\") or starts-with(., \"ArXiv: http\")) and (contains(., \"://arxiv.org/abs/\") or contains(., \"://arxiv.org/pdf/\"))) or starts-with(., \"arXiv:\") or starts-with(., \"ARXIV:\")]", "//dc:relation[(starts-with(normalize-space(.), \"info:eu-repo/semantics/altIdentifier/arxiv/\") and not(contains(., \"/arxiv/http\"))) or ((starts-with(., \"info:eu-repo/semantics/altIdentifier/url/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/purl/http\") or starts-with(., \"info:eu-repo/semantics/altIdentifier/urn/http\") or starts-with(normalize-space(.), \"info:eu-repo/semantics/altIdentifier/arxiv/http\")) and (contains(., \"://arxiv.org/abs/\") or contains(., \"://arxiv.org/pdf/\")))]"]' , xpath:"./*[local-name()='record']" , '(?!(://arxiv.org/abs/|:eu-repo/semantics/altIdentifier/arxiv/))([a-zA-Z].*)'); + +$varIdWos = identifierExtract('["//dc:identifier[starts-with(., \"WOS:\") or starts-with(., \"wos: WOS:\")]", "//dc:relation[starts-with(normalize-space(.), \"info:eu-repo/semantics/altIdentifier/wos/\")]"]' , xpath:"./*[local-name()='record']" , '(info.*|WOS:.+|wos: WOS:.+)'); + +//oaf:identifier = set(xpath:"$varId//value[not[. = '10.1145/nnnnnnn.nnnnnnn']]", @identifierType = "doi";); +oaf:identifier = set(xpath:"$varIdDoi//value[not(. = '10.1145/nnnnnnn.nnnnnnn')]", @identifierType = "doi";); +oaf:identifier = set(xpath:"$varIdHdl//value", @identifierType = "handle";); +oaf:identifier = set(xpath:"$varIdIsbn", @identifierType = "isbn";); + +oaf:identifier = set(xpath:"($varIdBibc//value[not(starts-with(., 'http'))]/replace(., 'BIBCODE:\s*', ''), $varIdBibc//value[starts-with(., 'http') and contains(substring-after(., 'bibcode='), codepoints-to-string(38))]/substring-before(substring-after(., 'bibcode='), codepoints-to-string(38)), $varIdBibc//value[starts-with(., 'http') and not(contains(substring-after(., 'bibcode='), codepoints-to-string(38)))]/substring-after(., 'bibcode='))", @identifierType = "bibcode";); + +oaf:identifier = set(xpath:"$varIdPtnt//value/normalize-space(substring-after(., 'Patent N°:'))", @identifierType = "patentNumber";); + +oaf:identifier = set(xpath:"$varPmId//value", @identifierType = "pmid";); +oaf:identifier = set(xpath:"$varIdPmc//value", @identifierType = "pmcid";); +//oaf:identifier = set(xpath:"distinct-values($varIdHal//value/replace(., '(/document|/image|/file/.*)$', ''))", @identifierType = "hal";); +oaf:identifier = set(xpath:"distinct-values($varIdHal//value/replace(substring-after(., 'oai:HAL:'), '(v\d*)$', ''))", @identifierType = "hal";); +oaf:identifier = set(xpath:"distinct-values(($varIdArxv//value/normalize-space(replace(., '(https?://arxiv.org/abs/|https?://arxiv.org/pdf/|info:eu-repo/semantics/altIdentifier/arxiv/|info:eu-repo/semantics/altIdentifier/url/|info:eu-repo/semantics/altIdentifier/urn/|arXiv:|\.pdf)', '', 'i'))))", @identifierType = "arxiv";); +oaf:identifier = set(xpath:"$varIdWos//value/normalize-space(replace(., '(info:eu-repo/semantics/altIdentifier/wos/|WOS:|wos:)', ''))", @identifierType = "wos";); + +oaf:identifier = set(xpath:"distinct-values(//dc:identifier[starts-with(., 'http') and contains(., $varIdHal//value/replace(substring-after(., 'oai:HAL:'), '(v\d*)$', ''))]/replace(., '(/document|/image|/file/.*)$', ''))", @identifierType = "landingPage";); +oaf:identifier = 
set(xpath:"distinct-values(//dc:identifier[starts-with(., 'http') and not(ends-with(., $varIdHal//value/replace(substring-after(., 'oai:HAL:'), '(v\d*)$', '')))])", @identifierType = "url";); + +oaf:identifier = set(xpath:"//dri:recordIdentifier", @identifierType = "oai-original";); + +oaf:datasourceprefix = xpath:"//oaf:datasourceprefix"; + +// journal data +// avoiding regular expressions, while a) correcting ISSNs with no - or other letters instead of - and b) ignoring any stuff after the ISSN (as e.g. print/online/...) +$varISSN = xpath:"//dc:source[starts-with(., 'ISSN:') and string-length(.) > 12]/concat(substring(normalize-space(substring-after(., 'ISSN:')), 1, 4), '-', normalize-space(substring-after(., substring(normalize-space(substring-after(., 'ISSN:')), 1, 4))))"; +//$varEISSN = xpath:"//dc:source[starts-with(., 'EISSN:') and string-length(.) > 13]/normalize-space(substring-after(., 'ISSN:'))"; +$varEISSN = xpath:"//dc:source[starts-with(., 'EISSN:') and string-length(.) > 13]/concat(substring(normalize-space(substring-after(., 'EISSN:')), 1, 4), '-', normalize-space(substring-after(., substring(normalize-space(substring-after(., 'EISSN:')), 1, 4))))"; +oaf:journal = set(xpath:"//oaf:datasourceprefix[$varISSN or $varEISSN]/''", @issn = xpath:"$varISSN";, @eissn = xpath:"$varEISSN";); + +end diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/original/dc_cleaning_OpenAIREplus_compliant_doaj_orig b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/original/dc_cleaning_OpenAIREplus_compliant_doaj_orig new file mode 100644 index 000000000..f8bc42a2c --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/original/dc_cleaning_OpenAIREplus_compliant_doaj_orig @@ -0,0 +1,140 @@ +// from PROD 2021-07-06 , tf script of DOAJ with more than 6mill. 
records +declare_script "dc_cleaning_OpenAIREplus_compliant_doaj"; +declare_ns oaf = "http://namespace.openaire.eu/oaf"; +declare_ns dri = "http://www.driver-repository.eu/namespace/dri"; +declare_ns dr = "http://www.driver-repository.eu/namespace/dr"; +declare_ns dc = "http://purl.org/dc/elements/1.1/"; +declare_ns prov = "http://www.openarchives.org/OAI/2.0/provenance"; +$var0 = "''"; +$varFP7 = "'corda_______::'"; +$varH2020 = "'corda__h2020::'"; +$varDummy = "''"; +// $varUnknownRepoId = "'openaire____::55045bd2a65019fd8e6741a755395c8c'"; +// +$varUnknownRepoId = "'openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18'"; +$varUnknownRepoName = "'Unknown Repository'"; +static $varDatasourceid = getValue(PROFILEFIELD, [xpath:"concat('collection(&apos;/db/DRIVER/RepositoryServiceResources&apos;)//RESOURCE_PROFILE[.//EXTRA_FIELDS/FIELD[key=&quot;NamespacePrefix&quot;][value=&quot;', //oaf:datasourceprefix, '&quot;]]')", xpath:"//EXTRA_FIELDS/FIELD[key='OpenAireDataSourceId']/value"]); +static $varRepoid = xpath:"//dri:repositoryId"; +static $varOfficialname = getValue(PROFILEFIELD, [xpath:"concat('collection(&apos;/db/DRIVER/RepositoryServiceResources&apos;)//RESOURCE_PROFILE[.//EXTRA_FIELDS/FIELD[key=&quot;NamespacePrefix&quot;][value=&quot;', //oaf:datasourceprefix, '&quot;]]')", xpath:"//CONFIGURATION/OFFICIAL_NAME"]); +dri:objIdentifier = xpath:"//dri:objIdentifier"; +dri:repositoryId = $varRepoid; +dri:recordIdentifier = xpath:"//dri:recordIdentifier"; + +if xpath:"//dc:creator[string-length(normalize-space(.)) &gt; 0][contains(., 'CDATA')][starts-with(normalize-space(.), '(')][starts-with(normalize-space(.), '.')]" dc:creator = skipRecord(); else $varDummy = "''"; +//apply xpath:"//dc:creator" if xpath:"string-length(normalize-space(.)) &amp;gt; 0 and not(contains(., 'CDATA')) and not(starts-with(normalize-space(.), '.')) and not(starts-with(normalize-space(.), '('))" dc:creator = Convert(xpath:".", Person); else $varDummy = "''"; +if xpath:"count(//dc:creator) = 0" dc:creator = skipRecord(); else $varDummy = "''"; +//apply xpath:"//dc:creator" if xpath:"string-length(.) &gt; 0 and normalize-space(.) != ','" dc:creator = xpath:"normalize-space(.)"; else $varDummy = "''"; +$varOrcidName = xpath:"//dc:creator[string-length(normalize-space(.)) > 0]"; +$varOrcidOrcid = xpath:"//dc:creator[string-length(normalize-space(.)) > 0]/@id/replace(., 'https?://orcid.org/', '')"; +dc:creator = set(xpath:"$varOrcidName", @nameIdentifier = xpath:"subsequence($varOrcidOrcid,position(),1)";, @nameIdentifierScheme=xpath:"replace(subsequence($varOrcidOrcid,position(),1),'^.+$','ORCID')";, @schemeUri=xpath:"replace(subsequence($varOrcidOrcid,position(),1),'^.+$','http://orcid.org/')";); + +if xpath:"count(//dc:title[string-length(.) &gt; 0]) = 0" dc:title = skipRecord(); else $varDummy = "''"; +dc:title = xpath:"//dc:title/normalize-space(replace(., '^(&lt;title language=)(.)*(&gt;)', ''))"; +// apply xpath:"//dc:title" if xpath:"string-length(.) &gt; 0" dc:title = xpath:"normalize-space(.)"; else $varDummy = "''"; + +apply xpath:"//dc:subject" if xpath:"string-length(.) &gt; 0 and not(@xsi:type = 'dcterms:LCSH')" dc:subject = xpath:"normalize-space(.)"; else $varDummy = "''"; +dc:subject = set(xpath:"//dc:subject[@xsi:type = 'dcterms:LCSH']/concat('lcsh:', .)", @classid=xpath:"'lcsh'";, @classname=xpath:"'lcsh'";, @schemeid=xpath:"'dnet:subject_classification_typologies'";, @schemename=xpath:"'dnet:subject_classification_typologies'";); + +apply xpath:"//dc:publisher" if xpath:"string-length(.) 
&gt; 0" dc:publisher = xpath:"normalize-space(replace(., '(&lt;br&gt;)', ''))"; else $varDummy = "''"; +apply xpath:"//dc:source" if xpath:"string-length(.) &gt; 0" dc:source = xpath:"normalize-space(.)"; else $varDummy = "''"; +dc:contributor = xpath:"//dc:contributor"; +dc:description = xpath:"//dc:description[not(starts-with(., 'URN: urn:nbn:') or starts-with(., 'URN: http'))]"; +dc:format = xpath:"//dc:format"; +$varHttpTest = "''"; +if xpath:"//dc:relation[starts-with(., 'http') or starts-with(., 'www.')]" $varHttpTest = "true"; else dc:identifier = skipRecord(); +//apply xpath:"//dc:relation" if xpath:"starts-with(normalize-space(.), 'http')" dc:identifier = xpath:"normalize-space(.)"; else dr:CobjIdentifier = xpath:"normalize-space(.)"; +//apply xpath:"//dc:relation" if xpath:"starts-with(normalize-space(.), 'www.')" dc:identifier = xpath:"concat('http://', normalize-space(.))"; else dr:CobjIdentifier = xpath:"normalize-space(.)"; +dr:CobjIdentifier = xpath:"distinct-values(//dc:identifier[not(starts-with(normalize-space(.), 'http'))][not(normalize-space(.) = ($varIdList))][not(starts-with(normalize-space(.), 'urn:nbn:') or starts-with(normalize-space(.), 'URN:NBN:'))][not(. = ($varISSN[1], $varISSN[2]))][normalize-space(.) != ''])"; +dc:identifier = xpath:"($varIdUrl//value[not(starts-with(., 'www'))], $varIdUrl//value[starts-with(., 'www')]/concat('http://', .), $varIdLdpg//value, $varIdDoi//value)[1]"; +dc:relation = xpath:"//dc:relation[starts-with(., 'https://doaj.org/toc/')]"; + +dr:dateOfCollection = xpath:"//dri:dateOfCollection"; +static dr:dateOfTransformation = xpath:"current-dateTime()"; +// dc:type = xpath:"//dc:type"; +dc:language = Convert(xpath:"//dc:language", Languages); +//if xpath:"//dc:rights[text()='info:eu-repo/semantics/openAccess']" dc:publisher = xpath:"//dc:publisher"; else dc:publisher = skipRecord(); +dc:date = xpath:"//dc:date"; +oaf:dateAccepted = Convert(xpath:"descendant-or-self::dc:date", DateISO8601, "yyyy-MM-dd", "min()"); +apply xpath:"//dc:date" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/date')" oaf:embargoenddate = RegExpr(xpath:"normalize-space(.)", $var0, "s/^(.*info:eu-repo\/date\/embargoEnd\/)//gmi"); else $var0 = "''"; +//apply xpath:"//dc:relation" if xpath:"string-length(substring-after(normalize-space(.), 'info:eu-repo/grantAgreement/EC/FP7/')) = 6" oaf:projectid = RegExpr(xpath:"normalize-space(.)", $var1, "s/^(.*info:eu-repo\/grantAgreement\/EC\/FP7\/)//gmi"); else dc:relation = xpath:"normalize-space(.)"; +//comment-js-09-10-2012 apply xpath:"//dc:rights" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/semantics')" dc:rights = empty; else dc:rights = xpath:"normalize-space(.)"; +// +oaf:collectedDatasourceid = $varDatasourceid; +// +// apply xpath:"//dc:type" if xpath:"." 
dr:CobjCategory = Convert(xpath:"normalize-space(.)", TextTypologies); else dc:type = xpath:"."; +//dr:CobjCategory = "0001"; +$varCobjCategory = Convert(xpath:"//dc:type", TextTypologies); +$varSuperType = Convert(xpath:"normalize-space($varCobjCategory)", SuperTypes); +dr:CobjCategory = set($varCobjCategory, @type = $varSuperType;); +dc:type = xpath:"//dc:type"; +// +// review status + +$varRefereedIdntf = xpath:"(//*[string(node-name(.)) = 'dc:identifier' and matches(., '^(https?://(dx\.)?doi.org/)?10\.12688/(f1000research|wellcomeopenres|aasopenres|gatesopenres|hrbopenres)\.\d*(\.\d*|-\d*\.v\d*)$')]/'0001', //*[string(node-name(.)) = 'dc:relation' and matches(., '^info:eu-repo/semantics/altIdentifier/doi/10\.12688/(f1000research|wellcomeopenres|aasopenres|gatesopenres|hrbopenres)\.\d*(\.\d*|-\d*\.v\d*)$', 'i')]/'0001')"; + +$varRefereedProse = xpath:"(//*[string(node-name(.)) = 'dc:description' and matches(lower-case(.), '.*this\s*preprint\s*has\s*been\s*reviewed\s*and\s*recommended\s*by\s*peer\s*community') and contains(., '10.24072/')]/'0001', //dc:title[matches(lower-case(.), '.*\[.*peer[\s\-\._]*review\s*:.*\]\s*$')]/'0001')"; +$varRefereedReltn = xpath:"(//dc:relation, //dc:identifier)[contains(., '://www.dovepress.com/') and matches(lower-case(.), '.*-peer-reviewed-(fulltext-)?article-.*')]/'0001'"; +$varRefereedTitle = xpath:"//dc:title[matches(lower-case(.), '.*\[.*peer[\s\-\._]*review\s*:.*\]\s*$')]/'0001'"; +$varRefereedDesct = xpath:"(//dc:description[matches(lower-case(.), '.*peer[\.\-_/\s\(\)]?review\s*under\s*responsibility\s*of.*') or matches(lower-case(.), '(this|a)\s*(article|preprint)\s*(has\s*been\s*)?(peer[\-\s]*)?reviewed\s*and\s*recommended\s*by\s*peer[\-\s]*community')]/'0001')"; +$varRefereed = xpath:"($varRefereedIdntf, $varRefereedProse, $varRefereedReltn, $varRefereedTitle, $varRefereedDesct)"; +//if xpath:"$varRefereed" oaf:refereed = xpath:"'0001'"; else $varDummy= "''"; +if xpath:"count(index-of($varRefereed, '0001')) >0" oaf:refereed = xpath:"'0001'"; else $varDummy= "''"; +if xpath:"count(index-of($varRefereed, '0002')) >0 and count(index-of($varRefereed, '0001')) = 0" oaf:refereed = xpath:"'0002'"; else $varDummy= "''"; +// +apply xpath:"//dc:rights" if xpath:"starts-with(normalize-space(.), 'info:eu-repo/semantics')" oaf:accessrights = Convert(xpath:"normalize-space(.)", AccessRights); else dc:rights = xpath:"."; +if xpath:"//dc:rights[starts-with(normalize-space(.), 'info:eu-repo/semantics')]" $var0 = "''"; else oaf:accessrights = "OPEN"; +//if xpath:"count(//dc:rights) = 0" oaf:accessrights = "OPEN"; else $var0 = "''"; +// oaf:accessrights = Convert(xpath:"normalize-space(//dc:rights)", AccessRights); +oaf:license = xpath:"(//dc:rights, //dc:relation)[starts-with(normalize-space(.), 'http') and (contains(., '/licenses/') or contains(., '/licence/') or contains(., '/licencias/') or contains(., '/licencia/') or contains(., '://creativecommons.org/') or contains(., '://rightsstatements.org/')) or matches(., '^CC[- ]BY([- ](NC([- ](ND|SA))?|ND|SA))([- ]\d(\.\d)?)?$', 'i')][not(contains(normalize-space(.), ' '))]/normalize-space(.)"; +// +static oaf:collectedFrom = set("''", @name = $varOfficialname; , @id = $varDatasourceid;); +static oaf:hostedBy = set("''", @name = $varOfficialname; , @id = $varDatasourceid;); +// +//$varId = identifierExtract('["//dc:identifier", "//dc:relation"]' , xpath:"./*[local-name()='record']" , '(10[.][0-9]{4,}[^\s"/&lt;&gt;]*/[^\s"&lt;&gt;]+)'); +$varIdDoi = identifierExtract('["//dc:identifier[starts-with(., \"10.\") or 
starts-with(., \"DOI:\") or starts-with(., \"doi:\") or (starts-with(., \"http\") and contains(., \"doi.org/\"))]", "//dc:relation[starts-with(., \"10.\") or starts-with(., \"DOI:\") or starts-with(., \"doi:\") or (starts-with(., \"http\") and contains(., \"doi.org/\"))]"]' , xpath:"./*[local-name()='record']" , '(10[.][0-9]{4,}[^\s"/<>]*/[^\s"<>]+)'); +$varIdHdl = identifierExtract('["//dc:relation[starts-with(., \"http\") and contains(., \"://hdl.handle.net/\")][not(contains(., \"123456789\"))]"]' , xpath:"./*[local-name()='record']" , '(?!(://hdl.handle.net/))(\d.*)'); +$varIdUrn = identifierExtract('["//dc:relation[starts-with(., \"urn:nbn:\") or starts-with(., \"URN:NBN:\") or (starts-with(., \"http\") and (contains(., \"://nbn-resolving.org/urn:nbn:\") or contains(., \"://nbn-resolving.de/urn/resolver.pl?urn:nbn:\") or contains(., \"://nbn-resolving.de/urn:nbn:\") or contains(., \"://resolver.obvsg.at/urn:nbn:\") or contains(., \"://urn.fi/URN:NBN:\") or contains(., \"://urn.kb.se/resolve?urn=urn:nbn:\")))]", "//dc:description[contains(., \"URN: urn:nbn:de:0114-\") or contains(., \"URN: http://nbn-resolving.de/urn:nbn:de:0114-\") or (contains(., \"URN:NBN:no-\") and //dc:identifier = \"1893-1774\")]"]' , xpath:"./*[local-name()='record']" , '((urn:nbn:|URN:NBN:).*)'); +$varIdArk = identifierExtract('["//dc:relation[starts-with(normalize-space(.), \"http\") and contains(., \"/ark:\")]"]' , xpath:"./*[local-name()='record']" , '(http.*)'); +$varIdPmid = identifierExtract('["//dc:relation[starts-with(., \"http\") and contains(., \"://www.ncbi.nlm.nih.gov/pmc/articles/pmid/\")]"]' , xpath:"./*[local-name()='record']" , '(http.*)'); +$varIdPmc = identifierExtract('["//dc:relation[starts-with(., \"http\") and (contains(., \"://www.ncbi.nlm.nih.gov/pmc/articles/PMC\") or contains(., \"//europepmc.org/articles/PMC\"))]"]' , xpath:"./*[local-name()='record']" , '(http.*)'); +$varIdHal = identifierExtract('["//dc:relation[starts-with(., \"hal-\") or starts-with(., \"halshs-\") or starts-with(., \"halsde-\") or (starts-with(., \"http\") and (contains(., \"://hal.archives-ouvertes.fr/hal\") or contains(., \"://halshs.archives-ouvertes.fr/hal\") or contains(., \"://halsde.archives-ouvertes.fr/hal\")))]"]' , xpath:"./*[local-name()='record']" , '(hal(shs|sde)?-.*)'); +$varIdArxv = identifierExtract('["//dc:relation[starts-with(., \"http\") and (contains(., \"://arxiv.org/pdf/\") or contains(., \"://arxiv.org/abs/\"))]"]' , xpath:"./*[local-name()='record']" , '(\d.*)'); +$varIdLdpg = identifierExtract('["//dc:identifier[starts-with(., \"https://doaj.org/article/\")]"]', xpath:"./*[local-name()='record']" , '(http.*)'); +$varIdUrl = identifierExtract('["//dc:relation[starts-with(., \"http\")][not(contains(., \"://doaj.org\"))][not(contains(., \"doi.org/\"))][not(contains(., \"hdl.handle.net/\"))][not(contains(., \"://nbn-resolving.de/\") or contains(., \"://nbn-resolving.org/\") or contains(., \"://resolver.obvsg.at/\") or contains(., \"://urn.fi/URN:NBN:\") or contains(., \"://urn.kb.se/resolve\"))][not(contains(., \"://arxiv.org/pdf/\") or contains(., \"://arxiv.org/abs/\"))][not(contains(., \"://localhost/\") or contains(., \"://localhost:\"))]", "//dc:relation[starts-with(., \"www\")]"]', xpath:"./*[local-name()='record']" , '((http|www).*)'); + +$varIdList = xpath:"(($varIdDoi//value, $varIdHdl//value, $varIdUrn//value, $varIdArk//value, $varIdPmid//value, $varIdPmc//value, $varIdLdpg//value, $varIdUrl//value))"; + +// dropping/cleaning wrong DOIs, as +// 2 DOIs just different in 1 ending with 
. (mostly, but not exclusively, prefixed with 10.5216) +// noise stemming from odd/wrong DOI statements' formats +// DOIs with 2 prefixes +// DOI statements containing first the DOI prefix and then the DOI incl. the resolver prefix +//oaf:identifier = set(xpath:"$varId//value", @identifierType = "doi";); +//oaf:identifier = set(xpath:"$varIdDoi//value", @identifierType = "doi";); + +oaf:identifier = set(xpath:"distinct-values(($varIdDoi//value[not(ends-with(., '.') and exists(index-of($varIdDoi//value, substring(., 1, string-length(.)-1))))][not(. = '10.4313/article-4')][not(lower-case(.) = ('10.30659/ijibe.2.1.171-181', '10.30659/ijibe.2.1.171', '10.26843/rencima.v8i4.149', '10.26843/rencima.v11i1.215', '10.18273/revfue.v14n2-2016002revista', '10.17061/phrp3112015', '10.21789/24222704', '10.22432/pjsr.2017.14.', '10.22432/pjsr.2017.18.02', '10.22432/pjsr.2017.18.'))][not(starts-with(., '10.1530/VAB-'))][not(starts-with(lower-case(.), '10.1155/s168761720'))][not(starts-with(., '10.15561/10.6084/') or starts-with(., '10.5935/10.19180/'))][not(starts-with(., '10.7454/jvi.v') and string-length(.) = 16)][not(starts-with(., '10.15094/0000') and string-length(.) = 16)][not(matches(., '^10\.\d*/DOI:$'))][not(starts-with(., concat(substring-before(., '/'), '/', substring-before(., '/'), '/')))][not(matches(substring-after(., '/'), '^https?://(dx.)?doi.org/.*') and starts-with(substring-after(., 'doi.org/'), substring-before(., '/')))][not(starts-with(., '10.1371/journal.') and matches(., '^10\.1371/journal\.[a-z]{4}\.\d{7}\.(eor|20050521)$'))][not(substring-before(., '/') = ('10.19183', '10.18066') and matches(., '^(10\.19183/how\.\d*\.\d*|10\.18066/revunivap\.v\d*i\d*)$'))]/lower-case(.), $varIdDoi//value[matches(substring-after(., '/'), '^https?://(dx.)?doi.org/.*') and starts-with(substring-after(., 'doi.org/'), substring-before(., '/'))]/substring-after(., 'doi.org/'), $varIdDoi//value[starts-with(., '10.1371/journal.') and matches(., '^10\.1371/journal\.[a-z]{4}\.\d{7}\.eor$')]/substring(., 1, 28), $varIdDoi//value[starts-with(., '10.15561/10.6084/') or starts-with(., '10.5935/10.19180/')]/substring-after(., '/')))", @identifierType = "doi";); + +oaf:identifier = set(xpath:"distinct-values($varIdHdl//value/normalize-space(replace(., '\?locatt=view:master', '')))", @identifierType = "handle";); +oaf:identifier = set(xpath:"$varIdUrn//value", @identifierType = "urn";); +oaf:identifier = set(xpath:"distinct-values($varIdArk//value/replace(substring-after(., '/ark:'), '^/', ''))", @identifierType = "ark";); +oaf:identifier = set(xpath:"distinct-values($varIdPmid//value/replace(., 'https?://www.ncbi.nlm.nih.gov/pmc/articles/pmid/(\d+)(/.*)?', '$1'))", @identifierType = "pmid";); +oaf:identifier = set(xpath:"distinct-values($varIdPmc//value/replace(., 'https?://(www.ncbi.nlm.nih.gov/pmc|europepmc.org)/articles/(PMC\d*)([/\?].*)?', '$2'))", @identifierType = "pmcid";); +oaf:identifier = set(xpath:"distinct-values($varIdHal//value/replace(., '/document', ''))", @identifierType = "hal";); +oaf:identifier = set(xpath:"$varIdArxv//value", @identifierType = "arxiv";); +oaf:identifier = set(xpath:"$varIdLdpg//value", @identifierType = "landingPage";); +oaf:identifier = set(xpath:"($varIdUrl//value[not(starts-with(., 'www'))], $varIdUrl//value[starts-with(., 'www')]/concat('http://', .))", @identifierType = "url";); + +oaf:datasourceprefix = xpath:"//oaf:datasourceprefix"; + +//$varJournalName = xpath:"substring-before(//dc:source, ',')"; +$varJournalTitle = xpath:"(//dc:source[contains(., ', Vol 
')]/substring-before(., ', Vol '), //dc:source[contains(., ', Iss ')]/substring-before(., ', Iss '))[1]"; +$varVol = xpath:"//dc:source[contains(., ', Vol ')][matches(., ', Vol \d+')]/replace(substring-after(., ', Vol '), '^(\d+).*$', '$1')"; +$varIss = xpath:"//dc:source[contains(., ', Iss ')][matches(., ', Iss \d+')]/replace(substring-after(., ', Iss '), '^(\d+).*$', '$1')"; +$varSp = xpath:"//dc:source[contains(., ', Pp ')][matches(., ', Pp \d+-\d+')]/substring-before(substring-after(., ', Pp '), '-')"; +$varEp = xpath:"//dc:source[contains(., ', Pp ')][matches(., ', Pp \d+-\d+')]/replace(substring-after(substring-after(., ', Pp '), '-'), '^(\d+).*$', '$1')"; +$varISSN = xpath:"//dc:identifier[string-length() = 9 and matches(., '(\d{4})-(\d{4})')][1]"; +//oaf:journal = set($varJournalName, @issn = xpath:"//dc:identifier[string-length() = 9 and matches(., '^(\d{4})-(\d{4}|\d{3}X)')][1]"; , @eissn = xpath:"//dc:identifier[string-length() = 9 and matches(., '^(\d{4})-(\d{4}|\d{3}X)')][2]";); +//oaf:journal = set($varJournalName, @issn = xpath:"//dc:identifier[string-length() = 9]";); +oaf:journal = set($varJournalTitle, @issn = xpath:"//dc:identifier[string-length() = 9 and matches(., '(\d{4})-(\d{4})')][1]";, @eissn = xpath:"//dc:identifier[string-length() = 9 and matches(., '(\d{4})-(\d{4})')][2]";, @vol = xpath:"$varVol";, @iss = xpath:"$varIss";, @sp = xpath:"$varSp";, @ep = xpath:"$varEp";); + +end
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite_orig.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/original/xslt_cleaning_datarepo_datacite_orig.xsl similarity index 100% rename from dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite_orig.xsl rename to dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/original/xslt_cleaning_datarepo_datacite_orig.xsl
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid_orig.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/original/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid_orig.xsl similarity index 100% rename from dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid_orig.xsl rename to dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/original/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid_orig.xsl
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/original/xslt_nlm2oaf_journal.fi_orig.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/original/xslt_nlm2oaf_journal.fi_orig.xsl new file index 000000000..846e57f88 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/original/xslt_nlm2oaf_journal.fi_orig.xsl @@ -0,0 +1,492 @@ + [492-line XSLT stylesheet not preserved in this copy of the patch: the markup was stripped; the only surviving text values are "record is not compliant, transformation is interrupted.", "landingPage", "doi", "ORCID" and "http://orcid.org/"] \ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/original/xslt_nlm2oaf_us-pmc_orig.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/original/xslt_nlm2oaf_us-pmc_orig.xsl new file index 000000000..b3ee0c985 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/original/xslt_nlm2oaf_us-pmc_orig.xsl @@ -0,0 +1,437 @@ + [437-line XSLT stylesheet not preserved: markup stripped; surviving text values are "record is not compliant, transformation is interrupted.", "eng", "OPEN", "0001", "doi", "pmc", "pmid", "ORCID" and "http://orcid.org/"]
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_nlm2oaf_journal.fi.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_nlm2oaf_journal.fi.xsl new file index 000000000..2049a3016 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_nlm2oaf_journal.fi.xsl @@ -0,0 +1,493 @@ + [493-line XSLT stylesheet not preserved: markup stripped; surviving text values are "record is not compliant, transformation is interrupted.", "landingPage", "doi", "ORCID" and "http://orcid.org/"] \ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_nlm2oaf_us-pmc.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_nlm2oaf_us-pmc.xsl new file index 000000000..5c07fa1a9 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_nlm2oaf_us-pmc.xsl @@ -0,0 +1,373 @@ + [373-line XSLT stylesheet not preserved: markup stripped; surviving text values are "record is not compliant, transformation is interrupted.", "eng", "OPEN", "0001", "doi", "pmc", "pmid", "ORCID" and "http://orcid.org/"] \ No newline at end of file
From 32bdfdccbc76fd1c166f85fa426042d191f624d7 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 7 Jul 2021 11:08:27 +0200 Subject: [PATCH 22/26] [raw_all] Aggregator graph creation merges claims (updates) with the corresponding entity --- .../raw/MigrateDbEntitiesApplication.java | 70 ++++++++----------- 1 file changed, 29 insertions(+), 41 deletions(-)
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index 8f1a97984..c1c8e602c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -480,38 +480,15 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i final String sourceId = createOpenaireId(sourceType, rs.getString("source_id"), false); final String targetId = createOpenaireId(targetType, rs.getString("target_id"), false); - final Relation r1 = new Relation(); - final Relation r2 = new Relation(); - - if (StringUtils.isNotBlank(validationDate)) { - r1.setValidated(true); - r1.setValidationDate(validationDate); - r2.setValidated(true); - r2.setValidationDate(validationDate); - } - r1.setCollectedfrom(COLLECTED_FROM_CLAIM); - r1.setSource(sourceId); - r1.setTarget(targetId); - r1.setDataInfo(DATA_INFO_CLAIM); -
r1.setLastupdatetimestamp(lastUpdateTimestamp); - - r2.setCollectedfrom(COLLECTED_FROM_CLAIM); - r2.setSource(targetId); - r2.setTarget(sourceId); - r2.setDataInfo(DATA_INFO_CLAIM); - r2.setLastupdatetimestamp(lastUpdateTimestamp); + Relation r1 = prepareRelation(sourceId, targetId, validationDate); + Relation r2 = prepareRelation(targetId, sourceId, validationDate); final String semantics = rs.getString("semantics"); switch (semantics) { case "resultResult_relationship_isRelatedTo": - r1.setRelType(RESULT_RESULT); - r1.setSubRelType(RELATIONSHIP); - r1.setRelClass(IS_RELATED_TO); - - r2.setRelType(RESULT_RESULT); - r2.setSubRelType(RELATIONSHIP); - r2.setRelClass(IS_RELATED_TO); + r1 = setRelationSemantic(r1, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO); + r2 = setRelationSemantic(r2, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO); break; case "resultProject_outcome_produces": if (!"project".equals(sourceType)) { @@ -521,22 +498,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i "invalid claim, sourceId: %s, targetId: %s, semantics: %s", sourceId, targetId, semantics)); } - r1.setRelType(RESULT_PROJECT); - r1.setSubRelType(OUTCOME); - r1.setRelClass(PRODUCES); - - r2.setRelType(RESULT_PROJECT); - r2.setSubRelType(OUTCOME); - r2.setRelClass(IS_PRODUCED_BY); + r1 = setRelationSemantic(r1, RESULT_PROJECT, OUTCOME, PRODUCES); + r2 = setRelationSemantic(r2, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY); break; case "resultResult_publicationDataset_isRelatedTo": - r1.setRelClass(RESULT_RESULT); - r1.setSubRelType(PUBLICATION_DATASET); - r1.setRelClass(IS_RELATED_TO); - - r2.setRelClass(RESULT_RESULT); - r2.setSubRelType(PUBLICATION_DATASET); - r2.setRelClass(IS_RELATED_TO); + r1 = setRelationSemantic(r1, RESULT_PROJECT, PUBLICATION_DATASET, IS_RELATED_TO); + r2 = setRelationSemantic(r2, RESULT_PROJECT, PUBLICATION_DATASET, IS_RELATED_TO); break; default: throw new IllegalArgumentException("claim semantics not managed: " + semantics); @@ -549,6 +516,27 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i } } + private Relation prepareRelation(String sourceId, String targetId, String validationDate) { + Relation r = new Relation(); + if (StringUtils.isNotBlank(validationDate)) { + r.setValidated(true); + r.setValidationDate(validationDate); + } + r.setCollectedfrom(COLLECTED_FROM_CLAIM); + r.setSource(sourceId); + r.setTarget(targetId); + r.setDataInfo(DATA_INFO_CLAIM); + r.setLastupdatetimestamp(lastUpdateTimestamp); + return r; + } + + private Relation setRelationSemantic(Relation r, String relType, String subRelType, String relClass) { + r.setRelType(relType); + r.setSubRelType(subRelType); + r.setRelClass(relClass); + return r; + } + private List prepareContext(final String id, final DataInfo dataInfo) { final Context context = new Context(); context.setId(id); From 777536ce9180dc7fe6e4c5351d312fac5bd45aef Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 7 Jul 2021 11:23:40 +0200 Subject: [PATCH 23/26] [aggregation] string values used as regular expressions in the OAI collection classes are defined in a single point as constants, to be reused across the code (PR#122) --- .../dhp/collection/plugin/oai/OaiCollectorPlugin.java | 7 +++++-- .../eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java | 6 ++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java index 67fd352a3..9918e4abe 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java @@ -21,6 +21,9 @@ import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; public class OaiCollectorPlugin implements CollectorPlugin { + public static final String DATE_REGEX = "\\d{4}-\\d{2}-\\d{2}"; + public static final String UTC_DATETIME_REGEX = "\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z"; + private static final String FORMAT_PARAM = "format"; private static final String OAI_SET_PARAM = "set"; private static final Object OAI_FROM_DATE_PARAM = "fromDate"; @@ -62,11 +65,11 @@ public class OaiCollectorPlugin implements CollectorPlugin { throw new CollectorException("Param 'mdFormat' is null or empty"); } - if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}") && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z")) { + if (fromDate != null && !fromDate.matches(DATE_REGEX) && !fromDate.matches(UTC_DATETIME_REGEX)) { throw new CollectorException("Invalid date (YYYY-MM-DD): " + fromDate); } - if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}") && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z")) { + if (untilDate != null && !untilDate.matches(DATE_REGEX) && !untilDate.matches(UTC_DATETIME_REGEX)) { throw new CollectorException("Invalid date (YYYY-MM-DD): " + untilDate); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index c044e02db..75dd746ea 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -107,10 +107,12 @@ public class OaiIterator implements Iterator { if (set != null && !set.isEmpty()) { url += "&set=" + URLEncoder.encode(set, "UTF-8"); } - if (fromDate != null && (fromDate.matches("\\d{4}-\\d{2}-\\d{2}") || fromDate.matches("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z"))) { + if (fromDate != null && (fromDate.matches(OaiCollectorPlugin.DATE_REGEX) + || fromDate.matches(OaiCollectorPlugin.UTC_DATETIME_REGEX))) { url += "&from=" + URLEncoder.encode(fromDate, "UTF-8"); } - if (untilDate != null && (untilDate.matches("\\d{4}-\\d{2}-\\d{2}") || untilDate.matches("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z"))) { + if (untilDate != null && (untilDate.matches(OaiCollectorPlugin.DATE_REGEX) + || untilDate.matches(OaiCollectorPlugin.UTC_DATETIME_REGEX))) { url += "&until=" + URLEncoder.encode(untilDate, "UTF-8"); } log.info("Start harvesting using url: " + url); From fdcff42e46028a5e7201183ec5dc2433bdf10052 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 7 Jul 2021 19:01:59 +0200 Subject: [PATCH 24/26] [raw_all] Aggregator graph creation merges claims (updates) with the corresponding entity --- .../dhp/oa/graph/raw/MigrateDbEntitiesApplication.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index 
c1c8e602c..a9d3e05fe 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -502,8 +502,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i r2 = setRelationSemantic(r2, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY); break; case "resultResult_publicationDataset_isRelatedTo": - r1 = setRelationSemantic(r1, RESULT_PROJECT, PUBLICATION_DATASET, IS_RELATED_TO); - r2 = setRelationSemantic(r2, RESULT_PROJECT, PUBLICATION_DATASET, IS_RELATED_TO); + r1 = setRelationSemantic(r1, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO); + r2 = setRelationSemantic(r2, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO); break; default: throw new IllegalArgumentException("claim semantics not managed: " + semantics);
From b7b8e0986ec81574f23c7b8073b96aff719ed7d6 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 8 Jul 2021 10:42:31 +0200 Subject: [PATCH 25/26] [raw_all] The claim merge procedure includes the claimed contexts in the merged result --- .../oa/graph/raw/MergeClaimsApplication.java | 87 +++++++++++++++++-- 1 file changed, 79 insertions(+), 8 deletions(-)
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MergeClaimsApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MergeClaimsApplication.java index 9b99097ce..d5c310c1b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MergeClaimsApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MergeClaimsApplication.java @@ -3,8 +3,12 @@ package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import java.util.ArrayList; +import java.util.List; import java.util.Objects; import java.util.Optional; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; @@ -98,14 +102,9 @@ public class MergeClaimsApplication { raw .joinWith(claim, raw.col("_1").equalTo(claim.col("_1")), "full_outer") .map( - (MapFunction<Tuple2<Tuple2<String, T>, Tuple2<String, T>>, T>) value -> { - Optional<Tuple2<String, T>> opRaw = Optional.ofNullable(value._1()); - Optional<Tuple2<String, T>> opClaim = Optional.ofNullable(value._2()); - - return opRaw.isPresent() - ? opRaw.get()._2() - : opClaim.isPresent() ? opClaim.get()._2() : null; - }, + (MapFunction<Tuple2<Tuple2<String, T>, Tuple2<String, T>>, T>) value -> processClaims( + Optional.ofNullable(value._1()), + Optional.ofNullable(value._2())), Encoders.bean(clazz)) .filter(Objects::nonNull) .map( @@ -117,6 +116,78 @@ public class MergeClaimsApplication { .text(outPath); } + private static <T extends Oaf> T processClaims(Optional<Tuple2<String, T>> opRaw, + Optional<Tuple2<String, T>> opClaim) { + + // when both are present + if (opClaim.isPresent() && opRaw.isPresent()) { + T oafClaim = opClaim.get()._2(); + if (oafClaim instanceof Result) { + T oafRaw = opRaw.get()._2(); + + // merge the context lists from both oaf objects ... + final List<Context> context = mergeContexts((Result) oafClaim, (Result) oafRaw); + + // ... and set it on the result from the aggregator + ((Result) oafRaw).setContext(context); + return oafRaw; + } + } + + // otherwise prefer the result from the aggregator + return opRaw.isPresent() + ?
opRaw.get()._2() + : opClaim.map(Tuple2::_2).orElse(null); + } + + private static List<Context> mergeContexts(Result oafClaim, Result oafRaw) { + return new ArrayList<>( + Stream + .concat( + Optional + .ofNullable(oafClaim.getContext()) + .map(List::stream) + .orElse(Stream.empty()), + Optional + .ofNullable(oafRaw.getContext()) + .map(List::stream) + .orElse(Stream.empty())) + .collect( + Collectors + .toMap( + Context::getId, + c -> c, + (c1, c2) -> { + Context c = new Context(); + c.setId(c1.getId()); + c + .setDataInfo( + new ArrayList<>( + Stream + .concat( + Optional + .ofNullable(c1.getDataInfo()) + .map(List::stream) + .orElse(Stream.empty()), + Optional + .ofNullable(c2.getDataInfo()) + .map(List::stream) + .orElse(Stream.empty())) + .collect( + Collectors + .toMap( + d -> Optional + .ofNullable(d.getProvenanceaction()) + .map(Qualifier::getClassid) + .orElse(""), + d -> d, + (d1, d2) -> d1)) + .values())); + return c; + })) + .values()); + } + private static <T extends Oaf> Dataset<T> readFromPath( SparkSession spark, String path, Class<T> clazz) { return spark
From ae2b47b29d84bd10839d855f3a52747eb3fe99f4 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 9 Jul 2021 15:47:51 +0200 Subject: [PATCH 26/26] [broker] added coalesce(1) on the stats dataset before storing it on postgres --- .../src/main/java/eu/dnetlib/dhp/broker/oa/GenerateStatsJob.java | 1 + 1 file changed, 1 insertion(+)
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateStatsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateStatsJob.java index a4fb20b1c..9927d6560 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateStatsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateStatsJob.java @@ -80,6 +80,7 @@ public class GenerateStatsJob { .map( (MapFunction<Tuple2<String, DatasourceStats>, DatasourceStats>) t -> t._2, Encoders.bean(DatasourceStats.class)) + .coalesce(1) .write() .mode(SaveMode.Overwrite) .jdbc(dbUrl, "oa_datasource_stats_temp", connectionProperties);