From c4a3c52e4577f613a6d9c66e74869f0cfffbfc41 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 1 Oct 2020 15:46:44 +0200 Subject: [PATCH] fixed Doiboost bug in the identifier --- .../doiboost/DoiBoostMappingUtil.scala | 8 +-- .../doiboost/crossref/Crossref2Oaf.scala | 4 +- .../eu/dnetlib/dhp/doiboost/QueryTest.scala | 54 +++++++++++++++++++ 3 files changed, 57 insertions(+), 9 deletions(-) create mode 100644 dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/QueryTest.scala diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala index 1a45defb06..9c9221b272 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala @@ -341,13 +341,7 @@ object DoiBoostMappingUtil { def generateIdentifier (oaf: Result, doi: String): String = { val id = DHPUtils.md5 (doi.toLowerCase) - return s"50|${ - doiBoostNSPREFIX - }${ - SEPARATOR - }${ - id - }" + s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}" } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index f39dd5be8f..b38e103bcb 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -93,7 +93,7 @@ case object Crossref2Oaf { result.setOriginalId(tmp.filter(id => id != null).asJava) - //Set identifier as {50|60} | doiboost____::md5(DOI) + //Set identifier as 50 | doiboost____::md5(DOI) result.setId(generateIdentifier(result, doi)) // Add DataInfo @@ -267,7 +267,7 @@ case object Crossref2Oaf { val r = new Relation r.setSource(sourceId) - r.setTarget(s"$nsPrefix::$targetId") + r.setTarget(s"40|$nsPrefix::$targetId") r.setRelType("resultProject") r.setRelClass("isProducedBy") r.setSubRelType("outcome") diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/QueryTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/QueryTest.scala new file mode 100644 index 0000000000..c393f0ae9c --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/QueryTest.scala @@ -0,0 +1,54 @@ +package eu.dnetlib.dhp.doiboost +import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, StructuredProperty, Dataset => OafDataset} +import org.apache.spark.sql.functions.{col, sum} +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} + +import scala.:: +import scala.collection.JavaConverters._ +class QueryTest { + + + def extractLicense(p:Publication):Tuple2[String,String] = { + + val tmp = p.getInstance().asScala.map(i => i.getLicense.getValue).distinct.mkString(",") + (p.getId,tmp) + } + + + + def hasDOI(publication: Publication, doi:String):Boolean = { + + + val s = publication.getOriginalId.asScala.filter(i => i.equalsIgnoreCase(doi)) + + s.nonEmpty + + } + + def hasNullHostedBy(publication: Publication):Boolean = { + publication.getInstance().asScala.exists(i => i.getHostedby == null || i.getHostedby.getValue == null) + } + + + + def myQuery(spark:SparkSession): Unit = { + implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication] + implicit val mapEncoderDat: Encoder[OafDataset] = Encoders.kryo[OafDataset] + implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation] + + val doiboostPubs:Dataset[Publication] = spark.read.load("/data/doiboost/process/doiBoostPublicationFiltered").as[Publication] + + val relFunder: Dataset[Relation] = spark.read.format("org.apache.spark.sql.parquet").load("/data/doiboost/process/crossrefRelation").as[Relation] + + doiboostPubs.filter(p => p.getDateofacceptance != null && p.getDateofacceptance.getValue!= null && p.getDateofacceptance.getValue.length > 0 ) + + doiboostPubs.filter(p=>hasDOI(p, "10.1016/j.is.2020.101522")).collect()(0).getDescription.get(0).getValue + + + + doiboostPubs.filter(p=> hasNullHostedBy(p)).count() + + doiboostPubs.map(p=> (p.getId, p.getBestaccessright.getClassname))(Encoders.tuple(Encoders.STRING,Encoders.STRING)) + } + +}