forked from D-Net/dnet-hadoop

fixed Doiboost bug in the identifier

This commit is contained in:
parent c171fdebe1
commit c4a3c52e45
@@ -341,13 +341,7 @@ object DoiBoostMappingUtil {
   def generateIdentifier (oaf: Result, doi: String): String = {
     val id = DHPUtils.md5 (doi.toLowerCase)
-    return s"50|${
-      doiBoostNSPREFIX
-    }${
-      SEPARATOR
-    }${
-      id
-    }"
+    s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}"
   }
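The fix collapses a multi-line string interpolation into a single expression and drops the redundant `return`. A minimal sketch of the resulting identifier, assuming doiBoostNSPREFIX = "doiboost____" and SEPARATOR = "::" (the values implied by the comment in Crossref2Oaf below), with commons-codec's md5Hex standing in for DHPUtils.md5, which is assumed to produce the same hex digest:

    import org.apache.commons.codec.digest.DigestUtils

    // Hypothetical stand-in for DoiBoostMappingUtil.generateIdentifier;
    // the prefix and separator values are assumptions taken from the
    // "50 | doiboost____::md5(DOI)" comment in the next hunk.
    def generateIdentifierSketch(doi: String): String = {
      val id = DigestUtils.md5Hex(doi.toLowerCase) // assumed equivalent to DHPUtils.md5
      s"50|doiboost____::$id"
    }

    // generateIdentifierSketch("10.1016/j.is.2020.101522")
    // => "50|doiboost____::" followed by a 32-character hex digest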
@@ -93,7 +93,7 @@ case object Crossref2Oaf {
     result.setOriginalId(tmp.filter(id => id != null).asJava)

-    //Set identifier as {50|60} | doiboost____::md5(DOI)
+    //Set identifier as 50 | doiboost____::md5(DOI)
     result.setId(generateIdentifier(result, doi))

     // Add DataInfo
@@ -267,7 +267,7 @@ case object Crossref2Oaf {
     val r = new Relation
     r.setSource(sourceId)
-    r.setTarget(s"$nsPrefix::$targetId")
+    r.setTarget(s"40|$nsPrefix::$targetId")
     r.setRelType("resultProject")
     r.setRelClass("isProducedBy")
     r.setSubRelType("outcome")
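For context, a sketch of the relation this hunk now emits; the "40|" prefix is inferred to mark project identifiers in the same way "50|" marks results (an assumption based on this diff, where sourceId is a result id, not something the commit states outright):

    import eu.dnetlib.dhp.schema.oaf.Relation

    // Hypothetical helper mirroring the fixed code path above.
    def funderRelationSketch(sourceId: String, nsPrefix: String, targetId: String): Relation = {
      val r = new Relation
      r.setSource(sourceId)                   // a result id, e.g. "50|doiboost____::<md5>"
      r.setTarget(s"40|$nsPrefix::$targetId") // the project id now carries its "40|" type prefix
      r.setRelType("resultProject")
      r.setRelClass("isProducedBy")
      r.setSubRelType("outcome")
      r
    }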
@@ -0,0 +1,54 @@
+package eu.dnetlib.dhp.doiboost
+
+import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, StructuredProperty, Dataset => OafDataset}
+import org.apache.spark.sql.functions.{col, sum}
+import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
+
+import scala.::
+import scala.collection.JavaConverters._
+
+class QueryTest {
+
+  def extractLicense(p: Publication): Tuple2[String, String] = {
+    val tmp = p.getInstance().asScala.map(i => i.getLicense.getValue).distinct.mkString(",")
+    (p.getId, tmp)
+  }
+
+  def hasDOI(publication: Publication, doi: String): Boolean = {
+    val s = publication.getOriginalId.asScala.filter(i => i.equalsIgnoreCase(doi))
+    s.nonEmpty
+  }
+
+  def hasNullHostedBy(publication: Publication): Boolean = {
+    publication.getInstance().asScala.exists(i => i.getHostedby == null || i.getHostedby.getValue == null)
+  }
+
+  def myQuery(spark: SparkSession): Unit = {
+    implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
+    implicit val mapEncoderDat: Encoder[OafDataset] = Encoders.kryo[OafDataset]
+    implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
+
+    val doiboostPubs: Dataset[Publication] = spark.read.load("/data/doiboost/process/doiBoostPublicationFiltered").as[Publication]
+
+    val relFunder: Dataset[Relation] = spark.read.format("org.apache.spark.sql.parquet").load("/data/doiboost/process/crossrefRelation").as[Relation]
+
+    doiboostPubs.filter(p => p.getDateofacceptance != null && p.getDateofacceptance.getValue != null && p.getDateofacceptance.getValue.length > 0)
+
+    doiboostPubs.filter(p => hasDOI(p, "10.1016/j.is.2020.101522")).collect()(0).getDescription.get(0).getValue
+
+    doiboostPubs.filter(p => hasNullHostedBy(p)).count()
+
+    doiboostPubs.map(p => (p.getId, p.getBestaccessright.getClassname))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
+  }
+
+}
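QueryTest collects ad-hoc checks run against the DoiBoost output. A hypothetical driver for it, assuming a plain Spark submission; the appName and master settings are illustrative and not part of the commit:

    import org.apache.spark.sql.SparkSession

    object QueryTestApp {
      def main(args: Array[String]): Unit = {
        // local[*] only for illustration; the paths in myQuery point at cluster storage.
        val spark = SparkSession.builder()
          .appName("DoiBoost QueryTest")
          .master("local[*]")
          .getOrCreate()

        new QueryTest().myQuery(spark)
        spark.stop()
      }
    }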