massive code refactor:

removed modules dhp-*-scholexplorer
Sandro La Bruzzo 2021-07-01 22:13:45 +02:00
parent 829caee4fd
commit c6fa8598e1
19 changed files with 152 additions and 1404 deletions

View File

@@ -6,7 +6,7 @@ import eu.dnetlib.dhp.utils.DHPUtils
 import org.apache.commons.lang3.StringUtils
 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.schema.common.ModelConstants
-import eu.dnetlib.dhp.schema.scholexplorer.OafUtils
+import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
 import org.json4s
 import org.json4s.DefaultFormats
 import org.json4s.jackson.JsonMethods.parse
@@ -118,11 +118,11 @@ object DoiBoostMappingUtil {
   def getOpenAccessQualifier():AccessRight = {
-    OafUtils.createAccessRight("OPEN","Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
+    OafMapperUtils.accessRight("OPEN","Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
   }
   def getRestrictedQualifier():AccessRight = {
-    OafUtils.createAccessRight("RESTRICTED","Restricted",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
+    OafMapperUtils.accessRight("RESTRICTED","Restricted",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
   }
@@ -150,7 +150,7 @@ object DoiBoostMappingUtil {
       if (item.openAccess)
         i.setAccessright(getOpenAccessQualifier())
       val ar = getOpenAccessQualifier()
-      publication.setBestaccessright(OafUtils.createQualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
+      publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
     }
     else {
       hb = ModelConstants.UNKNOWN_REPOSITORY
@@ -162,11 +162,11 @@ object DoiBoostMappingUtil {
     if (ar.nonEmpty) {
       if(ar.contains(ModelConstants.ACCESS_RIGHT_OPEN)){
         val ar = getOpenAccessQualifier()
-        publication.setBestaccessright(OafUtils.createQualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
+        publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
       }
       else {
         val ar = getRestrictedQualifier()
-        publication.setBestaccessright(OafUtils.createQualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
+        publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
       }
     }
     publication
@@ -254,7 +254,7 @@ object DoiBoostMappingUtil {
     di.setInferred(false)
     di.setInvisible(false)
     di.setTrust(trust)
-    di.setProvenanceaction(OafUtils.createQualifier(ModelConstants.SYSIMPORT_ACTIONSET, ModelConstants.DNET_PROVENANCE_ACTIONS))
+    di.setProvenanceaction(OafMapperUtils.qualifier(ModelConstants.SYSIMPORT_ACTIONSET,ModelConstants.SYSIMPORT_ACTIONSET, ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS))
     di
   }
@@ -262,7 +262,7 @@ object DoiBoostMappingUtil {
   def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String): StructuredProperty = {
     val sp = new StructuredProperty
-    sp.setQualifier(OafUtils.createQualifier(classId,className, schemeId, schemeName))
+    sp.setQualifier(OafMapperUtils.qualifier(classId,className, schemeId, schemeName))
     sp.setValue(value)
     sp
@@ -272,7 +272,7 @@ object DoiBoostMappingUtil {
   def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String, dataInfo: DataInfo): StructuredProperty = {
     val sp = new StructuredProperty
-    sp.setQualifier(OafUtils.createQualifier(classId,className, schemeId, schemeName))
+    sp.setQualifier(OafMapperUtils.qualifier(classId,className, schemeId, schemeName))
     sp.setValue(value)
     sp.setDataInfo(dataInfo)
     sp
@@ -281,7 +281,7 @@ object DoiBoostMappingUtil {
   def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
     val sp = new StructuredProperty
-    sp.setQualifier(OafUtils.createQualifier(classId, schemeId))
+    sp.setQualifier(OafMapperUtils.qualifier(classId,classId, schemeId, schemeId))
     sp.setValue(value)
     sp
@@ -291,7 +291,7 @@ object DoiBoostMappingUtil {
   def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
     val sp = new StructuredProperty
-    sp.setQualifier(OafUtils.createQualifier(classId, schemeId))
+    sp.setQualifier(OafMapperUtils.qualifier(classId,classId, schemeId, schemeId))
     sp.setValue(value)
     sp.setDataInfo(dataInfo)
     sp
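The pattern across this file: calls to the removed eu.dnetlib.dhp.schema.scholexplorer.OafUtils helpers are replaced by the OafMapperUtils factory methods from dhp-schemas, which always take the full four-field form (classid, classname, schemeid, schemename). A minimal sketch of the swap, assuming only the OafMapperUtils signatures visible in this diff; the "doi" value is a hypothetical example:

import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils

// before: OafUtils.createAccessRight("OPEN", "Open Access", DNET_ACCESS_MODES, DNET_ACCESS_MODES)
val open = OafMapperUtils.accessRight("OPEN", "Open Access",
  ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)

// before: OafUtils.createQualifier(classId, schemeId) -- the old two-argument shortcut
// after: classid/schemeid are duplicated into classname/schemename
val pidType = OafMapperUtils.qualifier("doi", "doi",
  ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES)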

View File

@@ -2,7 +2,7 @@ package eu.dnetlib.doiboost.crossref
 import eu.dnetlib.dhp.schema.common.ModelConstants
 import eu.dnetlib.dhp.schema.oaf._
-import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
+import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
 import eu.dnetlib.dhp.utils.DHPUtils
 import eu.dnetlib.doiboost.DoiBoostMappingUtil._
 import org.apache.commons.lang.StringUtils
@@ -15,8 +15,6 @@ import org.slf4j.{Logger, LoggerFactory}
 import scala.collection.JavaConverters._
 import scala.collection.mutable
 import scala.util.matching.Regex
-import eu.dnetlib.dhp.schema.scholexplorer.OafUtils
 import java.util
 case class CrossrefDT(doi: String, json:String, timestamp: Long) {}
@@ -182,12 +180,12 @@ case object Crossref2Oaf {
     if(has_review != JNothing) {
       instance.setRefereed(
-        OafUtils.createQualifier("0001", "peerReviewed", ModelConstants.DNET_REVIEW_LEVELS, ModelConstants.DNET_REVIEW_LEVELS))
+        OafMapperUtils.qualifier("0001", "peerReviewed", ModelConstants.DNET_REVIEW_LEVELS, ModelConstants.DNET_REVIEW_LEVELS))
     }
     instance.setAccessright(getRestrictedQualifier())
-    instance.setInstancetype(OafUtils.createQualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
-    result.setResourcetype(OafUtils.createQualifier(cobjCategory.substring(0, 4),ModelConstants.DNET_DATA_CITE_RESOURCE))
+    instance.setInstancetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
+    result.setResourcetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
     instance.setCollectedfrom(createCrossrefCollectedFrom())
     if (StringUtils.isNotBlank(issuedDate)) {
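For context on the substring arithmetic above: cobjCategory carries a four-character instance type code, a separator, and a readable label, so substring(0, 4) extracts the code and substring(5) the label, and both now feed OafMapperUtils.qualifier for instancetype and resourcetype alike. A small sketch with a hypothetical category value:

import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils

val cobjCategory = "0001 Article"          // hypothetical example value
val code = cobjCategory.substring(0, 4)    // "0001"
val label = cobjCategory.substring(5)      // "Article"
val instanceType = OafMapperUtils.qualifier(code, label,
  ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)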

View File

@@ -0,0 +1,42 @@
package eu.dnetlib.dhp.sx.graph
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Result
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
object SparkCreateSummaryObject {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_summaries_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
implicit val resultEncoder:Encoder[Result] = Encoders.kryo[Result]
implicit val summaryEncoder:Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
val ds:Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Result]
ds.repartition(6000).map(r => ScholixUtils.resultToSummary(r)).write.mode(SaveMode.Overwrite).save(targetPath)
}
}

View File

@@ -1,177 +0,0 @@
package eu.dnetlib.dhp.sx.graph.ebi
import eu.dnetlib.dhp.oa.merge.AuthorMerger
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown}
import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.expressions.Aggregator
object EBIAggregator {
def getDatasetAggregator(): Aggregator[(String, OafDataset), OafDataset, OafDataset] = new Aggregator[(String, OafDataset), OafDataset, OafDataset]{
override def zero: OafDataset = new OafDataset()
override def reduce(b: OafDataset, a: (String, OafDataset)): OafDataset = {
b.mergeFrom(a._2)
b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
if (b.getId == null)
b.setId(a._2.getId)
b
}
override def merge(wx: OafDataset, wy: OafDataset): OafDataset = {
wx.mergeFrom(wy)
wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
if(wx.getId == null && wy.getId.nonEmpty)
wx.setId(wy.getId)
wx
}
override def finish(reduction: OafDataset): OafDataset = reduction
override def bufferEncoder: Encoder[OafDataset] =
Encoders.kryo(classOf[OafDataset])
override def outputEncoder: Encoder[OafDataset] =
Encoders.kryo(classOf[OafDataset])
}
def getDLIUnknownAggregator(): Aggregator[(String, DLIUnknown), DLIUnknown, DLIUnknown] = new Aggregator[(String, DLIUnknown), DLIUnknown, DLIUnknown]{
override def zero: DLIUnknown = new DLIUnknown()
override def reduce(b: DLIUnknown, a: (String, DLIUnknown)): DLIUnknown = {
b.mergeFrom(a._2)
if (b.getId == null)
b.setId(a._2.getId)
b
}
override def merge(wx: DLIUnknown, wy: DLIUnknown): DLIUnknown = {
wx.mergeFrom(wy)
if(wx.getId == null && wy.getId.nonEmpty)
wx.setId(wy.getId)
wx
}
override def finish(reduction: DLIUnknown): DLIUnknown = reduction
override def bufferEncoder: Encoder[DLIUnknown] =
Encoders.kryo(classOf[DLIUnknown])
override def outputEncoder: Encoder[DLIUnknown] =
Encoders.kryo(classOf[DLIUnknown])
}
def getDLIDatasetAggregator(): Aggregator[(String, DLIDataset), DLIDataset, DLIDataset] = new Aggregator[(String, DLIDataset), DLIDataset, DLIDataset]{
override def zero: DLIDataset = new DLIDataset()
override def reduce(b: DLIDataset, a: (String, DLIDataset)): DLIDataset = {
b.mergeFrom(a._2)
b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
if (b.getId == null)
b.setId(a._2.getId)
b
}
override def merge(wx: DLIDataset, wy: DLIDataset): DLIDataset = {
wx.mergeFrom(wy)
wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
if(wx.getId == null && wy.getId.nonEmpty)
wx.setId(wy.getId)
wx
}
override def finish(reduction: DLIDataset): DLIDataset = reduction
override def bufferEncoder: Encoder[DLIDataset] =
Encoders.kryo(classOf[DLIDataset])
override def outputEncoder: Encoder[DLIDataset] =
Encoders.kryo(classOf[DLIDataset])
}
def getDLIPublicationAggregator(): Aggregator[(String, DLIPublication), DLIPublication, DLIPublication] = new Aggregator[(String, DLIPublication), DLIPublication, DLIPublication]{
override def zero: DLIPublication = new DLIPublication()
override def reduce(b: DLIPublication, a: (String, DLIPublication)): DLIPublication = {
b.mergeFrom(a._2)
b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
if (b.getId == null)
b.setId(a._2.getId)
b
}
override def merge(wx: DLIPublication, wy: DLIPublication): DLIPublication = {
wx.mergeFrom(wy)
wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
if(wx.getId == null && wy.getId.nonEmpty)
wx.setId(wy.getId)
wx
}
override def finish(reduction: DLIPublication): DLIPublication = reduction
override def bufferEncoder: Encoder[DLIPublication] =
Encoders.kryo(classOf[DLIPublication])
override def outputEncoder: Encoder[DLIPublication] =
Encoders.kryo(classOf[DLIPublication])
}
def getPublicationAggregator(): Aggregator[(String, Publication), Publication, Publication] = new Aggregator[(String, Publication), Publication, Publication]{
override def zero: Publication = new Publication()
override def reduce(b: Publication, a: (String, Publication)): Publication = {
b.mergeFrom(a._2)
b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
if (b.getId == null)
b.setId(a._2.getId)
b
}
override def merge(wx: Publication, wy: Publication): Publication = {
wx.mergeFrom(wy)
wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
if(wx.getId == null && wy.getId.nonEmpty)
wx.setId(wy.getId)
wx
}
override def finish(reduction: Publication): Publication = reduction
override def bufferEncoder: Encoder[Publication] =
Encoders.kryo(classOf[Publication])
override def outputEncoder: Encoder[Publication] =
Encoders.kryo(classOf[Publication])
}
def getRelationAggregator(): Aggregator[(String, Relation), Relation, Relation] = new Aggregator[(String, Relation), Relation, Relation]{
override def zero: Relation = new Relation()
override def reduce(b: Relation, a: (String, Relation)): Relation = {
a._2
}
override def merge(a: Relation, b: Relation): Relation = {
if(b!= null) b else a
}
override def finish(reduction: Relation): Relation = reduction
override def bufferEncoder: Encoder[Relation] =
Encoders.kryo(classOf[Relation])
override def outputEncoder: Encoder[Relation] =
Encoders.kryo(classOf[Relation])
}
}

View File

@@ -1,248 +0,0 @@
package eu.dnetlib.dhp.sx.graph.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Author, Instance, Journal, KeyValue, Oaf, Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils.createQualifier
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, OafUtils, ProvenaceInfo}
import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
import eu.dnetlib.dhp.utils.DHPUtils
import eu.dnetlib.scholexplorer.relation.RelationMapper
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import org.apache.spark.sql.functions._
import scala.collection.JavaConverters._
object SparkAddLinkUpdates {
val relationMapper: RelationMapper = RelationMapper.load
case class EBILinks(relation:String, pubdate:String, tpid:String, tpidType:String, turl:String, title:String, publisher:String) {}
def generatePubmedDLICollectedFrom(): KeyValue = {
OafUtils.generateKeyValue("dli_________::europe_pmc__", "Europe PMC")
}
def journalToOAF(pj:PMJournal): Journal = {
val j = new Journal
j.setIssnPrinted(pj.getIssn)
j.setVol(pj.getVolume)
j.setName(pj.getTitle)
j.setIss(pj.getIssue)
j.setDataInfo(OafUtils.generateDataInfo())
j
}
def pubmedTOPublication(input:PMArticle):DLIPublication = {
val dnetPublicationId = s"50|${DHPUtils.md5(s"${input.getPmid}::pmid")}"
val p = new DLIPublication
p.setId(dnetPublicationId)
p.setDataInfo(OafUtils.generateDataInfo())
p.setPid(List(OafUtils.createSP(input.getPmid.toLowerCase.trim, "pmid", ModelConstants.DNET_PID_TYPES)).asJava)
p.setCompletionStatus("complete")
val pi = new ProvenaceInfo
pi.setId("dli_________::europe_pmc__")
pi.setName( "Europe PMC")
pi.setCompletionStatus("complete")
pi.setCollectionMode("collected")
p.setDlicollectedfrom(List(pi).asJava)
p.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
if (input.getAuthors != null && input.getAuthors.size() >0) {
var aths: List[Author] = List()
input.getAuthors.asScala.filter(a=> a!= null).foreach(a => {
val c = new Author
c.setFullname(a.getFullName)
c.setName(a.getForeName)
c.setSurname(a.getLastName)
aths = aths ::: List(c)
})
if (aths.nonEmpty)
p.setAuthor(aths.asJava)
}
if (input.getJournal != null)
p.setJournal(journalToOAF(input.getJournal))
p.setTitle(List(OafUtils.createSP(input.getTitle, "main title", ModelConstants.DNET_DATACITE_TITLE)).asJava)
p.setDateofacceptance(OafUtils.asField(input.getDate))
val i = new Instance
i.setCollectedfrom(generatePubmedDLICollectedFrom())
i.setDateofacceptance(p.getDateofacceptance)
i.setUrl(List(s"https://pubmed.ncbi.nlm.nih.gov/${input.getPmid}").asJava)
i.setInstancetype(createQualifier("0001", "Article", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
p.setInstance(List(i).asJava)
p
}
def ebiLinksToOaf(input:(String, String)):List[Oaf] = {
val pmid :String = input._1
val input_json :String = input._2
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input_json)
val targets:List[EBILinks] = for {
JObject(link) <- json \\ "Category" \\ "Link"
JField("PublicationDate", JString(pubdate)) <- link
JField("RelationshipType", JObject(relationshipType)) <- link
JField("Name", JString(relname)) <- relationshipType
JField("Target", JObject(target)) <- link
JField("Identifier", JObject(identifier)) <- target
JField("ID", JString(tpid)) <- identifier
JField("IDScheme", JString(tpidtype)) <- identifier
JField("IDURL", JString(turl)) <- identifier
JField("Title", JString(title)) <- target
JField("Publisher", JObject(pub)) <- target
JField("Name", JString(publisher)) <- pub
} yield EBILinks(relname, pubdate, tpid, tpidtype, turl,title, publisher)
val dnetPublicationId = s"50|${DHPUtils.md5(s"$pmid::pmid")}"
targets.flatMap(l => {
val relation = new Relation
val inverseRelation = new Relation
val targetDnetId = s"50|${DHPUtils.md5(s"${l.tpid.toLowerCase.trim}::${l.tpidType.toLowerCase.trim}")}"
val relInfo = relationMapper.get(l.relation.toLowerCase)
val relationSemantic = relInfo.getOriginal
val inverseRelationSemantic = relInfo.getInverse
relation.setSource(dnetPublicationId)
relation.setTarget(targetDnetId)
relation.setRelClass("datacite")
relation.setRelType(relationSemantic)
relation.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
inverseRelation.setSource(targetDnetId)
inverseRelation.setTarget(dnetPublicationId)
inverseRelation.setRelClass("datacite")
inverseRelation.setRelType(inverseRelationSemantic)
inverseRelation.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
val d = new DLIDataset
d.setId(targetDnetId)
d.setDataInfo(OafUtils.generateDataInfo())
d.setPid(List(OafUtils.createSP(l.tpid.toLowerCase.trim, l.tpidType.toLowerCase.trim, ModelConstants.DNET_PID_TYPES)).asJava)
d.setCompletionStatus("complete")
val pi = new ProvenaceInfo
pi.setId("dli_________::europe_pmc__")
pi.setName( "Europe PMC")
pi.setCompletionStatus("complete")
pi.setCollectionMode("collected")
d.setDlicollectedfrom(List(pi).asJava)
d.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
d.setPublisher(OafUtils.asField(l.publisher))
d.setTitle(List(OafUtils.createSP(l.title, "main title", ModelConstants.DNET_DATACITE_TITLE)).asJava)
d.setDateofacceptance(OafUtils.asField(l.pubdate))
val i = new Instance
i.setCollectedfrom(generatePubmedDLICollectedFrom())
i.setDateofacceptance(d.getDateofacceptance)
i.setUrl(List(l.turl).asJava)
i.setInstancetype(createQualifier("0021", "Dataset", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
d.setInstance(List(i).asJava)
List(relation, inverseRelation, d)
})
}
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEBILinksToOaf.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/ebi_to_df_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val workingPath = parser.get("workingPath")
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
implicit val oafpubEncoder: Encoder[Publication] = Encoders.kryo[Publication]
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo(classOf[DLIDataset])
implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo(classOf[DLIPublication])
implicit val atEncoder: Encoder[Author] = Encoders.kryo(classOf[Author])
implicit val strEncoder:Encoder[String] = Encoders.STRING
implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
val ds:Dataset[(String,String)] = spark.read.load(s"$workingPath/baseline_links_updates").as[(String,String)](Encoders.tuple(Encoders.STRING, Encoders.STRING))
ds.flatMap(l =>ebiLinksToOaf(l)).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_oaf")
ds.filter(s => s.isInstanceOf)
val oDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/baseline_links_updates_oaf").as[Oaf]
oDataset.filter(p =>p.isInstanceOf[Relation]).map(p => p.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_relation")
oDataset.filter(p =>p.isInstanceOf[DLIDataset]).map(p => p.asInstanceOf[DLIDataset]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_dataset")
val idPublicationSolved:Dataset[String] = spark.read.load(s"$workingPath/baseline_links_updates").where(col("links").isNotNull).select("pmid").as[String]
val baseline:Dataset[(String, PMArticle)]= spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle].map(p=> (p.getPmid, p))(Encoders.tuple(strEncoder,PMEncoder))
idPublicationSolved.joinWith(baseline, idPublicationSolved("pmid").equalTo(baseline("_1"))).map(k => pubmedTOPublication(k._2._2)).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_publication")
val pmaDatasets = spark.read.load("/user/sandro.labruzzo/scholix/EBI/ebi_garr/baseline_dataset").as[PMArticle]
pmaDatasets.map(p => pubmedTOPublication(p)).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_publication_all")
val pubs: Dataset[(String,Publication)] = spark.read.load("/user/sandro.labruzzo/scholix/EBI/publication").as[Publication].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING,oafpubEncoder))
val pubdate:Dataset[(String,DLIPublication)] = spark.read.load(s"$workingPath/baseline_publication_all").as[DLIPublication].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING,pubEncoder))
pubs.joinWith(pubdate, pubs("_1").equalTo(pubdate("_1"))).map(k => k._2._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_publication_ebi")
val dt : Dataset[DLIDataset] = spark.read.load(s"$workingPath/dataset").as[DLIDataset]
val update : Dataset[DLIDataset] = spark.read.load(s"$workingPath/ebi_garr/baseline_links_updates_dataset").as[DLIDataset]
dt.union(update).map(d => (d.getId,d))(Encoders.tuple(Encoders.STRING, datEncoder))
.groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getDLIDatasetAggregator().toColumn)
.map(p => p._2)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset_ebi")
val rel: Dataset[Relation] = spark.read.load(s"$workingPath/relation").as[Relation]
val relupdate : Dataset[Relation] = spark.read.load(s"$workingPath/ebi_garr/baseline_links_updates_relation").as[Relation]
rel.union(relupdate)
.map(d => (s"${d.getSource}::${d.getRelType}::${d.getTarget}", d))(Encoders.tuple(Encoders.STRING, relEncoder))
.groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getRelationAggregator().toColumn)
.map(p => p._2)
.write.mode(SaveMode.Overwrite)
.save(s"$workingPath/baseline_relation_ebi")
}
}

View File

@@ -1,223 +0,0 @@
package eu.dnetlib.dhp.sx.graph.parser;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import javax.xml.stream.XMLStreamReader;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo;
import eu.dnetlib.dhp.utils.DHPUtils;
import eu.dnetlib.scholexplorer.relation.RelInfo;
import eu.dnetlib.scholexplorer.relation.RelationMapper;
public abstract class AbstractScholexplorerParser {
protected static final Log log = LogFactory.getLog(AbstractScholexplorerParser.class);
static final Pattern pattern = Pattern.compile("10\\.\\d{4,9}/[-._;()/:A-Z0-9]+$", Pattern.CASE_INSENSITIVE);
private final List<String> datasetSubTypes = Arrays
.asList(
"dataset",
"software",
"film",
"sound",
"physicalobject",
"audiovisual",
"collection",
"other",
"study",
"metadata");
public abstract List<Oaf> parseObject(final String record, final RelationMapper relMapper);
protected Map<String, String> getAttributes(final XMLStreamReader parser) {
final Map<String, String> attributesMap = new HashMap<>();
for (int i = 0; i < parser.getAttributeCount(); i++) {
attributesMap.put(parser.getAttributeLocalName(i), parser.getAttributeValue(i));
}
return attributesMap;
}
protected List<StructuredProperty> extractSubject(List<VtdUtilityParser.Node> subjects) {
final List<StructuredProperty> subjectResult = new ArrayList<>();
if (subjects != null && subjects.size() > 0) {
subjects
.forEach(
subjectMap -> {
final StructuredProperty subject = new StructuredProperty();
subject.setValue(subjectMap.getTextValue());
final Qualifier schema = new Qualifier();
schema.setClassid("dnet:subject");
schema.setClassname("dnet:subject");
schema.setSchemeid(subjectMap.getAttributes().get("subjectScheme"));
schema.setSchemename(subjectMap.getAttributes().get("subjectScheme"));
subject.setQualifier(schema);
subjectResult.add(subject);
});
}
return subjectResult;
}
protected StructuredProperty extractIdentifier(
List<VtdUtilityParser.Node> identifierType, final String fieldName) {
final StructuredProperty pid = new StructuredProperty();
if (identifierType != null && identifierType.size() > 0) {
final VtdUtilityParser.Node result = identifierType.get(0);
pid.setValue(result.getTextValue());
final Qualifier pidType = new Qualifier();
pidType.setClassname(result.getAttributes().get(fieldName));
pidType.setClassid(result.getAttributes().get(fieldName));
pidType.setSchemename(ModelConstants.DNET_PID_TYPES);
pidType.setSchemeid(ModelConstants.DNET_PID_TYPES);
pid.setQualifier(pidType);
return pid;
}
return null;
}
protected void inferPid(final StructuredProperty input) {
final Matcher matcher = pattern.matcher(input.getValue());
if (matcher.find()) {
input.setValue(matcher.group());
if (input.getQualifier() == null) {
input.setQualifier(new Qualifier());
input.getQualifier().setSchemename(ModelConstants.DNET_PID_TYPES);
input.getQualifier().setSchemeid(ModelConstants.DNET_PID_TYPES);
}
input.getQualifier().setClassid("doi");
input.getQualifier().setClassname("doi");
}
}
protected String generateId(final String pid, final String pidType, final String entityType) {
String type;
switch (entityType) {
case "publication":
type = "50|";
break;
case "dataset":
type = "60|";
break;
case "unknown":
type = "70|";
break;
default:
throw new IllegalArgumentException("unexpected value " + entityType);
}
if ("dnet".equalsIgnoreCase(pidType))
return type + StringUtils.substringAfter(pid, "::");
return type
+ DHPUtils
.md5(
String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim()));
}
protected DLIUnknown createUnknownObject(
final String pid,
final String pidType,
final KeyValue cf,
final DataInfo di,
final String dateOfCollection) {
final DLIUnknown uk = new DLIUnknown();
uk.setId(generateId(pid, pidType, "unknown"));
ProvenaceInfo pi = new ProvenaceInfo();
pi.setId(cf.getKey());
pi.setName(cf.getValue());
pi.setCompletionStatus("incomplete");
uk.setDataInfo(di);
uk.setDlicollectedfrom(Collections.singletonList(pi));
final StructuredProperty sourcePid = new StructuredProperty();
sourcePid.setValue(pid);
final Qualifier pt = new Qualifier();
pt.setClassname(pidType);
pt.setClassid(pidType);
pt.setSchemename(ModelConstants.DNET_PID_TYPES);
pt.setSchemeid(ModelConstants.DNET_PID_TYPES);
sourcePid.setQualifier(pt);
uk.setPid(Collections.singletonList(sourcePid));
uk.setDateofcollection(dateOfCollection);
return uk;
}
protected Qualifier generateQualifier(final String classId, final String className, final String schemeId,
final String schemeName) {
final Qualifier q = new Qualifier();
q.setClassid(classId);
q.setClassid(className);
q.setSchemeid(schemeId);
q.setSchemename(schemeName);
return q;
}
protected void generateRelations(
RelationMapper relationMapper,
Result parsedObject,
List<Oaf> result,
DataInfo di,
String dateOfCollection,
List<VtdUtilityParser.Node> relatedIdentifiers) {
if (relatedIdentifiers != null) {
result
.addAll(
relatedIdentifiers
.stream()
.flatMap(
n -> {
final List<Relation> rels = new ArrayList<>();
Relation r = new Relation();
r.setSource(parsedObject.getId());
final String relatedPid = n.getTextValue();
final String relatedPidType = n.getAttributes().get("relatedIdentifierType");
final String relatedType = n.getAttributes().getOrDefault("entityType", "unknown");
String relationSemantic = n.getAttributes().get("relationType");
String inverseRelation;
final String targetId = generateId(relatedPid, relatedPidType, relatedType);
if (relationMapper.containsKey(relationSemantic.toLowerCase())) {
RelInfo relInfo = relationMapper.get(relationSemantic.toLowerCase());
relationSemantic = relInfo.getOriginal();
inverseRelation = relInfo.getInverse();
} else {
relationSemantic = "Unknown";
inverseRelation = "Unknown";
}
r.setTarget(targetId);
r.setRelType(relationSemantic);
r.setRelClass("datacite");
r.setCollectedfrom(parsedObject.getCollectedfrom());
r.setDataInfo(di);
rels.add(r);
r = new Relation();
r.setDataInfo(di);
r.setSource(targetId);
r.setTarget(parsedObject.getId());
r.setRelType(inverseRelation);
r.setRelClass("datacite");
r.setCollectedfrom(parsedObject.getCollectedfrom());
rels.add(r);
if ("unknown".equalsIgnoreCase(relatedType))
result
.add(
createUnknownObject(
relatedPid,
relatedPidType,
parsedObject.getCollectedfrom().get(0),
di,
dateOfCollection));
return rels.stream();
})
.collect(Collectors.toList()));
}
}
}

View File

@@ -1,340 +0,0 @@
package eu.dnetlib.dhp.sx.graph.parser;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.ximpleware.AutoPilot;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo;
import eu.dnetlib.scholexplorer.relation.RelationMapper;
public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
@Override
public List<Oaf> parseObject(String record, final RelationMapper relationMapper) {
try {
final DLIDataset parsedObject = new DLIDataset();
final VTDGen vg = new VTDGen();
vg.setDoc(record.getBytes());
final List<Oaf> result = new ArrayList<>();
vg.parse(true);
final VTDNav vn = vg.getNav();
final AutoPilot ap = new AutoPilot(vn);
DataInfo di = new DataInfo();
di.setTrust("0.9");
di.setDeletedbyinference(false);
di.setInvisible(false);
parsedObject.setDataInfo(di);
parsedObject
.setOriginalId(
Collections
.singletonList(
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']")));
parsedObject
.setOriginalObjIdentifier(
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']"));
String dateOfCollection = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']");
parsedObject.setDateofcollection(dateOfCollection);
final String resolvedDate = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']");
if (StringUtils.isNotBlank(resolvedDate)) {
StructuredProperty currentDate = new StructuredProperty();
currentDate.setValue(resolvedDate);
final Qualifier dateQualifier = new Qualifier();
dateQualifier.setClassname("resolvedDate");
dateQualifier.setClassid("resolvedDate");
dateQualifier.setSchemename("dnet::date");
dateQualifier.setSchemeid("dnet::date");
currentDate.setQualifier(dateQualifier);
parsedObject.setRelevantdate(Collections.singletonList(currentDate));
}
final String completionStatus = VtdUtilityParser
.getSingleValue(ap, vn, "//*[local-name()='completionStatus']");
final String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']");
final String publisher = VtdUtilityParser
.getSingleValue(
ap, vn, "//*[local-name()='resource']/*[local-name()='publisher']");
List<VtdUtilityParser.Node> collectedFromNodes = VtdUtilityParser
.getTextValuesWithAttributes(
ap,
vn,
"//*[local-name()='collectedFrom']",
Arrays.asList("name", "id", "mode", "completionStatus"));
List<VtdUtilityParser.Node> resolvededFromNodes = VtdUtilityParser
.getTextValuesWithAttributes(
ap,
vn,
"//*[local-name()='resolvedFrom']",
Arrays.asList("name", "id", "mode", "completionStatus"));
Field<String> pf = new Field<>();
pf.setValue(publisher);
parsedObject.setPublisher(pf);
final List<ProvenaceInfo> provenances = new ArrayList<>();
if (collectedFromNodes != null && collectedFromNodes.size() > 0) {
collectedFromNodes
.forEach(
it -> {
final ProvenaceInfo provenance = new ProvenaceInfo();
provenance.setId(it.getAttributes().get("id"));
provenance.setName(it.getAttributes().get("name"));
provenance.setCollectionMode(provisionMode);
provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
provenances.add(provenance);
});
}
if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) {
resolvededFromNodes
.forEach(
it -> {
final ProvenaceInfo provenance = new ProvenaceInfo();
provenance.setId(it.getAttributes().get("id"));
provenance.setName(it.getAttributes().get("name"));
provenance.setCollectionMode("resolved");
provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
provenances.add(provenance);
});
}
parsedObject.setDlicollectedfrom(provenances);
parsedObject
.setCollectedfrom(
parsedObject
.getDlicollectedfrom()
.stream()
.map(
p -> {
final KeyValue cf = new KeyValue();
cf.setKey(p.getId());
cf.setValue(p.getName());
return cf;
})
.collect(Collectors.toList()));
parsedObject
.setCompletionStatus(
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']"));
final List<Node> identifierType = VtdUtilityParser
.getTextValuesWithAttributes(
ap,
vn,
"//*[local-name()='resource']/*[local-name()='identifier']",
Collections.singletonList("identifierType"));
StructuredProperty currentPid = extractIdentifier(identifierType, "identifierType");
if (currentPid == null)
return null;
inferPid(currentPid);
parsedObject.setPid(Collections.singletonList(currentPid));
String resolvedURL = null;
switch (currentPid.getQualifier().getClassname().toLowerCase()) {
case "uniprot":
resolvedURL = "https://www.uniprot.org/uniprot/" + currentPid.getValue();
break;
case "ena":
if (StringUtils.isNotBlank(currentPid.getValue()) && currentPid.getValue().length() > 7)
resolvedURL = "https://www.ebi.ac.uk/ena/data/view/" + currentPid.getValue().substring(0, 8);
break;
case "chembl":
resolvedURL = "https://www.ebi.ac.uk/chembl/compound_report_card/" + currentPid.getValue();
break;
case "ncbi-n":
resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
break;
case "ncbi-p":
resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
break;
case "genbank":
resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
break;
case "pdb":
resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
break;
case "url":
resolvedURL = currentPid.getValue();
break;
}
final String sourceId = generateId(
currentPid.getValue(), currentPid.getQualifier().getClassid(), "dataset");
parsedObject.setId(sourceId);
List<String> descs = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='description']");
if (descs != null && descs.size() > 0)
parsedObject
.setDescription(
descs
.stream()
// .map(it -> it.length() < 10000 ? it : it.substring(0, 10000))
.map(
it -> {
final Field<String> d = new Field<>();
d.setValue(it);
return d;
})
.collect(Collectors.toList()));
final List<Node> relatedIdentifiers = VtdUtilityParser
.getTextValuesWithAttributes(
ap,
vn,
"//*[local-name()='relatedIdentifier']",
Arrays
.asList(
"relatedIdentifierType", "relationType", "entityType", "inverseRelationType"));
generateRelations(
relationMapper, parsedObject, result, di, dateOfCollection, relatedIdentifiers);
final List<Node> hostedBy = VtdUtilityParser
.getTextValuesWithAttributes(
ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name"));
if (hostedBy != null) {
parsedObject
.setInstance(
hostedBy
.stream()
.map(
it -> {
final Instance i = new Instance();
i.setUrl(Collections.singletonList(currentPid.getValue()));
KeyValue h = new KeyValue();
i.setHostedby(h);
h.setKey(it.getAttributes().get("id"));
h.setValue(it.getAttributes().get("name"));
return i;
})
.collect(Collectors.toList()));
}
List<StructuredProperty> subjects = extractSubject(
VtdUtilityParser
.getTextValuesWithAttributes(
ap,
vn,
"//*[local-name()='resource']//*[local-name()='subject']",
Collections.singletonList("subjectScheme")));
parsedObject.setSubject(subjects);
Qualifier q = new Qualifier();
q.setClassname("dataset");
q.setClassid("dataset");
q.setSchemename("dataset");
q.setSchemeid("dataset");
parsedObject.setResulttype(q);
parsedObject.setCompletionStatus(completionStatus);
final List<String> creators = VtdUtilityParser
.getTextValue(
ap,
vn,
"//*[local-name()='resource']//*[local-name()='creator']/*[local-name()='creatorName']");
if (creators != null && creators.size() > 0) {
parsedObject
.setAuthor(
creators
.stream()
.map(
a -> {
final Author author = new Author();
author.setFullname(a);
return author;
})
.collect(Collectors.toList()));
}
final List<String> titles = VtdUtilityParser
.getTextValue(
ap, vn, "//*[local-name()='resource']//*[local-name()='title']");
if (titles != null && titles.size() > 0) {
parsedObject
.setTitle(
titles
.stream()
.map(
t -> {
final StructuredProperty st = new StructuredProperty();
st.setValue(t);
st.setQualifier(ModelConstants.MAIN_TITLE_QUALIFIER);
return st;
})
.collect(Collectors.toList()));
}
final List<String> dates = VtdUtilityParser
.getTextValue(
ap,
vn,
"//*[local-name()='resource']/*[local-name()='dates']/*[local-name()='date']");
if (dates != null && dates.size() > 0) {
parsedObject
.setRelevantdate(
dates
.stream()
.map(
cd -> {
StructuredProperty date = new StructuredProperty();
date.setValue(cd);
final Qualifier dq = new Qualifier();
dq.setClassname("date");
dq.setClassid("date");
dq.setSchemename("dnet::date");
dq.setSchemeid("dnet::date");
date.setQualifier(dq);
return date;
})
.collect(Collectors.toList()));
}
// TERRIBLE HACK TO AVOID EMPTY COLLECTED FROM
if (parsedObject.getDlicollectedfrom() == null) {
final KeyValue cf = new KeyValue();
cf.setKey("dli_________::europe_pmc__");
cf.setValue("Europe PMC");
parsedObject.setCollectedfrom(Collections.singletonList(cf));
}
if (StringUtils.isNotBlank(resolvedURL)) {
Instance i = new Instance();
i.setCollectedfrom(parsedObject.getCollectedfrom().get(0));
i.setUrl(Collections.singletonList(resolvedURL));
parsedObject.setInstance(Collections.singletonList(i));
}
result.add(parsedObject);
return result;
} catch (Throwable e) {
log.error("Error on parsing record " + record, e);
return null;
}
}
}

View File

@@ -1,264 +0,0 @@
package eu.dnetlib.dhp.sx.graph.parser;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.ximpleware.AutoPilot;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo;
import eu.dnetlib.scholexplorer.relation.RelationMapper;
public class PublicationScholexplorerParser extends AbstractScholexplorerParser {
@Override
public List<Oaf> parseObject(final String record, final RelationMapper relationMapper) {
try {
final List<Oaf> result = new ArrayList<>();
final DLIPublication parsedObject = new DLIPublication();
final VTDGen vg = new VTDGen();
vg.setDoc(record.getBytes());
vg.parse(true);
final VTDNav vn = vg.getNav();
final AutoPilot ap = new AutoPilot(vn);
final DataInfo di = new DataInfo();
di.setTrust("0.9");
di.setDeletedbyinference(false);
di.setInvisible(false);
String dateOfCollection = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']");
parsedObject.setDateofcollection(dateOfCollection);
final String resolvedDate = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']");
parsedObject
.setOriginalId(
Collections
.singletonList(
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']")));
if (StringUtils.isNotBlank(resolvedDate)) {
StructuredProperty currentDate = new StructuredProperty();
currentDate.setValue(resolvedDate);
final Qualifier dateQualifier = new Qualifier();
dateQualifier.setClassname("resolvedDate");
dateQualifier.setClassid("resolvedDate");
dateQualifier.setSchemename("dnet::date");
dateQualifier.setSchemeid("dnet::date");
currentDate.setQualifier(dateQualifier);
parsedObject.setRelevantdate(Collections.singletonList(currentDate));
}
final List<Node> pid = VtdUtilityParser
.getTextValuesWithAttributes(
ap, vn, "//*[local-name()='pid']", Arrays.asList("type"));
StructuredProperty currentPid = extractIdentifier(pid, "type");
if (currentPid == null)
return null;
inferPid(currentPid);
parsedObject.setPid(Collections.singletonList(currentPid));
final String sourceId = generateId(
currentPid.getValue(), currentPid.getQualifier().getClassid(), "publication");
parsedObject.setId(sourceId);
parsedObject
.setOriginalObjIdentifier(
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']"));
String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']");
List<Node> collectedFromNodes = VtdUtilityParser
.getTextValuesWithAttributes(
ap,
vn,
"//*[local-name()='collectedFrom']",
Arrays.asList("name", "id", "mode", "completionStatus"));
List<Node> resolvededFromNodes = VtdUtilityParser
.getTextValuesWithAttributes(
ap,
vn,
"//*[local-name()='resolvedFrom']",
Arrays.asList("name", "id", "mode", "completionStatus"));
final String publisher = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='publisher']");
Field<String> pf = new Field<>();
pf.setValue(publisher);
parsedObject.setPublisher(pf);
final List<ProvenaceInfo> provenances = new ArrayList<>();
if (collectedFromNodes != null && collectedFromNodes.size() > 0) {
collectedFromNodes
.forEach(
it -> {
final ProvenaceInfo provenance = new ProvenaceInfo();
provenance.setId(it.getAttributes().get("id"));
provenance.setName(it.getAttributes().get("name"));
provenance.setCollectionMode(provisionMode);
provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
provenances.add(provenance);
});
}
if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) {
resolvededFromNodes
.forEach(
it -> {
final ProvenaceInfo provenance = new ProvenaceInfo();
provenance.setId(it.getAttributes().get("id"));
provenance.setName(it.getAttributes().get("name"));
provenance.setCollectionMode("resolved");
provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
provenances.add(provenance);
});
}
parsedObject.setDlicollectedfrom(provenances);
parsedObject
.setCompletionStatus(
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']"));
parsedObject
.setCollectedfrom(
parsedObject
.getDlicollectedfrom()
.stream()
.map(
p -> {
final KeyValue cf = new KeyValue();
cf.setKey(p.getId());
cf.setValue(p.getName());
return cf;
})
.collect(Collectors.toList()));
final List<Node> relatedIdentifiers = VtdUtilityParser
.getTextValuesWithAttributes(
ap,
vn,
"//*[local-name()='relatedIdentifier']",
Arrays
.asList(
"relatedIdentifierType", "relationType", "entityType", "inverseRelationType"));
generateRelations(
relationMapper, parsedObject, result, di, dateOfCollection, relatedIdentifiers);
final List<Node> hostedBy = VtdUtilityParser
.getTextValuesWithAttributes(
ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name"));
if (hostedBy != null) {
parsedObject
.setInstance(
hostedBy
.stream()
.map(
it -> {
final Instance i = new Instance();
i.setUrl(Collections.singletonList(currentPid.getValue()));
KeyValue h = new KeyValue();
i.setHostedby(h);
h.setKey(it.getAttributes().get("id"));
h.setValue(it.getAttributes().get("name"));
return i;
})
.collect(Collectors.toList()));
}
final List<String> authorsNode = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='creator']");
if (authorsNode != null)
parsedObject
.setAuthor(
authorsNode
.stream()
.map(
a -> {
final Author author = new Author();
author.setFullname(a);
return author;
})
.collect(Collectors.toList()));
final List<String> titles = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='title']");
if (titles != null) {
parsedObject
.setTitle(
titles
.stream()
.map(
t -> {
final StructuredProperty st = new StructuredProperty();
st.setValue(t);
st
.setQualifier(
generateQualifier(
"main title", "main title", "dnet:dataCite_title",
"dnet:dataCite_title"));
return st;
})
.collect(Collectors.toList()));
}
Field<String> description = new Field<>();
description
.setValue(
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='description']"));
// if (StringUtils.isNotBlank(description.getValue())
// && description.getValue().length() > 10000) {
// description.setValue(description.getValue().substring(0, 10000));
// }
parsedObject.setDescription(Collections.singletonList(description));
final String cd = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='date']");
StructuredProperty date = new StructuredProperty();
date.setValue(cd);
final Qualifier dq = new Qualifier();
dq.setClassname("date");
dq.setClassid("date");
dq.setSchemename("dnet::date");
dq.setSchemeid("dnet::date");
date.setQualifier(dq);
parsedObject.setRelevantdate(Collections.singletonList(date));
List<StructuredProperty> subjects = extractSubject(
VtdUtilityParser
.getTextValuesWithAttributes(
ap, vn, "//*[local-name()='subject']", Collections.singletonList("scheme")));
parsedObject.setSubject(subjects);
parsedObject.setDataInfo(di);
parsedObject.setSubject(subjects);
Qualifier q = new Qualifier();
q.setClassname("publication");
q.setClassid("publication");
q.setSchemename("publication");
q.setSchemeid("publication");
parsedObject.setResulttype(q);
result.add(parsedObject);
return result;
} catch (Throwable e) {
log.error("Input record: " + record);
log.error("Error on parsing record ", e);
return null;
}
}
}

View File

@@ -0,0 +1,54 @@
package eu.dnetlib.dhp.sx.graph.scholix
import eu.dnetlib.dhp.schema.oaf.{Dataset, Result}
import eu.dnetlib.dhp.schema.sx.summary.{SchemeValue, ScholixSummary, TypedIdentifier, Typology}
import scala.collection.JavaConverters._
object ScholixUtils {
def resultToSummary(r:Result):ScholixSummary = {
val s = new ScholixSummary
s.setId(r.getId)
s.setLocalIdentifier(r.getPid.asScala.map(p => new TypedIdentifier(p.getValue, p.getQualifier.getClassid)).asJava)
if (r.isInstanceOf[Dataset])
s.setTypology(Typology.dataset)
else
s.setTypology(Typology.publication)
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
if (r.getTitle!= null && r.getTitle.asScala.nonEmpty) {
s.setTitle(r.getTitle.asScala.map(t => t.getValue).asJava)
}
if(r.getAuthor!= null && !r.getAuthor.isEmpty) {
s.setAuthor(r.getAuthor.asScala.map(a=> a.getFullname).asJava)
}
if (r.getInstance() != null) {
val dt:List[String] = r.getInstance().asScala.filter(i => i.getDateofacceptance != null).map(i => i.getDateofacceptance.getValue)(collection.breakOut)
if (dt.nonEmpty)
s.setDate(dt.asJava)
}
if (r.getDescription!= null && !r.getDescription.isEmpty) {
val d = r.getDescription.asScala.find(f => f.getValue!=null)
if (d.isDefined)
s.setDescription(d.get.getValue)
}
if (r.getSubject!= null && !r.getSubject.isEmpty)
s.setSubject(r.getSubject.asScala.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue)).asJava)
if (r.getPublisher!= null)
s.setPublisher(List(r.getPublisher.getValue).asJava)
s.setRelatedDatasets(0)
s.setRelatedPublications(0)
s.setRelatedUnknown(0)
s
}
}

View File

@@ -0,0 +1,5 @@
[
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the raw graph", "paramRequired": true}
]
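These are the parameters that SparkCreateSummaryObject (above) reads through ArgumentApplicationParser, and that the workflow below fills with yarn, ${targetPath}/dedup and ${targetPath}/provision/summaries. A hypothetical local invocation, with made-up paths, would look like:

// hypothetical local run of the new job; the paths are placeholders
SparkCreateSummaryObject.main(Array(
  "--master", "local[*]",
  "--sourcePath", "/tmp/scholix/dedup",
  "--targetPath", "/tmp/scholix/provision/summaries"))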

View File

@@ -10,7 +10,7 @@
     </property>
   </parameters>
-  <start to="ResolveRelations"/>
+  <start to="CreateSummaries"/>
   <kill name="Kill">
     <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
@@ -64,9 +64,38 @@
       <arg>--workingPath</arg><arg>${targetPath}/resolved/</arg>
       <arg>--entityPath</arg><arg>${targetPath}/dedup</arg>
     </spark>
+    <ok to="CreateSummaries"/>
+    <error to="Kill"/>
+  </action>
+  <action name="CreateSummaries">
+    <spark xmlns="uri:oozie:spark-action:0.2">
+      <master>yarn</master>
+      <mode>cluster</mode>
+      <name>Convert Entities to summaries</name>
+      <class>eu.dnetlib.dhp.sx.graph.SparkCreateSummaryObject</class>
+      <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+      <spark-opts>
+        --executor-memory=${sparkExecutorMemory}
+        --executor-cores=${sparkExecutorCores}
+        --driver-memory=${sparkDriverMemory}
+        --conf spark.extraListeners=${spark2ExtraListeners}
+        --conf spark.sql.shuffle.partitions=5000
+        --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+        --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+        --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+      </spark-opts>
+      <arg>--master</arg><arg>yarn</arg>
+      <arg>--sourcePath</arg><arg>${targetPath}/dedup</arg>
+      <arg>--targetPath</arg><arg>${targetPath}/provision/summaries</arg>
+    </spark>
     <ok to="End"/>
     <error to="Kill"/>
   </action>
   <end name="End"/>
 </workflow-app>

View File

@@ -1,63 +0,0 @@
package eu.dnetlib.dhp.sx.graph;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.sx.graph.parser.DatasetScholexplorerParser;
import eu.dnetlib.dhp.sx.graph.parser.PublicationScholexplorerParser;
import eu.dnetlib.scholexplorer.relation.RelationMapper;
public class ScholexplorerParserTest {
@Test
public void testDataciteParser() throws Exception {
String xml = IOUtils.toString(this.getClass().getResourceAsStream("dmf.xml"));
DatasetScholexplorerParser p = new DatasetScholexplorerParser();
List<Oaf> oaves = p.parseObject(xml, RelationMapper.load());
ObjectMapper m = new ObjectMapper();
m.enable(SerializationFeature.INDENT_OUTPUT);
oaves
.forEach(
oaf -> {
try {
System.out.println(m.writeValueAsString(oaf));
System.out.println("----------------------------");
} catch (JsonProcessingException e) {
}
});
}
@Test
public void testPublicationParser() throws Exception {
String xml = IOUtils.toString(this.getClass().getResourceAsStream("pmf.xml"));
PublicationScholexplorerParser p = new PublicationScholexplorerParser();
List<Oaf> oaves = p.parseObject(xml, RelationMapper.load());
ObjectMapper m = new ObjectMapper();
m.enable(SerializationFeature.INDENT_OUTPUT);
oaves
.forEach(
oaf -> {
try {
System.out.println(m.writeValueAsString(oaf));
System.out.println("----------------------------");
} catch (JsonProcessingException e) {
}
});
}
}

View File

@@ -1,54 +0,0 @@
package eu.dnetlib.dhp.sx.graph
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication
import eu.dnetlib.dhp.sx.graph.ebi.EBIAggregator
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test
import scala.io.Source
class SparkScholexplorerAggregationTest {
@Test
def testFunderRelationshipsMapping(): Unit = {
val publications = Source.fromInputStream(getClass.getResourceAsStream("publication.json")).mkString
var s: List[DLIPublication] = List[DLIPublication]()
val m: ObjectMapper = new ObjectMapper()
m.enable(SerializationFeature.INDENT_OUTPUT)
for (line <- publications.lines) {
s = m.readValue(line, classOf[DLIPublication]) :: s
}
implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication]
val spark: SparkSession = SparkSession.builder().appName("Test").master("local[*]").config("spark.driver.bindAddress", "127.0.0.1").getOrCreate()
val ds: Dataset[DLIPublication] = spark.createDataset(spark.sparkContext.parallelize(s)).as[DLIPublication]
val unique = ds.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, pubEncoder))
.groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getDLIPublicationAggregator().toColumn)
.map(p => p._2)
val uniquePubs: DLIPublication = unique.first()
s.foreach(pp => assertFalse(pp.getAuthor.isEmpty))
assertNotNull(uniquePubs.getAuthor)
assertFalse(uniquePubs.getAuthor.isEmpty)
}
}

View File

@@ -1,6 +0,0 @@
package eu.dnetlib.dhp.sx.graph;
public class SparkScholexplorerGraphImporterTest {
}

View File

@@ -1,5 +0,0 @@
package eu.dnetlib.dhp.sx.graph;
public class SparkScholexplorerMergeEntitiesJobTest {
}

View File

@@ -8,10 +8,9 @@ import eu.dnetlib.dhp.common.PacePerson
 import eu.dnetlib.dhp.schema.action.AtomicAction
 import eu.dnetlib.dhp.schema.common.ModelConstants
 import eu.dnetlib.dhp.schema.oaf.{Author, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty}
-import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication}
 import eu.dnetlib.dhp.utils.DHPUtils
 import org.apache.commons.lang3.StringUtils
-import eu.dnetlib.dhp.schema.scholexplorer.OafUtils._
 import scala.collection.JavaConverters._

View File

@@ -15,7 +15,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-public class SparkIndexCollectionOnES {
+public class
+SparkIndexCollectionOnES {
     public static void main(String[] args) throws Exception {

View File

@@ -24,8 +24,8 @@
     <module>dhp-dedup-openaire</module>
     <module>dhp-enrichment</module>
     <module>dhp-graph-provision</module>
-    <module>dhp-dedup-scholexplorer</module>
-    <module>dhp-graph-provision-scholexplorer</module>
+    <!-- <module>dhp-dedup-scholexplorer</module>-->
+    <!-- <module>dhp-graph-provision-scholexplorer</module>-->
     <module>dhp-blacklist</module>
     <module>dhp-stats-update</module>
     <module>dhp-stats-promote</module>

View File

@@ -736,7 +736,7 @@
     <mockito-core.version>3.3.3</mockito-core.version>
     <mongodb.driver.version>3.4.2</mongodb.driver.version>
     <vtd.version>[2.12,3.0)</vtd.version>
-    <dhp-schemas.version>[2.6.13]</dhp-schemas.version>
+    <dhp-schemas.version>[2.6.14]</dhp-schemas.version>
     <dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
     <dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
     <dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>