forked from D-Net/dnet-hadoop
massive code refactor:
removed modules dhp-*-scholexplorer
This commit is contained in:
parent
829caee4fd
commit
c6fa8598e1
|
@ -6,7 +6,7 @@ import eu.dnetlib.dhp.utils.DHPUtils
|
|||
import org.apache.commons.lang3.StringUtils
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
|
||||
import org.json4s
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
|
@ -118,11 +118,11 @@ object DoiBoostMappingUtil {
|
|||
|
||||
|
||||
def getOpenAccessQualifier():AccessRight = {
|
||||
OafUtils.createAccessRight("OPEN","Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
|
||||
OafMapperUtils.accessRight("OPEN","Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
|
||||
}
|
||||
|
||||
def getRestrictedQualifier():AccessRight = {
|
||||
OafUtils.createAccessRight("RESTRICTED","Restricted",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
|
||||
OafMapperUtils.accessRight("RESTRICTED","Restricted",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
|
||||
}
|
||||
|
||||
|
||||
|
@ -150,7 +150,7 @@ object DoiBoostMappingUtil {
|
|||
if (item.openAccess)
|
||||
i.setAccessright(getOpenAccessQualifier())
|
||||
val ar = getOpenAccessQualifier()
|
||||
publication.setBestaccessright(OafUtils.createQualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
|
||||
publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
|
||||
}
|
||||
else {
|
||||
hb = ModelConstants.UNKNOWN_REPOSITORY
|
||||
|
@ -162,11 +162,11 @@ object DoiBoostMappingUtil {
|
|||
if (ar.nonEmpty) {
|
||||
if(ar.contains(ModelConstants.ACCESS_RIGHT_OPEN)){
|
||||
val ar = getOpenAccessQualifier()
|
||||
publication.setBestaccessright(OafUtils.createQualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
|
||||
publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
|
||||
}
|
||||
else {
|
||||
val ar = getRestrictedQualifier()
|
||||
publication.setBestaccessright(OafUtils.createQualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
|
||||
publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
|
||||
}
|
||||
}
|
||||
publication
|
||||
|
@ -254,7 +254,7 @@ object DoiBoostMappingUtil {
|
|||
di.setInferred(false)
|
||||
di.setInvisible(false)
|
||||
di.setTrust(trust)
|
||||
di.setProvenanceaction(OafUtils.createQualifier(ModelConstants.SYSIMPORT_ACTIONSET, ModelConstants.DNET_PROVENANCE_ACTIONS))
|
||||
di.setProvenanceaction(OafMapperUtils.qualifier(ModelConstants.SYSIMPORT_ACTIONSET,ModelConstants.SYSIMPORT_ACTIONSET, ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS))
|
||||
di
|
||||
}
|
||||
|
||||
|
@ -262,7 +262,7 @@ object DoiBoostMappingUtil {
|
|||
|
||||
def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String): StructuredProperty = {
|
||||
val sp = new StructuredProperty
|
||||
sp.setQualifier(OafUtils.createQualifier(classId,className, schemeId, schemeName))
|
||||
sp.setQualifier(OafMapperUtils.qualifier(classId,className, schemeId, schemeName))
|
||||
sp.setValue(value)
|
||||
sp
|
||||
|
||||
|
@ -272,7 +272,7 @@ object DoiBoostMappingUtil {
|
|||
|
||||
def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String, dataInfo: DataInfo): StructuredProperty = {
|
||||
val sp = new StructuredProperty
|
||||
sp.setQualifier(OafUtils.createQualifier(classId,className, schemeId, schemeName))
|
||||
sp.setQualifier(OafMapperUtils.qualifier(classId,className, schemeId, schemeName))
|
||||
sp.setValue(value)
|
||||
sp.setDataInfo(dataInfo)
|
||||
sp
|
||||
|
@ -281,7 +281,7 @@ object DoiBoostMappingUtil {
|
|||
|
||||
def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
|
||||
val sp = new StructuredProperty
|
||||
sp.setQualifier(OafUtils.createQualifier(classId, schemeId))
|
||||
sp.setQualifier(OafMapperUtils.qualifier(classId,classId, schemeId, schemeId))
|
||||
sp.setValue(value)
|
||||
sp
|
||||
|
||||
|
@ -291,7 +291,7 @@ object DoiBoostMappingUtil {
|
|||
|
||||
def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
|
||||
val sp = new StructuredProperty
|
||||
sp.setQualifier(OafUtils.createQualifier(classId, schemeId))
|
||||
sp.setQualifier(OafMapperUtils.qualifier(classId,classId, schemeId, schemeId))
|
||||
sp.setValue(value)
|
||||
sp.setDataInfo(dataInfo)
|
||||
sp
|
||||
|
|
|
@ -2,7 +2,7 @@ package eu.dnetlib.doiboost.crossref
|
|||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf._
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
|
||||
import org.apache.commons.lang.StringUtils
|
||||
|
@ -15,8 +15,6 @@ import org.slf4j.{Logger, LoggerFactory}
|
|||
import scala.collection.JavaConverters._
|
||||
import scala.collection.mutable
|
||||
import scala.util.matching.Regex
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils
|
||||
|
||||
import java.util
|
||||
|
||||
case class CrossrefDT(doi: String, json:String, timestamp: Long) {}
|
||||
|
@ -182,12 +180,12 @@ case object Crossref2Oaf {
|
|||
|
||||
if(has_review != JNothing) {
|
||||
instance.setRefereed(
|
||||
OafUtils.createQualifier("0001", "peerReviewed", ModelConstants.DNET_REVIEW_LEVELS, ModelConstants.DNET_REVIEW_LEVELS))
|
||||
OafMapperUtils.qualifier("0001", "peerReviewed", ModelConstants.DNET_REVIEW_LEVELS, ModelConstants.DNET_REVIEW_LEVELS))
|
||||
}
|
||||
|
||||
instance.setAccessright(getRestrictedQualifier())
|
||||
instance.setInstancetype(OafUtils.createQualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
||||
result.setResourcetype(OafUtils.createQualifier(cobjCategory.substring(0, 4),ModelConstants.DNET_DATA_CITE_RESOURCE))
|
||||
instance.setInstancetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
||||
result.setResourcetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
||||
|
||||
instance.setCollectedfrom(createCrossrefCollectedFrom())
|
||||
if (StringUtils.isNotBlank(issuedDate)) {
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
package eu.dnetlib.dhp.sx.graph
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.Result
|
||||
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
|
||||
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
/** Spark job that reads `Result` records from `sourcePath`, converts each one through
  * `ScholixUtils.resultToSummary` and writes the outcome to `targetPath`, overwriting it.
  *
  * Arguments are described by the bundled `create_summaries_params.json` resource; the job
  * reads `master`, `sourcePath` and `targetPath` from them.
  */
object SparkCreateSummaryObject {

  /** Number of partitions the input is redistributed into before the conversion. */
  private val SummaryPartitions: Int = 6000

  /** Job entry point.
    *
    * @param args command line arguments, parsed with `ArgumentApplicationParser`
    */
  def main(args: Array[String]): Unit = {
    val log: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()

    // Decode the parameter definition explicitly as UTF-8 instead of relying on the
    // platform default charset used by the one-argument IOUtils.toString overload.
    val paramsDefinition: String = IOUtils.toString(
      getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_summaries_params.json"),
      "UTF-8"
    )
    val parser = new ArgumentApplicationParser(paramsDefinition)
    parser.parseArgument(args)

    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master"))
        .getOrCreate()

    val sourcePath = parser.get("sourcePath")
    log.info(s"sourcePath -> $sourcePath")
    val targetPath = parser.get("targetPath")
    log.info(s"targetPath -> $targetPath")

    implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]

    // Every sub-folder of sourcePath is loaded as Result records.
    val ds: Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Result]

    ds.repartition(SummaryPartitions)
      .map(r => ScholixUtils.resultToSummary(r))
      .write
      .mode(SaveMode.Overwrite)
      .save(targetPath)
  }

}
|
|
@ -1,177 +0,0 @@
|
|||
package eu.dnetlib.dhp.sx.graph.ebi
|
||||
|
||||
import eu.dnetlib.dhp.oa.merge.AuthorMerger
|
||||
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown}
|
||||
import org.apache.spark.sql.{Encoder, Encoders}
|
||||
import org.apache.spark.sql.expressions.Aggregator
|
||||
|
||||
|
||||
|
||||
/** Factory of typed Spark `Aggregator`s that collapse all the records grouped under the
  * same key into a single record.
  *
  * The entity aggregators share the same strategy: `mergeFrom` is used to combine the
  * records, authors are combined through `AuthorMerger.mergeAuthor`, and the identifier
  * of the buffer is filled in from the incoming record when it is still missing.
  * Buffers and outputs are Kryo-encoded.
  */
object EBIAggregator {

  /** Aggregator merging `OafDataset` records keyed by a `String`. */
  def getDatasetAggregator(): Aggregator[(String, OafDataset), OafDataset, OafDataset] = new Aggregator[(String, OafDataset), OafDataset, OafDataset] {

    override def zero: OafDataset = new OafDataset()

    override def reduce(b: OafDataset, a: (String, OafDataset)): OafDataset = {
      b.mergeFrom(a._2)
      b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
      if (b.getId == null)
        b.setId(a._2.getId)
      b
    }

    override def merge(wx: OafDataset, wy: OafDataset): OafDataset = {
      wx.mergeFrom(wy)
      wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
      // wy.getId may be null as well: check it before calling nonEmpty, which would throw.
      if (wx.getId == null && wy.getId != null && wy.getId.nonEmpty)
        wx.setId(wy.getId)
      wx
    }

    override def finish(reduction: OafDataset): OafDataset = reduction

    override def bufferEncoder: Encoder[OafDataset] =
      Encoders.kryo(classOf[OafDataset])

    override def outputEncoder: Encoder[OafDataset] =
      Encoders.kryo(classOf[OafDataset])
  }

  /** Aggregator merging `DLIUnknown` records keyed by a `String` (no author handling). */
  def getDLIUnknownAggregator(): Aggregator[(String, DLIUnknown), DLIUnknown, DLIUnknown] = new Aggregator[(String, DLIUnknown), DLIUnknown, DLIUnknown] {

    override def zero: DLIUnknown = new DLIUnknown()

    override def reduce(b: DLIUnknown, a: (String, DLIUnknown)): DLIUnknown = {
      b.mergeFrom(a._2)
      if (b.getId == null)
        b.setId(a._2.getId)
      b
    }

    override def merge(wx: DLIUnknown, wy: DLIUnknown): DLIUnknown = {
      wx.mergeFrom(wy)
      // wy.getId may be null as well: check it before calling nonEmpty, which would throw.
      if (wx.getId == null && wy.getId != null && wy.getId.nonEmpty)
        wx.setId(wy.getId)
      wx
    }

    override def finish(reduction: DLIUnknown): DLIUnknown = reduction

    override def bufferEncoder: Encoder[DLIUnknown] =
      Encoders.kryo(classOf[DLIUnknown])

    override def outputEncoder: Encoder[DLIUnknown] =
      Encoders.kryo(classOf[DLIUnknown])
  }

  /** Aggregator merging `DLIDataset` records keyed by a `String`. */
  def getDLIDatasetAggregator(): Aggregator[(String, DLIDataset), DLIDataset, DLIDataset] = new Aggregator[(String, DLIDataset), DLIDataset, DLIDataset] {

    override def zero: DLIDataset = new DLIDataset()

    override def reduce(b: DLIDataset, a: (String, DLIDataset)): DLIDataset = {
      b.mergeFrom(a._2)
      b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
      if (b.getId == null)
        b.setId(a._2.getId)
      b
    }

    override def merge(wx: DLIDataset, wy: DLIDataset): DLIDataset = {
      wx.mergeFrom(wy)
      wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
      // wy.getId may be null as well: check it before calling nonEmpty, which would throw.
      if (wx.getId == null && wy.getId != null && wy.getId.nonEmpty)
        wx.setId(wy.getId)
      wx
    }

    override def finish(reduction: DLIDataset): DLIDataset = reduction

    override def bufferEncoder: Encoder[DLIDataset] =
      Encoders.kryo(classOf[DLIDataset])

    override def outputEncoder: Encoder[DLIDataset] =
      Encoders.kryo(classOf[DLIDataset])
  }

  /** Aggregator merging `DLIPublication` records keyed by a `String`. */
  def getDLIPublicationAggregator(): Aggregator[(String, DLIPublication), DLIPublication, DLIPublication] = new Aggregator[(String, DLIPublication), DLIPublication, DLIPublication] {

    override def zero: DLIPublication = new DLIPublication()

    override def reduce(b: DLIPublication, a: (String, DLIPublication)): DLIPublication = {
      b.mergeFrom(a._2)
      b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
      if (b.getId == null)
        b.setId(a._2.getId)
      b
    }

    override def merge(wx: DLIPublication, wy: DLIPublication): DLIPublication = {
      wx.mergeFrom(wy)
      wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
      // wy.getId may be null as well: check it before calling nonEmpty, which would throw.
      if (wx.getId == null && wy.getId != null && wy.getId.nonEmpty)
        wx.setId(wy.getId)
      wx
    }

    override def finish(reduction: DLIPublication): DLIPublication = reduction

    override def bufferEncoder: Encoder[DLIPublication] =
      Encoders.kryo(classOf[DLIPublication])

    override def outputEncoder: Encoder[DLIPublication] =
      Encoders.kryo(classOf[DLIPublication])
  }

  /** Aggregator merging `Publication` records keyed by a `String`. */
  def getPublicationAggregator(): Aggregator[(String, Publication), Publication, Publication] = new Aggregator[(String, Publication), Publication, Publication] {

    override def zero: Publication = new Publication()

    override def reduce(b: Publication, a: (String, Publication)): Publication = {
      b.mergeFrom(a._2)
      b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
      if (b.getId == null)
        b.setId(a._2.getId)
      b
    }

    override def merge(wx: Publication, wy: Publication): Publication = {
      wx.mergeFrom(wy)
      wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
      // wy.getId may be null as well: check it before calling nonEmpty, which would throw.
      if (wx.getId == null && wy.getId != null && wy.getId.nonEmpty)
        wx.setId(wy.getId)
      wx
    }

    override def finish(reduction: Publication): Publication = reduction

    override def bufferEncoder: Encoder[Publication] =
      Encoders.kryo(classOf[Publication])

    override def outputEncoder: Encoder[Publication] =
      Encoders.kryo(classOf[Publication])
  }

  /** Aggregator de-duplicating `Relation` records keyed by a `String`.
    *
    * No field-level merge is performed: `reduce` keeps the incoming relation and
    * `merge` keeps the right-hand buffer whenever it is not null.
    */
  def getRelationAggregator(): Aggregator[(String, Relation), Relation, Relation] = new Aggregator[(String, Relation), Relation, Relation] {

    override def zero: Relation = new Relation()

    override def reduce(b: Relation, a: (String, Relation)): Relation = {
      a._2
    }

    // NOTE(review): if the right-hand buffer is still the zero value it wins over a
    // populated left-hand buffer — confirm this cannot happen for the way the job groups data.
    override def merge(a: Relation, b: Relation): Relation = {
      if (b != null) b else a
    }

    override def finish(reduction: Relation): Relation = reduction

    override def bufferEncoder: Encoder[Relation] =
      Encoders.kryo(classOf[Relation])

    override def outputEncoder: Encoder[Relation] =
      Encoders.kryo(classOf[Relation])
  }
}
|
|
@ -1,248 +0,0 @@
|
|||
package eu.dnetlib.dhp.sx.graph.ebi
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.{Author, Instance, Journal, KeyValue, Oaf, Publication, Relation, Dataset => OafDataset}
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils.createQualifier
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, OafUtils, ProvenaceInfo}
|
||||
import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import eu.dnetlib.scholexplorer.relation.RelationMapper
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql._
|
||||
import org.json4s
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
import org.apache.spark.sql.functions._
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/** Spark job that turns the EBI link updates stored under `workingPath` into OAF records
  * (relations, datasets, publications) and merges them with the existing datasets and
  * relations.
  */
object SparkAddLinkUpdates {

  val relationMapper: RelationMapper = RelationMapper.load

  /** Flat view of one link extracted from the EBI JSON payload. */
  case class EBILinks(relation:String, pubdate:String, tpid:String, tpidType:String, turl:String, title:String, publisher:String) {}

  /** Builds the `collectedfrom` key/value identifying Europe PMC. */
  def generatePubmedDLICollectedFrom(): KeyValue = {
    OafUtils.generateKeyValue("dli_________::europe_pmc__", "Europe PMC")
  }

  /** Builds the provenance entry attached to every record produced by this job. */
  private def europePmcProvenance(): ProvenaceInfo = {
    val pi = new ProvenaceInfo
    pi.setId("dli_________::europe_pmc__")
    pi.setName( "Europe PMC")
    pi.setCompletionStatus("complete")
    pi.setCollectionMode("collected")
    pi
  }

  /** Converts a PubMed journal into an OAF `Journal`.
    *
    * @param pj the PubMed journal to convert
    * @return a journal carrying ISSN (printed), volume, title and issue of `pj`
    */
  def journalToOAF(pj:PMJournal): Journal = {
    val j = new Journal
    j.setIssnPrinted(pj.getIssn)
    j.setVol(pj.getVolume)
    j.setName(pj.getTitle)
    j.setIss(pj.getIssue)
    j.setDataInfo(OafUtils.generateDataInfo())
    j
  }

  /** Converts a PubMed article into a `DLIPublication`.
    *
    * The identifier is `50|` followed by the MD5 of `<pmid>::pmid`.
    *
    * @param input the PubMed article to convert
    * @return the publication, with a single instance pointing to the PubMed landing page
    */
  def pubmedTOPublication(input:PMArticle):DLIPublication = {

    val dnetPublicationId = s"50|${DHPUtils.md5(s"${input.getPmid}::pmid")}"

    val p = new DLIPublication
    p.setId(dnetPublicationId)
    p.setDataInfo(OafUtils.generateDataInfo())
    p.setPid(List(OafUtils.createSP(input.getPmid.toLowerCase.trim, "pmid", ModelConstants.DNET_PID_TYPES)).asJava)
    p.setCompletionStatus("complete")
    p.setDlicollectedfrom(List(europePmcProvenance()).asJava)
    p.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)

    // Null author lists and null entries are skipped; the author field is only set
    // when at least one author survives.
    val authors: List[Author] = Option(input.getAuthors)
      .map(_.asScala.toList)
      .getOrElse(Nil)
      .filter(a => a != null)
      .map { a =>
        val c = new Author
        c.setFullname(a.getFullName)
        c.setName(a.getForeName)
        c.setSurname(a.getLastName)
        c
      }
    if (authors.nonEmpty)
      p.setAuthor(authors.asJava)

    if (input.getJournal != null)
      p.setJournal(journalToOAF(input.getJournal))
    p.setTitle(List(OafUtils.createSP(input.getTitle, "main title", ModelConstants.DNET_DATACITE_TITLE)).asJava)
    p.setDateofacceptance(OafUtils.asField(input.getDate))

    val i = new Instance
    i.setCollectedfrom(generatePubmedDLICollectedFrom())
    i.setDateofacceptance(p.getDateofacceptance)
    i.setUrl(List(s"https://pubmed.ncbi.nlm.nih.gov/${input.getPmid}").asJava)
    i.setInstancetype(createQualifier("0001", "Article", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
    p.setInstance(List(i).asJava)
    p
  }

  /** Converts the EBI links of one publication into OAF records.
    *
    * @param input pair made of the PubMed id and the JSON document describing its links
    * @return for every link found: the relation, its inverse and the target dataset
    */
  def ebiLinksToOaf(input:(String, String)):List[Oaf] = {
    val pmid :String = input._1
    val input_json :String = input._2
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: json4s.JValue = parse(input_json)

    // A link is kept only when every field matched below is present in the JSON.
    val targets:List[EBILinks] = for {
      JObject(link) <- json \\ "Category" \\ "Link"
      JField("PublicationDate", JString(pubdate)) <- link
      JField("RelationshipType", JObject(relationshipType)) <- link
      JField("Name", JString(relname)) <- relationshipType
      JField("Target", JObject(target)) <- link
      JField("Identifier", JObject(identifier)) <- target
      JField("ID", JString(tpid)) <- identifier
      JField("IDScheme", JString(tpidtype)) <- identifier
      JField("IDURL", JString(turl)) <- identifier
      JField("Title", JString(title)) <- target
      JField("Publisher", JObject(pub)) <- target
      JField("Name", JString(publisher)) <- pub
    } yield EBILinks(relname, pubdate, tpid, tpidtype, turl,title, publisher)

    val dnetPublicationId = s"50|${DHPUtils.md5(s"$pmid::pmid")}"

    targets.flatMap(l => {
      val relation = new Relation
      val inverseRelation = new Relation
      val targetDnetId = s"50|${DHPUtils.md5(s"${l.tpid.toLowerCase.trim}::${l.tpidType.toLowerCase.trim}")}"
      // NOTE(review): presumably the mapper yields null for an unknown relation name,
      // which would fail on the next line — confirm against RelationMapper.
      val relInfo = relationMapper.get(l.relation.toLowerCase)
      val relationSemantic = relInfo.getOriginal
      val inverseRelationSemantic = relInfo.getInverse

      relation.setSource(dnetPublicationId)
      relation.setTarget(targetDnetId)
      relation.setRelClass("datacite")
      relation.setRelType(relationSemantic)
      relation.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)

      inverseRelation.setSource(targetDnetId)
      inverseRelation.setTarget(dnetPublicationId)
      inverseRelation.setRelClass("datacite")
      inverseRelation.setRelType(inverseRelationSemantic)
      inverseRelation.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)

      val d = new DLIDataset
      d.setId(targetDnetId)
      d.setDataInfo(OafUtils.generateDataInfo())
      d.setPid(List(OafUtils.createSP(l.tpid.toLowerCase.trim, l.tpidType.toLowerCase.trim, ModelConstants.DNET_PID_TYPES)).asJava)
      d.setCompletionStatus("complete")
      d.setDlicollectedfrom(List(europePmcProvenance()).asJava)
      d.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
      d.setPublisher(OafUtils.asField(l.publisher))
      d.setTitle(List(OafUtils.createSP(l.title, "main title", ModelConstants.DNET_DATACITE_TITLE)).asJava)
      d.setDateofacceptance(OafUtils.asField(l.pubdate))

      val i = new Instance
      i.setCollectedfrom(generatePubmedDLICollectedFrom())
      i.setDateofacceptance(d.getDateofacceptance)
      i.setUrl(List(l.turl).asJava)
      i.setInstancetype(createQualifier("0021", "Dataset", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
      d.setInstance(List(i).asJava)
      List(relation, inverseRelation, d)
    })
  }

  /** Job entry point.
    *
    * @param args command line arguments; `master` and `workingPath` are read from them
    */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEBILinksToOaf.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/ebi_to_df_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(SparkEBILinksToOaf.getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

    val workingPath = parser.get("workingPath")
    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
    implicit val oafpubEncoder: Encoder[Publication] = Encoders.kryo[Publication]
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
    implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo(classOf[DLIDataset])
    implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo(classOf[DLIPublication])
    implicit val atEncoder: Encoder[Author] = Encoders.kryo(classOf[Author])
    implicit val strEncoder:Encoder[String] = Encoders.STRING
    implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
    implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
    implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])

    // Step 1: convert the raw (pmid, json) link updates into OAF records.
    val ds:Dataset[(String,String)] = spark.read.load(s"$workingPath/baseline_links_updates").as[(String,String)](Encoders.tuple(Encoders.STRING, Encoders.STRING))

    ds.flatMap(l =>ebiLinksToOaf(l)).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_oaf")

    // Step 2: split the OAF records by type.
    val oDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/baseline_links_updates_oaf").as[Oaf]

    oDataset.filter(p =>p.isInstanceOf[Relation]).map(p => p.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_relation")
    oDataset.filter(p =>p.isInstanceOf[DLIDataset]).map(p => p.asInstanceOf[DLIDataset]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_dataset")

    // Step 3: build the publications for the PubMed ids that have links.
    val idPublicationSolved:Dataset[String] = spark.read.load(s"$workingPath/baseline_links_updates").where(col("links").isNotNull).select("pmid").as[String]
    val baseline:Dataset[(String, PMArticle)]= spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle].map(p=> (p.getPmid, p))(Encoders.tuple(strEncoder,PMEncoder))
    idPublicationSolved.joinWith(baseline, idPublicationSolved("pmid").equalTo(baseline("_1"))).map(k => pubmedTOPublication(k._2._2)).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_publication")

    // NOTE(review): the two absolute paths below are hard-coded to a user directory and
    // ignore workingPath — confirm whether they should become job parameters.
    val pmaDatasets = spark.read.load("/user/sandro.labruzzo/scholix/EBI/ebi_garr/baseline_dataset").as[PMArticle]

    pmaDatasets.map(p => pubmedTOPublication(p)).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_publication_all")

    val pubs: Dataset[(String,Publication)] = spark.read.load("/user/sandro.labruzzo/scholix/EBI/publication").as[Publication].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING,oafpubEncoder))
    val pubdate:Dataset[(String,DLIPublication)] = spark.read.load(s"$workingPath/baseline_publication_all").as[DLIPublication].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING,pubEncoder))

    pubs.joinWith(pubdate, pubs("_1").equalTo(pubdate("_1"))).map(k => k._2._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_publication_ebi")

    // Step 4: merge the new datasets with the existing ones, one record per id.
    // NOTE(review): the updates are read from "$workingPath/ebi_garr/..." while step 2
    // writes them directly under workingPath — confirm the intended location.
    val dt : Dataset[DLIDataset] = spark.read.load(s"$workingPath/dataset").as[DLIDataset]
    val update : Dataset[DLIDataset] = spark.read.load(s"$workingPath/ebi_garr/baseline_links_updates_dataset").as[DLIDataset]

    dt.union(update).map(d => (d.getId,d))(Encoders.tuple(Encoders.STRING, datEncoder))
      .groupByKey(_._1)(Encoders.STRING)
      .agg(EBIAggregator.getDLIDatasetAggregator().toColumn)
      .map(p => p._2)
      .write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset_ebi")

    // Step 5: merge the new relations with the existing ones, one per source/type/target.
    val rel: Dataset[Relation] = spark.read.load(s"$workingPath/relation").as[Relation]
    val relupdate : Dataset[Relation] = spark.read.load(s"$workingPath/ebi_garr/baseline_links_updates_relation").as[Relation]

    rel.union(relupdate)
      .map(d => (s"${d.getSource}::${d.getRelType}::${d.getTarget}", d))(Encoders.tuple(Encoders.STRING, relEncoder))
      .groupByKey(_._1)(Encoders.STRING)
      .agg(EBIAggregator.getRelationAggregator().toColumn)
      .map(p => p._2)
      .write.mode(SaveMode.Overwrite)
      .save(s"$workingPath/baseline_relation_ebi")
  }
}
|
|
@ -1,223 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.sx.graph.parser;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import javax.xml.stream.XMLStreamReader;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import eu.dnetlib.scholexplorer.relation.RelInfo;
|
||||
import eu.dnetlib.scholexplorer.relation.RelationMapper;
|
||||
|
||||
public abstract class AbstractScholexplorerParser {
|
||||
|
||||
protected static final Log log = LogFactory.getLog(AbstractScholexplorerParser.class);
|
||||
static final Pattern pattern = Pattern.compile("10\\.\\d{4,9}/[-._;()/:A-Z0-9]+$", Pattern.CASE_INSENSITIVE);
|
||||
private final List<String> datasetSubTypes = Arrays
|
||||
.asList(
|
||||
"dataset",
|
||||
"software",
|
||||
"film",
|
||||
"sound",
|
||||
"physicalobject",
|
||||
"audiovisual",
|
||||
"collection",
|
||||
"other",
|
||||
"study",
|
||||
"metadata");
|
||||
|
||||
public abstract List<Oaf> parseObject(final String record, final RelationMapper relMapper);
|
||||
|
||||
protected Map<String, String> getAttributes(final XMLStreamReader parser) {
|
||||
final Map<String, String> attributesMap = new HashMap<>();
|
||||
for (int i = 0; i < parser.getAttributeCount(); i++) {
|
||||
attributesMap.put(parser.getAttributeLocalName(i), parser.getAttributeValue(i));
|
||||
}
|
||||
return attributesMap;
|
||||
}
|
||||
|
||||
protected List<StructuredProperty> extractSubject(List<VtdUtilityParser.Node> subjects) {
|
||||
final List<StructuredProperty> subjectResult = new ArrayList<>();
|
||||
if (subjects != null && subjects.size() > 0) {
|
||||
subjects
|
||||
.forEach(
|
||||
subjectMap -> {
|
||||
final StructuredProperty subject = new StructuredProperty();
|
||||
subject.setValue(subjectMap.getTextValue());
|
||||
final Qualifier schema = new Qualifier();
|
||||
schema.setClassid("dnet:subject");
|
||||
schema.setClassname("dnet:subject");
|
||||
schema.setSchemeid(subjectMap.getAttributes().get("subjectScheme"));
|
||||
schema.setSchemename(subjectMap.getAttributes().get("subjectScheme"));
|
||||
subject.setQualifier(schema);
|
||||
subjectResult.add(subject);
|
||||
});
|
||||
}
|
||||
return subjectResult;
|
||||
}
|
||||
|
||||
protected StructuredProperty extractIdentifier(
|
||||
List<VtdUtilityParser.Node> identifierType, final String fieldName) {
|
||||
final StructuredProperty pid = new StructuredProperty();
|
||||
if (identifierType != null && identifierType.size() > 0) {
|
||||
final VtdUtilityParser.Node result = identifierType.get(0);
|
||||
pid.setValue(result.getTextValue());
|
||||
final Qualifier pidType = new Qualifier();
|
||||
pidType.setClassname(result.getAttributes().get(fieldName));
|
||||
pidType.setClassid(result.getAttributes().get(fieldName));
|
||||
pidType.setSchemename(ModelConstants.DNET_PID_TYPES);
|
||||
pidType.setSchemeid(ModelConstants.DNET_PID_TYPES);
|
||||
pid.setQualifier(pidType);
|
||||
return pid;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
protected void inferPid(final StructuredProperty input) {
|
||||
final Matcher matcher = pattern.matcher(input.getValue());
|
||||
if (matcher.find()) {
|
||||
input.setValue(matcher.group());
|
||||
if (input.getQualifier() == null) {
|
||||
input.setQualifier(new Qualifier());
|
||||
input.getQualifier().setSchemename(ModelConstants.DNET_PID_TYPES);
|
||||
input.getQualifier().setSchemeid(ModelConstants.DNET_PID_TYPES);
|
||||
}
|
||||
input.getQualifier().setClassid("doi");
|
||||
input.getQualifier().setClassname("doi");
|
||||
}
|
||||
}
|
||||
|
||||
protected String generateId(final String pid, final String pidType, final String entityType) {
|
||||
String type;
|
||||
switch (entityType) {
|
||||
case "publication":
|
||||
type = "50|";
|
||||
break;
|
||||
case "dataset":
|
||||
type = "60|";
|
||||
break;
|
||||
case "unknown":
|
||||
type = "70|";
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("unexpected value " + entityType);
|
||||
}
|
||||
if ("dnet".equalsIgnoreCase(pidType))
|
||||
return type + StringUtils.substringAfter(pid, "::");
|
||||
|
||||
return type
|
||||
+ DHPUtils
|
||||
.md5(
|
||||
String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim()));
|
||||
}
|
||||
|
||||
protected DLIUnknown createUnknownObject(
|
||||
final String pid,
|
||||
final String pidType,
|
||||
final KeyValue cf,
|
||||
final DataInfo di,
|
||||
final String dateOfCollection) {
|
||||
final DLIUnknown uk = new DLIUnknown();
|
||||
uk.setId(generateId(pid, pidType, "unknown"));
|
||||
ProvenaceInfo pi = new ProvenaceInfo();
|
||||
pi.setId(cf.getKey());
|
||||
pi.setName(cf.getValue());
|
||||
pi.setCompletionStatus("incomplete");
|
||||
uk.setDataInfo(di);
|
||||
uk.setDlicollectedfrom(Collections.singletonList(pi));
|
||||
final StructuredProperty sourcePid = new StructuredProperty();
|
||||
sourcePid.setValue(pid);
|
||||
final Qualifier pt = new Qualifier();
|
||||
pt.setClassname(pidType);
|
||||
pt.setClassid(pidType);
|
||||
pt.setSchemename(ModelConstants.DNET_PID_TYPES);
|
||||
pt.setSchemeid(ModelConstants.DNET_PID_TYPES);
|
||||
sourcePid.setQualifier(pt);
|
||||
uk.setPid(Collections.singletonList(sourcePid));
|
||||
uk.setDateofcollection(dateOfCollection);
|
||||
return uk;
|
||||
}
|
||||
|
||||
protected Qualifier generateQualifier(final String classId, final String className, final String schemeId,
|
||||
final String schemeName) {
|
||||
final Qualifier q = new Qualifier();
|
||||
q.setClassid(classId);
|
||||
q.setClassid(className);
|
||||
q.setSchemeid(schemeId);
|
||||
q.setSchemename(schemeName);
|
||||
return q;
|
||||
|
||||
}
|
||||
|
||||
protected void generateRelations(
|
||||
RelationMapper relationMapper,
|
||||
Result parsedObject,
|
||||
List<Oaf> result,
|
||||
DataInfo di,
|
||||
String dateOfCollection,
|
||||
List<VtdUtilityParser.Node> relatedIdentifiers) {
|
||||
if (relatedIdentifiers != null) {
|
||||
result
|
||||
.addAll(
|
||||
relatedIdentifiers
|
||||
.stream()
|
||||
.flatMap(
|
||||
n -> {
|
||||
final List<Relation> rels = new ArrayList<>();
|
||||
Relation r = new Relation();
|
||||
r.setSource(parsedObject.getId());
|
||||
final String relatedPid = n.getTextValue();
|
||||
final String relatedPidType = n.getAttributes().get("relatedIdentifierType");
|
||||
final String relatedType = n.getAttributes().getOrDefault("entityType", "unknown");
|
||||
String relationSemantic = n.getAttributes().get("relationType");
|
||||
String inverseRelation;
|
||||
final String targetId = generateId(relatedPid, relatedPidType, relatedType);
|
||||
if (relationMapper.containsKey(relationSemantic.toLowerCase())) {
|
||||
RelInfo relInfo = relationMapper.get(relationSemantic.toLowerCase());
|
||||
relationSemantic = relInfo.getOriginal();
|
||||
inverseRelation = relInfo.getInverse();
|
||||
} else {
|
||||
relationSemantic = "Unknown";
|
||||
inverseRelation = "Unknown";
|
||||
}
|
||||
r.setTarget(targetId);
|
||||
r.setRelType(relationSemantic);
|
||||
r.setRelClass("datacite");
|
||||
r.setCollectedfrom(parsedObject.getCollectedfrom());
|
||||
r.setDataInfo(di);
|
||||
rels.add(r);
|
||||
r = new Relation();
|
||||
r.setDataInfo(di);
|
||||
r.setSource(targetId);
|
||||
r.setTarget(parsedObject.getId());
|
||||
r.setRelType(inverseRelation);
|
||||
r.setRelClass("datacite");
|
||||
r.setCollectedfrom(parsedObject.getCollectedfrom());
|
||||
rels.add(r);
|
||||
if ("unknown".equalsIgnoreCase(relatedType))
|
||||
result
|
||||
.add(
|
||||
createUnknownObject(
|
||||
relatedPid,
|
||||
relatedPidType,
|
||||
parsedObject.getCollectedfrom().get(0),
|
||||
di,
|
||||
dateOfCollection));
|
||||
return rels.stream();
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,340 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.sx.graph.parser;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.ximpleware.AutoPilot;
|
||||
import com.ximpleware.VTDGen;
|
||||
import com.ximpleware.VTDNav;
|
||||
|
||||
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
|
||||
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo;
|
||||
import eu.dnetlib.scholexplorer.relation.RelationMapper;
|
||||
|
||||
public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
|
||||
@Override
|
||||
public List<Oaf> parseObject(String record, final RelationMapper relationMapper) {
|
||||
try {
|
||||
final DLIDataset parsedObject = new DLIDataset();
|
||||
final VTDGen vg = new VTDGen();
|
||||
vg.setDoc(record.getBytes());
|
||||
final List<Oaf> result = new ArrayList<>();
|
||||
vg.parse(true);
|
||||
|
||||
final VTDNav vn = vg.getNav();
|
||||
final AutoPilot ap = new AutoPilot(vn);
|
||||
|
||||
DataInfo di = new DataInfo();
|
||||
di.setTrust("0.9");
|
||||
di.setDeletedbyinference(false);
|
||||
di.setInvisible(false);
|
||||
parsedObject.setDataInfo(di);
|
||||
|
||||
parsedObject
|
||||
.setOriginalId(
|
||||
Collections
|
||||
.singletonList(
|
||||
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']")));
|
||||
|
||||
parsedObject
|
||||
.setOriginalObjIdentifier(
|
||||
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']"));
|
||||
String dateOfCollection = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']");
|
||||
parsedObject.setDateofcollection(dateOfCollection);
|
||||
|
||||
final String resolvedDate = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']");
|
||||
|
||||
if (StringUtils.isNotBlank(resolvedDate)) {
|
||||
StructuredProperty currentDate = new StructuredProperty();
|
||||
currentDate.setValue(resolvedDate);
|
||||
final Qualifier dateQualifier = new Qualifier();
|
||||
dateQualifier.setClassname("resolvedDate");
|
||||
dateQualifier.setClassid("resolvedDate");
|
||||
dateQualifier.setSchemename("dnet::date");
|
||||
dateQualifier.setSchemeid("dnet::date");
|
||||
currentDate.setQualifier(dateQualifier);
|
||||
parsedObject.setRelevantdate(Collections.singletonList(currentDate));
|
||||
}
|
||||
final String completionStatus = VtdUtilityParser
|
||||
.getSingleValue(ap, vn, "//*[local-name()='completionStatus']");
|
||||
final String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']");
|
||||
|
||||
final String publisher = VtdUtilityParser
|
||||
.getSingleValue(
|
||||
ap, vn, "//*[local-name()='resource']/*[local-name()='publisher']");
|
||||
|
||||
List<VtdUtilityParser.Node> collectedFromNodes = VtdUtilityParser
|
||||
.getTextValuesWithAttributes(
|
||||
ap,
|
||||
vn,
|
||||
"//*[local-name()='collectedFrom']",
|
||||
Arrays.asList("name", "id", "mode", "completionStatus"));
|
||||
|
||||
List<VtdUtilityParser.Node> resolvededFromNodes = VtdUtilityParser
|
||||
.getTextValuesWithAttributes(
|
||||
ap,
|
||||
vn,
|
||||
"//*[local-name()='resolvedFrom']",
|
||||
Arrays.asList("name", "id", "mode", "completionStatus"));
|
||||
|
||||
Field<String> pf = new Field<>();
|
||||
pf.setValue(publisher);
|
||||
|
||||
parsedObject.setPublisher(pf);
|
||||
final List<ProvenaceInfo> provenances = new ArrayList<>();
|
||||
if (collectedFromNodes != null && collectedFromNodes.size() > 0) {
|
||||
collectedFromNodes
|
||||
.forEach(
|
||||
it -> {
|
||||
final ProvenaceInfo provenance = new ProvenaceInfo();
|
||||
provenance.setId(it.getAttributes().get("id"));
|
||||
provenance.setName(it.getAttributes().get("name"));
|
||||
provenance.setCollectionMode(provisionMode);
|
||||
provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
|
||||
provenances.add(provenance);
|
||||
});
|
||||
}
|
||||
|
||||
if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) {
|
||||
resolvededFromNodes
|
||||
.forEach(
|
||||
it -> {
|
||||
final ProvenaceInfo provenance = new ProvenaceInfo();
|
||||
provenance.setId(it.getAttributes().get("id"));
|
||||
provenance.setName(it.getAttributes().get("name"));
|
||||
provenance.setCollectionMode("resolved");
|
||||
provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
|
||||
provenances.add(provenance);
|
||||
});
|
||||
}
|
||||
|
||||
parsedObject.setDlicollectedfrom(provenances);
|
||||
parsedObject
|
||||
.setCollectedfrom(
|
||||
parsedObject
|
||||
.getDlicollectedfrom()
|
||||
.stream()
|
||||
.map(
|
||||
p -> {
|
||||
final KeyValue cf = new KeyValue();
|
||||
cf.setKey(p.getId());
|
||||
cf.setValue(p.getName());
|
||||
return cf;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
parsedObject
|
||||
.setCompletionStatus(
|
||||
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']"));
|
||||
|
||||
final List<Node> identifierType = VtdUtilityParser
|
||||
.getTextValuesWithAttributes(
|
||||
ap,
|
||||
vn,
|
||||
"//*[local-name()='resource']/*[local-name()='identifier']",
|
||||
Collections.singletonList("identifierType"));
|
||||
|
||||
StructuredProperty currentPid = extractIdentifier(identifierType, "identifierType");
|
||||
if (currentPid == null)
|
||||
return null;
|
||||
inferPid(currentPid);
|
||||
parsedObject.setPid(Collections.singletonList(currentPid));
|
||||
|
||||
String resolvedURL = null;
|
||||
|
||||
switch (currentPid.getQualifier().getClassname().toLowerCase()) {
|
||||
case "uniprot":
|
||||
resolvedURL = "https://www.uniprot.org/uniprot/" + currentPid.getValue();
|
||||
break;
|
||||
case "ena":
|
||||
if (StringUtils.isNotBlank(currentPid.getValue()) && currentPid.getValue().length() > 7)
|
||||
resolvedURL = "https://www.ebi.ac.uk/ena/data/view/" + currentPid.getValue().substring(0, 8);
|
||||
break;
|
||||
case "chembl":
|
||||
resolvedURL = "https://www.ebi.ac.uk/chembl/compound_report_card/" + currentPid.getValue();
|
||||
break;
|
||||
|
||||
case "ncbi-n":
|
||||
resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
|
||||
break;
|
||||
case "ncbi-p":
|
||||
resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
|
||||
break;
|
||||
case "genbank":
|
||||
resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
|
||||
break;
|
||||
case "pdb":
|
||||
resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
|
||||
break;
|
||||
case "url":
|
||||
resolvedURL = currentPid.getValue();
|
||||
break;
|
||||
}
|
||||
|
||||
final String sourceId = generateId(
|
||||
currentPid.getValue(), currentPid.getQualifier().getClassid(), "dataset");
|
||||
parsedObject.setId(sourceId);
|
||||
|
||||
List<String> descs = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='description']");
|
||||
if (descs != null && descs.size() > 0)
|
||||
parsedObject
|
||||
.setDescription(
|
||||
descs
|
||||
.stream()
|
||||
// .map(it -> it.length() < 10000 ? it : it.substring(0, 10000))
|
||||
.map(
|
||||
it -> {
|
||||
final Field<String> d = new Field<>();
|
||||
d.setValue(it);
|
||||
return d;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
final List<Node> relatedIdentifiers = VtdUtilityParser
|
||||
.getTextValuesWithAttributes(
|
||||
ap,
|
||||
vn,
|
||||
"//*[local-name()='relatedIdentifier']",
|
||||
Arrays
|
||||
.asList(
|
||||
"relatedIdentifierType", "relationType", "entityType", "inverseRelationType"));
|
||||
|
||||
generateRelations(
|
||||
relationMapper, parsedObject, result, di, dateOfCollection, relatedIdentifiers);
|
||||
|
||||
final List<Node> hostedBy = VtdUtilityParser
|
||||
.getTextValuesWithAttributes(
|
||||
ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name"));
|
||||
|
||||
if (hostedBy != null) {
|
||||
parsedObject
|
||||
.setInstance(
|
||||
hostedBy
|
||||
.stream()
|
||||
.map(
|
||||
it -> {
|
||||
final Instance i = new Instance();
|
||||
i.setUrl(Collections.singletonList(currentPid.getValue()));
|
||||
KeyValue h = new KeyValue();
|
||||
i.setHostedby(h);
|
||||
h.setKey(it.getAttributes().get("id"));
|
||||
h.setValue(it.getAttributes().get("name"));
|
||||
return i;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
List<StructuredProperty> subjects = extractSubject(
|
||||
VtdUtilityParser
|
||||
.getTextValuesWithAttributes(
|
||||
ap,
|
||||
vn,
|
||||
"//*[local-name()='resource']//*[local-name()='subject']",
|
||||
Collections.singletonList("subjectScheme")));
|
||||
|
||||
parsedObject.setSubject(subjects);
|
||||
|
||||
Qualifier q = new Qualifier();
|
||||
q.setClassname("dataset");
|
||||
q.setClassid("dataset");
|
||||
q.setSchemename("dataset");
|
||||
q.setSchemeid("dataset");
|
||||
parsedObject.setResulttype(q);
|
||||
|
||||
parsedObject.setCompletionStatus(completionStatus);
|
||||
|
||||
final List<String> creators = VtdUtilityParser
|
||||
.getTextValue(
|
||||
ap,
|
||||
vn,
|
||||
"//*[local-name()='resource']//*[local-name()='creator']/*[local-name()='creatorName']");
|
||||
if (creators != null && creators.size() > 0) {
|
||||
parsedObject
|
||||
.setAuthor(
|
||||
creators
|
||||
.stream()
|
||||
.map(
|
||||
a -> {
|
||||
final Author author = new Author();
|
||||
author.setFullname(a);
|
||||
return author;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
final List<String> titles = VtdUtilityParser
|
||||
.getTextValue(
|
||||
ap, vn, "//*[local-name()='resource']//*[local-name()='title']");
|
||||
if (titles != null && titles.size() > 0) {
|
||||
parsedObject
|
||||
.setTitle(
|
||||
titles
|
||||
.stream()
|
||||
.map(
|
||||
t -> {
|
||||
final StructuredProperty st = new StructuredProperty();
|
||||
st.setValue(t);
|
||||
st.setQualifier(ModelConstants.MAIN_TITLE_QUALIFIER);
|
||||
return st;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
final List<String> dates = VtdUtilityParser
|
||||
.getTextValue(
|
||||
ap,
|
||||
vn,
|
||||
"//*[local-name()='resource']/*[local-name()='dates']/*[local-name()='date']");
|
||||
|
||||
if (dates != null && dates.size() > 0) {
|
||||
parsedObject
|
||||
.setRelevantdate(
|
||||
dates
|
||||
.stream()
|
||||
.map(
|
||||
cd -> {
|
||||
StructuredProperty date = new StructuredProperty();
|
||||
date.setValue(cd);
|
||||
final Qualifier dq = new Qualifier();
|
||||
dq.setClassname("date");
|
||||
dq.setClassid("date");
|
||||
dq.setSchemename("dnet::date");
|
||||
dq.setSchemeid("dnet::date");
|
||||
date.setQualifier(dq);
|
||||
return date;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
// TERRIBLE HACK TO AVOID EMPTY COLLECTED FROM
|
||||
if (parsedObject.getDlicollectedfrom() == null) {
|
||||
|
||||
final KeyValue cf = new KeyValue();
|
||||
cf.setKey("dli_________::europe_pmc__");
|
||||
cf.setValue("Europe PMC");
|
||||
parsedObject.setCollectedfrom(Collections.singletonList(cf));
|
||||
}
|
||||
|
||||
if (StringUtils.isNotBlank(resolvedURL)) {
|
||||
Instance i = new Instance();
|
||||
i.setCollectedfrom(parsedObject.getCollectedfrom().get(0));
|
||||
i.setUrl(Collections.singletonList(resolvedURL));
|
||||
parsedObject.setInstance(Collections.singletonList(i));
|
||||
}
|
||||
|
||||
result.add(parsedObject);
|
||||
return result;
|
||||
} catch (Throwable e) {
|
||||
log.error("Error on parsing record " + record, e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,264 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.sx.graph.parser;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.ximpleware.AutoPilot;
|
||||
import com.ximpleware.VTDGen;
|
||||
import com.ximpleware.VTDNav;
|
||||
|
||||
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
|
||||
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo;
|
||||
import eu.dnetlib.scholexplorer.relation.RelationMapper;
|
||||
|
||||
public class PublicationScholexplorerParser extends AbstractScholexplorerParser {
|
||||
|
||||
@Override
|
||||
public List<Oaf> parseObject(final String record, final RelationMapper relationMapper) {
|
||||
try {
|
||||
final List<Oaf> result = new ArrayList<>();
|
||||
final DLIPublication parsedObject = new DLIPublication();
|
||||
final VTDGen vg = new VTDGen();
|
||||
vg.setDoc(record.getBytes());
|
||||
vg.parse(true);
|
||||
|
||||
final VTDNav vn = vg.getNav();
|
||||
final AutoPilot ap = new AutoPilot(vn);
|
||||
|
||||
final DataInfo di = new DataInfo();
|
||||
di.setTrust("0.9");
|
||||
di.setDeletedbyinference(false);
|
||||
di.setInvisible(false);
|
||||
|
||||
String dateOfCollection = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']");
|
||||
parsedObject.setDateofcollection(dateOfCollection);
|
||||
|
||||
final String resolvedDate = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']");
|
||||
parsedObject
|
||||
.setOriginalId(
|
||||
Collections
|
||||
.singletonList(
|
||||
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']")));
|
||||
|
||||
if (StringUtils.isNotBlank(resolvedDate)) {
|
||||
StructuredProperty currentDate = new StructuredProperty();
|
||||
currentDate.setValue(resolvedDate);
|
||||
final Qualifier dateQualifier = new Qualifier();
|
||||
dateQualifier.setClassname("resolvedDate");
|
||||
dateQualifier.setClassid("resolvedDate");
|
||||
dateQualifier.setSchemename("dnet::date");
|
||||
dateQualifier.setSchemeid("dnet::date");
|
||||
currentDate.setQualifier(dateQualifier);
|
||||
parsedObject.setRelevantdate(Collections.singletonList(currentDate));
|
||||
}
|
||||
|
||||
final List<Node> pid = VtdUtilityParser
|
||||
.getTextValuesWithAttributes(
|
||||
ap, vn, "//*[local-name()='pid']", Arrays.asList("type"));
|
||||
|
||||
StructuredProperty currentPid = extractIdentifier(pid, "type");
|
||||
if (currentPid == null)
|
||||
return null;
|
||||
inferPid(currentPid);
|
||||
parsedObject.setPid(Collections.singletonList(currentPid));
|
||||
final String sourceId = generateId(
|
||||
currentPid.getValue(), currentPid.getQualifier().getClassid(), "publication");
|
||||
parsedObject.setId(sourceId);
|
||||
|
||||
parsedObject
|
||||
.setOriginalObjIdentifier(
|
||||
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']"));
|
||||
|
||||
String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']");
|
||||
|
||||
List<Node> collectedFromNodes = VtdUtilityParser
|
||||
.getTextValuesWithAttributes(
|
||||
ap,
|
||||
vn,
|
||||
"//*[local-name()='collectedFrom']",
|
||||
Arrays.asList("name", "id", "mode", "completionStatus"));
|
||||
|
||||
List<Node> resolvededFromNodes = VtdUtilityParser
|
||||
.getTextValuesWithAttributes(
|
||||
ap,
|
||||
vn,
|
||||
"//*[local-name()='resolvedFrom']",
|
||||
Arrays.asList("name", "id", "mode", "completionStatus"));
|
||||
|
||||
final String publisher = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='publisher']");
|
||||
Field<String> pf = new Field<>();
|
||||
pf.setValue(publisher);
|
||||
|
||||
parsedObject.setPublisher(pf);
|
||||
final List<ProvenaceInfo> provenances = new ArrayList<>();
|
||||
if (collectedFromNodes != null && collectedFromNodes.size() > 0) {
|
||||
collectedFromNodes
|
||||
.forEach(
|
||||
it -> {
|
||||
final ProvenaceInfo provenance = new ProvenaceInfo();
|
||||
provenance.setId(it.getAttributes().get("id"));
|
||||
provenance.setName(it.getAttributes().get("name"));
|
||||
provenance.setCollectionMode(provisionMode);
|
||||
provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
|
||||
provenances.add(provenance);
|
||||
});
|
||||
}
|
||||
|
||||
if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) {
|
||||
resolvededFromNodes
|
||||
.forEach(
|
||||
it -> {
|
||||
final ProvenaceInfo provenance = new ProvenaceInfo();
|
||||
provenance.setId(it.getAttributes().get("id"));
|
||||
provenance.setName(it.getAttributes().get("name"));
|
||||
provenance.setCollectionMode("resolved");
|
||||
provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
|
||||
provenances.add(provenance);
|
||||
});
|
||||
}
|
||||
|
||||
parsedObject.setDlicollectedfrom(provenances);
|
||||
parsedObject
|
||||
.setCompletionStatus(
|
||||
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']"));
|
||||
|
||||
parsedObject
|
||||
.setCollectedfrom(
|
||||
parsedObject
|
||||
.getDlicollectedfrom()
|
||||
.stream()
|
||||
.map(
|
||||
p -> {
|
||||
final KeyValue cf = new KeyValue();
|
||||
cf.setKey(p.getId());
|
||||
cf.setValue(p.getName());
|
||||
return cf;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
final List<Node> relatedIdentifiers = VtdUtilityParser
|
||||
.getTextValuesWithAttributes(
|
||||
ap,
|
||||
vn,
|
||||
"//*[local-name()='relatedIdentifier']",
|
||||
Arrays
|
||||
.asList(
|
||||
"relatedIdentifierType", "relationType", "entityType", "inverseRelationType"));
|
||||
generateRelations(
|
||||
relationMapper, parsedObject, result, di, dateOfCollection, relatedIdentifiers);
|
||||
|
||||
final List<Node> hostedBy = VtdUtilityParser
|
||||
.getTextValuesWithAttributes(
|
||||
ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name"));
|
||||
|
||||
if (hostedBy != null) {
|
||||
parsedObject
|
||||
.setInstance(
|
||||
hostedBy
|
||||
.stream()
|
||||
.map(
|
||||
it -> {
|
||||
final Instance i = new Instance();
|
||||
i.setUrl(Collections.singletonList(currentPid.getValue()));
|
||||
KeyValue h = new KeyValue();
|
||||
i.setHostedby(h);
|
||||
h.setKey(it.getAttributes().get("id"));
|
||||
h.setValue(it.getAttributes().get("name"));
|
||||
return i;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
final List<String> authorsNode = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='creator']");
|
||||
if (authorsNode != null)
|
||||
parsedObject
|
||||
.setAuthor(
|
||||
authorsNode
|
||||
.stream()
|
||||
.map(
|
||||
a -> {
|
||||
final Author author = new Author();
|
||||
author.setFullname(a);
|
||||
return author;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
final List<String> titles = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='title']");
|
||||
if (titles != null) {
|
||||
parsedObject
|
||||
.setTitle(
|
||||
titles
|
||||
.stream()
|
||||
.map(
|
||||
t -> {
|
||||
final StructuredProperty st = new StructuredProperty();
|
||||
st.setValue(t);
|
||||
st
|
||||
.setQualifier(
|
||||
generateQualifier(
|
||||
"main title", "main title", "dnet:dataCite_title",
|
||||
"dnet:dataCite_title"));
|
||||
return st;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
Field<String> description = new Field<>();
|
||||
|
||||
description
|
||||
.setValue(
|
||||
VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='description']"));
|
||||
|
||||
// if (StringUtils.isNotBlank(description.getValue())
|
||||
// && description.getValue().length() > 10000) {
|
||||
// description.setValue(description.getValue().substring(0, 10000));
|
||||
// }
|
||||
|
||||
parsedObject.setDescription(Collections.singletonList(description));
|
||||
|
||||
final String cd = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='date']");
|
||||
|
||||
StructuredProperty date = new StructuredProperty();
|
||||
date.setValue(cd);
|
||||
final Qualifier dq = new Qualifier();
|
||||
dq.setClassname("date");
|
||||
dq.setClassid("date");
|
||||
dq.setSchemename("dnet::date");
|
||||
dq.setSchemeid("dnet::date");
|
||||
date.setQualifier(dq);
|
||||
parsedObject.setRelevantdate(Collections.singletonList(date));
|
||||
|
||||
List<StructuredProperty> subjects = extractSubject(
|
||||
VtdUtilityParser
|
||||
.getTextValuesWithAttributes(
|
||||
ap, vn, "//*[local-name()='subject']", Collections.singletonList("scheme")));
|
||||
parsedObject.setSubject(subjects);
|
||||
|
||||
parsedObject.setDataInfo(di);
|
||||
|
||||
parsedObject.setSubject(subjects);
|
||||
Qualifier q = new Qualifier();
|
||||
q.setClassname("publication");
|
||||
q.setClassid("publication");
|
||||
q.setSchemename("publication");
|
||||
q.setSchemeid("publication");
|
||||
parsedObject.setResulttype(q);
|
||||
result.add(parsedObject);
|
||||
return result;
|
||||
|
||||
} catch (Throwable e) {
|
||||
log.error("Input record: " + record);
|
||||
log.error("Error on parsing record ", e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
package eu.dnetlib.dhp.sx.graph.scholix
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.{Dataset, Result}
|
||||
import eu.dnetlib.dhp.schema.sx.summary.{SchemeValue, ScholixSummary, TypedIdentifier, Typology}
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
object ScholixUtils {

  /** Builds the Scholix summary of a result.
    *
    * Copies id, pids, titles, author names, instance acceptance dates, the
    * first non-null description, subjects and publisher when present. The
    * typology is `dataset` for [[Dataset]] instances and `publication`
    * otherwise; the sub type is the class name of the first instance's type.
    * The related-object counters are initialised to 0.
    *
    * Missing (null or empty) pids, instances or instance type leave the
    * corresponding summary field unset instead of failing.
    *
    * @param r the result to summarise
    * @return the populated summary
    */
  def resultToSummary(r: Result): ScholixSummary = {
    val summary = new ScholixSummary
    summary.setId(r.getId)

    Option(r.getPid).foreach { pids =>
      summary.setLocalIdentifier(
        pids.asScala.map(p => new TypedIdentifier(p.getValue, p.getQualifier.getClassid)).asJava)
    }

    r match {
      case _: Dataset => summary.setTypology(Typology.dataset)
      case _          => summary.setTypology(Typology.publication)
    }

    // instances may be absent: read them once, null-safely, for both sub type and dates
    val instances = Option(r.getInstance()).map(_.asScala.toList).getOrElse(Nil)

    instances.headOption
      .flatMap(i => Option(i.getInstancetype))
      .foreach(t => summary.setSubType(t.getClassname))

    Option(r.getTitle).filter(!_.isEmpty).foreach { titles =>
      summary.setTitle(titles.asScala.map(t => t.getValue).asJava)
    }

    Option(r.getAuthor).filter(!_.isEmpty).foreach { authors =>
      summary.setAuthor(authors.asScala.map(a => a.getFullname).asJava)
    }

    val dates: List[String] = instances.flatMap(i => Option(i.getDateofacceptance)).map(_.getValue)
    if (dates.nonEmpty)
      summary.setDate(dates.asJava)

    Option(r.getDescription)
      .flatMap(_.asScala.find(f => f.getValue != null))
      .foreach(d => summary.setDescription(d.getValue))

    Option(r.getSubject).filter(!_.isEmpty).foreach { subjects =>
      summary.setSubject(
        subjects.asScala.map(sbj => new SchemeValue(sbj.getQualifier.getClassname, sbj.getValue)).asJava)
    }

    Option(r.getPublisher).foreach(p => summary.setPublisher(List(p.getValue).asJava))

    summary.setRelatedDatasets(0)
    summary.setRelatedPublications(0)
    summary.setRelatedUnknown(0)

    summary
  }

}
|
|
@ -0,0 +1,5 @@
|
|||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the raw graph", "paramRequired": true}
|
||||
]
|
|
@ -10,7 +10,7 @@
|
|||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResolveRelations"/>
|
||||
<start to="CreateSummaries"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
|
@ -64,9 +64,38 @@
|
|||
<arg>--workingPath</arg><arg>${targetPath}/resolved/</arg>
|
||||
<arg>--entityPath</arg><arg>${targetPath}/dedup</arg>
|
||||
</spark>
|
||||
<ok to="CreateSummaries"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="CreateSummaries">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert Entities to summaries</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.SparkCreateSummaryObject</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=5000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--sourcePath</arg><arg>${targetPath}/dedup</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}/provision/summaries</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -1,63 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.sx.graph;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.fasterxml.jackson.databind.SerializationFeature;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.sx.graph.parser.DatasetScholexplorerParser;
|
||||
import eu.dnetlib.dhp.sx.graph.parser.PublicationScholexplorerParser;
|
||||
import eu.dnetlib.scholexplorer.relation.RelationMapper;
|
||||
|
||||
public class ScholexplorerParserTest {
|
||||
|
||||
@Test
|
||||
public void testDataciteParser() throws Exception {
|
||||
String xml = IOUtils.toString(this.getClass().getResourceAsStream("dmf.xml"));
|
||||
|
||||
DatasetScholexplorerParser p = new DatasetScholexplorerParser();
|
||||
List<Oaf> oaves = p.parseObject(xml, RelationMapper.load());
|
||||
|
||||
ObjectMapper m = new ObjectMapper();
|
||||
m.enable(SerializationFeature.INDENT_OUTPUT);
|
||||
|
||||
oaves
|
||||
.forEach(
|
||||
oaf -> {
|
||||
try {
|
||||
System.out.println(m.writeValueAsString(oaf));
|
||||
System.out.println("----------------------------");
|
||||
} catch (JsonProcessingException e) {
|
||||
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPublicationParser() throws Exception {
|
||||
String xml = IOUtils.toString(this.getClass().getResourceAsStream("pmf.xml"));
|
||||
|
||||
PublicationScholexplorerParser p = new PublicationScholexplorerParser();
|
||||
List<Oaf> oaves = p.parseObject(xml, RelationMapper.load());
|
||||
|
||||
ObjectMapper m = new ObjectMapper();
|
||||
m.enable(SerializationFeature.INDENT_OUTPUT);
|
||||
|
||||
oaves
|
||||
.forEach(
|
||||
oaf -> {
|
||||
try {
|
||||
System.out.println(m.writeValueAsString(oaf));
|
||||
System.out.println("----------------------------");
|
||||
} catch (JsonProcessingException e) {
|
||||
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
|
@ -1,54 +0,0 @@
|
|||
package eu.dnetlib.dhp.sx.graph
|
||||
|
||||
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication
|
||||
import eu.dnetlib.dhp.sx.graph.ebi.EBIAggregator
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
|
||||
import org.junit.jupiter.api.Assertions._
|
||||
import org.junit.jupiter.api.Test
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
/** Checks that aggregating publications by id keeps their authors. */
class SparkScholexplorerAggregationTest {

  @Test
  def testFunderRelationshipsMapping(): Unit = {
    val m: ObjectMapper = new ObjectMapper()
    m.enable(SerializationFeature.INDENT_OUTPUT)

    // One JSON document per line. getLines() is used instead of String.lines,
    // which clashes with java.lang.String.lines on JDK 11+.
    val source = Source.fromInputStream(getClass.getResourceAsStream("publication.json"))
    val s: List[DLIPublication] =
      try source.getLines().map(line => m.readValue(line, classOf[DLIPublication])).toList
      finally source.close()

    implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication]
    val spark: SparkSession = SparkSession.builder().appName("Test").master("local[*]").config("spark.driver.bindAddress", "127.0.0.1").getOrCreate()

    val ds: Dataset[DLIPublication] = spark.createDataset(spark.sparkContext.parallelize(s))

    val unique = ds.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, pubEncoder))
      .groupByKey(_._1)(Encoders.STRING)
      .agg(EBIAggregator.getDLIPublicationAggregator().toColumn)
      .map(p => p._2)

    val uniquePubs: DLIPublication = unique.first()

    s.foreach(pp => assertFalse(pp.getAuthor.isEmpty))

    assertNotNull(uniquePubs.getAuthor)
    assertFalse(uniquePubs.getAuthor.isEmpty)
  }

}
|
|
@ -1,6 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.sx.graph;
|
||||
|
||||
/** Empty placeholder: this class declares no fields, methods, or test cases. */
public class SparkScholexplorerGraphImporterTest {
|
||||
|
||||
}
|
|
@ -1,5 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.sx.graph;
|
||||
|
||||
/** Empty placeholder: this class declares no fields, methods, or test cases. */
public class SparkScholexplorerMergeEntitiesJobTest {
|
||||
}
|
|
@ -8,10 +8,9 @@ import eu.dnetlib.dhp.common.PacePerson
|
|||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.{Author, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty}
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils._
|
||||
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
|
|
|
@ -15,7 +15,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class SparkIndexCollectionOnES {
|
||||
public class
|
||||
SparkIndexCollectionOnES {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
|
|
|
@ -24,8 +24,8 @@
|
|||
<module>dhp-dedup-openaire</module>
|
||||
<module>dhp-enrichment</module>
|
||||
<module>dhp-graph-provision</module>
|
||||
<module>dhp-dedup-scholexplorer</module>
|
||||
<module>dhp-graph-provision-scholexplorer</module>
|
||||
<!-- <module>dhp-dedup-scholexplorer</module>-->
|
||||
<!-- <module>dhp-graph-provision-scholexplorer</module>-->
|
||||
<module>dhp-blacklist</module>
|
||||
<module>dhp-stats-update</module>
|
||||
<module>dhp-stats-promote</module>
|
||||
|
|
2
pom.xml
2
pom.xml
|
@ -736,7 +736,7 @@
|
|||
<mockito-core.version>3.3.3</mockito-core.version>
|
||||
<mongodb.driver.version>3.4.2</mongodb.driver.version>
|
||||
<vtd.version>[2.12,3.0)</vtd.version>
|
||||
<dhp-schemas.version>[2.6.13]</dhp-schemas.version>
|
||||
<dhp-schemas.version>[2.6.14]</dhp-schemas.version>
|
||||
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
|
||||
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
|
||||
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
|
||||
|
|
Loading…
Reference in New Issue