From c6fa8598e102d49fe28cb64aca2586d589912970 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 1 Jul 2021 22:13:45 +0200 Subject: [PATCH] massive code refactor: removed modules dhp-*-scholexplorer --- .../doiboost/DoiBoostMappingUtil.scala | 22 +- .../doiboost/crossref/Crossref2Oaf.scala | 10 +- .../sx/graph/SparkCreateSummaryObject.scala | 42 +++ .../dhp/sx/graph/ebi/EBIAggregator.scala | 177 --------- .../sx/graph/ebi/SparkAddLinkUpdates.scala | 248 ------------- .../parser/AbstractScholexplorerParser.java | 223 ------------ .../parser/DatasetScholexplorerParser.java | 340 ------------------ .../PublicationScholexplorerParser.java | 264 -------------- .../dhp/sx/graph/scholix/ScholixUtils.scala | 54 +++ .../dhp/sx/graph/create_summaries_params.json | 5 + .../dhp/sx/graph/step1/oozie_app/workflow.xml | 31 +- .../dhp/sx/graph/ScholexplorerParserTest.java | 63 ---- .../SparkScholexplorerAggregationTest.scala | 54 --- .../SparkScholexplorerGraphImporterTest.java | 6 - ...parkScholexplorerMergeEntitiesJobTest.java | 5 - .../java/eu/dnetlib/dhp/export/DLIToOAF.scala | 3 +- .../provision/SparkIndexCollectionOnES.java | 3 +- dhp-workflows/pom.xml | 4 +- pom.xml | 2 +- 19 files changed, 152 insertions(+), 1404 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/EBIAggregator.scala delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkAddLinkUpdates.scala delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/create_summaries_params.json delete mode 100644 dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/ScholexplorerParserTest.java delete mode 100644 dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerAggregationTest.scala delete mode 100644 dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporterTest.java delete mode 100644 dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerMergeEntitiesJobTest.java diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala index a6101c07e..85ae24d0c 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala @@ -6,7 +6,7 @@ import eu.dnetlib.dhp.utils.DHPUtils import org.apache.commons.lang3.StringUtils import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.schema.common.ModelConstants -import eu.dnetlib.dhp.schema.scholexplorer.OafUtils +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse @@ -118,11 +118,11 
@@ object DoiBoostMappingUtil { def getOpenAccessQualifier():AccessRight = { - OafUtils.createAccessRight("OPEN","Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) + OafMapperUtils.accessRight("OPEN","Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) } def getRestrictedQualifier():AccessRight = { - OafUtils.createAccessRight("RESTRICTED","Restricted",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) + OafMapperUtils.accessRight("RESTRICTED","Restricted",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) } @@ -150,7 +150,7 @@ object DoiBoostMappingUtil { if (item.openAccess) i.setAccessright(getOpenAccessQualifier()) val ar = getOpenAccessQualifier() - publication.setBestaccessright(OafUtils.createQualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename)) + publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename)) } else { hb = ModelConstants.UNKNOWN_REPOSITORY @@ -162,11 +162,11 @@ object DoiBoostMappingUtil { if (ar.nonEmpty) { if(ar.contains(ModelConstants.ACCESS_RIGHT_OPEN)){ val ar = getOpenAccessQualifier() - publication.setBestaccessright(OafUtils.createQualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename)) + publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename)) } else { val ar = getRestrictedQualifier() - publication.setBestaccessright(OafUtils.createQualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename)) + publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename)) } } publication @@ -254,7 +254,7 @@ object DoiBoostMappingUtil { di.setInferred(false) di.setInvisible(false) di.setTrust(trust) - di.setProvenanceaction(OafUtils.createQualifier(ModelConstants.SYSIMPORT_ACTIONSET, ModelConstants.DNET_PROVENANCE_ACTIONS)) + di.setProvenanceaction(OafMapperUtils.qualifier(ModelConstants.SYSIMPORT_ACTIONSET,ModelConstants.SYSIMPORT_ACTIONSET, ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS)) di } @@ -262,7 +262,7 @@ object DoiBoostMappingUtil { def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String): StructuredProperty = { val sp = new StructuredProperty - sp.setQualifier(OafUtils.createQualifier(classId,className, schemeId, schemeName)) + sp.setQualifier(OafMapperUtils.qualifier(classId,className, schemeId, schemeName)) sp.setValue(value) sp @@ -272,7 +272,7 @@ object DoiBoostMappingUtil { def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String, dataInfo: DataInfo): StructuredProperty = { val sp = new StructuredProperty - sp.setQualifier(OafUtils.createQualifier(classId,className, schemeId, schemeName)) + sp.setQualifier(OafMapperUtils.qualifier(classId,className, schemeId, schemeName)) sp.setValue(value) sp.setDataInfo(dataInfo) sp @@ -281,7 +281,7 @@ object DoiBoostMappingUtil { def createSP(value: String, classId: String, schemeId: String): StructuredProperty = { val sp = new StructuredProperty - sp.setQualifier(OafUtils.createQualifier(classId, schemeId)) + sp.setQualifier(OafMapperUtils.qualifier(classId,classId, schemeId, schemeId)) sp.setValue(value) sp @@ -291,7 +291,7 @@ object DoiBoostMappingUtil { def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = { val sp = 
new StructuredProperty - sp.setQualifier(OafUtils.createQualifier(classId, schemeId)) + sp.setQualifier(OafMapperUtils.qualifier(classId,classId, schemeId, schemeId)) sp.setValue(value) sp.setDataInfo(dataInfo) sp diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index f033c6d81..da211dd76 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -2,7 +2,7 @@ package eu.dnetlib.doiboost.crossref import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf._ -import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory +import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils} import eu.dnetlib.dhp.utils.DHPUtils import eu.dnetlib.doiboost.DoiBoostMappingUtil._ import org.apache.commons.lang.StringUtils @@ -15,8 +15,6 @@ import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.matching.Regex -import eu.dnetlib.dhp.schema.scholexplorer.OafUtils - import java.util case class CrossrefDT(doi: String, json:String, timestamp: Long) {} @@ -182,12 +180,12 @@ case object Crossref2Oaf { if(has_review != JNothing) { instance.setRefereed( - OafUtils.createQualifier("0001", "peerReviewed", ModelConstants.DNET_REVIEW_LEVELS, ModelConstants.DNET_REVIEW_LEVELS)) + OafMapperUtils.qualifier("0001", "peerReviewed", ModelConstants.DNET_REVIEW_LEVELS, ModelConstants.DNET_REVIEW_LEVELS)) } instance.setAccessright(getRestrictedQualifier()) - instance.setInstancetype(OafUtils.createQualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) - result.setResourcetype(OafUtils.createQualifier(cobjCategory.substring(0, 4),ModelConstants.DNET_DATA_CITE_RESOURCE)) + instance.setInstancetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) + result.setResourcetype(OafMapperUtils.qualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) instance.setCollectedfrom(createCrossrefCollectedFrom()) if (StringUtils.isNotBlank(issuedDate)) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala new file mode 100644 index 000000000..f37abfa2a --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala @@ -0,0 +1,42 @@ +package eu.dnetlib.dhp.sx.graph + +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.schema.oaf.Result +import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary +import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils +import org.apache.commons.io.IOUtils +import org.apache.spark.SparkConf +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.slf4j.{Logger, LoggerFactory} + +object SparkCreateSummaryObject { + + def main(args: Array[String]): Unit = { + val log: Logger = LoggerFactory.getLogger(getClass) + val conf: SparkConf = new SparkConf() + 
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_summaries_params.json"))) + parser.parseArgument(args) + val spark: SparkSession = + SparkSession + .builder() + .config(conf) + .appName(getClass.getSimpleName) + .master(parser.get("master")).getOrCreate() + + val sourcePath = parser.get("sourcePath") + log.info(s"sourcePath -> $sourcePath") + val targetPath = parser.get("targetPath") + log.info(s"targetPath -> $targetPath") + + implicit val resultEncoder:Encoder[Result] = Encoders.kryo[Result] + + implicit val summaryEncoder:Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] + + + val ds:Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Result] + + ds.repartition(6000).map(r => ScholixUtils.resultToSummary(r)).write.mode(SaveMode.Overwrite).save(targetPath) + + } + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/EBIAggregator.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/EBIAggregator.scala deleted file mode 100644 index 4e3fda800..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/EBIAggregator.scala +++ /dev/null @@ -1,177 +0,0 @@ -package eu.dnetlib.dhp.sx.graph.ebi - -import eu.dnetlib.dhp.oa.merge.AuthorMerger -import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset} -import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown} -import org.apache.spark.sql.{Encoder, Encoders} -import org.apache.spark.sql.expressions.Aggregator - - - -object EBIAggregator { - - def getDatasetAggregator(): Aggregator[(String, OafDataset), OafDataset, OafDataset] = new Aggregator[(String, OafDataset), OafDataset, OafDataset]{ - - override def zero: OafDataset = new OafDataset() - - override def reduce(b: OafDataset, a: (String, OafDataset)): OafDataset = { - b.mergeFrom(a._2) - b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor)) - if (b.getId == null) - b.setId(a._2.getId) - b - } - - - override def merge(wx: OafDataset, wy: OafDataset): OafDataset = { - wx.mergeFrom(wy) - wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor)) - if(wx.getId == null && wy.getId.nonEmpty) - wx.setId(wy.getId) - wx - } - override def finish(reduction: OafDataset): OafDataset = reduction - - override def bufferEncoder: Encoder[OafDataset] = - Encoders.kryo(classOf[OafDataset]) - - override def outputEncoder: Encoder[OafDataset] = - Encoders.kryo(classOf[OafDataset]) - } - - def getDLIUnknownAggregator(): Aggregator[(String, DLIUnknown), DLIUnknown, DLIUnknown] = new Aggregator[(String, DLIUnknown), DLIUnknown, DLIUnknown]{ - - override def zero: DLIUnknown = new DLIUnknown() - - override def reduce(b: DLIUnknown, a: (String, DLIUnknown)): DLIUnknown = { - b.mergeFrom(a._2) - if (b.getId == null) - b.setId(a._2.getId) - b - } - - override def merge(wx: DLIUnknown, wy: DLIUnknown): DLIUnknown = { - wx.mergeFrom(wy) - if(wx.getId == null && wy.getId.nonEmpty) - wx.setId(wy.getId) - wx - } - override def finish(reduction: DLIUnknown): DLIUnknown = reduction - - override def bufferEncoder: Encoder[DLIUnknown] = - Encoders.kryo(classOf[DLIUnknown]) - - override def outputEncoder: Encoder[DLIUnknown] = - Encoders.kryo(classOf[DLIUnknown]) - } - - def getDLIDatasetAggregator(): Aggregator[(String, DLIDataset), DLIDataset, DLIDataset] = new Aggregator[(String, DLIDataset), DLIDataset, DLIDataset]{ - - override def zero: DLIDataset = new DLIDataset() - - override def 
reduce(b: DLIDataset, a: (String, DLIDataset)): DLIDataset = { - b.mergeFrom(a._2) - b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor)) - if (b.getId == null) - b.setId(a._2.getId) - b - } - - override def merge(wx: DLIDataset, wy: DLIDataset): DLIDataset = { - wx.mergeFrom(wy) - wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor)) - if(wx.getId == null && wy.getId.nonEmpty) - wx.setId(wy.getId) - wx - } - override def finish(reduction: DLIDataset): DLIDataset = reduction - - override def bufferEncoder: Encoder[DLIDataset] = - Encoders.kryo(classOf[DLIDataset]) - - override def outputEncoder: Encoder[DLIDataset] = - Encoders.kryo(classOf[DLIDataset]) - } - - - def getDLIPublicationAggregator(): Aggregator[(String, DLIPublication), DLIPublication, DLIPublication] = new Aggregator[(String, DLIPublication), DLIPublication, DLIPublication]{ - - override def zero: DLIPublication = new DLIPublication() - - override def reduce(b: DLIPublication, a: (String, DLIPublication)): DLIPublication = { - b.mergeFrom(a._2) - b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor)) - - if (b.getId == null) - b.setId(a._2.getId) - b - } - - - override def merge(wx: DLIPublication, wy: DLIPublication): DLIPublication = { - wx.mergeFrom(wy) - wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor)) - if(wx.getId == null && wy.getId.nonEmpty) - wx.setId(wy.getId) - wx - } - override def finish(reduction: DLIPublication): DLIPublication = reduction - - override def bufferEncoder: Encoder[DLIPublication] = - Encoders.kryo(classOf[DLIPublication]) - - override def outputEncoder: Encoder[DLIPublication] = - Encoders.kryo(classOf[DLIPublication]) - } - - - def getPublicationAggregator(): Aggregator[(String, Publication), Publication, Publication] = new Aggregator[(String, Publication), Publication, Publication]{ - - override def zero: Publication = new Publication() - - override def reduce(b: Publication, a: (String, Publication)): Publication = { - b.mergeFrom(a._2) - b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor)) - if (b.getId == null) - b.setId(a._2.getId) - b - } - - - override def merge(wx: Publication, wy: Publication): Publication = { - wx.mergeFrom(wy) - wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor)) - if(wx.getId == null && wy.getId.nonEmpty) - wx.setId(wy.getId) - wx - } - override def finish(reduction: Publication): Publication = reduction - - override def bufferEncoder: Encoder[Publication] = - Encoders.kryo(classOf[Publication]) - - override def outputEncoder: Encoder[Publication] = - Encoders.kryo(classOf[Publication]) - } - - def getRelationAggregator(): Aggregator[(String, Relation), Relation, Relation] = new Aggregator[(String, Relation), Relation, Relation]{ - - override def zero: Relation = new Relation() - - override def reduce(b: Relation, a: (String, Relation)): Relation = { - a._2 - } - - - override def merge(a: Relation, b: Relation): Relation = { - if(b!= null) b else a - } - override def finish(reduction: Relation): Relation = reduction - - override def bufferEncoder: Encoder[Relation] = - Encoders.kryo(classOf[Relation]) - - override def outputEncoder: Encoder[Relation] = - Encoders.kryo(classOf[Relation]) - } -} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkAddLinkUpdates.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkAddLinkUpdates.scala deleted file mode 100644 index a8dd93688..000000000 --- 
a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkAddLinkUpdates.scala +++ /dev/null @@ -1,248 +0,0 @@ -package eu.dnetlib.dhp.sx.graph.ebi - -import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.common.ModelConstants -import eu.dnetlib.dhp.schema.oaf.{Author, Instance, Journal, KeyValue, Oaf, Publication, Relation, Dataset => OafDataset} -import eu.dnetlib.dhp.schema.scholexplorer.OafUtils.createQualifier -import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, OafUtils, ProvenaceInfo} -import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal} -import eu.dnetlib.dhp.utils.DHPUtils -import eu.dnetlib.scholexplorer.relation.RelationMapper -import org.apache.commons.io.IOUtils -import org.apache.spark.SparkConf -import org.apache.spark.sql._ -import org.json4s -import org.json4s.DefaultFormats -import org.json4s.JsonAST.{JField, JObject, JString} -import org.json4s.jackson.JsonMethods.parse -import org.apache.spark.sql.functions._ - -import scala.collection.JavaConverters._ - -object SparkAddLinkUpdates { - - val relationMapper: RelationMapper = RelationMapper.load - - -case class EBILinks(relation:String, pubdate:String, tpid:String, tpidType:String, turl:String, title:String, publisher:String) {} - - - def generatePubmedDLICollectedFrom(): KeyValue = { - OafUtils.generateKeyValue("dli_________::europe_pmc__", "Europe PMC") - } - - - - def journalToOAF(pj:PMJournal): Journal = { - val j = new Journal - j.setIssnPrinted(pj.getIssn) - j.setVol(pj.getVolume) - j.setName(pj.getTitle) - j.setIss(pj.getIssue) - j.setDataInfo(OafUtils.generateDataInfo()) - j - } - - - def pubmedTOPublication(input:PMArticle):DLIPublication = { - - - val dnetPublicationId = s"50|${DHPUtils.md5(s"${input.getPmid}::pmid")}" - - val p = new DLIPublication - p.setId(dnetPublicationId) - p.setDataInfo(OafUtils.generateDataInfo()) - p.setPid(List(OafUtils.createSP(input.getPmid.toLowerCase.trim, "pmid", ModelConstants.DNET_PID_TYPES)).asJava) - p.setCompletionStatus("complete") - val pi = new ProvenaceInfo - pi.setId("dli_________::europe_pmc__") - pi.setName( "Europe PMC") - pi.setCompletionStatus("complete") - pi.setCollectionMode("collected") - p.setDlicollectedfrom(List(pi).asJava) - p.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava) - - if (input.getAuthors != null && input.getAuthors.size() >0) { - var aths: List[Author] = List() - input.getAuthors.asScala.filter(a=> a!= null).foreach(a => { - val c = new Author - c.setFullname(a.getFullName) - c.setName(a.getForeName) - c.setSurname(a.getLastName) - aths = aths ::: List(c) - }) - if (aths.nonEmpty) - p.setAuthor(aths.asJava) - } - - - if (input.getJournal != null) - p.setJournal(journalToOAF(input.getJournal)) - p.setTitle(List(OafUtils.createSP(input.getTitle, "main title", ModelConstants.DNET_DATACITE_TITLE)).asJava) - p.setDateofacceptance(OafUtils.asField(input.getDate)) - val i = new Instance - i.setCollectedfrom(generatePubmedDLICollectedFrom()) - i.setDateofacceptance(p.getDateofacceptance) - i.setUrl(List(s"https://pubmed.ncbi.nlm.nih.gov/${input.getPmid}").asJava) - i.setInstancetype(createQualifier("0001", "Article", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) - p.setInstance(List(i).asJava) - p - } - - - def ebiLinksToOaf(input:(String, String)):List[Oaf] = { - val pmid :String = input._1 - val input_json :String = input._2 - implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats 
- lazy val json: json4s.JValue = parse(input_json) - - - val targets:List[EBILinks] = for { - JObject(link) <- json \\ "Category" \\ "Link" - JField("PublicationDate", JString(pubdate)) <- link - JField("RelationshipType", JObject(relationshipType)) <- link - JField("Name", JString(relname)) <- relationshipType - JField("Target", JObject(target)) <- link - JField("Identifier", JObject(identifier)) <- target - JField("ID", JString(tpid)) <- identifier - JField("IDScheme", JString(tpidtype)) <- identifier - JField("IDURL", JString(turl)) <- identifier - JField("Title", JString(title)) <- target - JField("Publisher", JObject(pub)) <- target - JField("Name", JString(publisher)) <- pub - } yield EBILinks(relname, pubdate, tpid, tpidtype, turl,title, publisher) - - - - val dnetPublicationId = s"50|${DHPUtils.md5(s"$pmid::pmid")}" - - targets.flatMap(l => { - val relation = new Relation - val inverseRelation = new Relation - val targetDnetId = s"50|${DHPUtils.md5(s"${l.tpid.toLowerCase.trim}::${l.tpidType.toLowerCase.trim}")}" - val relInfo = relationMapper.get(l.relation.toLowerCase) - val relationSemantic = relInfo.getOriginal - val inverseRelationSemantic = relInfo.getInverse - - relation.setSource(dnetPublicationId) - relation.setTarget(targetDnetId) - relation.setRelClass("datacite") - relation.setRelType(relationSemantic) - relation.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava) - - inverseRelation.setSource(targetDnetId) - inverseRelation.setTarget(dnetPublicationId) - inverseRelation.setRelClass("datacite") - inverseRelation.setRelType(inverseRelationSemantic) - inverseRelation.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava) - - - - val d = new DLIDataset - d.setId(targetDnetId) - d.setDataInfo(OafUtils.generateDataInfo()) - d.setPid(List(OafUtils.createSP(l.tpid.toLowerCase.trim, l.tpidType.toLowerCase.trim, ModelConstants.DNET_PID_TYPES)).asJava) - d.setCompletionStatus("complete") - val pi = new ProvenaceInfo - pi.setId("dli_________::europe_pmc__") - pi.setName( "Europe PMC") - pi.setCompletionStatus("complete") - pi.setCollectionMode("collected") - d.setDlicollectedfrom(List(pi).asJava) - d.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava) - d.setPublisher(OafUtils.asField(l.publisher)) - d.setTitle(List(OafUtils.createSP(l.title, "main title", ModelConstants.DNET_DATACITE_TITLE)).asJava) - d.setDateofacceptance(OafUtils.asField(l.pubdate)) - val i = new Instance - i.setCollectedfrom(generatePubmedDLICollectedFrom()) - i.setDateofacceptance(d.getDateofacceptance) - i.setUrl(List(l.turl).asJava) - i.setInstancetype(createQualifier("0021", "Dataset", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)) - d.setInstance(List(i).asJava) - List(relation, inverseRelation, d) - }) - } - - - def main(args: Array[String]): Unit = { - val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEBILinksToOaf.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/ebi_to_df_params.json"))) - parser.parseArgument(args) - val spark: SparkSession = - SparkSession - .builder() - .config(conf) - .appName(SparkEBILinksToOaf.getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() - - - val workingPath = parser.get("workingPath") - implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) - implicit val oafpubEncoder: Encoder[Publication] = Encoders.kryo[Publication] - implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) - implicit 
val datEncoder: Encoder[DLIDataset] = Encoders.kryo(classOf[DLIDataset]) - implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo(classOf[DLIPublication]) - implicit val atEncoder: Encoder[Author] = Encoders.kryo(classOf[Author]) - implicit val strEncoder:Encoder[String] = Encoders.STRING - implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle]) - implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal]) - implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor]) - - - val ds:Dataset[(String,String)] = spark.read.load(s"$workingPath/baseline_links_updates").as[(String,String)](Encoders.tuple(Encoders.STRING, Encoders.STRING)) - - ds.flatMap(l =>ebiLinksToOaf(l)).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_oaf") - - ds.filter(s => s.isInstanceOf) - - - - val oDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/baseline_links_updates_oaf").as[Oaf] - - oDataset.filter(p =>p.isInstanceOf[Relation]).map(p => p.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_relation") - oDataset.filter(p =>p.isInstanceOf[DLIDataset]).map(p => p.asInstanceOf[DLIDataset]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_dataset") - - - val idPublicationSolved:Dataset[String] = spark.read.load(s"$workingPath/baseline_links_updates").where(col("links").isNotNull).select("pmid").as[String] - val baseline:Dataset[(String, PMArticle)]= spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle].map(p=> (p.getPmid, p))(Encoders.tuple(strEncoder,PMEncoder)) - idPublicationSolved.joinWith(baseline, idPublicationSolved("pmid").equalTo(baseline("_1"))).map(k => pubmedTOPublication(k._2._2)).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_publication") - - - val pmaDatasets = spark.read.load("/user/sandro.labruzzo/scholix/EBI/ebi_garr/baseline_dataset").as[PMArticle] - - pmaDatasets.map(p => pubmedTOPublication(p)).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_publication_all") - - val pubs: Dataset[(String,Publication)] = spark.read.load("/user/sandro.labruzzo/scholix/EBI/publication").as[Publication].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING,oafpubEncoder)) - val pubdate:Dataset[(String,DLIPublication)] = spark.read.load(s"$workingPath/baseline_publication_all").as[DLIPublication].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING,pubEncoder)) - - - - pubs.joinWith(pubdate, pubs("_1").equalTo(pubdate("_1"))).map(k => k._2._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_publication_ebi") - - - - val dt : Dataset[DLIDataset] = spark.read.load(s"$workingPath/dataset").as[DLIDataset] - val update : Dataset[DLIDataset] = spark.read.load(s"$workingPath/ebi_garr/baseline_links_updates_dataset").as[DLIDataset] - - - dt.union(update).map(d => (d.getId,d))(Encoders.tuple(Encoders.STRING, datEncoder)) - .groupByKey(_._1)(Encoders.STRING) - .agg(EBIAggregator.getDLIDatasetAggregator().toColumn) - .map(p => p._2) - .write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset_ebi") - - - val rel: Dataset[Relation] = spark.read.load(s"$workingPath/relation").as[Relation] - val relupdate : Dataset[Relation] = spark.read.load(s"$workingPath/ebi_garr/baseline_links_updates_relation").as[Relation] - - - rel.union(relupdate) - .map(d => (s"${d.getSource}::${d.getRelType}::${d.getTarget}", d))(Encoders.tuple(Encoders.STRING, relEncoder)) - .groupByKey(_._1)(Encoders.STRING) - 
.agg(EBIAggregator.getRelationAggregator().toColumn) - .map(p => p._2) - .write.mode(SaveMode.Overwrite) - .save(s"$workingPath/baseline_relation_ebi") - - } -} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java deleted file mode 100644 index 6e3dad7fd..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java +++ /dev/null @@ -1,223 +0,0 @@ - -package eu.dnetlib.dhp.sx.graph.parser; - -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - -import javax.xml.stream.XMLStreamReader; - -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown; -import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo; -import eu.dnetlib.dhp.utils.DHPUtils; -import eu.dnetlib.scholexplorer.relation.RelInfo; -import eu.dnetlib.scholexplorer.relation.RelationMapper; - -public abstract class AbstractScholexplorerParser { - - protected static final Log log = LogFactory.getLog(AbstractScholexplorerParser.class); - static final Pattern pattern = Pattern.compile("10\\.\\d{4,9}/[-._;()/:A-Z0-9]+$", Pattern.CASE_INSENSITIVE); - private final List datasetSubTypes = Arrays - .asList( - "dataset", - "software", - "film", - "sound", - "physicalobject", - "audiovisual", - "collection", - "other", - "study", - "metadata"); - - public abstract List parseObject(final String record, final RelationMapper relMapper); - - protected Map getAttributes(final XMLStreamReader parser) { - final Map attributesMap = new HashMap<>(); - for (int i = 0; i < parser.getAttributeCount(); i++) { - attributesMap.put(parser.getAttributeLocalName(i), parser.getAttributeValue(i)); - } - return attributesMap; - } - - protected List extractSubject(List subjects) { - final List subjectResult = new ArrayList<>(); - if (subjects != null && subjects.size() > 0) { - subjects - .forEach( - subjectMap -> { - final StructuredProperty subject = new StructuredProperty(); - subject.setValue(subjectMap.getTextValue()); - final Qualifier schema = new Qualifier(); - schema.setClassid("dnet:subject"); - schema.setClassname("dnet:subject"); - schema.setSchemeid(subjectMap.getAttributes().get("subjectScheme")); - schema.setSchemename(subjectMap.getAttributes().get("subjectScheme")); - subject.setQualifier(schema); - subjectResult.add(subject); - }); - } - return subjectResult; - } - - protected StructuredProperty extractIdentifier( - List identifierType, final String fieldName) { - final StructuredProperty pid = new StructuredProperty(); - if (identifierType != null && identifierType.size() > 0) { - final VtdUtilityParser.Node result = identifierType.get(0); - pid.setValue(result.getTextValue()); - final Qualifier pidType = new Qualifier(); - pidType.setClassname(result.getAttributes().get(fieldName)); - pidType.setClassid(result.getAttributes().get(fieldName)); - pidType.setSchemename(ModelConstants.DNET_PID_TYPES); - pidType.setSchemeid(ModelConstants.DNET_PID_TYPES); - pid.setQualifier(pidType); - return pid; - } - return null; - } - - protected void inferPid(final 
StructuredProperty input) { - final Matcher matcher = pattern.matcher(input.getValue()); - if (matcher.find()) { - input.setValue(matcher.group()); - if (input.getQualifier() == null) { - input.setQualifier(new Qualifier()); - input.getQualifier().setSchemename(ModelConstants.DNET_PID_TYPES); - input.getQualifier().setSchemeid(ModelConstants.DNET_PID_TYPES); - } - input.getQualifier().setClassid("doi"); - input.getQualifier().setClassname("doi"); - } - } - - protected String generateId(final String pid, final String pidType, final String entityType) { - String type; - switch (entityType) { - case "publication": - type = "50|"; - break; - case "dataset": - type = "60|"; - break; - case "unknown": - type = "70|"; - break; - default: - throw new IllegalArgumentException("unexpected value " + entityType); - } - if ("dnet".equalsIgnoreCase(pidType)) - return type + StringUtils.substringAfter(pid, "::"); - - return type - + DHPUtils - .md5( - String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim())); - } - - protected DLIUnknown createUnknownObject( - final String pid, - final String pidType, - final KeyValue cf, - final DataInfo di, - final String dateOfCollection) { - final DLIUnknown uk = new DLIUnknown(); - uk.setId(generateId(pid, pidType, "unknown")); - ProvenaceInfo pi = new ProvenaceInfo(); - pi.setId(cf.getKey()); - pi.setName(cf.getValue()); - pi.setCompletionStatus("incomplete"); - uk.setDataInfo(di); - uk.setDlicollectedfrom(Collections.singletonList(pi)); - final StructuredProperty sourcePid = new StructuredProperty(); - sourcePid.setValue(pid); - final Qualifier pt = new Qualifier(); - pt.setClassname(pidType); - pt.setClassid(pidType); - pt.setSchemename(ModelConstants.DNET_PID_TYPES); - pt.setSchemeid(ModelConstants.DNET_PID_TYPES); - sourcePid.setQualifier(pt); - uk.setPid(Collections.singletonList(sourcePid)); - uk.setDateofcollection(dateOfCollection); - return uk; - } - - protected Qualifier generateQualifier(final String classId, final String className, final String schemeId, - final String schemeName) { - final Qualifier q = new Qualifier(); - q.setClassid(classId); - q.setClassid(className); - q.setSchemeid(schemeId); - q.setSchemename(schemeName); - return q; - - } - - protected void generateRelations( - RelationMapper relationMapper, - Result parsedObject, - List result, - DataInfo di, - String dateOfCollection, - List relatedIdentifiers) { - if (relatedIdentifiers != null) { - result - .addAll( - relatedIdentifiers - .stream() - .flatMap( - n -> { - final List rels = new ArrayList<>(); - Relation r = new Relation(); - r.setSource(parsedObject.getId()); - final String relatedPid = n.getTextValue(); - final String relatedPidType = n.getAttributes().get("relatedIdentifierType"); - final String relatedType = n.getAttributes().getOrDefault("entityType", "unknown"); - String relationSemantic = n.getAttributes().get("relationType"); - String inverseRelation; - final String targetId = generateId(relatedPid, relatedPidType, relatedType); - if (relationMapper.containsKey(relationSemantic.toLowerCase())) { - RelInfo relInfo = relationMapper.get(relationSemantic.toLowerCase()); - relationSemantic = relInfo.getOriginal(); - inverseRelation = relInfo.getInverse(); - } else { - relationSemantic = "Unknown"; - inverseRelation = "Unknown"; - } - r.setTarget(targetId); - r.setRelType(relationSemantic); - r.setRelClass("datacite"); - r.setCollectedfrom(parsedObject.getCollectedfrom()); - r.setDataInfo(di); - rels.add(r); - r = new Relation(); - r.setDataInfo(di); - 
r.setSource(targetId); - r.setTarget(parsedObject.getId()); - r.setRelType(inverseRelation); - r.setRelClass("datacite"); - r.setCollectedfrom(parsedObject.getCollectedfrom()); - rels.add(r); - if ("unknown".equalsIgnoreCase(relatedType)) - result - .add( - createUnknownObject( - relatedPid, - relatedPidType, - parsedObject.getCollectedfrom().get(0), - di, - dateOfCollection)); - return rels.stream(); - }) - .collect(Collectors.toList())); - } - } -} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java deleted file mode 100644 index 4493010a0..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java +++ /dev/null @@ -1,340 +0,0 @@ - -package eu.dnetlib.dhp.sx.graph.parser; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; - -import org.apache.commons.lang3.StringUtils; - -import com.ximpleware.AutoPilot; -import com.ximpleware.VTDGen; -import com.ximpleware.VTDNav; - -import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; -import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset; -import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo; -import eu.dnetlib.scholexplorer.relation.RelationMapper; - -public class DatasetScholexplorerParser extends AbstractScholexplorerParser { - @Override - public List parseObject(String record, final RelationMapper relationMapper) { - try { - final DLIDataset parsedObject = new DLIDataset(); - final VTDGen vg = new VTDGen(); - vg.setDoc(record.getBytes()); - final List result = new ArrayList<>(); - vg.parse(true); - - final VTDNav vn = vg.getNav(); - final AutoPilot ap = new AutoPilot(vn); - - DataInfo di = new DataInfo(); - di.setTrust("0.9"); - di.setDeletedbyinference(false); - di.setInvisible(false); - parsedObject.setDataInfo(di); - - parsedObject - .setOriginalId( - Collections - .singletonList( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']"))); - - parsedObject - .setOriginalObjIdentifier( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']")); - String dateOfCollection = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']"); - parsedObject.setDateofcollection(dateOfCollection); - - final String resolvedDate = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']"); - - if (StringUtils.isNotBlank(resolvedDate)) { - StructuredProperty currentDate = new StructuredProperty(); - currentDate.setValue(resolvedDate); - final Qualifier dateQualifier = new Qualifier(); - dateQualifier.setClassname("resolvedDate"); - dateQualifier.setClassid("resolvedDate"); - dateQualifier.setSchemename("dnet::date"); - dateQualifier.setSchemeid("dnet::date"); - currentDate.setQualifier(dateQualifier); - parsedObject.setRelevantdate(Collections.singletonList(currentDate)); - } - final String completionStatus = VtdUtilityParser - .getSingleValue(ap, vn, "//*[local-name()='completionStatus']"); - final String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']"); - - final String publisher = VtdUtilityParser - .getSingleValue( - ap, vn, 
"//*[local-name()='resource']/*[local-name()='publisher']"); - - List collectedFromNodes = VtdUtilityParser - .getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='collectedFrom']", - Arrays.asList("name", "id", "mode", "completionStatus")); - - List resolvededFromNodes = VtdUtilityParser - .getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='resolvedFrom']", - Arrays.asList("name", "id", "mode", "completionStatus")); - - Field pf = new Field<>(); - pf.setValue(publisher); - - parsedObject.setPublisher(pf); - final List provenances = new ArrayList<>(); - if (collectedFromNodes != null && collectedFromNodes.size() > 0) { - collectedFromNodes - .forEach( - it -> { - final ProvenaceInfo provenance = new ProvenaceInfo(); - provenance.setId(it.getAttributes().get("id")); - provenance.setName(it.getAttributes().get("name")); - provenance.setCollectionMode(provisionMode); - provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); - provenances.add(provenance); - }); - } - - if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) { - resolvededFromNodes - .forEach( - it -> { - final ProvenaceInfo provenance = new ProvenaceInfo(); - provenance.setId(it.getAttributes().get("id")); - provenance.setName(it.getAttributes().get("name")); - provenance.setCollectionMode("resolved"); - provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); - provenances.add(provenance); - }); - } - - parsedObject.setDlicollectedfrom(provenances); - parsedObject - .setCollectedfrom( - parsedObject - .getDlicollectedfrom() - .stream() - .map( - p -> { - final KeyValue cf = new KeyValue(); - cf.setKey(p.getId()); - cf.setValue(p.getName()); - return cf; - }) - .collect(Collectors.toList())); - parsedObject - .setCompletionStatus( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']")); - - final List identifierType = VtdUtilityParser - .getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='resource']/*[local-name()='identifier']", - Collections.singletonList("identifierType")); - - StructuredProperty currentPid = extractIdentifier(identifierType, "identifierType"); - if (currentPid == null) - return null; - inferPid(currentPid); - parsedObject.setPid(Collections.singletonList(currentPid)); - - String resolvedURL = null; - - switch (currentPid.getQualifier().getClassname().toLowerCase()) { - case "uniprot": - resolvedURL = "https://www.uniprot.org/uniprot/" + currentPid.getValue(); - break; - case "ena": - if (StringUtils.isNotBlank(currentPid.getValue()) && currentPid.getValue().length() > 7) - resolvedURL = "https://www.ebi.ac.uk/ena/data/view/" + currentPid.getValue().substring(0, 8); - break; - case "chembl": - resolvedURL = "https://www.ebi.ac.uk/chembl/compound_report_card/" + currentPid.getValue(); - break; - - case "ncbi-n": - resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue(); - break; - case "ncbi-p": - resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue(); - break; - case "genbank": - resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue(); - break; - case "pdb": - resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue(); - break; - case "url": - resolvedURL = currentPid.getValue(); - break; - } - - final String sourceId = generateId( - currentPid.getValue(), currentPid.getQualifier().getClassid(), "dataset"); - parsedObject.setId(sourceId); - - List descs = VtdUtilityParser.getTextValue(ap, vn, 
"//*[local-name()='description']"); - if (descs != null && descs.size() > 0) - parsedObject - .setDescription( - descs - .stream() -// .map(it -> it.length() < 10000 ? it : it.substring(0, 10000)) - .map( - it -> { - final Field d = new Field<>(); - d.setValue(it); - return d; - }) - .collect(Collectors.toList())); - - final List relatedIdentifiers = VtdUtilityParser - .getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='relatedIdentifier']", - Arrays - .asList( - "relatedIdentifierType", "relationType", "entityType", "inverseRelationType")); - - generateRelations( - relationMapper, parsedObject, result, di, dateOfCollection, relatedIdentifiers); - - final List hostedBy = VtdUtilityParser - .getTextValuesWithAttributes( - ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name")); - - if (hostedBy != null) { - parsedObject - .setInstance( - hostedBy - .stream() - .map( - it -> { - final Instance i = new Instance(); - i.setUrl(Collections.singletonList(currentPid.getValue())); - KeyValue h = new KeyValue(); - i.setHostedby(h); - h.setKey(it.getAttributes().get("id")); - h.setValue(it.getAttributes().get("name")); - return i; - }) - .collect(Collectors.toList())); - } - - List subjects = extractSubject( - VtdUtilityParser - .getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='resource']//*[local-name()='subject']", - Collections.singletonList("subjectScheme"))); - - parsedObject.setSubject(subjects); - - Qualifier q = new Qualifier(); - q.setClassname("dataset"); - q.setClassid("dataset"); - q.setSchemename("dataset"); - q.setSchemeid("dataset"); - parsedObject.setResulttype(q); - - parsedObject.setCompletionStatus(completionStatus); - - final List creators = VtdUtilityParser - .getTextValue( - ap, - vn, - "//*[local-name()='resource']//*[local-name()='creator']/*[local-name()='creatorName']"); - if (creators != null && creators.size() > 0) { - parsedObject - .setAuthor( - creators - .stream() - .map( - a -> { - final Author author = new Author(); - author.setFullname(a); - return author; - }) - .collect(Collectors.toList())); - } - final List titles = VtdUtilityParser - .getTextValue( - ap, vn, "//*[local-name()='resource']//*[local-name()='title']"); - if (titles != null && titles.size() > 0) { - parsedObject - .setTitle( - titles - .stream() - .map( - t -> { - final StructuredProperty st = new StructuredProperty(); - st.setValue(t); - st.setQualifier(ModelConstants.MAIN_TITLE_QUALIFIER); - return st; - }) - .collect(Collectors.toList())); - } - - final List dates = VtdUtilityParser - .getTextValue( - ap, - vn, - "//*[local-name()='resource']/*[local-name()='dates']/*[local-name()='date']"); - - if (dates != null && dates.size() > 0) { - parsedObject - .setRelevantdate( - dates - .stream() - .map( - cd -> { - StructuredProperty date = new StructuredProperty(); - date.setValue(cd); - final Qualifier dq = new Qualifier(); - dq.setClassname("date"); - dq.setClassid("date"); - dq.setSchemename("dnet::date"); - dq.setSchemeid("dnet::date"); - date.setQualifier(dq); - return date; - }) - .collect(Collectors.toList())); - } - - // TERRIBLE HACK TO AVOID EMPTY COLLECTED FROM - if (parsedObject.getDlicollectedfrom() == null) { - - final KeyValue cf = new KeyValue(); - cf.setKey("dli_________::europe_pmc__"); - cf.setValue("Europe PMC"); - parsedObject.setCollectedfrom(Collections.singletonList(cf)); - } - - if (StringUtils.isNotBlank(resolvedURL)) { - Instance i = new Instance(); - i.setCollectedfrom(parsedObject.getCollectedfrom().get(0)); - 
i.setUrl(Collections.singletonList(resolvedURL)); - parsedObject.setInstance(Collections.singletonList(i)); - } - - result.add(parsedObject); - return result; - } catch (Throwable e) { - log.error("Error on parsing record " + record, e); - return null; - } - } -} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java deleted file mode 100644 index 8d76004dc..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java +++ /dev/null @@ -1,264 +0,0 @@ - -package eu.dnetlib.dhp.sx.graph.parser; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; - -import org.apache.commons.lang3.StringUtils; - -import com.ximpleware.AutoPilot; -import com.ximpleware.VTDGen; -import com.ximpleware.VTDNav; - -import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; -import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication; -import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo; -import eu.dnetlib.scholexplorer.relation.RelationMapper; - -public class PublicationScholexplorerParser extends AbstractScholexplorerParser { - - @Override - public List parseObject(final String record, final RelationMapper relationMapper) { - try { - final List result = new ArrayList<>(); - final DLIPublication parsedObject = new DLIPublication(); - final VTDGen vg = new VTDGen(); - vg.setDoc(record.getBytes()); - vg.parse(true); - - final VTDNav vn = vg.getNav(); - final AutoPilot ap = new AutoPilot(vn); - - final DataInfo di = new DataInfo(); - di.setTrust("0.9"); - di.setDeletedbyinference(false); - di.setInvisible(false); - - String dateOfCollection = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']"); - parsedObject.setDateofcollection(dateOfCollection); - - final String resolvedDate = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']"); - parsedObject - .setOriginalId( - Collections - .singletonList( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']"))); - - if (StringUtils.isNotBlank(resolvedDate)) { - StructuredProperty currentDate = new StructuredProperty(); - currentDate.setValue(resolvedDate); - final Qualifier dateQualifier = new Qualifier(); - dateQualifier.setClassname("resolvedDate"); - dateQualifier.setClassid("resolvedDate"); - dateQualifier.setSchemename("dnet::date"); - dateQualifier.setSchemeid("dnet::date"); - currentDate.setQualifier(dateQualifier); - parsedObject.setRelevantdate(Collections.singletonList(currentDate)); - } - - final List pid = VtdUtilityParser - .getTextValuesWithAttributes( - ap, vn, "//*[local-name()='pid']", Arrays.asList("type")); - - StructuredProperty currentPid = extractIdentifier(pid, "type"); - if (currentPid == null) - return null; - inferPid(currentPid); - parsedObject.setPid(Collections.singletonList(currentPid)); - final String sourceId = generateId( - currentPid.getValue(), currentPid.getQualifier().getClassid(), "publication"); - parsedObject.setId(sourceId); - - parsedObject - .setOriginalObjIdentifier( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']")); - - String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, 
"//*[local-name()='provisionMode']"); - - List collectedFromNodes = VtdUtilityParser - .getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='collectedFrom']", - Arrays.asList("name", "id", "mode", "completionStatus")); - - List resolvededFromNodes = VtdUtilityParser - .getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='resolvedFrom']", - Arrays.asList("name", "id", "mode", "completionStatus")); - - final String publisher = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='publisher']"); - Field pf = new Field<>(); - pf.setValue(publisher); - - parsedObject.setPublisher(pf); - final List provenances = new ArrayList<>(); - if (collectedFromNodes != null && collectedFromNodes.size() > 0) { - collectedFromNodes - .forEach( - it -> { - final ProvenaceInfo provenance = new ProvenaceInfo(); - provenance.setId(it.getAttributes().get("id")); - provenance.setName(it.getAttributes().get("name")); - provenance.setCollectionMode(provisionMode); - provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); - provenances.add(provenance); - }); - } - - if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) { - resolvededFromNodes - .forEach( - it -> { - final ProvenaceInfo provenance = new ProvenaceInfo(); - provenance.setId(it.getAttributes().get("id")); - provenance.setName(it.getAttributes().get("name")); - provenance.setCollectionMode("resolved"); - provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); - provenances.add(provenance); - }); - } - - parsedObject.setDlicollectedfrom(provenances); - parsedObject - .setCompletionStatus( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']")); - - parsedObject - .setCollectedfrom( - parsedObject - .getDlicollectedfrom() - .stream() - .map( - p -> { - final KeyValue cf = new KeyValue(); - cf.setKey(p.getId()); - cf.setValue(p.getName()); - return cf; - }) - .collect(Collectors.toList())); - - final List relatedIdentifiers = VtdUtilityParser - .getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='relatedIdentifier']", - Arrays - .asList( - "relatedIdentifierType", "relationType", "entityType", "inverseRelationType")); - generateRelations( - relationMapper, parsedObject, result, di, dateOfCollection, relatedIdentifiers); - - final List hostedBy = VtdUtilityParser - .getTextValuesWithAttributes( - ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name")); - - if (hostedBy != null) { - parsedObject - .setInstance( - hostedBy - .stream() - .map( - it -> { - final Instance i = new Instance(); - i.setUrl(Collections.singletonList(currentPid.getValue())); - KeyValue h = new KeyValue(); - i.setHostedby(h); - h.setKey(it.getAttributes().get("id")); - h.setValue(it.getAttributes().get("name")); - return i; - }) - .collect(Collectors.toList())); - } - - final List authorsNode = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='creator']"); - if (authorsNode != null) - parsedObject - .setAuthor( - authorsNode - .stream() - .map( - a -> { - final Author author = new Author(); - author.setFullname(a); - return author; - }) - .collect(Collectors.toList())); - - final List titles = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='title']"); - if (titles != null) { - parsedObject - .setTitle( - titles - .stream() - .map( - t -> { - final StructuredProperty st = new StructuredProperty(); - st.setValue(t); - st - .setQualifier( - generateQualifier( - "main title", "main title", "dnet:dataCite_title", - "dnet:dataCite_title")); - return st; 
- }) - .collect(Collectors.toList())); - } - - Field description = new Field<>(); - - description - .setValue( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='description']")); - -// if (StringUtils.isNotBlank(description.getValue()) -// && description.getValue().length() > 10000) { -// description.setValue(description.getValue().substring(0, 10000)); -// } - - parsedObject.setDescription(Collections.singletonList(description)); - - final String cd = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='date']"); - - StructuredProperty date = new StructuredProperty(); - date.setValue(cd); - final Qualifier dq = new Qualifier(); - dq.setClassname("date"); - dq.setClassid("date"); - dq.setSchemename("dnet::date"); - dq.setSchemeid("dnet::date"); - date.setQualifier(dq); - parsedObject.setRelevantdate(Collections.singletonList(date)); - - List subjects = extractSubject( - VtdUtilityParser - .getTextValuesWithAttributes( - ap, vn, "//*[local-name()='subject']", Collections.singletonList("scheme"))); - parsedObject.setSubject(subjects); - - parsedObject.setDataInfo(di); - - parsedObject.setSubject(subjects); - Qualifier q = new Qualifier(); - q.setClassname("publication"); - q.setClassid("publication"); - q.setSchemename("publication"); - q.setSchemeid("publication"); - parsedObject.setResulttype(q); - result.add(parsedObject); - return result; - - } catch (Throwable e) { - log.error("Input record: " + record); - log.error("Error on parsing record ", e); - return null; - } - } -} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala new file mode 100644 index 000000000..ba1ebdb64 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala @@ -0,0 +1,54 @@ +package eu.dnetlib.dhp.sx.graph.scholix + +import eu.dnetlib.dhp.schema.oaf.{Dataset, Result} +import eu.dnetlib.dhp.schema.sx.summary.{SchemeValue, ScholixSummary, TypedIdentifier, Typology} + +import scala.collection.JavaConverters._ + +object ScholixUtils { + + + def resultToSummary(r:Result):ScholixSummary = { + val s = new ScholixSummary + s.setId(r.getId) + s.setLocalIdentifier(r.getPid.asScala.map(p => new TypedIdentifier(p.getValue, p.getQualifier.getClassid)).asJava) + + if (r.isInstanceOf[Dataset]) + s.setTypology(Typology.dataset) + else + s.setTypology(Typology.publication) + + s.setSubType(r.getInstance().get(0).getInstancetype.getClassname) + + if (r.getTitle!= null && r.getTitle.asScala.nonEmpty) { + s.setTitle(r.getTitle.asScala.map(t => t.getValue).asJava) + } + + if(r.getAuthor!= null && !r.getAuthor.isEmpty) { + s.setAuthor(r.getAuthor.asScala.map(a=> a.getFullname).asJava) + } + if (r.getInstance() != null) { + val dt:List[String] = r.getInstance().asScala.filter(i => i.getDateofacceptance != null).map(i => i.getDateofacceptance.getValue)(collection.breakOut) + if (dt.nonEmpty) + s.setDate(dt.asJava) + } + if (r.getDescription!= null && !r.getDescription.isEmpty) { + val d = r.getDescription.asScala.find(f => f.getValue!=null) + if (d.isDefined) + s.setDescription(d.get.getValue) + } + + if (r.getSubject!= null && !r.getSubject.isEmpty) + s.setSubject(r.getSubject.asScala.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue)).asJava) + + if (r.getPublisher!= null) + s.setPublisher(List(r.getPublisher.getValue).asJava) + + s.setRelatedDatasets(0) + s.setRelatedPublications(0) + 
s.setRelatedUnknown(0) + + s + } + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/create_summaries_params.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/create_summaries_params.json new file mode 100644 index 000000000..8bfdde5b0 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/create_summaries_params.json @@ -0,0 +1,5 @@ +[ + {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, + {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true}, + {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the raw graph", "paramRequired": true} +] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step1/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step1/oozie_app/workflow.xml index 4045e2dfb..492b21489 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step1/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step1/oozie_app/workflow.xml @@ -10,7 +10,7 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -64,9 +64,38 @@ --workingPath${targetPath}/resolved/ --entityPath${targetPath}/dedup + + + + + + + + yarn + cluster + Convert Entities to summaries + eu.dnetlib.dhp.sx.graph.SparkCreateSummaryObject + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.shuffle.partitions=5000 + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --masteryarn + --sourcePath${targetPath}/dedup + --targetPath${targetPath}/provision/summaries + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/ScholexplorerParserTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/ScholexplorerParserTest.java deleted file mode 100644 index 67226a031..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/ScholexplorerParserTest.java +++ /dev/null @@ -1,63 +0,0 @@ - -package eu.dnetlib.dhp.sx.graph; - -import java.util.List; - -import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.SerializationFeature; - -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.sx.graph.parser.DatasetScholexplorerParser; -import eu.dnetlib.dhp.sx.graph.parser.PublicationScholexplorerParser; -import eu.dnetlib.scholexplorer.relation.RelationMapper; - -public class ScholexplorerParserTest { - - @Test - public void testDataciteParser() throws Exception { - String xml = IOUtils.toString(this.getClass().getResourceAsStream("dmf.xml")); - - DatasetScholexplorerParser p = new DatasetScholexplorerParser(); - List oaves = p.parseObject(xml, RelationMapper.load()); - - ObjectMapper m = new ObjectMapper(); - 
m.enable(SerializationFeature.INDENT_OUTPUT); - - oaves - .forEach( - oaf -> { - try { - System.out.println(m.writeValueAsString(oaf)); - System.out.println("----------------------------"); - } catch (JsonProcessingException e) { - - } - }); - } - - @Test - public void testPublicationParser() throws Exception { - String xml = IOUtils.toString(this.getClass().getResourceAsStream("pmf.xml")); - - PublicationScholexplorerParser p = new PublicationScholexplorerParser(); - List oaves = p.parseObject(xml, RelationMapper.load()); - - ObjectMapper m = new ObjectMapper(); - m.enable(SerializationFeature.INDENT_OUTPUT); - - oaves - .forEach( - oaf -> { - try { - System.out.println(m.writeValueAsString(oaf)); - System.out.println("----------------------------"); - } catch (JsonProcessingException e) { - - } - }); - } -} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerAggregationTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerAggregationTest.scala deleted file mode 100644 index 278af2a16..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerAggregationTest.scala +++ /dev/null @@ -1,54 +0,0 @@ -package eu.dnetlib.dhp.sx.graph - -import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature} -import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication -import eu.dnetlib.dhp.sx.graph.ebi.EBIAggregator -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} -import org.junit.jupiter.api.Assertions._ -import org.junit.jupiter.api.Test - -import scala.io.Source - -class SparkScholexplorerAggregationTest { - - - @Test - def testFunderRelationshipsMapping(): Unit = { - val publications = Source.fromInputStream(getClass.getResourceAsStream("publication.json")).mkString - - var s: List[DLIPublication] = List[DLIPublication]() - - val m: ObjectMapper = new ObjectMapper() - - m.enable(SerializationFeature.INDENT_OUTPUT) - - for (line <- publications.lines) { - s = m.readValue(line, classOf[DLIPublication]) :: s - - - } - - - implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication] - val spark: SparkSession = SparkSession.builder().appName("Test").master("local[*]").config("spark.driver.bindAddress", "127.0.0.1").getOrCreate() - - - val ds: Dataset[DLIPublication] = spark.createDataset(spark.sparkContext.parallelize(s)).as[DLIPublication] - - val unique = ds.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, pubEncoder)) - .groupByKey(_._1)(Encoders.STRING) - .agg(EBIAggregator.getDLIPublicationAggregator().toColumn) - .map(p => p._2) - - val uniquePubs: DLIPublication = unique.first() - - s.foreach(pp => assertFalse(pp.getAuthor.isEmpty)) - - - assertNotNull(uniquePubs.getAuthor) - assertFalse(uniquePubs.getAuthor.isEmpty) - - - } - -} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporterTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporterTest.java deleted file mode 100644 index ce00466df..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporterTest.java +++ /dev/null @@ -1,6 +0,0 @@ - -package eu.dnetlib.dhp.sx.graph; - -public class SparkScholexplorerGraphImporterTest { - -} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerMergeEntitiesJobTest.java 
b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerMergeEntitiesJobTest.java
deleted file mode 100644
index 348a2b030..000000000
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerMergeEntitiesJobTest.java
+++ /dev/null
@@ -1,5 +0,0 @@
-
-package eu.dnetlib.dhp.sx.graph;
-
-public class SparkScholexplorerMergeEntitiesJobTest {
-}
diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala
index 0e79420df..b71b7f054 100644
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala
@@ -8,10 +8,9 @@ import eu.dnetlib.dhp.common.PacePerson
 import eu.dnetlib.dhp.schema.action.AtomicAction
 import eu.dnetlib.dhp.schema.common.ModelConstants
 import eu.dnetlib.dhp.schema.oaf.{Author, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty}
-import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication}
 import eu.dnetlib.dhp.utils.DHPUtils
 import org.apache.commons.lang3.StringUtils
-import eu.dnetlib.dhp.schema.scholexplorer.OafUtils._
+
 
 import scala.collection.JavaConverters._
 
diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkIndexCollectionOnES.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkIndexCollectionOnES.java
index 1b435bea3..f1eb3992d 100644
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkIndexCollectionOnES.java
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkIndexCollectionOnES.java
@@ -15,7 +15,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 
-public class SparkIndexCollectionOnES {
+public class
+SparkIndexCollectionOnES {
 
 	public static void main(String[] args) throws Exception {
 
diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml
index ec8f9268c..cfdf36573 100644
--- a/dhp-workflows/pom.xml
+++ b/dhp-workflows/pom.xml
@@ -24,8 +24,8 @@
         <module>dhp-dedup-openaire</module>
         <module>dhp-enrichment</module>
         <module>dhp-graph-provision</module>
-        <module>dhp-dedup-scholexplorer</module>
-        <module>dhp-graph-provision-scholexplorer</module>
+
+
         <module>dhp-blacklist</module>
         <module>dhp-stats-update</module>
         <module>dhp-stats-promote</module>
diff --git a/pom.xml b/pom.xml
index 9b6d43ff9..9396eeeb6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -736,7 +736,7 @@
         3.3.3
         3.4.2
         [2.12,3.0)
-        [2.6.13]
+        [2.6.14]
         [4.0.3]
         [6.0.5]
         [3.1.6]
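
Note (not part of the patch): the new workflow step invokes eu.dnetlib.dhp.sx.graph.SparkCreateSummaryObject with the --master, --sourcePath and --targetPath parameters declared in create_summaries_params.json, and the mapping it relies on is the new ScholixUtils.resultToSummary. The following is a minimal, hypothetical Scala sketch of how such a job could wire these pieces together; the object name CreateSummarySketch, the assumption that the dedup graph is stored as kryo-serialized Result records readable via spark.read.load, and the exact ArgumentApplicationParser usage are illustrative assumptions, not code taken from the patched sources.

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Result
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
import org.apache.commons.io.IOUtils
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}

// Hypothetical sketch of a summary-creation job; the real SparkCreateSummaryObject
// added by this patch may differ in structure and error handling.
object CreateSummarySketch {

  def main(args: Array[String]): Unit = {
    // parse the parameters declared in create_summaries_params.json (master, sourcePath, targetPath)
    val parser = new ArgumentApplicationParser(
      IOUtils.toString(
        getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_summaries_params.json"),
        "UTF-8"))
    parser.parseArgument(args)

    val spark: SparkSession = SparkSession
      .builder()
      .appName("Convert Entities to summaries")
      .master(parser.get("master"))
      .getOrCreate()

    implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]

    // assumption: the dedup graph under sourcePath is readable as a Spark dataset of Result objects
    val results: Dataset[Result] = spark.read.load(parser.get("sourcePath")).as[Result]

    // map every result to its ScholixSummary and write it under targetPath,
    // mirroring the --sourcePath/--targetPath wiring of the workflow step above
    results
      .map(r => ScholixUtils.resultToSummary(r))
      .write
      .mode(SaveMode.Overwrite)
      .save(parser.get("targetPath"))
  }
}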