From 46631a44217917f8a23b4c019d7f73641e255e67 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 24 Jun 2020 14:06:38 +0200 Subject: [PATCH 01/65] updated mapping scholexplorer to OAF --- .../java/eu/dnetlib/dhp/export/DLIToOAF.scala | 16 +++++++++++++++- .../export/SparkExportContentForOpenAire.scala | 11 +++++++---- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala index 637362acf0..967834d694 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala @@ -5,7 +5,7 @@ import java.time.format.DateTimeFormatter import eu.dnetlib.dhp.common.PacePerson import eu.dnetlib.dhp.schema.action.AtomicAction -import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, StructuredProperty} +import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty} import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIRelation} import eu.dnetlib.dhp.utils.DHPUtils import org.apache.commons.lang3.StringUtils @@ -99,6 +99,20 @@ object DLIToOAF { ) + def fixInstance(r:Publication) :Publication = { + val collectedFrom = r.getCollectedfrom.asScala.head + r.getInstance().asScala.foreach(i => i.setCollectedfrom(collectedFrom)) + r + } + + + def fixInstanceDataset(r:Dataset) :Dataset = { + val collectedFrom = r.getCollectedfrom.asScala.head + r.getInstance().asScala.foreach(i => i.setCollectedfrom(collectedFrom)) + r + } + + def toActionSet(item: Oaf): (String, String) = { val mapper = new ObjectMapper() diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala index edf951df4d..fd8f2d136d 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala @@ -1,7 +1,7 @@ package eu.dnetlib.dhp.`export` import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset} +import eu.dnetlib.dhp.schema.oaf.{Instance, Publication, Relation, Dataset => OafDataset} import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIRelation} import org.apache.commons.io.IOUtils import org.apache.hadoop.io.Text @@ -166,10 +166,13 @@ object SparkExportContentForOpenAire { }).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationAS") - val fRels:Dataset[(String,String)] = spark.read.load(s"$workingPath/relationAS").as[Relation].map(DLIToOAF.toActionSet) - val fpubs:Dataset[(String,String)] = spark.read.load(s"$workingPath/publicationAS").as[Publication].map(DLIToOAF.toActionSet) - val fdats:Dataset[(String,String)] = spark.read.load(s"$workingPath/datasetAS").as[OafDataset].map(DLIToOAF.toActionSet) + spark.read.load(s"$workingPath/publicationAS").as[Publication].map(DLIToOAF.fixInstance).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationAS_fixed") + spark.read.load(s"$workingPath/datasetAS").as[OafDataset].map(DLIToOAF.fixInstanceDataset).write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetAS_fixed") + + val fRels:Dataset[(String,String)] = spark.read.load(s"$workingPath/relationAS").as[Relation].map(DLIToOAF.toActionSet) + val fpubs:Dataset[(String,String)] = spark.read.load(s"$workingPath/publicationAS_fixed").as[Publication].map(DLIToOAF.toActionSet) + val fdats:Dataset[(String,String)] = spark.read.load(s"$workingPath/datasetAS_fixed").as[OafDataset].map(DLIToOAF.toActionSet) fRels.union(fpubs).union(fdats).rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingPath/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec]) } From 1d420eedb4135c72ecef00383bff7df5fe5c9aec Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 2 Jul 2020 12:37:43 +0200 Subject: [PATCH 02/65] added generation of EBI Dataset --- .../eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala | 89 +++++++++++++++++++ .../dhp/sx/ebi/SparkCreateEBIDataFrame.scala | 83 +++++++++++++++++ .../parser/AbstractScholexplorerParser.java | 13 +++ .../parser/DatasetScholexplorerParser.java | 46 +++++++++- .../PublicationScholexplorerParser.java | 1 + .../dnetlib/dhp/sx/ebi/ebi_to_df_params.json | 4 + .../dhp/sx/ebi/oozie_app/config-default.xml | 38 ++++++++ .../dnetlib/dhp/sx/ebi/oozie_app/workflow.xml | 51 +++++++++++ .../eu/dnetlib/dhp/sx/ebi/TestEBITODS.scala | 22 +++++ 9 files changed, 346 insertions(+), 1 deletion(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkCreateEBIDataFrame.scala create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/workflow.xml create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/ebi/TestEBITODS.scala diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala new file mode 100644 index 0000000000..41fcd26362 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala @@ -0,0 +1,89 @@ +package eu.dnetlib.dhp.sx.ebi +import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset} +import org.apache.spark.sql.{Encoder, Encoders} +import org.apache.spark.sql.expressions.Aggregator + + + +object EBIAggregator { + + def getDatasetAggregator(): Aggregator[(String, OafDataset), OafDataset, OafDataset] = new Aggregator[(String, OafDataset), OafDataset, OafDataset]{ + + override def zero: OafDataset = new OafDataset() + + override def reduce(b: OafDataset, a: (String, OafDataset)): OafDataset = { + b.mergeFrom(a._2) + if (b.getId == null) + b.setId(a._2.getId) + b + } + + + override def merge(wx: OafDataset, wy: OafDataset): OafDataset = { + wx.mergeFrom(wy) + if(wx.getId == null && wy.getId.nonEmpty) + wx.setId(wy.getId) + wx + } + override def finish(reduction: OafDataset): OafDataset = reduction + + override def bufferEncoder: Encoder[OafDataset] = + Encoders.kryo(classOf[OafDataset]) + + override def outputEncoder: Encoder[OafDataset] = + Encoders.kryo(classOf[OafDataset]) + } + + + def getPublicationAggregator(): Aggregator[(String, Publication), Publication, Publication] = new Aggregator[(String, Publication), Publication, Publication]{ + + override def zero: Publication = new Publication() + + override def reduce(b: Publication, a: (String, Publication)): Publication = { + b.mergeFrom(a._2) + if (b.getId == null) + b.setId(a._2.getId) + b + } + + + override def merge(wx: Publication, wy: Publication): Publication = { + wx.mergeFrom(wy) + if(wx.getId == null && wy.getId.nonEmpty) + wx.setId(wy.getId) + wx + } + override def finish(reduction: Publication): Publication = reduction + + override def bufferEncoder: Encoder[Publication] = + Encoders.kryo(classOf[Publication]) + + override def outputEncoder: Encoder[Publication] = + Encoders.kryo(classOf[Publication]) + } + + + def getRelationAggregator(): Aggregator[(String, Relation), Relation, Relation] = new Aggregator[(String, Relation), Relation, Relation]{ + + override def zero: Relation = new Relation() + + override def reduce(b: Relation, a: (String, Relation)): Relation = { + a._2 + } + + + override def merge(a: Relation, b: Relation): Relation = { + if(b!= null) b else a + } + override def finish(reduction: Relation): Relation = reduction + + override def bufferEncoder: Encoder[Relation] = + Encoders.kryo(classOf[Relation]) + + override def outputEncoder: Encoder[Relation] = + Encoders.kryo(classOf[Relation]) + } + + + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkCreateEBIDataFrame.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkCreateEBIDataFrame.scala new file mode 100644 index 0000000000..c7cc3ed048 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkCreateEBIDataFrame.scala @@ -0,0 +1,83 @@ +package eu.dnetlib.dhp.sx.ebi + +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Dataset => OafDataset} +import eu.dnetlib.dhp.sx.graph.parser.{DatasetScholexplorerParser, PublicationScholexplorerParser} +import eu.dnetlib.scholexplorer.relation.RelationMapper +import org.apache.commons.io.IOUtils +import org.apache.spark.SparkConf +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig} +import org.slf4j.{Logger, LoggerFactory} +import scala.collection.JavaConverters._ + +object SparkCreateEBIDataFrame { + + + def main(args: Array[String]): Unit = { + val logger: Logger = LoggerFactory.getLogger(SparkCreateEBIDataFrame.getClass) + val conf: SparkConf = new SparkConf() + val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json"))) + parser.parseArgument(args) + val spark: SparkSession = + SparkSession + .builder() + .config(conf) + .appName(SparkCreateEBIDataFrame.getClass.getSimpleName) + .master(parser.get("master")).getOrCreate() + + val sc = spark.sparkContext + + + val workingPath = parser.get("workingPath") + val relationMapper = RelationMapper.load + + implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) + implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset]) + implicit val pubEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication]) + implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) + + logger.info("Extract Publication and relation from publication_xml") + val oafPubsRDD:RDD[Oaf] = sc.textFile(s"$workingPath/publication_xml").map(s => + { + new ObjectMapper().readValue(s, classOf[String]) + }).flatMap(s => { + val d = new PublicationScholexplorerParser + d.parseObject(s, relationMapper).asScala.iterator}) + + val mapper = new ObjectMapper() + mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) + spark.createDataset(oafPubsRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/oaf") + + logger.info("Extract Publication and relation from dataset_xml") + val oafDatsRDD:RDD[Oaf] = sc.textFile(s"$workingPath/_dataset_xml").map(s => + { + new ObjectMapper().readValue(s, classOf[String]) + }).flatMap(s => { + val d = new DatasetScholexplorerParser + d.parseObject(s, relationMapper).asScala.iterator}) + + spark.createDataset(oafDatsRDD).write.mode(SaveMode.Append).save(s"$workingPath/oaf") + val dataset: Dataset[OafDataset] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[OafDataset]).map(d => d.asInstanceOf[OafDataset]) + val publication: Dataset[Publication] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[Publication]).map(d => d.asInstanceOf[Publication]) + val relations: Dataset[Relation] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[Relation]).map(d => d.asInstanceOf[Relation]) + publication.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, pubEncoder)) + .groupByKey(_._1)(Encoders.STRING) + .agg(EBIAggregator.getPublicationAggregator().toColumn) + .map(p => p._2) + .write.mode(SaveMode.Overwrite).save(s"$workingPath/publication") + + dataset.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, datasetEncoder)) + .groupByKey(_._1)(Encoders.STRING) + .agg(EBIAggregator.getDatasetAggregator().toColumn) + .map(p => p._2) + .write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset") + + relations.map(d => (s"${d.getSource}::${d.getRelType}::${d.getTarget}", d))(Encoders.tuple(Encoders.STRING, relEncoder)) + .groupByKey(_._1)(Encoders.STRING) + .agg(EBIAggregator.getRelationAggregator().toColumn) + .map(p => p._2) + .write.mode(SaveMode.Overwrite).save(s"$workingPath/relation") + } +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java index 0db2b26884..f0c7893239 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java @@ -150,6 +150,19 @@ public abstract class AbstractScholexplorerParser { return uk; } + + protected Qualifier generateQualifier(final String classId, final String className, final String schemeId, final String schemeName) { + final Qualifier q = new Qualifier(); + q.setClassid(classId); + q.setClassid(className); + q.setSchemeid(schemeId); + q.setSchemename(schemeName); + return q; + + + + } + protected void generateRelations( RelationMapper relationMapper, Result parsedObject, diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java index afba57bb8a..c5c6b1493f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java @@ -64,7 +64,6 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser { currentDate.setQualifier(dateQualifier); parsedObject.setRelevantdate(Collections.singletonList(currentDate)); } - final String completionStatus = VtdUtilityParser .getSingleValue(ap, vn, "//*[local-name()='completionStatus']"); final String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']"); @@ -149,6 +148,42 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser { inferPid(currentPid); parsedObject.setPid(Collections.singletonList(currentPid)); + + String resolvedURL = null; + + switch (currentPid.getQualifier().getClassname().toLowerCase()) { + case "uniprot": + resolvedURL ="https://www.uniprot.org/uniprot/"+currentPid.getValue(); + break; + case "ena": + if (StringUtils.isNotBlank(currentPid.getValue()) && currentPid.getValue().length() > 7) + resolvedURL ="https://www.ebi.ac.uk/ena/data/view/"+currentPid.getValue().substring(0,8); + break; + case "chembl": + resolvedURL ="https://www.ebi.ac.uk/chembl/compound_report_card/"+currentPid.getValue(); + break; + + case "ncbi-n": + resolvedURL ="https://www.ncbi.nlm.nih.gov/nuccore/"+currentPid.getValue(); + break; + case "ncbi-p": + resolvedURL ="https://www.ncbi.nlm.nih.gov/nuccore/"+currentPid.getValue(); + break; + case "genbank": + resolvedURL ="https://www.ncbi.nlm.nih.gov/nuccore/"+currentPid.getValue(); + break; + case "pdb": + resolvedURL ="https://www.ncbi.nlm.nih.gov/nuccore/"+currentPid.getValue(); + break; + case "url": + resolvedURL =currentPid.getValue(); + break; + } + + + + + final String sourceId = generateId( currentPid.getValue(), currentPid.getQualifier().getClassid(), "dataset"); parsedObject.setId(sourceId); @@ -251,6 +286,7 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser { t -> { final StructuredProperty st = new StructuredProperty(); st.setValue(t); + st.setQualifier(generateQualifier( "main title","main title", "dnet:dataCite_title","dnet:dataCite_title")); return st; }) .collect(Collectors.toList())); @@ -282,6 +318,14 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser { .collect(Collectors.toList())); } + + if(StringUtils.isNotBlank(resolvedURL)) { + Instance i = new Instance(); + i.setCollectedfrom(parsedObject.getCollectedfrom().get(0)); + i.setUrl(Collections.singletonList(resolvedURL)); + parsedObject.setInstance(Collections.singletonList(i)); + } + result.add(parsedObject); return result; } catch (Throwable e) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java index bf59a6f0ef..b66bfd161a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java @@ -202,6 +202,7 @@ public class PublicationScholexplorerParser extends AbstractScholexplorerParser t -> { final StructuredProperty st = new StructuredProperty(); st.setValue(t); + st.setQualifier(generateQualifier( "main title","main title", "dnet:dataCite_title","dnet:dataCite_title")); return st; }) .collect(Collectors.toList())); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json new file mode 100644 index 0000000000..366f1426e4 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json @@ -0,0 +1,4 @@ +[ + {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, + {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true} +] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/config-default.xml new file mode 100644 index 0000000000..cf617a84c4 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/config-default.xml @@ -0,0 +1,38 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 + + + spark2EventLogDir + /user/spark/spark2ApplicationHistory + + + spark2ExtraListeners + "com.cloudera.spark.lineage.NavigatorAppListener" + + + spark2SqlQueryExecutionListeners + "com.cloudera.spark.lineage.NavigatorQueryListener" + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/workflow.xml new file mode 100644 index 0000000000..b816696747 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/workflow.xml @@ -0,0 +1,51 @@ + + + + workingPath + the Working Path + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + yarn-cluster + cluster + Create EBI DataSet + eu.dnetlib.dhp.sx.ebi.SparkCreateEBIDataFrame + dhp-doiboost-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=1000 + ${sparkExtraOPT} + + --workingPath${workingPath} + --masteryarn-cluster + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/ebi/TestEBITODS.scala b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/ebi/TestEBITODS.scala new file mode 100644 index 0000000000..979bf7e14c --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/ebi/TestEBITODS.scala @@ -0,0 +1,22 @@ +package eu.dnetlib.dhp.sx.ebi + +import org.junit.jupiter.api.Test + +class TestEBITODS { + + + @Test + def testEBI():Unit = { + + + + + + + + + } + + + +} From c3d67f709a2589573330d85c298d33f17370d12c Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 2 Jul 2020 17:35:22 +0200 Subject: [PATCH 03/65] adjusted dedup configuration for result entities: using new wordssuffixprefix clustering function, removed ngrampairs, adjusted queueMaxSize (800) and slidingWindowSize (80) --- .../test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java | 6 +++--- .../resources/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json | 7 +++---- .../resources/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json | 7 +++---- .../resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json | 7 +++---- .../resources/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json | 7 +++---- pom.xml | 2 +- 6 files changed, 16 insertions(+), 20 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java index 8dd00be979..88d5f24f90 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java @@ -182,7 +182,7 @@ public class SparkDedupTest implements Serializable { .count(); assertEquals(3432, orgs_simrel); - assertEquals(7152, pubs_simrel); + assertEquals(7054, pubs_simrel); assertEquals(344, sw_simrel); assertEquals(458, ds_simrel); assertEquals(6750, orp_simrel); @@ -234,7 +234,7 @@ public class SparkDedupTest implements Serializable { .count(); assertEquals(1276, orgs_mergerel); - assertEquals(1442, pubs_mergerel); + assertEquals(1440, pubs_mergerel); assertEquals(288, sw_mergerel); assertEquals(472, ds_mergerel); assertEquals(718, orp_mergerel); @@ -423,7 +423,7 @@ public class SparkDedupTest implements Serializable { long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count(); - assertEquals(4975, relations); + assertEquals(4971, relations); // check deletedbyinference final Dataset mergeRels = spark diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json index 13b18e1c35..2469b2cc03 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json @@ -6,10 +6,10 @@ "subEntityType" : "resulttype", "subEntityValue" : "dataset", "orderField" : "title", - "queueMaxSize" : "2000", + "queueMaxSize" : "800", "groupMaxSize" : "100", "maxChildren" : "100", - "slidingWindowSize" : "200", + "slidingWindowSize" : "80", "rootBuilder" : ["result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ], "includeChildren" : "true", "idPath" : "$.id", @@ -17,8 +17,7 @@ }, "pace" : { "clustering" : [ - { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, + { "name" : "wordssuffixprefix", "fields" : [ "title" ], "params" : { "max" : "2", "len" : "3" } }, { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } ], "decisionTree" : { diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json index 5fb2a171ac..4adcc0439f 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json @@ -6,10 +6,10 @@ "subEntityType" : "resulttype", "subEntityValue" : "otherresearchproduct", "orderField" : "title", - "queueMaxSize" : "2000", + "queueMaxSize" : "800", "groupMaxSize" : "100", "maxChildren" : "100", - "slidingWindowSize" : "200", + "slidingWindowSize" : "80", "rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ], "includeChildren" : "true", "idPath" : "$.id", @@ -17,8 +17,7 @@ }, "pace" : { "clustering" : [ - { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, + { "name" : "wordssuffixprefix", "fields" : [ "title" ], "params" : { "max" : "2", "len" : "3" } }, { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } ], "decisionTree" : { diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json index d471ccb89d..ef0b26af41 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json @@ -6,10 +6,10 @@ "subEntityType": "resulttype", "subEntityValue": "publication", "orderField": "title", - "queueMaxSize": "2000", + "queueMaxSize": "800", "groupMaxSize": "100", "maxChildren": "100", - "slidingWindowSize": "200", + "slidingWindowSize": "80", "rootBuilder": [ "result", "resultProject_outcome_isProducedBy", @@ -29,8 +29,7 @@ }, "pace": { "clustering" : [ - { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, + { "name" : "wordssuffixprefix", "fields" : [ "title" ], "params" : { "max" : "2", "len" : "3" } }, { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } ], "decisionTree": { diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json index f4a107c745..623abbf9f4 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json @@ -6,10 +6,10 @@ "subEntityType" : "resulttype", "subEntityValue" : "software", "orderField" : "title", - "queueMaxSize" : "2000", + "queueMaxSize" : "800", "groupMaxSize" : "100", "maxChildren" : "100", - "slidingWindowSize" : "200", + "slidingWindowSize" : "80", "rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ], "includeChildren" : "true", "idPath" : "$.id", @@ -17,8 +17,7 @@ }, "pace" : { "clustering" : [ - { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, + { "name" : "wordssuffixprefix", "fields" : [ "title" ], "params" : { "max" : "2", "len" : "3" } }, { "name" : "lowercase", "fields" : [ "doi", "url" ], "params" : { } } ], "decisionTree": { diff --git a/pom.xml b/pom.xml index 89b7e8829c..4619f31749 100644 --- a/pom.xml +++ b/pom.xml @@ -315,7 +315,7 @@ eu.dnetlib dnet-pace-core - 4.0.1 + 4.0.2 eu.dnetlib From b383ed42fa309b143fd5fba08e8297468c182b4b Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 7 Jul 2020 14:21:28 +0200 Subject: [PATCH 04/65] pass optional parameter relationFilter to the PrepareRelationJob implementation --- .../java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java | 1 + .../resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml | 1 + 2 files changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index 601cf64499..eb63d4423d 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -156,6 +156,7 @@ public class PrepareRelationsJob { .parquet(outputPath); } + // experimental private static void prepareRelationsDataset( SparkSession spark, String inputRelationsPath, String outputPath, Set relationFilter, int maxRelations, int relPartitions) { diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index 697a00a09b..faa81ad644 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -134,6 +134,7 @@ --inputRelationsPath${inputGraphRootPath}/relation --outputPath${workingDir}/relation --maxRelations${maxRelations} + --relationFilter${relationFilter} --relPartitions5000 From 8af8e7481ad7762d7c0b83a7c9bfe487a322b103 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 7 Jul 2020 14:23:34 +0200 Subject: [PATCH 05/65] code formatting --- .../relatedProjects/EnrichMoreProject.java | 4 +- .../dhp/broker/oa/util/ConversionUtils.java | 55 ++++++++++++++----- 2 files changed, 44 insertions(+), 15 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java index c38fd81a67..85086a6df8 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java @@ -25,7 +25,9 @@ public class EnrichMoreProject extends UpdateMatcher { protected List findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) { - if (target.getProjects().size() >= BrokerConstants.MAX_LIST_SIZE) { return new ArrayList<>(); } + if (target.getProjects().size() >= BrokerConstants.MAX_LIST_SIZE) { + return new ArrayList<>(); + } final Set existingProjects = target .getProjects() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java index 3a2cdc5f34..d00c5b817d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java @@ -44,7 +44,9 @@ public class ConversionUtils { private static final Logger log = LoggerFactory.getLogger(ConversionUtils.class); public static List oafInstanceToBrokerInstances(final Instance i) { - if (i == null) { return new ArrayList<>(); } + if (i == null) { + return new ArrayList<>(); + } return mappedList(i.getUrl(), url -> { final OaBrokerInstance res = new OaBrokerInstance(); @@ -65,7 +67,9 @@ public class ConversionUtils { } public static final OaBrokerRelatedDataset oafDatasetToBrokerDataset(final Dataset d) { - if (d == null) { return null; } + if (d == null) { + return null; + } final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset(); res.setOpenaireId(d.getId()); @@ -78,7 +82,9 @@ public class ConversionUtils { } public static OaBrokerRelatedPublication oafPublicationToBrokerPublication(final Publication p) { - if (p == null) { return null; } + if (p == null) { + return null; + } final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication(); res.setOpenaireId(p.getId()); @@ -92,7 +98,9 @@ public class ConversionUtils { } public static final OaBrokerMainEntity oafResultToBrokerResult(final Result result) { - if (result == null) { return null; } + if (result == null) { + return null; + } final OaBrokerMainEntity res = new OaBrokerMainEntity(); @@ -109,7 +117,8 @@ public class ConversionUtils { res.setEmbargoenddate(fieldValue(result.getEmbargoenddate())); res.setContributor(fieldList(result.getContributor())); res - .setJournal(result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null); + .setJournal( + result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null); res.setCollectedFromId(mappedFirst(result.getCollectedfrom(), KeyValue::getKey)); res.setCollectedFromName(mappedFirst(result.getCollectedfrom(), KeyValue::getValue)); res.setPids(mappedList(result.getPid(), ConversionUtils::oafPidToBrokerPid)); @@ -121,7 +130,9 @@ public class ConversionUtils { } private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) { - if (author == null) { return null; } + if (author == null) { + return null; + } final String pids = author.getPid() != null ? author .getPid() @@ -145,7 +156,9 @@ public class ConversionUtils { } private static OaBrokerJournal oafJournalToBrokerJournal(final Journal journal) { - if (journal == null) { return null; } + if (journal == null) { + return null; + } final OaBrokerJournal res = new OaBrokerJournal(); res.setName(journal.getName()); @@ -157,7 +170,9 @@ public class ConversionUtils { } private static OaBrokerExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) { - if (ref == null) { return null; } + if (ref == null) { + return null; + } final OaBrokerExternalReference res = new OaBrokerExternalReference(); res.setRefidentifier(ref.getRefidentifier()); @@ -168,7 +183,9 @@ public class ConversionUtils { } public static final OaBrokerProject oafProjectToBrokerProject(final Project p) { - if (p == null) { return null; } + if (p == null) { + return null; + } final OaBrokerProject res = new OaBrokerProject(); res.setOpenaireId(p.getId()); @@ -192,7 +209,9 @@ public class ConversionUtils { } public static final OaBrokerRelatedSoftware oafSoftwareToBrokerSoftware(final Software sw) { - if (sw == null) { return null; } + if (sw == null) { + return null; + } final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware(); res.setOpenaireId(sw.getId()); @@ -255,7 +274,9 @@ public class ConversionUtils { } private static List structPropTypedList(final List list) { - if (list == null) { return new ArrayList<>(); } + if (list == null) { + return new ArrayList<>(); + } return list .stream() @@ -265,7 +286,9 @@ public class ConversionUtils { } private static List mappedList(final List list, final Function func) { - if (list == null) { return new ArrayList<>(); } + if (list == null) { + return new ArrayList<>(); + } return list .stream() @@ -276,7 +299,9 @@ public class ConversionUtils { } private static List flatMappedList(final List list, final Function> func) { - if (list == null) { return new ArrayList<>(); } + if (list == null) { + return new ArrayList<>(); + } return list .stream() @@ -288,7 +313,9 @@ public class ConversionUtils { } private static T mappedFirst(final List list, final Function func) { - if (list == null) { return null; } + if (list == null) { + return null; + } return list .stream() From efadbdb2bc33d4df424d5f9699ea3507a04a05af Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 7 Jul 2020 15:37:13 +0200 Subject: [PATCH 06/65] fixed a bug with duplicated events --- .../dhp/broker/oa/CheckDuplictedIdsJob.java | 112 ++++++++++++++++++ .../dhp/broker/oa/GenerateEventsJob.java | 24 +--- .../dnetlib/dhp/broker/oa/IndexOnESJob.java | 2 +- .../relatedProjects/EnrichMoreProject.java | 4 +- .../dhp/broker/oa/util/ClusterUtils.java | 26 +++- .../dhp/broker/oa/util/ConversionUtils.java | 55 ++++++--- .../dhp/broker/oa/util/EventFinder.java | 1 - .../oa/generate_all/oozie_app/workflow.xml | 2 +- .../broker/oa/partial/oozie_app/workflow.xml | 13 +- 9 files changed, 192 insertions(+), 47 deletions(-) create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/CheckDuplictedIdsJob.java diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/CheckDuplictedIdsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/CheckDuplictedIdsJob.java new file mode 100644 index 0000000000..5ca865e8fc --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/CheckDuplictedIdsJob.java @@ -0,0 +1,112 @@ + +package eu.dnetlib.dhp.broker.oa; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.TypedColumn; +import org.apache.spark.sql.expressions.Aggregator; +import org.apache.spark.util.LongAccumulator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.model.Event; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import scala.Tuple2; + +public class CheckDuplictedIdsJob { + + private static final Logger log = LoggerFactory.getLogger(CheckDuplictedIdsJob.class); + + public static void main(final String[] args) throws Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + CheckDuplictedIdsJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + parser.parseArgument(args); + + final SparkConf conf = new SparkConf(); + + final String eventsPath = parser.get("workingPath") + "/events"; + log.info("eventsPath: {}", eventsPath); + + final String countPath = parser.get("workingPath") + "/counts"; + log.info("countPath: {}", countPath); + + final SparkSession spark = SparkSession.builder().config(conf).getOrCreate(); + + final LongAccumulator total = spark.sparkContext().longAccumulator("invaild_event_id"); + + final TypedColumn, Tuple2> agg = new CountAggregator().toColumn(); + + ClusterUtils + .readPath(spark, eventsPath, Event.class) + .map(e -> new Tuple2<>(e.getEventId(), 1l), Encoders.tuple(Encoders.STRING(), Encoders.LONG())) + .groupByKey(t -> t._1, Encoders.STRING()) + .agg(agg) + .map(t -> t._2, Encoders.tuple(Encoders.STRING(), Encoders.LONG())) + .filter(t -> t._2 > 1) + .map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG())) + .write() + .mode(SaveMode.Overwrite) + .json(countPath); + ; + + } + + private static String eventAsJsonString(final Event f) throws JsonProcessingException { + return new ObjectMapper().writeValueAsString(f); + } + +} + +class CountAggregator extends Aggregator, Tuple2, Tuple2> { + + /** + * + */ + private static final long serialVersionUID = 1395935985734672538L; + + @Override + public Encoder> bufferEncoder() { + return Encoders.tuple(Encoders.STRING(), Encoders.LONG()); + } + + @Override + public Tuple2 finish(final Tuple2 arg0) { + return arg0; + } + + @Override + public Tuple2 merge(final Tuple2 arg0, final Tuple2 arg1) { + final String s = StringUtils.defaultIfBlank(arg0._1, arg1._1); + return new Tuple2<>(s, arg0._2 + arg1._2); + } + + @Override + public Encoder> outputEncoder() { + return Encoders.tuple(Encoders.STRING(), Encoders.LONG()); + } + + @Override + public Tuple2 reduce(final Tuple2 arg0, final Tuple2 arg1) { + final String s = StringUtils.defaultIfBlank(arg0._1, arg1._1); + return new Tuple2<>(s, arg0._2 + arg1._2); + } + + @Override + public Tuple2 zero() { + return new Tuple2<>(null, 0l); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java index 5d3121aedc..a2d92e149e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java @@ -3,8 +3,6 @@ package eu.dnetlib.dhp.broker.oa; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import java.util.Arrays; -import java.util.HashSet; import java.util.Map; import java.util.Optional; import java.util.Set; @@ -63,13 +61,13 @@ public class GenerateEventsJob { final String eventsPath = workingPath + "/events"; log.info("eventsPath: {}", eventsPath); - final Set dsIdWhitelist = parseParamAsList(parser, "datasourceIdWhitelist"); + final Set dsIdWhitelist = ClusterUtils.parseParamAsList(parser, "datasourceIdWhitelist"); log.info("datasourceIdWhitelist: {}", StringUtils.join(dsIdWhitelist, ",")); - final Set dsTypeWhitelist = parseParamAsList(parser, "datasourceTypeWhitelist"); + final Set dsTypeWhitelist = ClusterUtils.parseParamAsList(parser, "datasourceTypeWhitelist"); log.info("datasourceTypeWhitelist: {}", StringUtils.join(dsTypeWhitelist, ",")); - final Set dsIdBlacklist = parseParamAsList(parser, "datasourceIdBlacklist"); + final Set dsIdBlacklist = ClusterUtils.parseParamAsList(parser, "datasourceIdBlacklist"); log.info("datasourceIdBlacklist: {}", StringUtils.join(dsIdBlacklist, ",")); final SparkConf conf = new SparkConf(); @@ -103,22 +101,6 @@ public class GenerateEventsJob { } - private static Set parseParamAsList(final ArgumentApplicationParser parser, final String key) { - final String s = parser.get(key).trim(); - - final Set res = new HashSet<>(); - - if (s.length() > 1) { // A value of a single char (for example: '-') indicates an empty list - Arrays - .stream(s.split(",")) - .map(String::trim) - .filter(StringUtils::isNotBlank) - .forEach(res::add); - } - - return res; - } - public static Map prepareAccumulators(final SparkContext sc) { return EventFinder diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexOnESJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexOnESJob.java index 9124d18e39..806147bdd5 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexOnESJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexOnESJob.java @@ -48,7 +48,7 @@ public class IndexOnESJob { final JavaRDD inputRdd = ClusterUtils .readPath(spark, eventsPath, Event.class) - .limit(10000) // TODO REMOVE + // .limit(10000) // TODO REMOVE .map(IndexOnESJob::eventAsJsonString, Encoders.STRING()) .javaRDD(); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java index c38fd81a67..85086a6df8 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java @@ -25,7 +25,9 @@ public class EnrichMoreProject extends UpdateMatcher { protected List findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) { - if (target.getProjects().size() >= BrokerConstants.MAX_LIST_SIZE) { return new ArrayList<>(); } + if (target.getProjects().size() >= BrokerConstants.MAX_LIST_SIZE) { + return new ArrayList<>(); + } final Set existingProjects = target .getProjects() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java index 2d0106a7a8..d8b8dd8079 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java @@ -1,6 +1,11 @@ package eu.dnetlib.dhp.broker.oa.util; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import org.apache.commons.lang3.StringUtils; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -10,6 +15,7 @@ import org.apache.spark.util.LongAccumulator; import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; public class ClusterUtils { @@ -53,7 +59,9 @@ public class ClusterUtils { return o; } - public static void save(final Dataset dataset, final String path, final Class clazz, + public static void save(final Dataset dataset, + final String path, + final Class clazz, final LongAccumulator acc) { dataset .map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz)) @@ -62,4 +70,20 @@ public class ClusterUtils { .json(path); } + public static Set parseParamAsList(final ArgumentApplicationParser parser, final String key) { + final String s = parser.get(key).trim(); + + final Set res = new HashSet<>(); + + if (s.length() > 1) { // A value of a single char (for example: '-') indicates an empty list + Arrays + .stream(s.split(",")) + .map(String::trim) + .filter(StringUtils::isNotBlank) + .forEach(res::add); + } + + return res; + } + } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java index 3a2cdc5f34..d00c5b817d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java @@ -44,7 +44,9 @@ public class ConversionUtils { private static final Logger log = LoggerFactory.getLogger(ConversionUtils.class); public static List oafInstanceToBrokerInstances(final Instance i) { - if (i == null) { return new ArrayList<>(); } + if (i == null) { + return new ArrayList<>(); + } return mappedList(i.getUrl(), url -> { final OaBrokerInstance res = new OaBrokerInstance(); @@ -65,7 +67,9 @@ public class ConversionUtils { } public static final OaBrokerRelatedDataset oafDatasetToBrokerDataset(final Dataset d) { - if (d == null) { return null; } + if (d == null) { + return null; + } final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset(); res.setOpenaireId(d.getId()); @@ -78,7 +82,9 @@ public class ConversionUtils { } public static OaBrokerRelatedPublication oafPublicationToBrokerPublication(final Publication p) { - if (p == null) { return null; } + if (p == null) { + return null; + } final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication(); res.setOpenaireId(p.getId()); @@ -92,7 +98,9 @@ public class ConversionUtils { } public static final OaBrokerMainEntity oafResultToBrokerResult(final Result result) { - if (result == null) { return null; } + if (result == null) { + return null; + } final OaBrokerMainEntity res = new OaBrokerMainEntity(); @@ -109,7 +117,8 @@ public class ConversionUtils { res.setEmbargoenddate(fieldValue(result.getEmbargoenddate())); res.setContributor(fieldList(result.getContributor())); res - .setJournal(result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null); + .setJournal( + result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null); res.setCollectedFromId(mappedFirst(result.getCollectedfrom(), KeyValue::getKey)); res.setCollectedFromName(mappedFirst(result.getCollectedfrom(), KeyValue::getValue)); res.setPids(mappedList(result.getPid(), ConversionUtils::oafPidToBrokerPid)); @@ -121,7 +130,9 @@ public class ConversionUtils { } private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) { - if (author == null) { return null; } + if (author == null) { + return null; + } final String pids = author.getPid() != null ? author .getPid() @@ -145,7 +156,9 @@ public class ConversionUtils { } private static OaBrokerJournal oafJournalToBrokerJournal(final Journal journal) { - if (journal == null) { return null; } + if (journal == null) { + return null; + } final OaBrokerJournal res = new OaBrokerJournal(); res.setName(journal.getName()); @@ -157,7 +170,9 @@ public class ConversionUtils { } private static OaBrokerExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) { - if (ref == null) { return null; } + if (ref == null) { + return null; + } final OaBrokerExternalReference res = new OaBrokerExternalReference(); res.setRefidentifier(ref.getRefidentifier()); @@ -168,7 +183,9 @@ public class ConversionUtils { } public static final OaBrokerProject oafProjectToBrokerProject(final Project p) { - if (p == null) { return null; } + if (p == null) { + return null; + } final OaBrokerProject res = new OaBrokerProject(); res.setOpenaireId(p.getId()); @@ -192,7 +209,9 @@ public class ConversionUtils { } public static final OaBrokerRelatedSoftware oafSoftwareToBrokerSoftware(final Software sw) { - if (sw == null) { return null; } + if (sw == null) { + return null; + } final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware(); res.setOpenaireId(sw.getId()); @@ -255,7 +274,9 @@ public class ConversionUtils { } private static List structPropTypedList(final List list) { - if (list == null) { return new ArrayList<>(); } + if (list == null) { + return new ArrayList<>(); + } return list .stream() @@ -265,7 +286,9 @@ public class ConversionUtils { } private static List mappedList(final List list, final Function func) { - if (list == null) { return new ArrayList<>(); } + if (list == null) { + return new ArrayList<>(); + } return list .stream() @@ -276,7 +299,9 @@ public class ConversionUtils { } private static List flatMappedList(final List list, final Function> func) { - if (list == null) { return new ArrayList<>(); } + if (list == null) { + return new ArrayList<>(); + } return list .stream() @@ -288,7 +313,9 @@ public class ConversionUtils { } private static T mappedFirst(final List list, final Function func) { - if (list == null) { return null; } + if (list == null) { + return null; + } return list .stream() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java index e7abae68b0..593e66d434 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java @@ -70,7 +70,6 @@ public class EventFinder { matchers.add(new EnrichMissingDatasetReferences()); matchers.add(new EnrichMissingDatasetIsSupplementedTo()); matchers.add(new EnrichMissingDatasetIsSupplementedBy()); - matchers.add(new EnrichMissingAbstract()); } public static EventGroup generateEvents(final ResultGroup results, diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml index a0c7b00dbd..b85c60fdf7 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml @@ -447,7 +447,7 @@ --executor-memory=${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} - --conf spark.dynamicAllocation.maxExecutors="2" + --conf spark.dynamicAllocation.maxExecutors="8" --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml index b382904481..392271260e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml @@ -73,33 +73,32 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + yarn cluster - IndexOnESJob - eu.dnetlib.dhp.broker.oa.IndexOnESJob + Count + eu.dnetlib.dhp.broker.oa.CheckDuplictedIdsJob dhp-broker-events-${projectVersion}.jar + --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} - --conf spark.dynamicAllocation.maxExecutors="2" --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 + --graphPath${graphInputPath} --workingPath${workingPath} - --index${esIndexName} - --esHost${esIndexHost} From e2ea30f89d7125770433120a60e8e262e1b4c654 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 8 Jul 2020 12:16:24 +0200 Subject: [PATCH 07/65] updated graph construction workflow definition: cleaning wf moved at the bottom to include cleaning of the information produced by the enrichment workflows --- .../dhp/wf/profiles/graph_construction.xml | 159 +++++++++++------- 1 file changed, 99 insertions(+), 60 deletions(-) diff --git a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_construction.xml b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_construction.xml index 819b3e12d1..4d77883b49 100644 --- a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_construction.xml +++ b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_construction.xml @@ -11,6 +11,29 @@ Data Provision 30 + + + reuse cached content from the aggregation system + + reuseContent + true + + + + + + + + set the aggregator content path + + contentPath + /tmp/beta_aggregator + + + + + + Set the path containing the AGGREGATOR graph @@ -62,87 +85,94 @@ - - Set the target path to store the CLEANED graph + + Set the target path to store the ORCID enriched graph - cleanedGraphPath - /tmp/beta_provision/graph/05_graph_cleaned + orcidGraphPath + /tmp/beta_provision/graph/05_graph_orcid - - Set the target path to store the ORCID enriched graph - - orcidGraphPath - /tmp/beta_provision/graph/06_graph_orcid - - - - - Set the target path to store the BULK TAGGED graph bulkTaggingGraphPath - /tmp/beta_provision/graph/07_graph_bulktagging + /tmp/beta_provision/graph/06_graph_bulktagging + Set the target path to store the AFFILIATION from INSTITUTIONAL REPOS graph affiliationGraphPath - /tmp/beta_provision/graph/08_graph_affiliation + /tmp/beta_provision/graph/07_graph_affiliation + Set the target path to store the COMMUNITY from SELECTED SOURCES graph communityOrganizationGraphPath - /tmp/beta_provision/graph/09_graph_comunity_organization + /tmp/beta_provision/graph/08_graph_comunity_organization + Set the target path to store the FUNDING from SEMANTIC RELATION graph fundingGraphPath - /tmp/beta_provision/graph/10_graph_funding + /tmp/beta_provision/graph/09_graph_funding + Set the target path to store the COMMUNITY from SEMANTIC RELATION graph communitySemRelGraphPath - /tmp/beta_provision/graph/11_graph_comunity_sem_rel + /tmp/beta_provision/graph/10_graph_comunity_sem_rel + Set the target path to store the COUNTRY enriched graph countryGraphPath - /tmp/beta_provision/graph/12_graph_country + /tmp/beta_provision/graph/11_graph_country + + + Set the target path to store the CLEANED graph + + cleanedGraphPath + /tmp/beta_provision/graph/12_graph_cleaned + + + + + + Set the target path to store the blacklisted graph @@ -153,6 +183,7 @@ + Set the lookup address @@ -163,6 +194,7 @@ + Set the map of paths for the Bulk Tagging @@ -173,6 +205,7 @@ + Set the map of associations organization, community list for the propagation of community to result through organization @@ -185,6 +218,7 @@ + Set the dedup orchestrator name @@ -195,6 +229,7 @@ + declares the ActionSet ids to promote in the RAW graph @@ -205,6 +240,7 @@ + declares the ActionSet ids to promote in the INFERRED graph @@ -215,6 +251,7 @@ + wait configurations @@ -222,6 +259,7 @@ + create the AGGREGATOR graph @@ -230,7 +268,9 @@ { 'graphOutputPath' : 'aggregatorGraphPath', - 'isLookupUrl' : 'isLookUpUrl' + 'isLookupUrl' : 'isLookUpUrl', + 'reuseContent' : 'reuseContent', + 'contentPath' : 'contentPath' } @@ -241,8 +281,6 @@ 'postgresURL' : 'jdbc:postgresql://beta.services.openaire.eu:5432/dnet_openaireplus', 'postgresUser' : 'dnet', 'postgresPassword' : '', - 'reuseContent' : 'false', - 'contentPath' : '/tmp/beta_provision/aggregator', 'workingDir' : '/tmp/beta_provision/working_dir/aggregator' } @@ -252,6 +290,7 @@ + create the RAW graph @@ -289,6 +328,7 @@ + search for duplicates in the raw graph @@ -314,6 +354,7 @@ + create the INFERRED graph @@ -351,6 +392,7 @@ + mark duplicates as deleted and redistribute the relationships @@ -375,41 +417,6 @@ - - clean the properties in the graph typed as Qualifier according to the vocabulary indicated in schemeid - - executeOozieJob - IIS - - { - 'graphInputPath' : 'consistentGraphPath', - 'graphOutputPath': 'cleanedGraphPath', - 'isLookupUrl': 'isLookUpUrl' - } - - - { - 'oozie.wf.application.path' : '/lib/dnet/oa/graph/clean/oozie_app', - 'workingPath' : '/tmp/beta_provision/working_dir/clean' - } - - build-report - - - - - - - - Do we skip the graph enrichment steps? (Yes to prepare the graph for the IIS) - - NO - - - - - - propagates ORCID among results linked by allowedsemrels semantic relationships @@ -417,7 +424,7 @@ IIS { - 'sourcePath' : 'cleanedGraphPath', + 'sourcePath' : 'consistentGraphPath', 'outputPath': 'orcidGraphPath' } @@ -435,6 +442,7 @@ + mark results respecting some rules as belonging to communities @@ -460,6 +468,7 @@ + creates relashionships between results and organizations when the organizations are associated to institutional repositories @@ -484,6 +493,7 @@ + marks as belonging to communities the result collected from datasources related to the organizations specified in the organizationCommunityMap @@ -509,6 +519,7 @@ + created relation between projects and results linked to other results trough allowedsemrel semantic relations linked to projects @@ -534,6 +545,7 @@ + tag as belonging to communitites result in in allowedsemrels relation with other result already linked to communities @@ -560,6 +572,7 @@ + associated to results colleced from allowedtypes and those in the whithelist the country of the organization(s) handling the datasource it is collected from @@ -584,10 +597,36 @@ build-report + + + + + + + clean the properties in the graph typed as Qualifier according to the vocabulary indicated in schemeid + + executeOozieJob + IIS + + { + 'graphInputPath' : 'countryGraphPath', + 'graphOutputPath': 'cleanedGraphPath', + 'isLookupUrl': 'isLookUpUrl' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/graph/clean/oozie_app', + 'workingPath' : '/tmp/beta_provision/working_dir/clean' + } + + build-report + + removes blacklisted relations @@ -595,7 +634,7 @@ IIS { - 'sourcePath' : 'countryGraphPath', + 'sourcePath' : 'cleanedGraphPath', 'outputPath': 'blacklistedGraphPath' } From 610d377d57f43613b4f3a9167a5a82b3825f0fe1 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 8 Jul 2020 16:54:26 +0200 Subject: [PATCH 08/65] first implementation of the BETA & PROD graphs merge procedure --- .../oa/graph/merge/MergeGraphSparkJob.java | 130 +++ .../graph/merge/oozie_app/config-default.xml | 18 + .../dhp/oa/graph/merge/oozie_app/workflow.xml | 282 +++++++ .../dhp/oa/graph/merge_graphs_parameters.json | 32 + .../wf/profiles/graph_beta_construction.xml | 779 ++++++++++++++++++ ...uction.xml => graph_core_construction.xml} | 2 +- 6 files changed, 1242 insertions(+), 1 deletion(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge_graphs_parameters.json create mode 100644 dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_beta_construction.xml rename dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/{graph_construction.xml => graph_core_construction.xml} (99%) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java new file mode 100644 index 0000000000..d33b9e8468 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java @@ -0,0 +1,130 @@ +package eu.dnetlib.dhp.oa.graph.merge; + +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Tuple2; + +import java.util.Objects; +import java.util.Optional; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +/** + * Combines the content from two aggregator graph tables of the same type, entities (or relationships) with the same ids + * are picked preferring those from the BETA aggregator rather then from PROD. The identity of a relationship is defined + * by eu.dnetlib.dhp.schema.common.ModelSupport#idFn() + */ +public class MergeGraphSparkJob { + + private static final Logger log = LoggerFactory.getLogger(CleanGraphSparkJob.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + CleanGraphSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/merge_graphs_parameters.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String betaInputPath = parser.get("betaInputPath"); + log.info("betaInputPath: {}", betaInputPath); + + String prodInputPath = parser.get("prodInputPath"); + log.info("prodInputPath: {}", prodInputPath); + + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + String graphTableClassName = parser.get("graphTableClassName"); + log.info("graphTableClassName: {}", graphTableClassName); + + Class entityClazz = (Class) Class.forName(graphTableClassName); + + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + mergeGraphTable(spark, betaInputPath, prodInputPath, entityClazz, entityClazz, outputPath); + }); + } + + private static

void mergeGraphTable( + SparkSession spark, + String betaInputPath, + String prodInputPath, + Class

p_clazz, + Class b_clazz, + String outputPath) { + + Dataset> beta = readTableFromPath(spark, betaInputPath, b_clazz); + Dataset> prod = readTableFromPath(spark, prodInputPath, p_clazz); + + prod.joinWith(beta, prod.col("_1").equalTo(beta.col("_1")), "full_outer") + .map((MapFunction, Tuple2>, P>) value -> { + Optional

p = Optional.ofNullable(value._1()).map(Tuple2::_2); + Optional b = Optional.ofNullable(value._2()).map(Tuple2::_2); + if (p.isPresent() & !b.isPresent()) { + return p.get(); + } + if (b.isPresent()) { + return (P) b.get(); + } + return null; + }, Encoders.bean(p_clazz)) + .filter((FilterFunction

) Objects::nonNull) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath); + } + + private static Dataset> readTableFromPath( + SparkSession spark, String inputEntityPath, Class clazz) { + + log.info("Reading Graph table from: {}", inputEntityPath); + return spark + .read() + .textFile(inputEntityPath) + .map( + (MapFunction>) value -> { + final T t = OBJECT_MAPPER.readValue(value, clazz); + final String id = ModelSupport.idFn().apply(t); + return new Tuple2<>(id, t); + }, + Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); + } + + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/config-default.xml new file mode 100644 index 0000000000..2e0ed9aeea --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml new file mode 100644 index 0000000000..0a512fb6ab --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml @@ -0,0 +1,282 @@ + + + + + betaInputGgraphPath + the beta graph root path + + + prodInputGgraphPath + the production graph root path + + + graphOutputPath + the output merged graph root path + + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + yarn + cluster + Merge publications + eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --betaInputPath${betaInputGgraphPath}/publication + --prodInputPath${prodInputGgraphPath}/publication + --outputPath${graphOutputPath}/publication + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication + + + + + + + + yarn + cluster + Merge datasets + eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --betaInputPath${betaInputGgraphPath}/dataset + --prodInputPath${prodInputGgraphPath}/dataset + --outputPath${graphOutputPath}/dataset + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset + + + + + + + + yarn + cluster + Merge otherresearchproducts + eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --betaInputPath${betaInputGgraphPath}/otherresearchproduct + --prodInputPath${prodInputGgraphPath}/otherresearchproduct + --outputPath${graphOutputPath}/otherresearchproduct + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + + + + + + + + yarn + cluster + Merge softwares + eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --betaInputPath${betaInputGgraphPath}/software + --prodInputPath${prodInputGgraphPath}/software + --outputPath${graphOutputPath}/software + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software + + + + + + + + yarn + cluster + Merge datasources + eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --betaInputPath${betaInputGgraphPath}/datasource + --prodInputPath${prodInputGgraphPath}/datasource + --outputPath${graphOutputPath}/datasource + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Datasource + + + + + + + + yarn + cluster + Merge organizations + eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --betaInputPath${betaInputGgraphPath}/organization + --prodInputPath${prodInputGgraphPath}/organization + --outputPath${graphOutputPath}/organization + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Organization + + + + + + + + yarn + cluster + Merge projects + eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --betaInputPath${betaInputGgraphPath}/project + --prodInputPath${prodInputGgraphPath}/project + --outputPath${graphOutputPath}/project + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Project + + + + + + + + yarn + cluster + Merge relations + eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + + --betaInputPath${betaInputGgraphPath}/relation + --prodInputPath${prodInputGgraphPath}/relation + --outputPath${graphOutputPath}/relation + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Relation + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge_graphs_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge_graphs_parameters.json new file mode 100644 index 0000000000..6018b7e93c --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge_graphs_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "bin", + "paramLongName": "betaInputPath", + "paramDescription": "the beta graph root path", + "paramRequired": true + }, + { + "paramName": "pin", + "paramLongName": "prodInputPath", + "paramDescription": "the production graph root path", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the output merged graph root path", + "paramRequired": true + }, + { + "paramName": "class", + "paramLongName": "graphTableClassName", + "paramDescription": "class name moelling the graph table", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_beta_construction.xml b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_beta_construction.xml new file mode 100644 index 0000000000..08ed24cd07 --- /dev/null +++ b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_beta_construction.xml @@ -0,0 +1,779 @@ + +

+ + + + + +
+ + Graph Construction [HYBRID] + Data Provision + 30 + + + + reuse cached content from the PROD aggregation system + + reuseProdContent + true + + + + + + + + set the PROD aggregator content path + + prodContentPath + /tmp/core_aggregator + + + + + + + + Set the path containing the PROD AGGREGATOR graph + + prodAggregatorGraphPath + /tmp/core_provision/graph/00_prod_graph_aggregator + + + + + + + + reuse cached content from the BETA aggregation system + + reuseBetaContent + true + + + + + + + + set the BETA aggregator content path + + betaContentPath + /tmp/beta_aggregator + + + + + + + + Set the path containing the BETA AGGREGATOR graph + + betaAggregatorGraphPath + /tmp/core_provision/graph/00_beta_graph_aggregator + + + + + + + + Set the IS lookup service address + + isLookUpUrl + http://services.openaire.eu:8280/is/services/isLookUp?wsdl + + + + + + + + Set the target path to store the MERGED graph + + mergedGraphPath + /tmp/core_provision/graph/01_graph_merged + + + + + + + + Set the target path to store the RAW graph + + rawGraphPath + /tmp/core_provision/graph/02_graph_raw + + + + + + + + Set the target path to store the DEDUPED graph + + dedupGraphPath + /tmp/core_provision/graph/03_graph_dedup + + + + + + + + Set the target path to store the INFERRED graph + + inferredGraphPath + /tmp/core_provision/graph/04_graph_inferred + + + + + + + + Set the target path to store the CONSISTENCY graph + + consistentGraphPath + /tmp/core_provision/graph/05_graph_consistent + + + + + + + + Set the target path to store the ORCID enriched graph + + orcidGraphPath + /tmp/core_provision/graph/06_graph_orcid + + + + + + + + Set the target path to store the BULK TAGGED graph + + bulkTaggingGraphPath + /tmp/core_provision/graph/07_graph_bulktagging + + + + + + + + Set the target path to store the AFFILIATION from INSTITUTIONAL REPOS graph + + affiliationGraphPath + /tmp/core_provision/graph/08_graph_affiliation + + + + + + + + Set the target path to store the COMMUNITY from SELECTED SOURCES graph + + communityOrganizationGraphPath + /tmp/core_provision/graph/09_graph_comunity_organization + + + + + + + + Set the target path to store the FUNDING from SEMANTIC RELATION graph + + fundingGraphPath + /tmp/core_provision/graph/10_graph_funding + + + + + + + + Set the target path to store the COMMUNITY from SEMANTIC RELATION graph + + communitySemRelGraphPath + /tmp/core_provision/graph/11_graph_comunity_sem_rel + + + + + + + + Set the target path to store the COUNTRY enriched graph + + countryGraphPath + /tmp/core_provision/graph/12_graph_country + + + + + + + + Set the target path to store the CLEANED graph + + cleanedGraphPath + /tmp/core_provision/graph/13_graph_cleaned + + + + + + + + Set the target path to store the blacklisted graph + + blacklistedGraphPath + /tmp/core_provision/graph/14_graph_blacklisted + + + + + + + + Set the map of paths for the Bulk Tagging + + bulkTaggingPathMap + {"author" : "$['author'][*]['fullname']", "title" : "$['title'][*]['value']", "orcid" : "$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']", "contributor" : "$['contributor'][*]['value']", "description" : "$['description'][*]['value']"} + + + + + + + + Set the map of associations organization, community list for the propagation of community to result through organization + + propagationOrganizationCommunityMap + {"20|corda__h2020::3fb05a9524c3f790391261347852f638":["mes","euromarine"], "20|corda__h2020::e8dbe14cca9bf6fce09d468872f813f8":["mes","euromarine"], "20|snsf________::9b253f265e3bef5cae6d881fdf61aceb":["mes","euromarine"],"20|rcuk________::e054eea0a47665af8c3656b5785ccf76":["mes","euromarine"],"20|corda__h2020::edc18d67c9b11fb616ca9f6e1db1b151":["mes","euromarine"],"20|rcuk________::d5736d9da90521ddcdc7828a05a85e9a":["mes","euromarine"],"20|corda__h2020::f5d418d3aa1cf817ddefcc3fdc039f27":["mes","euromarine"],"20|snsf________::8fa091f8f25a846779acb4ea97b50aef":["mes","euromarine"],"20|corda__h2020::81e020977211c2c40fae2e1a50bffd71":["mes","euromarine"],"20|corda_______::81e020977211c2c40fae2e1a50bffd71":["mes","euromarine"],"20|snsf________::31d0a100e54e3cdb3c6f52d91e638c78":["mes","euromarine"],"20|corda__h2020::ea379ef91b8cc86f9ac5edc4169292db":["mes","euromarine"],"20|corda__h2020::f75ee2ee48e5cb0ec8c8d30aaa8fef70":["mes","euromarine"],"20|rcuk________::e16010089551a1a9182a94604fc0ea59":["mes","euromarine"],"20|corda__h2020::38531a2cce7c5c347ffc439b07c1f43b":["mes","euromarine"],"20|corda_______::38531a2cce7c5c347ffc439b07c1f43b":["mes","euromarine"],"20|grid________::b2cbbf5eadbbf87d534b022bad3191d7":["mes","euromarine"],"20|snsf________::74730ef1439d7f7636a8be58a6b471b8":["mes","euromarine"],"20|nsf_________::ad72e19043a5a467e35f9b444d11563e":["mes","euromarine"],"20|rcuk________::0fc3e92500290902a2d38ec2445e74c3":["mes","euromarine"],"20|grid________::ad2c29905da0eb3c06b3fa80cacd89ea":["mes","euromarine"],"20|corda__h2020::30b53e4d63d3724f00acb9cbaca40860":["mes","euromarine"],"20|corda__h2020::f60f84bee14ad93f0db0e49af1d5c317":["mes","euromarine"], "20|corda__h2020::7bf251ac3765b5e89d82270a1763d09f":["mes","euromarine"], "20|corda__h2020::65531bd11be9935948c7f2f4db1c1832":["mes","euromarine"], "20|corda__h2020::e0e98f86bbc76638bbb72a8fe2302946":["mes","euromarine"], "20|snsf________::3eb43582ac27601459a8d8b3e195724b":["mes","euromarine"], "20|corda__h2020::af2481dab65d06c8ea0ae02b5517b9b6":["mes","euromarine"], "20|corda__h2020::c19d05cfde69a50d3ebc89bd0ee49929":["mes","euromarine"], "20|corda__h2020::af0bfd9fc09f80d9488f56d71a9832f0":["mes","euromarine"], "20|rcuk________::f33c02afb0dc66c49d0ed97ca5dd5cb0":["beopen"], + "20|grid________::a867f78acdc5041b34acfe4f9a349157":["beopen"], "20|grid________::7bb116a1a9f95ab812bf9d2dea2be1ff":["beopen"], "20|corda__h2020::6ab0e0739dbe625b99a2ae45842164ad":["beopen"], "20|corda__h2020::8ba50792bc5f4d51d79fca47d860c602":["beopen"], "20|corda_______::8ba50792bc5f4d51d79fca47d860c602":["beopen"], "20|corda__h2020::e70e9114979e963eef24666657b807c3":["beopen"], "20|corda_______::e70e9114979e963eef24666657b807c3":["beopen"], "20|corda_______::15911e01e9744d57205825d77c218737":["beopen"], "20|opendoar____::056a41e24e2a9a67215e87bbee6a80ab":["beopen"], "20|opendoar____::7f67f2e6c6fbb0628f8160fcd3d92ae3":["beopen"], "20|grid________::a8ecfd7c084e561168bcbe6bf0daf3e3":["beopen"], "20|corda_______::7bbe6cc5d8ec1864739a04b0d020c9e9":["beopen"], "20|corda_______::3ff558e30c2e434d688539548300b050":["beopen"], "20|corda__h2020::5ffee5b3b83b33a8cf0e046877bd3a39":["beopen"], "20|corda__h2020::5187217e2e806a6df3579c46f82401bc":["beopen"], "20|grid________::5fa7e2709bcd945e26bfa18689adeec1":["beopen"], "20|corda_______::d8696683c53027438031a96ad27c3c07":["beopen"], "20|corda__h2020::d8696683c53027438031a96ad27c3c07":["beopen"], "20|rcuk________::23a79ebdfa59790864e4a485881568c1":["beopen"], "20|corda__h2020::b76cf8fe49590a966953c37e18608af9":["beopen"], "20|grid________::d2f0204126ee709244a488a4cd3b91c2":["beopen"], "20|corda__h2020::05aba9d2ed17533d15221e5655ac11e6":["beopen"], "20|grid________::802401579481dc32062bdee69f5e6a34":["beopen"], "20|corda__h2020::3f6d9d54cac975a517ba6b252c81582d":["beopen"]} + + + + + + + + + Set the dedup orchestrator name + + dedupConfig + decisiontree-dedup-test + + + + + + + + declares the ActionSet ids to promote in the RAW graph + + actionSetIdsRawGraph + scholexplorer-dump,gridac-dump,doiboost-organizations,doiboost,orcidworks-no-doi,iis-wos-entities,iis-entities-software,iis-entities-patent + + + + + + + + declares the ActionSet ids to promote in the INFERRED graph + + actionSetIdsIISGraph + iis-researchinitiative,iis-document-citations,iis-document-affiliation,iis-document-classes,iis-document-similarities,iis-referenced-datasets-main,iis-referenced-datasets-preprocessing,iis-referenced-projects-main,iis-referenced-projects-preprocessing,iis-referenceextraction-pdb,document_software_url,iis-extracted-metadata,iis-communities,iis-referenced-patents,iis-covid-19 + + + + + + + + wait configurations + + + + + + + + + + create the AGGREGATOR graph + + executeOozieJob + IIS + + { + 'graphOutputPath' : 'betaAggregatorGraphPath', + 'isLookupUrl' : 'isLookUpUrl', + 'reuseContent' : 'reuseBetaContent', + 'contentPath' : 'betaContentPath' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/graph/raw_all/oozie_app', + 'mongoURL' : 'mongodb://beta.services.openaire.eu', + 'mongoDb' : 'mdstore', + 'postgresURL' : 'jdbc:postgresql://beta.services.openaire.eu:5432/dnet_openaireplus', + 'postgresUser' : 'dnet', + 'postgresPassword' : '', + 'workingDir' : '/tmp/core_provision/working_dir/beta_aggregator' + } + + build-report + + + + + + + + create the AGGREGATOR graph + + executeOozieJob + IIS + + { + 'graphOutputPath' : 'prodAggregatorGraphPath', + 'isLookupUrl' : 'isLookUpUrl', + 'reuseContent' : 'reuseProdContent', + 'contentPath' : 'prodContentPath' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/graph/raw_all/oozie_app', + 'mongoURL' : 'mongodb://services.openaire.eu', + 'mongoDb' : 'mdstore', + 'postgresURL' : 'jdbc:postgresql://postgresql.services.openaire.eu:5432/dnet_openaireplus', + 'postgresUser' : 'dnet', + 'postgresPassword' : '', + 'workingDir' : '/tmp/core_provision/working_dir/prod_aggregator' + } + + build-report + + + + + + + + wait configurations + + + + + + + + create the AGGREGATOR graph + + executeOozieJob + IIS + + { + 'betaInputGgraphPath' : 'betaAggregatorGraphPath', + 'prodInputGgraphPath' : 'prodAggregatorGraphPath', + 'graphOutputPath' : 'mergedGraphPath' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/graph/merge/oozie_app', + 'workingDir' : '/tmp/core_provision/working_dir/merge_graph' + } + + build-report + + + + + + + + create the RAW graph + + executeOozieJob + IIS + + { + 'inputActionSetIds' : 'actionSetIdsRawGraph', + 'inputGraphRootPath' : 'mergedGraphPath', + 'outputGraphRootPath' : 'rawGraphPath', + 'isLookupUrl' : 'isLookUpUrl' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/actionmanager/wf/main/oozie_app', + 'sparkExecutorCores' : '3', + 'sparkExecutorMemory' : '10G', + 'activePromoteDatasetActionPayload' : 'true', + 'activePromoteDatasourceActionPayload' : 'true', + 'activePromoteOrganizationActionPayload' : 'true', + 'activePromoteOtherResearchProductActionPayload' : 'true', + 'activePromoteProjectActionPayload' : 'true', + 'activePromotePublicationActionPayload' : 'true', + 'activePromoteRelationActionPayload' : 'true', + 'activePromoteResultActionPayload' : 'true', + 'activePromoteSoftwareActionPayload' : 'true', + 'mergeAndGetStrategy' : 'MERGE_FROM_AND_GET', + 'workingDir' : '/tmp/core_provision/working_dir/promoteActionsRaw' + } + + build-report + + + + + + + + search for duplicates in the raw graph + + executeOozieJob + IIS + + { + 'actionSetId' : 'dedupConfig', + 'graphBasePath' : 'rawGraphPath', + 'dedupGraphPath': 'dedupGraphPath', + 'isLookUpUrl' : 'isLookUpUrl' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/dedup/scan/oozie_app', + 'workingPath' : '/tmp/core_provision/working_dir/dedup' + } + + build-report + + + + + + + + create the INFERRED graph + + executeOozieJob + IIS + + { + 'inputActionSetIds' : 'actionSetIdsIISGraph', + 'inputGraphRootPath' : 'dedupGraphPath', + 'outputGraphRootPath' : 'inferredGraphPath', + 'isLookupUrl' : 'isLookUpUrl' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/actionmanager/wf/main/oozie_app', + 'sparkExecutorCores' : '3', + 'sparkExecutorMemory' : '10G', + 'activePromoteDatasetActionPayload' : 'true', + 'activePromoteDatasourceActionPayload' : 'true', + 'activePromoteOrganizationActionPayload' : 'true', + 'activePromoteOtherResearchProductActionPayload' : 'true', + 'activePromoteProjectActionPayload' : 'true', + 'activePromotePublicationActionPayload' : 'true', + 'activePromoteRelationActionPayload' : 'true', + 'activePromoteResultActionPayload' : 'true', + 'activePromoteSoftwareActionPayload' : 'true', + 'mergeAndGetStrategy' : 'MERGE_FROM_AND_GET', + 'workingDir' : '/tmp/core_provision/working_dir/promoteActionsIIS' + } + + build-report + + + + + + + + mark duplicates as deleted and redistribute the relationships + + executeOozieJob + IIS + + { + 'graphBasePath' : 'inferredGraphPath', + 'dedupGraphPath': 'consistentGraphPath' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/dedup/consistency/oozie_app', + 'workingPath' : '/tmp/core_provision/working_dir/dedup' + } + + build-report + + + + + + + + propagates ORCID among results linked by allowedsemrels semantic relationships + + executeOozieJob + IIS + + { + 'sourcePath' : 'consistentGraphPath', + 'outputPath': 'orcidGraphPath' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/enrichment/orcidtoresultfromsemrel/oozie_app', + 'workingDir' : '/tmp/core_provision/working_dir/orcid', + 'allowedsemrels' : 'isSupplementedBy;isSupplementTo', + 'saveGraph' : 'true' + } + + build-report + + + + + + + + mark results respecting some rules as belonging to communities + + executeOozieJob + IIS + + { + 'sourcePath' : 'orcidGraphPath', + 'outputPath': 'bulkTaggingGraphPath', + 'isLookUpUrl' : 'isLookUpUrl', + 'pathMap' : 'bulkTaggingPathMap' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/enrichment/bulktag/oozie_app', + 'workingDir' : '/tmp/core_provision/working_dir/bulktag' + } + + build-report + + + + + + + + creates relashionships between results and organizations when the organizations are associated to institutional repositories + + executeOozieJob + IIS + + { + 'sourcePath' : 'bulkTaggingGraphPath', + 'outputPath': 'affiliationGraphPath' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/enrichment/affiliation/oozie_app', + 'workingDir' : '/tmp/core_provision/working_dir/affiliation', + 'saveGraph' : 'true' + } + + build-report + + + + + + + + marks as belonging to communities the result collected from datasources related to the organizations specified in the organizationCommunityMap + + executeOozieJob + IIS + + { + 'sourcePath' : 'affiliationGraphPath', + 'outputPath': 'communityOrganizationGraphPath', + 'organizationtoresultcommunitymap': 'propagationOrganizationCommunityMap' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/enrichment/community_organization/oozie_app', + 'workingDir' : '/tmp/core_provision/working_dir/community_organization', + 'saveGraph' : 'true' + } + + build-report + + + + + + + + created relation between projects and results linked to other results trough allowedsemrel semantic relations linked to projects + + executeOozieJob + IIS + + { + 'sourcePath' : 'communityOrganizationGraphPath', + 'outputPath': 'fundingGraphPath' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/enrichment/funding/oozie_app', + 'workingDir' : '/tmp/core_provision/working_dir/funding', + 'allowedsemrels' : 'isSupplementedBy;isSupplementTo', + 'saveGraph' : 'true' + } + + build-report + + + + + + + + tag as belonging to communitites result in in allowedsemrels relation with other result already linked to communities + + executeOozieJob + IIS + + { + 'sourcePath' : 'fundingGraphPath', + 'outputPath': 'communitySemRelGraphPath', + 'isLookUpUrl' : 'isLookUpUrl' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/enrichment/community_semrel/oozie_app', + 'workingDir' : '/tmp/core_provision/working_dir/community_semrel', + 'allowedsemrels' : 'isSupplementedBy;isSupplementTo', + 'saveGraph' : 'true' + } + + build-report + + + + + + + + associated to results colleced from allowedtypes and those in the whithelist the country of the organization(s) handling the datasource it is collected from + + executeOozieJob + IIS + + { + 'sourcePath' : 'communitySemRelGraphPath', + 'outputPath': 'countryGraphPath' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/enrichment/country/oozie_app', + 'sparkExecutorCores' : '3', + 'sparkExecutorMemory' : '10G', + 'workingDir' : '/tmp/core_provision/working_dir/country', + 'allowedtypes' : 'pubsrepository::institutional', + 'whitelist' : '10|opendoar____::300891a62162b960cf02ce3827bb363c', + 'saveGraph' : 'true' + } + + build-report + + + + + + + + clean the properties in the graph typed as Qualifier according to the vocabulary indicated in schemeid + + executeOozieJob + IIS + + { + 'graphInputPath' : 'countryGraphPath', + 'graphOutputPath': 'cleanedGraphPath', + 'isLookupUrl': 'isLookUpUrl' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/graph/clean/oozie_app', + 'workingPath' : '/tmp/core_provision/working_dir/clean' + } + + build-report + + + + + + + + removes blacklisted relations + + executeOozieJob + IIS + + { + 'sourcePath' : 'cleanedGraphPath', + 'outputPath': 'blacklistedGraphPath' + } + + + { + 'oozie.wf.application.path' : '/lib/dnet/oa/enrichment/blacklist/oozie_app', + 'workingDir' : '/tmp/core_provision/working_dir/blacklist', + 'postgresURL' : 'jdbc:postgresql://beta.services.openaire.eu:5432/dnet_openaireplus', + 'postgresUser' : 'dnet', + 'postgresPassword' : '' + } + + build-report + + + + + + + + + wf_20200615_163630_609 + 2020-06-15T17:08:00+00:00 + SUCCESS + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_construction.xml b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_core_construction.xml similarity index 99% rename from dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_construction.xml rename to dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_core_construction.xml index 4d77883b49..3bce81c50a 100644 --- a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_construction.xml +++ b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_core_construction.xml @@ -413,7 +413,7 @@ build-report
- +
From 853e8d79875ca0d713f4c64e8b589a18e8ca2fd5 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Wed, 8 Jul 2020 17:03:53 +0200 Subject: [PATCH 09/65] test for software merge --- .../java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java | 10 ++++++++++ .../eu/dnetlib/dhp/dedup/json/software_merge.json | 3 +++ 2 files changed, 13 insertions(+) create mode 100644 dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/software_merge.json diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java index b8ccb038d7..513e14f073 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java @@ -45,6 +45,16 @@ public class EntityMergerTest implements Serializable { } + @Test + public void softwareMergerTest() throws InstantiationException, IllegalAccessException { + List> softwares = readSample(testEntityBasePath + "/software_merge.json", Software.class); + + Software merged = DedupRecordFactory + .entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class); + + System.out.println(merged.getBestaccessright().getClassid()); + } + @Test public void publicationMergerTest() throws InstantiationException, IllegalAccessException { diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/software_merge.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/software_merge.json new file mode 100644 index 0000000000..b146d61022 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/software_merge.json @@ -0,0 +1,3 @@ +{"context": [], "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive", "classname": "sysimport:crosswalk:datasetarchive", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "dedup-similarity-result-levenstein", "invisible": false, "trust": "0.95"}, "resourcetype": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "pid": [], "contributor": [], "resulttype": {"classid": "software", "classname": "software", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "relevantdate": [], "collectedfrom": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "Journal.fi", "key": "10|openaire____::6eef8049d0feedc089ee009abca55e35"}], "id": "50|a89337edbe55::4930db9e954866d70916cbfba9f81f97", "subject": [], "instance": [{"refereed": null, "hostedby": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "Journal.fi", "key": "10|openaire____::6eef8049d0feedc089ee009abca55e35"}, "processingchargeamount": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "license": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "url": [], "distributionlocation": "", "processingchargecurrency": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2016-01-01"}, "collectedfrom": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "Journal.fi", "key": "10|openaire____::6eef8049d0feedc089ee009abca55e35"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:dataCite_resource", "schemeid": "dnet:dataCite_resource"}}], "embargoenddate": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "lastupdatetimestamp": 0, "author": [{"surname": "Go\\u0308tz", "name": "Julia", "pid": [], "rank": 5, "affiliation": [], "fullname": "G\\u00f6tz, Julia"}, {"surname": "Wolff", "name": "Stephan", "pid": [], "rank": 6, "affiliation": [], "fullname": "Wolff, Stephan"}, {"surname": "Jansen", "name": "Olav", "pid": [], "rank": 7, "affiliation": [], "fullname": "Jansen, Olav"}, {"surname": "Dressler", "name": "Dirk", "pid": [{"qualifier": {"classid": "ORCID", "classname": "ORCID"}, "value": "0000-0000-0656-9999"},{"qualifier": {"classid": "id", "classname": "id"}, "value": "987654321"}], "rank": 8, "affiliation": [], "fullname": "Dressler, Dirk"}, {"surname": "Schneider", "name": "Susanne A.", "pid": [], "rank": 9, "affiliation": [], "fullname": "Schneider, Susanne A."}], "source": [], "dateofcollection": "2019-11-05T14:49:22.351Z", "fulltext": [], "dateoftransformation": "2019-11-05T16:10:58.988Z", "description": [], "format": [], "coverage": [], "publisher": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "language": {"classid": "eng", "classname": "English", "schemename": "dnet:languages", "schemeid": "dnet:languages"}, "bestaccessright": {"classid": "OPEN SOURCE", "classname": "Open Source", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "country": [], "extraInfo": [], "originalId": [], "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2018-09-30"}, "title": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Altered brain activation in a reversal learning task unmasks adaptive changes in cognitive control in writer's cramp"}]} +{"context": [], "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "sysimport:crosswalk:repository", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "dedup-similarity-result-levenstein", "invisible": false, "trust": "0.9"}, "resourcetype": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "pid": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "qualifier": {"classid": "doi", "classname": "doi", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.1016/j.nicl.2015.11.006"}], "contributor": [], "resulttype": {"classid": "software", "classname": "software", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "relevantdate": [], "collectedfrom": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "BASE (Open Access Aggregator)", "key": "10|openaire____::df45502607927471ecf8a6ae83683ff5"}], "id": "50|base_oa_____::0968af610a356656706657e4f234b340", "subject": [], "instance": [{"refereed": null, "hostedby": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "NeuroImage: Clinical", "key": "10|doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a"}, "processingchargeamount": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "license": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "http://creativecommons.org/licenses/by-nc-nd/4.0/"}, "url": ["http://dx.doi.org/10.1016/j.nicl.2015.11.006"], "distributionlocation": "", "processingchargecurrency": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2016-01-01"}, "collectedfrom": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "BASE (Open Access Aggregator)", "key": "10|openaire____::df45502607927471ecf8a6ae83683ff5"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}], "embargoenddate": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "lastupdatetimestamp": 0, "author": [{"surname": "Zeuner", "name": "Kirsten E.", "pid": [], "rank": 1, "affiliation": [], "fullname": "Zeuner, Kirsten E."}, {"surname": "Knutzen", "name": "Arne", "pid": [], "rank": 2, "affiliation": [], "fullname": "Knutzen, Arne"}, {"surname": "Granert", "name": "Oliver", "pid": [{"qualifier": {"classid": "ORCID", "classname": "ORCID"}, "value": "0000-0002-0656-1023"}], "rank": 3, "affiliation": [], "fullname": "Granert, Oliver"}, {"surname": "Sablowsky", "name": "Simone", "pid": [], "rank": 4, "affiliation": [], "fullname": "Sablowsky, Simone"}, {"surname": "Go\\u0308tz", "name": "Julia", "pid": [], "rank": 5, "affiliation": [], "fullname": "G\\u00f6tz, Julia"}, {"surname": "Wolff", "name": "Stephan", "pid": [], "rank": 6, "affiliation": [], "fullname": "Wolff, Stephan"}, {"surname": "Jansen", "name": "Olav", "pid": [], "rank": 7, "affiliation": [], "fullname": "Jansen, Olav"}, {"surname": "Dressler", "name": "Dirk", "pid": [], "rank": 8, "affiliation": [], "fullname": "Dressler, Dirk"}, {"surname": "Schneider", "name": "Susanne A.", "pid": [], "rank": 9, "affiliation": [], "fullname": "Schneider, Susanne A."}, {"surname": "Klein", "name": "Christine", "pid": [], "rank": 10, "affiliation": [], "fullname": "Klein, Christine"}, {"surname": "Deuschl", "name": "Gu\\u0308nther", "pid": [], "rank": 11, "affiliation": [], "fullname": "Deuschl, G\\u00fcnther"}, {"surname": "Eimeren", "name": "Thilo", "pid": [], "rank": 12, "affiliation": [], "fullname": "van Eimeren, Thilo"}, {"surname": "Witt", "name": "Karsten", "pid": [], "rank": 13, "affiliation": [], "fullname": "Witt, Karsten"}], "source": [], "dateofcollection": "2017-07-27T19:04:09.131Z", "fulltext": [], "dateoftransformation": "2019-01-23T10:15:19.582Z", "description": [], "format": [], "coverage": [], "publisher": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "Elsevier BV"}, "language": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "bestaccessright": {"classid": "OPEN SOURCE", "classname": "Open Source", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "country": [{"classid": "IT", "classname": "Italy", "schemeid": "dnet:countries", "schemename": "dnet:countries"}], "extraInfo": [], "originalId": ["10.1016/j.nicl.2015.11.006"], "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2016-01-01"}, "title": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Altered brain activation in a reversal learning task unmasks adaptive changes in cognitive control in writer's cramp"}]} +{"context": [], "dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive", "classname": "sysimport:crosswalk:datasetarchive", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "dedup-similarity-result-levenstein", "invisible": false, "trust": "0.9"}, "resourcetype": {"classid": "0004", "classname": "Conference object", "schemename": "dnet:dataCite_resource", "schemeid": "dnet:dataCite_resource"}, "pid": [], "contributor": [], "resulttype": {"classid": "software", "classname": "software", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "relevantdate": [], "collectedfrom": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "CRIS UNS (Current Research Information System University of Novi Sad)", "key": "10|CRIS_UNS____::f66f1bd369679b5b077dcdf006089556"}], "id": "50|CrisUnsNoviS::9f9d014eea45dab432cab636c4c9cf39", "subject": [], "instance": [{"refereed": null, "hostedby": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "CRIS UNS (Current Research Information System University of Novi Sad)", "key": "10|CRIS_UNS____::f66f1bd369679b5b077dcdf006089556"}, "processingchargeamount": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "license": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "url": ["https://www.cris.uns.ac.rs/record.jsf?recordId=113444&source=OpenAIRE&language=en"], "distributionlocation": "", "processingchargecurrency": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2019-01-01"}, "collectedfrom": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "CRIS UNS (Current Research Information System University of Novi Sad)", "key": "10|CRIS_UNS____::f66f1bd369679b5b077dcdf006089556"}, "accessright": {"classid": "UNKNOWN", "classname": "UNKNOWN", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0004", "classname": "Conference object", "schemename": "dnet:dataCite_resource", "schemeid": "dnet:dataCite_resource"}}], "embargoenddate": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "lastupdatetimestamp": 0, "author": [{"surname": "Zeuner", "name": "Kirsten E.", "pid": [], "rank": 1, "affiliation": [], "fullname": "Zeuner, Kirsten E."}, {"surname": "Knutzen", "name": "Arne", "pid": [], "rank": 2, "affiliation": [], "fullname": "Knutzen, Arne"}, {"surname": "Granert", "name": "Oliver", "pid": [{"qualifier": {"classid": "ORCID", "classname": "ORCID"}, "value": "0000-0002-0656-1023"}, {"qualifier": {"classid": "pubmed", "classname": "pubmed"}, "value": "pubmed.it"}], "rank": 3, "affiliation": [], "fullname": "Granert, Oliver"}, {"surname": "Sablowsky", "name": "Simone", "pid": [{"qualifier": {"classid": "id", "classname": "id"}, "value": "12345678"}], "rank": 4, "affiliation": [], "fullname": "Sablowsky, Simone"}, {"surname": "Go\\u0308tz", "name": "Julia", "pid": [], "rank": 5, "affiliation": [], "fullname": "G\\u00f6tz, Julia"}, {"surname": "Wolff", "name": "Stephan", "pid": [], "rank": 6, "affiliation": [], "fullname": "Wolff, Stephan"}, {"surname": "Jansen", "name": "Olav", "pid": [{"qualifier": {"classid": "ORCID", "classname": "ORCID"}, "value": "0000-0000-0656-1023"},{"qualifier": {"classid": "id", "classname": "id"}, "value": "987654321"}], "rank": 7, "affiliation": [], "fullname": "Jansen, Olav"}, {"surname": "Dressler", "name": "Dirk", "pid": [], "rank": 8, "affiliation": [], "fullname": "Dressler, Dirk"}, {"surname": "Schneider", "name": "Susanne A.", "pid": [], "rank": 9, "affiliation": [], "fullname": "Schneider, Susanne A."}], "source": [], "dateofcollection": "2020-03-10T15:05:38.685Z", "fulltext": [], "dateoftransformation": "2020-03-11T20:11:13.15Z", "description": [], "format": [], "coverage": [], "publisher": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": ""}, "language": {"classid": "en", "classname": "en", "schemename": "dnet:languages", "schemeid": "dnet:languages"}, "bestaccessright": {"classid": "UNKNOWN", "classname": "unknown", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "country": [{"classid": "FI", "classname": "Finland", "schemeid": "dnet:countries", "schemename": "dnet:countries"}], "extraInfo": [], "originalId": ["(BISIS)113444", "https://www.cris.uns.ac.rs/record.jsf?recordId=113444&source=OpenAIRE&language=en"], "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "value": "2016-01-01"}, "title": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "", "classname": "", "schemename": "", "schemeid": ""}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": ""}, "qualifier": {"classid": "test title", "classname": "test title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Antichains of copies of ultrahomogeneous structures"}]} \ No newline at end of file From 67e1d222b6768a2f2168f94621a482e4029bb05b Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 8 Jul 2020 17:53:35 +0200 Subject: [PATCH 10/65] bulk cleaning when found null or empty, sets bestaccessrights evaluating the result instances --- .../dhp/oa/graph/clean/CleanGraphSparkJob.java | 17 +++++++++++------ .../graph/raw/AbstractMdRecordToOafMapper.java | 4 ++++ .../oa/graph/clean/CleaningFunctionTest.java | 5 +++++ .../eu/dnetlib/dhp/oa/graph/clean/result.json | 7 +------ 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java index e1c4b53b50..fd707e949a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java @@ -8,6 +8,7 @@ import java.util.Objects; import java.util.Optional; import java.util.stream.Collectors; +import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; @@ -97,7 +98,7 @@ public class CleanGraphSparkJob { .json(outputPath); } - private static T fixDefaults(T value) { + protected static T fixDefaults(T value) { if (value instanceof Datasource) { // nothing to clean here } else if (value instanceof Project) { @@ -134,11 +135,6 @@ public class CleanGraphSparkJob { .setResourcetype( qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE)); } - if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) { - r - .setBestaccessright( - qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES)); - } if (Objects.nonNull(r.getInstance())) { for (Instance i : r.getInstance()) { if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) { @@ -152,6 +148,15 @@ public class CleanGraphSparkJob { } } } + if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) { + Qualifier bestaccessrights = AbstractMdRecordToOafMapper.createBestAccessRights(r.getInstance()); + if (Objects.isNull(bestaccessrights)) { + r.setBestaccessright( + qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES)); + } else { + r.setBestaccessright(bestaccessrights); + } + } if (Objects.nonNull(r.getAuthor())) { boolean nullRank = r .getAuthor() diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index fc77950d01..c43ee29fe0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -378,6 +378,10 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract Field prepareDatasetStorageDate(Document doc, DataInfo info); + public static Qualifier createBestAccessRights(final List instanceList) { + return getBestAccessRights(instanceList); + } + protected static Qualifier getBestAccessRights(final List instanceList) { if (instanceList != null) { final Optional min = instanceList diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java index 4783aa81f6..559a30b1ea 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java @@ -57,6 +57,8 @@ public class CleaningFunctionTest { String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json")); Publication p_in = MAPPER.readValue(json, Publication.class); + assertNull(p_in.getBestaccessright()); + assertTrue(p_in instanceof Result); assertTrue(p_in instanceof Publication); @@ -84,6 +86,9 @@ public class CleaningFunctionTest { .map(p -> p.getQualifier()) .allMatch(q -> pidTerms.contains(q.getClassid()))); + Publication p_defaults = CleanGraphSparkJob.fixDefaults(p_out); + assertEquals("CLOSED", p_defaults.getBestaccessright().getClassid()); + // TODO add more assertions to verity the cleaned values System.out.println(MAPPER.writeValueAsString(p_out)); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index 2c1d5017d6..5d0c0d1ed8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -185,12 +185,7 @@ "surname": "" } ], - "bestaccessright": { - "classid": "CLOSED", - "classname": "Closed Access", - "schemeid": "dnet:access_modes", - "schemename": "dnet:access_modes" - }, + "bestaccessright": null, "collectedfrom": [ { "key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747", From 4365cf41d70cd1a16b359340a4b2da2537237543 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 8 Jul 2020 22:31:46 +0200 Subject: [PATCH 11/65] trying to overcome OOM errors during duplicate scan phase --- .../dhp/oa/dedup/SparkCreateMergeRels.java | 7 +++++-- .../dhp/oa/dedup/SparkCreateSimRels.java | 20 +++++++++++-------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java index c0503d991d..19e60b5206 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java @@ -10,6 +10,7 @@ import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.graphx.Edge; import org.apache.spark.rdd.RDD; @@ -100,8 +101,10 @@ public class SparkCreateMergeRels extends AbstractSparkAction { final RDD> edgeRdd = spark .read() - .load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)) - .as(Encoders.bean(Relation.class)) + .textFile(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)) + .map( + (MapFunction) r -> OBJECT_MAPPER.readValue(r, Relation.class), + Encoders.bean(Relation.class)) .javaRDD() .map(it -> new Edge<>(hash(it.getSource()), hash(it.getTarget()), it.getRelClass())) .rdd(); diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java index 2cfe2e0801..7bc77fe2b2 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java @@ -95,19 +95,23 @@ public class SparkCreateSimRels extends AbstractSparkAction { }); // create blocks for deduplication - JavaPairRDD blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf); + JavaPairRDD blocks = Deduper + .createSortedBlocks(mapDocuments, dedupConf) + .repartition(10000); // create relations by comparing only elements in the same group - JavaRDD relations = Deduper + Deduper .computeRelations(sc, blocks, dedupConf) - .map(t -> createSimRel(t._1(), t._2(), entity)); + .map(t -> createSimRel(t._1(), t._2(), entity)) + .repartition(10000) + .map(r -> OBJECT_MAPPER.writeValueAsString(r)) + .saveAsTextFile(outputPath); // save the simrel in the workingdir - spark - .createDataset(relations.rdd(), Encoders.bean(Relation.class)) - .write() - .mode(SaveMode.Append) - .save(outputPath); + /* + * spark .createDataset(relations.rdd(), Encoders.bean(Relation.class)) .write() .mode(SaveMode.Append) + * .save(outputPath); + */ } } From 3c728aaa0c1c993b172bac1c792ff45a0bac75ea Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 8 Jul 2020 22:39:51 +0200 Subject: [PATCH 12/65] trying to overcome OOM errors during duplicate scan phase --- .../main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java | 1 + 1 file changed, 1 insertion(+) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java index 7bc77fe2b2..1be2b9e31e 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java @@ -88,6 +88,7 @@ public class SparkCreateSimRels extends AbstractSparkAction { JavaPairRDD mapDocuments = sc .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) + .repartition(10000) .mapToPair( (PairFunction) s -> { MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); From 1c6a1716332e02c49f1b05639803e9c3f8c83d00 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Thu, 9 Jul 2020 11:02:09 +0200 Subject: [PATCH 13/65] updated pom --- dhp-workflows/dhp-broker-events/pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/pom.xml b/dhp-workflows/dhp-broker-events/pom.xml index 119031b064..1a219c5c91 100644 --- a/dhp-workflows/dhp-broker-events/pom.xml +++ b/dhp-workflows/dhp-broker-events/pom.xml @@ -57,9 +57,9 @@
- eu.dnetlib + eu.dnetlib.dhp dnet-openaire-broker-common - [3.0.4,4.0.0) + [3.0.0-SNAPSHOT,) From 2d742a84aeb699aa67c889f7ed7ffe99c137fdae Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Thu, 9 Jul 2020 12:53:46 +0200 Subject: [PATCH 14/65] DedupConfig as json file --- .../dhp/broker/oa/GenerateEventsJob.java | 36 +----- .../dhp/broker/oa/matchers/UpdateMatcher.java | 4 +- .../dhp/broker/oa/util/EventFinder.java | 4 +- .../dhp/broker/oa/util/TrustUtils.java | 54 ++++++++ .../dhp/broker/oa/util/UpdateInfo.java | 41 +----- .../broker/oa/dedupConfig/dedupConfig.json | 122 ++++++++++++++++++ .../oa/generate_all/oozie_app/workflow.xml | 10 -- .../dhp/broker/oa/generate_events.json | 12 -- .../broker/oa/partial/oozie_app/workflow.xml | 9 -- .../broker/oa/matchers/UpdateMatcherTest.java | 12 +- .../dhp/broker/oa/util/TrustUtilsTest.java | 21 +++ 11 files changed, 208 insertions(+), 117 deletions(-) create mode 100644 dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java index a2d92e149e..cfee360c57 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java @@ -18,8 +18,6 @@ import org.apache.spark.util.LongAccumulator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.databind.ObjectMapper; - import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.model.Event; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; @@ -27,9 +25,6 @@ import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; import eu.dnetlib.dhp.broker.oa.util.EventFinder; import eu.dnetlib.dhp.broker.oa.util.EventGroup; import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import eu.dnetlib.pace.config.DedupConfig; public class GenerateEventsJob { @@ -52,12 +47,6 @@ public class GenerateEventsJob { final String workingPath = parser.get("workingPath"); log.info("workingPath: {}", workingPath); - final String isLookupUrl = parser.get("isLookupUrl"); - log.info("isLookupUrl: {}", isLookupUrl); - - final String dedupConfigProfileId = parser.get("dedupConfProfile"); - log.info("dedupConfigProfileId: {}", dedupConfigProfileId); - final String eventsPath = workingPath + "/events"; log.info("eventsPath: {}", eventsPath); @@ -72,10 +61,6 @@ public class GenerateEventsJob { final SparkConf conf = new SparkConf(); - // TODO UNCOMMENT - // final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId); - final DedupConfig dedupConfig = null; - runWithSparkSession(conf, isSparkSessionManaged, spark -> { ClusterUtils.removeDir(spark, eventsPath); @@ -90,7 +75,7 @@ public class GenerateEventsJob { final Dataset dataset = groups .map( g -> EventFinder - .generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, dedupConfig, accumulators), + .generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, accumulators), Encoders .bean(EventGroup.class)) .flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class)); @@ -112,23 +97,4 @@ public class GenerateEventsJob { } - private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception { - - final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl); - - final String conf = isLookUpService - .getResourceProfileByQuery( - String - .format( - "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", - profId)); - - final DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class); - dedupConfig.getPace().initModel(); - dedupConfig.getPace().initTranslationMap(); - // dedupConfig.getWf().setConfigurationId("???"); - - return dedupConfig; - } - } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java index af6ab30a1a..3d688fa1d1 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java @@ -17,7 +17,6 @@ import org.apache.spark.util.LongAccumulator; import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; -import eu.dnetlib.pace.config.DedupConfig; public abstract class UpdateMatcher { @@ -37,7 +36,6 @@ public abstract class UpdateMatcher { public Collection> searchUpdatesForRecord(final OaBrokerMainEntity res, final Collection others, - final DedupConfig dedupConfig, final Map accumulators) { final Map> infoMap = new HashMap<>(); @@ -49,7 +47,7 @@ public abstract class UpdateMatcher { if (topic != null) { final UpdateInfo info = new UpdateInfo<>(topic, hl, source, res, getCompileHighlightFunction(), - getHighlightToStringFunction(), dedupConfig); + getHighlightToStringFunction()); final String s = DigestUtils.md5Hex(info.getHighlightValueAsString()); if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java index 593e66d434..b6328eb954 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java @@ -37,7 +37,6 @@ import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreOpenAccess; import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMorePid; import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject; import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup; -import eu.dnetlib.pace.config.DedupConfig; public class EventFinder { @@ -76,7 +75,6 @@ public class EventFinder { final Set dsIdWhitelist, final Set dsIdBlacklist, final Set dsTypeWhitelist, - final DedupConfig dedupConfig, final Map accumulators) { final List> list = new ArrayList<>(); @@ -84,7 +82,7 @@ public class EventFinder { for (final OaBrokerMainEntity target : results.getData()) { if (verifyTarget(target, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) { for (final UpdateMatcher matcher : matchers) { - list.addAll(matcher.searchUpdatesForRecord(target, results.getData(), dedupConfig, accumulators)); + list.addAll(matcher.searchUpdatesForRecord(target, results.getData(), accumulators)); } } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java index 5338d4f3d7..72fe1b204e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java @@ -1,8 +1,62 @@ package eu.dnetlib.dhp.broker.oa.util; +import java.io.IOException; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.tree.support.TreeProcessor; +import eu.dnetlib.pace.util.MapDocumentUtil; + public class TrustUtils { + private static final Logger log = LoggerFactory.getLogger(TrustUtils.class); + + private static DedupConfig dedupConfig; + + static { + final ObjectMapper mapper = new ObjectMapper(); + try { + dedupConfig = mapper + .readValue( + DedupConfig.class.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json"), + DedupConfig.class); + } catch (final IOException e) { + log.error("Error loading dedupConfig, e"); + } + + } + + protected static float calculateTrust(final OaBrokerMainEntity r1, final OaBrokerMainEntity r2) { + + if (dedupConfig == null) { + return BrokerConstants.MIN_TRUST; + } + + try { + final ObjectMapper objectMapper = new ObjectMapper(); + final MapDocument doc1 = MapDocumentUtil + .asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1)); + final MapDocument doc2 = MapDocumentUtil + .asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2)); + + final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2); + + final double threshold = dedupConfig.getWf().getThreshold(); + + return TrustUtils.rescale(score, threshold); + } catch (final Exception e) { + log.error("Error computing score between results", e); + return BrokerConstants.MIN_TRUST; + } + } + public static float rescale(final double score, final double threshold) { if (score >= BrokerConstants.MAX_TRUST) { return BrokerConstants.MAX_TRUST; diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java index 0586b681ea..ef8fb240c4 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java @@ -4,20 +4,11 @@ package eu.dnetlib.dhp.broker.oa.util; import java.util.function.BiConsumer; import java.util.function.Function; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.fasterxml.jackson.databind.ObjectMapper; - import eu.dnetlib.broker.objects.OaBrokerEventPayload; import eu.dnetlib.broker.objects.OaBrokerInstance; import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.broker.objects.OaBrokerProvenance; import eu.dnetlib.dhp.broker.model.Topic; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.tree.support.TreeProcessor; -import eu.dnetlib.pace.util.MapDocumentUtil; public final class UpdateInfo { @@ -35,20 +26,17 @@ public final class UpdateInfo { private final float trust; - private static final Logger log = LoggerFactory.getLogger(UpdateInfo.class); - public UpdateInfo(final Topic topic, final T highlightValue, final OaBrokerMainEntity source, final OaBrokerMainEntity target, final BiConsumer compileHighlight, - final Function highlightToString, - final DedupConfig dedupConfig) { + final Function highlightToString) { this.topic = topic; this.highlightValue = highlightValue; this.source = source; this.target = target; this.compileHighlight = compileHighlight; this.highlightToString = highlightToString; - this.trust = calculateTrust(dedupConfig, source, target); + this.trust = TrustUtils.calculateTrust(source, target); } public T getHighlightValue() { @@ -63,31 +51,6 @@ public final class UpdateInfo { return target; } - private float calculateTrust(final DedupConfig dedupConfig, - final OaBrokerMainEntity r1, - final OaBrokerMainEntity r2) { - - if (dedupConfig == null) { - return BrokerConstants.MIN_TRUST; - } - - try { - final ObjectMapper objectMapper = new ObjectMapper(); - final MapDocument doc1 = MapDocumentUtil - .asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1)); - final MapDocument doc2 = MapDocumentUtil - .asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2)); - - final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2); - final double threshold = dedupConfig.getWf().getThreshold(); - - return TrustUtils.rescale(score, threshold); - } catch (final Exception e) { - log.error("Error computing score between results", e); - return BrokerConstants.MIN_TRUST; - } - } - protected Topic getTopic() { return topic; } diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json new file mode 100644 index 0000000000..d0319b441f --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json @@ -0,0 +1,122 @@ +{ + "wf": { + + }, + "pace": { + "clustering": [ + { + "name": "wordssuffixprefix", + "fields": [ + "title" + ], + "params": { + "max": "2", + "len": "3" + } + }, + { + "name": "lowercase", + "fields": [ + "doi" + ], + "params": { + + } + } + ], + "decisionTree": { + "start": { + "fields": [ + { + "field": "doi", + "comparator": "exactMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": { + + } + } + ], + "threshold": 0.5, + "aggregation": "AVG", + "positive": "MATCH", + "negative": "layer1", + "undefined": "layer1", + "ignoreUndefined": "true" + }, + "layer1": { + "fields": [ + { + "field": "title", + "comparator": "titleVersionMatch", + "weight": 0.9, + "countIfUndefined": "false", + "params": { + + } + }, + { + "field": "authors", + "comparator": "sizeMatch", + "weight": 0.9, + "countIfUndefined": "false", + "params": { + + } + } + ], + "threshold": 0.5, + "aggregation": "AVG", + "positive": "MATCH", + "negative": "layer2", + "undefined": "layer2", + "ignoreUndefined": "true" + }, + "layer2": { + "fields": [ + { + "field": "title", + "comparator": "levensteinTitle", + "weight": 1.0, + "countIfUndefined": "true", + "params": { + + } + } + ], + "threshold": 0.99, + "aggregation": "AVG", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "true" + } + }, + "model": [ + { + "name": "doi", + "type": "String", + "path": "$.pids[?(@.type == 'doi')].value" + }, + { + "name": "title", + "type": "String", + "path": "$.titles", + "length": 250, + "size": 5 + }, + { + "name": "authors", + "type": "List", + "path": "$.creators[*].fullname", + "size": 200 + } + ], + "blacklists": { + + }, + "synonyms": { + + } + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml index b85c60fdf7..2c728cd981 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml @@ -8,14 +8,6 @@ workingPath the path where the the generated data will be stored - - - isLookupUrl - the address of the lookUp service - - - dedupConfProfId - the id of a valid Dedup Configuration Profile datasourceIdWhitelist @@ -427,8 +419,6 @@ --conf spark.sql.shuffle.partitions=3840 --workingPath${workingPath} - --isLookupUrl${isLookupUrl} - --dedupConfProfile${dedupConfProfId} --datasourceIdWhitelist${datasourceIdWhitelist} --datasourceTypeWhitelist${datasourceTypeWhitelist} --datasourceIdBlacklist${datasourceIdBlacklist} diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json index c545884f9e..bab8081932 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json @@ -5,18 +5,6 @@ "paramDescription": "the path where the generated events will be stored", "paramRequired": true }, - { - "paramName": "lu", - "paramLongName": "isLookupUrl", - "paramDescription": "the address of the ISLookUpService", - "paramRequired": true - }, - { - "paramName": "d", - "paramLongName": "dedupConfProfile", - "paramDescription": "the id of a valid Dedup Configuration Profile", - "paramRequired": true - }, { "paramName": "datasourceIdWhitelist", "paramLongName": "datasourceIdWhitelist", diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml index 392271260e..d19ad6c5a0 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml @@ -9,15 +9,6 @@ workingPath the path where the the generated data will be stored - - isLookupUrl - the address of the lookUp service - - - dedupConfProfId - the id of a valid Dedup Configuration Profile - - sparkDriverMemory memory for driver process diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java index 93bc5617fb..82374b335f 100644 --- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java @@ -30,7 +30,7 @@ class UpdateMatcherTest { final OaBrokerMainEntity p4 = new OaBrokerMainEntity(); final Collection> list = matcher - .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null); + .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null); assertTrue(list.isEmpty()); } @@ -46,7 +46,7 @@ class UpdateMatcherTest { res.setPublicationdate("2018"); final Collection> list = matcher - .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null); + .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null); assertTrue(list.isEmpty()); } @@ -62,7 +62,7 @@ class UpdateMatcherTest { p2.setPublicationdate("2018"); final Collection> list = matcher - .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null); + .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null); assertTrue(list.size() == 1); } @@ -79,7 +79,7 @@ class UpdateMatcherTest { p2.setPublicationdate("2018"); final Collection> list = matcher - .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null); + .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null); assertTrue(list.isEmpty()); } @@ -98,7 +98,7 @@ class UpdateMatcherTest { p4.setPublicationdate("2018"); final Collection> list = matcher - .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null); + .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null); assertTrue(list.isEmpty()); } @@ -117,7 +117,7 @@ class UpdateMatcherTest { p4.setPublicationdate("2018"); final Collection> list = matcher - .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null); + .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null); assertTrue(list.size() == 1); } diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java index bb23d6085e..974baa28b4 100644 --- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java @@ -5,6 +5,10 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import org.junit.jupiter.api.Test; +import eu.dnetlib.broker.objects.OaBrokerAuthor; +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerTypedValue; + public class TrustUtilsTest { private static final double THRESHOLD = 0.95; @@ -64,6 +68,23 @@ public class TrustUtilsTest { verifyValue(2.00, BrokerConstants.MAX_TRUST); } + @Test + public void test() throws Exception { + final OaBrokerMainEntity r1 = new OaBrokerMainEntity(); + r1.getTitles().add("D-NET Service Package: Data Import"); + r1.getPids().add(new OaBrokerTypedValue("doi", "123")); + r1.getCreators().add(new OaBrokerAuthor("Michele Artini", null)); + r1.getCreators().add(new OaBrokerAuthor("Claudio Atzori", null)); + + final OaBrokerMainEntity r2 = new OaBrokerMainEntity(); + r2.getTitles().add("D-NET Service Package: Data Import"); + // r2.getPids().add(new OaBrokerTypedValue("doi", "123")); + r2.getCreators().add(new OaBrokerAuthor("Michele Artini", null)); + // r2.getCreators().add(new OaBrokerAuthor("Claudio Atzori", null)); + + System.out.println("TRUST: " + TrustUtils.calculateTrust(r1, r2)); + } + private void verifyValue(final double originalScore, final float expectedTrust) { final float trust = TrustUtils.rescale(originalScore, THRESHOLD); System.out.println(trust); From faea30cda0fd4ccae91be5d83d34b5427ec528c3 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 9 Jul 2020 14:05:21 +0200 Subject: [PATCH 15/65] - --- .../community/QueryInformationSystem.java | 2 +- .../dhp/bulktag/community/ResultTagger.java | 29 ++++++++++++------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java index cb5ad083b4..6a40bc2e29 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java @@ -41,7 +41,7 @@ public class QueryInformationSystem { + " " + " } " + " " + - " "+ + " " + "{for $zc in $zenodo " + "return " + " " + diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java index f5a985d15f..d58704b753 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java @@ -71,10 +71,9 @@ public class ResultTagger implements Serializable { // tagging for Subject final Set subjects = new HashSet<>(); - Optional> oresultsubj = Optional.ofNullable(result.getSubject()); - if (oresultsubj.isPresent()) { - oresultsubj - .get() + + if (Objects.nonNull(result.getSubject())){ + result.getSubject() .stream() .map(subject -> subject.getValue()) .filter(StringUtils::isNotBlank) @@ -90,15 +89,23 @@ public class ResultTagger implements Serializable { final Set datasources = new HashSet<>(); final Set tmp = new HashSet<>(); - Optional> oresultinstance = Optional.ofNullable(result.getInstance()); - if (oresultinstance.isPresent()) { - for (Instance i : oresultinstance.get()) { - tmp.add(StringUtils.substringAfter(i.getCollectedfrom().getKey(), "|")); - tmp.add(StringUtils.substringAfter(i.getHostedby().getKey(), "|")); + if (Objects.nonNull(result.getInstance())) { + for (Instance i : result.getInstance()) { + if(Objects.nonNull(i.getCollectedfrom())){ + if(Objects.nonNull(i.getCollectedfrom().getKey())){ + tmp.add(StringUtils.substringAfter(i.getCollectedfrom().getKey(), "|")); + } + } + if(Objects.nonNull(i.getHostedby())){ + if(Objects.nonNull(i.getHostedby().getKey())){ + tmp.add(StringUtils.substringAfter(i.getHostedby().getKey(), "|")); + } + } + } - oresultinstance - .get() + result + .getInstance() .stream() .map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey())) .flatMap(p -> Stream.of(p.getFst(), p.getSnd())) From ff4d6214f19292d90076d8ff6ec86a885861c1af Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 10 Jul 2020 10:06:41 +0200 Subject: [PATCH 16/65] experimenting with pruning of relations --- .../dhp/oa/provision/PrepareRelationsJob.java | 344 +++++++++--------- 1 file changed, 174 insertions(+), 170 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index eb63d4423d..e1f7386e92 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -59,200 +59,204 @@ import scala.Tuple2; */ public class PrepareRelationsJob { - private static final Logger log = LoggerFactory.getLogger(PrepareRelationsJob.class); + private static final Logger log = LoggerFactory.getLogger(PrepareRelationsJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static final int MAX_RELS = 100; + public static final int MAX_RELS = 100; - public static final int DEFAULT_NUM_PARTITIONS = 3000; + public static final int DEFAULT_NUM_PARTITIONS = 3000; - public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils - .toString( - PrepareRelationsJob.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + PrepareRelationsJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputRelationsPath = parser.get("inputRelationsPath"); - log.info("inputRelationsPath: {}", inputRelationsPath); + String inputRelationsPath = parser.get("inputRelationsPath"); + log.info("inputRelationsPath: {}", inputRelationsPath); - String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - int relPartitions = Optional - .ofNullable(parser.get("relPartitions")) - .map(Integer::valueOf) - .orElse(DEFAULT_NUM_PARTITIONS); - log.info("relPartitions: {}", relPartitions); + int relPartitions = Optional + .ofNullable(parser.get("relPartitions")) + .map(Integer::valueOf) + .orElse(DEFAULT_NUM_PARTITIONS); + log.info("relPartitions: {}", relPartitions); - Set relationFilter = Optional - .ofNullable(parser.get("relationFilter")) - .map(s -> Sets.newHashSet(Splitter.on(",").split(s))) - .orElse(new HashSet<>()); - log.info("relationFilter: {}", relationFilter); + Set relationFilter = Optional + .ofNullable(parser.get("relationFilter")) + .map(s -> Sets.newHashSet(Splitter.on(",").split(s))) + .orElse(new HashSet<>()); + log.info("relationFilter: {}", relationFilter); - int maxRelations = Optional - .ofNullable(parser.get("maxRelations")) - .map(Integer::valueOf) - .orElse(MAX_RELS); - log.info("maxRelations: {}", maxRelations); + int maxRelations = Optional + .ofNullable(parser.get("maxRelations")) + .map(Integer::valueOf) + .orElse(MAX_RELS); + log.info("maxRelations: {}", maxRelations); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ProvisionModelSupport.getModelClasses()); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ProvisionModelSupport.getModelClasses()); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, outputPath); - prepareRelationsRDD( - spark, inputRelationsPath, outputPath, relationFilter, maxRelations, relPartitions); - }); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + prepareRelationsRDD( + spark, inputRelationsPath, outputPath, relationFilter, maxRelations, relPartitions); + }); + } + + /** + * RDD based implementation that prepares the graph relations by limiting the number of outgoing links and filtering + * the relation types according to the given criteria. Moreover, outgoing links kept within the given limit are + * prioritized according to the weights indicated in eu.dnetlib.dhp.oa.provision.model.SortableRelation. + * + * @param spark the spark session + * @param inputRelationsPath source path for the graph relations + * @param outputPath output path for the processed relations + * @param relationFilter set of relation filters applied to the `relClass` field + * @param maxRelations maximum number of allowed outgoing edges + * @param relPartitions number of partitions for the output RDD + */ + private static void prepareRelationsRDD(SparkSession spark, String inputRelationsPath, String outputPath, + Set relationFilter, int maxRelations, int relPartitions) { + + JavaRDD rels = readPathRelationRDD(spark, inputRelationsPath); + + JavaRDD pruned = pruneRels( + pruneRels(rels, relationFilter, maxRelations, relPartitions, (Function) r -> r.getSource()), + relationFilter, maxRelations, relPartitions, (Function) r -> r.getTarget()); + spark + .createDataset(pruned.rdd(), Encoders.bean(Relation.class)) + .repartition(relPartitions) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath); + } + + private static JavaRDD pruneRels(JavaRDD rels, Set relationFilter, int maxRelations, int relPartitions, Function idFn) { + return rels + .filter(rel -> rel.getDataInfo().getDeletedbyinference() == false) + .filter(rel -> relationFilter.contains(rel.getRelClass()) == false) + .mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, idFn.call(r)), r)) + .repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions)) + .groupBy(Tuple2::_1) + .map(Tuple2::_2) + .map(t -> Iterables.limit(t, maxRelations)) + .flatMap(Iterable::iterator).map(Tuple2::_2); } - /** - * RDD based implementation that prepares the graph relations by limiting the number of outgoing links and filtering - * the relation types according to the given criteria. Moreover, outgoing links kept within the given limit are - * prioritized according to the weights indicated in eu.dnetlib.dhp.oa.provision.model.SortableRelation. - * - * @param spark the spark session - * @param inputRelationsPath source path for the graph relations - * @param outputPath output path for the processed relations - * @param relationFilter set of relation filters applied to the `relClass` field - * @param maxRelations maximum number of allowed outgoing edges - * @param relPartitions number of partitions for the output RDD - */ - private static void prepareRelationsRDD(SparkSession spark, String inputRelationsPath, String outputPath, - Set relationFilter, int maxRelations, int relPartitions) { + // experimental + private static void prepareRelationsDataset( + SparkSession spark, String inputRelationsPath, String outputPath, Set relationFilter, int maxRelations, + int relPartitions) { + spark + .read() + .textFile(inputRelationsPath) + .repartition(relPartitions) + .map( + (MapFunction) s -> OBJECT_MAPPER.readValue(s, Relation.class), + Encoders.kryo(Relation.class)) + .filter((FilterFunction) rel -> rel.getDataInfo().getDeletedbyinference() == false) + .filter((FilterFunction) rel -> relationFilter.contains(rel.getRelClass()) == false) + .groupByKey( + (MapFunction) Relation::getSource, + Encoders.STRING()) + .agg(new RelationAggregator(maxRelations).toColumn()) + .flatMap( + (FlatMapFunction, Relation>) t -> Iterables + .limit(t._2().getRelations(), maxRelations) + .iterator(), + Encoders.bean(Relation.class)) + .repartition(relPartitions) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath); + } - // group by SOURCE and apply limit - RDD bySource = readPathRelationRDD(spark, inputRelationsPath) - .filter(rel -> rel.getDataInfo().getDeletedbyinference() == false) - .filter(rel -> relationFilter.contains(rel.getRelClass()) == false) - .mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, r.getSource()), r)) - .repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions)) - .groupBy(Tuple2::_1) - .map(Tuple2::_2) - .map(t -> Iterables.limit(t, maxRelations)) - .flatMap(Iterable::iterator) - .map(Tuple2::_2) - .rdd(); + public static class RelationAggregator + extends Aggregator { - spark - .createDataset(bySource, Encoders.bean(Relation.class)) - .repartition(relPartitions) - .write() - .mode(SaveMode.Overwrite) - .parquet(outputPath); - } + private int maxRelations; - // experimental - private static void prepareRelationsDataset( - SparkSession spark, String inputRelationsPath, String outputPath, Set relationFilter, int maxRelations, - int relPartitions) { - spark - .read() - .textFile(inputRelationsPath) - .repartition(relPartitions) - .map( - (MapFunction) s -> OBJECT_MAPPER.readValue(s, Relation.class), - Encoders.kryo(Relation.class)) - .filter((FilterFunction) rel -> rel.getDataInfo().getDeletedbyinference() == false) - .filter((FilterFunction) rel -> relationFilter.contains(rel.getRelClass()) == false) - .groupByKey( - (MapFunction) Relation::getSource, - Encoders.STRING()) - .agg(new RelationAggregator(maxRelations).toColumn()) - .flatMap( - (FlatMapFunction, Relation>) t -> Iterables - .limit(t._2().getRelations(), maxRelations) - .iterator(), - Encoders.bean(Relation.class)) - .repartition(relPartitions) - .write() - .mode(SaveMode.Overwrite) - .parquet(outputPath); - } + public RelationAggregator(int maxRelations) { + this.maxRelations = maxRelations; + } - public static class RelationAggregator - extends Aggregator { + @Override + public RelationList zero() { + return new RelationList(); + } - private int maxRelations; + @Override + public RelationList reduce(RelationList b, Relation a) { + b.getRelations().add(a); + return getSortableRelationList(b); + } - public RelationAggregator(int maxRelations) { - this.maxRelations = maxRelations; - } + @Override + public RelationList merge(RelationList b1, RelationList b2) { + b1.getRelations().addAll(b2.getRelations()); + return getSortableRelationList(b1); + } - @Override - public RelationList zero() { - return new RelationList(); - } + @Override + public RelationList finish(RelationList r) { + return getSortableRelationList(r); + } - @Override - public RelationList reduce(RelationList b, Relation a) { - b.getRelations().add(a); - return getSortableRelationList(b); - } + private RelationList getSortableRelationList(RelationList b1) { + RelationList sr = new RelationList(); + sr + .setRelations( + b1 + .getRelations() + .stream() + .limit(maxRelations) + .collect(Collectors.toCollection(() -> new PriorityQueue<>(new RelationComparator())))); + return sr; + } - @Override - public RelationList merge(RelationList b1, RelationList b2) { - b1.getRelations().addAll(b2.getRelations()); - return getSortableRelationList(b1); - } + @Override + public Encoder bufferEncoder() { + return Encoders.kryo(RelationList.class); + } - @Override - public RelationList finish(RelationList r) { - return getSortableRelationList(r); - } + @Override + public Encoder outputEncoder() { + return Encoders.kryo(RelationList.class); + } + } - private RelationList getSortableRelationList(RelationList b1) { - RelationList sr = new RelationList(); - sr - .setRelations( - b1 - .getRelations() - .stream() - .limit(maxRelations) - .collect(Collectors.toCollection(() -> new PriorityQueue<>(new RelationComparator())))); - return sr; - } + /** + * Reads a JavaRDD of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text + * file, + * + * @param spark + * @param inputPath + * @return the JavaRDD containing all the relationships + */ + private static JavaRDD readPathRelationRDD( + SparkSession spark, final String inputPath) { + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, Relation.class)); + } - @Override - public Encoder bufferEncoder() { - return Encoders.kryo(RelationList.class); - } - - @Override - public Encoder outputEncoder() { - return Encoders.kryo(RelationList.class); - } - } - - /** - * Reads a JavaRDD of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text - * file, - * - * @param spark - * @param inputPath - * @return the JavaRDD containing all the relationships - */ - private static JavaRDD readPathRelationRDD( - SparkSession spark, final String inputPath) { - JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, Relation.class)); - } - - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } } From b21866a2da7be44106713c3f3982e3ee646d22d4 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 10 Jul 2020 13:59:48 +0200 Subject: [PATCH 17/65] allow to set different to relations cut points by source and by target; adjusted weight assigned to relationship types --- .../dhp/oa/provision/PrepareRelationsJob.java | 358 +++++++++--------- .../provision/model/SortableRelationKey.java | 22 +- .../input_params_prepare_relations.json | 13 +- 3 files changed, 206 insertions(+), 187 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index e1f7386e92..da0a810217 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -59,204 +59,216 @@ import scala.Tuple2; */ public class PrepareRelationsJob { - private static final Logger log = LoggerFactory.getLogger(PrepareRelationsJob.class); + private static final Logger log = LoggerFactory.getLogger(PrepareRelationsJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static final int MAX_RELS = 100; + public static final int MAX_RELS = 100; - public static final int DEFAULT_NUM_PARTITIONS = 3000; + public static final int DEFAULT_NUM_PARTITIONS = 3000; - public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils - .toString( - PrepareRelationsJob.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + PrepareRelationsJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputRelationsPath = parser.get("inputRelationsPath"); - log.info("inputRelationsPath: {}", inputRelationsPath); + String inputRelationsPath = parser.get("inputRelationsPath"); + log.info("inputRelationsPath: {}", inputRelationsPath); - String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - int relPartitions = Optional - .ofNullable(parser.get("relPartitions")) - .map(Integer::valueOf) - .orElse(DEFAULT_NUM_PARTITIONS); - log.info("relPartitions: {}", relPartitions); + int relPartitions = Optional + .ofNullable(parser.get("relPartitions")) + .map(Integer::valueOf) + .orElse(DEFAULT_NUM_PARTITIONS); + log.info("relPartitions: {}", relPartitions); - Set relationFilter = Optional - .ofNullable(parser.get("relationFilter")) - .map(s -> Sets.newHashSet(Splitter.on(",").split(s))) - .orElse(new HashSet<>()); - log.info("relationFilter: {}", relationFilter); + Set relationFilter = Optional + .ofNullable(parser.get("relationFilter")) + .map(s -> Sets.newHashSet(Splitter.on(",").split(s))) + .orElse(new HashSet<>()); + log.info("relationFilter: {}", relationFilter); - int maxRelations = Optional - .ofNullable(parser.get("maxRelations")) - .map(Integer::valueOf) - .orElse(MAX_RELS); - log.info("maxRelations: {}", maxRelations); + int sourceMaxRelations = Optional + .ofNullable(parser.get("sourceMaxRelations")) + .map(Integer::valueOf) + .orElse(MAX_RELS); + log.info("sourceMaxRelations: {}", sourceMaxRelations); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ProvisionModelSupport.getModelClasses()); + int targetMaxRelations = Optional + .ofNullable(parser.get("targetMaxRelations")) + .map(Integer::valueOf) + .orElse(MAX_RELS); + log.info("targetMaxRelations: {}", targetMaxRelations); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, outputPath); - prepareRelationsRDD( - spark, inputRelationsPath, outputPath, relationFilter, maxRelations, relPartitions); - }); - } + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ProvisionModelSupport.getModelClasses()); - /** - * RDD based implementation that prepares the graph relations by limiting the number of outgoing links and filtering - * the relation types according to the given criteria. Moreover, outgoing links kept within the given limit are - * prioritized according to the weights indicated in eu.dnetlib.dhp.oa.provision.model.SortableRelation. - * - * @param spark the spark session - * @param inputRelationsPath source path for the graph relations - * @param outputPath output path for the processed relations - * @param relationFilter set of relation filters applied to the `relClass` field - * @param maxRelations maximum number of allowed outgoing edges - * @param relPartitions number of partitions for the output RDD - */ - private static void prepareRelationsRDD(SparkSession spark, String inputRelationsPath, String outputPath, - Set relationFilter, int maxRelations, int relPartitions) { - - JavaRDD rels = readPathRelationRDD(spark, inputRelationsPath); - - JavaRDD pruned = pruneRels( - pruneRels(rels, relationFilter, maxRelations, relPartitions, (Function) r -> r.getSource()), - relationFilter, maxRelations, relPartitions, (Function) r -> r.getTarget()); - spark - .createDataset(pruned.rdd(), Encoders.bean(Relation.class)) - .repartition(relPartitions) - .write() - .mode(SaveMode.Overwrite) - .parquet(outputPath); - } - - private static JavaRDD pruneRels(JavaRDD rels, Set relationFilter, int maxRelations, int relPartitions, Function idFn) { - return rels - .filter(rel -> rel.getDataInfo().getDeletedbyinference() == false) - .filter(rel -> relationFilter.contains(rel.getRelClass()) == false) - .mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, idFn.call(r)), r)) - .repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions)) - .groupBy(Tuple2::_1) - .map(Tuple2::_2) - .map(t -> Iterables.limit(t, maxRelations)) - .flatMap(Iterable::iterator).map(Tuple2::_2); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + prepareRelationsRDD( + spark, inputRelationsPath, outputPath, relationFilter, sourceMaxRelations, targetMaxRelations, + relPartitions); + }); } - // experimental - private static void prepareRelationsDataset( - SparkSession spark, String inputRelationsPath, String outputPath, Set relationFilter, int maxRelations, - int relPartitions) { - spark - .read() - .textFile(inputRelationsPath) - .repartition(relPartitions) - .map( - (MapFunction) s -> OBJECT_MAPPER.readValue(s, Relation.class), - Encoders.kryo(Relation.class)) - .filter((FilterFunction) rel -> rel.getDataInfo().getDeletedbyinference() == false) - .filter((FilterFunction) rel -> relationFilter.contains(rel.getRelClass()) == false) - .groupByKey( - (MapFunction) Relation::getSource, - Encoders.STRING()) - .agg(new RelationAggregator(maxRelations).toColumn()) - .flatMap( - (FlatMapFunction, Relation>) t -> Iterables - .limit(t._2().getRelations(), maxRelations) - .iterator(), - Encoders.bean(Relation.class)) - .repartition(relPartitions) - .write() - .mode(SaveMode.Overwrite) - .parquet(outputPath); - } + /** + * RDD based implementation that prepares the graph relations by limiting the number of outgoing links and filtering + * the relation types according to the given criteria. Moreover, outgoing links kept within the given limit are + * prioritized according to the weights indicated in eu.dnetlib.dhp.oa.provision.model.SortableRelation. + * + * @param spark the spark session + * @param inputRelationsPath source path for the graph relations + * @param outputPath output path for the processed relations + * @param relationFilter set of relation filters applied to the `relClass` field + * @param sourceMaxRelations maximum number of allowed outgoing edges grouping by relation.source + * @param targetMaxRelations maximum number of allowed outgoing edges grouping by relation.target + * @param relPartitions number of partitions for the output RDD + */ + private static void prepareRelationsRDD(SparkSession spark, String inputRelationsPath, String outputPath, + Set relationFilter, int sourceMaxRelations, int targetMaxRelations, int relPartitions) { - public static class RelationAggregator - extends Aggregator { + JavaRDD rels = readPathRelationRDD(spark, inputRelationsPath) + .filter(rel -> rel.getDataInfo().getDeletedbyinference() == false) + .filter(rel -> relationFilter.contains(rel.getRelClass()) == false); - private int maxRelations; + JavaRDD pruned = pruneRels( + pruneRels( + rels, + sourceMaxRelations, relPartitions, (Function) r -> r.getSource()), + targetMaxRelations, relPartitions, (Function) r -> r.getTarget()); + spark + .createDataset(pruned.rdd(), Encoders.bean(Relation.class)) + .repartition(relPartitions) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath); + } - public RelationAggregator(int maxRelations) { - this.maxRelations = maxRelations; - } + private static JavaRDD pruneRels(JavaRDD rels, int maxRelations, + int relPartitions, Function idFn) { + return rels + .mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, idFn.call(r)), r)) + .repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions)) + .groupBy(Tuple2::_1) + .map(Tuple2::_2) + .map(t -> Iterables.limit(t, maxRelations)) + .flatMap(Iterable::iterator) + .map(Tuple2::_2); + } - @Override - public RelationList zero() { - return new RelationList(); - } + // experimental + private static void prepareRelationsDataset( + SparkSession spark, String inputRelationsPath, String outputPath, Set relationFilter, int maxRelations, + int relPartitions) { + spark + .read() + .textFile(inputRelationsPath) + .repartition(relPartitions) + .map( + (MapFunction) s -> OBJECT_MAPPER.readValue(s, Relation.class), + Encoders.kryo(Relation.class)) + .filter((FilterFunction) rel -> rel.getDataInfo().getDeletedbyinference() == false) + .filter((FilterFunction) rel -> relationFilter.contains(rel.getRelClass()) == false) + .groupByKey( + (MapFunction) Relation::getSource, + Encoders.STRING()) + .agg(new RelationAggregator(maxRelations).toColumn()) + .flatMap( + (FlatMapFunction, Relation>) t -> Iterables + .limit(t._2().getRelations(), maxRelations) + .iterator(), + Encoders.bean(Relation.class)) + .repartition(relPartitions) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath); + } - @Override - public RelationList reduce(RelationList b, Relation a) { - b.getRelations().add(a); - return getSortableRelationList(b); - } + public static class RelationAggregator + extends Aggregator { - @Override - public RelationList merge(RelationList b1, RelationList b2) { - b1.getRelations().addAll(b2.getRelations()); - return getSortableRelationList(b1); - } + private int maxRelations; - @Override - public RelationList finish(RelationList r) { - return getSortableRelationList(r); - } + public RelationAggregator(int maxRelations) { + this.maxRelations = maxRelations; + } - private RelationList getSortableRelationList(RelationList b1) { - RelationList sr = new RelationList(); - sr - .setRelations( - b1 - .getRelations() - .stream() - .limit(maxRelations) - .collect(Collectors.toCollection(() -> new PriorityQueue<>(new RelationComparator())))); - return sr; - } + @Override + public RelationList zero() { + return new RelationList(); + } - @Override - public Encoder bufferEncoder() { - return Encoders.kryo(RelationList.class); - } + @Override + public RelationList reduce(RelationList b, Relation a) { + b.getRelations().add(a); + return getSortableRelationList(b); + } - @Override - public Encoder outputEncoder() { - return Encoders.kryo(RelationList.class); - } - } + @Override + public RelationList merge(RelationList b1, RelationList b2) { + b1.getRelations().addAll(b2.getRelations()); + return getSortableRelationList(b1); + } - /** - * Reads a JavaRDD of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text - * file, - * - * @param spark - * @param inputPath - * @return the JavaRDD containing all the relationships - */ - private static JavaRDD readPathRelationRDD( - SparkSession spark, final String inputPath) { - JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, Relation.class)); - } + @Override + public RelationList finish(RelationList r) { + return getSortableRelationList(r); + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private RelationList getSortableRelationList(RelationList b1) { + RelationList sr = new RelationList(); + sr + .setRelations( + b1 + .getRelations() + .stream() + .limit(maxRelations) + .collect(Collectors.toCollection(() -> new PriorityQueue<>(new RelationComparator())))); + return sr; + } + + @Override + public Encoder bufferEncoder() { + return Encoders.kryo(RelationList.class); + } + + @Override + public Encoder outputEncoder() { + return Encoders.kryo(RelationList.class); + } + } + + /** + * Reads a JavaRDD of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text + * file, + * + * @param spark + * @param inputPath + * @return the JavaRDD containing all the relationships + */ + private static JavaRDD readPathRelationRDD( + SparkSession spark, final String inputPath) { + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, Relation.class)); + } + + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java index bf7f9330d1..bd7b4d78ee 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelationKey.java @@ -16,18 +16,18 @@ public class SortableRelationKey implements Comparable, Ser private static final Map weights = Maps.newHashMap(); static { - weights.put("outcome", 0); - weights.put("supplement", 1); - weights.put("review", 2); - weights.put("citation", 3); - weights.put("affiliation", 4); - weights.put("relationship", 5); - weights.put("publicationDataset", 6); - weights.put("similarity", 7); + weights.put("participation", 0); - weights.put("provision", 8); - weights.put("participation", 9); - weights.put("dedup", 10); + weights.put("outcome", 1); + weights.put("affiliation", 2); + weights.put("dedup", 3); + weights.put("publicationDataset", 4); + weights.put("citation", 5); + weights.put("supplement", 6); + weights.put("review", 7); + weights.put("relationship", 8); + weights.put("provision", 9); + weights.put("similarity", 10); } private static final long serialVersionUID = 3232323; diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json index 71b2becc4d..33fa1dc8df 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json @@ -30,9 +30,16 @@ "paramRequired": false }, { - "paramName": "mr", - "paramLongName": "maxRelations", - "paramDescription": "maximum number of relations allowed for a each entity", + "paramName": "smr", + "paramLongName": "sourceMaxRelations", + "paramDescription": "maximum number of relations allowed for a each entity grouping by source", + "paramRequired": false + }, + { + "paramName": "tmr", + "paramLongName": "targetMaxRelations", + "paramDescription": "maximum number of relations allowed for a each entity grouping by target", "paramRequired": false } + ] From a7d39774812ab3ede6570e65ebe3cb902ca11667 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 10 Jul 2020 14:44:50 +0200 Subject: [PATCH 18/65] added generation of EBI Dataset --- dhp-schemas/pom.xml | 31 +++ .../dhp/schema/scholexplorer/OafUtils.scala | 90 +++++++++ dhp-workflows/dhp-graph-mapper/pom.xml | 38 ++++ .../dhp/sx/ebi/SparkAddLinkUpdates.scala | 138 +++++++++++++ .../sx/ebi/SparkCreateBaselineDataFrame.scala | 49 +++++ .../dhp/sx/ebi/SparkCreateEBIDataFrame.scala | 6 +- .../dnetlib/dhp/sx/ebi/model/PMArticle.java | 64 ++++++ .../eu/dnetlib/dhp/sx/ebi/model/PMAuthor.java | 31 +++ .../dnetlib/dhp/sx/ebi/model/PMJournal.java | 53 +++++ .../dnetlib/dhp/sx/ebi/model/PMParser.scala | 92 +++++++++ .../parser/AbstractScholexplorerParser.java | 6 +- .../parser/DatasetScholexplorerParser.java | 30 ++- .../PublicationScholexplorerParser.java | 6 +- .../dhp/sx/ebi/oozie_app/config-default.xml | 46 ++++- .../dnetlib/dhp/sx/ebi/oozie_app/workflow.xml | 52 ++++- .../java/eu/dnetlib/dhp/sx/ebi/TestEBI.scala | 20 ++ .../eu/dnetlib/dhp/sx/ebi/TestEBITODS.scala | 22 -- .../resources/eu/dnetlib/dhp/sx/ebi/rel1.json | 55 +++++ .../eu/dnetlib/dhp/sx/ebi/rel_multiple.json | 191 ++++++++++++++++++ .../java/eu/dnetlib/dhp/export/DLIToOAF.scala | 43 +--- .../oa/provision/PrepareRelationsJobTest.java | 29 ++- 21 files changed, 985 insertions(+), 107 deletions(-) create mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkAddLinkUpdates.scala create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkCreateBaselineDataFrame.scala create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMArticle.java create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMAuthor.java create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMJournal.java create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMParser.scala create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/ebi/TestEBI.scala delete mode 100644 dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/ebi/TestEBITODS.scala create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/ebi/rel1.json create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/ebi/rel_multiple.json diff --git a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml index 2e5652b430..b04d62dd29 100644 --- a/dhp-schemas/pom.xml +++ b/dhp-schemas/pom.xml @@ -14,6 +14,37 @@ This module contains common schema classes meant to be used across the dnet-hadoop submodules + + + + net.alchim31.maven + scala-maven-plugin + 4.0.1 + + + scala-compile-first + initialize + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + ${scala.version} + + + + + + diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala new file mode 100644 index 0000000000..27eec77fa2 --- /dev/null +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala @@ -0,0 +1,90 @@ +package eu.dnetlib.dhp.schema.scholexplorer + +import eu.dnetlib.dhp.schema.oaf.{DataInfo, Field, KeyValue, Qualifier, StructuredProperty} + +object OafUtils { + + + + def generateKeyValue(key: String, value: String): KeyValue = { + val kv: KeyValue = new KeyValue() + kv.setKey(key) + kv.setValue(value) + kv.setDataInfo(generateDataInfo("0.9")) + kv + } + + + def generateDataInfo(trust: String = "0.9", invisibile: Boolean = false): DataInfo = { + val di = new DataInfo + di.setDeletedbyinference(false) + di.setInferred(false) + di.setInvisible(false) + di.setTrust(trust) + di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions")) + di + } + + def createQualifier(cls: String, sch: String): Qualifier = { + createQualifier(cls, cls, sch, sch) + } + + + def createQualifier(classId: String, className: String, schemeId: String, schemeName: String): Qualifier = { + val q: Qualifier = new Qualifier + q.setClassid(classId) + q.setClassname(className) + q.setSchemeid(schemeId) + q.setSchemename(schemeName) + q + } + + + def asField[T](value: T): Field[T] = { + val tmp = new Field[T] + tmp.setValue(value) + tmp + + + } + + def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String): StructuredProperty = { + val sp = new StructuredProperty + sp.setQualifier(createQualifier(classId,className, schemeId, schemeName)) + sp.setValue(value) + sp + + } + + + + def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String, dataInfo: DataInfo): StructuredProperty = { + val sp = new StructuredProperty + sp.setQualifier(createQualifier(classId,className, schemeId, schemeName)) + sp.setValue(value) + sp.setDataInfo(dataInfo) + sp + + } + + def createSP(value: String, classId: String, schemeId: String): StructuredProperty = { + val sp = new StructuredProperty + sp.setQualifier(createQualifier(classId, schemeId)) + sp.setValue(value) + sp + + } + + + + def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = { + val sp = new StructuredProperty + sp.setQualifier(createQualifier(classId, schemeId)) + sp.setValue(value) + sp.setDataInfo(dataInfo) + sp + + } + + +} diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index 0439c2ba32..5ddcda3fa0 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -9,6 +9,37 @@ dhp-graph-mapper + + + + net.alchim31.maven + scala-maven-plugin + 4.0.1 + + + scala-compile-first + initialize + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + ${scala.version} + + + + + + @@ -61,6 +92,13 @@ org.postgresql postgresql + + org.json4s + json4s-jackson_2.11 + 3.5.3 + + + diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkAddLinkUpdates.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkAddLinkUpdates.scala new file mode 100644 index 0000000000..897bbd5407 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkAddLinkUpdates.scala @@ -0,0 +1,138 @@ +package eu.dnetlib.dhp.sx.ebi +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.schema.oaf.{Instance, KeyValue, Oaf} +import eu.dnetlib.dhp.schema.scholexplorer.OafUtils.createQualifier +import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIRelation, OafUtils, ProvenaceInfo} +import eu.dnetlib.dhp.utils.DHPUtils +import eu.dnetlib.scholexplorer.relation.RelationMapper +import org.apache.commons.io.IOUtils +import org.apache.spark.SparkConf +import org.apache.spark.sql._ +import org.json4s +import org.json4s.DefaultFormats +import org.json4s.JsonAST.{JField, JObject, JString} +import org.json4s.jackson.JsonMethods.parse + +import scala.collection.JavaConverters._ + +object SparkAddLinkUpdates { + + val relationMapper = RelationMapper.load + + +case class EBILinks(relation:String, pubdate:String, tpid:String, tpidType:String, turl:String, title:String, publisher:String) {} + + + def generatePubmedDLICollectedFrom(): KeyValue = { + OafUtils.generateKeyValue("dli_________::europe_pmc__", "Europe PMC") + } + + + def ebiLinksToOaf(input:(String, String)):List[Oaf] = { + val pmid :String = input._1 + val input_json :String = input._2 + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: json4s.JValue = parse(input_json) + + + val targets:List[EBILinks] = for { + JObject(link) <- json \\ "Category" \\ "Link" + JField("PublicationDate", JString(pubdate)) <- link + JField("RelationshipType", JObject(relationshipType)) <- link + JField("Name", JString(relname)) <- relationshipType + JField("Target", JObject(target)) <- link + JField("Identifier", JObject(identifier)) <- target + JField("ID", JString(tpid)) <- identifier + JField("IDScheme", JString(tpidtype)) <- identifier + JField("IDURL", JString(turl)) <- identifier + JField("Title", JString(title)) <- target + JField("Publisher", JObject(pub)) <- target + JField("Name", JString(publisher)) <- pub + } yield EBILinks(relname, pubdate, tpid, tpidtype, turl,title, publisher) + + + + val dnetPublicationId = s"50|${DHPUtils.md5(s"$pmid::pmid")}" + + targets.flatMap(l => { + val relation = new DLIRelation + val inverseRelation = new DLIRelation + val targetDnetId = s"50|${DHPUtils.md5(s"${l.tpid.toLowerCase.trim}::${l.tpidType.toLowerCase.trim}")}" + val relInfo = relationMapper.get(l.relation.toLowerCase) + val relationSemantic = relInfo.getOriginal + val inverseRelationSemantic = relInfo.getInverse + + relation.setSource(dnetPublicationId) + relation.setTarget(targetDnetId) + relation.setRelClass("datacite") + relation.setRelType(relationSemantic) + relation.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava) + + inverseRelation.setSource(targetDnetId) + inverseRelation.setTarget(dnetPublicationId) + inverseRelation.setRelClass("datacite") + inverseRelation.setRelType(inverseRelationSemantic) + inverseRelation.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava) + + + + val d = new DLIDataset + d.setId(targetDnetId) + d.setDataInfo(OafUtils.generateDataInfo()) + d.setPid(List(OafUtils.createSP(l.tpid.toLowerCase.trim, l.tpidType.toLowerCase.trim, "dnet:pid_types")).asJava) + d.setCompletionStatus("complete") + val pi = new ProvenaceInfo + pi.setId("dli_________::europe_pmc__") + pi.setName( "Europe PMC") + pi.setCompletionStatus("complete") + pi.setCollectionMode("collected") + d.setDlicollectedfrom(List(pi).asJava) + d.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava) + d.setPublisher(OafUtils.asField(l.publisher)) + d.setTitle(List(OafUtils.createSP(l.title, "main title", "dnet:dataCite_title")).asJava) + d.setDateofacceptance(OafUtils.asField(l.pubdate)) + val i = new Instance + i.setCollectedfrom(generatePubmedDLICollectedFrom()) + i.setDateofacceptance(d.getDateofacceptance) + i.setUrl(List(l.turl).asJava) + i.setInstancetype(createQualifier("0021", "Dataset", "dnet:publication_resource", "dnet:publication_resource")) + d.setInstance(List(i).asJava) + List(relation, inverseRelation, d) + }) + } + + + def main(args: Array[String]): Unit = { + val conf: SparkConf = new SparkConf() + val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json"))) + parser.parseArgument(args) + val spark: SparkSession = + SparkSession + .builder() + .config(conf) + .appName(SparkCreateEBIDataFrame.getClass.getSimpleName) + .master(parser.get("master")).getOrCreate() + + + val workingPath = parser.get("workingPath") + implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) + implicit val relEncoder: Encoder[DLIRelation] = Encoders.kryo(classOf[DLIRelation]) + implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo(classOf[DLIDataset]) + + val ds:Dataset[(String,String)] = spark.read.load(s"$workingPath/baseline_links_updates").as[(String,String)](Encoders.tuple(Encoders.STRING, Encoders.STRING)) + + ds.flatMap(l =>ebiLinksToOaf(l)).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_oaf") + + ds.filter(s => s.isInstanceOf) + + + + val oDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/baseline_links_updates_oaf").as[Oaf] + + oDataset.filter(p =>p.isInstanceOf[DLIRelation]).map(p => p.asInstanceOf[DLIRelation]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_relation") + oDataset.filter(p =>p.isInstanceOf[DLIDataset]).map(p => p.asInstanceOf[DLIDataset]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_dataset") + + + + } +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkCreateBaselineDataFrame.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkCreateBaselineDataFrame.scala new file mode 100644 index 0000000000..77e03c9b3e --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkCreateBaselineDataFrame.scala @@ -0,0 +1,49 @@ +package eu.dnetlib.dhp.sx.ebi + +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import org.apache.commons.io.IOUtils +import org.apache.spark.SparkConf +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import eu.dnetlib.dhp.sx.ebi.model.{PMArticle, PMAuthor, PMJournal, PMParser} + + +import scala.io.Source +import scala.xml.pull.XMLEventReader + +object SparkCreateBaselineDataFrame { + + + def main(args: Array[String]): Unit = { + val conf: SparkConf = new SparkConf() + val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json"))) + parser.parseArgument(args) + val spark: SparkSession = + SparkSession + .builder() + .config(conf) + .appName(SparkCreateEBIDataFrame.getClass.getSimpleName) + .master(parser.get("master")).getOrCreate() + + val sc = spark.sparkContext + + val workingPath = parser.get("workingPath") + + implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle]) + implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal]) + implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor]) + val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline",2000) + + val ds:Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i =>{ + val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes())) + new PMParser(xml) + + } )) + + ds.write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset") + + + + + } +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkCreateEBIDataFrame.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkCreateEBIDataFrame.scala index c7cc3ed048..60857f0fcf 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkCreateEBIDataFrame.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/SparkCreateEBIDataFrame.scala @@ -51,7 +51,7 @@ object SparkCreateEBIDataFrame { spark.createDataset(oafPubsRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/oaf") logger.info("Extract Publication and relation from dataset_xml") - val oafDatsRDD:RDD[Oaf] = sc.textFile(s"$workingPath/_dataset_xml").map(s => + val oafDatsRDD:RDD[Oaf] = sc.textFile(s"$workingPath/dataset_xml").map(s => { new ObjectMapper().readValue(s, classOf[String]) }).flatMap(s => { @@ -79,5 +79,9 @@ object SparkCreateEBIDataFrame { .agg(EBIAggregator.getRelationAggregator().toColumn) .map(p => p._2) .write.mode(SaveMode.Overwrite).save(s"$workingPath/relation") + + + + relations.map(r => (r.getSource, r.getTarget))(Encoders.tuple(Encoders.STRING,Encoders.STRING)) } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMArticle.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMArticle.java new file mode 100644 index 0000000000..75d4628e67 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMArticle.java @@ -0,0 +1,64 @@ + +package eu.dnetlib.dhp.sx.ebi.model; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + +public class PMArticle implements Serializable { + + private String pmid; + private String date; + private PMJournal journal; + private String title; + private String description; + private List authors = new ArrayList<>(); + + public String getPmid() { + return pmid; + } + + public void setPmid(String pmid) { + this.pmid = pmid; + } + + public String getDate() { + return date; + } + + public void setDate(String date) { + this.date = date; + } + + public PMJournal getJournal() { + return journal; + } + + public void setJournal(PMJournal journal) { + this.journal = journal; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getDescription() { + return description; + } + + public void setDescription(String description) { + this.description = description; + } + + public List getAuthors() { + return authors; + } + + public void setAuthors(List authors) { + this.authors = authors; + } +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMAuthor.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMAuthor.java new file mode 100644 index 0000000000..4a21985424 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMAuthor.java @@ -0,0 +1,31 @@ + +package eu.dnetlib.dhp.sx.ebi.model; + +import java.io.Serializable; + +public class PMAuthor implements Serializable { + + private String lastName; + private String foreName; + + public String getLastName() { + return lastName; + } + + public void setLastName(String lastName) { + this.lastName = lastName; + } + + public String getForeName() { + return foreName; + } + + public void setForeName(String foreName) { + this.foreName = foreName; + } + + public String getFullName() { + return String.format("%s, %s", this.foreName, this.lastName); + } + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMJournal.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMJournal.java new file mode 100644 index 0000000000..d4ff5a1580 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMJournal.java @@ -0,0 +1,53 @@ + +package eu.dnetlib.dhp.sx.ebi.model; + +import java.io.Serializable; + +public class PMJournal implements Serializable { + + private String issn; + private String volume; + private String issue; + private String date; + private String title; + + public String getIssn() { + return issn; + } + + public void setIssn(String issn) { + this.issn = issn; + } + + public String getVolume() { + return volume; + } + + public void setVolume(String volume) { + this.volume = volume; + } + + public String getIssue() { + return issue; + } + + public void setIssue(String issue) { + this.issue = issue; + } + + public String getDate() { + return date; + } + + public void setDate(String date) { + this.date = date; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMParser.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMParser.scala new file mode 100644 index 0000000000..903eba1347 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/model/PMParser.scala @@ -0,0 +1,92 @@ +package eu.dnetlib.dhp.sx.ebi.model +import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader} +class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] { + + var currentArticle:PMArticle = generateNextArticle() + + override def hasNext: Boolean = currentArticle!= null + + override def next(): PMArticle = { + val tmp = currentArticle + currentArticle = generateNextArticle() + tmp + } + + + def generateNextArticle():PMArticle = { + + var currentAuthor: PMAuthor = null + var currentJournal: PMJournal = null + var currNode: String = null + var currentYear = "0" + var currentMonth = "01" + var currentDay = "01" + + while (xml.hasNext) { + xml.next match { + case EvElemStart(_, label, _, _) => + currNode = label + label match { + case "PubmedArticle" => currentArticle = new PMArticle + case "Author" => currentAuthor = new PMAuthor + case "Journal" => currentJournal = new PMJournal + case _ => + } + case EvElemEnd(_, label) => + label match { + case "PubmedArticle" => return currentArticle + case "Author" => currentArticle.getAuthors.add(currentAuthor) + case "Journal" => currentArticle.setJournal(currentJournal) + case "DateCompleted" => currentArticle.setDate(s"$currentYear-$currentMonth-$currentDay") + case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay") + case _ => + } + case EvText(text) => + if (currNode!= null && text.trim.nonEmpty) + currNode match { + case "ArticleTitle" => { + if (currentArticle.getTitle==null) + currentArticle.setTitle(text.trim) + else + currentArticle.setTitle(currentArticle.getTitle + text.trim) + } + case "AbstractText" => { + if (currentArticle.getDescription==null) + currentArticle.setDescription(text.trim) + else + currentArticle.setDescription(currentArticle.getDescription + text.trim) + } + case "PMID" => currentArticle.setPmid(text.trim) + case "ISSN" => currentJournal.setIssn(text.trim) + case "Year" => currentYear = text.trim + case "Month" => currentMonth = text.trim + case "Day" => currentDay = text.trim + case "Volume" => currentJournal.setVolume( text.trim) + case "Issue" => currentJournal.setIssue (text.trim) + case "LastName" => { + if (currentAuthor != null) + currentAuthor.setLastName(text.trim) + + } + case "ForeName" => if (currentAuthor != null) + currentAuthor.setForeName(text.trim) + case "Title" => + if (currentJournal.getTitle==null) + currentJournal.setTitle(text.trim) + else + currentJournal.setTitle(currentJournal.getTitle + text.trim) + case _ => + + } + case _ => + } + + } + null + } +} + + + + + diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java index f0c7893239..75f28c1299 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java @@ -150,8 +150,8 @@ public abstract class AbstractScholexplorerParser { return uk; } - - protected Qualifier generateQualifier(final String classId, final String className, final String schemeId, final String schemeName) { + protected Qualifier generateQualifier(final String classId, final String className, final String schemeId, + final String schemeName) { final Qualifier q = new Qualifier(); q.setClassid(classId); q.setClassid(className); @@ -159,8 +159,6 @@ public abstract class AbstractScholexplorerParser { q.setSchemename(schemeName); return q; - - } protected void generateRelations( diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java index c5c6b1493f..60371fa536 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java @@ -148,42 +148,37 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser { inferPid(currentPid); parsedObject.setPid(Collections.singletonList(currentPid)); - String resolvedURL = null; switch (currentPid.getQualifier().getClassname().toLowerCase()) { case "uniprot": - resolvedURL ="https://www.uniprot.org/uniprot/"+currentPid.getValue(); + resolvedURL = "https://www.uniprot.org/uniprot/" + currentPid.getValue(); break; case "ena": if (StringUtils.isNotBlank(currentPid.getValue()) && currentPid.getValue().length() > 7) - resolvedURL ="https://www.ebi.ac.uk/ena/data/view/"+currentPid.getValue().substring(0,8); + resolvedURL = "https://www.ebi.ac.uk/ena/data/view/" + currentPid.getValue().substring(0, 8); break; case "chembl": - resolvedURL ="https://www.ebi.ac.uk/chembl/compound_report_card/"+currentPid.getValue(); + resolvedURL = "https://www.ebi.ac.uk/chembl/compound_report_card/" + currentPid.getValue(); break; case "ncbi-n": - resolvedURL ="https://www.ncbi.nlm.nih.gov/nuccore/"+currentPid.getValue(); + resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue(); break; case "ncbi-p": - resolvedURL ="https://www.ncbi.nlm.nih.gov/nuccore/"+currentPid.getValue(); + resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue(); break; case "genbank": - resolvedURL ="https://www.ncbi.nlm.nih.gov/nuccore/"+currentPid.getValue(); + resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue(); break; case "pdb": - resolvedURL ="https://www.ncbi.nlm.nih.gov/nuccore/"+currentPid.getValue(); + resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue(); break; case "url": - resolvedURL =currentPid.getValue(); + resolvedURL = currentPid.getValue(); break; } - - - - final String sourceId = generateId( currentPid.getValue(), currentPid.getQualifier().getClassid(), "dataset"); parsedObject.setId(sourceId); @@ -286,7 +281,11 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser { t -> { final StructuredProperty st = new StructuredProperty(); st.setValue(t); - st.setQualifier(generateQualifier( "main title","main title", "dnet:dataCite_title","dnet:dataCite_title")); + st + .setQualifier( + generateQualifier( + "main title", "main title", "dnet:dataCite_title", + "dnet:dataCite_title")); return st; }) .collect(Collectors.toList())); @@ -318,8 +317,7 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser { .collect(Collectors.toList())); } - - if(StringUtils.isNotBlank(resolvedURL)) { + if (StringUtils.isNotBlank(resolvedURL)) { Instance i = new Instance(); i.setCollectedfrom(parsedObject.getCollectedfrom().get(0)); i.setUrl(Collections.singletonList(resolvedURL)); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java index b66bfd161a..8d76004dcb 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java @@ -202,7 +202,11 @@ public class PublicationScholexplorerParser extends AbstractScholexplorerParser t -> { final StructuredProperty st = new StructuredProperty(); st.setValue(t); - st.setQualifier(generateQualifier( "main title","main title", "dnet:dataCite_title","dnet:dataCite_title")); + st + .setQualifier( + generateQualifier( + "main title", "main title", "dnet:dataCite_title", + "dnet:dataCite_title")); return st; }) .collect(Collectors.toList())); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/config-default.xml index cf617a84c4..cac3cc2bb0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/config-default.xml @@ -1,4 +1,7 @@ + + + + + + + + jobTracker + yarn + + + nameNode + hdfs://hadoop-rm1.garr-pa1.d4science.org:8020 + + + hive_metastore_uris + thrift://hadoop-edge3.garr-pa1.d4science.org:9083 + + + spark2YarnHistoryServerAddress + http://hadoop-rm2.garr-pa1.d4science.org:19888 + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + spark2EventLogDir /user/spark/spark2ApplicationHistory diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/workflow.xml index b816696747..a5035c56c6 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/workflow.xml @@ -18,7 +18,7 @@ - + @@ -26,13 +26,59 @@ + + + yarn-cluster + cluster + Create Baselnie DataSet + + eu.dnetlib.dhp.sx.ebi.SparkCreateBaselineDataFrame + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=1 + --driver-memory=${sparkDriverMemory} + --executor-cores=${sparkExecutorCores} + ${sparkExtraOPT} + + --workingPath${workingPath} + --masteryarn + + + + + + + + yarn-cluster + cluster + Create Baselnie DataSet + + eu.dnetlib.dhp.sx.ebi.SparkAddLinkUpdates + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=1 + --driver-memory=${sparkDriverMemory} + --executor-cores=${sparkExecutorCores} + ${sparkExtraOPT} + + --workingPath${workingPath} + --masteryarn + + + + + + yarn-cluster cluster Create EBI DataSet + eu.dnetlib.dhp.sx.ebi.SparkCreateEBIDataFrame - dhp-doiboost-${projectVersion}.jar + dhp-graph-mapper-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -41,7 +87,7 @@ ${sparkExtraOPT} --workingPath${workingPath} - --masteryarn-cluster + --masteryarn diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/ebi/TestEBI.scala b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/ebi/TestEBI.scala new file mode 100644 index 0000000000..fa390a21b1 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/ebi/TestEBI.scala @@ -0,0 +1,20 @@ +package eu.dnetlib.dhp.sx.ebi + +import org.junit.jupiter.api.Test + +class TestEBI { + + + + @Test + def testEBIData() = { + SparkAddLinkUpdates.main("-mt local[*] -w /home/sandro/Downloads".split(" ")) + + + + + + + } + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/ebi/TestEBITODS.scala b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/ebi/TestEBITODS.scala deleted file mode 100644 index 979bf7e14c..0000000000 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/ebi/TestEBITODS.scala +++ /dev/null @@ -1,22 +0,0 @@ -package eu.dnetlib.dhp.sx.ebi - -import org.junit.jupiter.api.Test - -class TestEBITODS { - - - @Test - def testEBI():Unit = { - - - - - - - - - } - - - -} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/ebi/rel1.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/ebi/rel1.json new file mode 100644 index 0000000000..038b84a499 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/ebi/rel1.json @@ -0,0 +1,55 @@ +{ + "Category": [ + { + "Section": [ + { + "Linklist": { + "Link": [ + { + "LinkProvider": { + "Name": "Europe PMC" + }, + "Target": { + "Publisher": { + "Name": "Altmetric" + }, + "ImageURL": "https://api.altmetric.com/v1/donut/58578459_64.png", + "Identifier": { + "ID": "https://www.altmetric.com/details/58578459", + "IDScheme": "URL", + "IDURL": "https://www.altmetric.com/details/58578459" + }, + "Type": { + "Name": "dataset" + }, + "Title": "Optical clumped isotope thermometry of carbon dioxide" + }, + "Source": { + "Identifier": { + "ID": "30886173", + "IDScheme": "PMID" + }, + "Type": { + "Name": "literature" + } + }, + "PublicationDate": "06-04-2019", + "RelationshipType": { + "Name": "IsReferencedBy" + }, + "ObtainedBy": "ext_links" + } + ] + }, + "ObtainedBy": "ext_links", + "SectionLinkCount": 1, + "Tags": [ + "altmetrics" + ] + } + ], + "CategoryLinkCount": 1, + "Name": "Altmetric" + } + ] +} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/ebi/rel_multiple.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/ebi/rel_multiple.json new file mode 100644 index 0000000000..2ad55861e8 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/ebi/rel_multiple.json @@ -0,0 +1,191 @@ +{ + "version": "6.3", + "hitCount": 4, + "request": { + "id": "28818901", + "source": "MED" + }, + "dataLinkList": { + "Category": [ + { + "Name": "Nucleotide Sequences", + "CategoryLinkCount": 3, + "Section": [ + { + "ObtainedBy": "tm_accession", + "Tags": [ + "supporting_data" + ], + "SectionLinkCount": 1, + "Linklist": { + "Link": [ + { + "ObtainedBy": "tm_accession", + "PublicationDate": "27-02-2020", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "References" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "28818901", + "IDScheme": "MED" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "AP008937", + "IDScheme": "ENA", + "IDURL": "http://identifiers.org/ena.embl/AP008937" + }, + "Title": "AP008937", + "Publisher": { + "Name": "Europe PMC" + } + }, + "Frequency": 1 + } + ] + } + }, + { + "ObtainedBy": "submission", + "Tags": [ + "related_data" + ], + "SectionLinkCount": 2, + "CollectionURL": "http://www.ebi.ac.uk/ena/data/search?query=28818901", + "Linklist": { + "Link": [ + { + "ObtainedBy": "submission", + "PublicationDate": "25-06-2018", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "IsReferencedBy" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "28818901", + "IDScheme": "PMID" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "NIWV01000000", + "IDScheme": "ENA", + "IDURL": "http://www.ebi.ac.uk/ena/data/view/NIWV01000000" + }, + "Title": "Nucleotide sequences", + "Publisher": { + "Name": "ENA" + } + } + }, + { + "ObtainedBy": "submission", + "PublicationDate": "25-06-2018", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "IsReferencedBy" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "28818901", + "IDScheme": "PMID" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "PRJNA390617", + "IDScheme": "ENA", + "IDURL": "http://www.ebi.ac.uk/ena/data/view/PRJNA390617" + }, + "Title": "Lactobacillus fermentum strain:BFE 6620", + "Publisher": { + "Name": "ENA" + } + } + } + ] + } + } + ] + }, + { + "Name": "BioStudies: supplemental material and supporting data", + "CategoryLinkCount": 1, + "Section": [ + { + "ObtainedBy": "ext_links", + "Tags": [ + "supporting_data" + ], + "SectionLinkCount": 1, + "Linklist": { + "Link": [ + { + "ObtainedBy": "ext_links", + "PublicationDate": "24-07-2018", + "LinkProvider": { + "Name": "Europe PMC" + }, + "RelationshipType": { + "Name": "IsReferencedBy" + }, + "Source": { + "Type": { + "Name": "literature" + }, + "Identifier": { + "ID": "28818901", + "IDScheme": "PMID" + } + }, + "Target": { + "Type": { + "Name": "dataset" + }, + "Identifier": { + "ID": "http://www.ebi.ac.uk/biostudies/studies/S-EPMC5604774?xr=true", + "IDScheme": "URL", + "IDURL": "http://www.ebi.ac.uk/biostudies/studies/S-EPMC5604774?xr=true" + }, + "Title": "Draft Genome Sequence of Lactobacillus fermentum BFE 6620, a Potential Starter Culture for African Vegetable Foods, Isolated from Fermented Cassava.", + "Publisher": { + "Name": "BioStudies: supplemental material and supporting data" + } + } + } + ] + } + } + ] + } + ] + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala index 967834d694..86b68fbd2a 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala @@ -5,11 +5,12 @@ import java.time.format.DateTimeFormatter import eu.dnetlib.dhp.common.PacePerson import eu.dnetlib.dhp.schema.action.AtomicAction -import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty} +import eu.dnetlib.dhp.schema.oaf.{Author, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty} import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIRelation} import eu.dnetlib.dhp.utils.DHPUtils import org.apache.commons.lang3.StringUtils import org.codehaus.jackson.map.ObjectMapper +import eu.dnetlib.dhp.schema.scholexplorer.OafUtils._ import scala.collection.JavaConverters._ @@ -426,46 +427,6 @@ object DLIToOAF { } - def generateKeyValue(key: String, value: String): KeyValue = { - val kv: KeyValue = new KeyValue() - kv.setKey(key) - kv.setValue(value) - kv.setDataInfo(generateDataInfo("0.9")) - kv - } - def generateDataInfo(trust: String = "0.9", invisibile: Boolean = false): DataInfo = { - val di = new DataInfo - di.setDeletedbyinference(false) - di.setInferred(false) - di.setInvisible(false) - di.setTrust(trust) - di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions")) - di - } - - def createQualifier(cls: String, sch: String): Qualifier = { - createQualifier(cls, cls, sch, sch) - } - - - def createQualifier(classId: String, className: String, schemeId: String, schemeName: String): Qualifier = { - val q: Qualifier = new Qualifier - q.setClassid(classId) - q.setClassname(className) - q.setSchemeid(schemeId) - q.setSchemename(schemeName) - q - } - - - def asField[T](value: T): Field[T] = { - val tmp = new Field[T] - tmp.setValue(value) - tmp - - - } - } diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJobTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJobTest.java index c16bbc6fba..528532eddd 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJobTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJobTest.java @@ -1,9 +1,10 @@ package eu.dnetlib.dhp.oa.provision; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport; -import eu.dnetlib.dhp.schema.oaf.Relation; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FilterFunction; @@ -19,9 +20,10 @@ import org.junit.jupiter.api.io.TempDir; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport; +import eu.dnetlib.dhp.schema.oaf.Relation; public class PrepareRelationsJobTest { @@ -74,14 +76,19 @@ public class PrepareRelationsJobTest { "-maxRelations", String.valueOf(maxRelations) }); - Dataset out = spark.read() - .parquet(testPath.toString()) - .as(Encoders.bean(Relation.class)) - .cache(); + Dataset out = spark + .read() + .parquet(testPath.toString()) + .as(Encoders.bean(Relation.class)) + .cache(); Assertions.assertEquals(10, out.count()); - Dataset freq = out.toDF().cube(SUBRELTYPE).count().filter((FilterFunction) value -> !value.isNullAt(0)); + Dataset freq = out + .toDF() + .cube(SUBRELTYPE) + .count() + .filter((FilterFunction) value -> !value.isNullAt(0)); long outcome = freq.filter(freq.col(SUBRELTYPE).equalTo(OUTCOME)).collectAsList().get(0).getAs("count"); long supplement = freq.filter(freq.col(SUBRELTYPE).equalTo(SUPPLEMENT)).collectAsList().get(0).getAs("count"); From 752d28f8eb955128eb59c5f71e3f0b652f003319 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 10 Jul 2020 15:09:50 +0200 Subject: [PATCH 19/65] make the relations produced by the dedup SparkPropagateRelation jon unique --- .../dhp/oa/dedup/SparkPropagateRelation.java | 45 ++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java index 5168085116..e65eb7ab57 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java @@ -7,6 +7,7 @@ import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.*; +import org.apache.spark.sql.expressions.Aggregator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -95,7 +96,49 @@ public class SparkPropagateRelation extends AbstractSparkAction { FieldType.TARGET, getDeletedFn()); - save(newRels.union(updated).union(mergeRels), outputRelationPath, SaveMode.Overwrite); + save(distinctRelations(newRels.union(updated).union(mergeRels)), outputRelationPath, SaveMode.Overwrite); + } + + private Dataset distinctRelations(Dataset rels) { + return rels + .groupByKey((MapFunction) r -> ModelSupport.idFn().apply(r), Encoders.STRING()) + .agg(new RelationAggregator().toColumn()) + .map((MapFunction, Relation>) t -> t._2(), Encoders.bean(Relation.class)); + } + + class RelationAggregator extends Aggregator { + + @Override + public Relation zero() { + return new Relation(); + } + + @Override + public Relation reduce(Relation b, Relation a) { + b.mergeFrom(a); + return b; + } + + @Override + public Relation merge(Relation b, Relation a) { + b.mergeFrom(a); + return b; + } + + @Override + public Relation finish(Relation r) { + return r; + } + + @Override + public Encoder bufferEncoder() { + return Encoders.bean(Relation.class); + } + + @Override + public Encoder outputEncoder() { + return Encoders.bean(Relation.class); + } } private static Dataset processDataset( From e1ae964bc462e532cd6e61369899677941dc2a4a Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Fri, 10 Jul 2020 16:12:08 +0200 Subject: [PATCH 20/65] stats --- .../dhp/broker/oa/GenerateStatsJob.java | 63 +++++++++++++++++++ .../dnetlib/dhp/broker/oa/JoinStep2Job.java | 5 +- .../aggregators/stats/DatasourceStats.java | 61 ++++++++++++++++++ .../aggregators/stats/StatsAggregator.java | 59 +++++++++++++++++ .../broker/oa/partial/oozie_app/workflow.xml | 8 +-- .../dhp/oa/dedup/EntityMergerTest.java | 5 +- .../oa/graph/clean/CleanGraphSparkJob.java | 5 +- 7 files changed, 194 insertions(+), 12 deletions(-) create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateStatsJob.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/stats/DatasourceStats.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/stats/StatsAggregator.java diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateStatsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateStatsJob.java new file mode 100644 index 0000000000..a51601cd76 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateStatsJob.java @@ -0,0 +1,63 @@ + +package eu.dnetlib.dhp.broker.oa; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.TypedColumn; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.broker.model.Event; +import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; +import eu.dnetlib.dhp.broker.oa.util.aggregators.stats.DatasourceStats; +import eu.dnetlib.dhp.broker.oa.util.aggregators.stats.StatsAggregator; + +public class GenerateStatsJob { + + private static final Logger log = LoggerFactory.getLogger(GenerateStatsJob.class); + + public static void main(final String[] args) throws Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + IndexOnESJob.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final SparkConf conf = new SparkConf(); + + final String eventsPath = parser.get("workingPath") + "/events"; + log.info("eventsPath: {}", eventsPath); + + final String statsPath = parser.get("workingPath") + "/stats"; + log.info("stats: {}", statsPath); + + final TypedColumn aggr = new StatsAggregator().toColumn(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + + final Dataset stats = ClusterUtils + .readPath(spark, eventsPath, Event.class) + .groupByKey(e -> e.getMap().getTargetDatasourceId(), Encoders.STRING()) + .agg(aggr) + .map(t -> t._2, Encoders.bean(DatasourceStats.class)); + + ClusterUtils.save(stats, statsPath, DatasourceStats.class, null); + }); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java index cdcf0add4a..55ab497f07 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java @@ -7,7 +7,6 @@ import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.TypedColumn; @@ -65,9 +64,7 @@ public class JoinStep2Job { final Dataset dataset = sources .joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer") - .groupByKey( - (MapFunction, String>) t -> t._1.getOpenaireId(), - Encoders.STRING()) + .groupByKey(t -> t._1.getOpenaireId(), Encoders.STRING()) .agg(aggr) .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class)); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/stats/DatasourceStats.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/stats/DatasourceStats.java new file mode 100644 index 0000000000..8b628809db --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/stats/DatasourceStats.java @@ -0,0 +1,61 @@ + +package eu.dnetlib.dhp.broker.oa.util.aggregators.stats; + +import java.io.Serializable; +import java.util.HashMap; +import java.util.Map; + +public class DatasourceStats implements Serializable { + + /** + * + */ + private static final long serialVersionUID = -282112564184047677L; + + private String id; + private String name; + private String type; + private Map topics = new HashMap<>(); + + public String getId() { + return id; + } + + public void setId(final String id) { + this.id = id; + } + + public String getName() { + return name; + } + + public void setName(final String name) { + this.name = name; + } + + public String getType() { + return type; + } + + public void setType(final String type) { + this.type = type; + } + + public Map getTopics() { + return topics; + } + + public void setTopics(final Map topics) { + this.topics = topics; + } + + public void incrementTopic(final String topic, final long inc) { + if (topics.containsKey(topic)) { + topics.put(topic, topics.get(topic) + inc); + } else { + topics.put(topic, inc); + } + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/stats/StatsAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/stats/StatsAggregator.java new file mode 100644 index 0000000000..5aa6698e39 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/stats/StatsAggregator.java @@ -0,0 +1,59 @@ + +package eu.dnetlib.dhp.broker.oa.util.aggregators.stats; + +import org.apache.commons.lang.StringUtils; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.expressions.Aggregator; + +import eu.dnetlib.dhp.broker.model.Event; + +public class StatsAggregator extends Aggregator { + + /** + * + */ + private static final long serialVersionUID = 6652105853037330529L; + + @Override + public DatasourceStats zero() { + return new DatasourceStats(); + } + + @Override + public DatasourceStats reduce(final DatasourceStats stats, final Event e) { + stats.setId(e.getMap().getTargetDatasourceId()); + stats.setName(e.getMap().getTargetDatasourceName()); + stats.setType(e.getMap().getTargetDatasourceType()); + stats.incrementTopic(e.getTopic(), 1l); + return stats; + } + + @Override + public DatasourceStats merge(final DatasourceStats stats0, final DatasourceStats stats1) { + if (StringUtils.isBlank(stats0.getId())) { + stats0.setId(stats1.getId()); + stats0.setName(stats1.getName()); + stats0.setType(stats1.getType()); + } + stats1.getTopics().entrySet().forEach(e -> stats0.incrementTopic(e.getKey(), e.getValue())); + return stats0; + } + + @Override + public Encoder bufferEncoder() { + return Encoders.bean(DatasourceStats.class); + + } + + @Override + public DatasourceStats finish(final DatasourceStats stats) { + return stats; + } + + @Override + public Encoder outputEncoder() { + return Encoders.bean(DatasourceStats.class); + + } +} diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml index d19ad6c5a0..b4155f93ff 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml @@ -64,19 +64,19 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + yarn cluster - Count - eu.dnetlib.dhp.broker.oa.CheckDuplictedIdsJob + GenerateStatsJob + eu.dnetlib.dhp.broker.oa.GenerateStatsJob dhp-broker-events-${projectVersion}.jar --executor-cores=${sparkExecutorCores} diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java index 513e14f073..3d45f666bc 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java @@ -47,10 +47,11 @@ public class EntityMergerTest implements Serializable { @Test public void softwareMergerTest() throws InstantiationException, IllegalAccessException { - List> softwares = readSample(testEntityBasePath + "/software_merge.json", Software.class); + List> softwares = readSample( + testEntityBasePath + "/software_merge.json", Software.class); Software merged = DedupRecordFactory - .entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class); + .entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class); System.out.println(merged.getBestaccessright().getClassid()); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java index fd707e949a..7091d97403 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java @@ -8,7 +8,6 @@ import java.util.Objects; import java.util.Optional; import java.util.stream.Collectors; -import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; @@ -24,6 +23,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper; import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; @@ -151,7 +151,8 @@ public class CleanGraphSparkJob { if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) { Qualifier bestaccessrights = AbstractMdRecordToOafMapper.createBestAccessRights(r.getInstance()); if (Objects.isNull(bestaccessrights)) { - r.setBestaccessright( + r + .setBestaccessright( qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES)); } else { r.setBestaccessright(bestaccessrights); From 4c3836f62e3358d173163aef8de11ead52fcc707 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 10 Jul 2020 19:00:44 +0200 Subject: [PATCH 21/65] materialize the related entities before joining them --- .../CreateRelatedEntitiesJob_phase1.java | 25 ++++++++++++++++--- .../CreateRelatedEntitiesJob_phase2.java | 17 +++++-------- .../dhp/oa/provision/ProvisionConstants.java | 14 +++++++++++ 3 files changed, 42 insertions(+), 14 deletions(-) create mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java index 80b8000173..57dca7bb15 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java @@ -9,6 +9,7 @@ import java.util.Optional; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; @@ -115,11 +116,21 @@ public class CreateRelatedEntitiesJob_phase1 { Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class))) .cache(); - Dataset> entities = readPathEntity(spark, inputEntityPath, clazz) + final String relatedEntityPath = outputPath + "_relatedEntity"; + readPathEntity(spark, inputEntityPath, clazz) .filter("dataInfo.invisible == false") .map( (MapFunction) value -> asRelatedEntity(value, clazz), Encoders.kryo(RelatedEntity.class)) + .repartition(5000) + .write() + .mode(SaveMode.Overwrite) + .parquet(relatedEntityPath); + + Dataset> entities = spark + .read() + .load(relatedEntityPath) + .as(Encoders.kryo(RelatedEntity.class)) .map( (MapFunction>) e -> new Tuple2<>(e.getId(), e), Encoders.tuple(Encoders.STRING(), Encoders.kryo(RelatedEntity.class))) @@ -165,13 +176,21 @@ public class CreateRelatedEntitiesJob_phase1 { Result result = (Result) entity; if (result.getTitle() != null && !result.getTitle().isEmpty()) { - re.setTitle(result.getTitle().stream().findFirst().get()); + final StructuredProperty title = result.getTitle().stream().findFirst().get(); + title.setValue(StringUtils.left(title.getValue(), ProvisionConstants.MAX_TITLE_LENGTH)); + re.setTitle(title); } re.setDateofacceptance(getValue(result.getDateofacceptance())); re.setPublisher(getValue(result.getPublisher())); re.setResulttype(result.getResulttype()); - re.setInstances(result.getInstance()); + re + .setInstances( + result + .getInstance() + .stream() + .limit(ProvisionConstants.MAX_INSTANCES) + .collect(Collectors.toList())); // TODO still to be mapped // re.setCodeRepositoryUrl(j.read("$.coderepositoryurl")); diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java index bfcc648a35..7e175121e5 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java @@ -61,12 +61,6 @@ public class CreateRelatedEntitiesJob_phase2 { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final int MAX_EXTERNAL_ENTITIES = 50; - private static final int MAX_AUTHORS = 200; - private static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000; - private static final int MAX_TITLE_LENGTH = 5000; - private static final int MAX_ABSTRACT_LENGTH = 100000; - public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils @@ -246,15 +240,15 @@ public class CreateRelatedEntitiesJob_phase2 { List refs = r .getExternalReference() .stream() - .limit(MAX_EXTERNAL_ENTITIES) + .limit(ProvisionConstants.MAX_EXTERNAL_ENTITIES) .collect(Collectors.toList()); r.setExternalReference(refs); } if (r.getAuthor() != null) { List authors = Lists.newArrayList(); for (Author a : r.getAuthor()) { - a.setFullname(StringUtils.left(a.getFullname(), MAX_AUTHOR_FULLNAME_LENGTH)); - if (authors.size() < MAX_AUTHORS || hasORCID(a)) { + a.setFullname(StringUtils.left(a.getFullname(), ProvisionConstants.MAX_AUTHOR_FULLNAME_LENGTH)); + if (authors.size() < ProvisionConstants.MAX_AUTHORS || hasORCID(a)) { authors.add(a); } } @@ -266,7 +260,7 @@ public class CreateRelatedEntitiesJob_phase2 { .stream() .filter(Objects::nonNull) .map(d -> { - d.setValue(StringUtils.left(d.getValue(), MAX_ABSTRACT_LENGTH)); + d.setValue(StringUtils.left(d.getValue(), ProvisionConstants.MAX_ABSTRACT_LENGTH)); return d; }) .collect(Collectors.toList()); @@ -278,9 +272,10 @@ public class CreateRelatedEntitiesJob_phase2 { .stream() .filter(Objects::nonNull) .map(t -> { - t.setValue(StringUtils.left(t.getValue(), MAX_TITLE_LENGTH)); + t.setValue(StringUtils.left(t.getValue(), ProvisionConstants.MAX_TITLE_LENGTH)); return t; }) + .limit(ProvisionConstants.MAX_TITLES) .collect(Collectors.toList()); r.setTitle(titles); } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java new file mode 100644 index 0000000000..9bc3706cdd --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java @@ -0,0 +1,14 @@ + +package eu.dnetlib.dhp.oa.provision; + +public class ProvisionConstants { + + public static final int MAX_EXTERNAL_ENTITIES = 50; + public static final int MAX_AUTHORS = 200; + public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000; + public static final int MAX_TITLE_LENGTH = 5000; + public static final int MAX_TITLES = 10; + public static final int MAX_ABSTRACT_LENGTH = 100000; + public static final int MAX_INSTANCES = 10; + +} From cc77446dc4d092503bff388994a005e17e61dcfc Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 10 Jul 2020 19:01:50 +0200 Subject: [PATCH 22/65] added dbSchema parameter to the raw_db workflow --- .../eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/workflow.xml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/workflow.xml index 575f9229ed..eea8d0a5ab 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/workflow.xml @@ -16,6 +16,11 @@ postgresPassword the password postgres + + dbSchema + beta + the database schema according to the D-Net infrastructure (beta or production) + isLookupUrl the address of the lookUp service @@ -93,6 +98,7 @@ --postgresUser${postgresUser} --postgresPassword${postgresPassword} --isLookupUrl${isLookupUrl} + --dbschema${dbSchema} @@ -109,6 +115,7 @@ --postgresUser${postgresUser} --postgresPassword${postgresPassword} --isLookupUrl${isLookupUrl} + --dbschema${dbSchema} --actionclaims From 06c1913062541062b4b5ed49044870acc1d3bd56 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 10 Jul 2020 19:03:33 +0200 Subject: [PATCH 23/65] added different limits for grouping by source and by target, incremented spark.sql.shuffle.partitions for the join operations --- .../dhp/oa/provision/oozie_app/workflow.xml | 40 ++++++++++--------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index faa81ad644..32bf7ce83d 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -18,8 +18,12 @@ filter applied reading relations (by relClass) - maxRelations - maximum number of relations allowed for a each entity + sourceMaxRelations + maximum number of relations allowed for a each entity grouping by source + + + targetMaxRelations + maximum number of relations allowed for a each entity grouping by target otherDsTypeId @@ -133,7 +137,8 @@ --inputRelationsPath${inputGraphRootPath}/relation --outputPath${workingDir}/relation - --maxRelations${maxRelations} + --sourceMaxRelations${sourceMaxRelations} + --targetMaxRelations${targetMaxRelations} --relationFilter${relationFilter} --relPartitions5000 @@ -166,7 +171,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 + --conf spark.sql.shuffle.partitions=15000 --conf spark.network.timeout=${sparkNetworkTimeout} --inputRelationsPath${workingDir}/relation @@ -193,7 +198,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 + --conf spark.sql.shuffle.partitions=15000 --conf spark.network.timeout=${sparkNetworkTimeout} --inputRelationsPath${workingDir}/relation @@ -220,7 +225,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 + --conf spark.sql.shuffle.partitions=10000 --conf spark.network.timeout=${sparkNetworkTimeout} --inputRelationsPath${workingDir}/relation @@ -247,7 +252,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 + --conf spark.sql.shuffle.partitions=5000 --conf spark.network.timeout=${sparkNetworkTimeout} --inputRelationsPath${workingDir}/relation @@ -274,7 +279,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 + --conf spark.sql.shuffle.partitions=5000 --conf spark.network.timeout=${sparkNetworkTimeout} --inputRelationsPath${workingDir}/relation @@ -301,7 +306,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 + --conf spark.sql.shuffle.partitions=5000 --conf spark.network.timeout=${sparkNetworkTimeout} --inputRelationsPath${workingDir}/relation @@ -328,7 +333,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 + --conf spark.sql.shuffle.partitions=5000 --conf spark.network.timeout=${sparkNetworkTimeout} --inputRelationsPath${workingDir}/relation @@ -367,7 +372,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=15360 + --conf spark.sql.shuffle.partitions=15000 --conf spark.network.timeout=${sparkNetworkTimeout} --inputEntityPath${inputGraphRootPath}/publication @@ -395,7 +400,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 + --conf spark.sql.shuffle.partitions=10000 --conf spark.network.timeout=${sparkNetworkTimeout} --inputEntityPath${inputGraphRootPath}/dataset @@ -423,7 +428,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 + --conf spark.sql.shuffle.partitions=10000 --conf spark.network.timeout=${sparkNetworkTimeout} --inputEntityPath${inputGraphRootPath}/otherresearchproduct @@ -451,7 +456,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 + --conf spark.sql.shuffle.partitions=5000 --conf spark.network.timeout=${sparkNetworkTimeout} --inputEntityPath${inputGraphRootPath}/software @@ -479,7 +484,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 + --conf spark.sql.shuffle.partitions=8000 --conf spark.network.timeout=${sparkNetworkTimeout} --inputEntityPath${inputGraphRootPath}/datasource @@ -507,7 +512,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 + --conf spark.sql.shuffle.partitions=10000 --conf spark.network.timeout=${sparkNetworkTimeout} --inputEntityPath${inputGraphRootPath}/organization @@ -535,7 +540,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 + --conf spark.sql.shuffle.partitions=5000 --conf spark.network.timeout=${sparkNetworkTimeout} --inputEntityPath${inputGraphRootPath}/project @@ -607,5 +612,4 @@ - \ No newline at end of file From 770adc26e9d509837710de1d15bdeab88ed675a0 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 10 Jul 2020 19:35:10 +0200 Subject: [PATCH 24/65] WIP aggregator to make relationships unique --- .../dnetlib/dhp/oa/dedup/SparkPropagateRelation.java | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java index e65eb7ab57..13d2e4cd77 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java @@ -96,14 +96,14 @@ public class SparkPropagateRelation extends AbstractSparkAction { FieldType.TARGET, getDeletedFn()); - save(distinctRelations(newRels.union(updated).union(mergeRels)), outputRelationPath, SaveMode.Overwrite); + save(newRels.union(updated).union(mergeRels).distinct(), outputRelationPath, SaveMode.Overwrite); } private Dataset distinctRelations(Dataset rels) { return rels - .groupByKey((MapFunction) r -> ModelSupport.idFn().apply(r), Encoders.STRING()) - .agg(new RelationAggregator().toColumn()) - .map((MapFunction, Relation>) t -> t._2(), Encoders.bean(Relation.class)); + .groupByKey((MapFunction) r -> ModelSupport.idFn().apply(r), Encoders.STRING()) + .agg(new RelationAggregator().toColumn()) + .map((MapFunction, Relation>) t -> t._2(), Encoders.bean(Relation.class)); } class RelationAggregator extends Aggregator { @@ -115,13 +115,11 @@ public class SparkPropagateRelation extends AbstractSparkAction { @Override public Relation reduce(Relation b, Relation a) { - b.mergeFrom(a); return b; } @Override public Relation merge(Relation b, Relation a) { - b.mergeFrom(a); return b; } From b7a39731a6385457e4defd7eeac08edf208266a8 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Sun, 12 Jul 2020 19:28:56 +0200 Subject: [PATCH 25/65] assert, not print --- .../src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java index 513e14f073..0b718ecdc3 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java @@ -52,7 +52,7 @@ public class EntityMergerTest implements Serializable { Software merged = DedupRecordFactory .entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class); - System.out.println(merged.getBestaccessright().getClassid()); + assertEquals(merged.getBestaccessright().getClassid(), "OPEN SOURCE"); } @Test From 7a3fd9f54cc7cf803e799fc430b0eb898e33f3c6 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 13 Jul 2020 10:11:36 +0200 Subject: [PATCH 26/65] dedup relation aggregator moved into dedicated class --- .../dhp/oa/dedup/RelationAggregator.java | 46 +++++++++++++++++++ .../dhp/oa/dedup/SparkPropagateRelation.java | 36 +-------------- 2 files changed, 47 insertions(+), 35 deletions(-) create mode 100644 dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/RelationAggregator.java diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/RelationAggregator.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/RelationAggregator.java new file mode 100644 index 0000000000..0a29aa51b2 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/RelationAggregator.java @@ -0,0 +1,46 @@ + +package eu.dnetlib.dhp.oa.dedup; + +import java.util.Objects; + +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.expressions.Aggregator; + +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class RelationAggregator extends Aggregator { + + private static Relation ZERO = new Relation(); + + @Override + public Relation zero() { + return ZERO; + } + + @Override + public Relation reduce(Relation b, Relation a) { + return Objects.equals(a, ZERO) ? b : a; + } + + @Override + public Relation merge(Relation b, Relation a) { + b.mergeFrom(a); + return b; + } + + @Override + public Relation finish(Relation r) { + return r; + } + + @Override + public Encoder bufferEncoder() { + return Encoders.bean(Relation.class); + } + + @Override + public Encoder outputEncoder() { + return Encoders.bean(Relation.class); + } +} diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java index 13d2e4cd77..c197697494 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java @@ -7,7 +7,6 @@ import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.*; -import org.apache.spark.sql.expressions.Aggregator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -96,7 +95,7 @@ public class SparkPropagateRelation extends AbstractSparkAction { FieldType.TARGET, getDeletedFn()); - save(newRels.union(updated).union(mergeRels).distinct(), outputRelationPath, SaveMode.Overwrite); + save(distinctRelations(newRels.union(updated).union(mergeRels)), outputRelationPath, SaveMode.Overwrite); } private Dataset distinctRelations(Dataset rels) { @@ -106,39 +105,6 @@ public class SparkPropagateRelation extends AbstractSparkAction { .map((MapFunction, Relation>) t -> t._2(), Encoders.bean(Relation.class)); } - class RelationAggregator extends Aggregator { - - @Override - public Relation zero() { - return new Relation(); - } - - @Override - public Relation reduce(Relation b, Relation a) { - return b; - } - - @Override - public Relation merge(Relation b, Relation a) { - return b; - } - - @Override - public Relation finish(Relation r) { - return r; - } - - @Override - public Encoder bufferEncoder() { - return Encoders.bean(Relation.class); - } - - @Override - public Encoder outputEncoder() { - return Encoders.bean(Relation.class); - } - } - private static Dataset processDataset( Dataset rels, Dataset> mergedIds, From ccbe5c565860f787d200226e4d6d44ef73fda0e5 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Mon, 13 Jul 2020 10:55:27 +0200 Subject: [PATCH 27/65] fixed import of eu.dnetlib.dhp:dnet-openaire-broker-common --- dhp-workflows/dhp-broker-events/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-broker-events/pom.xml b/dhp-workflows/dhp-broker-events/pom.xml index 1a219c5c91..6787226a87 100644 --- a/dhp-workflows/dhp-broker-events/pom.xml +++ b/dhp-workflows/dhp-broker-events/pom.xml @@ -59,7 +59,7 @@ eu.dnetlib.dhp dnet-openaire-broker-common - [3.0.0-SNAPSHOT,) + [3.0.0,) From d561b2dd210eece82b30311d19d7746adad8a497 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 13 Jul 2020 14:18:42 +0200 Subject: [PATCH 28/65] implemented cut of connected component --- .../dhp/oa/dedup/SparkCreateMergeRels.java | 19 ++++++++++++++++++- .../oa/dedup/graph/ConnectedComponent.java | 9 ++++++++- .../dhp/oa/dedup/graph/GraphProcessor.scala | 8 ++++---- .../dhp/oa/dedup/createCC_parameters.json | 7 +++++++ 4 files changed, 37 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java index 19e60b5206..0c31f5fa25 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java @@ -76,6 +76,19 @@ public class SparkCreateMergeRels extends AbstractSparkAction { final String workingPath = parser.get("workingPath"); final String isLookUpUrl = parser.get("isLookUpUrl"); final String actionSetId = parser.get("actionSetId"); + int cut = 0; + try { + cut = Integer.parseInt(parser.get("cutConnectedComponent")); + + + } catch (Throwable e) { + log.error("unable to parse "+parser.get(" cut-off threshold")); + } + + + + + log.info("graphBasePath: '{}'", graphBasePath); log.info("isLookUpUrl: '{}'", isLookUpUrl); @@ -112,7 +125,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction { final Dataset mergeRels = spark .createDataset( GraphProcessor - .findCCs(vertexes.rdd(), edgeRdd, maxIterations) + .findCCs(vertexes.rdd(), edgeRdd, maxIterations, cut) .toJavaRDD() .filter(k -> k.getDocIds().size() > 1) .flatMap(cc -> ccToMergeRel(cc, dedupConf)) @@ -120,6 +133,10 @@ public class SparkCreateMergeRels extends AbstractSparkAction { Encoders.bean(Relation.class)); mergeRels.write().mode(SaveMode.Append).parquet(mergeRelPath); + + + + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java index bfd2c25e21..fd6e70916c 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java @@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.dedup.graph; import java.io.IOException; import java.io.Serializable; import java.util.Set; +import java.util.stream.Collectors; import org.apache.commons.lang.StringUtils; import org.codehaus.jackson.annotate.JsonIgnore; @@ -21,9 +22,14 @@ public class ConnectedComponent implements Serializable { public ConnectedComponent() { } - public ConnectedComponent(Set docIds) { + + public ConnectedComponent(Set docIds, final int cut) { this.docIds = docIds; createID(); + if (cut > 0 && docIds.size() > cut){ + docIds = docIds.stream().filter(s -> !ccId.equalsIgnoreCase(s)).limit(cut -1).collect(Collectors.toSet()); + docIds.add(ccId); + } } public String createID() { @@ -41,6 +47,7 @@ public class ConnectedComponent implements Serializable { public String getMin() { final StringBuilder min = new StringBuilder(); + docIds .forEach( i -> { diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/GraphProcessor.scala b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/GraphProcessor.scala index e19bb7ff58..f4dd85d758 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/GraphProcessor.scala +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/GraphProcessor.scala @@ -7,7 +7,7 @@ import scala.collection.JavaConversions; object GraphProcessor { - def findCCs(vertexes: RDD[(VertexId, String)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = { + def findCCs(vertexes: RDD[(VertexId, String)], edges: RDD[Edge[String]], maxIterations: Int, cut:Int): RDD[ConnectedComponent] = { val graph: Graph[String, String] = Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby val cc = graph.connectedComponents(maxIterations).vertices @@ -22,15 +22,15 @@ object GraphProcessor { } } val connectedComponents = joinResult.groupByKey() - .map[ConnectedComponent](cc => asConnectedComponent(cc)) + .map[ConnectedComponent](cc => asConnectedComponent(cc, cut)) connectedComponents } - def asConnectedComponent(group: (VertexId, Iterable[String])): ConnectedComponent = { + def asConnectedComponent(group: (VertexId, Iterable[String]), cut:Int): ConnectedComponent = { val docs = group._2.toSet[String] - val connectedComponent = new ConnectedComponent(JavaConversions.setAsJavaSet[String](docs)); + val connectedComponent = new ConnectedComponent(JavaConversions.setAsJavaSet[String](docs), cut); connectedComponent } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json index 6eedd54321..9350cf22b6 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json @@ -17,6 +17,13 @@ "paramDescription": "the url for the lookup service", "paramRequired": true }, + { + "paramName": "cc", + "paramLongName": "cutConnectedComponent", + "paramDescription": "the number of maximum elements that belongs to a connected components", + "paramRequired": false + } +, { "paramName": "w", "paramLongName": "workingPath", From 9ef23850228ab7ece1d9e71c83fde76dc933c9cc Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 13 Jul 2020 15:28:17 +0200 Subject: [PATCH 29/65] implemented test for cut of connected component --- .../dhp/oa/dedup/SparkCreateMergeRels.java | 11 +- .../oa/dedup/graph/ConnectedComponent.java | 14 +-- .../dhp/oa/dedup/createCC_parameters.json | 3 +- .../dnetlib/dhp/oa/dedup/SparkDedupTest.java | 102 +++++++++++++++++- 4 files changed, 107 insertions(+), 23 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java index 0c31f5fa25..0b44935d06 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java @@ -80,16 +80,10 @@ public class SparkCreateMergeRels extends AbstractSparkAction { try { cut = Integer.parseInt(parser.get("cutConnectedComponent")); - } catch (Throwable e) { - log.error("unable to parse "+parser.get(" cut-off threshold")); + log.error("unable to parse " + parser.get(" cut-off threshold")); } - - - - - log.info("graphBasePath: '{}'", graphBasePath); log.info("isLookUpUrl: '{}'", isLookUpUrl); log.info("actionSetId: '{}'", actionSetId); @@ -134,9 +128,6 @@ public class SparkCreateMergeRels extends AbstractSparkAction { mergeRels.write().mode(SaveMode.Append).parquet(mergeRelPath); - - - } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java index fd6e70916c..cd4f99f634 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java @@ -19,16 +19,16 @@ public class ConnectedComponent implements Serializable { private Set docIds; private String ccId; - public ConnectedComponent() { - } - - public ConnectedComponent(Set docIds, final int cut) { this.docIds = docIds; createID(); - if (cut > 0 && docIds.size() > cut){ - docIds = docIds.stream().filter(s -> !ccId.equalsIgnoreCase(s)).limit(cut -1).collect(Collectors.toSet()); - docIds.add(ccId); + if (cut > 0 && docIds.size() > cut) { + this.docIds = docIds + .stream() + .filter(s -> !ccId.equalsIgnoreCase(s)) + .limit(cut - 1) + .collect(Collectors.toSet()); + this.docIds.add(ccId); } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json index 9350cf22b6..b1df08535e 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json @@ -22,8 +22,7 @@ "paramLongName": "cutConnectedComponent", "paramDescription": "the number of maximum elements that belongs to a connected components", "paramRequired": false - } -, + }, { "paramName": "w", "paramLongName": "workingPath", diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java index 88d5f24f90..66f0af1767 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java @@ -3,6 +3,8 @@ package eu.dnetlib.dhp.oa.dedup; import static java.nio.file.Files.createTempDirectory; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.count; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.mockito.Mockito.lenient; @@ -11,6 +13,9 @@ import java.io.IOException; import java.io.Serializable; import java.net.URISyntaxException; import java.nio.file.Paths; +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Collectors; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; @@ -18,6 +23,7 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.Dataset; @@ -190,6 +196,94 @@ public class SparkDedupTest implements Serializable { @Test @Order(2) + public void cutMergeRelsTest() throws Exception { + + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateMergeRels.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"))); + parser + .parseArgument( + new String[] { + "-i", + testGraphBasePath, + "-asi", + testActionSetId, + "-la", + "lookupurl", + "-w", + testOutputBasePath, + "-cc", + "3" + }); + + new SparkCreateMergeRels(parser, spark).run(isLookUpService); + + long orgs_mergerel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel") + .as(Encoders.bean(Relation.class)) + .filter((FilterFunction) r -> r.getRelClass().equalsIgnoreCase("merges")) + .groupBy("source") + .agg(count("target").alias("cnt")) + .select("source", "cnt") + .where("cnt > 3") + .count(); + + long pubs_mergerel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel") + .as(Encoders.bean(Relation.class)) + .filter((FilterFunction) r -> r.getRelClass().equalsIgnoreCase("merges")) + .groupBy("source") + .agg(count("target").alias("cnt")) + .select("source", "cnt") + .where("cnt > 3") + .count(); + long sw_mergerel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel") + .as(Encoders.bean(Relation.class)) + .filter((FilterFunction) r -> r.getRelClass().equalsIgnoreCase("merges")) + .groupBy("source") + .agg(count("target").alias("cnt")) + .select("source", "cnt") + .where("cnt > 3") + .count(); + + long ds_mergerel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel") + .as(Encoders.bean(Relation.class)) + .filter((FilterFunction) r -> r.getRelClass().equalsIgnoreCase("merges")) + .groupBy("source") + .agg(count("target").alias("cnt")) + .select("source", "cnt") + .where("cnt > 3") + .count(); + + long orp_mergerel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel") + .as(Encoders.bean(Relation.class)) + .filter((FilterFunction) r -> r.getRelClass().equalsIgnoreCase("merges")) + .groupBy("source") + .agg(count("target").alias("cnt")) + .select("source", "cnt") + .where("cnt > 3") + .count(); + + assertEquals(0, orgs_mergerel); + assertEquals(0, pubs_mergerel); + assertEquals(0, sw_mergerel); + assertEquals(0, ds_mergerel); + assertEquals(0, orp_mergerel); + } + + @Test + @Order(3) public void createMergeRelsTest() throws Exception { ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -241,7 +335,7 @@ public class SparkDedupTest implements Serializable { } @Test - @Order(3) + @Order(4) public void createDedupRecordTest() throws Exception { ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -288,7 +382,7 @@ public class SparkDedupTest implements Serializable { } @Test - @Order(4) + @Order(5) public void updateEntityTest() throws Exception { ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -404,7 +498,7 @@ public class SparkDedupTest implements Serializable { } @Test - @Order(5) + @Order(6) public void propagateRelationTest() throws Exception { ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -454,7 +548,7 @@ public class SparkDedupTest implements Serializable { } @Test - @Order(6) + @Order(7) public void testRelations() throws Exception { testUniqueness("/eu/dnetlib/dhp/dedup/test/relation_1.json", 12, 10); testUniqueness("/eu/dnetlib/dhp/dedup/test/relation_2.json", 10, 2); From 8a612d861a89bb7dbe1da3992b9cc6730a4d5125 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 13 Jul 2020 15:30:57 +0200 Subject: [PATCH 30/65] WIP SparkCreateMergeRels distinct relations --- .../dhp/oa/dedup/SparkCreateMergeRels.java | 11 +------- .../dhp/oa/dedup/SparkCreateSimRels.java | 21 ++++---------- .../dhp/oa/dedup/SparkPropagateRelation.java | 18 +++++++++++- .../oa/dedup/graph/ConnectedComponent.java | 5 ++-- .../dnetlib/dhp/oa/dedup/SparkDedupTest.java | 28 +++++++++++++------ .../dnetlib/dhp/dedup/conf/ds.curr.conf.json | 7 +++-- .../dnetlib/dhp/dedup/conf/orp.curr.conf.json | 7 +++-- .../dnetlib/dhp/dedup/conf/pub.curr.conf.json | 7 +++-- .../dnetlib/dhp/dedup/conf/sw.curr.conf.json | 9 +++--- 9 files changed, 61 insertions(+), 52 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java index 0c31f5fa25..0b44935d06 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java @@ -80,16 +80,10 @@ public class SparkCreateMergeRels extends AbstractSparkAction { try { cut = Integer.parseInt(parser.get("cutConnectedComponent")); - } catch (Throwable e) { - log.error("unable to parse "+parser.get(" cut-off threshold")); + log.error("unable to parse " + parser.get(" cut-off threshold")); } - - - - - log.info("graphBasePath: '{}'", graphBasePath); log.info("isLookUpUrl: '{}'", isLookUpUrl); log.info("actionSetId: '{}'", actionSetId); @@ -134,9 +128,6 @@ public class SparkCreateMergeRels extends AbstractSparkAction { mergeRels.write().mode(SaveMode.Append).parquet(mergeRelPath); - - - } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java index 1be2b9e31e..2e96b35632 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java @@ -34,6 +34,8 @@ public class SparkCreateSimRels extends AbstractSparkAction { private static final Logger log = LoggerFactory.getLogger(SparkCreateSimRels.class); + public static final int NUM_PARTITIONS = 10000; + public SparkCreateSimRels(ArgumentApplicationParser parser, SparkSession spark) { super(parser, spark); } @@ -48,13 +50,6 @@ public class SparkCreateSimRels extends AbstractSparkAction { parser.parseArgument(args); SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf - .registerKryoClasses( - new Class[] { - MapDocument.class, FieldListImpl.class, FieldValueImpl.class, Block.class - }); - new SparkCreateSimRels(parser, getSparkSession(conf)) .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); } @@ -88,7 +83,7 @@ public class SparkCreateSimRels extends AbstractSparkAction { JavaPairRDD mapDocuments = sc .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) - .repartition(10000) + .repartition(NUM_PARTITIONS) .mapToPair( (PairFunction) s -> { MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); @@ -98,21 +93,15 @@ public class SparkCreateSimRels extends AbstractSparkAction { // create blocks for deduplication JavaPairRDD blocks = Deduper .createSortedBlocks(mapDocuments, dedupConf) - .repartition(10000); + .repartition(NUM_PARTITIONS); // create relations by comparing only elements in the same group Deduper .computeRelations(sc, blocks, dedupConf) .map(t -> createSimRel(t._1(), t._2(), entity)) - .repartition(10000) + .repartition(NUM_PARTITIONS) .map(r -> OBJECT_MAPPER.writeValueAsString(r)) .saveAsTextFile(outputPath); - - // save the simrel in the workingdir - /* - * spark .createDataset(relations.rdd(), Encoders.bean(Relation.class)) .write() .mode(SaveMode.Append) - * .save(outputPath); - */ } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java index c197697494..88fe5b26d1 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java @@ -4,7 +4,9 @@ package eu.dnetlib.dhp.oa.dedup; import static org.apache.spark.sql.functions.col; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.*; import org.slf4j.Logger; @@ -95,11 +97,17 @@ public class SparkPropagateRelation extends AbstractSparkAction { FieldType.TARGET, getDeletedFn()); - save(distinctRelations(newRels.union(updated).union(mergeRels)), outputRelationPath, SaveMode.Overwrite); + save( + newRels + .union(updated) + .union(mergeRels) + .map((MapFunction) r -> r, Encoders.kryo(Relation.class)), + outputRelationPath, SaveMode.Overwrite); } private Dataset distinctRelations(Dataset rels) { return rels + .filter(getRelationFilterFunction()) .groupByKey((MapFunction) r -> ModelSupport.idFn().apply(r), Encoders.STRING()) .agg(new RelationAggregator().toColumn()) .map((MapFunction, Relation>) t -> t._2(), Encoders.bean(Relation.class)); @@ -119,6 +127,14 @@ public class SparkPropagateRelation extends AbstractSparkAction { .map(mapFn, Encoders.bean(Relation.class)); } + private FilterFunction getRelationFilterFunction() { + return (FilterFunction) r -> StringUtils.isNotBlank(r.getSource()) || + StringUtils.isNotBlank(r.getTarget()) || + StringUtils.isNotBlank(r.getRelClass()) || + StringUtils.isNotBlank(r.getSubRelType()) || + StringUtils.isNotBlank(r.getRelClass()); + } + private static MapFunction patchRelFn() { return value -> { final Relation rel = OBJECT_MAPPER.readValue(value, Relation.class); diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java index fd6e70916c..c5f893668f 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java @@ -22,12 +22,11 @@ public class ConnectedComponent implements Serializable { public ConnectedComponent() { } - public ConnectedComponent(Set docIds, final int cut) { this.docIds = docIds; createID(); - if (cut > 0 && docIds.size() > cut){ - docIds = docIds.stream().filter(s -> !ccId.equalsIgnoreCase(s)).limit(cut -1).collect(Collectors.toSet()); + if (cut > 0 && docIds.size() > cut) { + docIds = docIds.stream().filter(s -> !ccId.equalsIgnoreCase(s)).limit(cut - 1).collect(Collectors.toSet()); docIds.add(ccId); } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java index 88d5f24f90..8e1ef8f9c9 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java @@ -166,23 +166,31 @@ public class SparkDedupTest implements Serializable { long orgs_simrel = spark .read() - .load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel") + .textFile(testOutputBasePath + "/" + testActionSetId + "/organization_simrel") .count(); + long pubs_simrel = spark .read() - .load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel") + .textFile(testOutputBasePath + "/" + testActionSetId + "/publication_simrel") .count(); - long sw_simrel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/software_simrel").count(); - long ds_simrel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel").count(); + long sw_simrel = spark + .read() + .textFile(testOutputBasePath + "/" + testActionSetId + "/software_simrel") + .count(); + + long ds_simrel = spark + .read() + .textFile(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel") + .count(); long orp_simrel = spark .read() - .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel") + .textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel") .count(); assertEquals(3432, orgs_simrel); - assertEquals(7054, pubs_simrel); + assertEquals(7152, pubs_simrel); assertEquals(344, sw_simrel); assertEquals(458, ds_simrel); assertEquals(6750, orp_simrel); @@ -225,8 +233,10 @@ public class SparkDedupTest implements Serializable { .read() .load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel") .count(); - - long ds_mergerel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel").count(); + long ds_mergerel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel") + .count(); long orp_mergerel = spark .read() @@ -234,7 +244,7 @@ public class SparkDedupTest implements Serializable { .count(); assertEquals(1276, orgs_mergerel); - assertEquals(1440, pubs_mergerel); + assertEquals(1442, pubs_mergerel); assertEquals(288, sw_mergerel); assertEquals(472, ds_mergerel); assertEquals(718, orp_mergerel); diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json index 2469b2cc03..fa889d63b7 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json @@ -6,10 +6,10 @@ "subEntityType" : "resulttype", "subEntityValue" : "dataset", "orderField" : "title", - "queueMaxSize" : "800", + "queueMaxSize" : "100", "groupMaxSize" : "100", "maxChildren" : "100", - "slidingWindowSize" : "80", + "slidingWindowSize" : "100", "rootBuilder" : ["result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ], "includeChildren" : "true", "idPath" : "$.id", @@ -17,7 +17,8 @@ }, "pace" : { "clustering" : [ - { "name" : "wordssuffixprefix", "fields" : [ "title" ], "params" : { "max" : "2", "len" : "3" } }, + { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } ], "decisionTree" : { diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json index 4adcc0439f..b45b6ae832 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json @@ -6,10 +6,10 @@ "subEntityType" : "resulttype", "subEntityValue" : "otherresearchproduct", "orderField" : "title", - "queueMaxSize" : "800", + "queueMaxSize" : "100", "groupMaxSize" : "100", "maxChildren" : "100", - "slidingWindowSize" : "80", + "slidingWindowSize" : "100", "rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ], "includeChildren" : "true", "idPath" : "$.id", @@ -17,7 +17,8 @@ }, "pace" : { "clustering" : [ - { "name" : "wordssuffixprefix", "fields" : [ "title" ], "params" : { "max" : "2", "len" : "3" } }, + { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } ], "decisionTree" : { diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json index ef0b26af41..15ebc7a6a0 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json @@ -6,10 +6,10 @@ "subEntityType": "resulttype", "subEntityValue": "publication", "orderField": "title", - "queueMaxSize": "800", + "queueMaxSize": "100", "groupMaxSize": "100", "maxChildren": "100", - "slidingWindowSize": "80", + "slidingWindowSize": "100", "rootBuilder": [ "result", "resultProject_outcome_isProducedBy", @@ -29,7 +29,8 @@ }, "pace": { "clustering" : [ - { "name" : "wordssuffixprefix", "fields" : [ "title" ], "params" : { "max" : "2", "len" : "3" } }, + { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } ], "decisionTree": { diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json index 623abbf9f4..f53ff385f9 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json @@ -6,10 +6,10 @@ "subEntityType" : "resulttype", "subEntityValue" : "software", "orderField" : "title", - "queueMaxSize" : "800", + "queueMaxSize" : "100", "groupMaxSize" : "100", "maxChildren" : "100", - "slidingWindowSize" : "80", + "slidingWindowSize" : "100", "rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ], "includeChildren" : "true", "idPath" : "$.id", @@ -17,8 +17,9 @@ }, "pace" : { "clustering" : [ - { "name" : "wordssuffixprefix", "fields" : [ "title" ], "params" : { "max" : "2", "len" : "3" } }, - { "name" : "lowercase", "fields" : [ "doi", "url" ], "params" : { } } + { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, + { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } ], "decisionTree": { "start": { From 7dd91edf4335e7f23ac87661b86008e835134aa6 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 13 Jul 2020 15:40:41 +0200 Subject: [PATCH 31/65] parsing of optional parameter --- .../dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java index 0b44935d06..70457fb4a0 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java @@ -5,6 +5,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; @@ -76,14 +77,11 @@ public class SparkCreateMergeRels extends AbstractSparkAction { final String workingPath = parser.get("workingPath"); final String isLookUpUrl = parser.get("isLookUpUrl"); final String actionSetId = parser.get("actionSetId"); - int cut = 0; - try { - cut = Integer.parseInt(parser.get("cutConnectedComponent")); - - } catch (Throwable e) { - log.error("unable to parse " + parser.get(" cut-off threshold")); - } - + int cut = Optional + .ofNullable(parser.get("cutConnectedComponent")) + .map(Integer::valueOf) + .orElse(0); + log.info("connected component cut: '{}'", cut); log.info("graphBasePath: '{}'", graphBasePath); log.info("isLookUpUrl: '{}'", isLookUpUrl); log.info("actionSetId: '{}'", actionSetId); From 3635d05061b45a1daa944193814f64f81947ad51 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Mon, 13 Jul 2020 15:52:23 +0200 Subject: [PATCH 32/65] poms --- dhp-workflows/dhp-broker-events/pom.xml | 1 - pom.xml | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-broker-events/pom.xml b/dhp-workflows/dhp-broker-events/pom.xml index 6787226a87..f98708c648 100644 --- a/dhp-workflows/dhp-broker-events/pom.xml +++ b/dhp-workflows/dhp-broker-events/pom.xml @@ -59,7 +59,6 @@ eu.dnetlib.dhp dnet-openaire-broker-common - [3.0.0,) diff --git a/pom.xml b/pom.xml index 4619f31749..411ef9521c 100644 --- a/pom.xml +++ b/pom.xml @@ -323,6 +323,12 @@ [2.0.0,3.0.0) + + eu.dnetlib.dhp + dnet-openaire-broker-common + ${dnet.openaire.broker.common} + + org.apache.cxf cxf-rt-transports-http @@ -618,5 +624,6 @@ 3.3.3 3.4.2 [2.12,3.0) + 3.0.0 From 1d133b7fe6590dbd706ef77e40cfd657debbc00c Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 13 Jul 2020 15:52:41 +0200 Subject: [PATCH 33/65] update test --- .../test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java index e106551260..294b19ecd9 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java @@ -288,6 +288,12 @@ public class SparkDedupTest implements Serializable { assertEquals(0, sw_mergerel); assertEquals(0, ds_mergerel); assertEquals(0, orp_mergerel); + + FileUtils.deleteDirectory(new File(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel")); + FileUtils.deleteDirectory(new File(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel")); + FileUtils.deleteDirectory(new File(testOutputBasePath + "/" + testActionSetId + "/software_mergerel")); + FileUtils.deleteDirectory(new File(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel")); + FileUtils.deleteDirectory(new File(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")); } @Test From c8284bab06c13f47d135cbd7d8a8fe009f090b6b Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 13 Jul 2020 15:54:51 +0200 Subject: [PATCH 34/65] WIP SparkCreateMergeRels distinct relations --- .../main/java/eu/dnetlib/dhp/oa/dedup/RelationAggregator.java | 4 ++-- .../java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java | 4 ++-- .../src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java | 4 +++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/RelationAggregator.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/RelationAggregator.java index 0a29aa51b2..7935fe1ca2 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/RelationAggregator.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/RelationAggregator.java @@ -36,11 +36,11 @@ public class RelationAggregator extends Aggregator @Override public Encoder bufferEncoder() { - return Encoders.bean(Relation.class); + return Encoders.kryo(Relation.class); } @Override public Encoder outputEncoder() { - return Encoders.bean(Relation.class); + return Encoders.kryo(Relation.class); } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java index 88fe5b26d1..baba3bc875 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java @@ -98,10 +98,10 @@ public class SparkPropagateRelation extends AbstractSparkAction { getDeletedFn()); save( - newRels + distinctRelations(newRels .union(updated) .union(mergeRels) - .map((MapFunction) r -> r, Encoders.kryo(Relation.class)), + .map((MapFunction) r -> r, Encoders.kryo(Relation.class))), outputRelationPath, SaveMode.Overwrite); } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java index e106551260..82c2d82b72 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java @@ -77,11 +77,13 @@ public class SparkDedupTest implements Serializable { FileUtils.deleteDirectory(new File(testOutputBasePath)); FileUtils.deleteDirectory(new File(testDedupGraphBasePath)); + final SparkConf conf = new SparkConf(); + conf.set("spark.sql.shuffle.partitions", "200"); spark = SparkSession .builder() .appName(SparkDedupTest.class.getSimpleName()) .master("local[*]") - .config(new SparkConf()) + .config(conf) .getOrCreate(); jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); From 8c67938ad0a9d7713530d2a1ede0cf59f161fbe5 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 13 Jul 2020 16:07:07 +0200 Subject: [PATCH 35/65] configurable number of partitions used in the SparkCreateSimRels phase --- .../dhp/oa/dedup/SparkCreateSimRels.java | 14 +++++--- .../oa/dedup/createSimRels_parameters.json | 6 ++++ .../dhp/oa/dedup/scan/oozie_app/workflow.xml | 36 +++++++++++-------- .../dhp/oa/dedup/updateEntity_parameters.json | 26 +++++++------- 4 files changed, 50 insertions(+), 32 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java index 2e96b35632..a66ab431c7 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.oa.dedup; import java.io.IOException; +import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; @@ -34,7 +35,7 @@ public class SparkCreateSimRels extends AbstractSparkAction { private static final Logger log = LoggerFactory.getLogger(SparkCreateSimRels.class); - public static final int NUM_PARTITIONS = 10000; + public static final int NUM_PARTITIONS = 1000; public SparkCreateSimRels(ArgumentApplicationParser parser, SparkSession spark) { super(parser, spark); @@ -63,7 +64,12 @@ public class SparkCreateSimRels extends AbstractSparkAction { final String isLookUpUrl = parser.get("isLookUpUrl"); final String actionSetId = parser.get("actionSetId"); final String workingPath = parser.get("workingPath"); + final int numPartitions = Optional + .ofNullable(parser.get("numPartitions")) + .map(Integer::valueOf) + .orElse(NUM_PARTITIONS); + log.info("numPartitions: '{}'", numPartitions); log.info("graphBasePath: '{}'", graphBasePath); log.info("isLookUpUrl: '{}'", isLookUpUrl); log.info("actionSetId: '{}'", actionSetId); @@ -83,7 +89,7 @@ public class SparkCreateSimRels extends AbstractSparkAction { JavaPairRDD mapDocuments = sc .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) - .repartition(NUM_PARTITIONS) + .repartition(numPartitions) .mapToPair( (PairFunction) s -> { MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); @@ -93,13 +99,13 @@ public class SparkCreateSimRels extends AbstractSparkAction { // create blocks for deduplication JavaPairRDD blocks = Deduper .createSortedBlocks(mapDocuments, dedupConf) - .repartition(NUM_PARTITIONS); + .repartition(numPartitions); // create relations by comparing only elements in the same group Deduper .computeRelations(sc, blocks, dedupConf) .map(t -> createSimRel(t._1(), t._2(), entity)) - .repartition(NUM_PARTITIONS) + .repartition(numPartitions) .map(r -> OBJECT_MAPPER.writeValueAsString(r)) .saveAsTextFile(outputPath); } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json index ce38dc6f00..09f4365d34 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json @@ -22,5 +22,11 @@ "paramLongName": "workingPath", "paramDescription": "path of the working directory", "paramRequired": true + }, + { + "paramName": "np", + "paramLongName": "numPartitions", + "paramDescription": "number of partitions for the similarity relations intermediate phases", + "paramRequired": false } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml index 298a248e3b..c42ce12639 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml @@ -20,6 +20,10 @@ dedupGraphPath path for the output graph + + cutConnectedComponent + max number of elements in a connected component + sparkDriverMemory memory for driver process @@ -106,10 +110,11 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 - --i${graphBasePath} - --la${isLookUpUrl} - --asi${actionSetId} - --w${workingPath} + --graphBasePath${graphBasePath} + --isLookUpUrl${isLookUpUrl} + --actionSetId${actionSetId} + --workingPath${workingPath} + --numPartitions8000 @@ -132,10 +137,11 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 - --i${graphBasePath} - --w${workingPath} - --la${isLookUpUrl} - --asi${actionSetId} + --graphBasePath${graphBasePath} + --workingPath${workingPath} + --isLookUpUrl${isLookUpUrl} + --actionSetId${actionSetId} + --cutConnectedComponent${cutConnectedComponent} @@ -158,10 +164,10 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 - --i${graphBasePath} - --w${workingPath} - --la${isLookUpUrl} - --asi${actionSetId} + --graphBasePath${graphBasePath} + --workingPath${workingPath} + --isLookUpUrl${isLookUpUrl} + --actionSetId${actionSetId} @@ -184,9 +190,9 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 - --i${graphBasePath} - --w${workingPath} - --o${dedupGraphPath} + --graphBasePath${graphBasePath} + --workingPath${workingPath} + --dedupGraphPath${dedupGraphPath} diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json index c91f3c04bc..6a2a48746b 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json @@ -1,17 +1,17 @@ [ -{ - "paramName": "i", - "paramLongName": "graphBasePath", - "paramDescription": "the base path of raw graph", - "paramRequired": true -}, -{ - "paramName": "w", - "paramLongName": "workingPath", - "paramDescription": "the working directory path", - "paramRequired": true -}, -{ + { + "paramName": "i", + "paramLongName": "graphBasePath", + "paramDescription": "the base path of raw graph", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "the working directory path", + "paramRequired": true + }, + { "paramName": "o", "paramLongName": "dedupGraphPath", "paramDescription": "the path of the dedup graph", From 1143f426aaf2eadfa8b55dcb62fabf8b52bcb503 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 13 Jul 2020 16:13:36 +0200 Subject: [PATCH 36/65] WIP SparkCreateMergeRels distinct relations --- .../dhp/oa/dedup/RelationAggregator.java | 17 ++++++++++++++--- .../dhp/oa/dedup/SparkPropagateRelation.java | 3 ++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/RelationAggregator.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/RelationAggregator.java index 7935fe1ca2..6fb7b844bb 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/RelationAggregator.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/RelationAggregator.java @@ -20,13 +20,12 @@ public class RelationAggregator extends Aggregator @Override public Relation reduce(Relation b, Relation a) { - return Objects.equals(a, ZERO) ? b : a; + return mergeRel(b, a); } @Override public Relation merge(Relation b, Relation a) { - b.mergeFrom(a); - return b; + return mergeRel(b, a); } @Override @@ -34,6 +33,18 @@ public class RelationAggregator extends Aggregator return r; } + private Relation mergeRel(Relation b, Relation a) { + if (Objects.equals(b, ZERO)) { + return a; + } + if (Objects.equals(a, ZERO)) { + return b; + } + + b.mergeFrom(a); + return b; + } + @Override public Encoder bufferEncoder() { return Encoders.kryo(Relation.class); diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java index baba3bc875..1073adbead 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.oa.dedup; import static org.apache.spark.sql.functions.col; +import com.google.common.base.Joiner; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; @@ -108,7 +109,7 @@ public class SparkPropagateRelation extends AbstractSparkAction { private Dataset distinctRelations(Dataset rels) { return rels .filter(getRelationFilterFunction()) - .groupByKey((MapFunction) r -> ModelSupport.idFn().apply(r), Encoders.STRING()) + .groupByKey((MapFunction) r -> String.join(r.getSource(), r.getTarget(), r.getRelType(), r.getSubRelType(), r.getRelClass()), Encoders.STRING()) .agg(new RelationAggregator().toColumn()) .map((MapFunction, Relation>) t -> t._2(), Encoders.bean(Relation.class)); } From 344a90c2e6ab1aecf2f48bd5e949bfb8bbb467ae Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 13 Jul 2020 16:32:04 +0200 Subject: [PATCH 37/65] updated assertions in propagateRelationTest --- .../eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java index 8ed05c18d9..91e45447ca 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java @@ -160,14 +160,11 @@ public class SparkDedupTest implements Serializable { parser .parseArgument( new String[] { - "-i", - testGraphBasePath, - "-asi", - testActionSetId, - "-la", - "lookupurl", - "-w", - testOutputBasePath + "-i", testGraphBasePath, + "-asi", testActionSetId, + "-la", "lookupurl", + "-w", testOutputBasePath, + "-np", "50" }); new SparkCreateSimRels(parser, spark).run(isLookUpService); @@ -535,7 +532,7 @@ public class SparkDedupTest implements Serializable { long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count(); - assertEquals(4971, relations); + assertEquals(4866, relations); // check deletedbyinference final Dataset mergeRels = spark From c6f6fb0f2853efcb1cce467fa3113242e88cf8f0 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 13 Jul 2020 16:46:13 +0200 Subject: [PATCH 38/65] code formatting --- .../dhp/oa/dedup/SparkCreateMergeRels.java | 6 +++--- .../dhp/oa/dedup/SparkCreateSimRels.java | 6 +++--- .../dhp/oa/dedup/SparkPropagateRelation.java | 17 +++++++++++------ .../eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java | 3 ++- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java index 70457fb4a0..6d625cd117 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java @@ -78,9 +78,9 @@ public class SparkCreateMergeRels extends AbstractSparkAction { final String isLookUpUrl = parser.get("isLookUpUrl"); final String actionSetId = parser.get("actionSetId"); int cut = Optional - .ofNullable(parser.get("cutConnectedComponent")) - .map(Integer::valueOf) - .orElse(0); + .ofNullable(parser.get("cutConnectedComponent")) + .map(Integer::valueOf) + .orElse(0); log.info("connected component cut: '{}'", cut); log.info("graphBasePath: '{}'", graphBasePath); log.info("isLookUpUrl: '{}'", isLookUpUrl); diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java index a66ab431c7..3beb90e0b3 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java @@ -65,9 +65,9 @@ public class SparkCreateSimRels extends AbstractSparkAction { final String actionSetId = parser.get("actionSetId"); final String workingPath = parser.get("workingPath"); final int numPartitions = Optional - .ofNullable(parser.get("numPartitions")) - .map(Integer::valueOf) - .orElse(NUM_PARTITIONS); + .ofNullable(parser.get("numPartitions")) + .map(Integer::valueOf) + .orElse(NUM_PARTITIONS); log.info("numPartitions: '{}'", numPartitions); log.info("graphBasePath: '{}'", graphBasePath); diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java index 1073adbead..03e6674e4d 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java @@ -3,7 +3,6 @@ package eu.dnetlib.dhp.oa.dedup; import static org.apache.spark.sql.functions.col; -import com.google.common.base.Joiner; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; @@ -13,6 +12,8 @@ import org.apache.spark.sql.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.base.Joiner; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.DataInfo; @@ -99,17 +100,21 @@ public class SparkPropagateRelation extends AbstractSparkAction { getDeletedFn()); save( - distinctRelations(newRels - .union(updated) - .union(mergeRels) - .map((MapFunction) r -> r, Encoders.kryo(Relation.class))), + distinctRelations( + newRels + .union(updated) + .union(mergeRels) + .map((MapFunction) r -> r, Encoders.kryo(Relation.class))), outputRelationPath, SaveMode.Overwrite); } private Dataset distinctRelations(Dataset rels) { return rels .filter(getRelationFilterFunction()) - .groupByKey((MapFunction) r -> String.join(r.getSource(), r.getTarget(), r.getRelType(), r.getSubRelType(), r.getRelClass()), Encoders.STRING()) + .groupByKey( + (MapFunction) r -> String + .join(r.getSource(), r.getTarget(), r.getRelType(), r.getSubRelType(), r.getRelClass()), + Encoders.STRING()) .agg(new RelationAggregator().toColumn()) .map((MapFunction, Relation>) t -> t._2(), Encoders.bean(Relation.class)); } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java index 91e45447ca..fb5ebc0993 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java @@ -292,7 +292,8 @@ public class SparkDedupTest implements Serializable { FileUtils.deleteDirectory(new File(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel")); FileUtils.deleteDirectory(new File(testOutputBasePath + "/" + testActionSetId + "/software_mergerel")); FileUtils.deleteDirectory(new File(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel")); - FileUtils.deleteDirectory(new File(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")); + FileUtils + .deleteDirectory(new File(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")); } @Test From 9258e4f095ab6e181d2c06161b1b3c69a4397669 Mon Sep 17 00:00:00 2001 From: miconis Date: Mon, 13 Jul 2020 18:22:34 +0200 Subject: [PATCH 39/65] implementation of a new workflow to compute statistics on the blocks --- .../eu/dnetlib/dhp/oa/dedup/DedupUtility.java | 5 + .../dnetlib/dhp/oa/dedup/SparkBlockStats.java | 121 ++++++++++++++++++ .../dhp/oa/dedup/model/BlockStats.java | 42 ++++++ .../oa/dedup/createBlockStats_parameters.json | 26 ++++ .../statistics/oozie_app/config-default.xml | 18 +++ .../dedup/statistics/oozie_app/workflow.xml | 111 ++++++++++++++++ 6 files changed, 323 insertions(+) create mode 100644 dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java create mode 100644 dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/BlockStats.java create mode 100644 dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json create mode 100644 dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/statistics/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/statistics/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java index 222794d64d..01065510ae 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java @@ -100,6 +100,11 @@ public class DedupUtility { return String.format("%s/%s/%s_mergerel", basePath, actionSetId, entityType); } + public static String createBlockStatsPath( + final String basePath, final String actionSetId, final String entityType) { + return String.format("%s/%s/%s_blockstats", basePath, actionSetId, entityType); + } + public static List getConfigurations(String isLookUpUrl, String orchestrator) throws ISLookUpException, DocumentException { final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookUpUrl); diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java new file mode 100644 index 0000000000..bfd98e78ee --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java @@ -0,0 +1,121 @@ +package eu.dnetlib.dhp.oa.dedup; + +import java.io.IOException; + +import eu.dnetlib.dhp.oa.dedup.model.BlockStats; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.dom4j.DocumentException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.dedup.model.Block; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.FieldListImpl; +import eu.dnetlib.pace.model.FieldValueImpl; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.MapDocumentUtil; +import scala.Tuple2; + +public class SparkBlockStats extends AbstractSparkAction { + + private static final Logger log = LoggerFactory.getLogger(SparkCreateSimRels.class); + + public SparkBlockStats(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + } + + public static void main(String[] args) throws Exception { + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateSimRels.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json"))); + parser.parseArgument(args); + + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf + .registerKryoClasses( + new Class[] { + MapDocument.class, FieldListImpl.class, FieldValueImpl.class, Block.class + }); + + new SparkCreateSimRels(parser, getSparkSession(conf)) + .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); + } + + @Override + public void run(ISLookUpService isLookUpService) + throws DocumentException, IOException, ISLookUpException { + + // read oozie parameters + final String graphBasePath = parser.get("graphBasePath"); + final String isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); + final String workingPath = parser.get("workingPath"); + + log.info("graphBasePath: '{}'", graphBasePath); + log.info("isLookUpUrl: '{}'", isLookUpUrl); + log.info("actionSetId: '{}'", actionSetId); + log.info("workingPath: '{}'", workingPath); + + // for each dedup configuration + for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { + + final String subEntity = dedupConf.getWf().getSubEntityValue(); + log.info("Creating blockstats for: '{}'", subEntity); + + final String outputPath = DedupUtility.createBlockStatsPath(workingPath, actionSetId, subEntity); + removeOutputDir(spark, outputPath); + + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaPairRDD mapDocuments = sc + .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) + .mapToPair( + (PairFunction) s -> { + MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); + return new Tuple2<>(d.getIdentifier(), d); + }); + + // create blocks for deduplication + JavaPairRDD blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf); + + JavaRDD blockStats = blocks.map(b -> + new BlockStats( + b._1(), + (long) b._2().getDocuments().size(), + computeComparisons( + (long) b._2().getDocuments().size(), (long) dedupConf.getWf().getSlidingWindowSize())) + ); + + // save the blockstats in the workingdir + spark + .createDataset(blockStats.rdd(), Encoders.bean(BlockStats.class)) + .write() + .mode(SaveMode.Overwrite) + .save(outputPath); + } + } + + public Long computeComparisons(Long blockSize, Long slidingWindowSize){ + + if (slidingWindowSize >= blockSize) + return (slidingWindowSize * (slidingWindowSize - 1)) / 2; + else { + return (blockSize - slidingWindowSize + 1) * (slidingWindowSize * (slidingWindowSize - 1)) / 2; + } + } +} diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/BlockStats.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/BlockStats.java new file mode 100644 index 0000000000..ef8505e8fa --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/BlockStats.java @@ -0,0 +1,42 @@ +package eu.dnetlib.dhp.oa.dedup.model; + +public class BlockStats { + + private String key; //key of the block + private Long size; //number of elements in the block + private Long comparisons; //number of comparisons in the block + + public BlockStats() { + } + + public BlockStats(String key, Long size, Long comparisons) { + this.key = key; + this.size = size; + this.comparisons = comparisons; + } + + public String getKey() { + return key; + } + + public void setKey(String key) { + this.key = key; + } + + public Long getSize() { + return size; + } + + public void setSize(Long size) { + this.size = size; + } + + public Long getComparisons() { + return comparisons; + } + + public void setComparisons(Long comparisons) { + this.comparisons = comparisons; + } + +} diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json new file mode 100644 index 0000000000..ce38dc6f00 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "la", + "paramLongName": "isLookUpUrl", + "paramDescription": "address for the LookUp", + "paramRequired": true + }, + { + "paramName": "asi", + "paramLongName": "actionSetId", + "paramDescription": "action set identifier (name of the orchestrator)", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "graphBasePath", + "paramDescription": "the base path of the raw graph", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "path of the working directory", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/statistics/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/statistics/oozie_app/config-default.xml new file mode 100644 index 0000000000..2e0ed9aeea --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/statistics/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/statistics/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/statistics/oozie_app/workflow.xml new file mode 100644 index 0000000000..e1019cd014 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/statistics/oozie_app/workflow.xml @@ -0,0 +1,111 @@ + + + + graphBasePath + the raw graph base path + + + isLookUpUrl + the address of the lookUp service + + + actionSetId + id of the actionSet + + + workingPath + path for the working directory + + + dedupGraphPath + path for the output graph + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + yarn + cluster + Create Similarity Relations + eu.dnetlib.dhp.oa.dedup.SparkBlockStats + dhp-dedup-openaire-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --i${graphBasePath} + --la${isLookUpUrl} + --asi${actionSetId} + --w${workingPath} + + + + + + + \ No newline at end of file From 03ecfa5ebd617eeac6d35006608fc742b96d92ac Mon Sep 17 00:00:00 2001 From: miconis Date: Mon, 13 Jul 2020 18:48:23 +0200 Subject: [PATCH 40/65] implementation of the test class for the new block stats spark action --- .../dhp/oa/dedup/model/BlockStats.java | 4 +- .../dnetlib/dhp/oa/dedup/SparkStatsTest.java | 175 ++++++++++++++++++ 2 files changed, 178 insertions(+), 1 deletion(-) create mode 100644 dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/BlockStats.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/BlockStats.java index ef8505e8fa..be4ec51a09 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/BlockStats.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/BlockStats.java @@ -1,6 +1,8 @@ package eu.dnetlib.dhp.oa.dedup.model; -public class BlockStats { +import java.io.Serializable; + +public class BlockStats implements Serializable { private String key; //key of the block private Long size; //number of elements in the block diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java new file mode 100644 index 0000000000..8fac56ef3e --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java @@ -0,0 +1,175 @@ +package eu.dnetlib.dhp.oa.dedup; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.Mockito; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.io.File; +import java.io.IOException; +import java.io.Serializable; +import java.net.URISyntaxException; +import java.nio.file.Paths; + +import static java.nio.file.Files.createTempDirectory; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.lenient; + +@ExtendWith(MockitoExtension.class) +public class SparkStatsTest implements Serializable { + + @Mock(serializable = true) + ISLookUpService isLookUpService; + + private static SparkSession spark; + private static JavaSparkContext jsc; + + private static String testGraphBasePath; + private static String testOutputBasePath; + private static final String testActionSetId = "test-orchestrator"; + + @BeforeAll + public static void cleanUp() throws IOException, URISyntaxException { + + testGraphBasePath = Paths + .get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/entities").toURI()) + .toFile() + .getAbsolutePath(); + testOutputBasePath = createTempDirectory(SparkDedupTest.class.getSimpleName() + "-") + .toAbsolutePath() + .toString(); + + FileUtils.deleteDirectory(new File(testOutputBasePath)); + + final SparkConf conf = new SparkConf(); + conf.set("spark.sql.shuffle.partitions", "200"); + spark = SparkSession + .builder() + .appName(SparkDedupTest.class.getSimpleName()) + .master("local[*]") + .config(conf) + .getOrCreate(); + + jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + } + + @BeforeEach + public void setUp() throws IOException, ISLookUpException { + + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator.xml"))); + + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("organization"))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); + + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication"))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json"))); + + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("software"))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json"))); + + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("dataset"))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json"))); + + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("otherresearchproduct"))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json"))); + } + + @Test + public void createBlockStatsTest() throws Exception { + + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateSimRels.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json"))); + parser + .parseArgument( + new String[] { + "-i", testGraphBasePath, + "-asi", testActionSetId, + "-la", "lookupurl", + "-w", testOutputBasePath + }); + + new SparkBlockStats(parser, spark).run(isLookUpService); + + long orgs_blocks = spark + .read() + .textFile(testOutputBasePath + "/" + testActionSetId + "/organization_blockstats") + .count(); + + long pubs_blocks = spark + .read() + .textFile(testOutputBasePath + "/" + testActionSetId + "/publication_blockstats") + .count(); + + long sw_blocks = spark + .read() + .textFile(testOutputBasePath + "/" + testActionSetId + "/software_blockstats") + .count(); + + long ds_blocks = spark + .read() + .textFile(testOutputBasePath + "/" + testActionSetId + "/dataset_blockstats") + .count(); + + long orp_blocks = spark + .read() + .textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats") + .count(); + + assertEquals(121, orgs_blocks); + assertEquals(110, pubs_blocks); + assertEquals(21, sw_blocks); + assertEquals(67, ds_blocks); + assertEquals(55, orp_blocks); + } +} + From 66f9f6d3239545d842798a869a1152f5204616ff Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 13 Jul 2020 19:26:46 +0200 Subject: [PATCH 41/65] adjusted parameters for the dedup stats workflow --- .../dnetlib/dhp/oa/dedup/SparkBlockStats.java | 142 +++++---- .../dhp/oa/dedup/model/BlockStats.java | 57 ++-- .../dedup/statistics/oozie_app/workflow.xml | 14 +- .../dnetlib/dhp/oa/dedup/SparkStatsTest.java | 270 +++++++++--------- 4 files changed, 237 insertions(+), 246 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java index bfd98e78ee..7fd359a0c9 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java @@ -1,8 +1,8 @@ + package eu.dnetlib.dhp.oa.dedup; import java.io.IOException; -import eu.dnetlib.dhp.oa.dedup.model.BlockStats; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; @@ -15,8 +15,10 @@ import org.apache.spark.sql.SparkSession; import org.dom4j.DocumentException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.oa.dedup.model.Block; +import eu.dnetlib.dhp.oa.dedup.model.BlockStats; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @@ -29,93 +31,87 @@ import scala.Tuple2; public class SparkBlockStats extends AbstractSparkAction { - private static final Logger log = LoggerFactory.getLogger(SparkCreateSimRels.class); + private static final Logger log = LoggerFactory.getLogger(SparkCreateSimRels.class); - public SparkBlockStats(ArgumentApplicationParser parser, SparkSession spark) { - super(parser, spark); - } + public SparkBlockStats(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + } - public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - SparkCreateSimRels.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json"))); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateSimRels.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json"))); + parser.parseArgument(args); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf - .registerKryoClasses( - new Class[] { - MapDocument.class, FieldListImpl.class, FieldValueImpl.class, Block.class - }); + SparkConf conf = new SparkConf(); - new SparkCreateSimRels(parser, getSparkSession(conf)) - .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); - } + new SparkCreateSimRels(parser, getSparkSession(conf)) + .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); + } - @Override - public void run(ISLookUpService isLookUpService) - throws DocumentException, IOException, ISLookUpException { + @Override + public void run(ISLookUpService isLookUpService) + throws DocumentException, IOException, ISLookUpException { - // read oozie parameters - final String graphBasePath = parser.get("graphBasePath"); - final String isLookUpUrl = parser.get("isLookUpUrl"); - final String actionSetId = parser.get("actionSetId"); - final String workingPath = parser.get("workingPath"); + // read oozie parameters + final String graphBasePath = parser.get("graphBasePath"); + final String isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); + final String workingPath = parser.get("workingPath"); - log.info("graphBasePath: '{}'", graphBasePath); - log.info("isLookUpUrl: '{}'", isLookUpUrl); - log.info("actionSetId: '{}'", actionSetId); - log.info("workingPath: '{}'", workingPath); + log.info("graphBasePath: '{}'", graphBasePath); + log.info("isLookUpUrl: '{}'", isLookUpUrl); + log.info("actionSetId: '{}'", actionSetId); + log.info("workingPath: '{}'", workingPath); - // for each dedup configuration - for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { + // for each dedup configuration + for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { - final String subEntity = dedupConf.getWf().getSubEntityValue(); - log.info("Creating blockstats for: '{}'", subEntity); + final String subEntity = dedupConf.getWf().getSubEntityValue(); + log.info("Creating blockstats for: '{}'", subEntity); - final String outputPath = DedupUtility.createBlockStatsPath(workingPath, actionSetId, subEntity); - removeOutputDir(spark, outputPath); + final String outputPath = DedupUtility.createBlockStatsPath(workingPath, actionSetId, subEntity); + removeOutputDir(spark, outputPath); - JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaPairRDD mapDocuments = sc - .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) - .mapToPair( - (PairFunction) s -> { - MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); - return new Tuple2<>(d.getIdentifier(), d); - }); + JavaPairRDD mapDocuments = sc + .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) + .mapToPair( + (PairFunction) s -> { + MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); + return new Tuple2<>(d.getIdentifier(), d); + }); - // create blocks for deduplication - JavaPairRDD blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf); + // create blocks for deduplication + JavaPairRDD blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf); - JavaRDD blockStats = blocks.map(b -> - new BlockStats( - b._1(), - (long) b._2().getDocuments().size(), - computeComparisons( - (long) b._2().getDocuments().size(), (long) dedupConf.getWf().getSlidingWindowSize())) - ); + JavaRDD blockStats = blocks + .map( + b -> new BlockStats( + b._1(), + (long) b._2().getDocuments().size(), + computeComparisons( + (long) b._2().getDocuments().size(), (long) dedupConf.getWf().getSlidingWindowSize()))); - // save the blockstats in the workingdir - spark - .createDataset(blockStats.rdd(), Encoders.bean(BlockStats.class)) - .write() - .mode(SaveMode.Overwrite) - .save(outputPath); - } - } + // save the blockstats in the workingdir + spark + .createDataset(blockStats.rdd(), Encoders.bean(BlockStats.class)) + .write() + .mode(SaveMode.Overwrite) + .save(outputPath); + } + } - public Long computeComparisons(Long blockSize, Long slidingWindowSize){ + public Long computeComparisons(Long blockSize, Long slidingWindowSize) { - if (slidingWindowSize >= blockSize) - return (slidingWindowSize * (slidingWindowSize - 1)) / 2; - else { - return (blockSize - slidingWindowSize + 1) * (slidingWindowSize * (slidingWindowSize - 1)) / 2; - } - } + if (slidingWindowSize >= blockSize) + return (slidingWindowSize * (slidingWindowSize - 1)) / 2; + else { + return (blockSize - slidingWindowSize + 1) * (slidingWindowSize * (slidingWindowSize - 1)) / 2; + } + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/BlockStats.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/BlockStats.java index be4ec51a09..0ec8c26999 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/BlockStats.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/BlockStats.java @@ -1,44 +1,45 @@ + package eu.dnetlib.dhp.oa.dedup.model; import java.io.Serializable; public class BlockStats implements Serializable { - private String key; //key of the block - private Long size; //number of elements in the block - private Long comparisons; //number of comparisons in the block + private String key; // key of the block + private Long size; // number of elements in the block + private Long comparisons; // number of comparisons in the block - public BlockStats() { - } + public BlockStats() { + } - public BlockStats(String key, Long size, Long comparisons) { - this.key = key; - this.size = size; - this.comparisons = comparisons; - } + public BlockStats(String key, Long size, Long comparisons) { + this.key = key; + this.size = size; + this.comparisons = comparisons; + } - public String getKey() { - return key; - } + public String getKey() { + return key; + } - public void setKey(String key) { - this.key = key; - } + public void setKey(String key) { + this.key = key; + } - public Long getSize() { - return size; - } + public Long getSize() { + return size; + } - public void setSize(Long size) { - this.size = size; - } + public void setSize(Long size) { + this.size = size; + } - public Long getComparisons() { - return comparisons; - } + public Long getComparisons() { + return comparisons; + } - public void setComparisons(Long comparisons) { - this.comparisons = comparisons; - } + public void setComparisons(Long comparisons) { + this.comparisons = comparisons; + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/statistics/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/statistics/oozie_app/workflow.xml index e1019cd014..8b75d16b3d 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/statistics/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/statistics/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + graphBasePath @@ -12,14 +12,6 @@ actionSetId id of the actionSet - - workingPath - path for the working directory - - - dedupGraphPath - path for the output graph - sparkDriverMemory memory for driver process @@ -85,7 +77,7 @@ yarn cluster - Create Similarity Relations + Create deduplication blocks eu.dnetlib.dhp.oa.dedup.SparkBlockStats dhp-dedup-openaire-${projectVersion}.jar @@ -101,7 +93,7 @@ --i${graphBasePath} --la${isLookUpUrl} --asi${actionSetId} - --w${workingPath} + --w${workingDir} diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java index 8fac56ef3e..7e76c284b7 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java @@ -1,8 +1,17 @@ + package eu.dnetlib.dhp.oa.dedup; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import static java.nio.file.Files.createTempDirectory; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.lenient; + +import java.io.File; +import java.io.IOException; +import java.io.Serializable; +import java.net.URISyntaxException; +import java.nio.file.Paths; + import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; @@ -14,162 +23,155 @@ import org.mockito.Mock; import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; -import java.io.File; -import java.io.IOException; -import java.io.Serializable; -import java.net.URISyntaxException; -import java.nio.file.Paths; - -import static java.nio.file.Files.createTempDirectory; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.mockito.Mockito.lenient; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @ExtendWith(MockitoExtension.class) public class SparkStatsTest implements Serializable { - @Mock(serializable = true) - ISLookUpService isLookUpService; + @Mock(serializable = true) + ISLookUpService isLookUpService; - private static SparkSession spark; - private static JavaSparkContext jsc; + private static SparkSession spark; + private static JavaSparkContext jsc; - private static String testGraphBasePath; - private static String testOutputBasePath; - private static final String testActionSetId = "test-orchestrator"; + private static String testGraphBasePath; + private static String testOutputBasePath; + private static final String testActionSetId = "test-orchestrator"; - @BeforeAll - public static void cleanUp() throws IOException, URISyntaxException { + @BeforeAll + public static void cleanUp() throws IOException, URISyntaxException { - testGraphBasePath = Paths - .get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/entities").toURI()) - .toFile() - .getAbsolutePath(); - testOutputBasePath = createTempDirectory(SparkDedupTest.class.getSimpleName() + "-") - .toAbsolutePath() - .toString(); + testGraphBasePath = Paths + .get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/entities").toURI()) + .toFile() + .getAbsolutePath(); + testOutputBasePath = createTempDirectory(SparkDedupTest.class.getSimpleName() + "-") + .toAbsolutePath() + .toString(); - FileUtils.deleteDirectory(new File(testOutputBasePath)); + FileUtils.deleteDirectory(new File(testOutputBasePath)); - final SparkConf conf = new SparkConf(); - conf.set("spark.sql.shuffle.partitions", "200"); - spark = SparkSession - .builder() - .appName(SparkDedupTest.class.getSimpleName()) - .master("local[*]") - .config(conf) - .getOrCreate(); + final SparkConf conf = new SparkConf(); + conf.set("spark.sql.shuffle.partitions", "200"); + spark = SparkSession + .builder() + .appName(SparkDedupTest.class.getSimpleName()) + .master("local[*]") + .config(conf) + .getOrCreate(); - jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - } + jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + } - @BeforeEach - public void setUp() throws IOException, ISLookUpException { + @BeforeEach + public void setUp() throws IOException, ISLookUpException { - lenient() - .when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId))) - .thenReturn( - IOUtils - .toString( - SparkDedupTest.class - .getResourceAsStream( - "/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator.xml"))); + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator.xml"))); - lenient() - .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("organization"))) - .thenReturn( - IOUtils - .toString( - SparkDedupTest.class - .getResourceAsStream( - "/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("organization"))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); - lenient() - .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication"))) - .thenReturn( - IOUtils - .toString( - SparkDedupTest.class - .getResourceAsStream( - "/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json"))); + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication"))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json"))); - lenient() - .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("software"))) - .thenReturn( - IOUtils - .toString( - SparkDedupTest.class - .getResourceAsStream( - "/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json"))); + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("software"))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json"))); - lenient() - .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("dataset"))) - .thenReturn( - IOUtils - .toString( - SparkDedupTest.class - .getResourceAsStream( - "/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json"))); + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("dataset"))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json"))); - lenient() - .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("otherresearchproduct"))) - .thenReturn( - IOUtils - .toString( - SparkDedupTest.class - .getResourceAsStream( - "/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json"))); - } + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("otherresearchproduct"))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json"))); + } - @Test - public void createBlockStatsTest() throws Exception { + @Test + public void createBlockStatsTest() throws Exception { - ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - SparkCreateSimRels.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json"))); - parser - .parseArgument( - new String[] { - "-i", testGraphBasePath, - "-asi", testActionSetId, - "-la", "lookupurl", - "-w", testOutputBasePath - }); + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateSimRels.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json"))); + parser + .parseArgument( + new String[] { + "-i", testGraphBasePath, + "-asi", testActionSetId, + "-la", "lookupurl", + "-w", testOutputBasePath + }); - new SparkBlockStats(parser, spark).run(isLookUpService); + new SparkBlockStats(parser, spark).run(isLookUpService); - long orgs_blocks = spark - .read() - .textFile(testOutputBasePath + "/" + testActionSetId + "/organization_blockstats") - .count(); + long orgs_blocks = spark + .read() + .textFile(testOutputBasePath + "/" + testActionSetId + "/organization_blockstats") + .count(); - long pubs_blocks = spark - .read() - .textFile(testOutputBasePath + "/" + testActionSetId + "/publication_blockstats") - .count(); + long pubs_blocks = spark + .read() + .textFile(testOutputBasePath + "/" + testActionSetId + "/publication_blockstats") + .count(); - long sw_blocks = spark - .read() - .textFile(testOutputBasePath + "/" + testActionSetId + "/software_blockstats") - .count(); + long sw_blocks = spark + .read() + .textFile(testOutputBasePath + "/" + testActionSetId + "/software_blockstats") + .count(); - long ds_blocks = spark - .read() - .textFile(testOutputBasePath + "/" + testActionSetId + "/dataset_blockstats") - .count(); + long ds_blocks = spark + .read() + .textFile(testOutputBasePath + "/" + testActionSetId + "/dataset_blockstats") + .count(); - long orp_blocks = spark - .read() - .textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats") - .count(); + long orp_blocks = spark + .read() + .textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats") + .count(); - assertEquals(121, orgs_blocks); - assertEquals(110, pubs_blocks); - assertEquals(21, sw_blocks); - assertEquals(67, ds_blocks); - assertEquals(55, orp_blocks); - } + assertEquals(121, orgs_blocks); + assertEquals(110, pubs_blocks); + assertEquals(21, sw_blocks); + assertEquals(67, ds_blocks); + assertEquals(55, orp_blocks); + } } - From b8a45041fdbf0142a2abdfb36b9d5a1bb21551a7 Mon Sep 17 00:00:00 2001 From: miconis Date: Mon, 13 Jul 2020 19:53:18 +0200 Subject: [PATCH 42/65] minor changes --- .../main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java index bfd98e78ee..31aa1ae81f 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java @@ -29,7 +29,7 @@ import scala.Tuple2; public class SparkBlockStats extends AbstractSparkAction { - private static final Logger log = LoggerFactory.getLogger(SparkCreateSimRels.class); + private static final Logger log = LoggerFactory.getLogger(SparkBlockStats.class); public SparkBlockStats(ArgumentApplicationParser parser, SparkSession spark) { super(parser, spark); @@ -39,7 +39,7 @@ public class SparkBlockStats extends AbstractSparkAction { ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils .toString( - SparkCreateSimRels.class + SparkBlockStats.class .getResourceAsStream( "/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json"))); parser.parseArgument(args); @@ -52,7 +52,7 @@ public class SparkBlockStats extends AbstractSparkAction { MapDocument.class, FieldListImpl.class, FieldValueImpl.class, Block.class }); - new SparkCreateSimRels(parser, getSparkSession(conf)) + new SparkBlockStats(parser, getSparkSession(conf)) .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); } From 06def0c0cb7ac5f2bef1a56b9a4fad3bc9ea3415 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 13 Jul 2020 20:09:06 +0200 Subject: [PATCH 43/65] SparkBlockStats allows to repartition the input rdd via the numPartitions workflow parameter --- .../dhp/oa/dedup/AbstractSparkAction.java | 2 + .../dnetlib/dhp/oa/dedup/SparkBlockStats.java | 61 +++++++++---------- .../dhp/oa/dedup/SparkCreateSimRels.java | 2 - .../oa/dedup/createBlockStats_parameters.json | 6 ++ .../dedup/statistics/oozie_app/workflow.xml | 13 ++-- 5 files changed, 47 insertions(+), 37 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java index 2120da0808..74cecb7b6b 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java @@ -28,6 +28,8 @@ import eu.dnetlib.pace.config.DedupConfig; abstract class AbstractSparkAction implements Serializable { + protected static final int NUM_PARTITIONS = 1000; + protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java index 49f8123e87..d5de309670 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.oa.dedup; import java.io.IOException; +import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; @@ -23,49 +24,41 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.FieldListImpl; -import eu.dnetlib.pace.model.FieldValueImpl; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.MapDocumentUtil; import scala.Tuple2; public class SparkBlockStats extends AbstractSparkAction { - private static final Logger log = LoggerFactory.getLogger(SparkBlockStats.class); + private static final Logger log = LoggerFactory.getLogger(SparkBlockStats.class); - public SparkBlockStats(ArgumentApplicationParser parser, SparkSession spark) { - super(parser, spark); - } + public SparkBlockStats(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + } - public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - SparkBlockStats.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json"))); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkBlockStats.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json"))); + parser.parseArgument(args); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf - .registerKryoClasses( - new Class[] { - MapDocument.class, FieldListImpl.class, FieldValueImpl.class, Block.class - }); + SparkConf conf = new SparkConf(); - new SparkBlockStats(parser, getSparkSession(conf)) - .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); - } + new SparkBlockStats(parser, getSparkSession(conf)) + .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); + } - public Long computeComparisons(Long blockSize, Long slidingWindowSize){ + public Long computeComparisons(Long blockSize, Long slidingWindowSize) { - if (slidingWindowSize >= blockSize) - return (slidingWindowSize * (slidingWindowSize - 1)) / 2; - else { - return (blockSize - slidingWindowSize + 1) * (slidingWindowSize * (slidingWindowSize - 1)) / 2; - } - } + if (slidingWindowSize >= blockSize) + return (slidingWindowSize * (slidingWindowSize - 1)) / 2; + else { + return (blockSize - slidingWindowSize + 1) * (slidingWindowSize * (slidingWindowSize - 1)) / 2; + } + } @Override public void run(ISLookUpService isLookUpService) @@ -76,6 +69,10 @@ public class SparkBlockStats extends AbstractSparkAction { final String isLookUpUrl = parser.get("isLookUpUrl"); final String actionSetId = parser.get("actionSetId"); final String workingPath = parser.get("workingPath"); + final int numPartitions = Optional + .ofNullable(parser.get("numPartitions")) + .map(Integer::valueOf) + .orElse(NUM_PARTITIONS); log.info("graphBasePath: '{}'", graphBasePath); log.info("isLookUpUrl: '{}'", isLookUpUrl); @@ -95,6 +92,7 @@ public class SparkBlockStats extends AbstractSparkAction { JavaPairRDD mapDocuments = sc .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) + .repartition(numPartitions) .mapToPair( (PairFunction) s -> { MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); @@ -105,6 +103,7 @@ public class SparkBlockStats extends AbstractSparkAction { JavaPairRDD blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf); JavaRDD blockStats = blocks + .repartition(numPartitions) .map( b -> new BlockStats( b._1(), diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java index 3beb90e0b3..b3ee47bfc2 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java @@ -35,8 +35,6 @@ public class SparkCreateSimRels extends AbstractSparkAction { private static final Logger log = LoggerFactory.getLogger(SparkCreateSimRels.class); - public static final int NUM_PARTITIONS = 1000; - public SparkCreateSimRels(ArgumentApplicationParser parser, SparkSession spark) { super(parser, spark); } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json index ce38dc6f00..09f4365d34 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json @@ -22,5 +22,11 @@ "paramLongName": "workingPath", "paramDescription": "path of the working directory", "paramRequired": true + }, + { + "paramName": "np", + "paramLongName": "numPartitions", + "paramDescription": "number of partitions for the similarity relations intermediate phases", + "paramRequired": false } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/statistics/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/statistics/oozie_app/workflow.xml index 8b75d16b3d..c0080b028f 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/statistics/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/statistics/oozie_app/workflow.xml @@ -12,6 +12,10 @@ actionSetId id of the actionSet + + numPartitions + number of partitions for the similarity relations intermediate phases + sparkDriverMemory memory for driver process @@ -90,10 +94,11 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 - --i${graphBasePath} - --la${isLookUpUrl} - --asi${actionSetId} - --w${workingDir} + --graphBasePath${graphBasePath} + --isLookUpUrl${isLookUpUrl} + --actionSetId${actionSetId} + --workingPath${workingDir} + --numPartitions${numPartitions} From 8e97598eb4b56cc1ca05260e59b81e6bfae4aff4 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 13 Jul 2020 20:46:14 +0200 Subject: [PATCH 44/65] avoid to NPE in case of null instances --- .../CreateRelatedEntitiesJob_phase1.java | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java index 57dca7bb15..f3188007a0 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java @@ -184,13 +184,16 @@ public class CreateRelatedEntitiesJob_phase1 { re.setDateofacceptance(getValue(result.getDateofacceptance())); re.setPublisher(getValue(result.getPublisher())); re.setResulttype(result.getResulttype()); - re - .setInstances( - result - .getInstance() - .stream() - .limit(ProvisionConstants.MAX_INSTANCES) - .collect(Collectors.toList())); + if (Objects.nonNull(result.getInstance())) { + re + .setInstances( + result + .getInstance() + .stream() + .filter(Objects::nonNull) + .limit(ProvisionConstants.MAX_INSTANCES) + .collect(Collectors.toList())); + } // TODO still to be mapped // re.setCodeRepositoryUrl(j.read("$.coderepositoryurl")); From 7d6e269b40d0b54b33fe450c6008f973b77ed708 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 13 Jul 2020 22:54:04 +0200 Subject: [PATCH 45/65] reverted CreateRelatedEntitiesJob_phase1 to its previous state --- .../CreateRelatedEntitiesJob_phase1.java | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java index f3188007a0..b08e593f73 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java @@ -116,23 +116,10 @@ public class CreateRelatedEntitiesJob_phase1 { Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class))) .cache(); - final String relatedEntityPath = outputPath + "_relatedEntity"; - readPathEntity(spark, inputEntityPath, clazz) + Dataset> entities = readPathEntity(spark, inputEntityPath, clazz) .filter("dataInfo.invisible == false") .map( - (MapFunction) value -> asRelatedEntity(value, clazz), - Encoders.kryo(RelatedEntity.class)) - .repartition(5000) - .write() - .mode(SaveMode.Overwrite) - .parquet(relatedEntityPath); - - Dataset> entities = spark - .read() - .load(relatedEntityPath) - .as(Encoders.kryo(RelatedEntity.class)) - .map( - (MapFunction>) e -> new Tuple2<>(e.getId(), e), + (MapFunction>) e -> new Tuple2<>(e.getId(), asRelatedEntity(e, clazz)), Encoders.tuple(Encoders.STRING(), Encoders.kryo(RelatedEntity.class))) .cache(); From 262c29463e19d696899d3e6a730cb9bbf7a19d26 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 15 Jul 2020 09:18:40 +0200 Subject: [PATCH 46/65] relations with multiple datasources --- .../dhp/broker/model/EventFactory.java | 24 ++- .../dnetlib/dhp/broker/oa/JoinStep0Job.java | 20 +- .../oa/PrepareRelatedDatasourcesJob.java | 55 ++++- .../dhp/broker/oa/matchers/UpdateMatcher.java | 10 +- .../dhp/broker/oa/util/BrokerConstants.java | 4 + .../dhp/broker/oa/util/ConversionUtils.java | 16 +- .../util/DatasourceRelationsAccumulator.java | 68 ++++++ .../dhp/broker/oa/util/EventFinder.java | 17 +- .../dhp/broker/oa/util/UpdateInfo.java | 25 ++- .../withRels/RelatedDatasource.java | 42 ++++ ....java => RelatedDatasourceAggregator.java} | 24 ++- .../withRels/SimpleDatasourceInfo.java | 40 ---- .../oa/generate_all/oozie_app/workflow.xml | 24 +++ .../broker/oa/partial/oozie_app/workflow.xml | 199 +++++++++++++++++- .../broker/oa/matchers/UpdateMatcherTest.java | 20 +- pom.xml | 2 +- 16 files changed, 495 insertions(+), 95 deletions(-) create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/DatasourceRelationsAccumulator.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasource.java rename dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/{AddDatasourceTypeAggregator.java => RelatedDatasourceAggregator.java} (55%) delete mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/SimpleDatasourceInfo.java diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java index 49e7506981..4a58cfd36d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java @@ -11,6 +11,8 @@ import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.time.DateUtils; import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; public class EventFactory { @@ -52,9 +54,11 @@ public class EventFactory { final OaBrokerMainEntity source = updateInfo.getSource(); final OaBrokerMainEntity target = updateInfo.getTarget(); - map.setTargetDatasourceId(target.getCollectedFromId()); - map.setTargetDatasourceName(target.getCollectedFromName()); - map.setTargetDatasourceType(target.getCollectedFromType()); + final OaBrokerRelatedDatasource targetDs = updateInfo.getTargetDs(); + + map.setTargetDatasourceId(targetDs.getOpenaireId()); + map.setTargetDatasourceName(targetDs.getName()); + map.setTargetDatasourceType(targetDs.getType()); map.setTargetResultId(target.getOpenaireId()); @@ -73,11 +77,19 @@ public class EventFactory { // PROVENANCE INFO map.setTrust(updateInfo.getTrust()); - map.setProvenanceDatasourceId(source.getCollectedFromId()); - map.setProvenanceDatasourceName(source.getCollectedFromName()); - map.setProvenanceDatasourceType(source.getCollectedFromType()); map.setProvenanceResultId(source.getOpenaireId()); + source + .getDatasources() + .stream() + .filter(ds -> ds.getRelType().equals(BrokerConstants.COLLECTED_FROM_REL)) + .findFirst() + .ifPresent(ds -> { + map.setProvenanceDatasourceId(ds.getOpenaireId()); + map.setProvenanceDatasourceName(ds.getName()); + map.setProvenanceDatasourceType(ds.getType()); + }); + return map; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep0Job.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep0Job.java index eb1825fa5a..39fa76e43d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep0Job.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep0Job.java @@ -17,8 +17,8 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.AddDatasourceTypeAggregator; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.SimpleDatasourceInfo; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDatasource; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDatasourceAggregator; import scala.Tuple2; public class JoinStep0Job { @@ -45,33 +45,33 @@ public class JoinStep0Job { final String workingPath = parser.get("workingPath"); log.info("workingPath: {}", workingPath); - final String outputPath = workingPath + "/joinedEntities_step0"; - log.info("outputPath: {}", outputPath); + final String joinedEntitiesPath = workingPath + "/joinedEntities_step0"; + log.info("joinedEntitiesPath: {}", joinedEntitiesPath); final SparkConf conf = new SparkConf(); runWithSparkSession(conf, isSparkSessionManaged, spark -> { - ClusterUtils.removeDir(spark, outputPath); + ClusterUtils.removeDir(spark, joinedEntitiesPath); final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities"); final Dataset sources = ClusterUtils .readPath(spark, workingPath + "/simpleEntities", OaBrokerMainEntity.class); - final Dataset datasources = ClusterUtils - .readPath(spark, workingPath + "/datasources", SimpleDatasourceInfo.class); + final Dataset typedRels = ClusterUtils + .readPath(spark, workingPath + "/relatedDatasources", RelatedDatasource.class); - final TypedColumn, OaBrokerMainEntity> aggr = new AddDatasourceTypeAggregator() + final TypedColumn, OaBrokerMainEntity> aggr = new RelatedDatasourceAggregator() .toColumn(); final Dataset dataset = sources - .joinWith(datasources, sources.col("collectedFromId").equalTo(datasources.col("id")), "inner") + .joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer") .groupByKey(t -> t._1.getOpenaireId(), Encoders.STRING()) .agg(aggr) .map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class)); - ClusterUtils.save(dataset, outputPath, OaBrokerMainEntity.class, total); + ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total); }); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasourcesJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasourcesJob.java index 30f5ddac33..166372a7f4 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasourcesJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasourcesJob.java @@ -9,14 +9,23 @@ import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; import org.apache.spark.util.LongAccumulator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; -import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.SimpleDatasourceInfo; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.DatasourceRelationsAccumulator; +import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDatasource; import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.Software; +import scala.Tuple3; public class PrepareRelatedDatasourcesJob { @@ -42,7 +51,7 @@ public class PrepareRelatedDatasourcesJob { final String workingPath = parser.get("workingPath"); log.info("workingPath: {}", workingPath); - final String relsPath = workingPath + "/datasources"; + final String relsPath = workingPath + "/relatedDatasources"; log.info("relsPath: {}", relsPath); final SparkConf conf = new SparkConf(); @@ -53,16 +62,46 @@ public class PrepareRelatedDatasourcesJob { final LongAccumulator total = spark.sparkContext().longAccumulator("total_datasources"); - final Dataset dataset = ClusterUtils - .readPath(spark, graphPath + "/datasource", Datasource.class) - .map( - ds -> new SimpleDatasourceInfo(ds.getId(), ds.getDatasourcetype().getClassid()), - Encoders.bean(SimpleDatasourceInfo.class)); + final Dataset> rels = prepareResultTuples( + spark, graphPath, Publication.class) + .union(prepareResultTuples(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class)) + .union(prepareResultTuples(spark, graphPath, Software.class)) + .union(prepareResultTuples(spark, graphPath, OtherResearchProduct.class)); - ClusterUtils.save(dataset, relsPath, SimpleDatasourceInfo.class, total); + final Dataset datasources = ClusterUtils + .readPath(spark, graphPath + "/datasource", Datasource.class) + .map(ConversionUtils::oafDatasourceToBrokerDatasource, Encoders.bean(OaBrokerRelatedDatasource.class)); + + final Dataset dataset = rels + .joinWith(datasources, datasources.col("openaireId").equalTo(rels.col("_2")), "inner") + .map(t -> { + final RelatedDatasource r = new RelatedDatasource(); + r.setSource(t._1._1()); + r.setRelDatasource(t._2); + r.getRelDatasource().setRelType(t._1._3()); + return r; + }, Encoders.bean(RelatedDatasource.class)); + + ClusterUtils.save(dataset, relsPath, RelatedDatasource.class, total); }); } + private static final Dataset> prepareResultTuples(final SparkSession spark, + final String graphPath, + final Class sourceClass) { + + return ClusterUtils + .readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass) + .filter(r -> !ClusterUtils.isDedupRoot(r.getId())) + .filter(r -> r.getDataInfo().getDeletedbyinference()) + .map( + r -> DatasourceRelationsAccumulator.calculateTuples(r), + Encoders.bean(DatasourceRelationsAccumulator.class)) + .flatMap( + acc -> acc.getRels().iterator(), + Encoders.tuple(Encoders.STRING(), Encoders.STRING(), Encoders.STRING())); + } + } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java index 3d688fa1d1..fba82aa8cc 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java @@ -15,6 +15,7 @@ import org.apache.commons.lang3.StringUtils; import org.apache.spark.util.LongAccumulator; import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource; import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; @@ -34,18 +35,19 @@ public abstract class UpdateMatcher { this.highlightToStringFunction = highlightToStringFunction; } - public Collection> searchUpdatesForRecord(final OaBrokerMainEntity res, + public Collection> searchUpdatesForRecord(final OaBrokerMainEntity target, + final OaBrokerRelatedDatasource targetDs, final Collection others, final Map accumulators) { final Map> infoMap = new HashMap<>(); for (final OaBrokerMainEntity source : others) { - if (source != res) { - for (final T hl : findDifferences(source, res)) { + if (source != target) { + for (final T hl : findDifferences(source, target)) { final Topic topic = getTopicFunction().apply(hl); if (topic != null) { - final UpdateInfo info = new UpdateInfo<>(topic, hl, source, res, + final UpdateInfo info = new UpdateInfo<>(topic, hl, source, target, targetDs, getCompileHighlightFunction(), getHighlightToStringFunction()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java index 5308b9dffb..7a09862d8c 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java @@ -14,6 +14,10 @@ public class BrokerConstants { public static final String OPEN_ACCESS = "OPEN"; public static final String IS_MERGED_IN_CLASS = "isMergedIn"; + public static final String COLLECTED_FROM_REL = "collectedFrom"; + + public static final String HOSTED_BY_REL = "hostedBy"; + public static final float MIN_TRUST = 0.25f; public static final float MAX_TRUST = 1.00f; diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java index d00c5b817d..053627a5fb 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java @@ -22,11 +22,13 @@ import eu.dnetlib.broker.objects.OaBrokerJournal; import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.broker.objects.OaBrokerProject; import eu.dnetlib.broker.objects.OaBrokerRelatedDataset; +import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource; import eu.dnetlib.broker.objects.OaBrokerRelatedPublication; import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Datasource; import eu.dnetlib.dhp.schema.oaf.ExternalReference; import eu.dnetlib.dhp.schema.oaf.Field; import eu.dnetlib.dhp.schema.oaf.Instance; @@ -119,8 +121,6 @@ public class ConversionUtils { res .setJournal( result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null); - res.setCollectedFromId(mappedFirst(result.getCollectedfrom(), KeyValue::getKey)); - res.setCollectedFromName(mappedFirst(result.getCollectedfrom(), KeyValue::getValue)); res.setPids(mappedList(result.getPid(), ConversionUtils::oafPidToBrokerPid)); res.setInstances(flatMappedList(result.getInstance(), ConversionUtils::oafInstanceToBrokerInstances)); res @@ -223,6 +223,18 @@ public class ConversionUtils { return res; } + public static final OaBrokerRelatedDatasource oafDatasourceToBrokerDatasource(final Datasource ds) { + if (ds == null) { + return null; + } + + final OaBrokerRelatedDatasource res = new OaBrokerRelatedDatasource(); + res.setName(StringUtils.defaultIfBlank(fieldValue(ds.getOfficialname()), fieldValue(ds.getEnglishname()))); + res.setOpenaireId(ds.getId()); + res.setType(classId(ds.getDatasourcetype())); + return res; + } + private static String first(final List list) { return list != null && list.size() > 0 ? list.get(0) : null; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/DatasourceRelationsAccumulator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/DatasourceRelationsAccumulator.java new file mode 100644 index 0000000000..75c4625ce4 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/DatasourceRelationsAccumulator.java @@ -0,0 +1,68 @@ + +package eu.dnetlib.dhp.broker.oa.util; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; + +import eu.dnetlib.dhp.schema.oaf.Result; +import scala.Tuple3; + +public class DatasourceRelationsAccumulator implements Serializable { + + /** + * + */ + private static final long serialVersionUID = 3256220670651218957L; + + private List> rels = new ArrayList<>(); + + public List> getRels() { + return rels; + } + + public void setRels(final List> rels) { + this.rels = rels; + } + + protected void addTuple(final Tuple3 t) { + rels.add(t); + } + + public static final DatasourceRelationsAccumulator calculateTuples(final Result r) { + + final Set collectedFromSet = r + .getCollectedfrom() + .stream() + .map(kv -> kv.getKey()) + .filter(StringUtils::isNotBlank) + .distinct() + .collect(Collectors.toSet()); + + final Set hostedBySet = r + .getInstance() + .stream() + .map(i -> i.getHostedby()) + .filter(Objects::nonNull) + .filter(kv -> !StringUtils.equalsIgnoreCase(kv.getValue(), "Unknown Repository")) + .map(kv -> kv.getKey()) + .filter(StringUtils::isNotBlank) + .distinct() + .filter(id -> !collectedFromSet.contains(id)) + .collect(Collectors.toSet()); + + final DatasourceRelationsAccumulator res = new DatasourceRelationsAccumulator(); + collectedFromSet + .stream() + .map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.COLLECTED_FROM_REL)) + .forEach(res::addTuple); + hostedBySet.stream().map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.HOSTED_BY_REL)).forEach(res::addTuple); + return res; + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java index b6328eb954..1ab56cc346 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java @@ -11,6 +11,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource; import eu.dnetlib.dhp.broker.model.EventFactory; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsReferencedBy; @@ -80,9 +81,11 @@ public class EventFinder { final List> list = new ArrayList<>(); for (final OaBrokerMainEntity target : results.getData()) { - if (verifyTarget(target, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) { - for (final UpdateMatcher matcher : matchers) { - list.addAll(matcher.searchUpdatesForRecord(target, results.getData(), accumulators)); + for (final OaBrokerRelatedDatasource targetDs : target.getDatasources()) { + if (verifyTarget(targetDs, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) { + for (final UpdateMatcher matcher : matchers) { + list.addAll(matcher.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators)); + } } } } @@ -90,17 +93,17 @@ public class EventFinder { return asEventGroup(list); } - private static boolean verifyTarget(final OaBrokerMainEntity target, + private static boolean verifyTarget(final OaBrokerRelatedDatasource target, final Set dsIdWhitelist, final Set dsIdBlacklist, final Set dsTypeWhitelist) { - if (dsIdWhitelist.contains(target.getCollectedFromId())) { + if (dsIdWhitelist.contains(target.getOpenaireId())) { return true; - } else if (dsIdBlacklist.contains(target.getCollectedFromId())) { + } else if (dsIdBlacklist.contains(target.getOpenaireId())) { return false; } else { - return dsTypeWhitelist.contains(target.getCollectedFromType()); + return dsTypeWhitelist.contains(target.getType()); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java index ef8fb240c4..fca954247d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java @@ -8,6 +8,7 @@ import eu.dnetlib.broker.objects.OaBrokerEventPayload; import eu.dnetlib.broker.objects.OaBrokerInstance; import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.broker.objects.OaBrokerProvenance; +import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource; import eu.dnetlib.dhp.broker.model.Topic; public final class UpdateInfo { @@ -20,6 +21,8 @@ public final class UpdateInfo { private final OaBrokerMainEntity target; + private final OaBrokerRelatedDatasource targetDs; + private final BiConsumer compileHighlight; private final Function highlightToString; @@ -28,12 +31,14 @@ public final class UpdateInfo { public UpdateInfo(final Topic topic, final T highlightValue, final OaBrokerMainEntity source, final OaBrokerMainEntity target, + final OaBrokerRelatedDatasource targetDs, final BiConsumer compileHighlight, final Function highlightToString) { this.topic = topic; this.highlightValue = highlightValue; this.source = source; this.target = target; + this.targetDs = targetDs; this.compileHighlight = compileHighlight; this.highlightToString = highlightToString; this.trust = TrustUtils.calculateTrust(source, target); @@ -51,6 +56,10 @@ public final class UpdateInfo { return target; } + public OaBrokerRelatedDatasource getTargetDs() { + return targetDs; + } + protected Topic getTopic() { return topic; } @@ -75,8 +84,20 @@ public final class UpdateInfo { compileHighlight.accept(hl, getHighlightValue()); final String provId = getSource().getOpenaireId(); - final String provRepo = getSource().getCollectedFromName(); - final String provType = getSource().getCollectedFromType(); + final String provRepo = getSource() + .getDatasources() + .stream() + .filter(ds -> ds.getRelType().equals(BrokerConstants.COLLECTED_FROM_REL)) + .map(ds -> ds.getName()) + .findFirst() + .orElse(""); + final String provType = getSource() + .getDatasources() + .stream() + .filter(ds -> ds.getRelType().equals(BrokerConstants.COLLECTED_FROM_REL)) + .map(ds -> ds.getType()) + .findFirst() + .orElse(""); final String provUrl = getSource() .getInstances() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasource.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasource.java new file mode 100644 index 0000000000..a27df502bf --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasource.java @@ -0,0 +1,42 @@ + +package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; + +import java.io.Serializable; + +import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource; + +public class RelatedDatasource implements Serializable { + + /** + * + */ + private static final long serialVersionUID = 3015550240920424010L; + + private String source; + private OaBrokerRelatedDatasource relDatasource; + + public RelatedDatasource() { + } + + public RelatedDatasource(final String source, final OaBrokerRelatedDatasource relDatasource) { + this.source = source; + this.relDatasource = relDatasource; + } + + public String getSource() { + return source; + } + + public void setSource(final String source) { + this.source = source; + } + + public OaBrokerRelatedDatasource getRelDatasource() { + return relDatasource; + } + + public void setRelDatasource(final OaBrokerRelatedDatasource relDatasource) { + this.relDatasource = relDatasource; + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/AddDatasourceTypeAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasourceAggregator.java similarity index 55% rename from dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/AddDatasourceTypeAggregator.java rename to dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasourceAggregator.java index ccd15c8c6e..2c0c7917d9 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/AddDatasourceTypeAggregator.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/RelatedDatasourceAggregator.java @@ -7,15 +7,16 @@ import org.apache.spark.sql.Encoders; import org.apache.spark.sql.expressions.Aggregator; import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import scala.Tuple2; -public class AddDatasourceTypeAggregator - extends Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> { +public class RelatedDatasourceAggregator + extends Aggregator, OaBrokerMainEntity, OaBrokerMainEntity> { /** * */ - private static final long serialVersionUID = 8788588975496014728L; + private static final long serialVersionUID = -7212121913834713672L; @Override public OaBrokerMainEntity zero() { @@ -29,10 +30,10 @@ public class AddDatasourceTypeAggregator @Override public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, - final Tuple2 t) { + final Tuple2 t) { final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1; - if (t._2 != null && StringUtils.isNotBlank(t._2.getType())) { - res.setCollectedFromType(t._2.getType()); + if (t._2 != null && res.getDatasources().size() < BrokerConstants.MAX_NUMBER_OF_RELS) { + res.getDatasources().add(t._2.getRelDatasource()); } return res; @@ -40,7 +41,15 @@ public class AddDatasourceTypeAggregator @Override public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) { - if (StringUtils.isNotBlank(g1.getOpenaireId()) && StringUtils.isNotBlank(g1.getCollectedFromType())) { + if (StringUtils.isNotBlank(g1.getOpenaireId())) { + final int availables = BrokerConstants.MAX_NUMBER_OF_RELS - g1.getDatasources().size(); + if (availables > 0) { + if (g2.getDatasources().size() <= availables) { + g1.getDatasources().addAll(g2.getDatasources()); + } else { + g1.getDatasources().addAll(g2.getDatasources().subList(0, availables)); + } + } return g1; } else { return g2; @@ -56,4 +65,5 @@ public class AddDatasourceTypeAggregator public Encoder outputEncoder() { return Encoders.bean(OaBrokerMainEntity.class); } + } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/SimpleDatasourceInfo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/SimpleDatasourceInfo.java deleted file mode 100644 index 966f63fa0f..0000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/aggregators/withRels/SimpleDatasourceInfo.java +++ /dev/null @@ -1,40 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels; - -import java.io.Serializable; - -public class SimpleDatasourceInfo implements Serializable { - - /** - * - */ - private static final long serialVersionUID = 2996609859416024734L; - - private String id; - private String type; - - public SimpleDatasourceInfo() { - } - - public SimpleDatasourceInfo(final String id, final String type) { - this.id = id; - this.type = type; - } - - public String getId() { - return id; - } - - public void setId(final String id) { - this.id = id; - } - - public String getType() { - return type; - } - - public void setType(final String type) { - this.type = type; - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml index 2c728cd981..568d5dc5a0 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml @@ -448,6 +448,30 @@ --index${esIndexName} --esHost${esIndexHost} + + + + + + + yarn + cluster + GenerateStatsJob + eu.dnetlib.dhp.broker.oa.GenerateStatsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml index b4155f93ff..2271a9e0e9 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml @@ -64,14 +64,209 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + yarn + cluster + JoinStep0 + eu.dnetlib.dhp.broker.oa.JoinStep0Job + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + JoinStep1 + eu.dnetlib.dhp.broker.oa.JoinStep1Job + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + JoinStep2 + eu.dnetlib.dhp.broker.oa.JoinStep2Job + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + JoinStep3 + eu.dnetlib.dhp.broker.oa.JoinStep3Job + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + JoinStep4 + eu.dnetlib.dhp.broker.oa.JoinStep4Job + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + - + + + yarn + cluster + PrepareGroupsJob + eu.dnetlib.dhp.broker.oa.PrepareGroupsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --graphPath${graphInputPath} + --workingPath${workingPath} + + + + + + + + yarn + cluster + GenerateEventsJob + eu.dnetlib.dhp.broker.oa.GenerateEventsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --workingPath${workingPath} + --datasourceIdWhitelist${datasourceIdWhitelist} + --datasourceTypeWhitelist${datasourceTypeWhitelist} + --datasourceIdBlacklist${datasourceIdBlacklist} + + + + + + + + yarn + cluster + IndexOnESJob + eu.dnetlib.dhp.broker.oa.IndexOnESJob + dhp-broker-events-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.dynamicAllocation.maxExecutors="8" + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --workingPath${workingPath} + --index${esIndexName} + --esHost${esIndexHost} + + + + + + yarn cluster diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java index 82374b335f..8fa95abe5f 100644 --- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java @@ -8,15 +8,23 @@ import java.util.Collection; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource; import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +@ExtendWith(MockitoExtension.class) class UpdateMatcherTest { UpdateMatcher matcher = new EnrichMissingPublicationDate(); + @Mock + private OaBrokerRelatedDatasource targetDs; + @BeforeEach void setUp() throws Exception { } @@ -30,7 +38,7 @@ class UpdateMatcherTest { final OaBrokerMainEntity p4 = new OaBrokerMainEntity(); final Collection> list = matcher - .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null); + .searchUpdatesForRecord(res, targetDs, Arrays.asList(p1, p2, p3, p4), null); assertTrue(list.isEmpty()); } @@ -46,7 +54,7 @@ class UpdateMatcherTest { res.setPublicationdate("2018"); final Collection> list = matcher - .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null); + .searchUpdatesForRecord(res, targetDs, Arrays.asList(p1, p2, p3, p4), null); assertTrue(list.isEmpty()); } @@ -62,7 +70,7 @@ class UpdateMatcherTest { p2.setPublicationdate("2018"); final Collection> list = matcher - .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null); + .searchUpdatesForRecord(res, targetDs, Arrays.asList(p1, p2, p3, p4), null); assertTrue(list.size() == 1); } @@ -79,7 +87,7 @@ class UpdateMatcherTest { p2.setPublicationdate("2018"); final Collection> list = matcher - .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null); + .searchUpdatesForRecord(res, targetDs, Arrays.asList(p1, p2, p3, p4), null); assertTrue(list.isEmpty()); } @@ -98,7 +106,7 @@ class UpdateMatcherTest { p4.setPublicationdate("2018"); final Collection> list = matcher - .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null); + .searchUpdatesForRecord(res, targetDs, Arrays.asList(p1, p2, p3, p4), null); assertTrue(list.isEmpty()); } @@ -117,7 +125,7 @@ class UpdateMatcherTest { p4.setPublicationdate("2018"); final Collection> list = matcher - .searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null); + .searchUpdatesForRecord(res, targetDs, Arrays.asList(p1, p2, p3, p4), null); assertTrue(list.size() == 1); } diff --git a/pom.xml b/pom.xml index 411ef9521c..eec6c303ff 100644 --- a/pom.xml +++ b/pom.xml @@ -624,6 +624,6 @@ 3.3.3 3.4.2 [2.12,3.0) - 3.0.0 + 3.1.0 From 4e6f46e8fa1f8cc419487d5f9423757129cfb3d7 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 15 Jul 2020 11:22:20 +0200 Subject: [PATCH 47/65] filter blocks with one record only --- .../java/eu/dnetlib/dhp/oa/dedup/Deduper.java | 1 + .../dnetlib/dhp/oa/dedup/SparkBlockStats.java | 20 ++++++++++--------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java index 180f9f8460..5e8a50fccb 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java @@ -51,6 +51,7 @@ public class Deduper implements Serializable { .map(it -> Block.from(it, a)) .collect(Collectors.toList()) .iterator()) + .filter(b -> b.getDocuments().size() > 1) .mapToPair(block -> new Tuple2<>(block.getKey(), block)) .reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize)); } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java index d5de309670..8016361dbb 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java @@ -9,6 +9,7 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; @@ -100,16 +101,9 @@ public class SparkBlockStats extends AbstractSparkAction { }); // create blocks for deduplication - JavaPairRDD blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf); - - JavaRDD blockStats = blocks + JavaRDD blockStats = Deduper.createSortedBlocks(mapDocuments, dedupConf) .repartition(numPartitions) - .map( - b -> new BlockStats( - b._1(), - (long) b._2().getDocuments().size(), - computeComparisons( - (long) b._2().getDocuments().size(), (long) dedupConf.getWf().getSlidingWindowSize()))); + .map(b -> asBlockStats(dedupConf, b)); // save the blockstats in the workingdir spark @@ -120,4 +114,12 @@ public class SparkBlockStats extends AbstractSparkAction { } } + private BlockStats asBlockStats(DedupConfig dedupConf, Tuple2 b) { + return new BlockStats( + b._1(), + (long) b._2().getDocuments().size(), + computeComparisons( + (long) b._2().getDocuments().size(), (long) dedupConf.getWf().getSlidingWindowSize())); + } + } From b90389bac4fa6c76e17f8baf7b83f24a5e0e34be Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 15 Jul 2020 11:24:48 +0200 Subject: [PATCH 48/65] code formatting --- .../src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java index 8016361dbb..1e13485e5a 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java @@ -101,7 +101,8 @@ public class SparkBlockStats extends AbstractSparkAction { }); // create blocks for deduplication - JavaRDD blockStats = Deduper.createSortedBlocks(mapDocuments, dedupConf) + JavaRDD blockStats = Deduper + .createSortedBlocks(mapDocuments, dedupConf) .repartition(numPartitions) .map(b -> asBlockStats(dedupConf, b)); From 5033c25587c9fbd6226b1c76c08e0bfa3c428bbb Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 15 Jul 2020 11:26:00 +0200 Subject: [PATCH 49/65] code formatting --- pom.xml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pom.xml b/pom.xml index 411ef9521c..305bacda3c 100644 --- a/pom.xml +++ b/pom.xml @@ -323,13 +323,13 @@ [2.0.0,3.0.0) - - eu.dnetlib.dhp - dnet-openaire-broker-common - ${dnet.openaire.broker.common} - - + eu.dnetlib.dhp + dnet-openaire-broker-common + ${dnet.openaire.broker.common} + + + org.apache.cxf cxf-rt-transports-http 3.1.5 From 805de4eca15822092f99bc6c1e64b0cbda95b668 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 16 Jul 2020 10:11:32 +0200 Subject: [PATCH 50/65] fix: filter the blocks with size = 1 --- .../src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java index 5e8a50fccb..68201677e5 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java @@ -51,8 +51,8 @@ public class Deduper implements Serializable { .map(it -> Block.from(it, a)) .collect(Collectors.toList()) .iterator()) - .filter(b -> b.getDocuments().size() > 1) .mapToPair(block -> new Tuple2<>(block.getKey(), block)) - .reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize)); + .reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize)) + .filter(b -> b._2().getDocuments().size() > 1); } } From b098cc3cbe6f0d3281b47e1cef89eba61e663ed0 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 16 Jul 2020 13:45:53 +0200 Subject: [PATCH 51/65] avoid repeating identical values for fields: source, description --- .../eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index db9a68d3da..53d4c888ea 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -329,7 +329,7 @@ public class XmlRecordFactory implements Serializable { .stream() .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("description", c.getValue())) - .collect(Collectors.toList())); + .collect(Collectors.toCollection(HashSet::new))); } if (r.getEmbargoenddate() != null) { metadata @@ -370,7 +370,7 @@ public class XmlRecordFactory implements Serializable { .stream() .filter(Objects::nonNull) .map(c -> XmlSerializationUtils.asXmlElement("source", c.getValue())) - .collect(Collectors.toList())); + .collect(Collectors.toCollection(HashSet::new))); } if (r.getFormat() != null) { metadata From cc5d13da85c4897f502b879af42fef40c639a7cf Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 16 Jul 2020 13:46:39 +0200 Subject: [PATCH 52/65] introduced parameter shouldIndex (true|false) --- .../eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index 32bf7ce83d..91ced378ca 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -578,10 +578,18 @@ --isLookupUrl${isLookupUrl} --otherDsTypeId${otherDsTypeId} - + + + + ${wf:conf('shouldIndex') eq 'true'} + ${wf:conf('shouldIndex') eq 'false'} + + + + yarn From db8b90a1566546ed8d34cdc7a19b5f3b3b9f28d9 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 16 Jul 2020 19:05:13 +0200 Subject: [PATCH 53/65] renamed CORE -> BETA --- ...{graph_core_construction.xml => graph_prod_construction.xml} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/{graph_core_construction.xml => graph_prod_construction.xml} (99%) diff --git a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_core_construction.xml b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_prod_construction.xml similarity index 99% rename from dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_core_construction.xml rename to dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_prod_construction.xml index 3bce81c50a..0474333203 100644 --- a/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_core_construction.xml +++ b/dhp-workflows/dhp-worfklow-profiles/src/main/resources/eu/dnetlib/dhp/wf/profiles/graph_prod_construction.xml @@ -7,7 +7,7 @@ - Graph Construction [OCEAN] + Graph Construction [PROD] Data Provision 30 From 17816095084a80afab58ffda3b997d08e6c3d571 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 16 Jul 2020 19:06:56 +0200 Subject: [PATCH 54/65] code formatting --- .../oa/graph/merge/MergeGraphSparkJob.java | 184 +++++++++--------- 1 file changed, 94 insertions(+), 90 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java index d33b9e8468..8baa469bfe 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java @@ -1,11 +1,11 @@ + package eu.dnetlib.dhp.oa.graph.merge; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Objects; +import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FilterFunction; @@ -16,13 +16,16 @@ import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; -import java.util.Objects; -import java.util.Optional; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - /** * Combines the content from two aggregator graph tables of the same type, entities (or relationships) with the same ids * are picked preferring those from the BETA aggregator rather then from PROD. The identity of a relationship is defined @@ -30,101 +33,102 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; */ public class MergeGraphSparkJob { - private static final Logger log = LoggerFactory.getLogger(CleanGraphSparkJob.class); + private static final Logger log = LoggerFactory.getLogger(CleanGraphSparkJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils - .toString( - CleanGraphSparkJob.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/merge_graphs_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + String jsonConfiguration = IOUtils + .toString( + CleanGraphSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/merge_graphs_parameters.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String betaInputPath = parser.get("betaInputPath"); - log.info("betaInputPath: {}", betaInputPath); + String betaInputPath = parser.get("betaInputPath"); + log.info("betaInputPath: {}", betaInputPath); - String prodInputPath = parser.get("prodInputPath"); - log.info("prodInputPath: {}", prodInputPath); + String prodInputPath = parser.get("prodInputPath"); + log.info("prodInputPath: {}", prodInputPath); - String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - String graphTableClassName = parser.get("graphTableClassName"); - log.info("graphTableClassName: {}", graphTableClassName); + String graphTableClassName = parser.get("graphTableClassName"); + log.info("graphTableClassName: {}", graphTableClassName); - Class entityClazz = (Class) Class.forName(graphTableClassName); + Class entityClazz = (Class) Class.forName(graphTableClassName); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, outputPath); - mergeGraphTable(spark, betaInputPath, prodInputPath, entityClazz, entityClazz, outputPath); - }); - } + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + mergeGraphTable(spark, betaInputPath, prodInputPath, entityClazz, entityClazz, outputPath); + }); + } - private static

void mergeGraphTable( - SparkSession spark, - String betaInputPath, - String prodInputPath, - Class

p_clazz, - Class b_clazz, - String outputPath) { + private static

void mergeGraphTable( + SparkSession spark, + String betaInputPath, + String prodInputPath, + Class

p_clazz, + Class b_clazz, + String outputPath) { - Dataset> beta = readTableFromPath(spark, betaInputPath, b_clazz); - Dataset> prod = readTableFromPath(spark, prodInputPath, p_clazz); + Dataset> beta = readTableFromPath(spark, betaInputPath, b_clazz); + Dataset> prod = readTableFromPath(spark, prodInputPath, p_clazz); - prod.joinWith(beta, prod.col("_1").equalTo(beta.col("_1")), "full_outer") - .map((MapFunction, Tuple2>, P>) value -> { - Optional

p = Optional.ofNullable(value._1()).map(Tuple2::_2); - Optional b = Optional.ofNullable(value._2()).map(Tuple2::_2); - if (p.isPresent() & !b.isPresent()) { - return p.get(); - } - if (b.isPresent()) { - return (P) b.get(); - } - return null; - }, Encoders.bean(p_clazz)) - .filter((FilterFunction

) Objects::nonNull) - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .json(outputPath); - } + prod + .joinWith(beta, prod.col("_1").equalTo(beta.col("_1")), "full_outer") + .map((MapFunction, Tuple2>, P>) value -> { + Optional

p = Optional.ofNullable(value._1()).map(Tuple2::_2); + Optional b = Optional.ofNullable(value._2()).map(Tuple2::_2); + if (p.isPresent() & !b.isPresent()) { + return p.get(); + } + if (b.isPresent()) { + return (P) b.get(); + } + return null; + }, Encoders.bean(p_clazz)) + .filter((FilterFunction

) Objects::nonNull) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath); + } - private static Dataset> readTableFromPath( - SparkSession spark, String inputEntityPath, Class clazz) { + private static Dataset> readTableFromPath( + SparkSession spark, String inputEntityPath, Class clazz) { - log.info("Reading Graph table from: {}", inputEntityPath); - return spark - .read() - .textFile(inputEntityPath) - .map( - (MapFunction>) value -> { - final T t = OBJECT_MAPPER.readValue(value, clazz); - final String id = ModelSupport.idFn().apply(t); - return new Tuple2<>(id, t); - }, - Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); - } + log.info("Reading Graph table from: {}", inputEntityPath); + return spark + .read() + .textFile(inputEntityPath) + .map( + (MapFunction>) value -> { + final T t = OBJECT_MAPPER.readValue(value, clazz); + final String id = ModelSupport.idFn().apply(t); + return new Tuple2<>(id, t); + }, + Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } } From 442f30930c741c4d0e4741e2946926c695b42e95 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Fri, 17 Jul 2020 12:25:36 +0200 Subject: [PATCH 55/65] removed duplicated fields --- .../dhp/oa/graph/raw/common/OafMapperUtils.java | 12 ++++++++++++ .../eu/dnetlib/dhp/oa/graph/raw/MappersTest.java | 2 ++ .../eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml | 2 ++ 3 files changed, 16 insertions(+) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java index 8ede407731..63db13b8fb 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java @@ -4,7 +4,11 @@ package eu.dnetlib.dhp.oa.graph.raw.common; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Map; import java.util.Objects; +import java.util.concurrent.ConcurrentHashMap; +import java.util.function.Function; +import java.util.function.Predicate; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; @@ -57,6 +61,7 @@ public class OafMapperUtils { .stream(values) .map(v -> field(v, info)) .filter(Objects::nonNull) + .filter(distinctByKey(f -> f.getValue())) .collect(Collectors.toList()); } @@ -65,6 +70,7 @@ public class OafMapperUtils { .stream() .map(v -> field(v, info)) .filter(Objects::nonNull) + .filter(distinctByKey(f -> f.getValue())) .collect(Collectors.toList()); } @@ -237,4 +243,10 @@ public class OafMapperUtils { public static String asString(final Object o) { return o == null ? "" : o.toString(); } + + public static Predicate distinctByKey( + final Function keyExtractor) { + final Map seen = new ConcurrentHashMap<>(); + return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null; + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index b1f0ecf0dd..ffd21c86ec 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -71,6 +71,7 @@ public class MappersTest { assertValidId(p.getCollectedfrom().get(0).getKey()); assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); assertFalse(p.getDataInfo().getInvisible()); + assertTrue(p.getSource().size() == 1); assertTrue(p.getAuthor().size() > 0); final Optional author = p @@ -79,6 +80,7 @@ public class MappersTest { .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) .findFirst(); assertTrue(author.isPresent()); + final StructuredProperty pid = author .get() .getPid() diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml index ead22aa965..be78bf35c1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml @@ -34,6 +34,8 @@ info:eu-repo/semantics/altIdentifier/eissn/2367-8194 info:eu-repo/grantAgreement/EC/FP7/226852 One Ecosystem 2: e13718 + One Ecosystem 2: e13718 + One Ecosystem 2: e13718 Ecosystem Services hotspots Natura 2000 Quiet Protected Areas From 47c7122773cde57c6e81bd3fbdc679fa9e768a1a Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 17 Jul 2020 12:56:35 +0200 Subject: [PATCH 56/65] changed priority from beta to production --- .../eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java index 8baa469bfe..67068e0725 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java @@ -96,11 +96,11 @@ public class MergeGraphSparkJob { .map((MapFunction, Tuple2>, P>) value -> { Optional

p = Optional.ofNullable(value._1()).map(Tuple2::_2); Optional b = Optional.ofNullable(value._2()).map(Tuple2::_2); - if (p.isPresent() & !b.isPresent()) { - return p.get(); + if (b.isPresent() & !p.isPresent()) { + return (P)b.get(); } - if (b.isPresent()) { - return (P) b.get(); + if (p.isPresent()) { + return p.get(); } return null; }, Encoders.bean(p_clazz)) From d7d84c82171ced5529cdbdc219f036b3670a9921 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 17 Jul 2020 14:03:23 +0200 Subject: [PATCH 57/65] - --- .../dnetlib/dhp/bulktag/community/ResultTagger.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java index d58704b753..120ad8edc3 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java @@ -72,8 +72,9 @@ public class ResultTagger implements Serializable { // tagging for Subject final Set subjects = new HashSet<>(); - if (Objects.nonNull(result.getSubject())){ - result.getSubject() + if (Objects.nonNull(result.getSubject())) { + result + .getSubject() .stream() .map(subject -> subject.getValue()) .filter(StringUtils::isNotBlank) @@ -91,13 +92,13 @@ public class ResultTagger implements Serializable { if (Objects.nonNull(result.getInstance())) { for (Instance i : result.getInstance()) { - if(Objects.nonNull(i.getCollectedfrom())){ - if(Objects.nonNull(i.getCollectedfrom().getKey())){ + if (Objects.nonNull(i.getCollectedfrom())) { + if (Objects.nonNull(i.getCollectedfrom().getKey())) { tmp.add(StringUtils.substringAfter(i.getCollectedfrom().getKey(), "|")); } } - if(Objects.nonNull(i.getHostedby())){ - if(Objects.nonNull(i.getHostedby().getKey())){ + if (Objects.nonNull(i.getHostedby())) { + if (Objects.nonNull(i.getHostedby().getKey())) { tmp.add(StringUtils.substringAfter(i.getHostedby().getKey(), "|")); } } From 346a1d2b5a4092f03e7dda9578e7fc79ca75544c Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Sat, 18 Jul 2020 09:40:36 +0200 Subject: [PATCH 58/65] update eventId generator --- .../dhp/broker/model/EventFactory.java | 15 +- .../broker/oa/partial/oozie_app/workflow.xml | 147 +----------------- 2 files changed, 11 insertions(+), 151 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java index 4a58cfd36d..0cb0d78018 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java @@ -34,7 +34,10 @@ public class EventFactory { final MappedFields map = createMapFromResult(updateInfo); final String eventId = calculateEventId( - updateInfo.getTopicPath(), updateInfo.getTarget().getOpenaireId(), updateInfo.getHighlightValueAsString()); + updateInfo.getTopicPath(), updateInfo.getTargetDs().getOpenaireId(), updateInfo + .getTarget() + .getOpenaireId(), + updateInfo.getHighlightValueAsString()); res.setEventId(eventId); res.setProducerId(PRODUCER_ID); @@ -93,11 +96,13 @@ public class EventFactory { return map; } - private static String calculateEventId(final String topic, final String publicationId, final String value) { + private static String calculateEventId(final String topic, final String dsId, final String publicationId, + final String value) { return "event-" - + DigestUtils.md5Hex(topic).substring(0, 6) + "-" - + DigestUtils.md5Hex(publicationId).substring(0, 8) + "-" - + DigestUtils.md5Hex(value).substring(0, 8); + + DigestUtils.md5Hex(topic).substring(0, 4) + "-" + + DigestUtils.md5Hex(dsId).substring(0, 4) + "-" + + DigestUtils.md5Hex(publicationId).substring(0, 7) + "-" + + DigestUtils.md5Hex(value).substring(0, 5); } private static long calculateExpiryDate(final long now) { diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml index 2271a9e0e9..80505ca1ea 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/partial/oozie_app/workflow.xml @@ -64,157 +64,12 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - yarn - cluster - JoinStep0 - eu.dnetlib.dhp.broker.oa.JoinStep0Job - dhp-broker-events-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - - --graphPath${graphInputPath} - --workingPath${workingPath} - - - - - - - - yarn - cluster - JoinStep1 - eu.dnetlib.dhp.broker.oa.JoinStep1Job - dhp-broker-events-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - - --graphPath${graphInputPath} - --workingPath${workingPath} - - - - - - - - yarn - cluster - JoinStep2 - eu.dnetlib.dhp.broker.oa.JoinStep2Job - dhp-broker-events-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - - --graphPath${graphInputPath} - --workingPath${workingPath} - - - - - - - - yarn - cluster - JoinStep3 - eu.dnetlib.dhp.broker.oa.JoinStep3Job - dhp-broker-events-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - - --graphPath${graphInputPath} - --workingPath${workingPath} - - - - - - - - yarn - cluster - JoinStep4 - eu.dnetlib.dhp.broker.oa.JoinStep4Job - dhp-broker-events-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - - --graphPath${graphInputPath} - --workingPath${workingPath} - - - - - - - - yarn - cluster - PrepareGroupsJob - eu.dnetlib.dhp.broker.oa.PrepareGroupsJob - dhp-broker-events-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - - --graphPath${graphInputPath} - --workingPath${workingPath} - - - - - yarn From 331a3cbdd04aa5c61088bc64ff78e387e1710f37 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Mon, 20 Jul 2020 09:50:29 +0200 Subject: [PATCH 59/65] fixed originalId --- .../graph/raw/AbstractMdRecordToOafMapper.java | 16 +++++++++++++++- .../eu/dnetlib/dhp/oa/graph/raw/MappersTest.java | 6 ++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index c43ee29fe0..3e042834f0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -266,7 +266,9 @@ public abstract class AbstractMdRecordToOafMapper { r.setDataInfo(info); r.setLastupdatetimestamp(lastUpdateTimestamp); r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); - r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); + + r.setOriginalId(Arrays.asList(findOriginalId(doc))); + r.setCollectedfrom(Arrays.asList(collectedFrom)); r.setPid(prepareResultPids(doc, info)); r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); @@ -429,6 +431,18 @@ public abstract class AbstractMdRecordToOafMapper { return null; } + private String findOriginalId(final Document doc) { + final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); + if (n != null) { + final String id = n.valueOf("./*[local-name()='identifier']"); + if (StringUtils.isNotBlank(id)) { + return id; + } + } + return doc.valueOf("//*[local-name()='header']/*[local-name()='identifier']"); + + } + protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId) { return prepareQualifier(node.valueOf(xpath).trim(), schemeId); } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index b1f0ecf0dd..542fd00f37 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -68,6 +68,10 @@ public class MappersTest { final Relation r2 = (Relation) list.get(2); assertValidId(p.getId()); + + assertTrue(p.getOriginalId().size() == 1); + assertEquals("10.3897/oneeco.2.e13718", p.getOriginalId().get(0)); + assertValidId(p.getCollectedfrom().get(0).getKey()); assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); assertFalse(p.getDataInfo().getInvisible()); @@ -169,6 +173,8 @@ public class MappersTest { final Relation r2 = (Relation) list.get(2); assertValidId(d.getId()); + assertTrue(d.getOriginalId().size() == 1); + assertEquals("oai:zenodo.org:3234526", d.getOriginalId().get(0)); assertValidId(d.getCollectedfrom().get(0).getKey()); assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); assertTrue(d.getAuthor().size() > 0); From de72b1c859b777458470501aae045f6db419dcd5 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 20 Jul 2020 09:59:11 +0200 Subject: [PATCH 60/65] cleanup --- .../java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java index 03e6674e4d..ae5bf9252f 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java @@ -12,8 +12,6 @@ import org.apache.spark.sql.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.common.base.Joiner; - import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.DataInfo; From 105176105c14095d782b951b27a8c84f277cd1ae Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 20 Jul 2020 09:59:47 +0200 Subject: [PATCH 61/65] updated dnet-pace-core dependency to version 4.0.4 to include the latest clustering function --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 5ece9aaf68..cec3dd75a0 100644 --- a/pom.xml +++ b/pom.xml @@ -315,7 +315,7 @@ eu.dnetlib dnet-pace-core - 4.0.2 + 4.0.4 eu.dnetlib From e0c4cf6f7bff8894eb12e1d72e99f57d8aafc17a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 20 Jul 2020 10:48:01 +0200 Subject: [PATCH 62/65] added parameter to drive the graph merge strategy: priority (BETA|PROD) --- .../oa/graph/merge/MergeGraphSparkJob.java | 42 +++++++++++++++---- .../dhp/oa/graph/merge/oozie_app/workflow.xml | 13 +++++- .../dhp/oa/graph/merge_graphs_parameters.json | 6 +++ 3 files changed, 53 insertions(+), 8 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java index 67068e0725..b723de9554 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java @@ -37,6 +37,8 @@ public class MergeGraphSparkJob { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final String PRIORITY_DEFAULT = "BETA"; // BETA | PROD + public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils @@ -47,6 +49,11 @@ public class MergeGraphSparkJob { final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); + String priority = Optional + .ofNullable(parser.get("priority")) + .orElse(PRIORITY_DEFAULT); + log.info("priority: {}", priority); + Boolean isSparkSessionManaged = Optional .ofNullable(parser.get("isSparkSessionManaged")) .map(Boolean::valueOf) @@ -76,12 +83,13 @@ public class MergeGraphSparkJob { isSparkSessionManaged, spark -> { removeOutputDir(spark, outputPath); - mergeGraphTable(spark, betaInputPath, prodInputPath, entityClazz, entityClazz, outputPath); + mergeGraphTable(spark, priority, betaInputPath, prodInputPath, entityClazz, entityClazz, outputPath); }); } private static

void mergeGraphTable( SparkSession spark, + String priority, String betaInputPath, String prodInputPath, Class

p_clazz, @@ -96,13 +104,13 @@ public class MergeGraphSparkJob { .map((MapFunction, Tuple2>, P>) value -> { Optional

p = Optional.ofNullable(value._1()).map(Tuple2::_2); Optional b = Optional.ofNullable(value._2()).map(Tuple2::_2); - if (b.isPresent() & !p.isPresent()) { - return (P)b.get(); + switch (priority) { + default: + case "BETA": + return mergeWithPriorityToBETA(p, b); + case "PROD": + return mergeWithPriorityToPROD(p, b); } - if (p.isPresent()) { - return p.get(); - } - return null; }, Encoders.bean(p_clazz)) .filter((FilterFunction

) Objects::nonNull) .write() @@ -111,6 +119,26 @@ public class MergeGraphSparkJob { .json(outputPath); } + private static

P mergeWithPriorityToPROD(Optional

p, Optional b) { + if (b.isPresent() & !p.isPresent()) { + return (P) b.get(); + } + if (p.isPresent()) { + return p.get(); + } + return null; + } + + private static

P mergeWithPriorityToBETA(Optional

p, Optional b) { + if (p.isPresent() & !b.isPresent()) { + return p.get(); + } + if (b.isPresent()) { + return (P) b.get(); + } + return null; + } + private static Dataset> readTableFromPath( SparkSession spark, String inputEntityPath, Class clazz) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml index 0a512fb6ab..07a125fb64 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml @@ -13,6 +13,10 @@ graphOutputPath the output merged graph root path + + priority + decides from which infrastructure the content must win in case of ID clash + sparkDriverMemory @@ -88,6 +92,7 @@ --prodInputPath${prodInputGgraphPath}/publication --outputPath${graphOutputPath}/publication --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication + --priority${priority} @@ -114,6 +119,7 @@ --prodInputPath${prodInputGgraphPath}/dataset --outputPath${graphOutputPath}/dataset --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset + --priority${priority} @@ -140,6 +146,7 @@ --prodInputPath${prodInputGgraphPath}/otherresearchproduct --outputPath${graphOutputPath}/otherresearchproduct --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --priority${priority} @@ -166,6 +173,7 @@ --prodInputPath${prodInputGgraphPath}/software --outputPath${graphOutputPath}/software --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software + --priority${priority} @@ -192,6 +200,7 @@ --prodInputPath${prodInputGgraphPath}/datasource --outputPath${graphOutputPath}/datasource --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Datasource + --priority${priority} @@ -218,6 +227,7 @@ --prodInputPath${prodInputGgraphPath}/organization --outputPath${graphOutputPath}/organization --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Organization + --priority${priority} @@ -244,6 +254,7 @@ --prodInputPath${prodInputGgraphPath}/project --outputPath${graphOutputPath}/project --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Project + --priority${priority} @@ -266,11 +277,11 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=7680 - --betaInputPath${betaInputGgraphPath}/relation --prodInputPath${prodInputGgraphPath}/relation --outputPath${graphOutputPath}/relation --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Relation + --priority${priority} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge_graphs_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge_graphs_parameters.json index 6018b7e93c..1a612807b7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge_graphs_parameters.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge_graphs_parameters.json @@ -28,5 +28,11 @@ "paramLongName": "graphTableClassName", "paramDescription": "class name moelling the graph table", "paramRequired": true + }, + { + "paramName": "pr", + "paramLongName": "priority", + "paramDescription": "decides from which infrastructure the content must win in case of ID clash", + "paramRequired": false } ] \ No newline at end of file From 124e7ce19c57005127c9f5b4bbe63b7b5eee1a90 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 20 Jul 2020 17:33:37 +0200 Subject: [PATCH 63/65] in case of missing attribute //dr:CobjCategory/@type the resulttype is derived by looking up the vocabulary dnet:result_typologies with the 1st instance type available --- .../raw/AbstractMdRecordToOafMapper.java | 41 +++++++++++++++---- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 30 ++++++++++---- .../dnetlib/dhp/oa/graph/raw/oaf_record.xml | 3 +- .../dnetlib/dhp/oa/graph/raw/odf_dataset.xml | 3 +- .../dnetlib/dhp/oa/graph/raw/odf_software.xml | 3 +- 5 files changed, 60 insertions(+), 20 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 3e042834f0..150cf2ceab 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -32,7 +32,11 @@ import java.util.List; import java.util.Map; import java.util.Optional; +import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyTerm; +import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.lang3.StringUtils; +import org.apache.spark.sql.sources.In; +import org.apache.zookeeper.Op; import org.dom4j.Document; import org.dom4j.DocumentFactory; import org.dom4j.DocumentHelper; @@ -99,7 +103,6 @@ public abstract class AbstractMdRecordToOafMapper { final Document doc = DocumentHelper .parseText(xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); - final String type = doc.valueOf("//dr:CobjCategory/@type"); final KeyValue collectedFrom = getProvenanceDatasource( doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); @@ -118,12 +121,32 @@ public abstract class AbstractMdRecordToOafMapper { final DataInfo info = prepareDataInfo(doc, invisible); final long lastUpdateTimestamp = new Date().getTime(); - return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp); + final List instances = prepareInstances(doc, info, collectedFrom, hostedBy); + + final String type = getResultType(doc, instances); + + return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp); } catch (final Exception e) { throw new RuntimeException(e); } } + protected String getResultType(final Document doc, final List instances) { + String type = doc.valueOf("//dr:CobjCategory/@type"); + + if (StringUtils.isBlank(type) & vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) { + String instanceType = instances + .stream() + .map(i -> i.getInstancetype().getClassid()) + .findFirst() + .orElse("0000"); // Unknown + Qualifier resultType = vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType); + return resultType.getClassid(); + } + + return type; + } + private KeyValue getProvenanceDatasource(final Document doc, final String xpathId, final String xpathName) { final String dsId = doc.valueOf(xpathId); final String dsName = doc.valueOf(xpathName); @@ -138,8 +161,8 @@ public abstract class AbstractMdRecordToOafMapper { protected List createOafs( final Document doc, final String type, + final List instances, final KeyValue collectedFrom, - final KeyValue hostedBy, final DataInfo info, final long lastUpdateTimestamp) { @@ -148,14 +171,14 @@ public abstract class AbstractMdRecordToOafMapper { switch (type.toLowerCase()) { case "publication": final Publication p = new Publication(); - populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + populateResultFields(p, doc, instances, collectedFrom, info, lastUpdateTimestamp); p.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE); p.setJournal(prepareJournal(doc, info)); oafs.add(p); break; case "dataset": final Dataset d = new Dataset(); - populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + populateResultFields(d, doc, instances, collectedFrom, info, lastUpdateTimestamp); d.setResulttype(DATASET_DEFAULT_RESULTTYPE); d.setStoragedate(prepareDatasetStorageDate(doc, info)); d.setDevice(prepareDatasetDevice(doc, info)); @@ -168,7 +191,7 @@ public abstract class AbstractMdRecordToOafMapper { break; case "software": final Software s = new Software(); - populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + populateResultFields(s, doc, instances, collectedFrom, info, lastUpdateTimestamp); s.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE); s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); s.setLicense(prepareSoftwareLicenses(doc, info)); @@ -180,7 +203,7 @@ public abstract class AbstractMdRecordToOafMapper { case "otherresearchproducts": default: final OtherResearchProduct o = new OtherResearchProduct(); - populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + populateResultFields(o, doc, instances, collectedFrom, info, lastUpdateTimestamp); o.setResulttype(ORP_DEFAULT_RESULTTYPE); o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); @@ -259,8 +282,8 @@ public abstract class AbstractMdRecordToOafMapper { private void populateResultFields( final Result r, final Document doc, + final List instances, final KeyValue collectedFrom, - final KeyValue hostedBy, final DataInfo info, final long lastUpdateTimestamp) { r.setDataInfo(info); @@ -293,7 +316,7 @@ public abstract class AbstractMdRecordToOafMapper { r.setCoverage(prepareCoverages(doc, info)); r.setContext(prepareContexts(doc, info)); r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES - final List instances = prepareInstances(doc, info, collectedFrom, hostedBy); + r.setInstance(instances); r.setBestaccessright(getBestAccessRights(instances)); } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 204458562a..b37590d500 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -6,12 +6,15 @@ import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.Mockito.lenient; import static org.mockito.Mockito.when; import java.io.IOException; import java.util.List; import java.util.Optional; +import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.BeforeEach; @@ -35,20 +38,20 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty; @ExtendWith(MockitoExtension.class) public class MappersTest { + @Mock + private ISLookUpService isLookUpService; + @Mock private VocabularyGroup vocs; @BeforeEach public void setUp() throws Exception { - when(vocs.getTermAsQualifier(anyString(), anyString())) - .thenAnswer( - invocation -> OafMapperUtils - .qualifier( - invocation.getArgument(1), invocation.getArgument(1), invocation.getArgument(0), - invocation.getArgument(0))); - - when(vocs.termExists(anyString(), anyString())).thenReturn(true); + lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs()); + lenient() + .when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY)) + .thenReturn(synonyms()); + vocs = VocabularyGroup.loadVocsFromIS(isLookUpService); } @Test @@ -269,4 +272,15 @@ public class MappersTest { assertEquals(':', id.charAt(15)); assertEquals(':', id.charAt(16)); } + + private List vocs() throws IOException { + return IOUtils + .readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt")); + } + + private List synonyms() throws IOException { + return IOUtils + .readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt")); + } + } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml index be78bf35c1..3b2658bcf3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml @@ -49,7 +49,8 @@ regulating services supporting services Research Article - 0001 + + 0001 2017-01-01 corda_______::226852 OPEN diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml index 5525a27533..6648d18047 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml @@ -82,7 +82,8 @@

All files are in MATLAB .mat format.

- 0021 + + 0021 2019-01-01 OPEN und diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_software.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_software.xml index fd3fdd4733..6a9170ce17 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_software.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_software.xml @@ -52,7 +52,8 @@ subjectScheme="EDAM Ontology" valueURI="http://edamontology.org/topic_3534">Protein binding sites - 0029 + + 0029 2018-06-06 From 54ac5839237425723231a566220b8c2347f8595b Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 20 Jul 2020 17:37:08 +0200 Subject: [PATCH 64/65] code formatting --- .../oa/graph/raw/AbstractMdRecordToOafMapper.java | 12 ++++++------ .../eu/dnetlib/dhp/oa/graph/raw/MappersTest.java | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 150cf2ceab..e9450c5ca0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -32,8 +32,6 @@ import java.util.List; import java.util.Map; import java.util.Optional; -import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyTerm; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.lang3.StringUtils; import org.apache.spark.sql.sources.In; import org.apache.zookeeper.Op; @@ -43,7 +41,9 @@ import org.dom4j.DocumentHelper; import org.dom4j.Node; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyTerm; import eu.dnetlib.dhp.schema.common.LicenseComparator; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Context; import eu.dnetlib.dhp.schema.oaf.DataInfo; @@ -136,10 +136,10 @@ public abstract class AbstractMdRecordToOafMapper { if (StringUtils.isBlank(type) & vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) { String instanceType = instances - .stream() - .map(i -> i.getInstancetype().getClassid()) - .findFirst() - .orElse("0000"); // Unknown + .stream() + .map(i -> i.getInstancetype().getClassid()) + .findFirst() + .orElse("0000"); // Unknown Qualifier resultType = vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType); return resultType.getClassid(); } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index b37590d500..b157f7ec6d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -13,8 +13,6 @@ import java.io.IOException; import java.util.List; import java.util.Optional; -import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.BeforeEach; @@ -23,6 +21,7 @@ import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; +import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest; import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; @@ -34,6 +33,7 @@ import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @ExtendWith(MockitoExtension.class) public class MappersTest { @@ -48,8 +48,8 @@ public class MappersTest { public void setUp() throws Exception { lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs()); lenient() - .when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY)) - .thenReturn(synonyms()); + .when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY)) + .thenReturn(synonyms()); vocs = VocabularyGroup.loadVocsFromIS(isLookUpService); } @@ -275,12 +275,12 @@ public class MappersTest { private List vocs() throws IOException { return IOUtils - .readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt")); + .readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt")); } private List synonyms() throws IOException { return IOUtils - .readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt")); + .readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt")); } } From 32f5e466e3995468833ff4c9e028c660cc7b6bae Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 20 Jul 2020 17:42:58 +0200 Subject: [PATCH 65/65] imports cleanup --- .../dhp/schema/common/ModelConstants.java | 2 - .../raw/AbstractMdRecordToOafMapper.java | 54 ++----------------- 2 files changed, 4 insertions(+), 52 deletions(-) diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java index c5905e45b3..bf48605d26 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java @@ -1,8 +1,6 @@ package eu.dnetlib.dhp.schema.common; -import java.security.Key; - import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Qualifier; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index e9450c5ca0..94da4c1529 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -1,67 +1,21 @@ package eu.dnetlib.dhp.oa.graph.raw; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.dataInfo; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.journal; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.keyValue; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listFields; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.oaiIProvenance; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; -import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE; -import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES; -import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES; -import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY; -import static eu.dnetlib.dhp.schema.common.ModelConstants.NOT_AVAILABLE; -import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE; -import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME; -import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES; -import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE; -import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS; -import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT; -import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE; -import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN; +import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*; +import static eu.dnetlib.dhp.schema.common.ModelConstants.*; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Date; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; +import java.util.*; import org.apache.commons.lang3.StringUtils; -import org.apache.spark.sql.sources.In; -import org.apache.zookeeper.Op; import org.dom4j.Document; import org.dom4j.DocumentFactory; import org.dom4j.DocumentHelper; import org.dom4j.Node; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; -import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyTerm; import eu.dnetlib.dhp.schema.common.LicenseComparator; import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.Author; -import eu.dnetlib.dhp.schema.oaf.Context; -import eu.dnetlib.dhp.schema.oaf.DataInfo; -import eu.dnetlib.dhp.schema.oaf.Dataset; -import eu.dnetlib.dhp.schema.oaf.Field; -import eu.dnetlib.dhp.schema.oaf.GeoLocation; -import eu.dnetlib.dhp.schema.oaf.Instance; -import eu.dnetlib.dhp.schema.oaf.Journal; -import eu.dnetlib.dhp.schema.oaf.KeyValue; -import eu.dnetlib.dhp.schema.oaf.OAIProvenance; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.Result; -import eu.dnetlib.dhp.schema.oaf.Software; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.schema.oaf.*; public abstract class AbstractMdRecordToOafMapper {