From 711048ceedc99383c291bc532373e09294fe0815 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Tue, 7 May 2024 15:44:33 +0200 Subject: [PATCH 1/5] PrepareRelationsJob rewritten to use Spark Dataframe API and Windowing functions --- .../dhp/oa/provision/PrepareRelationsJob.java | 190 ++++-------------- 1 file changed, 38 insertions(+), 152 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index fdf397ad7..c2eb8c408 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -1,43 +1,31 @@ package eu.dnetlib.dhp.oa.provision; -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - -import java.util.HashSet; -import java.util.Optional; -import java.util.PriorityQueue; -import java.util.Set; -import java.util.stream.Collectors; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.FilterFunction; -import org.apache.spark.api.java.function.FlatMapFunction; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Encoder; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.expressions.Aggregator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Joiner; import com.google.common.base.Splitter; -import com.google.common.collect.Iterables; import com.google.common.collect.Sets; - import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport; -import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey; -import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner; import eu.dnetlib.dhp.schema.oaf.Relation; -import scala.Tuple2; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.expressions.Window; +import org.apache.spark.sql.expressions.WindowSpec; +import org.apache.spark.sql.functions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashSet; +import java.util.Optional; +import java.util.Set; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static org.apache.spark.sql.functions.col; /** * PrepareRelationsJob prunes the relationships: only consider relationships that are not virtually deleted @@ -130,130 +118,28 @@ public class PrepareRelationsJob { private static void prepareRelationsRDD(SparkSession spark, String inputRelationsPath, String outputPath, Set relationFilter, int sourceMaxRelations, int targetMaxRelations, int relPartitions) { - JavaRDD rels = readPathRelationRDD(spark, inputRelationsPath) - .filter(rel -> !(rel.getSource().startsWith("unresolved") || rel.getTarget().startsWith("unresolved"))) - 
.filter(rel -> !rel.getDataInfo().getDeletedbyinference()) - .filter(rel -> !relationFilter.contains(StringUtils.lowerCase(rel.getRelClass()))); + WindowSpec source_w = Window + .partitionBy("source", "subRelType") + .orderBy(col("target").desc_nulls_last()); - JavaRDD pruned = pruneRels( - pruneRels( - rels, - sourceMaxRelations, relPartitions, (Function) Relation::getSource), - targetMaxRelations, relPartitions, (Function) Relation::getTarget); - spark - .createDataset(pruned.rdd(), Encoders.bean(Relation.class)) - .repartition(relPartitions) - .write() - .mode(SaveMode.Overwrite) - .parquet(outputPath); - } + WindowSpec target_w = Window + .partitionBy("target", "subRelType") + .orderBy(col("source").desc_nulls_last()); - private static JavaRDD pruneRels(JavaRDD rels, int maxRelations, - int relPartitions, Function idFn) { - return rels - .mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, idFn.call(r)), r)) - .repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions)) - .groupBy(Tuple2::_1) - .map(Tuple2::_2) - .map(t -> Iterables.limit(t, maxRelations)) - .flatMap(Iterable::iterator) - .map(Tuple2::_2); - } - - // experimental - private static void prepareRelationsDataset( - SparkSession spark, String inputRelationsPath, String outputPath, Set relationFilter, int maxRelations, - int relPartitions) { - spark - .read() - .textFile(inputRelationsPath) - .repartition(relPartitions) - .map( - (MapFunction) s -> OBJECT_MAPPER.readValue(s, Relation.class), - Encoders.kryo(Relation.class)) - .filter((FilterFunction) rel -> !rel.getDataInfo().getDeletedbyinference()) - .filter((FilterFunction) rel -> !relationFilter.contains(rel.getRelClass())) - .groupByKey( - (MapFunction) Relation::getSource, - Encoders.STRING()) - .agg(new RelationAggregator(maxRelations).toColumn()) - .flatMap( - (FlatMapFunction, Relation>) t -> Iterables - .limit(t._2().getRelations(), maxRelations) - .iterator(), - Encoders.bean(Relation.class)) - .repartition(relPartitions) - .write() - .mode(SaveMode.Overwrite) - .parquet(outputPath); - } - - public static class RelationAggregator - extends Aggregator { - - private final int maxRelations; - - public RelationAggregator(int maxRelations) { - this.maxRelations = maxRelations; - } - - @Override - public RelationList zero() { - return new RelationList(); - } - - @Override - public RelationList reduce(RelationList b, Relation a) { - b.getRelations().add(a); - return getSortableRelationList(b); - } - - @Override - public RelationList merge(RelationList b1, RelationList b2) { - b1.getRelations().addAll(b2.getRelations()); - return getSortableRelationList(b1); - } - - @Override - public RelationList finish(RelationList r) { - return getSortableRelationList(r); - } - - private RelationList getSortableRelationList(RelationList b1) { - RelationList sr = new RelationList(); - sr - .setRelations( - b1 - .getRelations() - .stream() - .limit(maxRelations) - .collect(Collectors.toCollection(() -> new PriorityQueue<>(new RelationComparator())))); - return sr; - } - - @Override - public Encoder bufferEncoder() { - return Encoders.kryo(RelationList.class); - } - - @Override - public Encoder outputEncoder() { - return Encoders.kryo(RelationList.class); - } - } - - /** - * Reads a JavaRDD of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text - * file, - * - * @param spark - * @param inputPath - * @return the JavaRDD containing all the relationships - */ - private static JavaRDD readPathRelationRDD( - SparkSession spark, 
final String inputPath) { - JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, Relation.class)); + spark.read().schema(Encoders.bean(Relation.class).schema()).json(inputRelationsPath) + .where("source NOT LIKE 'unresolved%' AND target NOT LIKE 'unresolved%'") + .where("datainfo.deletedbyinference != true") + .where(relationFilter.isEmpty() ? "" : "lower(relClass) NOT IN ("+ Joiner.on(',').join(relationFilter) +")") + .withColumn("source_w_pos", functions.row_number().over(source_w)) + .where("source_w_pos < " + sourceMaxRelations ) + .drop("source_w_pos") + .withColumn("target_w_pos", functions.row_number().over(target_w)) + .where("target_w_pos < " + targetMaxRelations) + .drop( "target_w_pos") + .coalesce(relPartitions) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath); } private static void removeOutputDir(SparkSession spark, String path) { From b4e33894322d1693460be2cfcf0afb23d3b9135f Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 7 May 2024 16:25:17 +0200 Subject: [PATCH 2/5] fixed property mapping creating the RelatedEntity transient objects. spark cores & memory adjustments. Code formatting --- .../CreateRelatedEntitiesJob_phase1.java | 9 ++- .../dhp/oa/provision/PrepareRelationsJob.java | 72 +++++++++++-------- .../dhp/oa/provision/oozie_app/workflow.xml | 10 +-- 3 files changed, 54 insertions(+), 37 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java index da80deee0..63f3c2ead 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java @@ -153,10 +153,15 @@ public class CreateRelatedEntitiesJob_phase1 { result .getTitle() .stream() + .filter(t -> StringUtils.isNotBlank(t.getValue())) .findFirst() - .map(StructuredProperty::getValue) .ifPresent( - title -> re.getTitle().setValue(StringUtils.left(title, ModelHardLimits.MAX_TITLE_LENGTH))); + title -> { + re.setTitle(title); + re + .getTitle() + .setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH)); + }); } if (Objects.nonNull(result.getDescription()) && !result.getDescription().isEmpty()) { result diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index c2eb8c408..f50c7774b 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -1,14 +1,15 @@ package eu.dnetlib.dhp.oa.provision; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.base.Joiner; -import com.google.common.base.Splitter; -import com.google.common.collect.Sets; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport; -import eu.dnetlib.dhp.schema.oaf.Relation; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static 
org.apache.spark.sql.functions.col; + +import java.util.HashSet; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.sql.Encoders; @@ -20,12 +21,15 @@ import org.apache.spark.sql.functions; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.HashSet; -import java.util.Optional; -import java.util.Set; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.Sets; -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import static org.apache.spark.sql.functions.col; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport; +import eu.dnetlib.dhp.schema.oaf.Relation; /** * PrepareRelationsJob prunes the relationships: only consider relationships that are not virtually deleted @@ -119,27 +123,33 @@ public class PrepareRelationsJob { Set relationFilter, int sourceMaxRelations, int targetMaxRelations, int relPartitions) { WindowSpec source_w = Window - .partitionBy("source", "subRelType") - .orderBy(col("target").desc_nulls_last()); + .partitionBy("source", "subRelType") + .orderBy(col("target").desc_nulls_last()); WindowSpec target_w = Window - .partitionBy("target", "subRelType") - .orderBy(col("source").desc_nulls_last()); + .partitionBy("target", "subRelType") + .orderBy(col("source").desc_nulls_last()); - spark.read().schema(Encoders.bean(Relation.class).schema()).json(inputRelationsPath) - .where("source NOT LIKE 'unresolved%' AND target NOT LIKE 'unresolved%'") - .where("datainfo.deletedbyinference != true") - .where(relationFilter.isEmpty() ? "" : "lower(relClass) NOT IN ("+ Joiner.on(',').join(relationFilter) +")") - .withColumn("source_w_pos", functions.row_number().over(source_w)) - .where("source_w_pos < " + sourceMaxRelations ) - .drop("source_w_pos") - .withColumn("target_w_pos", functions.row_number().over(target_w)) - .where("target_w_pos < " + targetMaxRelations) - .drop( "target_w_pos") - .coalesce(relPartitions) - .write() - .mode(SaveMode.Overwrite) - .parquet(outputPath); + spark + .read() + .schema(Encoders.bean(Relation.class).schema()) + .json(inputRelationsPath) + .where("source NOT LIKE 'unresolved%' AND target NOT LIKE 'unresolved%'") + .where("datainfo.deletedbyinference != true") + .where( + relationFilter.isEmpty() ? 
"" + : "lower(relClass) NOT IN (" + + relationFilter.stream().map(s -> "'" + s + "'").collect(Collectors.joining(",")) + ")") + .withColumn("source_w_pos", functions.row_number().over(source_w)) + .where("source_w_pos < " + sourceMaxRelations) + .drop("source_w_pos") + .withColumn("target_w_pos", functions.row_number().over(target_w)) + .where("target_w_pos < " + targetMaxRelations) + .drop("target_w_pos") + .coalesce(relPartitions) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath); } private static void removeOutputDir(SparkSession spark, String path) { diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index eb446ddd8..434b4c9af 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -144,21 +144,23 @@ eu.dnetlib.dhp.oa.provision.PrepareRelationsJob dhp-graph-provision-${projectVersion}.jar - --executor-cores=${sparkExecutorCoresForJoining} - --executor-memory=${sparkExecutorMemoryForJoining} + --executor-cores=4 + --executor-memory=6G --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.executor.memoryOverhead=6G --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 + --conf spark.sql.shuffle.partitions=15000 + --conf spark.network.timeout=${sparkNetworkTimeout} --inputRelationsPath${inputGraphRootPath}/relation --outputPath${workingDir}/relation --sourceMaxRelations${sourceMaxRelations} --targetMaxRelations${targetMaxRelations} --relationFilter${relationFilter} - --relPartitions5000 + --relPartitions15000 From 18aa323ee972c8b0565273ada553892f0568f83e Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 8 May 2024 11:36:46 +0200 Subject: [PATCH 3/5] cleanup unused classes, adjustments in the oozie wf definition --- .../dhp/oa/provision/RelationComparator.java | 44 ---------- .../dhp/oa/provision/RelationList.java | 25 ------ .../dhp/oa/provision/SortableRelation.java | 81 ------------------- .../model/ProvisionModelSupport.java | 10 +-- .../dhp/oa/provision/oozie_app/workflow.xml | 11 +-- 5 files changed, 7 insertions(+), 164 deletions(-) delete mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/RelationComparator.java delete mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/RelationList.java delete mode 100644 dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SortableRelation.java diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/RelationComparator.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/RelationComparator.java deleted file mode 100644 index e13bc60eb..000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/RelationComparator.java +++ /dev/null @@ -1,44 +0,0 @@ - -package eu.dnetlib.dhp.oa.provision; - -import java.util.Comparator; -import java.util.Map; -import java.util.Optional; - -import com.google.common.collect.ComparisonChain; -import 
com.google.common.collect.Maps; - -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.Relation; - -public class RelationComparator implements Comparator { - - private static final Map weights = Maps.newHashMap(); - - static { - weights.put(ModelConstants.OUTCOME, 0); - weights.put(ModelConstants.SUPPLEMENT, 1); - weights.put(ModelConstants.REVIEW, 2); - weights.put(ModelConstants.CITATION, 3); - weights.put(ModelConstants.AFFILIATION, 4); - weights.put(ModelConstants.RELATIONSHIP, 5); - weights.put(ModelConstants.PUBLICATION_DATASET, 6); - weights.put(ModelConstants.SIMILARITY, 7); - - weights.put(ModelConstants.PROVISION, 8); - weights.put(ModelConstants.PARTICIPATION, 9); - weights.put(ModelConstants.DEDUP, 10); - } - - private Integer getWeight(Relation o) { - return Optional.ofNullable(weights.get(o.getSubRelType())).orElse(Integer.MAX_VALUE); - } - - @Override - public int compare(Relation o1, Relation o2) { - return ComparisonChain - .start() - .compare(getWeight(o1), getWeight(o2)) - .result(); - } -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/RelationList.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/RelationList.java deleted file mode 100644 index 6e5fd7dba..000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/RelationList.java +++ /dev/null @@ -1,25 +0,0 @@ - -package eu.dnetlib.dhp.oa.provision; - -import java.io.Serializable; -import java.util.PriorityQueue; -import java.util.Queue; - -import eu.dnetlib.dhp.schema.oaf.Relation; - -public class RelationList implements Serializable { - - private Queue relations; - - public RelationList() { - this.relations = new PriorityQueue<>(new RelationComparator()); - } - - public Queue getRelations() { - return relations; - } - - public void setRelations(Queue relations) { - this.relations = relations; - } -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SortableRelation.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SortableRelation.java deleted file mode 100644 index 8740b47fc..000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SortableRelation.java +++ /dev/null @@ -1,81 +0,0 @@ - -package eu.dnetlib.dhp.oa.provision; - -import java.io.Serializable; -import java.util.Map; -import java.util.Optional; - -import com.fasterxml.jackson.annotation.JsonIgnore; -import com.google.common.collect.ComparisonChain; -import com.google.common.collect.Maps; - -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.Relation; - -public class SortableRelation extends Relation implements Comparable, Serializable { - - private static final Map weights = Maps.newHashMap(); - - static { - weights.put(ModelConstants.OUTCOME, 0); - weights.put(ModelConstants.SUPPLEMENT, 1); - weights.put(ModelConstants.REVIEW, 2); - weights.put(ModelConstants.CITATION, 3); - weights.put(ModelConstants.AFFILIATION, 4); - weights.put(ModelConstants.RELATIONSHIP, 5); - weights.put(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, 6); - weights.put(ModelConstants.SIMILARITY, 7); - - weights.put(ModelConstants.PROVISION, 8); - weights.put(ModelConstants.PARTICIPATION, 9); - weights.put(ModelConstants.DEDUP, 10); - } - - private static final long serialVersionUID = 34753984579L; - - private String groupingKey; - - public static SortableRelation create(Relation r, String groupingKey) { 
- SortableRelation sr = new SortableRelation(); - sr.setGroupingKey(groupingKey); - sr.setSource(r.getSource()); - sr.setTarget(r.getTarget()); - sr.setRelType(r.getRelType()); - sr.setSubRelType(r.getSubRelType()); - sr.setRelClass(r.getRelClass()); - sr.setDataInfo(r.getDataInfo()); - sr.setCollectedfrom(r.getCollectedfrom()); - sr.setLastupdatetimestamp(r.getLastupdatetimestamp()); - sr.setProperties(r.getProperties()); - sr.setValidated(r.getValidated()); - sr.setValidationDate(r.getValidationDate()); - - return sr; - } - - @JsonIgnore - public Relation asRelation() { - return this; - } - - @Override - public int compareTo(SortableRelation o) { - return ComparisonChain - .start() - .compare(getGroupingKey(), o.getGroupingKey()) - .compare(getWeight(this), getWeight(o)) - .result(); - } - - private Integer getWeight(SortableRelation o) { - return Optional.ofNullable(weights.get(o.getSubRelType())).orElse(Integer.MAX_VALUE); - } - - public String getGroupingKey() { - return groupingKey; - } - - public void setGroupingKey(String groupingKey) { - this.groupingKey = groupingKey; - } -} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java index 0e6e95de5..10a99704c 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java @@ -1,8 +1,6 @@ package eu.dnetlib.dhp.oa.provision.model; -import static org.apache.commons.lang3.StringUtils.substringBefore; - import java.io.StringReader; import java.util.*; import java.util.stream.Collectors; @@ -16,12 +14,9 @@ import org.jetbrains.annotations.Nullable; import com.google.common.base.Splitter; import com.google.common.collect.Lists; import com.google.common.collect.Maps; -import com.google.common.collect.Sets; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.common.vocabulary.VocabularyTerm; -import eu.dnetlib.dhp.oa.provision.RelationList; -import eu.dnetlib.dhp.oa.provision.SortableRelation; import eu.dnetlib.dhp.oa.provision.utils.ContextDef; import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; import eu.dnetlib.dhp.schema.common.ModelSupport; @@ -55,10 +50,7 @@ public class ProvisionModelSupport { .newArrayList( RelatedEntityWrapper.class, JoinedEntity.class, - RelatedEntity.class, - SortableRelationKey.class, - SortableRelation.class, - RelationList.class)); + RelatedEntity.class)); return modelClasses.toArray(new Class[] {}); } diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index 434b4c9af..1fc28e7ca 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -125,7 +125,7 @@ ${wf:conf('resumeFrom') eq 'prepare_relations'} ${wf:conf('resumeFrom') eq 'fork_join_related_entities'} ${wf:conf('resumeFrom') eq 'fork_join_all_entities'} - ${wf:conf('resumeFrom') eq 'convert_to_xml'} + ${wf:conf('resumeFrom') eq 'create_payloads'} ${wf:conf('resumeFrom') eq 'drop_solr_collection'} ${wf:conf('resumeFrom') eq 'to_solr_index'} @@ 
-587,19 +587,20 @@ - + - + yarn cluster - convert_to_xml + create_payloads eu.dnetlib.dhp.oa.provision.XmlConverterJob dhp-graph-provision-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} @@ -607,7 +608,7 @@ --conf spark.sql.shuffle.partitions=3840 --conf spark.network.timeout=${sparkNetworkTimeout} - --inputPath${workingDir}/join_entities + --inputPath/user/claudio.atzori/data/beta_provision/join_entities --outputPath${workingDir}/xml_json --contextApiBaseUrl${contextApiBaseUrl} --isLookupUrl${isLookupUrl} From 39a2afe8b538c45b1e4d20ed31d3eee1c9dbdd7b Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 9 May 2024 13:54:42 +0200 Subject: [PATCH 4/5] [graph provision] fixed XML serialization of the usage counts measures, renamed workflow actions to better reflect their role --- ...erterJob.java => PayloadConverterJob.java} | 16 +-- .../model/ProvisionModelSupport.java | 11 +- .../oa/provision/utils/XmlRecordFactory.java | 110 ++++++++++-------- .../utils/XmlSerializationUtils.java | 33 ++++++ ...on => input_params_payload_converter.json} | 0 .../dhp/oa/provision/oozie_app/workflow.xml | 2 +- .../dhp/oa/provision/EOSCFuture_Test.java | 2 +- .../provision/IndexRecordTransformerTest.java | 6 +- .../oa/provision/XmlRecordFactoryTest.java | 14 +-- 9 files changed, 120 insertions(+), 74 deletions(-) rename dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/{XmlConverterJob.java => PayloadConverterJob.java} (92%) rename dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/{input_params_xml_converter.json => input_params_payload_converter.json} (100%) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java similarity index 92% rename from dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java rename to dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java index 4353e863f..f34caad75 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java @@ -3,24 +3,16 @@ package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.utils.DHPUtils.toSeq; -import static org.apache.spark.sql.functions.*; import java.util.List; import java.util.Map; import java.util.Optional; import org.apache.commons.io.IOUtils; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.compress.GzipCodec; -import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.*; -import org.apache.spark.sql.expressions.UserDefinedFunction; -import org.apache.spark.sql.types.DataTypes; import 
org.apache.spark.util.LongAccumulator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -45,9 +37,9 @@ import scala.Tuple2; /** * XmlConverterJob converts the JoinedEntities as XML records */ -public class XmlConverterJob { +public class PayloadConverterJob { - private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class); + private static final Logger log = LoggerFactory.getLogger(PayloadConverterJob.class); public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd"; @@ -56,8 +48,8 @@ public class XmlConverterJob { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils .toString( - XmlConverterJob.class - .getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json"))); + PayloadConverterJob.class + .getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_payload_converter.json"))); parser.parseArgument(args); final Boolean isSparkSessionManaged = Optional diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java index 10a99704c..a085a72e0 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java @@ -19,8 +19,10 @@ import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.common.vocabulary.VocabularyTerm; import eu.dnetlib.dhp.oa.provision.utils.ContextDef; import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.solr.*; import eu.dnetlib.dhp.schema.solr.AccessRight; import eu.dnetlib.dhp.schema.solr.Author; @@ -66,7 +68,11 @@ public class ProvisionModelSupport { .setHeader( SolrRecordHeader .newInstance( - e.getId(), e.getOriginalId(), type, deletedbyinference)); + StringUtils + .substringAfter( + e.getId(), + IdentifierFactory.ID_PREFIX_SEPARATOR), + e.getOriginalId(), type, deletedbyinference)); r.setCollectedfrom(asProvenance(e.getCollectedfrom())); r.setContext(asContext(e.getContext(), contextMapper)); r.setPid(asPid(e.getPid())); @@ -106,7 +112,8 @@ public class ProvisionModelSupport { .newInstance( relation.getRelType(), relation.getRelClass(), - relation.getTarget(), relatedRecordType)); + StringUtils.substringAfter(relation.getTarget(), IdentifierFactory.ID_PREFIX_SEPARATOR), + relatedRecordType)); rr.setAcronym(re.getAcronym()); rr.setCode(re.getCode()); diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index 63597c61e..65fa122c8 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -1,25 +1,23 @@ package eu.dnetlib.dhp.oa.provision.utils; -import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.authorPidTypes; -import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.getRelDescriptor; -import static 
org.apache.commons.lang3.StringUtils.isNotBlank; -import static org.apache.commons.lang3.StringUtils.substringBefore; - -import java.io.IOException; -import java.io.Serializable; -import java.io.StringReader; -import java.io.StringWriter; -import java.net.MalformedURLException; -import java.net.URL; -import java.util.*; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import javax.xml.transform.*; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; - +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; +import com.mycila.xmltool.XMLDoc; +import com.mycila.xmltool.XMLTag; +import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; +import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; +import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper; +import eu.dnetlib.dhp.oa.provision.model.XmlInstance; +import eu.dnetlib.dhp.schema.common.*; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; +import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; @@ -31,27 +29,26 @@ import org.dom4j.Node; import org.dom4j.io.OutputFormat; import org.dom4j.io.SAXReader; import org.dom4j.io.XMLWriter; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.base.Joiner; -import com.google.common.base.Splitter; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; -import com.mycila.xmltool.XMLDoc; -import com.mycila.xmltool.XMLTag; - -import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; -import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; -import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper; -import eu.dnetlib.dhp.oa.provision.model.XmlInstance; -import eu.dnetlib.dhp.schema.common.*; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.schema.oaf.Result; -import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; -import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits; import scala.Tuple2; +import javax.xml.transform.*; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import java.io.IOException; +import java.io.Serializable; +import java.io.StringReader; +import java.io.StringWriter; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.authorPidTypes; +import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.getRelDescriptor; +import static org.apache.commons.lang3.StringUtils.isNotBlank; +import static org.apache.commons.lang3.StringUtils.substringBefore; + public class XmlRecordFactory implements Serializable { /** @@ -93,10 +90,13 @@ public class XmlRecordFactory implements Serializable { } public String build(final JoinedEntity je) { + return build(je, false); + } + + public String build(final JoinedEntity je, final Boolean validate) { final Set contexts = Sets.newHashSet(); - // final OafEntity entity = toOafEntity(je.getEntity()); final OafEntity entity = je.getEntity(); final 
TemplateFactory templateFactory = new TemplateFactory(); try { @@ -122,8 +122,14 @@ public class XmlRecordFactory implements Serializable { .buildBody( mainType, metadata, relations, listChildren(entity, je, templateFactory), listExtraInfo(entity)); - return templateFactory.buildRecord(entity, schemaLocation, body); - // return printXML(templateFactory.buildRecord(entity, schemaLocation, body), indent); + String xmlRecord = templateFactory.buildRecord(entity, schemaLocation, body); + + if (Boolean.TRUE.equals(validate)) { + // rise an exception when an invalid record was built + new SAXReader().read(new StringReader(xmlRecord)); + } + return xmlRecord; + // return printXML(templateFactory.buildRecord(entity, schemaLocation, body), indent); } catch (final Throwable e) { throw new RuntimeException(String.format("error building record '%s'", entity.getId()), e); } @@ -1038,13 +1044,21 @@ public class XmlRecordFactory implements Serializable { } private List measuresAsXml(List measures) { - return measures - .stream() - .map(m -> { - List> l = Lists.newArrayList(new Tuple2<>("id", m.getId())); - m.getUnit().forEach(kv -> l.add(new Tuple2<>(kv.getKey(), kv.getValue()))); - return XmlSerializationUtils.asXmlElement("measure", l); - }) + return Stream + .concat( + measures + .stream() + .filter(m -> !"downloads".equals(m.getId()) && !"views".equals(m.getId())) + .map(m -> { + List> l = Lists.newArrayList(new Tuple2<>("id", m.getId())); + m.getUnit().forEach(kv -> l.add(new Tuple2<>(kv.getKey(), kv.getValue()))); + return XmlSerializationUtils.asXmlElement("measure", l); + }), + measures + .stream() + .filter(m -> "downloads".equals(m.getId()) || "views".equals(m.getId())) + .filter(m -> m.getUnit().stream().anyMatch(u -> Integer.parseInt(u.getValue()) > 0)) + .map(m -> XmlSerializationUtils.usageMeasureAsXmlElement("measure", m))) .collect(Collectors.toList()); } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java index deacac3ad..31763ace3 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java @@ -5,7 +5,11 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix; import static org.apache.commons.lang3.StringUtils.isBlank; import static org.apache.commons.lang3.StringUtils.isNotBlank; +import java.util.HashSet; import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; @@ -166,6 +170,35 @@ public class XmlSerializationUtils { return sb.toString(); } + // infrastruct_::f66f1bd369679b5b077dcdf006089556||OpenAIRE + public static String usageMeasureAsXmlElement(String name, Measure measure) { + HashSet dsIds = Optional + .ofNullable(measure.getUnit()) + .map( + m -> m + .stream() + .map(KeyValue::getKey) + .collect(Collectors.toCollection(HashSet::new))) + .orElse(new HashSet<>()); + + StringBuilder sb = new StringBuilder(); + dsIds.forEach(dsId -> { + sb + .append("<") + .append(name); + for (KeyValue kv : measure.getUnit()) { + sb.append(" ").append(attr(measure.getId(), kv.getValue())); + } + sb + .append(">") + .append(dsId) + .append(""); + }); + return sb.toString(); + } + public static String 
mapEoscIf(EoscIfGuidelines e) { return asXmlElement( "eoscifguidelines", Lists diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_payload_converter.json similarity index 100% rename from dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json rename to dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_payload_converter.json diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index 1fc28e7ca..59058d467 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -594,7 +594,7 @@ yarn cluster create_payloads - eu.dnetlib.dhp.oa.provision.XmlConverterJob + eu.dnetlib.dhp.oa.provision.PayloadConverterJob dhp-graph-provision-${projectVersion}.jar --executor-cores=${sparkExecutorCores} diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/EOSCFuture_Test.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/EOSCFuture_Test.java index 1a982ca39..4c43de25c 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/EOSCFuture_Test.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/EOSCFuture_Test.java @@ -50,7 +50,7 @@ public class EOSCFuture_Test { final ContextMapper contextMapper = new ContextMapper(); final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, - XmlConverterJob.schemaLocation); + PayloadConverterJob.schemaLocation); final OtherResearchProduct p = OBJECT_MAPPER .readValue( diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java index 8d5aa3f3a..718b43f03 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java @@ -57,7 +57,7 @@ public class IndexRecordTransformerTest { public void testPublicationRecordTransformation() throws IOException, TransformerException { final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, - XmlConverterJob.schemaLocation); + PayloadConverterJob.schemaLocation); final Publication p = load("publication.json", Publication.class); final Project pj = load("project.json", Project.class); @@ -82,7 +82,7 @@ public class IndexRecordTransformerTest { void testPeerReviewed() throws IOException, TransformerException { final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, - XmlConverterJob.schemaLocation); + PayloadConverterJob.schemaLocation); final Publication p = load("publication.json", Publication.class); @@ -98,7 +98,7 @@ public class IndexRecordTransformerTest { public void testRiunet() throws IOException, TransformerException { final XmlRecordFactory xmlRecordFactory = new 
XmlRecordFactory(contextMapper, false, - XmlConverterJob.schemaLocation); + PayloadConverterJob.schemaLocation); final Publication p = load("riunet.json", Publication.class); diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java index f26c384d2..d617991a1 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java @@ -37,7 +37,7 @@ public class XmlRecordFactoryTest { final ContextMapper contextMapper = new ContextMapper(); final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, - XmlConverterJob.schemaLocation); + PayloadConverterJob.schemaLocation); final Publication p = OBJECT_MAPPER .readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class); @@ -105,7 +105,7 @@ public class XmlRecordFactoryTest { final ContextMapper contextMapper = new ContextMapper(); final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, - XmlConverterJob.schemaLocation); + PayloadConverterJob.schemaLocation); final Publication p = OBJECT_MAPPER .readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class); @@ -136,7 +136,7 @@ public class XmlRecordFactoryTest { final ContextMapper contextMapper = new ContextMapper(); final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, - XmlConverterJob.schemaLocation); + PayloadConverterJob.schemaLocation); final Publication p = OBJECT_MAPPER .readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class); @@ -166,7 +166,7 @@ public class XmlRecordFactoryTest { final ContextMapper contextMapper = new ContextMapper(); final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, - XmlConverterJob.schemaLocation); + PayloadConverterJob.schemaLocation); final Datasource d = OBJECT_MAPPER .readValue(IOUtils.toString(getClass().getResourceAsStream("datasource.json")), Datasource.class); @@ -203,7 +203,7 @@ public class XmlRecordFactoryTest { final ContextMapper contextMapper = new ContextMapper(); final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, - XmlConverterJob.schemaLocation); + PayloadConverterJob.schemaLocation); final OtherResearchProduct p = OBJECT_MAPPER .readValue( @@ -226,7 +226,7 @@ public class XmlRecordFactoryTest { final ContextMapper contextMapper = new ContextMapper(); final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, - XmlConverterJob.schemaLocation); + PayloadConverterJob.schemaLocation); final OtherResearchProduct p = OBJECT_MAPPER .readValue( @@ -249,7 +249,7 @@ public class XmlRecordFactoryTest { final ContextMapper contextMapper = new ContextMapper(); final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, - XmlConverterJob.schemaLocation); + PayloadConverterJob.schemaLocation); final Publication p = OBJECT_MAPPER .readValue( From 55f39f785094f6500171d06945b3e5fcfc479a4c Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 9 May 2024 14:06:04 +0200 Subject: [PATCH 5/5] [graph provision] adds the possibility to validate the XML records before storing them via the validateXML parameter --- 
.../dhp/oa/provision/PayloadConverterJob.java | 17 ++++++++++++----- .../input_params_payload_converter.json | 6 ++++++ .../dhp/oa/provision/oozie_app/workflow.xml | 6 ++++++ 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java index f34caad75..d7e22e557 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java @@ -64,6 +64,12 @@ public class PayloadConverterJob { final String outputPath = parser.get("outputPath"); log.info("outputPath: {}", outputPath); + final Boolean validateXML = Optional + .ofNullable(parser.get("validateXML")) + .map(Boolean::valueOf) + .orElse(Boolean.FALSE); + log.info("validateXML: {}", validateXML); + final String contextApiBaseUrl = parser.get("contextApiBaseUrl"); log.info("contextApiBaseUrl: {}", contextApiBaseUrl); @@ -78,18 +84,19 @@ public class PayloadConverterJob { runWithSparkSession(conf, isSparkSessionManaged, spark -> { removeOutputDir(spark, outputPath); - convertToXml( + createPayloads( spark, inputPath, outputPath, ContextMapper.fromAPI(contextApiBaseUrl), - VocabularyGroup.loadVocsFromIS(isLookup)); + VocabularyGroup.loadVocsFromIS(isLookup), validateXML); }); } - private static void convertToXml( + private static void createPayloads( final SparkSession spark, final String inputPath, final String outputPath, final ContextMapper contextMapper, - final VocabularyGroup vocabularies) { + final VocabularyGroup vocabularies, + final Boolean validateXML) { final XmlRecordFactory recordFactory = new XmlRecordFactory( prepareAccumulators(spark.sparkContext()), @@ -110,7 +117,7 @@ public class PayloadConverterJob { .as(Encoders.kryo(JoinedEntity.class)) .map( (MapFunction>) je -> new Tuple2<>( - recordFactory.build(je), + recordFactory.build(je, validateXML), ProvisionModelSupport.transform(je, contextMapper, vocabularies)), Encoders.tuple(Encoders.STRING(), Encoders.bean(SolrRecord.class))) .map( diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_payload_converter.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_payload_converter.json index 4509eb9de..1b43ca5fd 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_payload_converter.json +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_payload_converter.json @@ -22,5 +22,11 @@ "paramLongName": "isLookupUrl", "paramDescription": "URL of the context ISLookup Service", "paramRequired": true + }, + { + "paramName": "val", + "paramLongName": "validateXML", + "paramDescription": "should the process check the XML validity", + "paramRequired": false } ] diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index 59058d467..1682f2ed5 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -13,6 +13,11 
@@ contextApiBaseUrl context API URL + + validateXML + should the payload converter validate the XMLs + false + relPartitions number or partitions for the relations Dataset @@ -610,6 +615,7 @@ --inputPath/user/claudio.atzori/data/beta_provision/join_entities --outputPath${workingDir}/xml_json + --validateXML${validateXML} --contextApiBaseUrl${contextApiBaseUrl} --isLookupUrl${isLookupUrl}
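
For reference, the core change in PATCH 1/5 replaces the RDD-based pruning (SortableRelationKey + RelationPartitioner) with Dataframe window functions: relations are ranked per (source, subRelType) and per (target, subRelType) group with row_number(), and rows ranked beyond the configured maximum are filtered out before writing the parquet output. The following is a minimal, self-contained sketch of that technique only, not code from the repository: the class name, the sample data, the single source-side window and the "<=" cap are illustrative assumptions.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class RelationPruningSketch {

    public static void main(String[] args) {
        SparkSession spark = SparkSession
            .builder()
            .appName("relation-pruning-sketch")
            .master("local[*]")
            .getOrCreate();

        // Toy relation records standing in for the JSON relations read by the real job.
        StructType schema = new StructType()
            .add("source", DataTypes.StringType)
            .add("target", DataTypes.StringType)
            .add("subRelType", DataTypes.StringType);

        List<Row> rows = Arrays.asList(
            RowFactory.create("s1", "t1", "outcome"),
            RowFactory.create("s1", "t2", "outcome"),
            RowFactory.create("s1", "t3", "outcome"),
            RowFactory.create("s2", "t1", "citation"));

        Dataset<Row> rels = spark.createDataFrame(rows, schema);

        // Hypothetical cap; the real job reads sourceMaxRelations from a job parameter.
        int sourceMaxRelations = 2;

        // Rank relations within each (source, subRelType) partition and keep at most
        // sourceMaxRelations of them, mirroring the pruning previously done with
        // repartitionAndSortWithinPartitions + Iterables.limit in the RDD version.
        WindowSpec sourceWindow = Window
            .partitionBy("source", "subRelType")
            .orderBy(functions.col("target").desc_nulls_last());

        Dataset<Row> pruned = rels
            .withColumn("source_w_pos", functions.row_number().over(sourceWindow))
            .where("source_w_pos <= " + sourceMaxRelations)
            .drop("source_w_pos");

        pruned.show(false);
        spark.stop();
    }
}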