From 0d7b2bf83d9cea3e948d6d7481db785b6fa8a20b Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 28 Aug 2023 10:34:54 +0200 Subject: [PATCH 1/2] Rewrite SparkPropagateRelation exploiting Dataframe API --- .../dhp/oa/dedup/SparkPropagateRelation.java | 302 ++++++++---------- 1 file changed, 127 insertions(+), 175 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java index 0fa41bd6d..00dfd215d 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java @@ -1,19 +1,6 @@ package eu.dnetlib.dhp.oa.dedup; -import static org.apache.spark.sql.functions.col; - -import java.util.Objects; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.FilterFunction; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.*; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; @@ -21,198 +8,163 @@ import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.api.java.function.ReduceFunction; +import org.apache.spark.sql.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import scala.Tuple2; import scala.Tuple3; +import java.util.Objects; + +import static org.apache.spark.sql.functions.col; + public class SparkPropagateRelation extends AbstractSparkAction { - private static final Logger log = LoggerFactory.getLogger(SparkPropagateRelation.class); + private static final Logger log = LoggerFactory.getLogger(SparkPropagateRelation.class); - enum FieldType { - SOURCE, TARGET - } + private static Encoder REL_BEAN_ENC = Encoders.bean(Relation.class); - public SparkPropagateRelation(ArgumentApplicationParser parser, SparkSession spark) { - super(parser, spark); - } + private static Encoder REL_KRYO_ENC = Encoders.kryo(Relation.class); - public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - SparkPropagateRelation.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json"))); + public SparkPropagateRelation(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + } - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkPropagateRelation.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json"))); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + parser.parseArgument(args); - new SparkPropagateRelation(parser, getSparkSession(conf)) - .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); - } + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - @Override - public void run(ISLookUpService isLookUpService) { + new SparkPropagateRelation(parser, getSparkSession(conf)) + .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); + } - final String graphBasePath = parser.get("graphBasePath"); - final String workingPath = parser.get("workingPath"); - final String graphOutputPath = parser.get("graphOutputPath"); + @Override + public void run(ISLookUpService isLookUpService) { - log.info("graphBasePath: '{}'", graphBasePath); - log.info("workingPath: '{}'", workingPath); - log.info("graphOutputPath: '{}'", graphOutputPath); + final String graphBasePath = parser.get("graphBasePath"); + final String workingPath = parser.get("workingPath"); + final String graphOutputPath = parser.get("graphOutputPath"); - final String outputRelationPath = DedupUtility.createEntityPath(graphOutputPath, "relation"); - removeOutputDir(spark, outputRelationPath); + log.info("graphBasePath: '{}'", graphBasePath); + log.info("workingPath: '{}'", workingPath); + log.info("graphOutputPath: '{}'", graphOutputPath); - Dataset mergeRels = spark - .read() - .load(DedupUtility.createMergeRelPath(workingPath, "*", "*")) - .as(Encoders.bean(Relation.class)); + final String outputRelationPath = DedupUtility.createEntityPath(graphOutputPath, "relation"); + removeOutputDir(spark, outputRelationPath); - // - Dataset> mergedIds = mergeRels - .where(col("relClass").equalTo(ModelConstants.MERGES)) - .select(col("source"), col("target")) - .distinct() - .map( - (MapFunction>) r -> new Tuple2<>(r.getString(1), r.getString(0)), - Encoders.tuple(Encoders.STRING(), Encoders.STRING())) - .cache(); + Dataset mergeRels = spark + .read() + .load(DedupUtility.createMergeRelPath(workingPath, "*", "*")) + .as(REL_BEAN_ENC); - final String relationPath = DedupUtility.createEntityPath(graphBasePath, "relation"); + // + Dataset mergedIds = mergeRels + .where(col("relClass").equalTo(ModelConstants.MERGES)) + .select(col("source").as("dedupID"), col("target").as("mergedObjectID")) + .distinct() + .cache(); - Dataset rels = spark.read().textFile(relationPath).map(patchRelFn(), Encoders.bean(Relation.class)); + final String inputRelationPath = DedupUtility.createEntityPath(graphBasePath, "relation"); - Dataset newRels = createNewRels(rels, mergedIds, getFixRelFn()); + Dataset rels = spark.read().schema(REL_BEAN_ENC.schema()).json(inputRelationPath) + .as(REL_BEAN_ENC) +// .map((MapFunction) rel -> { +// if (rel.getDataInfo() == null) { +// rel.setDataInfo(new DataInfo()); +// } +// return rel; +// }, REL_BEAN_ENC) + ; - Dataset updated = processDataset( - processDataset(rels, mergedIds, FieldType.SOURCE, getDeletedFn()), - mergedIds, - FieldType.TARGET, - getDeletedFn()); + Dataset> dedupedRels = rels + .joinWith(mergedIds, rels.col("source").equalTo(mergedIds.col("mergedObjectID")), "left_outer") + .joinWith(mergedIds, col("_1.target").equalTo(mergedIds.col("mergedObjectID")), "left_outer") + .filter("_1._2 IS NOT NULL OR _2 IS NOT NULL") + .select("_1._1", "_1._2.dedupID", "_2.dedupID") + .as(Encoders.tuple(REL_BEAN_ENC, Encoders.STRING(), Encoders.STRING())) + .cache(); - save( - distinctRelations( - newRels - .union(updated) - .union(mergeRels) - .map((MapFunction) r -> r, Encoders.kryo(Relation.class))) - .filter((FilterFunction) r -> !Objects.equals(r.getSource(), r.getTarget())), - outputRelationPath, SaveMode.Overwrite); - } + mergedIds.unpersist(); - private Dataset distinctRelations(Dataset rels) { - return rels - .filter(getRelationFilterFunction()) - .groupByKey( - (MapFunction) r -> String - .join(r.getSource(), r.getTarget(), r.getRelType(), r.getSubRelType(), r.getRelClass()), - Encoders.STRING()) - .agg(new RelationAggregator().toColumn()) - .map((MapFunction, Relation>) Tuple2::_2, Encoders.bean(Relation.class)); - } + Dataset newRels = dedupedRels + .map((MapFunction, Relation>) t -> { + Relation r = t._1(); + String newSource = t._2(); + String newTarget = t._3(); - // redirect the relations to the dedupID - private static Dataset createNewRels( - Dataset rels, // all the relations to be redirected - Dataset> mergedIds, // merge rels: - MapFunction, Tuple2>, Tuple2>, Relation> mapRel) { + if (r.getDataInfo() == null) { + r.setDataInfo(new DataInfo()); + } + r.getDataInfo().setDeletedbyinference(false); - // - Dataset> mapped = rels - .map( - (MapFunction>) r -> new Tuple3<>(getId(r, FieldType.SOURCE), - r, getId(r, FieldType.TARGET)), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class), Encoders.STRING())); + if (newSource != null) + r.setSource(newSource); - // < , > - Dataset, Tuple2>> relSource = mapped - .joinWith(mergedIds, mapped.col("_1").equalTo(mergedIds.col("_1")), "left_outer"); + if (newTarget != null) + r.setTarget(newTarget); - // < <, >, > - Dataset, Tuple2>, Tuple2>> relSourceTarget = relSource - .joinWith(mergedIds, relSource.col("_1._3").equalTo(mergedIds.col("_1")), "left_outer"); + return r; + }, REL_BEAN_ENC) + .distinct(); - return relSourceTarget - .filter( - (FilterFunction, Tuple2>, Tuple2>>) r -> r - ._1() - ._1() != null || r._2() != null) - .map(mapRel, Encoders.bean(Relation.class)) - .distinct(); - } + Dataset updated = dedupedRels + .map((MapFunction, Relation>) t -> { + Relation r = t._1(); + if (r.getDataInfo() == null) { + r.setDataInfo(new DataInfo()); + } + r.getDataInfo().setDeletedbyinference(true); + return r; + }, REL_BEAN_ENC); - private static Dataset processDataset( - Dataset rels, - Dataset> mergedIds, - FieldType type, - MapFunction, Tuple2>, Relation> mapFn) { - final Dataset> mapped = rels - .map( - (MapFunction>) r -> new Tuple2<>(getId(r, type), r), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class))); - return mapped - .joinWith(mergedIds, mapped.col("_1").equalTo(mergedIds.col("_1")), "left_outer") - .map(mapFn, Encoders.bean(Relation.class)); - } + save( + distinctRelations( + newRels + .union(updated) + .union(mergeRels) + .map((MapFunction) r -> r, REL_KRYO_ENC) + ) + .filter((FilterFunction) r -> !Objects.equals(r.getSource(), r.getTarget())), + outputRelationPath, SaveMode.Overwrite); + } - private FilterFunction getRelationFilterFunction() { - return r -> StringUtils.isNotBlank(r.getSource()) || - StringUtils.isNotBlank(r.getTarget()) || - StringUtils.isNotBlank(r.getRelType()) || - StringUtils.isNotBlank(r.getSubRelType()) || - StringUtils.isNotBlank(r.getRelClass()); - } + private Dataset distinctRelations(Dataset rels) { + return rels + .filter(getRelationFilterFunction()) + .groupByKey( + (MapFunction) r -> String + .join(" ", r.getSource(), r.getTarget(), r.getRelType(), r.getSubRelType(), r.getRelClass()), + Encoders.STRING()) + .reduceGroups((ReduceFunction) (b, a) -> { + b.mergeFrom(a); + return b; + } + ) - private static String getId(Relation r, FieldType type) { - switch (type) { - case SOURCE: - return r.getSource(); - case TARGET: - return r.getTarget(); - default: - throw new IllegalArgumentException(""); - } - } - - private static MapFunction, Tuple2>, Tuple2>, Relation> getFixRelFn() { - return value -> { - - Relation r = value._1()._1()._2(); - String newSource = value._1()._2() != null ? value._1()._2()._2() : null; - String newTarget = value._2() != null ? value._2()._2() : null; - - if (r.getDataInfo() == null) { - r.setDataInfo(new DataInfo()); - } - r.getDataInfo().setDeletedbyinference(false); - - if (newSource != null) - r.setSource(newSource); - - if (newTarget != null) - r.setTarget(newTarget); - - return r; - }; - } - - private static MapFunction, Tuple2>, Relation> getDeletedFn() { - return value -> { - if (value._2() != null) { - Relation r = value._1()._2(); - if (r.getDataInfo() == null) { - r.setDataInfo(new DataInfo()); - } - r.getDataInfo().setDeletedbyinference(true); - return r; - } - return value._1()._2(); - }; - } + .map((MapFunction, Relation>) Tuple2::_2, REL_BEAN_ENC); + } + private FilterFunction getRelationFilterFunction() { + return r -> StringUtils.isNotBlank(r.getSource()) || + StringUtils.isNotBlank(r.getTarget()) || + StringUtils.isNotBlank(r.getRelType()) || + StringUtils.isNotBlank(r.getSubRelType()) || + StringUtils.isNotBlank(r.getRelClass()); + } } From a860e1942313b0ee2f9a1dae07493108157f3a64 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 28 Aug 2023 15:36:02 +0200 Subject: [PATCH 2/2] Fix ensure all relations are written out, not only those managed by dedup --- .../dhp/oa/dedup/SparkPropagateRelation.java | 90 ++++++++----------- 1 file changed, 38 insertions(+), 52 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java index 00dfd215d..d16a3a63d 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java @@ -8,6 +8,7 @@ import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import org.apache.commons.beanutils.BeanUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; @@ -20,6 +21,9 @@ import org.slf4j.LoggerFactory; import scala.Tuple2; import scala.Tuple3; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; import java.util.Objects; import static org.apache.spark.sql.functions.col; @@ -80,68 +84,51 @@ public class SparkPropagateRelation extends AbstractSparkAction { .distinct() .cache(); - final String inputRelationPath = DedupUtility.createEntityPath(graphBasePath, "relation"); + Dataset allRels = spark.read() + .schema(REL_BEAN_ENC.schema()) + .json(DedupUtility.createEntityPath(graphBasePath, "relation")); - Dataset rels = spark.read().schema(REL_BEAN_ENC.schema()).json(inputRelationPath) - .as(REL_BEAN_ENC) -// .map((MapFunction) rel -> { -// if (rel.getDataInfo() == null) { -// rel.setDataInfo(new DataInfo()); -// } -// return rel; -// }, REL_BEAN_ENC) - ; - - Dataset> dedupedRels = rels - .joinWith(mergedIds, rels.col("source").equalTo(mergedIds.col("mergedObjectID")), "left_outer") + Dataset dedupedRels = allRels + .joinWith(mergedIds, allRels.col("source").equalTo(mergedIds.col("mergedObjectID")), "left_outer") .joinWith(mergedIds, col("_1.target").equalTo(mergedIds.col("mergedObjectID")), "left_outer") - .filter("_1._2 IS NOT NULL OR _2 IS NOT NULL") .select("_1._1", "_1._2.dedupID", "_2.dedupID") .as(Encoders.tuple(REL_BEAN_ENC, Encoders.STRING(), Encoders.STRING())) - .cache(); + .flatMap(SparkPropagateRelation::addInferredRelations, REL_KRYO_ENC); - mergedIds.unpersist(); + Dataset processedRelations = distinctRelations(dedupedRels.union(mergeRels.map((MapFunction) r -> r, REL_KRYO_ENC))) + .filter((FilterFunction) r -> !Objects.equals(r.getSource(), r.getTarget())); - Dataset newRels = dedupedRels - .map((MapFunction, Relation>) t -> { - Relation r = t._1(); - String newSource = t._2(); - String newTarget = t._3(); + save(processedRelations, outputRelationPath, SaveMode.Overwrite); + } - if (r.getDataInfo() == null) { - r.setDataInfo(new DataInfo()); - } - r.getDataInfo().setDeletedbyinference(false); + private static Iterator addInferredRelations(Tuple3 t) throws Exception { + Relation existingRel = t._1(); + String newSource = t._2(); + String newTarget = t._3(); - if (newSource != null) - r.setSource(newSource); + if (newSource == null && newTarget == null) { + return Collections.singleton(t._1()).iterator(); + } - if (newTarget != null) - r.setTarget(newTarget); + // update existing relation + if (existingRel.getDataInfo() == null) { + existingRel.setDataInfo(new DataInfo()); + } + existingRel.getDataInfo().setDeletedbyinference(true); - return r; - }, REL_BEAN_ENC) - .distinct(); + // Create new relation inferred by dedupIDs + Relation inferredRel = (Relation) BeanUtils.cloneBean(existingRel); - Dataset updated = dedupedRels - .map((MapFunction, Relation>) t -> { - Relation r = t._1(); - if (r.getDataInfo() == null) { - r.setDataInfo(new DataInfo()); - } - r.getDataInfo().setDeletedbyinference(true); - return r; - }, REL_BEAN_ENC); + inferredRel.setDataInfo((DataInfo) BeanUtils.cloneBean(existingRel.getDataInfo())); + inferredRel.getDataInfo().setDeletedbyinference(false); - save( - distinctRelations( - newRels - .union(updated) - .union(mergeRels) - .map((MapFunction) r -> r, REL_KRYO_ENC) - ) - .filter((FilterFunction) r -> !Objects.equals(r.getSource(), r.getTarget())), - outputRelationPath, SaveMode.Overwrite); + if (newSource != null) + inferredRel.setSource(newSource); + + if (newTarget != null) + inferredRel.setTarget(newTarget); + + return Arrays.asList(existingRel, inferredRel).iterator(); } private Dataset distinctRelations(Dataset rels) { @@ -156,8 +143,7 @@ public class SparkPropagateRelation extends AbstractSparkAction { return b; } ) - - .map((MapFunction, Relation>) Tuple2::_2, REL_BEAN_ENC); + .map((MapFunction, Relation>) Tuple2::_2, REL_BEAN_ENC); } private FilterFunction getRelationFilterFunction() {