forked from antonis.lempesis/dnet-hadoop
fix a typo in the compression keyword and added some logging info in the spark job
This commit is contained in:
parent
28556507e7
commit
7e66bc2527
|
@ -67,11 +67,12 @@ public class PrepareMergedRelationJob {
|
||||||
|
|
||||||
Dataset<Relation> relation = readRelations(spark, inputPath);
|
Dataset<Relation> relation = readRelations(spark, inputPath);
|
||||||
|
|
||||||
relation.filter("relclass = 'merges' and datainfo.deletedbyinference=false")
|
relation
|
||||||
.write()
|
.filter("relclass = 'merges' and datainfo.deletedbyinference=false")
|
||||||
.mode(SaveMode.Overwrite)
|
.write()
|
||||||
.option("compression","gizp")
|
.mode(SaveMode.Overwrite)
|
||||||
.json(outputPath);
|
.option("compression", "gzip")
|
||||||
|
.json(outputPath);
|
||||||
// relation.createOrReplaceTempView("relation");
|
// relation.createOrReplaceTempView("relation");
|
||||||
//
|
//
|
||||||
// spark
|
// spark
|
||||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.blacklist;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
|
import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
@ -18,6 +19,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
public class SparkRemoveBlacklistedRelationJob {
|
public class SparkRemoveBlacklistedRelationJob {
|
||||||
private static final Logger log = LoggerFactory.getLogger(SparkRemoveBlacklistedRelationJob.class);
|
private static final Logger log = LoggerFactory.getLogger(SparkRemoveBlacklistedRelationJob.class);
|
||||||
|
@ -78,8 +80,12 @@ public class SparkRemoveBlacklistedRelationJob {
|
||||||
|
|
||||||
log.info("InputRelationCount: {}", inputRelation.count());
|
log.info("InputRelationCount: {}", inputRelation.count());
|
||||||
|
|
||||||
|
log.info("NumberOfBlacklistedRelations: {}", blackListed.count());
|
||||||
|
|
||||||
Dataset<Relation> dedupSource = blackListed
|
Dataset<Relation> dedupSource = blackListed
|
||||||
.joinWith(mergesRelation, blackListed.col("source").equalTo(mergesRelation.col("target")), "left_outer")
|
.joinWith(
|
||||||
|
mergesRelation, blackListed.col("source").equalTo(mergesRelation.col("target")),
|
||||||
|
"left_outer")
|
||||||
.map(c -> {
|
.map(c -> {
|
||||||
Optional
|
Optional
|
||||||
.ofNullable(c._2())
|
.ofNullable(c._2())
|
||||||
|
@ -88,7 +94,9 @@ public class SparkRemoveBlacklistedRelationJob {
|
||||||
}, Encoders.bean(Relation.class));
|
}, Encoders.bean(Relation.class));
|
||||||
|
|
||||||
Dataset<Relation> dedupBL = dedupSource
|
Dataset<Relation> dedupBL = dedupSource
|
||||||
.joinWith(mergesRelation, dedupSource.col("target").equalTo(mergesRelation.col("target")), "left_outer")
|
.joinWith(
|
||||||
|
mergesRelation, dedupSource.col("target").equalTo(mergesRelation.col("target")),
|
||||||
|
"left_outer")
|
||||||
.map(c -> {
|
.map(c -> {
|
||||||
Optional
|
Optional
|
||||||
.ofNullable(c._2())
|
.ofNullable(c._2())
|
||||||
|
@ -98,28 +106,41 @@ public class SparkRemoveBlacklistedRelationJob {
|
||||||
|
|
||||||
dedupBL
|
dedupBL
|
||||||
.write()
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
.json(blacklistPath + "/deduped");
|
.json(blacklistPath + "/deduped");
|
||||||
|
|
||||||
Dataset<Relation> tmp = inputRelation
|
log.info("number of dedupedBL: {}", dedupBL.count());
|
||||||
|
|
||||||
|
Dataset<Tuple2<Relation, Relation>> tmp = inputRelation
|
||||||
.joinWith(
|
.joinWith(
|
||||||
dedupBL, inputRelation.col("source").equalTo(dedupBL.col("source")),
|
dedupBL, (inputRelation
|
||||||
"left_outer")
|
.col("source")
|
||||||
.map(c -> {
|
.equalTo(dedupBL.col("source"))
|
||||||
Relation ir = c._1();
|
.and(
|
||||||
Optional<Relation> obl = Optional.ofNullable(c._2());
|
inputRelation
|
||||||
if (obl.isPresent()) {
|
.col("target")
|
||||||
if (ir.equals(obl.get())) {
|
.equalTo(dedupBL.col("target"))
|
||||||
return null;
|
.and(inputRelation.col("relclass").equalTo(dedupBL.col("relclass"))))),
|
||||||
}
|
"left_outer");
|
||||||
|
|
||||||
|
log.info("numberOfRelationAfterJoin: {}", tmp.count());
|
||||||
|
|
||||||
|
Dataset<Relation> tmp1 = tmp.map(c -> {
|
||||||
|
Relation ir = c._1();
|
||||||
|
Optional<Relation> obl = Optional.ofNullable(c._2());
|
||||||
|
if (obl.isPresent()) {
|
||||||
|
if (areEquals(ir, obl.get())) {
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
return ir;
|
}
|
||||||
|
return ir;
|
||||||
|
|
||||||
}, Encoders.bean(Relation.class))
|
}, Encoders.bean(Relation.class))
|
||||||
.filter(r -> r != null);
|
.filter(Objects::nonNull);
|
||||||
|
|
||||||
log.info("NumberOfRelationAfterBlacklisting: {} ", tmp.count());
|
log.info("NumberOfRelationAfterBlacklisting: {} ", tmp1.count());
|
||||||
|
|
||||||
tmp
|
tmp1
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
|
@ -127,6 +148,12 @@ public class SparkRemoveBlacklistedRelationJob {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static boolean areEquals(Relation ir, Relation bl) {
|
||||||
|
return ir.getRelClass().equals(bl.getRelClass()) &&
|
||||||
|
ir.getRelType().equals(bl.getRelType()) &&
|
||||||
|
ir.getSubRelType().equals(bl.getSubRelType());
|
||||||
|
}
|
||||||
|
|
||||||
public static org.apache.spark.sql.Dataset<Relation> readRelations(
|
public static org.apache.spark.sql.Dataset<Relation> readRelations(
|
||||||
SparkSession spark, String inputPath) {
|
SparkSession spark, String inputPath) {
|
||||||
return spark
|
return spark
|
||||||
|
|
Loading…
Reference in New Issue