master #11
|
@ -62,8 +62,8 @@ public class SparkRemoveBlacklistedRelationJob {
|
||||||
spark -> {
|
spark -> {
|
||||||
removeBlacklistedRelations(
|
removeBlacklistedRelations(
|
||||||
spark,
|
spark,
|
||||||
inputPath,
|
|
||||||
blacklistPath,
|
blacklistPath,
|
||||||
|
inputPath,
|
||||||
outputPath,
|
outputPath,
|
||||||
mergesPath);
|
mergesPath);
|
||||||
});
|
});
|
||||||
|
@ -76,30 +76,34 @@ public class SparkRemoveBlacklistedRelationJob {
|
||||||
Dataset<Relation> inputRelation = readRelations(spark, inputPath);
|
Dataset<Relation> inputRelation = readRelations(spark, inputPath);
|
||||||
Dataset<Relation> mergesRelation = readRelations(spark, mergesPath);
|
Dataset<Relation> mergesRelation = readRelations(spark, mergesPath);
|
||||||
|
|
||||||
|
log.info("InputRelationCount: {}", inputRelation.count());
|
||||||
|
|
||||||
Dataset<Relation> dedupSource = blackListed
|
Dataset<Relation> dedupSource = blackListed
|
||||||
.joinWith(mergesRelation, blackListed.col("source").equalTo(mergesRelation.col("target")), "left_outer")
|
.joinWith(mergesRelation, blackListed.col("source").equalTo(mergesRelation.col("target")), "left_outer")
|
||||||
.map(c -> {
|
.map(c -> {
|
||||||
Optional<Relation> merged = Optional.ofNullable(c._2());
|
Optional
|
||||||
Relation bl = c._1();
|
.ofNullable(c._2())
|
||||||
if (merged.isPresent()) {
|
.ifPresent(mr -> c._1().setSource(mr.getSource()));
|
||||||
bl.setSource(merged.get().getSource());
|
return c._1();
|
||||||
}
|
|
||||||
return bl;
|
|
||||||
}, Encoders.bean(Relation.class));
|
}, Encoders.bean(Relation.class));
|
||||||
|
|
||||||
Dataset<Relation> dedupBL = dedupSource
|
Dataset<Relation> dedupBL = dedupSource
|
||||||
.joinWith(mergesRelation, dedupSource.col("target").equalTo(mergesRelation.col("target")), "left_outer")
|
.joinWith(mergesRelation, dedupSource.col("target").equalTo(mergesRelation.col("target")), "left_outer")
|
||||||
.map(c -> {
|
.map(c -> {
|
||||||
Optional<Relation> merged = Optional.ofNullable(c._2());
|
Optional
|
||||||
Relation bl = c._1();
|
.ofNullable(c._2())
|
||||||
if (merged.isPresent()) {
|
.ifPresent(mr -> c._1().setTarget(mr.getSource()));
|
||||||
bl.setTarget(merged.get().getSource());
|
return c._1();
|
||||||
}
|
|
||||||
return bl;
|
|
||||||
}, Encoders.bean(Relation.class));
|
}, Encoders.bean(Relation.class));
|
||||||
|
|
||||||
inputRelation
|
dedupBL
|
||||||
.joinWith(dedupBL, inputRelation.col("source").equalTo(dedupBL.col("source")), "left_outer")
|
.write()
|
||||||
|
.json(blacklistPath + "/deduped");
|
||||||
|
|
||||||
|
Dataset<Relation> tmp = inputRelation
|
||||||
|
.joinWith(
|
||||||
|
dedupBL, inputRelation.col("source").equalTo(dedupBL.col("source")),
|
||||||
|
"left_outer")
|
||||||
.map(c -> {
|
.map(c -> {
|
||||||
Relation ir = c._1();
|
Relation ir = c._1();
|
||||||
Optional<Relation> obl = Optional.ofNullable(c._2());
|
Optional<Relation> obl = Optional.ofNullable(c._2());
|
||||||
|
@ -111,12 +115,15 @@ public class SparkRemoveBlacklistedRelationJob {
|
||||||
return ir;
|
return ir;
|
||||||
|
|
||||||
}, Encoders.bean(Relation.class))
|
}, Encoders.bean(Relation.class))
|
||||||
.filter(r -> !(r == null))
|
.filter(r -> r != null);
|
||||||
.toJSON()
|
|
||||||
|
log.info("NumberOfRelationAfterBlacklisting: {} ", tmp.count());
|
||||||
|
|
||||||
|
tmp
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("conpression", "gzip")
|
.option("compression", "gzip")
|
||||||
.text(outputPath);
|
.json(outputPath);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -62,7 +62,7 @@
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/mergesRelation</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/mergesRelation</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
|
@ -86,7 +86,7 @@
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/relation</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/relation</arg>
|
||||||
<arg>--hdfsPath</arg><arg>${workingDir}/blacklist</arg>
|
<arg>--hdfsPath</arg><arg>${workingDir}/blacklist</arg>
|
||||||
<arg>--mergesPath</arg><arg>${workingDir}/mergesRelation</arg>
|
<arg>--mergesPath</arg><arg>${workingDir}/mergesRelation</arg>
|
||||||
|
|
Loading…
Reference in New Issue