master #11

Manually merged
claudio.atzori merged 275 commits from :master into enrichment_wfs 2020-05-11 15:14:56 +02:00
2 changed files with 28 additions and 21 deletions
Showing only changes of commit f95d288681 - Show all commits

View File

@@ -62,8 +62,8 @@ public class SparkRemoveBlacklistedRelationJob {
spark -> { spark -> {
removeBlacklistedRelations( removeBlacklistedRelations(
spark, spark,
inputPath,
blacklistPath, blacklistPath,
inputPath,
outputPath, outputPath,
mergesPath); mergesPath);
}); });
@@ -76,30 +76,34 @@ public class SparkRemoveBlacklistedRelationJob {
Dataset<Relation> inputRelation = readRelations(spark, inputPath); Dataset<Relation> inputRelation = readRelations(spark, inputPath);
Dataset<Relation> mergesRelation = readRelations(spark, mergesPath); Dataset<Relation> mergesRelation = readRelations(spark, mergesPath);
log.info("InputRelationCount: {}", inputRelation.count());
Dataset<Relation> dedupSource = blackListed Dataset<Relation> dedupSource = blackListed
.joinWith(mergesRelation, blackListed.col("source").equalTo(mergesRelation.col("target")), "left_outer") .joinWith(mergesRelation, blackListed.col("source").equalTo(mergesRelation.col("target")), "left_outer")
.map(c -> { .map(c -> {
Optional<Relation> merged = Optional.ofNullable(c._2()); Optional
Relation bl = c._1(); .ofNullable(c._2())
if (merged.isPresent()) { .ifPresent(mr -> c._1().setSource(mr.getSource()));
bl.setSource(merged.get().getSource()); return c._1();
}
return bl;
}, Encoders.bean(Relation.class)); }, Encoders.bean(Relation.class));
Dataset<Relation> dedupBL = dedupSource Dataset<Relation> dedupBL = dedupSource
.joinWith(mergesRelation, dedupSource.col("target").equalTo(mergesRelation.col("target")), "left_outer") .joinWith(mergesRelation, dedupSource.col("target").equalTo(mergesRelation.col("target")), "left_outer")
.map(c -> { .map(c -> {
Optional<Relation> merged = Optional.ofNullable(c._2()); Optional
Relation bl = c._1(); .ofNullable(c._2())
if (merged.isPresent()) { .ifPresent(mr -> c._1().setTarget(mr.getSource()));
bl.setTarget(merged.get().getSource()); return c._1();
}
return bl;
}, Encoders.bean(Relation.class)); }, Encoders.bean(Relation.class));
inputRelation dedupBL
.joinWith(dedupBL, inputRelation.col("source").equalTo(dedupBL.col("source")), "left_outer") .write()
.json(blacklistPath + "/deduped");
Dataset<Relation> tmp = inputRelation
.joinWith(
dedupBL, inputRelation.col("source").equalTo(dedupBL.col("source")),
"left_outer")
.map(c -> { .map(c -> {
Relation ir = c._1(); Relation ir = c._1();
Optional<Relation> obl = Optional.ofNullable(c._2()); Optional<Relation> obl = Optional.ofNullable(c._2());
@@ -111,12 +115,15 @@ public class SparkRemoveBlacklistedRelationJob {
return ir; return ir;
}, Encoders.bean(Relation.class)) }, Encoders.bean(Relation.class))
.filter(r -> !(r == null)) .filter(r -> r != null);
.toJSON()
log.info("NumberOfRelationAfterBlacklisting: {} ", tmp.count());
tmp
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("conpression", "gzip") .option("compression", "gzip")
.text(outputPath); .json(outputPath);
} }

View File

@@ -62,7 +62,7 @@
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg> <arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
<arg>--outputPath</arg><arg>${workingDir}/mergesRelation</arg> <arg>--outputPath</arg><arg>${workingDir}/mergesRelation</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
</spark> </spark>
@@ -86,7 +86,7 @@
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg> <arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
<arg>--outputPath</arg><arg>${workingDir}/relation</arg> <arg>--outputPath</arg><arg>${workingDir}/relation</arg>
<arg>--hdfsPath</arg><arg>${workingDir}/blacklist</arg> <arg>--hdfsPath</arg><arg>${workingDir}/blacklist</arg>
<arg>--mergesPath</arg><arg>${workingDir}/mergesRelation</arg> <arg>--mergesPath</arg><arg>${workingDir}/mergesRelation</arg>