forked from D-Net/dnet-hadoop
WIP: various refactors
This commit is contained in:
parent
521dd7f167
commit
326367eccc
|
@ -29,7 +29,7 @@ case class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Seria
|
||||||
val modelExtractor: (Dataset[String] => Dataset[Row]) = df => {
|
val modelExtractor: (Dataset[String] => Dataset[Row]) = df => {
|
||||||
df.withColumn("mapDocument", rowFromJsonUDF.apply(df.col(df.columns(0))))
|
df.withColumn("mapDocument", rowFromJsonUDF.apply(df.col(df.columns(0))))
|
||||||
.withColumn("identifier", new Column("mapDocument.identifier"))
|
.withColumn("identifier", new Column("mapDocument.identifier"))
|
||||||
.repartition(numPartitions, new Column("identifier"))
|
.repartition(new Column("identifier"))
|
||||||
.dropDuplicates("identifier")
|
.dropDuplicates("identifier")
|
||||||
.select("mapDocument.*")
|
.select("mapDocument.*")
|
||||||
}
|
}
|
||||||
|
@ -178,12 +178,11 @@ case class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Seria
|
||||||
|
|
||||||
val res = relBlocks.filter(col("match").equalTo(true))
|
val res = relBlocks.filter(col("match").equalTo(true))
|
||||||
.select(col("l.identifier").as("from"), col("r.identifier").as("to"))
|
.select(col("l.identifier").as("from"), col("r.identifier").as("to"))
|
||||||
.repartition(numPartitions)
|
.repartition()
|
||||||
.dropDuplicates()
|
.dropDuplicates()
|
||||||
|
|
||||||
// res.show(false)
|
// res.show(false)
|
||||||
res.union(res.select(col("to").as("from"), col("from").as("to")))
|
res.select(functions.struct("from", "to"))
|
||||||
.select(functions.struct("from", "to"))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
val processClusters: (Dataset[Row] => Dataset[Row]) = df => {
|
val processClusters: (Dataset[Row] => Dataset[Row]) = df => {
|
||||||
|
@ -193,7 +192,7 @@ case class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Seria
|
||||||
df.filter(functions.size(new Column("block")).geq(new Literal(2, DataTypes.IntegerType)))
|
df.filter(functions.size(new Column("block")).geq(new Literal(2, DataTypes.IntegerType)))
|
||||||
.withColumn("relations", processBlock(df.sqlContext.sparkContext).apply(new Column("block")))
|
.withColumn("relations", processBlock(df.sqlContext.sparkContext).apply(new Column("block")))
|
||||||
.select(functions.explode(new Column("relations")).as("relation"))
|
.select(functions.explode(new Column("relations")).as("relation"))
|
||||||
.repartition(numPartitions, new Column("relation"))
|
.repartition(new Column("relation"))
|
||||||
.dropDuplicates("relation")
|
.dropDuplicates("relation")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -145,7 +145,6 @@ public class BlockProcessor {
|
||||||
final String type = dedupConf.getWf().getEntityType();
|
final String type = dedupConf.getWf().getEntityType();
|
||||||
|
|
||||||
context.emit(type, from, to);
|
context.emit(type, from, to);
|
||||||
context.emit(type, to, from);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -81,8 +81,6 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
||||||
final String outputPath = DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity);
|
final String outputPath = DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity);
|
||||||
removeOutputDir(spark, outputPath);
|
removeOutputDir(spark, outputPath);
|
||||||
|
|
||||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
|
||||||
|
|
||||||
SparkDedupConfig sparkConfig = new SparkDedupConfig(dedupConf, numPartitions);
|
SparkDedupConfig sparkConfig = new SparkDedupConfig(dedupConf, numPartitions);
|
||||||
|
|
||||||
Dataset<?> simRels = spark
|
Dataset<?> simRels = spark
|
||||||
|
|
|
@ -190,11 +190,11 @@ public class SparkDedupTest implements Serializable {
|
||||||
System.out.println("ds_simrel = " + ds_simrel);
|
System.out.println("ds_simrel = " + ds_simrel);
|
||||||
System.out.println("orp_simrel = " + orp_simrel);
|
System.out.println("orp_simrel = " + orp_simrel);
|
||||||
|
|
||||||
assertEquals(3076, orgs_simrel);
|
assertEquals(1538, orgs_simrel);
|
||||||
assertEquals(7046, pubs_simrel);
|
assertEquals(3523, pubs_simrel);
|
||||||
assertEquals(336, sw_simrel);
|
assertEquals(168, sw_simrel);
|
||||||
assertEquals(442, ds_simrel);
|
assertEquals(221, ds_simrel);
|
||||||
assertEquals(6784, orp_simrel);
|
assertEquals(3392, orp_simrel);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -239,10 +239,10 @@ public class SparkDedupTest implements Serializable {
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
// entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
|
// entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
|
||||||
assertEquals(3076, orgs_simrel);
|
assertEquals(1538, orgs_simrel);
|
||||||
assertEquals(7046, pubs_simrel);
|
assertEquals(3523, pubs_simrel);
|
||||||
assertEquals(442, ds_simrel);
|
assertEquals(221, ds_simrel);
|
||||||
assertEquals(6784, orp_simrel);
|
assertEquals(3392, orp_simrel);
|
||||||
// System.out.println("orgs_simrel = " + orgs_simrel);
|
// System.out.println("orgs_simrel = " + orgs_simrel);
|
||||||
// System.out.println("pubs_simrel = " + pubs_simrel);
|
// System.out.println("pubs_simrel = " + pubs_simrel);
|
||||||
// System.out.println("ds_simrel = " + ds_simrel);
|
// System.out.println("ds_simrel = " + ds_simrel);
|
||||||
|
@ -272,7 +272,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
&& rel.getTarget().equalsIgnoreCase(whiteList.get(1).split(WHITELIST_SEPARATOR)[1]))
|
&& rel.getTarget().equalsIgnoreCase(whiteList.get(1).split(WHITELIST_SEPARATOR)[1]))
|
||||||
.count() > 0);
|
.count() > 0);
|
||||||
|
|
||||||
assertEquals(338, sw_simrel.count());
|
assertEquals(170, sw_simrel.count());
|
||||||
// System.out.println("sw_simrel = " + sw_simrel.count());
|
// System.out.println("sw_simrel = " + sw_simrel.count());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue