spark dedup test fixed

This commit is contained in:
miconis 2020-04-21 10:19:04 +02:00
parent d772d967aa
commit 5c9ef08a8e
1 changed file with 17 additions and 1 deletion

View File

@@ -272,7 +272,17 @@ public class SparkDedupTest implements Serializable {
.distinct() .distinct()
.count(); .count();
assertEquals(831, publications); long mergedSw =
spark.read()
.load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel")
.as(Encoders.bean(Relation.class))
.where("relClass=='merges'")
.javaRDD()
.map(Relation::getTarget)
.distinct()
.count();
assertEquals(897, publications);
assertEquals(835, organizations); assertEquals(835, organizations);
assertEquals(100, projects); assertEquals(100, projects);
assertEquals(100, datasource); assertEquals(100, datasource);
@@ -288,8 +298,14 @@ public class SparkDedupTest implements Serializable {
.filter(this::isDeletedByInference) .filter(this::isDeletedByInference)
.count(); .count();
long deletedSw =
jsc.textFile(testDedupGraphBasePath + "/software")
.filter(this::isDeletedByInference)
.count();
assertEquals(mergedOrgs, deletedOrgs); assertEquals(mergedOrgs, deletedOrgs);
assertEquals(mergedPubs, deletedPubs); assertEquals(mergedPubs, deletedPubs);
assertEquals(mergedSw, deletedSw);
} }
@Test @Test