Dataset-based implementation for the SparkCreateDedupRecord phase; fixed the datasource entity dump that supplements the dedup unit tests

Claudio Atzori 2020-04-21 12:06:08 +02:00
parent 5c9ef08a8e
commit 91e72a6944
4 changed files with 112 additions and 120 deletions
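
The SparkCreateDedupRecord hunks below move the phase onto the Spark Dataset API: the dedup records produced by DedupRecordFactory are mapped to JSON strings and written out as gzip-compressed JSON. A minimal, self-contained sketch of that write pattern, with illustrative class and method names that are not part of the project:

import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;

public class DatasetJsonWriteSketch {

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    // Serialize each record to a JSON string, then write the strings as
    // gzip-compressed JSON, mirroring the map/write chain in the hunk below.
    static <T> void writeAsGzipJson(Dataset<T> records, String outputPath) {
        records
                .map(
                        (MapFunction<T, String>) value -> OBJECT_MAPPER.writeValueAsString(value),
                        Encoders.STRING())
                .write()
                .mode(SaveMode.Overwrite)
                .option("compression", "gzip")
                .json(outputPath);
    }
}

Note that calling .json(...) on a Dataset<String> nests each serialized record under a single "value" column; writing with .text(...) instead would keep one raw JSON document per line.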

SparkCreateDedupRecord.java

@@ -11,8 +11,6 @@ import eu.dnetlib.pace.config.DedupConfig;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.dom4j.DocumentException;
@@ -72,12 +70,9 @@ public class SparkCreateDedupRecord extends AbstractSparkAction {
Class<OafEntity> clazz = ModelSupport.entityTypes.get(EntityType.valueOf(subEntity));
DedupRecordFactory.createDedupRecord(spark, mergeRelPath, entityPath, clazz)
.map(
(MapFunction<OafEntity, String>)
value -> OBJECT_MAPPER.writeValueAsString(value),
Encoders.STRING())
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
}

SparkUpdateEntity.java

@@ -33,7 +33,7 @@ public class SparkUpdateEntity extends AbstractSparkAction {
private static final Logger log = LoggerFactory.getLogger(SparkUpdateEntity.class);
final String IDJSONPATH = "$.id";
private static final String IDJSONPATH = "$.id";
public SparkUpdateEntity(ArgumentApplicationParser parser, SparkSession spark) {
super(parser, spark);
@@ -65,27 +65,25 @@ public class SparkUpdateEntity extends AbstractSparkAction {
log.info("workingPath: '{}'", workingPath);
log.info("dedupGraphPath: '{}'", dedupGraphPath);
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
// for each entity
ModelSupport.entityTypes.forEach(
(entity, clazz) -> {
final String outputPath = dedupGraphPath + "/" + entity;
(type, clazz) -> {
final String outputPath = dedupGraphPath + "/" + type;
removeOutputDir(spark, outputPath);
JavaRDD<String> sourceEntity =
sc.textFile(
DedupUtility.createEntityPath(
graphBasePath, entity.toString()));
DedupUtility.createEntityPath(graphBasePath, type.toString()));
if (mergeRelExists(workingPath, entity.toString())) {
if (mergeRelExists(workingPath, type.toString())) {
final String mergeRelPath =
DedupUtility.createMergeRelPath(
workingPath, "*", entity.toString());
DedupUtility.createMergeRelPath(workingPath, "*", type.toString());
final String dedupRecordPath =
DedupUtility.createDedupRecordPath(
workingPath, "*", entity.toString());
workingPath, "*", type.toString());
final Dataset<Relation> rel =
spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class));
@@ -107,7 +105,6 @@ public class SparkUpdateEntity extends AbstractSparkAction {
MapDocumentUtil.getJPathString(
IDJSONPATH, s),
s));
JavaRDD<String> map =
entitiesWithId
.leftOuterJoin(mergedIds)
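
In the SparkUpdateEntity hunks above, the raw entity dump is read as text, each record is keyed by its JSON id (extracted via the "$.id" JSON path), and the resulting pair RDD is left-outer-joined against the ids collected from the merge relations. A rough sketch of that join pattern, under the assumption that records matching a merged id are simply dropped in favour of the dedup records (the production code may instead flag them as deleted by inference):

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;

public class MergedIdJoinSketch {

    // entitiesWithId: (id, entity JSON); mergedIds: (merged id, any marker value).
    // Entities whose id appears among the merged ids are assumed here to be discarded.
    static JavaRDD<String> dropMerged(
            JavaPairRDD<String, String> entitiesWithId,
            JavaPairRDD<String, String> mergedIds) {
        return entitiesWithId
                .leftOuterJoin(mergedIds)                  // (id, (entity, Optional<marker>))
                .filter(t -> !t._2()._2().isPresent())     // keep ids with no merge match
                .map(t -> t._2()._1());                    // back to the entity JSON
    }
}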

SparkDedupTest.java

@@ -72,7 +72,7 @@ public class SparkDedupTest implements Serializable {
spark =
SparkSession.builder()
.appName(SparkCreateSimRels.class.getSimpleName())
.appName(SparkDedupTest.class.getSimpleName())
.master("local[*]")
.config(new SparkConf())
.getOrCreate();
@@ -300,8 +300,8 @@ public class SparkDedupTest implements Serializable {
long deletedSw =
jsc.textFile(testDedupGraphBasePath + "/software")
.filter(this::isDeletedByInference)
.count();
.filter(this::isDeletedByInference)
.count();
assertEquals(mergedOrgs, deletedOrgs);
assertEquals(mergedPubs, deletedPubs);
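
The adjusted assertions count, for each entity type, the records in the dedup graph flagged as deleted by inference and compare them with the number of merged records. The isDeletedByInference helper is not part of this diff; a purely illustrative guess at such a predicate, assuming each line is an entity JSON carrying the usual dataInfo.deletedbyinference flag:

// Hypothetical helper, not taken from this commit: treat a record as deleted by
// inference when its serialized dataInfo reports deletedbyinference = true.
public boolean isDeletedByInference(String json) {
    return json.contains("\"deletedbyinference\" : true")
            || json.contains("\"deletedbyinference\":true");
}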