forked from D-Net/dnet-hadoop
Dataset-based implementation of the SparkCreateDedupRecord phase; fixed the datasource entity dump supplementing the dedup unit tests
parent 5c9ef08a8e
commit 91e72a6944
@@ -11,8 +11,6 @@ import eu.dnetlib.pace.config.DedupConfig;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.dom4j.DocumentException;
@@ -72,12 +70,9 @@ public class SparkCreateDedupRecord extends AbstractSparkAction {
        Class<OafEntity> clazz = ModelSupport.entityTypes.get(EntityType.valueOf(subEntity));

        DedupRecordFactory.createDedupRecord(spark, mergeRelPath, entityPath, clazz)
                .map(
                        (MapFunction<OafEntity, String>)
                                value -> OBJECT_MAPPER.writeValueAsString(value),
                        Encoders.STRING())
                .write()
                .mode(SaveMode.Overwrite)
                .option("compression", "gzip")
                .json(outputPath);
    }
}
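For context, the chain above is the standard Dataset write pattern: each merged OafEntity is serialized to JSON with Jackson and dumped as gzip-compressed JSON. Below is a minimal sketch of the same pattern; DedupRecordDumpSketch, DummyEntity and the /tmp output path are hypothetical stand-ins, not code from this commit.

import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.Arrays;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class DedupRecordDumpSketch {

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    // Hypothetical stand-in for OafEntity: any JavaBean works with Encoders.bean().
    public static class DummyEntity {
        private String id;

        public String getId() { return id; }

        public void setId(String id) { this.id = id; }
    }

    public static void main(String[] args) {
        SparkSession spark =
                SparkSession.builder()
                        .appName(DedupRecordDumpSketch.class.getSimpleName())
                        .master("local[*]")
                        .getOrCreate();

        // Toy input standing in for the merged records produced by DedupRecordFactory.
        DummyEntity a = new DummyEntity();
        a.setId("dedup_1");
        DummyEntity b = new DummyEntity();
        b.setId("dedup_2");
        Dataset<DummyEntity> dedupRecords =
                spark.createDataset(Arrays.asList(a, b), Encoders.bean(DummyEntity.class));

        // Same write path as the commit: serialize each bean to JSON, write gzip-compressed JSON.
        dedupRecords
                .map(
                        (MapFunction<DummyEntity, String>)
                                value -> OBJECT_MAPPER.writeValueAsString(value),
                        Encoders.STRING())
                .write()
                .mode(SaveMode.Overwrite)
                .option("compression", "gzip")
                .json("/tmp/dedup_record_sketch"); // hypothetical output path

        spark.stop();
    }
}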
@@ -33,7 +33,7 @@ public class SparkUpdateEntity extends AbstractSparkAction {

    private static final Logger log = LoggerFactory.getLogger(SparkUpdateEntity.class);

-   final String IDJSONPATH = "$.id";
+   private static final String IDJSONPATH = "$.id";

    public SparkUpdateEntity(ArgumentApplicationParser parser, SparkSession spark) {
        super(parser, spark);
@@ -65,27 +65,25 @@ public class SparkUpdateEntity extends AbstractSparkAction {
        log.info("workingPath: '{}'", workingPath);
        log.info("dedupGraphPath: '{}'", dedupGraphPath);

-       final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+       final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

        // for each entity
        ModelSupport.entityTypes.forEach(
-               (entity, clazz) -> {
-                   final String outputPath = dedupGraphPath + "/" + entity;
+               (type, clazz) -> {
+                   final String outputPath = dedupGraphPath + "/" + type;
                    removeOutputDir(spark, outputPath);

                    JavaRDD<String> sourceEntity =
                            sc.textFile(
-                                   DedupUtility.createEntityPath(
-                                           graphBasePath, entity.toString()));
+                                   DedupUtility.createEntityPath(graphBasePath, type.toString()));

-                   if (mergeRelExists(workingPath, entity.toString())) {
+                   if (mergeRelExists(workingPath, type.toString())) {

                        final String mergeRelPath =
-                               DedupUtility.createMergeRelPath(
-                                       workingPath, "*", entity.toString());
+                               DedupUtility.createMergeRelPath(workingPath, "*", type.toString());
                        final String dedupRecordPath =
                                DedupUtility.createDedupRecordPath(
-                                       workingPath, "*", entity.toString());
+                                       workingPath, "*", type.toString());

                        final Dataset<Relation> rel =
                                spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class));
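The changes above rename the lambda parameter from entity to type and obtain the JavaSparkContext via JavaSparkContext.fromSparkContext, which wraps the session's existing SparkContext instead of constructing a second wrapper around it. A rough sketch of that per-entity-type loop follows, with a hypothetical EntityType enum and /tmp paths standing in for ModelSupport.entityTypes and DedupUtility.

import java.util.EnumMap;
import java.util.Map;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;

public class PerEntityTypeSketch {

    // Hypothetical stand-in for the model's entity types.
    enum EntityType { publication, dataset, software }

    public static void main(String[] args) {
        SparkSession spark =
                SparkSession.builder()
                        .appName(PerEntityTypeSketch.class.getSimpleName())
                        .master("local[*]")
                        .getOrCreate();

        // Wrap the session's existing SparkContext rather than creating a new one.
        final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

        final String graphBasePath = "/tmp/graph";       // hypothetical input layout: <base>/<type>
        final String dedupGraphPath = "/tmp/dedupGraph"; // hypothetical output layout: <base>/<type>

        // Hypothetical stand-in for ModelSupport.entityTypes: entity type -> model class.
        final Map<EntityType, Class<?>> entityTypes = new EnumMap<>(EntityType.class);
        entityTypes.put(EntityType.publication, Object.class);
        entityTypes.put(EntityType.dataset, Object.class);
        entityTypes.put(EntityType.software, Object.class);

        // Same shape as the commit: iterate the (type, clazz) map and wire up one job per type.
        entityTypes.forEach(
                (type, clazz) -> {
                    final String outputPath = dedupGraphPath + "/" + type;
                    // textFile is lazy: nothing is read here; the real job would merge this
                    // input with the dedup records and write the result to outputPath.
                    JavaRDD<String> sourceEntity = sc.textFile(graphBasePath + "/" + type);
                    System.out.println(type + ": " + graphBasePath + "/" + type + " -> " + outputPath);
                });

        spark.stop();
    }
}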
@@ -107,7 +105,6 @@ public class SparkUpdateEntity extends AbstractSparkAction {
                                        MapDocumentUtil.getJPathString(
                                                IDJSONPATH, s),
                                        s));

                        JavaRDD<String> map =
                                entitiesWithId
                                        .leftOuterJoin(mergedIds)
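The join above pairs each raw entity with the id extracted at IDJSONPATH and left-joins it against the ids of merged entities, so records that were absorbed into a dedup record can be told apart from untouched ones. The sketch below shows the generic JavaPairRDD leftOuterJoin pattern with toy data and a naive id extractor; the final filter is only illustrative of how the Optional side of the join can be used, not the commit's exact handling of merged records.

import java.util.Arrays;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;

public class LeftOuterJoinSketch {

    public static void main(String[] args) {
        SparkSession spark =
                SparkSession.builder()
                        .appName(LeftOuterJoinSketch.class.getSimpleName())
                        .master("local[*]")
                        .getOrCreate();
        JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

        // Toy entity dump: JSON strings of the form {"id":"..."} standing in for a graph dump.
        JavaRDD<String> sourceEntity =
                sc.parallelize(Arrays.asList("{\"id\":\"A\"}", "{\"id\":\"B\"}", "{\"id\":\"C\"}"));

        // Ids of entities that were merged into some dedup record (here only "B").
        JavaPairRDD<String, String> mergedIds =
                sc.parallelize(Arrays.asList("B")).mapToPair(id -> new Tuple2<>(id, "dedupId"));

        // Pair each entity with its id; a fixed substring stands in for the json-path lookup.
        JavaPairRDD<String, String> entitiesWithId =
                sourceEntity.mapToPair(s -> new Tuple2<>(s.substring(7, 8), s));

        // Left outer join: entities whose id appears in mergedIds carry a defined Optional.
        JavaRDD<String> notMerged =
                entitiesWithId
                        .leftOuterJoin(mergedIds)
                        .filter(t -> !t._2()._2().isPresent()) // keep only entities never merged
                        .map(t -> t._2()._1());

        notMerged.collect().forEach(System.out::println); // prints the "A" and "C" records
        spark.stop();
    }
}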
@@ -72,7 +72,7 @@ public class SparkDedupTest implements Serializable {

        spark =
                SparkSession.builder()
-                       .appName(SparkCreateSimRels.class.getSimpleName())
+                       .appName(SparkDedupTest.class.getSimpleName())
                        .master("local[*]")
                        .config(new SparkConf())
                        .getOrCreate();
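The test fixture now names the Spark session after the test class itself. For reference, a minimal JUnit 5 fixture of the same shape, assuming spark-sql and junit-jupiter on the test classpath (the trivial range-count check is only a placeholder test):

import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;

public class LocalSparkFixtureSketch {

    private static SparkSession spark;

    @BeforeAll
    public static void setUp() {
        // Name the session after the test class, as the fix above does.
        spark =
                SparkSession.builder()
                        .appName(LocalSparkFixtureSketch.class.getSimpleName())
                        .master("local[*]")
                        .config(new SparkConf())
                        .getOrCreate();
    }

    @Test
    public void sessionStarts() {
        // Sanity check: the local session is up and can run a trivial job.
        Assertions.assertEquals(10L, spark.range(10).count());
    }

    @AfterAll
    public static void tearDown() {
        spark.stop();
    }
}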
File diff suppressed because one or more lines are too long