forked from D-Net/dnet-hadoop
Use json loading in place of text loading + jackson mapper
This commit is contained in:
parent
1a3fc87599
commit
79e4728afd
|
@@ -45,11 +45,11 @@
 <!-- <artifactId>dhp-broker-events</artifactId>-->
 <!-- <version>${project.version}</version>-->
 <!-- </dependency>-->
-<!-- <dependency>-->
-<!-- <groupId>eu.dnetlib.dhp</groupId>-->
-<!-- <artifactId>dhp-dedup-openaire</artifactId>-->
-<!-- <version>${project.version}</version>-->
-<!-- </dependency>-->
+<dependency>
+    <groupId>eu.dnetlib.dhp</groupId>
+    <artifactId>dhp-dedup-openaire</artifactId>
+    <version>${project.version}</version>
+</dependency>
 <!-- <dependency>-->
 <!-- <groupId>eu.dnetlib.dhp</groupId>-->
 <!-- <artifactId>dhp-enrichment</artifactId>-->
@@ -128,13 +128,12 @@ abstract class AbstractSparkAction implements Serializable {
             .collect(Collectors.joining(SP_SEPARATOR));
     }
 
-    protected static MapFunction<String, Relation> patchRelFn() {
+    protected static MapFunction<Relation, Relation> patchRelFn() {
         return value -> {
-            final Relation rel = OBJECT_MAPPER.readValue(value, Relation.class);
-            if (rel.getDataInfo() == null) {
-                rel.setDataInfo(new DataInfo());
+            if (value.getDataInfo() == null) {
+                value.setDataInfo(new DataInfo());
             }
-            return rel;
+            return value;
         };
     }
 
@@ -68,22 +68,20 @@ public class SparkCopyOpenorgsMergeRels extends AbstractSparkAction {
         final String relationPath = DedupUtility.createEntityPath(graphBasePath, "relation");
 
         // collect organization merge relations from openorgs database
-        JavaRDD<Relation> mergeRelsRDD = spark
+        Dataset<Relation> relations = spark
             .read()
-            .textFile(relationPath)
+            .schema(Encoders.bean(Relation.class).schema())
+            .json(relationPath)
+            .as(Encoders.bean(Relation.class))
             .map(patchRelFn(), Encoders.bean(Relation.class))
-            .toJavaRDD()
             .filter(this::isOpenorgs) // take only openorgs relations
             .filter(this::isMergeRel); // take merges and isMergedIn relations
 
-        log.info("Number of Openorgs Merge Relations collected: {}", mergeRelsRDD.count());
-
-        final Dataset<Relation> relations = spark
-            .createDataset(
-                mergeRelsRDD.rdd(),
-                Encoders.bean(Relation.class));
+        relations.cache();
+        log.info("Number of Openorgs Merge Relations collected: {}", relations.count());
 
         saveParquet(relations, outputPath, SaveMode.Append);
+        relations.unpersist();
     }
 
     private boolean isMergeRel(Relation rel) {
@@ -69,7 +69,9 @@ public class SparkCopyOpenorgsSimRels extends AbstractSparkAction {
 
         Dataset<Relation> rawRels = spark
             .read()
-            .textFile(relationPath)
+            .schema(Encoders.bean(Relation.class).schema())
+            .json(relationPath)
+            .as(Encoders.bean(Relation.class))
             .map(patchRelFn(), Encoders.bean(Relation.class))
             .filter(this::filterOpenorgsRels);
 
@@ -58,7 +58,9 @@ public class SparkCopyRelationsNoOpenorgs extends AbstractSparkAction {
 
         JavaRDD<Relation> simRels = spark
             .read()
-            .textFile(relationPath)
+            .schema(Encoders.bean(Relation.class).schema())
+            .json(relationPath)
+            .as(Encoders.bean(Relation.class))
             .map(patchRelFn(), Encoders.bean(Relation.class))
             .toJavaRDD()
             .filter(x -> !isOpenorgsDedupRel(x));
@@ -111,7 +111,9 @@ public class SparkPrepareNewOrgs extends AbstractSparkAction {
         // collect diffrels from the raw graph relations: <other id, "diffRel">
         JavaPairRDD<String, String> diffRels = spark
             .read()
-            .textFile(relationPath)
+            .schema(Encoders.bean(Relation.class).schema())
+            .json(relationPath)
+            .as(Encoders.bean(Relation.class))
             .map(patchRelFn(), Encoders.bean(Relation.class))
             .toJavaRDD()
             .filter(r -> filterRels(r, ModelSupport.getMainType(EntityType.organization)))
@@ -133,7 +133,9 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
         // collect diffrels from the raw graph relations: <<best id, other id>, "diffRel">
         JavaRDD<Tuple2<Tuple2<String, String>, String>> diffRels = spark
             .read()
-            .textFile(relationPath)
+            .schema(Encoders.bean(Relation.class).schema())
+            .json(relationPath)
+            .as(Encoders.bean(Relation.class))
             .map(patchRelFn(), Encoders.bean(Relation.class))
             .toJavaRDD()
             .filter(r -> filterRels(r, "organization"))
Loading…
Reference in New Issue