2020-03-27 10:42:17 +01:00
|
|
|
package eu.dnetlib.dhp.oa.dedup;
|
2019-12-10 14:57:16 +01:00
|
|
|
|
2020-04-18 12:06:23 +02:00
|
|
|
import com.fasterxml.jackson.databind.DeserializationFeature;
|
|
|
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
2019-12-10 14:57:16 +01:00
|
|
|
import com.google.common.collect.Lists;
|
2020-04-18 12:06:23 +02:00
|
|
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
2019-12-10 14:57:16 +01:00
|
|
|
import eu.dnetlib.dhp.schema.oaf.*;
|
|
|
|
import eu.dnetlib.pace.config.DedupConfig;
|
2020-04-18 12:42:58 +02:00
|
|
|
import java.util.Collection;
|
|
|
|
import java.util.Iterator;
|
2020-04-18 12:06:23 +02:00
|
|
|
import org.apache.spark.api.java.function.FilterFunction;
|
|
|
|
import org.apache.spark.api.java.function.MapFunction;
|
|
|
|
import org.apache.spark.api.java.function.MapGroupsFunction;
|
|
|
|
import org.apache.spark.sql.Dataset;
|
2019-12-10 14:57:16 +01:00
|
|
|
import org.apache.spark.sql.Encoders;
|
|
|
|
import org.apache.spark.sql.SparkSession;
|
|
|
|
import scala.Tuple2;
|
|
|
|
|
|
|
|
public class DedupRecordFactory {
|
|
|
|
|
2020-04-18 12:42:58 +02:00
|
|
|
protected static final ObjectMapper OBJECT_MAPPER =
|
|
|
|
new com.fasterxml.jackson.databind.ObjectMapper()
|
|
|
|
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
2019-12-12 15:18:48 +01:00
|
|
|
|
2020-04-18 12:06:23 +02:00
|
|
|
public static <T extends OafEntity> Dataset<T> createDedupRecord(
|
2020-04-18 12:42:58 +02:00
|
|
|
final SparkSession spark,
|
|
|
|
final String mergeRelsInputPath,
|
|
|
|
final String entitiesInputPath,
|
|
|
|
final Class<T> clazz,
|
|
|
|
final DedupConfig dedupConf) {
|
2019-12-12 15:18:48 +01:00
|
|
|
|
2020-04-18 12:06:23 +02:00
|
|
|
long ts = System.currentTimeMillis();
|
2019-12-10 14:57:16 +01:00
|
|
|
|
2020-04-18 12:42:58 +02:00
|
|
|
// <id, json_entity>
|
|
|
|
Dataset<Tuple2<String, T>> entities =
|
|
|
|
spark.read()
|
|
|
|
.textFile(entitiesInputPath)
|
|
|
|
.map(
|
|
|
|
(MapFunction<String, Tuple2<String, T>>)
|
|
|
|
it -> {
|
|
|
|
T entity = OBJECT_MAPPER.readValue(it, clazz);
|
|
|
|
return new Tuple2<>(entity.getId(), entity);
|
|
|
|
},
|
|
|
|
Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)));
|
|
|
|
|
|
|
|
// <source, target>: source is the dedup_id, target is the id of the mergedIn
|
|
|
|
Dataset<Tuple2<String, String>> mergeRels =
|
|
|
|
spark.read()
|
|
|
|
.load(mergeRelsInputPath)
|
|
|
|
.as(Encoders.bean(Relation.class))
|
|
|
|
.where("relClass == 'merges'")
|
|
|
|
.map(
|
|
|
|
(MapFunction<Relation, Tuple2<String, String>>)
|
|
|
|
r -> new Tuple2<>(r.getSource(), r.getTarget()),
|
|
|
|
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
|
|
|
|
|
|
|
|
// <dedup_id, json_entity_merged>
|
|
|
|
return mergeRels
|
|
|
|
.joinWith(entities, mergeRels.col("_1").equalTo(entities.col("_1")), "left_outer")
|
|
|
|
.filter(
|
|
|
|
(FilterFunction<Tuple2<Tuple2<String, String>, Tuple2<String, T>>>)
|
|
|
|
value -> value._2() != null)
|
|
|
|
.map(
|
|
|
|
(MapFunction<Tuple2<Tuple2<String, String>, Tuple2<String, T>>, T>)
|
|
|
|
value -> value._2()._2(),
|
|
|
|
Encoders.kryo(clazz))
|
2020-04-18 12:06:23 +02:00
|
|
|
.groupByKey((MapFunction<T, String>) value -> value.getId(), Encoders.STRING())
|
2020-04-18 12:42:58 +02:00
|
|
|
.mapGroups(
|
|
|
|
(MapGroupsFunction<String, T, T>)
|
|
|
|
(key, values) -> entityMerger(key, values, ts, clazz),
|
|
|
|
Encoders.bean(clazz));
|
2019-12-10 14:57:16 +01:00
|
|
|
}
|
|
|
|
|
2020-04-18 12:42:58 +02:00
|
|
|
private static <T extends OafEntity> T entityMerger(
|
|
|
|
String id, Iterator<T> entities, final long ts, Class<T> clazz) {
|
2020-04-18 12:06:23 +02:00
|
|
|
try {
|
|
|
|
T entity = clazz.newInstance();
|
|
|
|
entity.setId(id);
|
|
|
|
if (entity.getDataInfo() == null) {
|
|
|
|
entity.setDataInfo(new DataInfo());
|
|
|
|
}
|
|
|
|
entity.getDataInfo().setTrust("0.9");
|
|
|
|
entity.setLastupdatetimestamp(ts);
|
|
|
|
|
|
|
|
final Collection<String> dates = Lists.newArrayList();
|
2020-04-18 12:42:58 +02:00
|
|
|
entities.forEachRemaining(
|
|
|
|
e -> {
|
|
|
|
entity.mergeFrom(e);
|
|
|
|
if (ModelSupport.isSubClass(e, Result.class)) {
|
|
|
|
Result r1 = (Result) e;
|
|
|
|
Result er = (Result) entity;
|
|
|
|
er.setAuthor(DedupUtility.mergeAuthor(er.getAuthor(), r1.getAuthor()));
|
|
|
|
|
|
|
|
if (er.getDateofacceptance() != null) {
|
|
|
|
dates.add(r1.getDateofacceptance().getValue());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
});
|
2019-12-13 12:20:35 +01:00
|
|
|
|
2020-04-18 12:06:23 +02:00
|
|
|
if (ModelSupport.isSubClass(entity, Result.class)) {
|
|
|
|
((Result) entity).setDateofacceptance(DatePicker.pick(dates));
|
|
|
|
}
|
|
|
|
return entity;
|
|
|
|
} catch (IllegalAccessException | InstantiationException e) {
|
|
|
|
throw new RuntimeException(e);
|
2019-12-13 12:20:35 +01:00
|
|
|
}
|
2019-12-10 14:57:16 +01:00
|
|
|
}
|
|
|
|
}
|