2020-10-20 12:19:46 +02:00
|
|
|
|
2020-03-27 10:42:17 +01:00
|
|
|
package eu.dnetlib.dhp.oa.dedup;
|
2019-12-10 14:57:16 +01:00
|
|
|
|
2024-01-23 08:47:12 +01:00
|
|
|
import java.util.*;
|
|
|
|
import java.util.stream.Stream;
|
|
|
|
|
|
|
|
import org.apache.commons.beanutils.BeanUtils;
|
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
|
import org.apache.spark.api.java.function.FlatMapFunction;
|
2024-04-11 15:49:29 +02:00
|
|
|
import org.apache.spark.api.java.function.FlatMapGroupsFunction;
|
2024-01-23 08:47:12 +01:00
|
|
|
import org.apache.spark.api.java.function.MapFunction;
|
|
|
|
import org.apache.spark.api.java.function.ReduceFunction;
|
|
|
|
import org.apache.spark.sql.*;
|
|
|
|
|
2022-11-09 14:20:59 +01:00
|
|
|
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
2020-10-08 17:29:29 +02:00
|
|
|
import eu.dnetlib.dhp.oa.merge.AuthorMerger;
|
2020-09-29 15:31:46 +02:00
|
|
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
2023-12-22 09:57:30 +01:00
|
|
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
2024-03-26 10:42:10 +01:00
|
|
|
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
|
2020-09-29 15:31:46 +02:00
|
|
|
import scala.Tuple2;
|
2023-12-22 09:57:30 +01:00
|
|
|
import scala.Tuple3;
|
|
|
|
import scala.collection.JavaConversions;
|
2020-07-22 17:29:48 +02:00
|
|
|
|
2023-12-22 09:57:30 +01:00
|
|
|
/**
 * Factory for the "dedup record" representative entities: given the merge relations produced by the
 * deduplication phase and the original entities, it builds one merged entity per dedup group plus one
 * tombstone (deleted-by-inference) alias record for each merged-in id whose payload was not found.
 */
public class DedupRecordFactory {

	/**
	 * Accumulator used when reducing a dedup group: tracks the group id, the alias ids whose entity
	 * payload was missing, the distinct acceptance dates seen so far, and the current merged entity.
	 * NOTE(review): not referenced by the code visible in this file — presumably used by a sibling
	 * reduce-based implementation or kept for compatibility; verify before removing.
	 */
	public static final class DedupRecordReduceState {

		// id of the dedup group (the "source" of the 'merges' relations)
		public final String dedupId;

		// ids that appear in the merge relations but have no entity payload
		public final ArrayList<String> aliases = new ArrayList<>();

		// distinct, non-blank dateofacceptance values observed among the group's Result entities
		public final HashSet<String> acceptanceDate = new HashSet<>();

		// current merged entity for the group (may be null when only aliases were seen)
		public OafEntity entity;

		/**
		 * @param dedupId the dedup group id
		 * @param id      the merged-in entity id (recorded as an alias when entity is null)
		 * @param entity  the entity payload, or null when the id had no matching entity
		 */
		public DedupRecordReduceState(String dedupId, String id, OafEntity entity) {
			this.dedupId = dedupId;
			this.entity = entity;
			if (entity == null) {
				// no payload: remember the id so an alias record can still be emitted
				aliases.add(id);
			} else {
				if (Result.class.isAssignableFrom(entity.getClass())) {
					Result result = (Result) entity;
					if (result.getDateofacceptance() != null
						&& StringUtils.isNotBlank(result.getDateofacceptance().getValue())) {
						acceptanceDate.add(result.getDateofacceptance().getValue());
					}
				}
			}
		}

		public String getDedupId() {
			return dedupId;
		}
	}

	// Groups with at least this many distinct acceptance dates are considered wrongly
	// clustered and are skipped entirely (see the guard in createDedupRecord).
	private static final int MAX_ACCEPTANCE_DATE = 20;

	// utility class: not instantiable
	private DedupRecordFactory() {
	}

	/**
	 * Builds the dedup records for all groups found in the merge relations.
	 * <p>
	 * Pipeline: read the entities as <id, kryo(entity)> pairs, join them (left) with the
	 * 'merges' relations on id, group by dedup id, then merge each group's entities into a single
	 * representative record. Groups with no entity payloads, or with {@link #MAX_ACCEPTANCE_DATE}
	 * or more distinct acceptance dates, produce no output. For every alias id (merge target with
	 * no payload) a deleted-by-inference copy of the merged record is also emitted.
	 *
	 * @param spark               the active SparkSession
	 * @param dataInfo            provenance to stamp on every produced record
	 * @param mergeRelsInputPath  path of the merge relations (read with the default format)
	 * @param entitiesInputPath   path of the entities, as JSON matching the bean schema of clazz
	 * @param clazz               concrete OafEntity type being deduplicated
	 * @return a Dataset with one merged record per valid group plus one alias record per missing id
	 */
	public static Dataset<OafEntity> createDedupRecord(
		final SparkSession spark,
		final DataInfo dataInfo,
		final String mergeRelsInputPath,
		final String entitiesInputPath,
		final Class<OafEntity> clazz) {

		// single timestamp shared by all records produced in this run
		final long ts = System.currentTimeMillis();
		final Encoder<OafEntity> beanEncoder = Encoders.bean(clazz);
		// kryo keeps the full object graph through the join/groupBy below
		final Encoder<OafEntity> kryoEncoder = Encoders.kryo(clazz);

		// <id, json_entity>
		Dataset<Row> entities = spark
			.read()
			.schema(Encoders.bean(clazz).schema())
			.json(entitiesInputPath)
			.as(beanEncoder)
			.map(
				(MapFunction<OafEntity, Tuple2<String, OafEntity>>) entity -> {
					return new Tuple2<>(entity.getId(), entity);
				},
				Encoders.tuple(Encoders.STRING(), kryoEncoder))
			.selectExpr("_1 AS id", "_2 AS kryoObject");

		// <source, target>: source is the dedup_id, target is the id of the mergedIn
		Dataset<Row> mergeRels = spark
			.read()
			.load(mergeRelsInputPath)
			.where("relClass == 'merges'")
			.selectExpr("source as dedupId", "target as id");

		return mergeRels
			// left join: relations whose target has no entity payload keep a null kryoObject
			.join(entities, JavaConversions.asScalaBuffer(Collections.singletonList("id")), "left")
			.select("dedupId", "id", "kryoObject")
			.as(Encoders.tuple(Encoders.STRING(), Encoders.STRING(), kryoEncoder))
			.groupByKey((MapFunction<Tuple3<String, String, OafEntity>, String>) Tuple3::_1, Encoders.STRING())
			.flatMapGroups(
				(FlatMapGroupsFunction<String, Tuple3<String, String, OafEntity>, OafEntity>) (dedupId, it) -> {
					if (!it.hasNext())
						return Collections.emptyIterator();

					// entities found for this group, to be merged into the representative record
					final ArrayList<OafEntity> cliques = new ArrayList<>();

					// ids with no payload: they get deleted-by-inference alias records
					final ArrayList<String> aliases = new ArrayList<>();

					// distinct acceptance dates, used only to detect over-merged groups
					final HashSet<String> acceptanceDate = new HashSet<>();

					while (it.hasNext()) {
						Tuple3<String, String, OafEntity> t = it.next();
						OafEntity entity = t._3();

						if (entity == null) {
							aliases.add(t._2());
						} else {
							cliques.add(entity);

							// stop collecting dates once the threshold is reached;
							// the group is discarded below anyway
							if (acceptanceDate.size() < MAX_ACCEPTANCE_DATE) {
								if (Result.class.isAssignableFrom(entity.getClass())) {
									Result result = (Result) entity;
									if (result.getDateofacceptance() != null
										&& StringUtils.isNotBlank(result.getDateofacceptance().getValue())) {
										acceptanceDate.add(result.getDateofacceptance().getValue());
									}
								}
							}
						}

					}

					// too many distinct dates => likely a wrong cluster; no payloads => nothing to merge.
					// Note: in both cases the aliases are dropped as well.
					if (acceptanceDate.size() >= MAX_ACCEPTANCE_DATE || cliques.isEmpty()) {
						return Collections.emptyIterator();
					}

					OafEntity mergedEntity = MergeUtils.mergeGroup(dedupId, cliques.iterator());
					// dedup records do not have date of transformation attribute
					mergedEntity.setDateoftransformation(null);

					// emit the representative record first, then one alias record per missing id
					return Stream
						.concat(
							Stream
								.of(dedupId)
								.map(id -> createDedupOafEntity(id, mergedEntity, dataInfo, ts)),
							aliases
								.stream()
								.map(id -> createMergedDedupAliasOafEntity(id, mergedEntity, dataInfo, ts)))
						.iterator();

				}, beanEncoder);
	}

	/**
	 * Returns a shallow clone of {@code base} re-identified as {@code id} and stamped with the
	 * given provenance and timestamp. Any cloning failure is rethrown as a RuntimeException.
	 */
	private static OafEntity createDedupOafEntity(String id, OafEntity base, DataInfo dataInfo, long ts) {
		try {
			// cloneBean is a property-level (shallow) copy; nested objects are shared with base
			OafEntity res = (OafEntity) BeanUtils.cloneBean(base);
			res.setId(id);
			res.setDataInfo(dataInfo);
			res.setLastupdatetimestamp(ts);
			return res;
		} catch (Exception e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Like {@link #createDedupOafEntity} but marks the record as deleted-by-inference: used for
	 * merge targets whose original payload was missing, so downstream consumers still see the id.
	 */
	private static OafEntity createMergedDedupAliasOafEntity(String id, OafEntity base, DataInfo dataInfo, long ts) {
		try {
			OafEntity res = createDedupOafEntity(id, base, dataInfo, ts);
			// clone dataInfo so the flag does not leak into the shared instance
			DataInfo ds = (DataInfo) BeanUtils.cloneBean(dataInfo);
			ds.setDeletedbyinference(true);
			res.setDataInfo(ds);
			return res;
		} catch (Exception e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Merges two entities of the same group into one, keeping the one ranked first by
	 * {@link IdentifierComparator} as the base. For Result entities the author lists of both
	 * sides are additionally combined via AuthorMerger.
	 *
	 * @param entity    current accumulated entity
	 * @param duplicate next entity to fold in (ignored when null)
	 * @return the merged entity
	 */
	private static OafEntity reduceEntity(OafEntity entity, OafEntity duplicate) {

		if (duplicate == null) {
			return entity;
		}

		int compare = new IdentifierComparator<>()
			.compare(Identifier.newInstance(entity), Identifier.newInstance(duplicate));

		// keep the better-ranked entity as the merge base
		if (compare > 0) {
			OafEntity swap = duplicate;
			duplicate = entity;
			entity = swap;
		}

		entity = MergeUtils.checkedMerge(entity, duplicate);

		if (ModelSupport.isSubClass(duplicate, Result.class)) {
			Result re = (Result) entity;
			Result rd = (Result) duplicate;

			// merge the author lists of both sides (skipping nulls)
			List<List<Author>> authors = new ArrayList<>();
			if (re.getAuthor() != null) {
				authors.add(re.getAuthor());
			}
			if (rd.getAuthor() != null) {
				authors.add(rd.getAuthor());
			}

			re.setAuthor(AuthorMerger.merge(authors));
		}

		return entity;
	}

	/**
	 * Folds an iterator of (id, entity) pairs into a single merged entity identified by {@code id}.
	 *
	 * @param id       id to assign to the merged record
	 * @param entities non-empty iterator of pairs; the first pair's entity is expected non-null
	 *                 (NOTE(review): a null first entity would NPE below — confirm callers guarantee this)
	 * @param ts       lastupdatetimestamp to stamp on the result
	 * @param dataInfo provenance to stamp on the result
	 * @param clazz    entity type (not used in the body; presumably kept for the callers' benefit)
	 * @return the merged entity
	 */
	public static <T extends OafEntity> T entityMerger(
		String id, Iterator<Tuple2<String, T>> entities, long ts, DataInfo dataInfo, Class<T> clazz) {
		T base = entities.next()._2();

		while (entities.hasNext()) {
			T duplicate = entities.next()._2();
			if (duplicate != null)
				base = (T) reduceEntity(base, duplicate);
		}

		base.setId(id);
		base.setDataInfo(dataInfo);
		base.setLastupdatetimestamp(ts);

		return base;
	}

}
|