2019-11-07 12:47:12 +01:00
|
|
|
package eu.dnetlib;
|
|
|
|
|
2020-09-29 12:01:25 +02:00
|
|
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
2020-06-11 10:46:46 +02:00
|
|
|
import com.google.common.hash.Hashing;
|
2019-11-07 12:47:12 +01:00
|
|
|
import eu.dnetlib.graph.GraphProcessor;
|
|
|
|
import eu.dnetlib.pace.config.DedupConfig;
|
2020-12-04 15:41:31 +01:00
|
|
|
import eu.dnetlib.pace.config.WfConfig;
|
|
|
|
import eu.dnetlib.pace.model.Field;
|
2019-11-07 12:47:12 +01:00
|
|
|
import eu.dnetlib.pace.model.MapDocument;
|
2020-12-04 15:41:31 +01:00
|
|
|
import eu.dnetlib.pace.model.MapDocumentComparator;
|
|
|
|
import eu.dnetlib.pace.tree.JsonListMatch;
|
|
|
|
import eu.dnetlib.pace.tree.LevensteinTitle;
|
|
|
|
import eu.dnetlib.pace.tree.SizeMatch;
|
|
|
|
import eu.dnetlib.pace.tree.TitleVersionMatch;
|
|
|
|
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
2019-11-07 12:47:12 +01:00
|
|
|
import eu.dnetlib.pace.util.BlockProcessor;
|
2019-12-13 11:30:02 +01:00
|
|
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
2020-12-04 15:41:31 +01:00
|
|
|
import eu.dnetlib.pace.util.Reporter;
|
2019-11-07 12:47:12 +01:00
|
|
|
import eu.dnetlib.pace.utils.Utility;
|
|
|
|
import eu.dnetlib.reporter.SparkReporter;
|
2020-06-11 10:46:46 +02:00
|
|
|
import eu.dnetlib.support.Block;
|
2019-11-07 12:47:12 +01:00
|
|
|
import eu.dnetlib.support.ConnectedComponent;
|
2020-06-11 10:46:46 +02:00
|
|
|
import eu.dnetlib.support.Relation;
|
2020-12-04 15:41:31 +01:00
|
|
|
import org.apache.commons.lang3.StringUtils;
|
2019-11-07 12:47:12 +01:00
|
|
|
import org.apache.commons.logging.Log;
|
|
|
|
import org.apache.commons.logging.LogFactory;
|
|
|
|
import org.apache.spark.api.java.JavaPairRDD;
|
|
|
|
import org.apache.spark.api.java.JavaRDD;
|
|
|
|
import org.apache.spark.api.java.JavaSparkContext;
|
2020-06-11 10:46:46 +02:00
|
|
|
import org.apache.spark.api.java.function.MapFunction;
|
2020-12-04 15:41:31 +01:00
|
|
|
import org.apache.spark.api.java.function.PairFlatMapFunction;
|
2020-06-11 10:46:46 +02:00
|
|
|
import org.apache.spark.api.java.function.PairFunction;
|
2019-11-07 12:47:12 +01:00
|
|
|
import org.apache.spark.graphx.Edge;
|
|
|
|
import org.apache.spark.rdd.RDD;
|
2020-06-11 10:46:46 +02:00
|
|
|
import org.apache.spark.sql.Dataset;
|
|
|
|
import org.apache.spark.sql.Encoders;
|
|
|
|
import org.apache.spark.sql.SaveMode;
|
|
|
|
import org.apache.spark.sql.SparkSession;
|
2019-11-07 12:47:12 +01:00
|
|
|
import org.apache.spark.util.LongAccumulator;
|
2019-11-20 10:45:00 +01:00
|
|
|
import scala.Serializable;
|
2019-11-07 12:47:12 +01:00
|
|
|
import scala.Tuple2;
|
2020-09-29 12:01:25 +02:00
|
|
|
import scala.math.Ordering;
|
2019-11-07 12:47:12 +01:00
|
|
|
|
2020-09-29 12:01:25 +02:00
|
|
|
import java.nio.charset.Charset;
|
2020-06-11 10:46:46 +02:00
|
|
|
import java.util.*;
|
2019-11-07 12:47:12 +01:00
|
|
|
import java.util.stream.Collectors;
|
2020-06-11 10:46:46 +02:00
|
|
|
import java.util.stream.StreamSupport;
|
2019-11-07 12:47:12 +01:00
|
|
|
|
2019-11-20 10:45:00 +01:00
|
|
|
public class Deduper implements Serializable {
|
2019-11-07 12:47:12 +01:00
|
|
|
|
|
|
|
/** Commons-logging logger bound to {@link Deduper}. */
private static final Log log = LogFactory.getLog(Deduper.class);
|
|
|
|
|
2020-06-11 10:46:46 +02:00
|
|
|
public static JavaPairRDD<String, Block> createSortedBlocks(
|
|
|
|
JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
|
|
|
|
final String of = config.getWf().getOrderField();
|
|
|
|
final int maxQueueSize = config.getWf().getGroupMaxSize();
|
|
|
|
|
|
|
|
return mapDocs
|
|
|
|
// the reduce is just to be sure that we haven't document with same id
|
|
|
|
.reduceByKey((a, b) -> a)
|
|
|
|
.map(Tuple2::_2)
|
|
|
|
// Clustering: from <id, doc> to List<groupkey,doc>
|
|
|
|
.flatMap(
|
|
|
|
a -> Utility
|
|
|
|
.getGroupingKeys(config, a)
|
|
|
|
.stream()
|
|
|
|
.map(it -> Block.from(it, a))
|
|
|
|
.collect(Collectors.toList())
|
|
|
|
.iterator())
|
|
|
|
.mapToPair(block -> new Tuple2<>(block.getKey(), block))
|
|
|
|
.reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize));
|
|
|
|
}
|
2019-11-07 12:47:12 +01:00
|
|
|
|
2020-06-11 10:46:46 +02:00
|
|
|
public static Iterator<Tuple2<String, String>> ccToMergeRel(ConnectedComponent cc, DedupConfig dedupConf) {
|
|
|
|
return cc
|
|
|
|
.getDocs()
|
|
|
|
.stream()
|
|
|
|
.flatMap(
|
|
|
|
id -> {
|
|
|
|
List<Tuple2<String, String>> tmp = new ArrayList<>();
|
|
|
|
tmp.add(new Tuple2<>(cc.getCcId(), id));
|
|
|
|
return tmp.stream();
|
|
|
|
})
|
|
|
|
.iterator();
|
|
|
|
}
|
2019-11-07 12:47:12 +01:00
|
|
|
|
2020-06-11 10:46:46 +02:00
|
|
|
public static long hash(final String id) {
|
2020-09-29 12:01:25 +02:00
|
|
|
return Hashing.murmur3_128().hashString(id, Charset.defaultCharset()).asLong();
|
2020-06-11 10:46:46 +02:00
|
|
|
}
|
2019-11-20 10:45:00 +01:00
|
|
|
|
2020-06-11 10:46:46 +02:00
|
|
|
public static ConnectedComponent entityMerger(String key, Iterator<String> values) {
|
2020-01-14 10:42:43 +01:00
|
|
|
|
2020-06-11 10:46:46 +02:00
|
|
|
ConnectedComponent cc = new ConnectedComponent();
|
|
|
|
cc.setCcId(key);
|
|
|
|
cc.setDocs(StreamSupport.stream(Spliterators.spliteratorUnknownSize(values, Spliterator.ORDERED), false)
|
2020-09-29 12:01:25 +02:00
|
|
|
.collect(Collectors.toCollection(HashSet::new)));
|
2020-06-11 10:46:46 +02:00
|
|
|
return cc;
|
|
|
|
}
|
2019-11-20 10:45:00 +01:00
|
|
|
|
2020-06-11 10:46:46 +02:00
|
|
|
public static JavaRDD<Relation> computeRelations(
|
|
|
|
JavaSparkContext context, JavaPairRDD<String, Block> blocks, DedupConfig config) {
|
|
|
|
Map<String, LongAccumulator> accumulators = Utility.constructAccumulator(config, context.sc());
|
2019-11-20 10:45:00 +01:00
|
|
|
|
2020-06-11 10:46:46 +02:00
|
|
|
return blocks
|
|
|
|
.flatMapToPair(
|
|
|
|
it -> {
|
|
|
|
final SparkReporter reporter = new SparkReporter(accumulators);
|
|
|
|
new BlockProcessor(config)
|
|
|
|
.processSortedBlock(it._1(), it._2().getDocuments(), reporter);
|
|
|
|
return reporter.getRelations().iterator();
|
|
|
|
})
|
|
|
|
.mapToPair(it -> new Tuple2<>(it._1() + it._2(), new Relation(it._1(), it._2(), "simRel")))
|
|
|
|
.reduceByKey((a, b) -> a)
|
|
|
|
.map(Tuple2::_2);
|
2019-11-20 10:45:00 +01:00
|
|
|
}
|
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
public static Queue<MapDocument> prepareQueue(final Iterable<MapDocument> documents, DedupConfig config) {
|
|
|
|
final Queue<MapDocument> queue = new PriorityQueue<>(100, new MapDocumentComparator(config.getWf().getOrderField()));
|
|
|
|
|
|
|
|
final Set<String> seen = new HashSet<String>();
|
|
|
|
final int queueMaxSize = config.getWf().getQueueMaxSize();
|
|
|
|
|
|
|
|
documents.forEach(doc -> {
|
|
|
|
if (queue.size() <= queueMaxSize) {
|
|
|
|
final String id = doc.getIdentifier();
|
|
|
|
|
|
|
|
if (!seen.contains(id)) {
|
|
|
|
seen.add(id);
|
|
|
|
queue.add(doc);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
return queue;
|
|
|
|
}
|
|
|
|
|
|
|
|
public static JavaRDD<Relation> computePublicationRelations(
|
|
|
|
JavaSparkContext context, JavaPairRDD<String, Block> blocks, DedupConfig config) {
|
|
|
|
|
|
|
|
return blocks.
|
|
|
|
flatMapToPair((PairFlatMapFunction<Tuple2<String, Block>, String, String>)
|
|
|
|
it -> {
|
|
|
|
List<Tuple2<String,String>> relations = new ArrayList<>();
|
|
|
|
|
|
|
|
if (it._2().getDocuments().size()>1) {
|
|
|
|
|
|
|
|
Queue<MapDocument> queue = prepareQueue(it._2().getDocuments(), config);
|
|
|
|
|
|
|
|
while (!queue.isEmpty()) {
|
|
|
|
|
|
|
|
final MapDocument pivot = queue.remove();
|
|
|
|
final String idPivot = pivot.getIdentifier();
|
|
|
|
|
|
|
|
WfConfig wf = config.getWf();
|
|
|
|
final Field fieldsPivot = pivot.values(wf.getOrderField());
|
|
|
|
final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? "" : fieldsPivot.stringValue();
|
|
|
|
|
|
|
|
if (fieldPivot != null) {
|
|
|
|
int i = 0;
|
|
|
|
for (final MapDocument curr : queue) {
|
|
|
|
final String idCurr = curr.getIdentifier();
|
|
|
|
|
|
|
|
if (config.getWf().getSkipList().contains(StringUtils.substringBetween(idCurr, "|", "::"))) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (i > wf.getSlidingWindowSize()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
final Field fieldsCurr = curr.values(wf.getOrderField());
|
|
|
|
final String fieldCurr = (fieldsCurr == null) || fieldsCurr.isEmpty() ? null : fieldsCurr.stringValue();
|
|
|
|
|
|
|
|
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
|
|
|
|
|
|
|
|
double score = 0.0;
|
|
|
|
Map<String, String> params = new HashMap<>();
|
|
|
|
params.put("jpath_value", "$.value");
|
|
|
|
params.put("jpath_classid", "$.qualifier.classid");
|
|
|
|
JsonListMatch jsonListMatch = new JsonListMatch(params);
|
|
|
|
double result = jsonListMatch.compare(pivot.getFieldMap().get("pid"), curr.getFieldMap().get("pid"), config);
|
|
|
|
if (result > 0.5) //if the result of the comparison is greater than the threshold
|
|
|
|
score += 10.0; //high score because it should match when the first condition is satisfied
|
|
|
|
else
|
|
|
|
score += 0.0;
|
|
|
|
|
|
|
|
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
|
|
|
|
double result1 = titleVersionMatch.compare(pivot.getFieldMap().get("title"), curr.getFieldMap().get("title"), config);
|
|
|
|
SizeMatch sizeMatch = new SizeMatch(params);
|
|
|
|
double result2 = sizeMatch.compare(pivot.getFieldMap().get("authors"), curr.getFieldMap().get("authors"), config);
|
|
|
|
if ((result1 == 1.0 && result2 == 1.0) || (result1 == -1.0 && result2 == 1.0) || (result1 == 1.0 && result2 == -1.0) || (result1 == -1.0 && result2 == -1.0))
|
|
|
|
score += 0.0;
|
|
|
|
else
|
|
|
|
score -= 1.0;
|
|
|
|
|
|
|
|
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
|
|
|
|
double result3 = levensteinTitle.compare(pivot.getFieldMap().get("title"), curr.getFieldMap().get("title"), config);
|
|
|
|
score += result3;
|
|
|
|
|
|
|
|
if (score >= 0.99) {
|
|
|
|
relations.add(new Tuple2<>(idPivot, idCurr));
|
|
|
|
relations.add(new Tuple2<>(idCurr, idPivot));
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return relations.iterator();
|
|
|
|
})
|
|
|
|
.mapToPair(it -> new Tuple2<>(it._1() + it._2(), new Relation(it._1(), it._2(), "simRel")))
|
|
|
|
.reduceByKey((a,b) -> a)
|
|
|
|
.map(Tuple2::_2);
|
|
|
|
}
|
|
|
|
|
|
|
|
public static boolean comparePublications(MapDocument a, MapDocument b, DedupConfig config){
|
|
|
|
|
|
|
|
double score = 0.0;
|
|
|
|
Map<String, String> params = new HashMap<>();
|
|
|
|
params.put("jpath_value", "$.value");
|
|
|
|
params.put("jpath_classid", "$.qualifier.classid");
|
|
|
|
JsonListMatch jsonListMatch = new JsonListMatch(params);
|
|
|
|
double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config);
|
|
|
|
if (result > 0.5) //if the result of the comparison is greater than the threshold
|
|
|
|
score += 1.0;
|
|
|
|
else
|
|
|
|
score += 0.0;
|
|
|
|
|
|
|
|
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
|
|
|
|
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
|
|
|
|
SizeMatch sizeMatch = new SizeMatch(params);
|
|
|
|
double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
|
|
|
|
if ((result1 == 1.0 && result2 == 1.0) || (result1 == -1.0 && result2 == 1.0) || (result1 == 1.0 && result2 == -1.0) || (result1 == -1.0 && result2 == -1.0))
|
|
|
|
score += 0.0;
|
|
|
|
else
|
|
|
|
score -= 1.0;
|
|
|
|
|
|
|
|
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
|
|
|
|
double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
|
|
|
|
score += result3;
|
|
|
|
|
|
|
|
return score >= 0.99;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2020-06-11 10:46:46 +02:00
|
|
|
public static void createSimRels(DedupConfig dedupConf, SparkSession spark, String entitiesPath, String simRelsPath){
|
2019-11-20 10:45:00 +01:00
|
|
|
|
2020-06-11 10:46:46 +02:00
|
|
|
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
|
|
|
|
|
|
|
JavaPairRDD<String, MapDocument> mapDocuments = sc
|
|
|
|
.textFile(entitiesPath)
|
|
|
|
.mapToPair(
|
|
|
|
(PairFunction<String, String, MapDocument>) s -> {
|
|
|
|
MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s);
|
|
|
|
return new Tuple2<>(d.getIdentifier(), d);
|
|
|
|
});
|
2019-11-07 12:47:12 +01:00
|
|
|
|
2020-06-11 10:46:46 +02:00
|
|
|
// create blocks for deduplication
|
|
|
|
JavaPairRDD<String, Block> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf);
|
2019-11-07 12:47:12 +01:00
|
|
|
|
2020-06-11 10:46:46 +02:00
|
|
|
// create relations by comparing only elements in the same group
|
|
|
|
JavaRDD<Relation> relations = Deduper.computeRelations(sc, blocks, dedupConf);
|
2019-11-07 12:47:12 +01:00
|
|
|
|
2020-06-11 10:46:46 +02:00
|
|
|
// save the simrel in the workingdir
|
|
|
|
spark
|
|
|
|
.createDataset(relations.rdd(), Encoders.bean(Relation.class))
|
|
|
|
.write()
|
|
|
|
.mode(SaveMode.Overwrite)
|
|
|
|
.save(simRelsPath);
|
|
|
|
}
|
2020-03-20 18:02:52 +01:00
|
|
|
|
2020-06-11 10:46:46 +02:00
|
|
|
public static void createMergeRels(DedupConfig dedupConf, String entitiesPath, String mergeRelsPath, String simRelsPath, SparkSession spark){
|
|
|
|
|
|
|
|
final int maxIterations = dedupConf.getWf().getMaxIterations();
|
|
|
|
|
|
|
|
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
|
|
|
|
|
|
|
final JavaPairRDD<Object, String> vertexes = sc
|
|
|
|
.textFile(entitiesPath)
|
|
|
|
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
|
|
|
|
.mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
|
|
|
|
|
|
|
|
final RDD<Edge<String>> edgeRdd = spark
|
|
|
|
.read()
|
|
|
|
.load(simRelsPath)
|
|
|
|
.as(Encoders.bean(Relation.class))
|
|
|
|
.javaRDD()
|
|
|
|
.map(Relation::toEdgeRdd)
|
|
|
|
.rdd();
|
|
|
|
|
2020-09-29 12:01:25 +02:00
|
|
|
JavaRDD<ConnectedComponent> ccs = GraphProcessor
|
|
|
|
.findCCs(vertexes.rdd(), edgeRdd, maxIterations)
|
|
|
|
.toJavaRDD();
|
|
|
|
|
|
|
|
JavaRDD<Relation> mergeRel = ccs
|
|
|
|
.filter(k -> k.getDocs().size() > 1)
|
|
|
|
.flatMap(cc -> ccToMergeRel(cc, dedupConf))
|
|
|
|
.map(it -> new Relation(it._1(), it._2(), "mergeRel"));
|
|
|
|
|
2020-06-11 10:46:46 +02:00
|
|
|
final Dataset<Relation> mergeRels = spark
|
|
|
|
.createDataset(
|
2020-09-29 12:01:25 +02:00
|
|
|
mergeRel.rdd(),
|
2020-06-11 10:46:46 +02:00
|
|
|
Encoders.bean(Relation.class));
|
|
|
|
|
|
|
|
mergeRels.write().mode(SaveMode.Overwrite).parquet(mergeRelsPath);
|
2019-11-20 10:45:00 +01:00
|
|
|
}
|
|
|
|
|
2020-06-11 10:46:46 +02:00
|
|
|
public static void createDedupEntity(DedupConfig dedupConf, String mergeRelsPath, String entitiesPath, SparkSession spark, String dedupEntityPath){
|
|
|
|
|
|
|
|
JavaPairRDD<String, String> entities = spark
|
|
|
|
.read()
|
|
|
|
.textFile(entitiesPath)
|
|
|
|
.map((MapFunction<String, Tuple2<String, String>>) it ->
|
|
|
|
new Tuple2<>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it), it),
|
|
|
|
Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
|
|
|
|
.toJavaRDD()
|
|
|
|
.mapToPair(t -> t);
|
|
|
|
|
|
|
|
// <source, target>: source is the dedup_id, target is the id of the mergedIn
|
|
|
|
JavaPairRDD<String, Relation> mergeRels = spark
|
|
|
|
.read()
|
|
|
|
.load(mergeRelsPath)
|
|
|
|
.as(Encoders.bean(Relation.class))
|
|
|
|
.toJavaRDD()
|
|
|
|
.mapToPair(r -> new Tuple2<>(r.getTarget(), r));
|
|
|
|
|
|
|
|
JavaRDD<ConnectedComponent> dedupEntities = mergeRels.join(entities)
|
|
|
|
.mapToPair(t -> new Tuple2<>(t._2()._1().getSource(), t._2()._2()))
|
|
|
|
.groupByKey()
|
|
|
|
.map(t-> entityMerger(t._1(), t._2().iterator()));
|
|
|
|
|
|
|
|
dedupEntities.saveAsTextFile(dedupEntityPath);
|
2019-11-07 12:47:12 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|