Implementation of the dedup test and addition of new support classes
parent aa4d03cfa3
commit b3ec4194da
Deduper.java
@@ -1,5 +1,6 @@
 package eu.dnetlib;
 
+import com.google.common.hash.Hashing;
 import eu.dnetlib.graph.GraphProcessor;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.model.MapDocument;
@@ -7,105 +8,182 @@ import eu.dnetlib.pace.util.BlockProcessor;
 import eu.dnetlib.pace.util.MapDocumentUtil;
 import eu.dnetlib.pace.utils.Utility;
 import eu.dnetlib.reporter.SparkReporter;
+import eu.dnetlib.support.Block;
 import eu.dnetlib.support.ConnectedComponent;
+import eu.dnetlib.support.Relation;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.Optional;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.graphx.Edge;
 import org.apache.spark.rdd.RDD;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
 import org.apache.spark.util.LongAccumulator;
 import scala.Serializable;
 import scala.Tuple2;
 
-import java.util.Map;
+import java.util.*;
 import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
 
 public class Deduper implements Serializable {
 
     private static final Log log = LogFactory.getLog(Deduper.class);
 
-    /**
-     * @param: the spark context
-     * @param: list of JSON entities to be deduped
-     * @param: the dedup configuration
-     *
-     * @return the list of connected components generated by the deduplication
-     */
-    public static JavaRDD<ConnectedComponent> dedup(JavaSparkContext context, JavaRDD<String> entities, DedupConfig config){
-
-        Map<String, LongAccumulator> accumulators = Utility.constructAccumulator(config, context.sc());
-
-        //create vertexes of the graph: <ID, MapDocument>
-        JavaPairRDD<String, MapDocument> mapDocs = mapToVertexes(context, entities, config);
-        RDD<Tuple2<Object, MapDocument>> vertexes = mapDocs.mapToPair(t -> new Tuple2<Object, MapDocument>(Utility.getHashcode(t._1()), t._2())).rdd();
-
-        //create blocks for deduplication
-        JavaPairRDD<String, Iterable<MapDocument>> blocks = createBlocks(context, mapDocs, config);
-
-        //create relations by comparing only elements in the same group
-        final JavaPairRDD<String, String> relationRDD = computeRelations(context, blocks, config);
-
-        System.out.println("Number of relations = " + relationRDD.distinct().count());
-
-        final RDD<Edge<String>> edgeRdd = relationRDD.map(it -> new Edge<>(Utility.getHashcode(it._1()), Utility.getHashcode(it._2()), "isSimilarTo")).rdd();
-
-        accumulators.forEach((name, acc) -> log.info(name + " -> " + acc.value()));
-
-        return GraphProcessor.findCCs(vertexes, edgeRdd, config.getWf().getMaxIterations()).toJavaRDD();
-    }
+    public static JavaPairRDD<String, Block> createSortedBlocks(
+            JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
+        final String of = config.getWf().getOrderField();
+        final int maxQueueSize = config.getWf().getGroupMaxSize();
+
+        return mapDocs
+            // the reduce is just to be sure that we haven't document with same id
+            .reduceByKey((a, b) -> a)
+            .map(Tuple2::_2)
+            // Clustering: from <id, doc> to List<groupkey,doc>
+            .flatMap(
+                a -> Utility
+                    .getGroupingKeys(config, a)
+                    .stream()
+                    .map(it -> Block.from(it, a))
+                    .collect(Collectors.toList())
+                    .iterator())
+            .mapToPair(block -> new Tuple2<>(block.getKey(), block))
+            .reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize));
+    }
+
+    public static Iterator<Tuple2<String, String>> ccToMergeRel(ConnectedComponent cc, DedupConfig dedupConf) {
+        return cc
+            .getDocs()
+            .stream()
+            .flatMap(
+                id -> {
+                    List<Tuple2<String, String>> tmp = new ArrayList<>();
+                    tmp.add(new Tuple2<>(cc.getCcId(), id));
+                    return tmp.stream();
+                })
+            .iterator();
+    }
+
+    public static long hash(final String id) {
+        return Hashing.murmur3_128().hashString(id).asLong();
+    }
+
+    public static ConnectedComponent entityMerger(String key, Iterator<String> values) {
+
+        ConnectedComponent cc = new ConnectedComponent();
+        cc.setCcId(key);
+        cc.setDocs(StreamSupport.stream(Spliterators.spliteratorUnknownSize(values, Spliterator.ORDERED), false)
+            .collect(Collectors.toSet()));
+        return cc;
+    }
+
+    public static JavaRDD<Relation> computeRelations(
+            JavaSparkContext context, JavaPairRDD<String, Block> blocks, DedupConfig config) {
+        Map<String, LongAccumulator> accumulators = Utility.constructAccumulator(config, context.sc());
+
+        return blocks
+            .flatMapToPair(
+                it -> {
+                    final SparkReporter reporter = new SparkReporter(accumulators);
+                    new BlockProcessor(config)
+                        .processSortedBlock(it._1(), it._2().getDocuments(), reporter);
+                    return reporter.getRelations().iterator();
+                })
+            .mapToPair(it -> new Tuple2<>(it._1() + it._2(), new Relation(it._1(), it._2(), "simRel")))
+            .reduceByKey((a, b) -> a)
+            .map(Tuple2::_2);
+    }
 
-    /**
-     * @param: the spark context
-     * @param: list of blocks
-     * @param: the dedup configuration
-     *
-     * @return the list of relations generated by the deduplication
-     */
-    public static JavaPairRDD<String, String> computeRelations(JavaSparkContext context, JavaPairRDD<String, Iterable<MapDocument>> blocks, DedupConfig config) {
-
-        Map<String, LongAccumulator> accumulators = Utility.constructAccumulator(config, context.sc());
-        return blocks.flatMapToPair(it -> {
-            final SparkReporter reporter = new SparkReporter(accumulators);
-            new BlockProcessor(config).process(it._1(), it._2(), reporter);
-            return reporter.getRelations().iterator();
-        });
-    }
+    public static void createSimRels(DedupConfig dedupConf, SparkSession spark, String entitiesPath, String simRelsPath){
+
+        JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+        JavaPairRDD<String, MapDocument> mapDocuments = sc
+            .textFile(entitiesPath)
+            .mapToPair(
+                (PairFunction<String, String, MapDocument>) s -> {
+                    MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s);
+                    return new Tuple2<>(d.getIdentifier(), d);
+                });
+
+        // create blocks for deduplication
+        JavaPairRDD<String, Block> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf);
+
+        // create relations by comparing only elements in the same group
+        JavaRDD<Relation> relations = Deduper.computeRelations(sc, blocks, dedupConf);
+
+        // save the simrel in the workingdir
+        spark
+            .createDataset(relations.rdd(), Encoders.bean(Relation.class))
+            .write()
+            .mode(SaveMode.Overwrite)
+            .save(simRelsPath);
+    }
 
-    /**
-     * @param: the spark context
-     * @param: list of entities: <id, entity>
-     * @param: the dedup configuration
-     *
-     * @return the list of blocks based on clustering of dedup configuration
-     */
-    public static JavaPairRDD<String, Iterable<MapDocument>> createBlocks(JavaSparkContext context, JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
-
-        return mapDocs.reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id
-            //Clustering: from <id, doc> to List<groupkey,doc>
-            .flatMapToPair(a -> {
-                final MapDocument currentDocument = a._2();
-
-                return Utility.getGroupingKeys(config, currentDocument).stream()
-                    .map(it -> new Tuple2<>(it, currentDocument)).collect(Collectors.toList()).iterator();
-            }).groupByKey();
-    }
+    public static void createMergeRels(DedupConfig dedupConf, String entitiesPath, String mergeRelsPath, String simRelsPath, SparkSession spark){
+
+        final int maxIterations = dedupConf.getWf().getMaxIterations();
+
+        JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+        final JavaPairRDD<Object, String> vertexes = sc
+            .textFile(entitiesPath)
+            .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
+            .mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
+
+        final RDD<Edge<String>> edgeRdd = spark
+            .read()
+            .load(simRelsPath)
+            .as(Encoders.bean(Relation.class))
+            .javaRDD()
+            .map(Relation::toEdgeRdd)
+            .rdd();
+
+        final Dataset<Relation> mergeRels = spark
+            .createDataset(
+                GraphProcessor
+                    .findCCs(vertexes.rdd(), edgeRdd, maxIterations)
+                    .toJavaRDD()
+                    .filter(k -> k.getDocs().size() > 1)
+                    .flatMap(cc -> ccToMergeRel(cc, dedupConf))
+                    .map(it -> new Relation(it._1(), it._2(), "mergeRel"))
+                    .rdd(),
+                Encoders.bean(Relation.class));
+
+        mergeRels.write().mode(SaveMode.Overwrite).parquet(mergeRelsPath);
+    }
 
-    /**
-     * @param: the spark context
-     * @param: list of JSON entities
-     * @param: the dedup configuration
-     *
-     * @return the list of vertexes: <id, mapDocument>
-     */
-    public static JavaPairRDD<String, MapDocument> mapToVertexes(JavaSparkContext context, JavaRDD<String> entities, DedupConfig config){
-        return entities.mapToPair(it -> {
-            MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, it);
-            return new Tuple2<>(mapDocument.getIdentifier(), mapDocument);
-        });
-    }
+    public static void createDedupEntity(DedupConfig dedupConf, String mergeRelsPath, String entitiesPath, SparkSession spark, String dedupEntityPath){
+
+        JavaPairRDD<String, String> entities = spark
+            .read()
+            .textFile(entitiesPath)
+            .map((MapFunction<String, Tuple2<String, String>>) it ->
+                new Tuple2<>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it), it),
+                Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
+            .toJavaRDD()
+            .mapToPair(t -> t);
+
+        // <source, target>: source is the dedup_id, target is the id of the mergedIn
+        JavaPairRDD<String, Relation> mergeRels = spark
+            .read()
+            .load(mergeRelsPath)
+            .as(Encoders.bean(Relation.class))
+            .toJavaRDD()
+            .mapToPair(r -> new Tuple2<>(r.getTarget(), r));
+
+        JavaRDD<ConnectedComponent> dedupEntities = mergeRels.join(entities)
+            .mapToPair(t -> new Tuple2<>(t._2()._1().getSource(), t._2()._2()))
+            .groupByKey()
+            .map(t -> entityMerger(t._1(), t._2().iterator()));
+
+        dedupEntities.saveAsTextFile(dedupEntityPath);
+    }
 }
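Read together, these methods replace the old single-shot dedup() with three persisted stages: createSimRels writes similarity relations, createMergeRels runs connected components over them and writes merge relations, and createDedupEntity groups the original records under their dedup id. A minimal driver sketch, assuming a local SparkSession and placeholder paths; none of this wiring is part of the commit, and the DedupConfig.load name is assumed from dnet-pace-core:

    import java.nio.file.Files;
    import java.nio.file.Paths;

    import org.apache.spark.sql.SparkSession;

    import eu.dnetlib.Deduper;
    import eu.dnetlib.pace.config.DedupConfig;

    public class DedupDriver {
        public static void main(String[] args) throws Exception {
            // placeholder session and paths, for illustration only
            SparkSession spark = SparkSession.builder()
                .appName("dedup").master("local[*]").getOrCreate();
            String configJson = new String(Files.readAllBytes(Paths.get(args[0])));
            DedupConfig conf = DedupConfig.load(configJson); // loader name assumed

            // stage 1: pairwise similarity relations
            Deduper.createSimRels(conf, spark, "/tmp/entities", "/tmp/work/simrels");
            // stage 2: connected components over the simrels
            Deduper.createMergeRels(conf, "/tmp/entities", "/tmp/work/mergerels", "/tmp/work/simrels", spark);
            // stage 3: merged entities grouped by dedup id
            Deduper.createDedupEntity(conf, "/tmp/work/mergerels", "/tmp/entities", spark, "/tmp/work/dedupentities");
        }
    }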
GraphProcessor.scala
@@ -1,6 +1,5 @@
 package eu.dnetlib.graph
 
-import eu.dnetlib.pace.model.MapDocument
 import eu.dnetlib.support.ConnectedComponent
 import org.apache.spark.graphx._
 import org.apache.spark.rdd.RDD
@@ -9,30 +8,29 @@ import scala.collection.JavaConversions;
 
 object GraphProcessor {
 
-  def findCCs(vertexes: RDD[(VertexId, MapDocument)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
-    val graph: Graph[MapDocument, String] = Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby
+  def findCCs(vertexes: RDD[(VertexId, String)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
+    val graph: Graph[String, String] = Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby
     val cc = graph.connectedComponents(maxIterations).vertices
 
     val joinResult = vertexes.leftOuterJoin(cc).map {
-      case (id, (openaireId, cc)) => {
+      case (id, (rawId, cc)) => {
         if (cc.isEmpty) {
-          (id, openaireId)
+          (id, rawId)
         }
         else {
-          (cc.get, openaireId)
+          (cc.get, rawId)
         }
       }
     }
-    val connectedComponents = joinResult.groupByKey().map[ConnectedComponent](cc => asConnectedComponent(cc))
-    (connectedComponents)
+    val connectedComponents = joinResult.groupByKey()
+      .map[ConnectedComponent](cc => asConnectedComponent(cc))
+    connectedComponents
 
   }
 
-  def asConnectedComponent(group: (VertexId, Iterable[MapDocument])): ConnectedComponent = {
-    val docs = group._2.toSet[MapDocument]
-    val connectedComponent = new ConnectedComponent(JavaConversions.setAsJavaSet[MapDocument](docs));
+  def asConnectedComponent(group: (VertexId, Iterable[String])): ConnectedComponent = {
+    val docs = group._2.toSet[String]
+    val connectedComponent = new ConnectedComponent(JavaConversions.setAsJavaSet[String](docs));
     connectedComponent
   }
 
BlockUtils.java
@@ -135,7 +135,7 @@ public class BlockUtils implements Serializable {
 
         System.out.println("optimalBlockSize = " + optimalBlockSize);
 
-        return blocks.filter(b -> b.getElements().size() < optimalBlockSize);
+        return blocks.filter(b -> b.elements() < optimalBlockSize);
     }
 
     //cut blocks basing on number of comparisons
@@ -152,7 +152,7 @@ public class BlockUtils implements Serializable {
         double RATIO = 0.85;
 
         return blocks
-                .flatMapToPair(b -> b.getElements().stream().map(e -> new Tuple2<>(e, new Tuple2<>(b.getKey(), b.comparisons()))).iterator())
+                .flatMapToPair(b -> b.getDocuments().stream().map(e -> new Tuple2<>(e, new Tuple2<>(b.getKey(), b.comparisons()))).iterator())
                 .groupByKey()
                 .mapToPair(es -> {
                     List<Tuple2<String, Integer>> b = Lists.newArrayList(es._2());
Utility.java
@@ -5,6 +5,7 @@ import com.google.common.hash.Hashing;
 import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.model.MapDocument;
+import org.apache.commons.codec.binary.Hex;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
@@ -18,6 +19,7 @@ import org.apache.spark.util.LongAccumulator;
 import java.io.IOException;
 import java.io.StringWriter;
 import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;
@@ -83,4 +85,16 @@ public class Utility {
     public static long getHashcode(final String id) {
         return Hashing.murmur3_128().hashUnencodedChars(id).asLong();
     }
+
+    public static String md5(final String s) {
+        try {
+            final MessageDigest md = MessageDigest.getInstance("MD5");
+            md.update(s.getBytes(StandardCharsets.UTF_8));
+            return new String(Hex.encodeHex(md.digest()));
+        } catch (final Exception e) {
+            System.err.println("Error creating id");
+            return null;
+        }
+    }
+
 }
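The new md5 helper backs the dedup identifiers minted in ConnectedComponent.createID further down. A quick sketch of its contract; the example digest is the standard MD5 test vector, and minId is a hypothetical input:

    // lowercase hex MD5 digest of the UTF-8 bytes; null on failure
    String digest = Utility.md5("abc");
    // digest.equals("900150983cd24fb0d6963f7d28e17f72")

    // how createID composes a dedup id (minId: smallest raw id of a component)
    String ccId = "dedup::" + Utility.md5(minId);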
Block.java
@@ -1,29 +1,68 @@
 package eu.dnetlib.support;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.pace.model.MapDocument;
-import eu.dnetlib.pace.util.PaceException;
-
-import java.io.IOException;
 import java.io.Serializable;
-import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.Iterator;
 import java.util.List;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
 import java.util.stream.StreamSupport;
 
+import com.google.common.collect.Lists;
+
+import eu.dnetlib.pace.model.MapDocument;
+
 public class Block implements Serializable {
 
-    String key;
-    List<MapDocument> elements;
+    private String key;
 
-    public Block(String key, Iterable<MapDocument> elements){
-        this.key = key;
-        this.elements = StreamSupport.stream(elements.spliterator(), false).collect(Collectors.toList());
+    private List<MapDocument> documents;
+
+    public Block() {
+        super();
     }
 
-    public Block(String key, List<MapDocument> elements){
+    public Block(String key, Iterable<MapDocument> documents) {
         this.key = key;
-        this.elements = elements;
+        this.documents = Lists.newArrayList(documents);
+    }
+
+    public static Block from(String key, MapDocument doc) {
+        Block block = new Block();
+        block.setKey(key);
+        block.setDocuments(Lists.newArrayList(doc));
+        return block;
+    }
+
+    public static Block from(String key, Iterator<Block> blocks, String orderField, int maxSize) {
+        Block block = new Block();
+        block.setKey(key);
+
+        Iterable<Block> it = () -> blocks;
+
+        block
+            .setDocuments(
+                StreamSupport
+                    .stream(it.spliterator(), false)
+                    .flatMap(b -> b.getDocuments().stream())
+                    .sorted(Comparator.comparing(a -> a.getFieldMap().get(orderField).stringValue()))
+                    .limit(maxSize)
+                    .collect(Collectors.toCollection(ArrayList::new)));
+        return block;
+    }
+
+    public static Block from(Block b1, Block b2, String orderField, int maxSize) {
+        Block block = new Block();
+        block.setKey(b1.getKey());
+        block
+            .setDocuments(
+                Stream
+                    .concat(b1.getDocuments().stream(), b2.getDocuments().stream())
+                    .sorted(Comparator.comparing(a -> a.getFieldMap().get(orderField).stringValue()))
+                    .limit(maxSize)
+                    .collect(Collectors.toCollection(ArrayList::new)));
+        return block;
     }
 
     public String getKey() {
@@ -34,30 +73,22 @@ public class Block implements Serializable {
         this.key = key;
     }
 
-    public List<MapDocument> getElements() {
-        return elements;
+    public List<MapDocument> getDocuments() {
+        return documents;
     }
 
-    public void setElements(List<MapDocument> elements) {
-        this.elements = elements;
+    public void setDocuments(List<MapDocument> documents) {
+        this.documents = documents;
     }
 
-    public int comparisons(){
-        int size = elements.size();
-        return (size*(size-1)/2);
+    public int comparisons() {
+        return (documents.size()*(documents.size()-1))/2;
     }
 
-    public int elements(){
-        return elements.size();
+    public int elements() {
+        return documents.size();
     }
 
-    @Override
-    public String toString(){
-        ObjectMapper mapper = new ObjectMapper();
-        try {
-            return mapper.writeValueAsString(this);
-        } catch (IOException e) {
-            throw new PaceException("Failed to create Json: ", e);
-        }
-    }
 }
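Since merged blocks are re-sorted on orderField and truncated to maxSize (the workflow's groupMaxSize), comparisons() bounds the quadratic pairwise work each block schedules. The arithmetic, as a standalone sketch:

    // n documents in a block -> n*(n-1)/2 pairwise comparisons
    static int comparisons(int n) {
        return n * (n - 1) / 2;
    }
    // comparisons(10) == 45, comparisons(100) == 4950: without the
    // groupMaxSize cap a single hot clustering key would dominate the stage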
ConnectedComponent.java
@@ -1,89 +1,61 @@
 package eu.dnetlib.support;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.MapDocument;
-import eu.dnetlib.pace.util.PaceException;
-import org.codehaus.jackson.annotate.JsonIgnore;
-
 import java.io.IOException;
 import java.io.Serializable;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
 import java.util.Set;
-import java.util.stream.Collectors;
+
+import eu.dnetlib.pace.utils.Utility;
+import org.apache.commons.lang.StringUtils;
+import org.codehaus.jackson.annotate.JsonIgnore;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.pace.util.PaceException;
 
 public class ConnectedComponent implements Serializable {
 
-    private Set<MapDocument> docs;
-    private String id;
-    private Map<String, Field> fieldMap;
+    private Set<String> docs;
+    private String ccId;
 
     public ConnectedComponent() {
     }
 
-    public ConnectedComponent(Set<MapDocument> docs) {
+    public ConnectedComponent(Set<String> docs) {
         this.docs = docs;
-        this.id = createID(docs);
-        this.fieldMap = chooseFields(docs);
+        createID();
     }
 
-    public Set<MapDocument> getDocs() {
-        return docs;
-    }
-
-    public void setDocs(Set<MapDocument> docs) {
-        this.docs = docs;
-    }
-
-    public String getId() {
-        return id;
-    }
-
-    public void setId(String id) {
-        this.id = id;
-    }
-
-    public Map<String, Field> chooseFields(Set<MapDocument> docs) {
-
-        int maxLength = 0;
-        Map<String, Field> maxFieldMap = new HashMap<>();
-        for (MapDocument doc : docs) {
-            if (doc.toString().length()>maxLength){
-                maxFieldMap = doc.getFieldMap();
-                maxLength = doc.toString().length();
-            }
-        }
-
-        return maxFieldMap;
-    }
-
-    public String createID(Set<MapDocument> docs) {
+    public String createID() {
         if (docs.size() > 1) {
-            String ccID = getMin(docs.stream().map(doc -> doc.getIdentifier()).collect(Collectors.toList()));
-            String prefix = ccID.split("\\|")[0];
-            String id = ccID.split("::")[1];
-            return prefix + "|dedup_______::" + id;
+            final String s = getMin();
+            ccId = "dedup::" + Utility.md5(s);
+            return ccId;
         } else {
-            return docs.iterator().next().getIdentifier();
+            return docs.iterator().next();
         }
     }
 
     @JsonIgnore
-    public String getMin(List<String> ids){
-
-        String min = ids.get(0);
-        for(String id: ids)
-            if (min.compareTo(id) > 0) {
-                min = id;
-            }
-
-        return min;
+    public String getMin() {
+
+        final StringBuilder min = new StringBuilder();
+        docs
+            .forEach(
+                i -> {
+                    if (StringUtils.isBlank(min.toString())) {
+                        min.append(i);
+                    } else {
+                        if (min.toString().compareTo(i) > 0) {
+                            min.setLength(0);
+                            min.append(i);
+                        }
+                    }
+                });
+        return min.toString();
     }
 
     @Override
-    public String toString(){
+    public String toString() {
         ObjectMapper mapper = new ObjectMapper();
         try {
             return mapper.writeValueAsString(this);
@@ -92,11 +64,19 @@ public class ConnectedComponent implements Serializable {
         }
     }
 
-    public Map<String, Field> getFieldMap() {
-        return fieldMap;
+    public Set<String> getDocs() {
+        return docs;
     }
 
-    public void setFieldMap(Map<String, Field> fieldMap) {
-        this.fieldMap = fieldMap;
+    public void setDocs(Set<String> docs) {
+        this.docs = docs;
+    }
+
+    public String getCcId() {
+        return ccId;
+    }
+
+    public void setCcId(String ccId) {
+        this.ccId = ccId;
     }
 }
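With docs now holding raw id strings, a component that actually merges records gets an opaque dedup::<md5-of-smallest-id> identifier, while a singleton keeps its raw id. Roughly, with invented ids:

    // invented ids, for illustration
    static void sketch() {
        ConnectedComponent cc = new ConnectedComponent(new HashSet<>(Arrays.asList("id_b", "id_a")));
        // getMin() -> "id_a"; cc.getCcId() -> "dedup::" + Utility.md5("id_a")

        ConnectedComponent single = new ConnectedComponent(new HashSet<>(Arrays.asList("id_x")));
        // createID() returns "id_x" for a singleton; as committed, the ccId
        // field is not assigned in that branch, only in the size > 1 case
    }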
DnetAccumulator.java
@@ -4,7 +4,7 @@ import org.apache.spark.util.AccumulatorV2;
 
 public class DnetAccumulator extends AccumulatorV2<Long, Long> {
 
-    private Long counter= 0L;
+    private Long counter = 0L;
 
     private String group;
 
Relation.java (new file)
@@ -0,0 +1,50 @@
+package eu.dnetlib.support;
+
+import eu.dnetlib.Deduper;
+import org.apache.spark.graphx.Edge;
+
+import java.io.Serializable;
+
+public class Relation implements Serializable {
+
+    String source;
+    String target;
+    String type;
+
+    public Relation() {
+    }
+
+    public Relation(String source, String target, String type) {
+        this.source = source;
+        this.target = target;
+        this.type = type;
+    }
+
+    public String getSource() {
+        return source;
+    }
+
+    public void setSource(String source) {
+        this.source = source;
+    }
+
+    public String getTarget() {
+        return target;
+    }
+
+    public void setTarget(String target) {
+        this.target = target;
+    }
+
+    public String getType() {
+        return type;
+    }
+
+    public void setType(String type) {
+        this.type = type;
+    }
+
+    public Edge<String> toEdgeRdd(){
+        return new Edge<>(Deduper.hash(source), Deduper.hash(target), type);
+    }
+}
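Relation is the bean persisted for both the simRels and mergeRels datasets; toEdgeRdd() hashes source and target with Deduper.hash, the same function createMergeRels uses to build its vertex ids, so the GraphX endpoints line up. For illustration, with made-up ids:

    // made-up ids, for illustration
    Relation r = new Relation("idA", "idB", "simRel");
    Edge<String> e = r.toEdgeRdd();
    // e.srcId() == Deduper.hash("idA")
    // e.dstId() == Deduper.hash("idB")
    // e.attr()  == "simRel"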
File diff suppressed because one or more lines are too long
DedupTestUtils.java
@@ -9,29 +9,29 @@ import java.util.List;
 public abstract class DedupTestUtils {
 
 
-    public static void printStatistics(JavaRDD<ConnectedComponent> ccs){
-        final JavaRDD<ConnectedComponent> connectedComponents = ccs.filter(cc -> cc.getDocs().size()>1);
-        final JavaRDD<ConnectedComponent> nonDeduplicated = ccs.filter(cc -> cc.getDocs().size()==1);
-
-        //print deduped
-        connectedComponents.map(cc -> {
-            StringBuilder sb = new StringBuilder();
-            for (MapDocument m : cc.getDocs()){
-                sb.append(m.getFieldMap().get("originalId").stringValue() + " - "+ m.getFieldMap().get("legalname").stringValue() + "\n");
-            }
-            return sb.toString();
-        }).foreach(s -> System.out.println("*******\n" + s + "*******\n"));
-
-        //print nondeduped
-        nonDeduplicated.foreach(cc -> {
-            System.out.println(cc.getId() + " - " + cc.getFieldMap().get("legalname").stringValue());
-        });
-
-        System.out.println("Non duplicates: " + nonDeduplicated.count());
-        System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count());
-        System.out.println("Connected Components: " + connectedComponents.count());
-
-    }
+//    public static void printStatistics(JavaRDD<ConnectedComponent> ccs){
+//        final JavaRDD<ConnectedComponent> connectedComponents = ccs.filter(cc -> cc.getDocs().size()>1);
+//        final JavaRDD<ConnectedComponent> nonDeduplicated = ccs.filter(cc -> cc.getDocs().size()==1);
+//
+//        //print deduped
+//        connectedComponents.map(cc -> {
+//            StringBuilder sb = new StringBuilder();
+//            for (MapDocument m : cc.getDocs()){
+//                sb.append(m.getFieldMap().get("originalId").stringValue() + " - "+ m.getFieldMap().get("legalname").stringValue() + "\n");
+//            }
+//            return sb.toString();
+//        }).foreach(s -> System.out.println("*******\n" + s + "*******\n"));
+//
+//        //print nondeduped
+//        nonDeduplicated.foreach(cc -> {
+//            System.out.println(cc.getId() + " - " + cc.getFieldMap().get("legalname").stringValue());
+//        });
+//
+//        System.out.println("Non duplicates: " + nonDeduplicated.count());
+//        System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count());
+//        System.out.println("Connected Components: " + connectedComponents.count());
+//
+//    }
 
     public static String getOrganizationLegalname(MapDocument mapDocument){
         return mapDocument.getFieldMap().get("legalname").stringValue();
Binary file not shown.
release.properties (new file)
@@ -0,0 +1,11 @@
+#release configuration
+#Fri Apr 24 14:37:08 CEST 2020
+scm.tagNameFormat=@{project.artifactId}-@{project.version}
+pushChanges=true
+scm.url=scm\:git\:https\://code-repo.d4science.org/D-Net/dnet-dedup.git/dnet-pace-core
+preparationGoals=clean verify
+projectVersionPolicyId=default
+remoteTagging=true
+scm.commentPrefix=[maven-release-plugin]
+exec.snapshotReleasePluginAllowed=false
+completedPhase=scm-check-modifications