Update of the Spark test

commit 1f0eeaf7ab
parent 951313eeb1
@@ -41,6 +41,7 @@
             <!--</exclusions>-->
         <!--</dependency>-->
+
 
         <dependency>
             <groupId>eu.dnetlib</groupId>
             <artifactId>dnet-openaire-data-protos</artifactId>
@@ -0,0 +1,73 @@
+package eu.dnetlib;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.pace.model.MapDocument;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+public class ConnectedComponent implements Serializable {
+
+    private Set<MapDocument> docs;
+    private String id;
+
+    public ConnectedComponent() {
+    }
+
+    public ConnectedComponent(String id, Set<MapDocument> docs) {
+        this.id = id;
+        this.docs = docs;
+    }
+
+    public Set<MapDocument> getDocs() {
+        return docs;
+    }
+
+    public void setDocs(Set<MapDocument> docs) {
+        this.docs = docs;
+    }
+
+    public String getId() {
+        return id;
+    }
+
+    public void setId(String id) {
+        this.id = id;
+    }
+
+    public void initializeID() {
+        if (docs.size() > 1) {
+            String ccID = getMin(docs.stream().map(doc -> doc.getIdentifier()).collect(Collectors.toList()));
+            String prefix = ccID.split("\\|")[0];
+            String id = ccID.split("::")[1];
+            this.id = prefix + "|dedup_______::" + id;
+        } else {
+            this.id = docs.iterator().next().getIdentifier();
+        }
+    }
+
+    public String getMin(List<String> ids){
+
+        String min = ids.get(0);
+        for(String id: ids)
+            if (min.compareTo(id) > 0) {
+                min = id;
+            }
+
+        return min;
+    }
+
+    @Override
+    public String toString(){
+        ObjectMapper mapper = new ObjectMapper();
+        try {
+            return mapper.writeValueAsString(this);
+        } catch (IOException e) {
+            return null;
+        }
+    }
+}
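For context on the new ConnectedComponent class above: initializeID mints the representative identifier of a multi-document component from the lexicographically smallest document identifier. A minimal sketch of that transformation in plain Java, using made-up identifiers (the values and class name below are illustrative, not taken from the commit):

// Illustrative only: mirrors the string handling in ConnectedComponent.initializeID.
import java.util.Arrays;
import java.util.List;

public class DedupIdExample {
    public static void main(String[] args) {
        // Hypothetical OpenAIRE-style identifiers belonging to one component.
        List<String> ids = Arrays.asList("20|openorgs____::b2c3", "20|grid________::a1b2");

        // getMin picks the lexicographic minimum of the identifiers.
        String min = ids.stream().min(String::compareTo).get();   // "20|grid________::a1b2"

        // initializeID keeps the prefix before '|' and the suffix after "::"
        // and swaps the namespace for "dedup_______".
        String prefix = min.split("\\|")[0];                      // "20"
        String suffix = min.split("::")[1];                       // "a1b2"
        System.out.println(prefix + "|dedup_______::" + suffix);  // "20|dedup_______::a1b2"
    }
}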
@@ -1,5 +1,6 @@
 package eu.dnetlib;
 
+import com.google.common.collect.Iterables;
 import com.google.common.collect.Sets;
 import eu.dnetlib.graph.GraphProcessor;
 import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
@@ -18,9 +19,11 @@ import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.graphx.Edge;
 import org.apache.spark.rdd.RDD;
 import scala.Tuple2;
+import scala.collection.Iterable;
 
 import java.io.IOException;
 import java.io.StringWriter;
+import java.util.Iterator;
 import java.util.Set;
 import java.util.stream.Collectors;
 
@@ -30,12 +33,12 @@ public class SparkTest {
     private static final Log log = LogFactory.getLog(SparkTest.class);
 
     public static void main(String[] args) {
-        final JavaSparkContext context = new JavaSparkContext(new SparkConf().setAppName("Hello World").setMaster("local[*]"));
-        final JavaRDD<String> dataRDD = context.textFile("file:///Users/sandro/Downloads/software.json");
+        final JavaSparkContext context = new JavaSparkContext(new SparkConf().setAppName("Deduplication").setMaster("local[*]"));
+        final JavaRDD<String> dataRDD = context.textFile("file:///Users/miconis/Downloads/dumps/organizations_sample.json");
 
         counter = new SparkCounter(context);
 
-        final DedupConfig config = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/software.pace.conf"));
+        final DedupConfig config = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
         BlockProcessor.constructAccumulator(config);
 
         BlockProcessor.accumulators.forEach(acc -> {
@@ -45,61 +48,47 @@ public class SparkTest {
 
         });
 
+        //create vertexes of the graph: <ID, MapDocument>
         JavaPairRDD<String, MapDocument> mapDocs = dataRDD.mapToPair(it -> {
             MapDocument mapDocument = PaceUtils.asMapDocument(config, it);
             return new Tuple2<>(mapDocument.getIdentifier(), mapDocument);
         });
+        RDD<Tuple2<Object, MapDocument>> vertexes = mapDocs.mapToPair(t -> new Tuple2<Object, MapDocument>( (long) t._1().hashCode(), t._2())).rdd();
 
-        final JavaPairRDD<String, String> relationRDD = mapDocs.reduceByKey((a, b) -> a)
+        //create relations between documents
+        final JavaPairRDD<String, String> relationRDD = mapDocs.reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id
+                //from <id, doc> to List<groupkey,doc>
                 .flatMapToPair(a -> {
                     final MapDocument currentDocument = a._2();
                     return getGroupingKeys(config, currentDocument).stream()
                             .map(it -> new Tuple2<>(it, currentDocument)).collect(Collectors.toList()).iterator();
-                }).groupByKey().flatMapToPair(it -> {
+                }).groupByKey() //group documents basing on the key
+                //create relations by comparing only elements in the same group
+                .flatMapToPair(it -> {
                     final SparkReporter reporter = new SparkReporter(counter);
                     new BlockProcessor(config).process(it._1(), it._2(), reporter);
                     return reporter.getReport().iterator();
                 });
 
-        RDD<Tuple2<Object, String>> vertexes = relationRDD.groupByKey().map(it -> {
-
-            Long id = (long) it._1().hashCode();
-            return new Tuple2<Object, String>(id, it._1());
-
-        }).rdd();
-
-        final RDD<Edge<String>> edgeRdd = relationRDD.map(it -> new Edge<>(it._1().hashCode(), it._2().hashCode(), "similarTo")).rdd();
-
-        Tuple2<Object, RDD<String>> cc = GraphProcessor.findCCs(vertexes, edgeRdd, 20);
-
-        final Long total = (Long) cc._1();
-
-        final JavaRDD<String> map = mapDocs.map(Tuple2::_1);
-
-        final JavaRDD<String> duplicatesRDD = cc._2().toJavaRDD();
-
-        final JavaRDD<String> nonDuplicates = map.subtract(duplicatesRDD);
-
-        relationRDD.collect().forEach(it-> System.out.println(it._1()+"<--->"+it._2()));
-
-        System.out.println("Non duplicates: "+ nonDuplicates.count());
-        System.out.println("Connected Components: "+ total);
+        final RDD<Edge<String>> edgeRdd = relationRDD.map(it -> new Edge<>(it._1().hashCode(),it._2().hashCode(), "similarTo")).rdd();
+
+        JavaRDD<ConnectedComponent> ccs = GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD();
+
+        final JavaRDD<ConnectedComponent> connectedComponents = ccs.filter(cc -> cc.getDocs().size()>1);
+        final JavaRDD<ConnectedComponent> nonDeduplicated = ccs.filter(cc -> cc.getDocs().size()==1);
+
+        System.out.println("Non duplicates: " + nonDeduplicated.count());
+        System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count());
+        System.out.println("Connected Components: " + connectedComponents.count());
 
         counter.getAccumulators().values().forEach(it-> System.out.println(it.getGroup()+" "+it.getName()+" -->"+it.value()));
 
+        //print ids
+        // ccs.foreach(cc -> System.out.println(cc.getId()));
+        ccs.saveAsTextFile("file:///Users/miconis/Downloads/dumps/organizations_dedup");
 
     }
 
     static String readFromClasspath(final String filename) {
         final StringWriter sw = new StringWriter();
         try {
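The rewritten pipeline above hashes every document identifier to a long to obtain a GraphX vertex id, emits a "similarTo" edge for each pair reported by the BlockProcessor, and hands both RDDs to GraphProcessor.findCCs, which groups the MapDocuments by connected component. A minimal sketch of that vertex/edge id convention in plain Java (the identifiers are hypothetical and the class is not part of the commit):

// Illustrative only: shows how document identifiers become GraphX vertex ids above.
public class VertexIdExample {
    public static void main(String[] args) {
        String docA = "20|grid________::a1b2";
        String docB = "20|openorgs____::b2c3";

        // SparkTest uses (long) identifier.hashCode() as the VertexId, both when
        // building the vertex RDD and when turning a similarity pair into an edge.
        long vertexA = (long) docA.hashCode();
        long vertexB = (long) docB.hashCode();

        // A pair emitted by the BlockProcessor becomes Edge(vertexA, vertexB, "similarTo");
        // connectedComponents() then labels both vertices with the same component id,
        // and findCCs collects their MapDocuments into one ConnectedComponent.
        System.out.println(vertexA + " -similarTo-> " + vertexB);
    }
}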
@@ -1,24 +1,43 @@
 package eu.dnetlib.graph
+import java.lang
 
+import eu.dnetlib.ConnectedComponent
+import eu.dnetlib.pace.model.MapDocument
 import org.apache.spark.graphx._
 import org.apache.spark.rdd.RDD
+
+import scala.collection.JavaConversions
 ;
 
 
 object GraphProcessor {
 
-  def findCCs(vertexes: RDD[(VertexId,String)], edges:RDD[Edge[String]], maxIterations: Int): (Long, RDD[String]) = {
-    val graph: Graph[String, String] = Graph(vertexes, edges)
+  def findCCs(vertexes: RDD[(VertexId,MapDocument)], edges:RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
+    val graph: Graph[MapDocument, String] = Graph(vertexes, edges)
     val cc = graph.connectedComponents(maxIterations).vertices
 
-    val totalCC =cc.map{
-      case (openaireId, ccId) =>ccId
-    }.distinct().count()
-
-    val connectedComponents: RDD[String] = vertexes.join(cc).map {
-      case (id, (openaireId, ccId)) => openaireId
-    }.distinct()
-    (totalCC, connectedComponents)
+    val joinResult = vertexes.leftOuterJoin(cc).map {
+      case (id, (openaireId, cc)) => {
+        if (cc.isEmpty){
+          (id, openaireId)
+        }
+        else {
+          (cc.get, openaireId)
+        }
+      }
+    }
+
+    val connectedComponents = joinResult.groupByKey().map[ConnectedComponent](cc => asConnectedComponent(cc))
+
+    (connectedComponents)
   }
 
+  def asConnectedComponent(group: (VertexId, Iterable[MapDocument])) : ConnectedComponent = {
+    val docs = group._2.toSet[MapDocument]
+    val connectedComponent = new ConnectedComponent("empty", JavaConversions.setAsJavaSet[MapDocument](docs));
+    connectedComponent.initializeID();
+    connectedComponent
+  }
+
 }
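In the new findCCs above, the left outer join keeps vertices that received no component label (documents that never matched anything): they fall back to their own vertex id as grouping key, so they surface as one-document components that SparkTest later counts as non-duplicates. A sketch of that fallback in plain Java, with Optional standing in for Scala's Option (the class and values are illustrative, not part of the commit):

import java.util.Optional;

public class GroupingKeyExample {
    // Mirrors the pattern match in findCCs: no component label -> group the document
    // under its own vertex id; otherwise group it under the component label.
    static long groupingKey(long vertexId, Optional<Long> componentLabel) {
        return componentLabel.orElse(vertexId);
    }

    public static void main(String[] args) {
        System.out.println(groupingKey(42L, Optional.empty()));  // 42: singleton component
        System.out.println(groupingKey(42L, Optional.of(7L)));   // 7: part of component 7
    }
}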
@@ -33,8 +33,6 @@ public class SparkReporter implements Reporter {
 
     @Override
     public void emit(String type, String from, String to) {
-
-
         report.add(new Tuple2<>(from, to));
     }
 
@@ -50,6 +50,7 @@ public class ScoreResult {
     @Override
     public String toString() {
         final GsonBuilder b = new GsonBuilder();
+        b.serializeSpecialFloatingPointValues();
        return b.setPrettyPrinting().create().toJson(this);
     }
 }