WIP: prepare relation job

This commit is contained in:
Claudio Atzori 2020-06-25 11:04:12 +02:00
parent a6c0faac70
commit 6933ec11fb
3 changed files with 35 additions and 29 deletions

View File

@ -7,6 +7,8 @@ import java.io.Serializable;
import java.util.*; import java.util.*;
import java.util.function.Supplier; import java.util.function.Supplier;
import javax.annotation.Nullable;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
@ -21,6 +23,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Predicate;
import com.google.common.base.Splitter; import com.google.common.base.Splitter;
import com.google.common.collect.ComparisonChain; import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Iterables; import com.google.common.collect.Iterables;
@ -142,7 +145,7 @@ public class PrepareRelationsJob {
.repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions)) .repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
.groupBy(Tuple2::_1) .groupBy(Tuple2::_1)
.map(Tuple2::_2) .map(Tuple2::_2)
.map(t -> Iterables.limit(t, maxRelations)) .map(t -> Iterables.filter(t, input -> input._1().getSubRelType().equals("outcome")))
.flatMap(Iterable::iterator) .flatMap(Iterable::iterator)
.map(Tuple2::_2) .map(Tuple2::_2)
@ -151,7 +154,8 @@ public class PrepareRelationsJob {
.repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions)) .repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
.groupBy(Tuple2::_1) .groupBy(Tuple2::_1)
.map(Tuple2::_2) .map(Tuple2::_2)
.map(t -> Iterables.limit(t, maxRelations)) .map(t -> Iterables.filter(t, input -> input._1().getSubRelType().equals("outcome")))
// .map(t -> Iterables.limit(t, maxRelations))
.flatMap(Iterable::iterator) .flatMap(Iterable::iterator)
.map(Tuple2::_2) .map(Tuple2::_2)
.rdd(); .rdd();

View File

@ -62,8 +62,9 @@ public class SortableRelationKey implements Comparable<SortableRelationKey>, Ser
public int compareTo(SortableRelationKey o) { public int compareTo(SortableRelationKey o) {
return ComparisonChain return ComparisonChain
.start() .start()
.compare(getGroupingKey(), o.getGroupingKey())
.compare(getWeight(this), getWeight(o)) .compare(getWeight(this), getWeight(o))
.result() * -1; .result();
} }
private Integer getWeight(SortableRelationKey o) { private Integer getWeight(SortableRelationKey o) {

View File

@ -1,41 +1,42 @@
package eu.dnetlib.dhp.oa.provision; package eu.dnetlib.dhp.oa.provision;
import java.io.IOException;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey; import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.List;
public class SortableRelationKeyTest { public class SortableRelationKeyTest {
@Test @Test
public void doTesSorting() throws IOException { public void doTesSorting() throws IOException {
final ObjectMapper mapper = new ObjectMapper(); final ObjectMapper mapper = new ObjectMapper();
final String json = IOUtils.toString(this.getClass().getResourceAsStream("relations.json")); final String json = IOUtils.toString(this.getClass().getResourceAsStream("relations.json"));
final List<Relation> relations = mapper.readValue(json, new TypeReference<List<Relation>>() { }); final List<Relation> relations = mapper.readValue(json, new TypeReference<List<Relation>>() {
});
relations
.stream()
.map(r -> SortableRelationKey.create(r, r.getSource()))
.sorted()
.forEach(
relations.stream().map(r -> SortableRelationKey.create(r, r.getSource())).sorted() it -> {
.forEach( try {
System.out.println(mapper.writeValueAsString(it));
it -> { } catch (JsonProcessingException e) {
try { e.printStackTrace();
System.out.println(mapper.writeValueAsString(it)); }
} catch (JsonProcessingException e) { });
e.printStackTrace();
}
});
}
}
} }