enrichment steps #38

Merged
claudio.atzori merged 334 commits from miriam.baglioni/dnet-hadoop:master into enrichment_wfs 2020-08-11 16:40:26 +02:00
3 changed files with 35 additions and 29 deletions
Showing only changes of commit 6933ec11fb - Show all commits

View File

@ -7,6 +7,8 @@ import java.io.Serializable;
import java.util.*;
import java.util.function.Supplier;
import javax.annotation.Nullable;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
@ -21,6 +23,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Predicate;
import com.google.common.base.Splitter;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Iterables;
@ -142,7 +145,7 @@ public class PrepareRelationsJob {
.repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
.groupBy(Tuple2::_1)
.map(Tuple2::_2)
.map(t -> Iterables.limit(t, maxRelations))
.map(t -> Iterables.filter(t, input -> input._1().getSubRelType().equals("outcome")))
.flatMap(Iterable::iterator)
.map(Tuple2::_2)
@ -151,7 +154,8 @@ public class PrepareRelationsJob {
.repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
.groupBy(Tuple2::_1)
.map(Tuple2::_2)
.map(t -> Iterables.limit(t, maxRelations))
.map(t -> Iterables.filter(t, input -> input._1().getSubRelType().equals("outcome")))
// .map(t -> Iterables.limit(t, maxRelations))
.flatMap(Iterable::iterator)
.map(Tuple2::_2)
.rdd();

View File

@ -62,8 +62,9 @@ public class SortableRelationKey implements Comparable<SortableRelationKey>, Ser
public int compareTo(SortableRelationKey o) {
return ComparisonChain
.start()
.compare(getGroupingKey(), o.getGroupingKey())
.compare(getWeight(this), getWeight(o))
.result() * -1;
.result();
}
private Integer getWeight(SortableRelationKey o) {

View File

@ -1,14 +1,18 @@
package eu.dnetlib.dhp.oa.provision;
import java.io.IOException;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey;
import eu.dnetlib.dhp.schema.oaf.Relation;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.List;
public class SortableRelationKeyTest {
@ -16,10 +20,13 @@ public class SortableRelationKeyTest {
public void doTesSorting() throws IOException {
final ObjectMapper mapper = new ObjectMapper();
final String json = IOUtils.toString(this.getClass().getResourceAsStream("relations.json"));
final List<Relation> relations = mapper.readValue(json, new TypeReference<List<Relation>>() { });
final List<Relation> relations = mapper.readValue(json, new TypeReference<List<Relation>>() {
});
relations.stream().map(r -> SortableRelationKey.create(r, r.getSource())).sorted()
relations
.stream()
.map(r -> SortableRelationKey.create(r, r.getSource()))
.sorted()
.forEach(
it -> {
@ -30,12 +37,6 @@ public class SortableRelationKeyTest {
}
});
}
}