forked from D-Net/dnet-hadoop
WIP: prepare relation job
This commit is contained in:
parent
a6c0faac70
commit
6933ec11fb
|
@ -7,6 +7,8 @@ import java.io.Serializable;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.function.Supplier;
|
import java.util.function.Supplier;
|
||||||
|
|
||||||
|
import javax.annotation.Nullable;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
@ -21,6 +23,7 @@ import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import com.google.common.base.Predicate;
|
||||||
import com.google.common.base.Splitter;
|
import com.google.common.base.Splitter;
|
||||||
import com.google.common.collect.ComparisonChain;
|
import com.google.common.collect.ComparisonChain;
|
||||||
import com.google.common.collect.Iterables;
|
import com.google.common.collect.Iterables;
|
||||||
|
@ -142,7 +145,7 @@ public class PrepareRelationsJob {
|
||||||
.repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
|
.repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
|
||||||
.groupBy(Tuple2::_1)
|
.groupBy(Tuple2::_1)
|
||||||
.map(Tuple2::_2)
|
.map(Tuple2::_2)
|
||||||
.map(t -> Iterables.limit(t, maxRelations))
|
.map(t -> Iterables.filter(t, input -> input._1().getSubRelType().equals("outcome")))
|
||||||
.flatMap(Iterable::iterator)
|
.flatMap(Iterable::iterator)
|
||||||
.map(Tuple2::_2)
|
.map(Tuple2::_2)
|
||||||
|
|
||||||
|
@ -151,7 +154,8 @@ public class PrepareRelationsJob {
|
||||||
.repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
|
.repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
|
||||||
.groupBy(Tuple2::_1)
|
.groupBy(Tuple2::_1)
|
||||||
.map(Tuple2::_2)
|
.map(Tuple2::_2)
|
||||||
.map(t -> Iterables.limit(t, maxRelations))
|
.map(t -> Iterables.filter(t, input -> input._1().getSubRelType().equals("outcome")))
|
||||||
|
// .map(t -> Iterables.limit(t, maxRelations))
|
||||||
.flatMap(Iterable::iterator)
|
.flatMap(Iterable::iterator)
|
||||||
.map(Tuple2::_2)
|
.map(Tuple2::_2)
|
||||||
.rdd();
|
.rdd();
|
||||||
|
|
|
@ -62,8 +62,9 @@ public class SortableRelationKey implements Comparable<SortableRelationKey>, Ser
|
||||||
public int compareTo(SortableRelationKey o) {
|
public int compareTo(SortableRelationKey o) {
|
||||||
return ComparisonChain
|
return ComparisonChain
|
||||||
.start()
|
.start()
|
||||||
|
.compare(getGroupingKey(), o.getGroupingKey())
|
||||||
.compare(getWeight(this), getWeight(o))
|
.compare(getWeight(this), getWeight(o))
|
||||||
.result() * -1;
|
.result();
|
||||||
}
|
}
|
||||||
|
|
||||||
private Integer getWeight(SortableRelationKey o) {
|
private Integer getWeight(SortableRelationKey o) {
|
||||||
|
|
|
@ -1,41 +1,42 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.provision;
|
package eu.dnetlib.dhp.oa.provision;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||||
import com.fasterxml.jackson.core.type.TypeReference;
|
import com.fasterxml.jackson.core.type.TypeReference;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey;
|
import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
public class SortableRelationKeyTest {
|
public class SortableRelationKeyTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void doTesSorting() throws IOException {
|
public void doTesSorting() throws IOException {
|
||||||
final ObjectMapper mapper = new ObjectMapper();
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
final String json = IOUtils.toString(this.getClass().getResourceAsStream("relations.json"));
|
final String json = IOUtils.toString(this.getClass().getResourceAsStream("relations.json"));
|
||||||
final List<Relation> relations = mapper.readValue(json, new TypeReference<List<Relation>>() { });
|
final List<Relation> relations = mapper.readValue(json, new TypeReference<List<Relation>>() {
|
||||||
|
});
|
||||||
|
|
||||||
|
relations
|
||||||
|
.stream()
|
||||||
|
.map(r -> SortableRelationKey.create(r, r.getSource()))
|
||||||
|
.sorted()
|
||||||
|
.forEach(
|
||||||
|
|
||||||
relations.stream().map(r -> SortableRelationKey.create(r, r.getSource())).sorted()
|
it -> {
|
||||||
.forEach(
|
try {
|
||||||
|
System.out.println(mapper.writeValueAsString(it));
|
||||||
it -> {
|
} catch (JsonProcessingException e) {
|
||||||
try {
|
e.printStackTrace();
|
||||||
System.out.println(mapper.writeValueAsString(it));
|
}
|
||||||
} catch (JsonProcessingException e) {
|
});
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue