WIP: prepare relation job
This commit is contained in:
parent
46e76affeb
commit
69b0391708
|
@ -11,6 +11,8 @@ import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.Function;
|
||||||
|
import org.apache.spark.api.java.function.PairFunction;
|
||||||
import org.apache.spark.rdd.RDD;
|
import org.apache.spark.rdd.RDD;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.SaveMode;
|
import org.apache.spark.sql.SaveMode;
|
||||||
|
@ -30,6 +32,7 @@ import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey;
|
import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey;
|
||||||
import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner;
|
import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The
|
* Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The
|
||||||
|
@ -63,21 +66,6 @@ public class PrepareRelationsJob {
|
||||||
|
|
||||||
public static final int DEFAULT_NUM_PARTITIONS = 3000;
|
public static final int DEFAULT_NUM_PARTITIONS = 3000;
|
||||||
|
|
||||||
private static final Map<String, Integer> weights = Maps.newHashMap();
|
|
||||||
|
|
||||||
static {
|
|
||||||
weights.put("outcome", 0);
|
|
||||||
weights.put("supplement", 1);
|
|
||||||
weights.put("affiliation", 2);
|
|
||||||
weights.put("relationship", 3);
|
|
||||||
weights.put("publicationDataset", 4);
|
|
||||||
weights.put("similarity", 5);
|
|
||||||
|
|
||||||
weights.put("provision", 6);
|
|
||||||
weights.put("participation", 7);
|
|
||||||
weights.put("dedup", 8);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
String jsonConfiguration = IOUtils
|
String jsonConfiguration = IOUtils
|
||||||
.toString(
|
.toString(
|
||||||
|
@ -146,21 +134,26 @@ public class PrepareRelationsJob {
|
||||||
int relPartitions) {
|
int relPartitions) {
|
||||||
|
|
||||||
RDD<Relation> cappedRels = readPathRelationRDD(spark, inputRelationsPath)
|
RDD<Relation> cappedRels = readPathRelationRDD(spark, inputRelationsPath)
|
||||||
.repartition(relPartitions)
|
.filter(rel -> rel.getDataInfo().getDeletedbyinference() == false)
|
||||||
.filter(rel -> !rel.getDataInfo().getDeletedbyinference())
|
.filter(rel -> relationFilter.contains(rel.getRelClass()) == false)
|
||||||
.filter(rel -> !relationFilter.contains(rel.getRelClass()))
|
|
||||||
// group by SOURCE and apply limit
|
// group by SOURCE and apply limit
|
||||||
.groupBy(r -> SortableRelationKey.create(r, r.getSource()))
|
.mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, r.getSource()), r))
|
||||||
.repartitionAndSortWithinPartitions(
|
.repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
|
||||||
new RelationPartitioner(relPartitions),
|
.groupBy(Tuple2::_1)
|
||||||
(SerializableComparator<SortableRelationKey>) (o1, o2) -> compare(o1, o2))
|
.map(Tuple2::_2)
|
||||||
.flatMap(t -> Iterables.limit(t._2(), maxRelations).iterator())
|
.map(t -> Iterables.limit(t, maxRelations))
|
||||||
|
.flatMap(Iterable::iterator)
|
||||||
|
.map(Tuple2::_2)
|
||||||
|
|
||||||
// group by TARGET and apply limit
|
// group by TARGET and apply limit
|
||||||
.groupBy(r -> SortableRelationKey.create(r, r.getTarget()))
|
.mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, r.getTarget()), r))
|
||||||
.repartitionAndSortWithinPartitions(
|
.repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
|
||||||
new RelationPartitioner(relPartitions),
|
.groupBy(Tuple2::_1)
|
||||||
(SerializableComparator<SortableRelationKey>) (o1, o2) -> compare(o1, o2))
|
.map(Tuple2::_2)
|
||||||
.flatMap(t -> Iterables.limit(t._2(), maxRelations).iterator())
|
.map(t -> Iterables.limit(t, maxRelations))
|
||||||
|
.flatMap(Iterable::iterator)
|
||||||
|
.map(Tuple2::_2)
|
||||||
.rdd();
|
.rdd();
|
||||||
|
|
||||||
spark
|
spark
|
||||||
|
@ -170,24 +163,6 @@ public class PrepareRelationsJob {
|
||||||
.parquet(outputPath);
|
.parquet(outputPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static int compare(SortableRelationKey o1, SortableRelationKey o2) {
|
|
||||||
final Integer w1 = Optional.ofNullable(weights.get(o1.getSubRelType())).orElse(Integer.MAX_VALUE);
|
|
||||||
final Integer w2 = Optional.ofNullable(weights.get(o2.getSubRelType())).orElse(Integer.MAX_VALUE);
|
|
||||||
return ComparisonChain
|
|
||||||
.start()
|
|
||||||
.compare(w1, w2)
|
|
||||||
.compare(o1.getSource(), o2.getSource())
|
|
||||||
.compare(o1.getTarget(), o2.getTarget())
|
|
||||||
.result();
|
|
||||||
}
|
|
||||||
|
|
||||||
@FunctionalInterface
|
|
||||||
public interface SerializableComparator<T> extends Comparator<T>, Serializable {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
int compare(T o1, T o2);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reads a JavaRDD of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text
|
* Reads a JavaRDD of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text
|
||||||
* file,
|
* file,
|
||||||
|
|
|
@ -11,25 +11,34 @@ import com.google.common.collect.Maps;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
|
||||||
public class SortableRelationKey implements Serializable {
|
public class SortableRelationKey implements Comparable<SortableRelationKey>, Serializable {
|
||||||
|
|
||||||
|
/**
 * Weight associated with each known subRelType. The weight is the position of the name in the list below
 * (0 for "outcome" up to 10 for "dedup"); it is looked up when two keys are compared.
 */
private static final Map<String, Integer> weights = Maps.newHashMap();

static {
    // The array order defines the weights: index i is stored as the weight of the i-th name.
    final String[] subRelTypesByWeight = {
        "outcome",
        "supplement",
        "review",
        "citation",
        "affiliation",
        "relationship",
        "publicationDataset",
        "similarity",
        "provision",
        "participation",
        "dedup"
    };
    for (int weight = 0; weight < subRelTypesByWeight.length; weight++) {
        weights.put(subRelTypesByWeight[weight], weight);
    }
}
|
||||||
|
|
||||||
|
private static final long serialVersionUID = 3232323;
|
||||||
|
|
||||||
private String groupingKey;
|
private String groupingKey;
|
||||||
|
|
||||||
private String source;
|
|
||||||
|
|
||||||
private String target;
|
|
||||||
|
|
||||||
private String subRelType;
|
private String subRelType;
|
||||||
|
|
||||||
public String getSource() {
|
|
||||||
return source;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static SortableRelationKey create(Relation r, String groupingKey) {
|
public static SortableRelationKey create(Relation r, String groupingKey) {
|
||||||
SortableRelationKey sr = new SortableRelationKey();
|
SortableRelationKey sr = new SortableRelationKey();
|
||||||
sr.setGroupingKey(groupingKey);
|
sr.setGroupingKey(groupingKey);
|
||||||
sr.setSource(r.getSource());
|
|
||||||
sr.setTarget(r.getTarget());
|
|
||||||
sr.setSubRelType(r.getSubRelType());
|
sr.setSubRelType(r.getSubRelType());
|
||||||
return sr;
|
return sr;
|
||||||
}
|
}
|
||||||
|
@ -49,16 +58,16 @@ public class SortableRelationKey implements Serializable {
|
||||||
return Objects.hashCode(getGroupingKey());
|
return Objects.hashCode(getGroupingKey());
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setSource(String source) {
|
@Override
|
||||||
this.source = source;
|
public int compareTo(SortableRelationKey o) {
|
||||||
|
return ComparisonChain
|
||||||
|
.start()
|
||||||
|
.compare(getWeight(this), getWeight(o))
|
||||||
|
.result() * -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getTarget() {
|
private Integer getWeight(SortableRelationKey o) {
|
||||||
return target;
|
return Optional.ofNullable(weights.get(o.getSubRelType())).orElse(Integer.MAX_VALUE);
|
||||||
}
|
|
||||||
|
|
||||||
public void setTarget(String target) {
|
|
||||||
this.target = target;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getSubRelType() {
|
public String getSubRelType() {
|
||||||
|
@ -76,4 +85,5 @@ public class SortableRelationKey implements Serializable {
|
||||||
public void setGroupingKey(String groupingKey) {
|
public void setGroupingKey(String groupingKey) {
|
||||||
this.groupingKey = groupingKey;
|
this.groupingKey = groupingKey;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -12,6 +12,8 @@ import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey;
|
||||||
*/
|
*/
|
||||||
public class RelationPartitioner extends Partitioner {
|
public class RelationPartitioner extends Partitioner {
|
||||||
|
|
||||||
|
private static final long serialVersionUID = 343434456L;
|
||||||
|
|
||||||
private final int numPartitions;
|
private final int numPartitions;
|
||||||
|
|
||||||
public RelationPartitioner(int numPartitions) {
|
public RelationPartitioner(int numPartitions) {
|
||||||
|
@ -29,4 +31,14 @@ public class RelationPartitioner extends Partitioner {
|
||||||
return Utils.nonNegativeMod(partitionKey.getGroupingKey().hashCode(), numPartitions());
|
return Utils.nonNegativeMod(partitionKey.getGroupingKey().hashCode(), numPartitions());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
if (obj instanceof RelationPartitioner) {
|
||||||
|
RelationPartitioner p = (RelationPartitioner) obj;
|
||||||
|
if (p.numPartitions() == numPartitions())
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue