enrichment steps #38

Merged
claudio.atzori merged 334 commits from miriam.baglioni/dnet-hadoop:master into enrichment_wfs 2020-08-11 16:40:26 +02:00
3 changed files with 62 additions and 65 deletions
Showing only changes of commit 69b0391708 - Show all commits

View File

@ -11,6 +11,8 @@ import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.rdd.RDD; import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SaveMode;
@ -30,6 +32,7 @@ import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey; import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey;
import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner; import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
/** /**
* Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The
@ -63,21 +66,6 @@ public class PrepareRelationsJob {
public static final int DEFAULT_NUM_PARTITIONS = 3000; public static final int DEFAULT_NUM_PARTITIONS = 3000;
/**
 * Weight of each known relation sub-type, looked up by the value of {@code getSubRelType()}.
 * The weight of a sub-type is its position in the ordered list used to fill the map.
 */
private static final Map<String, Integer> weights = Maps.newHashMap();

static {
    // Ascending weight order: the array index of a name is the weight stored for it.
    final String[] rankedSubRelTypes = {
        "outcome",
        "supplement",
        "affiliation",
        "relationship",
        "publicationDataset",
        "similarity",
        "provision",
        "participation",
        "dedup"
    };
    for (int rank = 0; rank < rankedSubRelTypes.length; rank++) {
        weights.put(rankedSubRelTypes[rank], rank);
    }
}
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils String jsonConfiguration = IOUtils
.toString( .toString(
@ -146,21 +134,26 @@ public class PrepareRelationsJob {
int relPartitions) { int relPartitions) {
RDD<Relation> cappedRels = readPathRelationRDD(spark, inputRelationsPath) RDD<Relation> cappedRels = readPathRelationRDD(spark, inputRelationsPath)
.repartition(relPartitions) .filter(rel -> rel.getDataInfo().getDeletedbyinference() == false)
.filter(rel -> !rel.getDataInfo().getDeletedbyinference()) .filter(rel -> relationFilter.contains(rel.getRelClass()) == false)
.filter(rel -> !relationFilter.contains(rel.getRelClass()))
// group by SOURCE and apply limit // group by SOURCE and apply limit
.groupBy(r -> SortableRelationKey.create(r, r.getSource())) .mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, r.getSource()), r))
.repartitionAndSortWithinPartitions( .repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
new RelationPartitioner(relPartitions), .groupBy(Tuple2::_1)
(SerializableComparator<SortableRelationKey>) (o1, o2) -> compare(o1, o2)) .map(Tuple2::_2)
.flatMap(t -> Iterables.limit(t._2(), maxRelations).iterator()) .map(t -> Iterables.limit(t, maxRelations))
.flatMap(Iterable::iterator)
.map(Tuple2::_2)
// group by TARGET and apply limit // group by TARGET and apply limit
.groupBy(r -> SortableRelationKey.create(r, r.getTarget())) .mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, r.getTarget()), r))
.repartitionAndSortWithinPartitions( .repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
new RelationPartitioner(relPartitions), .groupBy(Tuple2::_1)
(SerializableComparator<SortableRelationKey>) (o1, o2) -> compare(o1, o2)) .map(Tuple2::_2)
.flatMap(t -> Iterables.limit(t._2(), maxRelations).iterator()) .map(t -> Iterables.limit(t, maxRelations))
.flatMap(Iterable::iterator)
.map(Tuple2::_2)
.rdd(); .rdd();
spark spark
@ -170,24 +163,6 @@ public class PrepareRelationsJob {
.parquet(outputPath); .parquet(outputPath);
} }
/**
 * Orders two relation keys by the weight of their sub-relation type, then by source, then by target.
 * A sub-relation type that has no entry in {@code weights} is given {@link Integer#MAX_VALUE}.
 *
 * @param o1 the left-hand key
 * @param o2 the right-hand key
 * @return -1, 0 or 1 when {@code o1} sorts before, equal to, or after {@code o2}
 */
private static int compare(SortableRelationKey o1, SortableRelationKey o2) {
    Integer leftWeight = weights.get(o1.getSubRelType());
    if (leftWeight == null) {
        leftWeight = Integer.MAX_VALUE;
    }
    Integer rightWeight = weights.get(o2.getSubRelType());
    if (rightWeight == null) {
        rightWeight = Integer.MAX_VALUE;
    }

    // Integer.compareTo already yields -1/0/1.
    final int byWeight = leftWeight.compareTo(rightWeight);
    if (byWeight != 0) {
        return byWeight;
    }

    // String.compareTo may return any magnitude; normalise it to -1/0/1.
    final int bySource = o1.getSource().compareTo(o2.getSource());
    if (bySource != 0) {
        return Integer.signum(bySource);
    }

    return Integer.signum(o1.getTarget().compareTo(o2.getTarget()));
}
/**
 * A {@link Comparator} that is also {@link Serializable}, so that a lambda or method reference
 * targeted at this type can be serialized.
 *
 * @param <T> the type of objects compared
 */
@FunctionalInterface
public interface SerializableComparator<T> extends Comparator<T>, Serializable {
    // The single abstract method, compare(T, T), is inherited unchanged from Comparator.
}
/** /**
* Reads a JavaRDD of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text * Reads a JavaRDD of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text
* file, * file,

View File

@ -11,25 +11,34 @@ import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
public class SortableRelationKey implements Serializable { public class SortableRelationKey implements Comparable<SortableRelationKey>, Serializable {
/**
 * Weight of each known relation sub-type, looked up by the value of {@code getSubRelType()}.
 * The weight of a sub-type is its position in the ordered list used to fill the map.
 */
private static final Map<String, Integer> weights = Maps.newHashMap();

static {
    // Ascending weight order: the array index of a name is the weight stored for it.
    final String[] rankedSubRelTypes = {
        "outcome",
        "supplement",
        "review",
        "citation",
        "affiliation",
        "relationship",
        "publicationDataset",
        "similarity",
        "provision",
        "participation",
        "dedup"
    };
    for (int rank = 0; rank < rankedSubRelTypes.length; rank++) {
        weights.put(rankedSubRelTypes[rank], rank);
    }
}
private static final long serialVersionUID = 3232323;
private String groupingKey; private String groupingKey;
private String source;
private String target;
private String subRelType; private String subRelType;
/**
 * @return the value of the {@code source} field of this key
 */
public String getSource() {
    return this.source;
}
public static SortableRelationKey create(Relation r, String groupingKey) { public static SortableRelationKey create(Relation r, String groupingKey) {
SortableRelationKey sr = new SortableRelationKey(); SortableRelationKey sr = new SortableRelationKey();
sr.setGroupingKey(groupingKey); sr.setGroupingKey(groupingKey);
sr.setSource(r.getSource());
sr.setTarget(r.getTarget());
sr.setSubRelType(r.getSubRelType()); sr.setSubRelType(r.getSubRelType());
return sr; return sr;
} }
@ -49,16 +58,16 @@ public class SortableRelationKey implements Serializable {
return Objects.hashCode(getGroupingKey()); return Objects.hashCode(getGroupingKey());
} }
public void setSource(String source) { @Override
this.source = source; public int compareTo(SortableRelationKey o) {
return ComparisonChain
.start()
.compare(getWeight(this), getWeight(o))
.result() * -1;
} }
public String getTarget() { private Integer getWeight(SortableRelationKey o) {
return target; return Optional.ofNullable(weights.get(o.getSubRelType())).orElse(Integer.MAX_VALUE);
}
public void setTarget(String target) {
this.target = target;
} }
public String getSubRelType() { public String getSubRelType() {
@ -76,4 +85,5 @@ public class SortableRelationKey implements Serializable {
public void setGroupingKey(String groupingKey) { public void setGroupingKey(String groupingKey) {
this.groupingKey = groupingKey; this.groupingKey = groupingKey;
} }
} }

View File

@ -12,6 +12,8 @@ import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey;
*/ */
public class RelationPartitioner extends Partitioner { public class RelationPartitioner extends Partitioner {
private static final long serialVersionUID = 343434456L;
private final int numPartitions; private final int numPartitions;
public RelationPartitioner(int numPartitions) { public RelationPartitioner(int numPartitions) {
@ -29,4 +31,14 @@ public class RelationPartitioner extends Partitioner {
return Utils.nonNegativeMod(partitionKey.getGroupingKey().hashCode(), numPartitions()); return Utils.nonNegativeMod(partitionKey.getGroupingKey().hashCode(), numPartitions());
} }
/**
 * Two {@code RelationPartitioner}s are equal when they are configured with the same number of
 * partitions.
 *
 * @param obj the object to compare with
 * @return {@code true} if {@code obj} is a {@code RelationPartitioner} with the same
 *         {@code numPartitions()}, {@code false} otherwise (including for {@code null})
 */
@Override
public boolean equals(Object obj) {
    if (!(obj instanceof RelationPartitioner)) {
        return false;
    }
    return ((RelationPartitioner) obj).numPartitions() == numPartitions();
}

/**
 * Hash code consistent with {@link #equals(Object)}: equal partitioners (same partition count)
 * always produce the same hash, as required by the {@link Object#hashCode()} contract.
 *
 * @return a hash derived only from {@code numPartitions()}
 */
@Override
public int hashCode() {
    return Integer.hashCode(numPartitions());
}
} }