forked from D-Net/dnet-hadoop
allow setting different relation cut points by source and by target; adjusted the weights assigned to relationship types
This commit is contained in:
parent ff4d6214f1
commit b21866a2da
@@ -100,11 +100,17 @@ public class PrepareRelationsJob {
 			.orElse(new HashSet<>());
 		log.info("relationFilter: {}", relationFilter);
 
-		int maxRelations = Optional
-			.ofNullable(parser.get("maxRelations"))
+		int sourceMaxRelations = Optional
+			.ofNullable(parser.get("sourceMaxRelations"))
 			.map(Integer::valueOf)
 			.orElse(MAX_RELS);
-		log.info("maxRelations: {}", maxRelations);
+		log.info("sourceMaxRelations: {}", sourceMaxRelations);
+
+		int targetMaxRelations = Optional
+			.ofNullable(parser.get("targetMaxRelations"))
+			.map(Integer::valueOf)
+			.orElse(MAX_RELS);
+		log.info("targetMaxRelations: {}", targetMaxRelations);
 
 		SparkConf conf = new SparkConf();
 		conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
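Note: both cut points fall back to the same MAX_RELS default when their argument is omitted. A minimal standalone sketch of the Optional-based parsing pattern used above; the MAX_RELS value and the sample inputs are illustrative, not taken from the patch:

    import java.util.Optional;

    class CutPointParsingSketch {

        // Illustrative default; stands in for PrepareRelationsJob.MAX_RELS, whose value is not shown in this diff.
        static final int MAX_RELS = 1_000_000;

        // Mirrors the pattern above: parse the raw argument when present, otherwise use the default.
        static int parseCutPoint(String rawValue) {
            return Optional
                .ofNullable(rawValue)
                .map(Integer::valueOf)
                .orElse(MAX_RELS);
        }

        public static void main(String[] args) {
            System.out.println(parseCutPoint("100")); // 100
            System.out.println(parseCutPoint(null));  // 1000000 (default)
        }
    }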
@@ -116,7 +122,8 @@ public class PrepareRelationsJob {
 			spark -> {
 				removeOutputDir(spark, outputPath);
 				prepareRelationsRDD(
-					spark, inputRelationsPath, outputPath, relationFilter, maxRelations, relPartitions);
+					spark, inputRelationsPath, outputPath, relationFilter, sourceMaxRelations, targetMaxRelations,
+					relPartitions);
 			});
 	}
 
@@ -129,17 +136,22 @@ public class PrepareRelationsJob {
 	 * @param inputRelationsPath source path for the graph relations
 	 * @param outputPath         output path for the processed relations
 	 * @param relationFilter     set of relation filters applied to the `relClass` field
-	 * @param maxRelations       maximum number of allowed outgoing edges
+	 * @param sourceMaxRelations maximum number of allowed outgoing edges grouping by relation.source
+	 * @param targetMaxRelations maximum number of allowed outgoing edges grouping by relation.target
 	 * @param relPartitions      number of partitions for the output RDD
 	 */
 	private static void prepareRelationsRDD(SparkSession spark, String inputRelationsPath, String outputPath,
-		Set<String> relationFilter, int maxRelations, int relPartitions) {
+		Set<String> relationFilter, int sourceMaxRelations, int targetMaxRelations, int relPartitions) {
 
-		JavaRDD<Relation> rels = readPathRelationRDD(spark, inputRelationsPath);
+		JavaRDD<Relation> rels = readPathRelationRDD(spark, inputRelationsPath)
+			.filter(rel -> rel.getDataInfo().getDeletedbyinference() == false)
+			.filter(rel -> relationFilter.contains(rel.getRelClass()) == false);
 
 		JavaRDD<Relation> pruned = pruneRels(
-			pruneRels(rels, relationFilter, maxRelations, relPartitions, (Function<Relation, String>) r -> r.getSource()),
-			relationFilter, maxRelations, relPartitions, (Function<Relation, String>) r -> r.getTarget());
+			pruneRels(
+				rels,
+				sourceMaxRelations, relPartitions, (Function<Relation, String>) r -> r.getSource()),
+			targetMaxRelations, relPartitions, (Function<Relation, String>) r -> r.getTarget());
 		spark
 			.createDataset(pruned.rdd(), Encoders.bean(Relation.class))
 			.repartition(relPartitions)
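Note: the pruning is now applied twice — first capping the relations grouped by source id (sourceMaxRelations), then capping the survivors grouped by target id (targetMaxRelations) — so the order of the two passes matters: the second pass only sees what the first one kept. A small standalone sketch of the same keep-at-most-N-per-key idea using plain Java collections; the types and sample data are illustrative and not part of the patch:

    import java.util.*;
    import java.util.function.Function;
    import java.util.stream.Collectors;

    class PruneSketch {

        // Keep at most `max` items per key, mirroring the per-source / per-target cut above.
        static <T> List<T> pruneByKey(List<T> items, Function<T, String> keyFn, int max) {
            return items.stream()
                .collect(Collectors.groupingBy(keyFn, LinkedHashMap::new, Collectors.toList()))
                .values().stream()
                .flatMap(group -> group.stream().limit(max))
                .collect(Collectors.toList());
        }

        public static void main(String[] args) {
            List<String> rels = Arrays.asList("a->x", "a->y", "a->z", "b->x");
            // Cap at 2 relations per source, then at 1 per target.
            List<String> pruned = pruneByKey(
                pruneByKey(rels, r -> r.split("->")[0], 2),
                r -> r.split("->")[1], 1);
            System.out.println(pruned); // [a->x, a->y]
        }
    }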
@@ -148,16 +160,16 @@ public class PrepareRelationsJob {
 			.parquet(outputPath);
 	}
 
-	private static JavaRDD<Relation> pruneRels(JavaRDD<Relation> rels, Set<String> relationFilter, int maxRelations, int relPartitions, Function<Relation, String> idFn) {
+	private static JavaRDD<Relation> pruneRels(JavaRDD<Relation> rels, int maxRelations,
+		int relPartitions, Function<Relation, String> idFn) {
 		return rels
-			.filter(rel -> rel.getDataInfo().getDeletedbyinference() == false)
-			.filter(rel -> relationFilter.contains(rel.getRelClass()) == false)
 			.mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, idFn.call(r)), r))
 			.repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
 			.groupBy(Tuple2::_1)
 			.map(Tuple2::_2)
 			.map(t -> Iterables.limit(t, maxRelations))
-			.flatMap(Iterable::iterator).map(Tuple2::_2);
+			.flatMap(Iterable::iterator)
+			.map(Tuple2::_2);
 	}
 
 	// experimental
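Note: the deleted-by-inference and relClass filters moved out of pruneRels into prepareRelationsRDD above; pruneRels now only keys each relation by the supplied id function, sorts and groups, and truncates each group. The truncation uses Guava's Iterables.limit, which returns a lazy view of at most maxRelations elements rather than copying the group. A minimal sketch of that call, with illustrative data:

    import com.google.common.collect.Iterables;
    import java.util.Arrays;
    import java.util.List;

    class LimitSketch {
        public static void main(String[] args) {
            List<Integer> group = Arrays.asList(1, 2, 3, 4, 5);
            // Lazy view over at most 3 elements of the group.
            Iterable<Integer> capped = Iterables.limit(group, 3);
            capped.forEach(System.out::println); // prints 1, 2, 3
        }
    }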
@@ -16,18 +16,18 @@ public class SortableRelationKey implements Comparable<SortableRelationKey>, Ser
 	private static final Map<String, Integer> weights = Maps.newHashMap();
 
 	static {
-		weights.put("outcome", 0);
-		weights.put("supplement", 1);
-		weights.put("review", 2);
-		weights.put("citation", 3);
-		weights.put("affiliation", 4);
-		weights.put("relationship", 5);
-		weights.put("publicationDataset", 6);
-		weights.put("similarity", 7);
+		weights.put("participation", 0);
 
-		weights.put("provision", 8);
-		weights.put("participation", 9);
-		weights.put("dedup", 10);
+		weights.put("outcome", 1);
+		weights.put("affiliation", 2);
+		weights.put("dedup", 3);
+		weights.put("publicationDataset", 4);
+		weights.put("citation", 5);
+		weights.put("supplement", 6);
+		weights.put("review", 7);
+		weights.put("relationship", 8);
+		weights.put("provision", 9);
+		weights.put("similarity", 10);
 	}
 
 	private static final long serialVersionUID = 3232323;
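Note: a lower weight means a higher priority, so when a group is cut to the configured maximum, relation types with the smallest weights survive first; the reordering promotes participation, outcome, affiliation and dedup and demotes supplement, review, relationship, provision and similarity. A sketch of how such a weight map drives ordering, assuming SortableRelationKey compares by these weights (its compareTo is not shown in this diff); the subset of entries and the data are illustrative:

    import java.util.*;

    class WeightOrderSketch {
        public static void main(String[] args) {
            // Subset of the weights above; lower value = kept first.
            Map<String, Integer> weights = new HashMap<>();
            weights.put("participation", 0);
            weights.put("outcome", 1);
            weights.put("similarity", 10);

            List<String> relClasses = new ArrayList<>(Arrays.asList("similarity", "outcome", "participation"));
            relClasses.sort(Comparator.comparingInt(rc -> weights.getOrDefault(rc, Integer.MAX_VALUE)));
            System.out.println(relClasses); // [participation, outcome, similarity]
        }
    }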
@@ -30,9 +30,16 @@
 		"paramRequired": false
 	},
 	{
-		"paramName": "mr",
-		"paramLongName": "maxRelations",
-		"paramDescription": "maximum number of relations allowed for a each entity",
+		"paramName": "smr",
+		"paramLongName": "sourceMaxRelations",
+		"paramDescription": "maximum number of relations allowed for a each entity grouping by source",
+		"paramRequired": false
+	},
+	{
+		"paramName": "tmr",
+		"paramLongName": "targetMaxRelations",
+		"paramDescription": "maximum number of relations allowed for a each entity grouping by target",
 		"paramRequired": false
 	}
+
 ]
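Note: the single maxRelations parameter definition is replaced by two optional ones backing the sourceMaxRelations and targetMaxRelations lookups parsed in the first hunk; since "paramRequired" is false for both, a run that omits them falls back to MAX_RELS for each cut point. An illustrative fragment of the corresponding job arguments (the values are made up; the actual wiring lives in the workflow definition, which is not part of this diff):

    --sourceMaxRelations 1000 --targetMaxRelations 1000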