forked from antonis.lempesis/dnet-hadoop
changed the direction of the relation between projects and result considered to select the results linked to projects
This commit is contained in:
parent
00c377dac2
commit
3319440c53
|
@ -76,16 +76,16 @@ public class SparkPrepareResultProject implements Serializable {
|
||||||
private static void prepareResultProjectList(SparkSession spark, String inputPath, String outputPath) {
|
private static void prepareResultProjectList(SparkSession spark, String inputPath, String outputPath) {
|
||||||
Dataset<Relation> relation = Utils
|
Dataset<Relation> relation = Utils
|
||||||
.readPath(spark, inputPath + "/relation", Relation.class)
|
.readPath(spark, inputPath + "/relation", Relation.class)
|
||||||
.filter("dataInfo.deletedbyinference = false and relClass = 'produces'");
|
.filter("dataInfo.deletedbyinference = false and lower(relClass) = 'isproducedby'");
|
||||||
Dataset<eu.dnetlib.dhp.schema.oaf.Project> projects = Utils
|
Dataset<eu.dnetlib.dhp.schema.oaf.Project> projects = Utils
|
||||||
.readPath(spark, inputPath + "/project", eu.dnetlib.dhp.schema.oaf.Project.class);
|
.readPath(spark, inputPath + "/project", eu.dnetlib.dhp.schema.oaf.Project.class);
|
||||||
|
|
||||||
projects
|
projects
|
||||||
.joinWith(relation, projects.col("id").equalTo(relation.col("source")))
|
.joinWith(relation, projects.col("id").equalTo(relation.col("target")), "inner")
|
||||||
.groupByKey(
|
.groupByKey(
|
||||||
(MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation>, String>) value -> value
|
(MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation>, String>) value -> value
|
||||||
._2()
|
._2()
|
||||||
.getTarget(),
|
.getSource(),
|
||||||
Encoders.STRING())
|
Encoders.STRING())
|
||||||
.mapGroups(
|
.mapGroups(
|
||||||
(MapGroupsFunction<String, Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation>, ResultProject>) (s,
|
(MapGroupsFunction<String, Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation>, ResultProject>) (s,
|
||||||
|
@ -93,7 +93,7 @@ public class SparkPrepareResultProject implements Serializable {
|
||||||
Set<String> projectSet = new HashSet<>();
|
Set<String> projectSet = new HashSet<>();
|
||||||
Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation> first = it.next();
|
Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation> first = it.next();
|
||||||
ResultProject rp = new ResultProject();
|
ResultProject rp = new ResultProject();
|
||||||
rp.setResultId(first._2().getTarget());
|
rp.setResultId(s);
|
||||||
eu.dnetlib.dhp.schema.oaf.Project p = first._1();
|
eu.dnetlib.dhp.schema.oaf.Project p = first._1();
|
||||||
projectSet.add(p.getId());
|
projectSet.add(p.getId());
|
||||||
Project ps = getProject(p);
|
Project ps = getProject(p);
|
||||||
|
|
Loading…
Reference in New Issue