forked from D-Net/dnet-hadoop

re-implemented inverting the couple: from (projectId, relatedResultList) to (resultId, relatedProjectList)

parent adcbf0e29a
commit 8802e4126b
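The inversion is easiest to see in the shape of the prepared association records. A sketch of one record before and after the change (field names come from the beans in this commit; the identifier values are illustrative, not taken from the commit):

    before: {"projectId": "40|project_1", "resultSet": ["50|result_1", "50|result_2"]}
    after:  {"resultId": "50|result_1", "projectSet": ["40|project_1"]}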
PrepareProjectResultsAssociation.java
@@ -12,6 +12,7 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
 import java.util.Arrays;
 import java.util.List;
 import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.*;
@@ -95,45 +96,57 @@ public class PrepareProjectResultsAssociation {
         resproj_relation.createOrReplaceTempView("resproj_relation");

         query =
-                "SELECT projectId, collect_set(resId) resultSet "
-                        + "FROM ("
-                        + "   SELECT r1.target resId, r2.target projectId "
+                "SELECT resultId, collect_set(projectId) projectSet "
+                        + "FROM ( "
+                        + "SELECT r1.target resultId, r2.target projectId "
                         + "     FROM (SELECT source, target "
                         + "           FROM relation "
                         + "           WHERE datainfo.deletedbyinference = false "
                         + getConstraintList(" relClass = '", allowedsemrel)
-                        + ") r1"
+                        + "   ) r1"
                         + " JOIN resproj_relation r2 "
                         + " ON r1.source = r2.source "
                         + " ) tmp "
-                        + "GROUP BY projectId ";
+                        + "GROUP BY resultId ";
+        //        query =
+        //                "SELECT projectId, collect_set(resId) resultSet "
+        //                        + "FROM ("
+        //                        + "   SELECT r1.target resId, r2.target projectId "
+        //                        + "     FROM (SELECT source, target "
+        //                        + "           FROM relation "
+        //                        + "           WHERE datainfo.deletedbyinference = false "
+        //                        + getConstraintList(" relClass = '", allowedsemrel)
+        //                        + ") r1"
+        //                        + " JOIN resproj_relation r2 "
+        //                        + " ON r1.source = r2.source "
+        //                        + " ) tmp "
+        //                        + "GROUP BY projectId ";

         spark.sql(query)
-                .as(Encoders.bean(ProjectResultSet.class))
-                .toJSON()
-                .write()
-                .mode(SaveMode.Overwrite)
-                .option("compression", "gzip")
-                .text(potentialUpdatePath);
-        //        .toJavaRDD()
-        //        .map(r -> OBJECT_MAPPER.writeValueAsString(r))
-        //        .saveAsTextFile(potentialUpdatePath, GzipCodec.class);
+                .as(Encoders.bean(ResultProjectSet.class))
+                // .toJSON()
+                // .write()
+                // .mode(SaveMode.Overwrite)
+                // .option("compression", "gzip")
+                // .text(potentialUpdatePath);
+                .toJavaRDD()
+                .map(r -> OBJECT_MAPPER.writeValueAsString(r))
+                .saveAsTextFile(potentialUpdatePath, GzipCodec.class);

         query =
-                "SELECT target projectId, collect_set(source) resultSet "
+                "SELECT source resultId, collect_set(target) projectSet "
                         + "FROM resproj_relation "
-                        + "GROUP BY target";
+                        + "GROUP BY source";

         spark.sql(query)
-                .as(Encoders.bean(ProjectResultSet.class))
-                .toJSON()
-                .write()
-                .mode(SaveMode.Overwrite)
-                .option("compression", "gzip")
-                .text(alreadyLinkedPath);
-        //        .toJavaRDD()
-        //        .map(r -> OBJECT_MAPPER.writeValueAsString(r))
-        //        .saveAsTextFile(alreadyLinkedPath, GzipCodec.class);
+                .as(Encoders.bean(ResultProjectSet.class))
+                // .toJSON()
+                // .write()
+                // .mode(SaveMode.Overwrite)
+                // .option("compression", "gzip")
+                // .text(alreadyLinkedPath);
+                .toJavaRDD()
+                .map(r -> OBJECT_MAPPER.writeValueAsString(r))
+                .saveAsTextFile(alreadyLinkedPath, GzipCodec.class);
     }
 }
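For reference, the rewritten first query assembles into roughly the following SQL (a sketch: the relClass filter shown is a placeholder for whatever getConstraintList(" relClass = '", allowedsemrel) actually emits). The second query simply regroups the existing result-to-project relations by source (the result) instead of target (the project):

    SELECT resultId, collect_set(projectId) projectSet
    FROM ( SELECT r1.target resultId, r2.target projectId
           FROM (SELECT source, target
                 FROM relation
                 WHERE datainfo.deletedbyinference = false
                 AND relClass = '<allowed semantic relation>' ) r1
           JOIN resproj_relation r2
           ON r1.source = r2.source ) tmp
    GROUP BY resultId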
ProjectResultSet.java (deleted)
@@ -1,25 +0,0 @@
-package eu.dnetlib.dhp.projecttoresult;
-
-import java.io.Serializable;
-import java.util.ArrayList;
-
-public class ProjectResultSet implements Serializable {
-    private String projectId;
-    private ArrayList<String> resultSet;
-
-    public String getProjectId() {
-        return projectId;
-    }
-
-    public void setProjectId(String projectId) {
-        this.projectId = projectId;
-    }
-
-    public ArrayList<String> getResultSet() {
-        return resultSet;
-    }
-
-    public void setResultSet(ArrayList<String> resultSet) {
-        this.resultSet = resultSet;
-    }
-}
ResultProjectSet.java (new file)
@@ -0,0 +1,25 @@
+package eu.dnetlib.dhp.projecttoresult;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+
+public class ResultProjectSet implements Serializable {
+    private String resultId;
+    private ArrayList<String> projectSet;
+
+    public String getResultId() {
+        return resultId;
+    }
+
+    public void setResultId(String resultId) {
+        this.resultId = resultId;
+    }
+
+    public ArrayList<String> getProjectSet() {
+        return projectSet;
+    }
+
+    public void setProjectSet(ArrayList<String> project) {
+        this.projectSet = project;
+    }
+}
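A minimal sketch of the JSON round trip this bean has to survive, assuming Jackson on the classpath as the jobs' OBJECT_MAPPER implies (the class and values here are hypothetical, not part of the commit). The implicit no-arg constructor plus getter/setter pairs are what both Jackson and Encoders.bean rely on:

    import com.fasterxml.jackson.databind.ObjectMapper;

    public class ResultProjectSetRoundTrip {
        public static void main(String[] args) throws Exception {
            ObjectMapper mapper = new ObjectMapper(); // plays the role of OBJECT_MAPPER
            String json = "{\"resultId\":\"50|result_1\",\"projectSet\":[\"40|project_1\"]}";
            // read one line of the prepared association file back into the bean
            ResultProjectSet rps = mapper.readValue(json, ResultProjectSet.class);
            // and write it out again, as saveAsTextFile does via writeValueAsString
            System.out.println(mapper.writeValueAsString(rps));
        }
    }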
SparkResultToProjectThroughSemRelJob3.java
@@ -9,6 +9,7 @@ import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.*;
@@ -44,9 +45,6 @@ public class SparkResultToProjectThroughSemRelJob3 {
         final String alreadyLinkedPath = parser.get("alreadyLinkedPath");
         log.info("alreadyLinkedPath {}: ", alreadyLinkedPath);

-        final Boolean writeUpdates = Boolean.valueOf(parser.get("writeUpdate"));
-        log.info("writeUpdate: {}", writeUpdates);
-
         final Boolean saveGraph = Boolean.valueOf(parser.get("saveGraph"));
         log.info("saveGraph: {}", saveGraph);
@@ -60,12 +58,7 @@ public class SparkResultToProjectThroughSemRelJob3 {
                         removeOutputDir(spark, outputPath);
                     }
                     execPropagation(
-                            spark,
-                            outputPath,
-                            alreadyLinkedPath,
-                            potentialUpdatePath,
-                            writeUpdates,
-                            saveGraph);
+                            spark, outputPath, alreadyLinkedPath, potentialUpdatePath, saveGraph);
                 });
     }
@@ -74,21 +67,12 @@ public class SparkResultToProjectThroughSemRelJob3 {
             String outputPath,
             String alreadyLinkedPath,
             String potentialUpdatePath,
-            Boolean writeUpdate,
             Boolean saveGraph) {

-        Dataset<ProjectResultSet> toaddrelations =
-                readAssocProjectResults(spark, potentialUpdatePath);
-        Dataset<ProjectResultSet> alreadyLinked = readAssocProjectResults(spark, alreadyLinkedPath);
+        Dataset<ResultProjectSet> toaddrelations =
+                readAssocResultProjects(spark, potentialUpdatePath);
+        Dataset<ResultProjectSet> alreadyLinked = readAssocResultProjects(spark, alreadyLinkedPath);

-        if (writeUpdate) {
-            toaddrelations
-                    .toJSON()
-                    .write()
-                    .mode(SaveMode.Overwrite)
-                    .option("compression", "gzip")
-                    .text(outputPath + "/potential_updates");
-        }
         if (saveGraph) {
             getNewRelations(alreadyLinked, toaddrelations)
                     .toJSON()
@@ -100,56 +84,66 @@
     }

     private static Dataset<Relation> getNewRelations(
-            Dataset<ProjectResultSet> alreadyLinked, Dataset<ProjectResultSet> toaddrelations) {
+            Dataset<ResultProjectSet> alreadyLinked, Dataset<ResultProjectSet> toaddrelations) {

         return toaddrelations
                 .joinWith(
                         alreadyLinked,
-                        toaddrelations.col("projectId").equalTo(alreadyLinked.col("projectId")),
-                        "left")
+                        toaddrelations.col("resultId").equalTo(alreadyLinked.col("resultId")),
+                        "left_outer")
                 .flatMap(
                         value -> {
                             List<Relation> new_relations = new ArrayList<>();
-                            ProjectResultSet potential_update = value._1();
-                            ProjectResultSet already_linked = value._2();
-                            String projId = already_linked.getProjectId();
-                            potential_update.getResultSet().stream()
+                            ResultProjectSet potential_update = value._1();
+                            Optional<ResultProjectSet> already_linked =
+                                    Optional.ofNullable(value._2());
+                            if (already_linked.isPresent()) {
+                                already_linked.get().getProjectSet().stream()
+                                        .forEach(
+                                                (p -> {
+                                                    if (potential_update
+                                                            .getProjectSet()
+                                                            .contains(p)) {
+                                                        potential_update.getProjectSet().remove(p);
+                                                    }
+                                                }));
+                            }
+                            String resId = potential_update.getResultId();
+                            potential_update.getProjectSet().stream()
                                     .forEach(
-                                            rId -> {
-                                                if (!already_linked.getResultSet().contains(rId)) {
-                                                    new_relations.add(
-                                                            getRelation(
-                                                                    rId,
-                                                                    projId,
-                                                                    RELATION_RESULT_PROJECT_REL_CLASS,
-                                                                    RELATION_RESULTPROJECT_REL_TYPE,
-                                                                    RELATION_RESULTPROJECT_SUBREL_TYPE,
-                                                                    PROPAGATION_DATA_INFO_TYPE,
-                                                                    PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID,
-                                                                    PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME));
-                                                    new_relations.add(
-                                                            getRelation(
-                                                                    projId,
-                                                                    rId,
-                                                                    RELATION_PROJECT_RESULT_REL_CLASS,
-                                                                    RELATION_RESULTPROJECT_REL_TYPE,
-                                                                    RELATION_RESULTPROJECT_SUBREL_TYPE,
-                                                                    PROPAGATION_DATA_INFO_TYPE,
-                                                                    PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID,
-                                                                    PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME));
-                                                }
-                                            });
+                                            pId -> {
+                                                new_relations.add(
+                                                        getRelation(
+                                                                resId,
+                                                                pId,
+                                                                RELATION_RESULT_PROJECT_REL_CLASS,
+                                                                RELATION_RESULTPROJECT_REL_TYPE,
+                                                                RELATION_RESULTPROJECT_SUBREL_TYPE,
+                                                                PROPAGATION_DATA_INFO_TYPE,
+                                                                PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID,
+                                                                PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME));
+                                                new_relations.add(
+                                                        getRelation(
+                                                                pId,
+                                                                resId,
+                                                                RELATION_PROJECT_RESULT_REL_CLASS,
+                                                                RELATION_RESULTPROJECT_REL_TYPE,
+                                                                RELATION_RESULTPROJECT_SUBREL_TYPE,
+                                                                PROPAGATION_DATA_INFO_TYPE,
+                                                                PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID,
+                                                                PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME));
+                                            });
                             return new_relations.iterator();
                         },
                         Encoders.bean(Relation.class));
     }

-    private static Dataset<ProjectResultSet> readAssocProjectResults(
+    private static Dataset<ResultProjectSet> readAssocResultProjects(
             SparkSession spark, String potentialUpdatePath) {
         return spark.read()
                 .textFile(potentialUpdatePath)
                 .map(
-                        value -> OBJECT_MAPPER.readValue(value, ProjectResultSet.class),
-                        Encoders.bean(ProjectResultSet.class));
+                        value -> OBJECT_MAPPER.readValue(value, ResultProjectSet.class),
+                        Encoders.bean(ResultProjectSet.class));
     }
 }
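At heart, the rewritten flatMap performs a per-result set difference and then emits a Relation in each direction for every surviving project; the Optional.ofNullable guard covers rows where the left outer join found no already-linked entry, which the old code would have hit as a NullPointerException on unmatched rows. A standalone sketch of the core step under that reading (hypothetical helper, not part of the commit):

    import java.util.ArrayList;
    import java.util.List;
    import java.util.Optional;

    class NewProjectLinks {
        // For one resultId: keep only the projects that are not already linked.
        static List<String> newProjects(
                List<String> potentialProjects, Optional<List<String>> alreadyLinked) {
            List<String> fresh = new ArrayList<>(potentialProjects);
            alreadyLinked.ifPresent(fresh::removeAll); // no-op when the join had no match
            return fresh; // the job then adds result->project and project->result relations
        }
    }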