[Affilaition Propagation] moved the selection of graph relation as a preparation step

This commit is contained in:
Miriam Baglioni 2021-11-16 15:24:19 +01:00
parent 7c96e3fd46
commit 28ea532ece
8 changed files with 71 additions and 28 deletions

View File

@ -73,6 +73,9 @@ public class PrepareInfo implements Serializable {
final String resultOrganizationPath = parser.get("resultOrgPath");
log.info("resultOrganizationPath: {}", resultOrganizationPath);
final String relationPath = parser.get("relationPath");
log.info("relationPath: {}", relationPath);
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
@ -84,11 +87,12 @@ public class PrepareInfo implements Serializable {
graphPath,
childParentPath,
leavesPath,
resultOrganizationPath));
resultOrganizationPath,
relationPath));
}
private static void prepareInfo(SparkSession spark, String inputPath, String childParentOrganizationPath,
String currentIterationPath, String resultOrganizationPath) {
String currentIterationPath, String resultOrganizationPath, String relationPath) {
Dataset<Relation> relation = readPath(spark, inputPath + "/relation", Relation.class);
relation.createOrReplaceTempView("relation");
@ -108,6 +112,15 @@ public class PrepareInfo implements Serializable {
.option("compression", "gzip")
.json(resultOrganizationPath);
relation
.filter(
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
r.getRelClass().equals(ModelConstants.HAS_AUTHOR_INSTITUTION))
.write()
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.json(relationPath);
Dataset<String> children = spark
.sql(
"Select distinct target as child from relation where " +

View File

@ -49,8 +49,8 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath);
String relationPath = parser.get("relationPath");
log.info("relationPath: {}", relationPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
@ -78,7 +78,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
leavesPath,
childParentPath,
resultOrganizationPath,
graphPath,
relationPath,
workingPath,
outputPath));
}

View File

@ -26,16 +26,11 @@ import scala.Tuple2;
public class StepActions implements Serializable {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void execStep(SparkSession spark,
String graphPath, String newRelationPath,
String leavesPath, String chldParentOrgPath, String resultOrgPath) {
Dataset<Relation> relationGraph = readPath(spark, graphPath + "/relation", Relation.class)
.filter(
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
r.getRelClass().equals(ModelConstants.HAS_AUTHOR_INSTITUTION));
Dataset<Relation> relationGraph = readPath(spark, graphPath, Relation.class);
// select only the relation source target among those proposed by propagation that are not already existent
getNewRels(
newRelationPath, relationGraph,
@ -80,8 +75,8 @@ public class StepActions implements Serializable {
ret.setValueSet(orgs);
return ret;
}, Encoders.bean(KeyValueSet.class))
.write()
.mode(SaveMode.Overwrite)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
@ -116,7 +111,7 @@ public class StepActions implements Serializable {
// union of new propagation relations to the relation set
// grouping from sourcetarget (we are sure the only relations are those from result to organization by
// construction of the set)
// if at least one relation in the set was harvested no new relation will be returned
// if at least one relation in the set was not produced by propagation no new relation will be returned
relationDataset
.union(newRels)

View File

@ -34,5 +34,11 @@
"paramLongName": "isSparkSessionManaged",
"paramDescription": "the path where prepared info have been stored",
"paramRequired": false
},
{
"paramName": "rep",
"paramLongName": "relationPath",
"paramDescription": "the path where to store the selected subset of relations",
"paramRequired": false
}
]

View File

@ -1,7 +1,7 @@
[
{
"paramName":"gp",
"paramLongName":"graphPath",
"paramName":"rep",
"paramLongName":"relationPath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},

View File

@ -150,6 +150,7 @@
<arg>--leavesPath</arg><arg>${workingDir}/preparedInfo/leavesPath</arg>
<arg>--childParentPath</arg><arg>${workingDir}/preparedInfo/childParentPath</arg>
<arg>--resultOrgPath</arg><arg>${workingDir}/preparedInfo/resultOrgPath</arg>
<arg>--relationPath</arg><arg>${workingDir}/preparedInfo/relation</arg>
</spark>
<ok to="apply_resulttoorganization_propagation"/>
<error to="Kill"/>
@ -173,7 +174,7 @@
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts>
<arg>--graphPath</arg><arg>${sourcePath}</arg>
<arg>--relationPath</arg><arg>${workingDir}/preparedInfo/relation</arg>
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--leavesPath</arg><arg>${workingDir}/preparedInfo/leavesPath</arg>
<arg>--childParentPath</arg><arg>${workingDir}/preparedInfo/childParentPath</arg>

View File

@ -84,6 +84,7 @@ public class PrepareInfoJobTest {
"-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation"
});
@ -228,6 +229,7 @@ public class PrepareInfoJobTest {
"-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
@ -332,6 +334,35 @@ public class PrepareInfoJobTest {
}
@Test
public void relationTest()throws Exception {
PrepareInfo
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-graphPath", getClass()
.getResource(
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/resultorganizationtest")
.getPath(),
"-hive_metastore_uris", "",
"-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.textFile(workingDir.toString() + "/relation")
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
Dataset<Relation> verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
Assertions.assertEquals(7, verificationDs.count());
}
@Test
public void resultOrganizationTest1() throws Exception {
@ -347,6 +378,7 @@ public class PrepareInfoJobTest {
"-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
@ -467,10 +499,6 @@ public class PrepareInfoJobTest {
@Test
public void foundLeavesTest1() throws Exception {
// PrepareInfo.prepareInfo(spark, getClass()
// .getResource(
// "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/resultorganizationtest")
// .getPath(), workingDir.toString() + "/childParentOrg/", workingDir.toString() + "/currentIteration/",workingDir.toString() + "/resultOrganization/");
PrepareInfo
.main(
@ -484,6 +512,7 @@ public class PrepareInfoJobTest {
"-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
@ -510,12 +539,9 @@ public class PrepareInfoJobTest {
"-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation"
});
// PrepareInfo.prepareInfo(spark, getClass()
// .getResource(
// "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/childparenttest1")
// .getPath(), workingDir.toString() + "/childParentOrg/", workingDir.toString() + "/currentIteration/",workingDir.toString() + "/resultOrganization/");
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

View File

@ -101,9 +101,9 @@ public class SparkJobTest {
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-graphPath", graphPath,
"-relationPath", graphPath,
"-hive_metastore_uris", "",
"-outputPath", workingDir.toString() + "/relation",
"-outputPath", workingDir.toString() + "/finalrelation",
"-leavesPath", workingDir.toString() + "/leavesInput",
"-resultOrgPath", workingDir.toString() + "/orgsInput",
"-childParentPath", childParentPath,
@ -113,9 +113,11 @@ public class SparkJobTest {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.textFile(workingDir.toString() + "/relation")
.textFile(workingDir.toString() + "/finalrelation")
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
Assertions.assertEquals(18, tmp.count());
tmp.foreach(r -> Assertions.assertEquals(ModelConstants.AFFILIATION, r.getSubRelType()));
tmp.foreach(r -> Assertions.assertEquals(ModelConstants.RESULT_ORGANIZATION, r.getRelType()));