forked from antonis.lempesis/dnet-hadoop
[Affilaition Propagation] moved the selection of graph relation as a preparation step
This commit is contained in:
parent
7c96e3fd46
commit
28ea532ece
|
@ -73,6 +73,9 @@ public class PrepareInfo implements Serializable {
|
|||
final String resultOrganizationPath = parser.get("resultOrgPath");
|
||||
log.info("resultOrganizationPath: {}", resultOrganizationPath);
|
||||
|
||||
final String relationPath = parser.get("relationPath");
|
||||
log.info("relationPath: {}", relationPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
|
||||
|
||||
|
@ -84,11 +87,12 @@ public class PrepareInfo implements Serializable {
|
|||
graphPath,
|
||||
childParentPath,
|
||||
leavesPath,
|
||||
resultOrganizationPath));
|
||||
resultOrganizationPath,
|
||||
relationPath));
|
||||
}
|
||||
|
||||
private static void prepareInfo(SparkSession spark, String inputPath, String childParentOrganizationPath,
|
||||
String currentIterationPath, String resultOrganizationPath) {
|
||||
String currentIterationPath, String resultOrganizationPath, String relationPath) {
|
||||
Dataset<Relation> relation = readPath(spark, inputPath + "/relation", Relation.class);
|
||||
relation.createOrReplaceTempView("relation");
|
||||
|
||||
|
@ -108,6 +112,15 @@ public class PrepareInfo implements Serializable {
|
|||
.option("compression", "gzip")
|
||||
.json(resultOrganizationPath);
|
||||
|
||||
relation
|
||||
.filter(
|
||||
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
|
||||
r.getRelClass().equals(ModelConstants.HAS_AUTHOR_INSTITUTION))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression","gzip")
|
||||
.json(relationPath);
|
||||
|
||||
Dataset<String> children = spark
|
||||
.sql(
|
||||
"Select distinct target as child from relation where " +
|
||||
|
|
|
@ -49,8 +49,8 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
|
|||
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
String relationPath = parser.get("relationPath");
|
||||
log.info("relationPath: {}", relationPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
@ -78,7 +78,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
|
|||
leavesPath,
|
||||
childParentPath,
|
||||
resultOrganizationPath,
|
||||
graphPath,
|
||||
relationPath,
|
||||
workingPath,
|
||||
outputPath));
|
||||
}
|
||||
|
|
|
@ -26,16 +26,11 @@ import scala.Tuple2;
|
|||
|
||||
public class StepActions implements Serializable {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void execStep(SparkSession spark,
|
||||
String graphPath, String newRelationPath,
|
||||
String leavesPath, String chldParentOrgPath, String resultOrgPath) {
|
||||
|
||||
Dataset<Relation> relationGraph = readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.filter(
|
||||
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
|
||||
r.getRelClass().equals(ModelConstants.HAS_AUTHOR_INSTITUTION));
|
||||
Dataset<Relation> relationGraph = readPath(spark, graphPath, Relation.class);
|
||||
// select only the relation source target among those proposed by propagation that are not already existent
|
||||
getNewRels(
|
||||
newRelationPath, relationGraph,
|
||||
|
@ -80,8 +75,8 @@ public class StepActions implements Serializable {
|
|||
ret.setValueSet(orgs);
|
||||
return ret;
|
||||
}, Encoders.bean(KeyValueSet.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
}
|
||||
|
@ -116,7 +111,7 @@ public class StepActions implements Serializable {
|
|||
// union of new propagation relations to the relation set
|
||||
// grouping from sourcetarget (we are sure the only relations are those from result to organization by
|
||||
// construction of the set)
|
||||
// if at least one relation in the set was harvested no new relation will be returned
|
||||
// if at least one relation in the set was not produced by propagation no new relation will be returned
|
||||
|
||||
relationDataset
|
||||
.union(newRels)
|
||||
|
|
|
@ -34,5 +34,11 @@
|
|||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "the path where prepared info have been stored",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "rep",
|
||||
"paramLongName": "relationPath",
|
||||
"paramDescription": "the path where to store the selected subset of relations",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
|
@ -1,7 +1,7 @@
|
|||
[
|
||||
{
|
||||
"paramName":"gp",
|
||||
"paramLongName":"graphPath",
|
||||
"paramName":"rep",
|
||||
"paramLongName":"relationPath",
|
||||
"paramDescription": "the path of the sequencial file to read",
|
||||
"paramRequired": true
|
||||
},
|
||||
|
|
|
@ -150,6 +150,7 @@
|
|||
<arg>--leavesPath</arg><arg>${workingDir}/preparedInfo/leavesPath</arg>
|
||||
<arg>--childParentPath</arg><arg>${workingDir}/preparedInfo/childParentPath</arg>
|
||||
<arg>--resultOrgPath</arg><arg>${workingDir}/preparedInfo/resultOrgPath</arg>
|
||||
<arg>--relationPath</arg><arg>${workingDir}/preparedInfo/relation</arg>
|
||||
</spark>
|
||||
<ok to="apply_resulttoorganization_propagation"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -173,7 +174,7 @@
|
|||
--conf spark.dynamicAllocation.enabled=true
|
||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--relationPath</arg><arg>${workingDir}/preparedInfo/relation</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
||||
<arg>--leavesPath</arg><arg>${workingDir}/preparedInfo/leavesPath</arg>
|
||||
<arg>--childParentPath</arg><arg>${workingDir}/preparedInfo/childParentPath</arg>
|
||||
|
|
|
@ -84,6 +84,7 @@ public class PrepareInfoJobTest {
|
|||
"-leavesPath", workingDir.toString() + "/currentIteration/",
|
||||
"-resultOrgPath", workingDir.toString() + "/resultOrganization/",
|
||||
"-childParentPath", workingDir.toString() + "/childParentOrg/",
|
||||
"-relationPath", workingDir.toString() + "/relation"
|
||||
|
||||
});
|
||||
|
||||
|
@ -228,6 +229,7 @@ public class PrepareInfoJobTest {
|
|||
"-leavesPath", workingDir.toString() + "/currentIteration/",
|
||||
"-resultOrgPath", workingDir.toString() + "/resultOrganization/",
|
||||
"-childParentPath", workingDir.toString() + "/childParentOrg/",
|
||||
"-relationPath", workingDir.toString() + "/relation"
|
||||
|
||||
});
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
@ -332,6 +334,35 @@ public class PrepareInfoJobTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void relationTest()throws Exception {
|
||||
|
||||
PrepareInfo
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-graphPath", getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/resultorganizationtest")
|
||||
.getPath(),
|
||||
"-hive_metastore_uris", "",
|
||||
"-leavesPath", workingDir.toString() + "/currentIteration/",
|
||||
"-resultOrgPath", workingDir.toString() + "/resultOrganization/",
|
||||
"-childParentPath", workingDir.toString() + "/childParentOrg/",
|
||||
"-relationPath", workingDir.toString() + "/relation"
|
||||
|
||||
});
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Relation> tmp = sc
|
||||
.textFile(workingDir.toString() + "/relation")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
|
||||
|
||||
Dataset<Relation> verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
|
||||
|
||||
Assertions.assertEquals(7, verificationDs.count());
|
||||
|
||||
}
|
||||
@Test
|
||||
public void resultOrganizationTest1() throws Exception {
|
||||
|
||||
|
@ -347,6 +378,7 @@ public class PrepareInfoJobTest {
|
|||
"-leavesPath", workingDir.toString() + "/currentIteration/",
|
||||
"-resultOrgPath", workingDir.toString() + "/resultOrganization/",
|
||||
"-childParentPath", workingDir.toString() + "/childParentOrg/",
|
||||
"-relationPath", workingDir.toString() + "/relation"
|
||||
|
||||
});
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
@ -467,10 +499,6 @@ public class PrepareInfoJobTest {
|
|||
|
||||
@Test
|
||||
public void foundLeavesTest1() throws Exception {
|
||||
// PrepareInfo.prepareInfo(spark, getClass()
|
||||
// .getResource(
|
||||
// "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/resultorganizationtest")
|
||||
// .getPath(), workingDir.toString() + "/childParentOrg/", workingDir.toString() + "/currentIteration/",workingDir.toString() + "/resultOrganization/");
|
||||
|
||||
PrepareInfo
|
||||
.main(
|
||||
|
@ -484,6 +512,7 @@ public class PrepareInfoJobTest {
|
|||
"-leavesPath", workingDir.toString() + "/currentIteration/",
|
||||
"-resultOrgPath", workingDir.toString() + "/resultOrganization/",
|
||||
"-childParentPath", workingDir.toString() + "/childParentOrg/",
|
||||
"-relationPath", workingDir.toString() + "/relation"
|
||||
|
||||
});
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
@ -510,12 +539,9 @@ public class PrepareInfoJobTest {
|
|||
"-leavesPath", workingDir.toString() + "/currentIteration/",
|
||||
"-resultOrgPath", workingDir.toString() + "/resultOrganization/",
|
||||
"-childParentPath", workingDir.toString() + "/childParentOrg/",
|
||||
"-relationPath", workingDir.toString() + "/relation"
|
||||
|
||||
});
|
||||
// PrepareInfo.prepareInfo(spark, getClass()
|
||||
// .getResource(
|
||||
// "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/childparenttest1")
|
||||
// .getPath(), workingDir.toString() + "/childParentOrg/", workingDir.toString() + "/currentIteration/",workingDir.toString() + "/resultOrganization/");
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
|
|
|
@ -101,9 +101,9 @@ public class SparkJobTest {
|
|||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-graphPath", graphPath,
|
||||
"-relationPath", graphPath,
|
||||
"-hive_metastore_uris", "",
|
||||
"-outputPath", workingDir.toString() + "/relation",
|
||||
"-outputPath", workingDir.toString() + "/finalrelation",
|
||||
"-leavesPath", workingDir.toString() + "/leavesInput",
|
||||
"-resultOrgPath", workingDir.toString() + "/orgsInput",
|
||||
"-childParentPath", childParentPath,
|
||||
|
@ -113,9 +113,11 @@ public class SparkJobTest {
|
|||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Relation> tmp = sc
|
||||
.textFile(workingDir.toString() + "/relation")
|
||||
.textFile(workingDir.toString() + "/finalrelation")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
|
||||
|
||||
tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
|
||||
|
||||
Assertions.assertEquals(18, tmp.count());
|
||||
tmp.foreach(r -> Assertions.assertEquals(ModelConstants.AFFILIATION, r.getSubRelType()));
|
||||
tmp.foreach(r -> Assertions.assertEquals(ModelConstants.RESULT_ORGANIZATION, r.getRelType()));
|
||||
|
|
Loading…
Reference in New Issue