Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
7 changed files with 518 additions and 496 deletions
Showing only changes of commit daf4d7971b - Show all commits

View File

@ -174,37 +174,39 @@ public class PropagationConstant {
return newRelations; return newRelations;
} }
public static Relation getRelation(String source, String target, String rel_class){ public static Relation getRelation(String source, String target, String rel_class) {
if (ModelConstants.HAS_PARTICIPANT.equals(rel_class)){ if (ModelConstants.HAS_PARTICIPANT.equals(rel_class)) {
return getParticipantRelation(source, target, rel_class); return getParticipantRelation(source, target, rel_class);
}else } else
return getAffiliationRelation(source, target, rel_class); return getAffiliationRelation(source, target, rel_class);
} }
public static Relation getParticipantRelation( public static Relation getParticipantRelation(
String source, String source,
String target, String target,
String rel_class) { String rel_class) {
return getRelation(source, target , return getRelation(
rel_class, source, target,
ModelConstants.PROJECT_ORGANIZATION, rel_class,
ModelConstants.PARTICIPATION, ModelConstants.PROJECT_ORGANIZATION,
PROPAGATION_DATA_INFO_TYPE, ModelConstants.PARTICIPATION,
PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID, PROPAGATION_DATA_INFO_TYPE,
PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_NAME); PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID,
PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_NAME);
} }
public static Relation getAffiliationRelation( public static Relation getAffiliationRelation(
String source, String source,
String target, String target,
String rel_class) { String rel_class) {
return getRelation(source, target , return getRelation(
rel_class, source, target,
ModelConstants.RESULT_ORGANIZATION, rel_class,
ModelConstants.AFFILIATION, ModelConstants.RESULT_ORGANIZATION,
PROPAGATION_DATA_INFO_TYPE, ModelConstants.AFFILIATION,
PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID, PROPAGATION_DATA_INFO_TYPE,
PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME); PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID,
PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME);
} }
public static Relation getRelation( public static Relation getRelation(

View File

@ -7,7 +7,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.io.Serializable; import java.io.Serializable;
import java.util.*; import java.util.*;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.*; import org.apache.spark.api.java.function.*;
@ -15,6 +14,8 @@ import org.apache.spark.sql.*;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.KeyValueSet; import eu.dnetlib.dhp.KeyValueSet;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob; import eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob;
@ -48,10 +49,10 @@ public class PrepareInfo implements Serializable {
// associate projects to all the participant orgs // associate projects to all the participant orgs
private static final String PROJECT_ORGANIZATION_QUERY = "SELECT source key, collect_set(target) as valueSet " + private static final String PROJECT_ORGANIZATION_QUERY = "SELECT source key, collect_set(target) as valueSet " +
"FROM relation " + "FROM relation " +
"WHERE lower(relclass) = '" + ModelConstants.HAS_PARTICIPANT.toLowerCase() + "WHERE lower(relclass) = '" + ModelConstants.HAS_PARTICIPANT.toLowerCase() +
"' and datainfo.deletedbyinference = false " + "' and datainfo.deletedbyinference = false " +
"GROUP BY source"; "GROUP BY source";
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
@ -98,12 +99,13 @@ public class PrepareInfo implements Serializable {
childParentPath, childParentPath,
leavesPath, leavesPath,
resultOrganizationPath, resultOrganizationPath,
projectOrgPath, projectOrgPath,
relationPath)); relationPath));
} }
private static void prepareInfo(SparkSession spark, String inputPath, String childParentOrganizationPath, private static void prepareInfo(SparkSession spark, String inputPath, String childParentOrganizationPath,
String currentIterationPath, String resultOrganizationPath, String projectOrganizationPath, String relationPath) { String currentIterationPath, String resultOrganizationPath, String projectOrganizationPath,
String relationPath) {
Dataset<Relation> relation = readPath(spark, inputPath + "/relation", Relation.class); Dataset<Relation> relation = readPath(spark, inputPath + "/relation", Relation.class);
relation.createOrReplaceTempView("relation"); relation.createOrReplaceTempView("relation");
@ -124,30 +126,30 @@ public class PrepareInfo implements Serializable {
.json(resultOrganizationPath); .json(resultOrganizationPath);
spark spark
.sql(PROJECT_ORGANIZATION_QUERY) .sql(PROJECT_ORGANIZATION_QUERY)
.as(Encoders.bean(KeyValueSet.class)) .as(Encoders.bean(KeyValueSet.class))
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(projectOrganizationPath); .json(projectOrganizationPath);
relation relation
.filter( .filter(
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() && (FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
r.getRelClass().equals(ModelConstants.HAS_AUTHOR_INSTITUTION)) r.getRelClass().equals(ModelConstants.HAS_AUTHOR_INSTITUTION))
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(relationPath + "/result"); .json(relationPath + "/result");
relation relation
.filter( .filter(
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() && (FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
r.getRelClass().equals(ModelConstants.HAS_PARTICIPANT)) r.getRelClass().equals(ModelConstants.HAS_PARTICIPANT))
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(relationPath + "/project"); .json(relationPath + "/project");
Dataset<String> children = spark Dataset<String> children = spark
.sql( .sql(

View File

@ -92,7 +92,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
leavesPath, leavesPath,
childParentPath, childParentPath,
resultOrganizationPath, resultOrganizationPath,
projectOrganizationPath, projectOrganizationPath,
relationPath, relationPath,
workingPath, workingPath,
outputPath, outputPath,
@ -147,15 +147,16 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
addNewRelations(spark, workingPath + NEW_RESULT_RELATION_PATH, outputPath); addNewRelations(spark, workingPath + NEW_RESULT_RELATION_PATH, outputPath);
StepActions StepActions
.execStep( .execStep(
spark, graphPath + "/project", workingPath + NEW_PROJECT_RELATION_PATH, spark, graphPath + "/project", workingPath + NEW_PROJECT_RELATION_PATH,
leavesPath, childParentPath, projectOrganizationPath, ModelConstants.HAS_PARTICIPANT); leavesPath, childParentPath, projectOrganizationPath, ModelConstants.HAS_PARTICIPANT);
addNewRelations(spark, workingPath + NEW_PROJECT_RELATION_PATH, outputPath); addNewRelations(spark, workingPath + NEW_PROJECT_RELATION_PATH, outputPath);
} }
private static void doPropagate(SparkSession spark, String leavesPath, String childParentPath, private static void doPropagate(SparkSession spark, String leavesPath, String childParentPath,
String resultOrganizationPath, String projectOrganizationPath, String graphPath, String workingPath, String outputPath, String resultOrganizationPath, String projectOrganizationPath, String graphPath, String workingPath,
String outputPath,
PropagationCounter propagationCounter) { PropagationCounter propagationCounter) {
int iteration = 0; int iteration = 0;
long leavesCount; long leavesCount;
@ -167,13 +168,13 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
spark, graphPath + "/result", workingPath + NEW_RESULT_RELATION_PATH, spark, graphPath + "/result", workingPath + NEW_RESULT_RELATION_PATH,
leavesPath, childParentPath, resultOrganizationPath, ModelConstants.HAS_AUTHOR_INSTITUTION); leavesPath, childParentPath, resultOrganizationPath, ModelConstants.HAS_AUTHOR_INSTITUTION);
StepActions StepActions
.execStep( .execStep(
spark, graphPath + "/project", workingPath + NEW_PROJECT_RELATION_PATH, spark, graphPath + "/project", workingPath + NEW_PROJECT_RELATION_PATH,
leavesPath, childParentPath, projectOrganizationPath, ModelConstants.HAS_PARTICIPANT); leavesPath, childParentPath, projectOrganizationPath, ModelConstants.HAS_PARTICIPANT);
StepActions StepActions
.prepareForNextStep( .prepareForNextStep(
spark, workingPath , resultOrganizationPath, projectOrganizationPath, leavesPath, spark, workingPath, resultOrganizationPath, projectOrganizationPath, leavesPath,
childParentPath, workingPath + "/leaves", workingPath + "/resOrg", workingPath + "/projOrg"); childParentPath, workingPath + "/leaves", workingPath + "/resOrg", workingPath + "/projOrg");
moveOutput(spark, workingPath, leavesPath, resultOrganizationPath, projectOrganizationPath); moveOutput(spark, workingPath, leavesPath, resultOrganizationPath, projectOrganizationPath);
leavesCount = readPath(spark, leavesPath, Leaves.class).count(); leavesCount = readPath(spark, leavesPath, Leaves.class).count();
@ -224,24 +225,24 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
} }
private static void moveOutput(SparkSession spark, String workingPath, String leavesPath, private static void moveOutput(SparkSession spark, String workingPath, String leavesPath,
String resultOrganizationPath, String projectOrganizationPath) { String resultOrganizationPath, String projectOrganizationPath) {
readPath(spark, workingPath + "/leaves", Leaves.class) readPath(spark, workingPath + "/leaves", Leaves.class)
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(leavesPath); .json(leavesPath);
readPath(spark, workingPath + "/resOrg", KeyValueSet.class) readPath(spark, workingPath + "/resOrg", KeyValueSet.class)
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(resultOrganizationPath); .json(resultOrganizationPath);
readPath(spark, workingPath + "/projOrg", KeyValueSet.class) readPath(spark, workingPath + "/projOrg", KeyValueSet.class)
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(projectOrganizationPath); .json(projectOrganizationPath);
} }
@ -253,25 +254,22 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
.mapGroups( .mapGroups(
(MapGroupsFunction<String, Relation, Relation>) (k, it) -> it.next(), Encoders.bean(Relation.class)) (MapGroupsFunction<String, Relation, Relation>) (k, it) -> it.next(), Encoders.bean(Relation.class))
.flatMap( .flatMap(
(FlatMapFunction<Relation, Relation>) r -> (FlatMapFunction<Relation, Relation>) r -> {
{ if (r.getSource().startsWith("50|")) {
if(r.getSource().startsWith("50|")){
return Arrays return Arrays
.asList( .asList(
r, getAffiliationRelation( r, getAffiliationRelation(
r.getTarget(), r.getSource(), ModelConstants.IS_AUTHOR_INSTITUTION_OF)) r.getTarget(), r.getSource(), ModelConstants.IS_AUTHOR_INSTITUTION_OF))
.iterator(); .iterator();
}else{ } else {
return Arrays return Arrays
.asList( .asList(
r, getParticipantRelation( r, getParticipantRelation(
r.getTarget(), r.getSource(), ModelConstants.IS_PARTICIPANT)) r.getTarget(), r.getSource(), ModelConstants.IS_PARTICIPANT))
.iterator(); .iterator();
} }
} }
, Encoders.bean(Relation.class)) , Encoders.bean(Relation.class))
.write() .write()

View File

@ -50,22 +50,25 @@ public class StepActions implements Serializable {
spark, resultOrgPath, readPath(spark, selectedRelsPath, Relation.class), orgOutputPath); spark, resultOrgPath, readPath(spark, selectedRelsPath, Relation.class), orgOutputPath);
} }
public static void prepareForNextStep(SparkSession spark, String selectedRelsPath, String resultOrgPath, String projectOrgPath, public static void prepareForNextStep(SparkSession spark, String selectedRelsPath, String resultOrgPath,
String leavesPath, String chldParentOrgPath, String leavesOutputPath, String projectOrgPath,
String orgOutputPath, String outputProjectPath) { String leavesPath, String chldParentOrgPath, String leavesOutputPath,
String orgOutputPath, String outputProjectPath) {
// use of the parents as new leaves set // use of the parents as new leaves set
changeLeavesSet(spark, leavesPath, chldParentOrgPath, leavesOutputPath); changeLeavesSet(spark, leavesPath, chldParentOrgPath, leavesOutputPath);
// add the new relations obtained from propagation to the keyvalueset result organization // add the new relations obtained from propagation to the keyvalueset result organization
updateEntityOrganization( updateEntityOrganization(
spark, resultOrgPath, readPath(spark, selectedRelsPath + NEW_RESULT_RELATION_PATH, Relation.class), orgOutputPath); spark, resultOrgPath, readPath(spark, selectedRelsPath + NEW_RESULT_RELATION_PATH, Relation.class),
orgOutputPath);
updateEntityOrganization( updateEntityOrganization(
spark, projectOrgPath, readPath(spark, selectedRelsPath + NEW_PROJECT_RELATION_PATH, Relation.class), outputProjectPath); spark, projectOrgPath, readPath(spark, selectedRelsPath + NEW_PROJECT_RELATION_PATH, Relation.class),
outputProjectPath);
} }
private static void updateEntityOrganization(SparkSession spark, String entityOrgPath, private static void updateEntityOrganization(SparkSession spark, String entityOrgPath,
Dataset<Relation> selectedRels, String outputPath) { Dataset<Relation> selectedRels, String outputPath) {
Dataset<KeyValueSet> entityOrg = readPath(spark, entityOrgPath, KeyValueSet.class); Dataset<KeyValueSet> entityOrg = readPath(spark, entityOrgPath, KeyValueSet.class);
entityOrg entityOrg
.joinWith( .joinWith(
@ -128,45 +131,43 @@ public class StepActions implements Serializable {
// construction of the set) // construction of the set)
// if at least one relation in the set was not produced by propagation no new relation will be returned // if at least one relation in the set was not produced by propagation no new relation will be returned
relationDataset relationDataset
.union(newRels) .union(newRels)
.groupByKey((MapFunction<Relation, String>) r -> r.getSource() + r.getTarget(), Encoders.STRING()) .groupByKey((MapFunction<Relation, String>) r -> r.getSource() + r.getTarget(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Relation, String>) (k, it) -> { .mapGroups((MapGroupsFunction<String, Relation, String>) (k, it) -> {
ArrayList<Relation> relationList = new ArrayList<>(); ArrayList<Relation> relationList = new ArrayList<>();
relationList.add(it.next()); relationList.add(it.next());
it.forEachRemaining(rel -> relationList.add(rel)); it.forEachRemaining(rel -> relationList.add(rel));
if (relationList if (relationList
.stream() .stream()
.filter( .filter(
rel -> !rel rel -> !rel
.getDataInfo() .getDataInfo()
.getProvenanceaction() .getProvenanceaction()
.getClassid() .getClassid()
.equals(PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID) && !rel .equals(PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID)
.getDataInfo() && !rel
.getProvenanceaction() .getDataInfo()
.getClassid() .getProvenanceaction()
.equals(PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID)) .getClassid()
.count() > 0) { .equals(PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID))
return null; .count() > 0) {
} return null;
}
return new ObjectMapper().writeValueAsString(relationList.get(0));
}, Encoders.STRING())
.filter(Objects::nonNull)
.map(
(MapFunction<String, Relation>) r -> new ObjectMapper().readValue(r, Relation.class),
Encoders.bean(Relation.class))
.write()
.mode(SaveMode.Append)
.option("compression", "gzip")
.json(newRelationPath);
return new ObjectMapper().writeValueAsString(relationList.get(0));
}, Encoders.STRING())
.filter(Objects::nonNull)
.map(
(MapFunction<String, Relation>) r -> new ObjectMapper().readValue(r, Relation.class),
Encoders.bean(Relation.class))
.write()
.mode(SaveMode.Append)
.option("compression", "gzip")
.json(newRelationPath);
} }
@ -175,7 +176,7 @@ public class StepActions implements Serializable {
String leavesPath, String leavesPath,
String chldParentOrgPath, String chldParentOrgPath,
String entityOrgPath, String entityOrgPath,
String semantics) { String semantics) {
Dataset<KeyValueSet> childParent = readPath(spark, chldParentOrgPath, KeyValueSet.class); Dataset<KeyValueSet> childParent = readPath(spark, chldParentOrgPath, KeyValueSet.class);
Dataset<KeyValueSet> entityOrg = readPath(spark, entityOrgPath, KeyValueSet.class); Dataset<KeyValueSet> entityOrg = readPath(spark, entityOrgPath, KeyValueSet.class);
@ -202,7 +203,6 @@ public class StepActions implements Serializable {
"GROUP BY entityId") "GROUP BY entityId")
.as(Encoders.bean(KeyValueSet.class)); .as(Encoders.bean(KeyValueSet.class));
// create new relations from entity to organization for each entity linked to a leaf // create new relations from entity to organization for each entity linked to a leaf
return resultParent return resultParent
.flatMap( .flatMap(
@ -213,13 +213,11 @@ public class StepActions implements Serializable {
orgId -> getRelation( orgId -> getRelation(
v.getKey(), v.getKey(),
orgId, orgId,
semantics)) semantics))
.collect(Collectors.toList()) .collect(Collectors.toList())
.iterator(), .iterator(),
Encoders.bean(Relation.class)); Encoders.bean(Relation.class));
} }
} }

View File

@ -77,7 +77,7 @@ public class PrepareInfoJobTest {
"-hive_metastore_uris", "", "-hive_metastore_uris", "",
"-leavesPath", workingDir.toString() + "/currentIteration/", "-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/", "-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation" "-relationPath", workingDir.toString() + "/relation"
@ -223,7 +223,7 @@ public class PrepareInfoJobTest {
"-hive_metastore_uris", "", "-hive_metastore_uris", "",
"-leavesPath", workingDir.toString() + "/currentIteration/", "-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/", "-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation" "-relationPath", workingDir.toString() + "/relation"
@ -344,7 +344,7 @@ public class PrepareInfoJobTest {
"-hive_metastore_uris", "", "-hive_metastore_uris", "",
"-leavesPath", workingDir.toString() + "/currentIteration/", "-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/", "-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation" "-relationPath", workingDir.toString() + "/relation"
@ -365,26 +365,26 @@ public class PrepareInfoJobTest {
public void relationProjectTest() throws Exception { public void relationProjectTest() throws Exception {
PrepareInfo PrepareInfo
.main( .main(
new String[] { new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-graphPath", getClass() "-graphPath", getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/projectorganizationtest") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/projectorganizationtest")
.getPath(), .getPath(),
"-hive_metastore_uris", "", "-hive_metastore_uris", "",
"-leavesPath", workingDir.toString() + "/currentIteration/", "-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/", "-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation" "-relationPath", workingDir.toString() + "/relation"
}); });
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc JavaRDD<Relation> tmp = sc
.textFile(workingDir.toString() + "/relation/project") .textFile(workingDir.toString() + "/relation/project")
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); .map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
Dataset<Relation> verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); Dataset<Relation> verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
@ -406,7 +406,7 @@ public class PrepareInfoJobTest {
"-hive_metastore_uris", "", "-hive_metastore_uris", "",
"-leavesPath", workingDir.toString() + "/currentIteration/", "-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/", "-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation" "-relationPath", workingDir.toString() + "/relation"
@ -531,134 +531,134 @@ public class PrepareInfoJobTest {
public void projectOrganizationTest1() throws Exception { public void projectOrganizationTest1() throws Exception {
PrepareInfo PrepareInfo
.main( .main(
new String[] { new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-graphPath", getClass() "-graphPath", getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/projectorganizationtest") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/projectorganizationtest")
.getPath(), .getPath(),
"-hive_metastore_uris", "", "-hive_metastore_uris", "",
"-leavesPath", workingDir.toString() + "/currentIteration/", "-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/", "-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation" "-relationPath", workingDir.toString() + "/relation"
}); });
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<KeyValueSet> tmp = sc JavaRDD<KeyValueSet> tmp = sc
.textFile(workingDir.toString() + "/projectOrganization/") .textFile(workingDir.toString() + "/projectOrganization/")
.map(item -> OBJECT_MAPPER.readValue(item, KeyValueSet.class)); .map(item -> OBJECT_MAPPER.readValue(item, KeyValueSet.class));
Dataset<KeyValueSet> verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(KeyValueSet.class)); Dataset<KeyValueSet> verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(KeyValueSet.class));
Assertions.assertEquals(5, verificationDs.count()); Assertions.assertEquals(5, verificationDs.count());
Assertions Assertions
.assertEquals( .assertEquals(
2, verificationDs 2, verificationDs
.filter("key = '40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f'") .filter("key = '40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f'")
.collectAsList() .collectAsList()
.get(0) .get(0)
.getValueSet() .getValueSet()
.size()); .size());
Assertions Assertions
.assertTrue( .assertTrue(
verificationDs verificationDs
.filter("key = '40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f'") .filter("key = '40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f'")
.collectAsList() .collectAsList()
.get(0) .get(0)
.getValueSet() .getValueSet()
.contains("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")); .contains("20|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions Assertions
.assertTrue( .assertTrue(
verificationDs verificationDs
.filter("key = '40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f'") .filter("key = '40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f'")
.collectAsList() .collectAsList()
.get(0) .get(0)
.getValueSet() .getValueSet()
.contains("20|pippo_wf_001::2899e571609779168222fdeb59cb916d")); .contains("20|pippo_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions Assertions
.assertEquals( .assertEquals(
2, verificationDs 2, verificationDs
.filter("key = '40|dedup_wf_001::2899e571609779168222fdeb59cb916d'") .filter("key = '40|dedup_wf_001::2899e571609779168222fdeb59cb916d'")
.collectAsList() .collectAsList()
.get(0) .get(0)
.getValueSet() .getValueSet()
.size()); .size());
Assertions Assertions
.assertTrue( .assertTrue(
verificationDs verificationDs
.filter("key = '40|dedup_wf_001::2899e571609779168222fdeb59cb916d'") .filter("key = '40|dedup_wf_001::2899e571609779168222fdeb59cb916d'")
.collectAsList() .collectAsList()
.get(0) .get(0)
.getValueSet() .getValueSet()
.contains("20|doajarticles::396262ee936f3d3e26ff0e60bea6cae0")); .contains("20|doajarticles::396262ee936f3d3e26ff0e60bea6cae0"));
Assertions Assertions
.assertTrue( .assertTrue(
verificationDs verificationDs
.filter("key = '40|dedup_wf_001::2899e571609779168222fdeb59cb916d'") .filter("key = '40|dedup_wf_001::2899e571609779168222fdeb59cb916d'")
.collectAsList() .collectAsList()
.get(0) .get(0)
.getValueSet() .getValueSet()
.contains("20|pippo_wf_001::2899e571609779168222fdeb59cb916d")); .contains("20|pippo_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions Assertions
.assertEquals( .assertEquals(
1, verificationDs 1, verificationDs
.filter("key = '40|doajarticles::03748bcb5d754c951efec9700e18a56d'") .filter("key = '40|doajarticles::03748bcb5d754c951efec9700e18a56d'")
.collectAsList() .collectAsList()
.get(0) .get(0)
.getValueSet() .getValueSet()
.size()); .size());
Assertions Assertions
.assertTrue( .assertTrue(
verificationDs verificationDs
.filter("key = '40|doajarticles::03748bcb5d754c951efec9700e18a56d'") .filter("key = '40|doajarticles::03748bcb5d754c951efec9700e18a56d'")
.collectAsList() .collectAsList()
.get(0) .get(0)
.getValueSet() .getValueSet()
.contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); .contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
Assertions Assertions
.assertEquals( .assertEquals(
1, verificationDs 1, verificationDs
.filter("key = '40|openaire____::ec653e804967133b9436fdd30d3ff51d'") .filter("key = '40|openaire____::ec653e804967133b9436fdd30d3ff51d'")
.collectAsList() .collectAsList()
.get(0) .get(0)
.getValueSet() .getValueSet()
.size()); .size());
Assertions Assertions
.assertTrue( .assertTrue(
verificationDs verificationDs
.filter("key = '40|openaire____::ec653e804967133b9436fdd30d3ff51d'") .filter("key = '40|openaire____::ec653e804967133b9436fdd30d3ff51d'")
.collectAsList() .collectAsList()
.get(0) .get(0)
.getValueSet() .getValueSet()
.contains("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); .contains("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"));
Assertions Assertions
.assertEquals( .assertEquals(
1, verificationDs 1, verificationDs
.filter("key = '40|doajarticles::1cae0b82b56ccd97c2db1f698def7074'") .filter("key = '40|doajarticles::1cae0b82b56ccd97c2db1f698def7074'")
.collectAsList() .collectAsList()
.get(0) .get(0)
.getValueSet() .getValueSet()
.size()); .size());
Assertions Assertions
.assertTrue( .assertTrue(
verificationDs verificationDs
.filter("key = '40|doajarticles::1cae0b82b56ccd97c2db1f698def7074'") .filter("key = '40|doajarticles::1cae0b82b56ccd97c2db1f698def7074'")
.collectAsList() .collectAsList()
.get(0) .get(0)
.getValueSet() .getValueSet()
.contains("20|opendoar____::a5fcb8eb25ebd6f7cd219e0fa1e6ddc1")); .contains("20|opendoar____::a5fcb8eb25ebd6f7cd219e0fa1e6ddc1"));
verificationDs verificationDs
.foreach((ForeachFunction<KeyValueSet>) v -> System.out.println(OBJECT_MAPPER.writeValueAsString(v))); .foreach((ForeachFunction<KeyValueSet>) v -> System.out.println(OBJECT_MAPPER.writeValueAsString(v)));
} }
@ -676,7 +676,7 @@ public class PrepareInfoJobTest {
"-hive_metastore_uris", "", "-hive_metastore_uris", "",
"-leavesPath", workingDir.toString() + "/currentIteration/", "-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/", "-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation" "-relationPath", workingDir.toString() + "/relation"
@ -704,7 +704,7 @@ public class PrepareInfoJobTest {
"-hive_metastore_uris", "", "-hive_metastore_uris", "",
"-leavesPath", workingDir.toString() + "/currentIteration/", "-leavesPath", workingDir.toString() + "/currentIteration/",
"-resultOrgPath", workingDir.toString() + "/resultOrganization/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/",
"-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/",
"-childParentPath", workingDir.toString() + "/childParentOrg/", "-childParentPath", workingDir.toString() + "/childParentOrg/",
"-relationPath", workingDir.toString() + "/relation" "-relationPath", workingDir.toString() + "/relation"

View File

@ -85,9 +85,9 @@ public class SparkJobTest {
.getPath(); .getPath();
final String projectOrgPath = getClass() final String projectOrgPath = getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/")
.getPath(); .getPath();
readPath(spark, leavesPath, Leaves.class) readPath(spark, leavesPath, Leaves.class)
.write() .write()
@ -100,9 +100,9 @@ public class SparkJobTest {
.json(workingDir.toString() + "/orgsInput"); .json(workingDir.toString() + "/orgsInput");
readPath(spark, projectOrgPath, KeyValueSet.class) readPath(spark, projectOrgPath, KeyValueSet.class)
.write() .write()
.option("compression", "gzip") .option("compression", "gzip")
.json(workingDir.toString() + "/projectInput"); .json(workingDir.toString() + "/projectInput");
SparkResultToOrganizationFromSemRel SparkResultToOrganizationFromSemRel
@ -114,7 +114,7 @@ public class SparkJobTest {
"-outputPath", workingDir.toString() + "/finalrelation", "-outputPath", workingDir.toString() + "/finalrelation",
"-leavesPath", workingDir.toString() + "/leavesInput", "-leavesPath", workingDir.toString() + "/leavesInput",
"-resultOrgPath", workingDir.toString() + "/orgsInput", "-resultOrgPath", workingDir.toString() + "/orgsInput",
"-projectOrganizationPath", workingDir.toString() + "/projectInput", "-projectOrganizationPath", workingDir.toString() + "/projectInput",
"-childParentPath", childParentPath, "-childParentPath", childParentPath,
"-workingDir", workingDir.toString() "-workingDir", workingDir.toString()
}); });
@ -161,19 +161,24 @@ public class SparkJobTest {
.foreach(r -> Assertions.assertEquals(ModelConstants.HAS_AUTHOR_INSTITUTION, r.getRelClass())); .foreach(r -> Assertions.assertEquals(ModelConstants.HAS_AUTHOR_INSTITUTION, r.getRelClass()));
Assertions Assertions
.assertEquals( .assertEquals(
2, result.filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count()); 2,
result.filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count());
Assertions Assertions
.assertEquals( .assertEquals(
3, result.filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count()); 3,
result.filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count());
Assertions Assertions
.assertEquals( .assertEquals(
2, result.filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count()); 2,
result.filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count());
Assertions Assertions
.assertEquals( .assertEquals(
1, result.filter(r -> r.getSource().equals("50|openaire____::ec653e804967133b9436fdd30d3ff51d")).count()); 1,
result.filter(r -> r.getSource().equals("50|openaire____::ec653e804967133b9436fdd30d3ff51d")).count());
Assertions Assertions
.assertEquals( .assertEquals(
1, result.filter(r -> r.getSource().equals("50|doajarticles::03748bcb5d754c951efec9700e18a56d")).count()); 1,
result.filter(r -> r.getSource().equals("50|doajarticles::03748bcb5d754c951efec9700e18a56d")).count());
Assertions.assertEquals(9, result.filter(r -> r.getSource().substring(0, 3).equals("20|")).count()); Assertions.assertEquals(9, result.filter(r -> r.getSource().substring(0, 3).equals("20|")).count());
result result
@ -181,19 +186,24 @@ public class SparkJobTest {
.foreach(r -> Assertions.assertEquals(ModelConstants.IS_AUTHOR_INSTITUTION_OF, r.getRelClass())); .foreach(r -> Assertions.assertEquals(ModelConstants.IS_AUTHOR_INSTITUTION_OF, r.getRelClass()));
Assertions Assertions
.assertEquals( .assertEquals(
1, result.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count()); 1,
result.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count());
Assertions Assertions
.assertEquals( .assertEquals(
1, result.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count()); 1,
result.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count());
Assertions Assertions
.assertEquals( .assertEquals(
2, result.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count()); 2,
result.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count());
Assertions Assertions
.assertEquals( .assertEquals(
2, result.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")).count()); 2,
result.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")).count());
Assertions Assertions
.assertEquals( .assertEquals(
3, result.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")).count()); 3,
result.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")).count());
Assertions Assertions
.assertTrue( .assertTrue(
@ -336,332 +346,343 @@ public class SparkJobTest {
public void completeProjectExecution() throws Exception { public void completeProjectExecution() throws Exception {
final String graphPath = getClass() final String graphPath = getClass()
.getResource("/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph") .getResource("/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph")
.getPath(); .getPath();
final String leavesPath = getClass() final String leavesPath = getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/")
.getPath(); .getPath();
final String childParentPath = getClass() final String childParentPath = getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/")
.getPath(); .getPath();
final String resultOrgPath = getClass() final String resultOrgPath = getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/")
.getPath(); .getPath();
final String projectOrgPath = getClass() final String projectOrgPath = getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/")
.getPath(); .getPath();
readPath(spark, leavesPath, Leaves.class) readPath(spark, leavesPath, Leaves.class)
.write() .write()
.option("compression", "gzip") .option("compression", "gzip")
.json(workingDir.toString() + "/leavesInput"); .json(workingDir.toString() + "/leavesInput");
readPath(spark, resultOrgPath, KeyValueSet.class) readPath(spark, resultOrgPath, KeyValueSet.class)
.write() .write()
.option("compression", "gzip") .option("compression", "gzip")
.json(workingDir.toString() + "/orgsInput"); .json(workingDir.toString() + "/orgsInput");
readPath(spark, projectOrgPath, KeyValueSet.class) readPath(spark, projectOrgPath, KeyValueSet.class)
.write() .write()
.option("compression", "gzip") .option("compression", "gzip")
.json(workingDir.toString() + "/projectInput"); .json(workingDir.toString() + "/projectInput");
SparkResultToOrganizationFromSemRel SparkResultToOrganizationFromSemRel
.main( .main(
new String[] { new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-relationPath", graphPath, "-relationPath", graphPath,
"-hive_metastore_uris", "", "-hive_metastore_uris", "",
"-outputPath", workingDir.toString() + "/finalrelation", "-outputPath", workingDir.toString() + "/finalrelation",
"-leavesPath", workingDir.toString() + "/leavesInput", "-leavesPath", workingDir.toString() + "/leavesInput",
"-resultOrgPath", workingDir.toString() + "/orgsInput", "-resultOrgPath", workingDir.toString() + "/orgsInput",
"-projectOrganizationPath", workingDir.toString() + "/projectInput", "-projectOrganizationPath", workingDir.toString() + "/projectInput",
"-childParentPath", childParentPath, "-childParentPath", childParentPath,
"-workingDir", workingDir.toString() "-workingDir", workingDir.toString()
}); });
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> temp = sc JavaRDD<Relation> temp = sc
.textFile(workingDir.toString() + "/finalrelation") .textFile(workingDir.toString() + "/finalrelation")
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); .map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
Assertions.assertEquals(36, temp.count()); Assertions.assertEquals(36, temp.count());
JavaRDD<Relation> project = temp.filter(r -> r.getSource().startsWith("40|") || r.getTarget().startsWith("40|")); JavaRDD<Relation> project = temp
.filter(r -> r.getSource().startsWith("40|") || r.getTarget().startsWith("40|"));
Assertions.assertEquals(18, project.count()); Assertions.assertEquals(18, project.count());
project.foreach(r -> Assertions.assertEquals(ModelConstants.PARTICIPATION, r.getSubRelType())); project.foreach(r -> Assertions.assertEquals(ModelConstants.PARTICIPATION, r.getSubRelType()));
project.foreach(r -> Assertions.assertEquals(ModelConstants.PROJECT_ORGANIZATION, r.getRelType())); project.foreach(r -> Assertions.assertEquals(ModelConstants.PROJECT_ORGANIZATION, r.getRelType()));
project project
.foreach( .foreach(
r -> Assertions r -> Assertions
.assertEquals( .assertEquals(
PropagationConstant.PROPAGATION_DATA_INFO_TYPE, r.getDataInfo().getInferenceprovenance())); PropagationConstant.PROPAGATION_DATA_INFO_TYPE, r.getDataInfo().getInferenceprovenance()));
project project
.foreach( .foreach(
r -> Assertions r -> Assertions
.assertEquals( .assertEquals(
PropagationConstant.PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID, PropagationConstant.PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID,
r.getDataInfo().getProvenanceaction().getClassid())); r.getDataInfo().getProvenanceaction().getClassid()));
project project
.foreach( .foreach(
r -> Assertions r -> Assertions
.assertEquals( .assertEquals(
PropagationConstant.PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_NAME, PropagationConstant.PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_NAME,
r.getDataInfo().getProvenanceaction().getClassname())); r.getDataInfo().getProvenanceaction().getClassname()));
project project
.foreach( .foreach(
r -> Assertions r -> Assertions
.assertEquals( .assertEquals(
"0.85", "0.85",
r.getDataInfo().getTrust())); r.getDataInfo().getTrust()));
Assertions.assertEquals(9, project.filter(r -> r.getSource().substring(0, 3).equals("40|")).count()); Assertions.assertEquals(9, project.filter(r -> r.getSource().substring(0, 3).equals("40|")).count());
project project
.filter(r -> r.getSource().substring(0, 3).equals("40|")) .filter(r -> r.getSource().substring(0, 3).equals("40|"))
.foreach(r -> Assertions.assertEquals(ModelConstants.HAS_PARTICIPANT, r.getRelClass())); .foreach(r -> Assertions.assertEquals(ModelConstants.HAS_PARTICIPANT, r.getRelClass()));
Assertions Assertions
.assertEquals( .assertEquals(
2, project.filter(r -> r.getSource().equals("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count()); 2,
project.filter(r -> r.getSource().equals("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count());
Assertions Assertions
.assertEquals( .assertEquals(
3, project.filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count()); 3,
project.filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count());
Assertions Assertions
.assertEquals( .assertEquals(
2, project.filter(r -> r.getSource().equals("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count()); 2,
project.filter(r -> r.getSource().equals("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count());
Assertions Assertions
.assertEquals( .assertEquals(
1, project.filter(r -> r.getSource().equals("40|openaire____::ec653e804967133b9436fdd30d3ff51d")).count()); 1,
project.filter(r -> r.getSource().equals("40|openaire____::ec653e804967133b9436fdd30d3ff51d")).count());
Assertions Assertions
.assertEquals( .assertEquals(
1, project.filter(r -> r.getSource().equals("40|doajarticles::03748bcb5d754c951efec9700e18a56d")).count()); 1,
project.filter(r -> r.getSource().equals("40|doajarticles::03748bcb5d754c951efec9700e18a56d")).count());
Assertions.assertEquals(9, project.filter(r -> r.getSource().substring(0, 3).equals("20|")).count()); Assertions.assertEquals(9, project.filter(r -> r.getSource().substring(0, 3).equals("20|")).count());
project project
.filter(r -> r.getSource().substring(0, 3).equals("20|")) .filter(r -> r.getSource().substring(0, 3).equals("20|"))
.foreach(r -> Assertions.assertEquals(ModelConstants.IS_PARTICIPANT, r.getRelClass())); .foreach(r -> Assertions.assertEquals(ModelConstants.IS_PARTICIPANT, r.getRelClass()));
Assertions Assertions
.assertEquals( .assertEquals(
1, project.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count()); 1,
project.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count());
Assertions Assertions
.assertEquals( .assertEquals(
1, project.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count()); 1,
project.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count());
Assertions Assertions
.assertEquals( .assertEquals(
2, project.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count()); 2,
project.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count());
Assertions Assertions
.assertEquals( .assertEquals(
2, project.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")).count()); 2,
project.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")).count());
Assertions Assertions
.assertEquals( .assertEquals(
3, project.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")).count()); 3,
project.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")).count());
Assertions Assertions
.assertTrue( .assertTrue(
project project
.filter(r -> r.getSource().equals("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")) .filter(r -> r.getSource().equals("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074"))
.map(r -> r.getTarget()) .map(r -> r.getTarget())
.collect() .collect()
.contains("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); .contains("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"));
Assertions Assertions
.assertTrue( .assertTrue(
project project
.filter(r -> r.getSource().equals("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")) .filter(r -> r.getSource().equals("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074"))
.map(r -> r.getTarget()) .map(r -> r.getTarget())
.collect() .collect()
.contains("20|openaire____::ec653e804967133b9436fdd30d3ff51d")); .contains("20|openaire____::ec653e804967133b9436fdd30d3ff51d"));
Assertions Assertions
.assertTrue( .assertTrue(
project project
.filter(r -> r.getSource().equals("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) .filter(r -> r.getSource().equals("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"))
.map(r -> r.getTarget()) .map(r -> r.getTarget())
.collect() .collect()
.contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); .contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
Assertions Assertions
.assertTrue( .assertTrue(
project project
.filter(r -> r.getSource().equals("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) .filter(r -> r.getSource().equals("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"))
.map(r -> r.getTarget()) .map(r -> r.getTarget())
.collect() .collect()
.contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d")); .contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d"));
Assertions Assertions
.assertTrue( .assertTrue(
project project
.filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")) .filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d"))
.map(r -> r.getTarget()) .map(r -> r.getTarget())
.collect() .collect()
.contains("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")); .contains("20|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions Assertions
.assertTrue( .assertTrue(
project project
.filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")) .filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d"))
.map(r -> r.getTarget()) .map(r -> r.getTarget())
.collect() .collect()
.contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); .contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
Assertions Assertions
.assertTrue( .assertTrue(
project project
.filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")) .filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d"))
.map(r -> r.getTarget()) .map(r -> r.getTarget())
.collect() .collect()
.contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d")); .contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d"));
Assertions Assertions
.assertTrue( .assertTrue(
project project
.filter(r -> r.getSource().equals("40|openaire____::ec653e804967133b9436fdd30d3ff51d")) .filter(r -> r.getSource().equals("40|openaire____::ec653e804967133b9436fdd30d3ff51d"))
.map(r -> r.getTarget()) .map(r -> r.getTarget())
.collect() .collect()
.contains("20|openaire____::ec653e804967133b9436fdd30d3ff51d")); .contains("20|openaire____::ec653e804967133b9436fdd30d3ff51d"));
Assertions Assertions
.assertTrue( .assertTrue(
project project
.filter(r -> r.getSource().equals("40|doajarticles::03748bcb5d754c951efec9700e18a56d")) .filter(r -> r.getSource().equals("40|doajarticles::03748bcb5d754c951efec9700e18a56d"))
.map(r -> r.getTarget()) .map(r -> r.getTarget())
.collect() .collect()
.contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d")); .contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d"));
Assertions Assertions
.assertTrue( .assertTrue(
project project
.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")) .filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d"))
.map(r -> r.getTarget()) .map(r -> r.getTarget())
.collect() .collect()
.contains("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); .contains("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074"));
Assertions Assertions
.assertTrue( .assertTrue(
project project
.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")) .filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d"))
.map(r -> r.getTarget()) .map(r -> r.getTarget())
.collect() .collect()
.contains("40|openaire____::ec653e804967133b9436fdd30d3ff51d")); .contains("40|openaire____::ec653e804967133b9436fdd30d3ff51d"));
Assertions Assertions
.assertTrue( .assertTrue(
project project
.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) .filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"))
.map(r -> r.getTarget()) .map(r -> r.getTarget())
.collect() .collect()
.contains("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")); .contains("40|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions Assertions
.assertTrue( .assertTrue(
project project
.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) .filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"))
.map(r -> r.getTarget()) .map(r -> r.getTarget())
.collect() .collect()
.contains("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); .contains("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
Assertions Assertions
.assertTrue( .assertTrue(
project project
.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")) .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d"))
.map(r -> r.getTarget()) .map(r -> r.getTarget())
.collect() .collect()
.contains("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")); .contains("40|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions Assertions
.assertTrue( .assertTrue(
project project
.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")) .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d"))
.map(r -> r.getTarget()) .map(r -> r.getTarget())
.collect() .collect()
.contains("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); .contains("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"));
Assertions Assertions
.assertTrue( .assertTrue(
project project
.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")) .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d"))
.map(r -> r.getTarget()) .map(r -> r.getTarget())
.collect() .collect()
.contains("40|doajarticles::03748bcb5d754c951efec9700e18a56d")); .contains("40|doajarticles::03748bcb5d754c951efec9700e18a56d"));
Assertions Assertions
.assertTrue( .assertTrue(
project project
.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")) .filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d"))
.map(r -> r.getTarget()) .map(r -> r.getTarget())
.collect() .collect()
.contains("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")); .contains("40|dedup_wf_001::2899e571609779168222fdeb59cb916d"));
Assertions Assertions
.assertTrue( .assertTrue(
project project
.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")) .filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"))
.map(r -> r.getTarget()) .map(r -> r.getTarget())
.collect() .collect()
.contains("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); .contains("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074"));
} }
@Test @Test
public void singleIterationExecution() throws Exception { public void singleIterationExecution() throws Exception {
final String graphPath = getClass() final String graphPath = getClass()
.getResource("/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph") .getResource("/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph")
.getPath(); .getPath();
final String leavesPath = getClass() final String leavesPath = getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/")
.getPath(); .getPath();
final String childParentPath = getClass() final String childParentPath = getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/")
.getPath(); .getPath();
final String resultOrgPath = getClass() final String resultOrgPath = getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/")
.getPath(); .getPath();
final String projectOrgPath = getClass() final String projectOrgPath = getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/")
.getPath(); .getPath();
readPath(spark, leavesPath, Leaves.class) readPath(spark, leavesPath, Leaves.class)
.write() .write()
.option("compression", "gzip") .option("compression", "gzip")
.json(workingDir.toString() + "/leavesInput"); .json(workingDir.toString() + "/leavesInput");
readPath(spark, resultOrgPath, KeyValueSet.class) readPath(spark, resultOrgPath, KeyValueSet.class)
.write() .write()
.option("compression", "gzip") .option("compression", "gzip")
.json(workingDir.toString() + "/orgsInput"); .json(workingDir.toString() + "/orgsInput");
readPath(spark, projectOrgPath, KeyValueSet.class) readPath(spark, projectOrgPath, KeyValueSet.class)
.write() .write()
.option("compression", "gzip") .option("compression", "gzip")
.json(workingDir.toString() + "/projectInput"); .json(workingDir.toString() + "/projectInput");
SparkResultToOrganizationFromSemRel SparkResultToOrganizationFromSemRel
.main( .main(
new String[] { new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-relationPath", graphPath, "-relationPath", graphPath,
"-hive_metastore_uris", "", "-hive_metastore_uris", "",
"-outputPath", workingDir.toString() + "/finalrelation", "-outputPath", workingDir.toString() + "/finalrelation",
"-leavesPath", workingDir.toString() + "/leavesInput", "-leavesPath", workingDir.toString() + "/leavesInput",
"-resultOrgPath", workingDir.toString() + "/orgsInput", "-resultOrgPath", workingDir.toString() + "/orgsInput",
"-projectOrganizationPath", workingDir.toString() + "/projectInput", "-projectOrganizationPath", workingDir.toString() + "/projectInput",
"-childParentPath", childParentPath, "-childParentPath", childParentPath,
"-workingDir", workingDir.toString(), "-workingDir", workingDir.toString(),
"-iterations", "1" "-iterations", "1"
}); });
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> temp = sc JavaRDD<Relation> temp = sc
.textFile(workingDir.toString() + "/finalrelation") .textFile(workingDir.toString() + "/finalrelation")
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); .map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
Assertions.assertEquals(16, temp.count()); Assertions.assertEquals(16, temp.count());

View File

@ -87,7 +87,8 @@ public class StepActionsTest {
getClass() getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/") "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/")
.getPath(), ModelConstants.HAS_AUTHOR_INSTITUTION); .getPath(),
ModelConstants.HAS_AUTHOR_INSTITUTION);
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());