From 0389b57ca7cf0ba54e8949e5cecec20c3991d8bd Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 31 May 2023 11:06:58 +0200 Subject: [PATCH 1/3] added propagation for project to organization --- .../eu/dnetlib/dhp/PropagationConstant.java | 31 +++++++++- .../Leaves.java | 2 +- .../PrepareInfo.java | 47 +++++++++++---- .../PropagationCounter.java | 2 +- .../SparkResultToOrganizationFromSemRel.java | 59 +++++++++++++------ .../StepActions.java | 30 ++++------ .../input_preparation_parameter.json | 6 ++ .../input_propagation_parameter.json | 7 ++- .../oozie_app/config-default.xml | 0 .../oozie_app/workflow.xml | 5 +- .../PrepareInfoJobTest.java | 28 ++++----- .../SparkJobTest.java | 4 +- .../StepActionsTest.java | 2 +- .../childparenttest1/relation | 0 .../childparenttest2/relation | 0 .../execstep/childParentOrg/childparent | 0 .../execstep/currentIteration/leaves | 0 .../execstep/relation | 0 .../execstep/relsforiteration1/relation | 0 .../resultOrganization/resultorganization | 0 .../resultorganizationtest/relation | 0 21 files changed, 153 insertions(+), 70 deletions(-) rename dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/Leaves.java (79%) rename dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/PrepareInfo.java (78%) rename dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/PropagationCounter.java (97%) rename dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/SparkResultToOrganizationFromSemRel.java (80%) rename dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/StepActions.java (89%) rename dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel 
=> entitytoorganizationfromsemrel}/input_preparation_parameter.json (87%) rename dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/input_propagation_parameter.json (90%) rename dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/oozie_app/config-default.xml (100%) rename dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/oozie_app/workflow.xml (96%) rename dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/PrepareInfoJobTest.java (94%) rename dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/SparkJobTest.java (98%) rename dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/StepActionsTest.java (99%) rename dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/childparenttest1/relation (100%) rename dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/childparenttest2/relation (100%) rename dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/execstep/childParentOrg/childparent (100%) rename dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/execstep/currentIteration/leaves (100%) rename dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/execstep/relation (100%) rename 
dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/execstep/relsforiteration1/relation (100%) rename dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/execstep/resultOrganization/resultorganization (100%) rename dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/resultorganizationtest/relation (100%) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java index 89bdf0982..53769c9fb 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java @@ -57,7 +57,10 @@ public class PropagationConstant { public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME = "Propagation of affiliation to result collected from datasources of type institutional repository"; public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID = "result:organization:semrel"; - public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME = "Propagation of affiliation to result through sematic relations"; + public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME = "Propagation of affiliation to result through semantic relations"; + + public static final String PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID = "project:organization:semrel"; + public static final String PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_NAME = "Propagation of participation to project through semantic relations"; public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID = 
"result:project:semrel"; public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME = "Propagation of result to project through semantic relation"; @@ -171,6 +174,32 @@ public class PropagationConstant { return newRelations; } + public static Relation getParticipantRelation( + String source, + String target, + String rel_class) { + return getRelation(source, target , + rel_class, + ModelConstants.PROJECT_ORGANIZATION, + ModelConstants.PARTICIPATION, + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID, + PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_NAME); + } + + public static Relation getAffiliationRelation( + String source, + String target, + String rel_class) { + return getRelation(source, target , + rel_class, + ModelConstants.RESULT_ORGANIZATION, + ModelConstants.AFFILIATION, + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID, + PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME); + } + public static Relation getRelation( String source, String target, diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/Leaves.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/Leaves.java similarity index 79% rename from dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/Leaves.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/Leaves.java index 7984721e8..e010b54c0 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/Leaves.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/Leaves.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.resulttoorganizationfromsemrel; +package eu.dnetlib.dhp.entitytoorganizationfromsemrel; import java.io.Serializable; diff --git 
a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PrepareInfo.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java similarity index 78% rename from dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PrepareInfo.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java index 23909fd9a..7ad9c4cee 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PrepareInfo.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.resulttoorganizationfromsemrel; +package eu.dnetlib.dhp.entitytoorganizationfromsemrel; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; @@ -7,6 +7,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.io.Serializable; import java.util.*; +import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.*; @@ -14,8 +15,6 @@ import org.apache.spark.sql.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.databind.ObjectMapper; - import eu.dnetlib.dhp.KeyValueSet; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob; @@ -47,13 +46,20 @@ public class PrepareInfo implements Serializable { "' and datainfo.deletedbyinference = false " + "GROUP BY source"; + // associate projects to all the participant orgs + private static final String PROJECT_ORGANIZATION_QUERY = "SELECT source key, collect_set(target) as valueSet " + + "FROM relation " + + "WHERE 
lower(relclass) = '" + ModelConstants.IS_PARTICIPANT.toLowerCase() + + "' and datainfo.deletedbyinference = false " + + "GROUP BY source"; + public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils .toString( SparkResultToOrganizationFromIstRepoJob.class .getResourceAsStream( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/input_preparation_parameter.json")); + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_preparation_parameter.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); @@ -74,6 +80,9 @@ public class PrepareInfo implements Serializable { final String resultOrganizationPath = parser.get("resultOrgPath"); log.info("resultOrganizationPath: {}", resultOrganizationPath); + final String projectOrgPath = parser.get("projectOrganizationPath"); + log.info("projectOrgPath: {}", projectOrgPath); + final String relationPath = parser.get("relationPath"); log.info("relationPath: {}", relationPath); @@ -89,11 +98,12 @@ public class PrepareInfo implements Serializable { childParentPath, leavesPath, resultOrganizationPath, + projectOrgPath, relationPath)); } private static void prepareInfo(SparkSession spark, String inputPath, String childParentOrganizationPath, - String currentIterationPath, String resultOrganizationPath, String relationPath) { + String currentIterationPath, String resultOrganizationPath, String resultProjectPath, String relationPath) { Dataset relation = readPath(spark, inputPath + "/relation", Relation.class); relation.createOrReplaceTempView("relation"); @@ -113,14 +123,31 @@ public class PrepareInfo implements Serializable { .option("compression", "gzip") .json(resultOrganizationPath); + spark + .sql(PROJECT_ORGANIZATION_QUERY) + .as(Encoders.bean(KeyValueSet.class)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(resultProjectPath); + relation - .filter( - (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && - 
r.getRelClass().equals(ModelConstants.HAS_AUTHOR_INSTITUTION)) - .write() + .filter( + (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && + r.getRelClass().equals(ModelConstants.HAS_AUTHOR_INSTITUTION)) + .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") - .json(relationPath); + .json(relationPath + "/result"); + + relation + .filter( + (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && + r.getRelClass().equals(ModelConstants.IS_PARTICIPANT)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(relationPath + "/project"); Dataset children = spark .sql( diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PropagationCounter.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PropagationCounter.java similarity index 97% rename from dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PropagationCounter.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PropagationCounter.java index 788eff0e3..1c408d1c3 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PropagationCounter.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PropagationCounter.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.resulttoorganizationfromsemrel; +package eu.dnetlib.dhp.entitytoorganizationfromsemrel; import java.io.Serializable; diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java similarity index 80% rename from 
dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java index cfc69a8f0..19e55a905 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.resulttoorganizationfromsemrel; +package eu.dnetlib.dhp.entitytoorganizationfromsemrel; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; @@ -30,7 +30,8 @@ import eu.dnetlib.dhp.schema.oaf.Relation; public class SparkResultToOrganizationFromSemRel implements Serializable { private static final Logger log = LoggerFactory.getLogger(SparkResultToOrganizationFromSemRel.class); private static final int MAX_ITERATION = 5; - public static final String NEW_RELATION_PATH = "/newRelation"; + public static final String NEW_RESULT_RELATION_PATH = "/newResultRelation"; + public static final String NEW_PROJECT_RELATION_PATH = "/newProjectRelation"; public static void main(String[] args) throws Exception { @@ -62,6 +63,9 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { final String resultOrganizationPath = parser.get("resultOrgPath"); log.info("resultOrganizationPath: {}", resultOrganizationPath); + final String projectOrganizationPath = parser.get("projectOrganizationPath"); + log.info("projectOrganizationPath: {}", projectOrganizationPath); + final String workingPath = parser.get("workingDir"); log.info("workingPath: {}", workingPath); @@ -88,6 +92,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { leavesPath, childParentPath, 
resultOrganizationPath, + projectOrganizationPath, relationPath, workingPath, outputPath, @@ -98,13 +103,14 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { String leavesPath, String childParentPath, String resultOrganizationPath, + String projectOrganizationPath, String graphPath, String workingPath, String outputPath, int iterations) { if (iterations == 1) { doPropagateOnce( - spark, leavesPath, childParentPath, resultOrganizationPath, graphPath, + spark, leavesPath, childParentPath, resultOrganizationPath, projectOrganizationPath, graphPath, workingPath, outputPath); } else { @@ -130,15 +136,22 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { } private static void doPropagateOnce(SparkSession spark, String leavesPath, String childParentPath, - String resultOrganizationPath, String graphPath, String workingPath, + String resultOrganizationPath, String projectOrganizationPath, String graphPath, String workingPath, String outputPath) { StepActions .execStep( - spark, graphPath, workingPath + NEW_RELATION_PATH, + spark, graphPath + "/result", workingPath + NEW_RESULT_RELATION_PATH, leavesPath, childParentPath, resultOrganizationPath); - addNewRelations(spark, workingPath + NEW_RELATION_PATH, outputPath); + addNewRelations(spark, workingPath + NEW_RESULT_RELATION_PATH, outputPath); + + StepActions + .execStep( + spark, graphPath + "/project", workingPath + NEW_PROJECT_RELATION_PATH, + leavesPath, childParentPath, projectOrganizationPath); + + addNewRelations(spark, workingPath + NEW_PROJECT_RELATION_PATH, outputPath); } private static void doPropagate(SparkSession spark, String leavesPath, String childParentPath, @@ -151,11 +164,11 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { iteration++; StepActions .execStep( - spark, graphPath, workingPath + NEW_RELATION_PATH, + spark, graphPath, workingPath + NEW_RESULT_RELATION_PATH, leavesPath, childParentPath, resultOrganizationPath); 
StepActions .prepareForNextStep( - spark, workingPath + NEW_RELATION_PATH, resultOrganizationPath, leavesPath, + spark, workingPath + NEW_RESULT_RELATION_PATH, resultOrganizationPath, leavesPath, childParentPath, workingPath + "/leaves", workingPath + "/resOrg"); moveOutput(spark, workingPath, leavesPath, resultOrganizationPath); leavesCount = readPath(spark, leavesPath, Leaves.class).count(); @@ -185,7 +198,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { propagationCounter.getNotReachedFirstParent().add(1); } - addNewRelations(spark, workingPath + NEW_RELATION_PATH, outputPath); + addNewRelations(spark, workingPath + NEW_RESULT_RELATION_PATH, outputPath); } private static void moveOutput(SparkSession spark, String workingPath, String leavesPath, @@ -212,16 +225,24 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { .mapGroups( (MapGroupsFunction) (k, it) -> it.next(), Encoders.bean(Relation.class)) .flatMap( - (FlatMapFunction) r -> Arrays - .asList( - r, getRelation( - r.getTarget(), r.getSource(), ModelConstants.IS_AUTHOR_INSTITUTION_OF, - ModelConstants.RESULT_ORGANIZATION, - ModelConstants.AFFILIATION, - PROPAGATION_DATA_INFO_TYPE, - PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID, - PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME)) - .iterator() + (FlatMapFunction) r -> + { + if(r.getSource().startsWith("50|")){ + return Arrays + .asList( + r, getAffiliationRelation( + r.getTarget(), r.getSource(), ModelConstants.IS_AUTHOR_INSTITUTION_OF)) + .iterator(); + }else{ + return Arrays + .asList( + r, getParticipantRelation( + r.getTarget(), r.getSource(), ModelConstants.HAS_PARTICIPANT)) + .iterator(); + } + } + + , Encoders.bean(Relation.class)) .write() diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/StepActions.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActions.java 
similarity index 89% rename from dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/StepActions.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActions.java index 1adbbe60e..5b6c397cf 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/StepActions.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActions.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.resulttoorganizationfromsemrel; +package eu.dnetlib.dhp.entitytoorganizationfromsemrel; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.PropagationConstant.readPath; @@ -14,8 +14,6 @@ import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.jetbrains.annotations.NotNull; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; @@ -28,13 +26,13 @@ public class StepActions implements Serializable { public static void execStep(SparkSession spark, String graphPath, String newRelationPath, - String leavesPath, String chldParentOrgPath, String resultOrgPath) { + String leavesPath, String chldParentOrgPath, String entityOrgPath) { Dataset relationGraph = readPath(spark, graphPath, Relation.class); // select only the relation source target among those proposed by propagation that are not already existent getNewRels( newRelationPath, relationGraph, - getPropagationRelation(spark, leavesPath, chldParentOrgPath, resultOrgPath)); + getPropagationRelation(spark, leavesPath, chldParentOrgPath, entityOrgPath, ModelConstants.HAS_AUTHOR_INSTITUTION)); } @@ -152,19 +150,20 @@ public class StepActions implements Serializable { private static Dataset getPropagationRelation(SparkSession spark, String leavesPath, String chldParentOrgPath, - String resultOrgPath) { + String entityOrgPath, 
+ String semantics) { Dataset childParent = readPath(spark, chldParentOrgPath, KeyValueSet.class); - Dataset resultOrg = readPath(spark, resultOrgPath, KeyValueSet.class); + Dataset entityOrg = readPath(spark, entityOrgPath, KeyValueSet.class); Dataset leaves = readPath(spark, leavesPath, Leaves.class); childParent.createOrReplaceTempView("childParent"); - resultOrg.createOrReplaceTempView("resultOrg"); + entityOrg.createOrReplaceTempView("entityOrg"); leaves.createOrReplaceTempView("leaves"); Dataset resultParent = spark .sql( - "SELECT resId as key, " + + "SELECT entityId as key, " + "collect_set(parent) valueSet " + "FROM (SELECT key as child, parent " + " FROM childParent " + @@ -172,7 +171,7 @@ public class StepActions implements Serializable { "JOIN leaves " + "ON leaves.value = cp.child " + "JOIN (" + - "SELECT key as resId, org " + + "SELECT key as entityId, org " + "FROM resultOrg " + "LATERAL VIEW explode (valueSet) ks as org ) as ro " + "ON leaves.value = ro.org " + @@ -186,19 +185,16 @@ public class StepActions implements Serializable { .getValueSet() .stream() .map( - orgId -> getRelation( + orgId -> getAffiliationRelation( v.getKey(), orgId, - ModelConstants.HAS_AUTHOR_INSTITUTION, - ModelConstants.RESULT_ORGANIZATION, - ModelConstants.AFFILIATION, - PROPAGATION_DATA_INFO_TYPE, - PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID, - PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME)) + semantics)) .collect(Collectors.toList()) .iterator(), Encoders.bean(Relation.class)); } + + } diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/input_preparation_parameter.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_preparation_parameter.json similarity index 87% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/input_preparation_parameter.json rename to 
dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_preparation_parameter.json index c79bfe05d..b59937331 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/input_preparation_parameter.json +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_preparation_parameter.json @@ -40,5 +40,11 @@ "paramLongName": "relationPath", "paramDescription": "the path where to store the selected subset of relations", "paramRequired": false + }, + { + "paramName": "pop", + "paramLongName": "projectOrganizationPath", + "paramDescription": "the path where to store the project to organization association", + "paramRequired": true } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/input_propagation_parameter.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_propagation_parameter.json similarity index 90% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/input_propagation_parameter.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_propagation_parameter.json index e09cd62fa..5a8597f38 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/input_propagation_parameter.json +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_propagation_parameter.json @@ -52,5 +52,10 @@ "paramLongName": "iterations", "paramDescription": "the number of iterations to be computed", "paramRequired": false - } + },{ + "paramName": "pop", + "paramLongName": "projectOrganizationPath", + "paramDescription": "the path where to store the project to organization association", + "paramRequired": true +} ] \ No newline at end of file diff --git 
a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/oozie_app/config-default.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/oozie_app/workflow.xml similarity index 96% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/oozie_app/workflow.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/oozie_app/workflow.xml index 5ce2f5c06..ff6ec8f37 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/oozie_app/workflow.xml @@ -134,7 +134,7 @@ yarn cluster PrepareResultOrganizationAssociation - eu.dnetlib.dhp.resulttoorganizationfromsemrel.PrepareInfo + eu.dnetlib.dhp.entitytoorganizationfromsemrel.PrepareInfo dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -150,6 +150,7 @@ --leavesPath${workingDir}/preparedInfo/leavesPath --childParentPath${workingDir}/preparedInfo/childParentPath --resultOrgPath${workingDir}/preparedInfo/resultOrgPath + --projectOrganizationPath${workingDir}/preparedInfo/projectOrganizationPath --relationPath${workingDir}/preparedInfo/relation @@ -161,7 +162,7 @@ yarn cluster resultToOrganizationFromSemRel - 
eu.dnetlib.dhp.resulttoorganizationfromsemrel.SparkResultToOrganizationFromSemRel + eu.dnetlib.dhp.entitytoorganizationfromsemrel.SparkResultToOrganizationFromSemRel dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PrepareInfoJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfoJobTest.java similarity index 94% rename from dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PrepareInfoJobTest.java rename to dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfoJobTest.java index 2d2668db3..3d7086739 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PrepareInfoJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfoJobTest.java @@ -1,22 +1,17 @@ -package eu.dnetlib.dhp.resulttoorganizationfromsemrel; - -import static eu.dnetlib.dhp.PropagationConstant.readPath; +package eu.dnetlib.dhp.entitytoorganizationfromsemrel; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.List; import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.ForeachFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.Assertions; @@ -28,7 +23,6 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.KeyValueSet; -import 
eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob; import eu.dnetlib.dhp.schema.oaf.Relation; public class PrepareInfoJobTest { @@ -78,11 +72,12 @@ public class PrepareInfoJobTest { "-isSparkSessionManaged", Boolean.FALSE.toString(), "-graphPath", getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/childparenttest1") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/childparenttest1") .getPath(), "-hive_metastore_uris", "", "-leavesPath", workingDir.toString() + "/currentIteration/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-childParentPath", workingDir.toString() + "/childParentOrg/", "-relationPath", workingDir.toString() + "/relation" @@ -223,11 +218,12 @@ public class PrepareInfoJobTest { "-isSparkSessionManaged", Boolean.FALSE.toString(), "-graphPath", getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/childparenttest2") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/childparenttest2") .getPath(), "-hive_metastore_uris", "", "-leavesPath", workingDir.toString() + "/currentIteration/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-childParentPath", workingDir.toString() + "/childParentOrg/", "-relationPath", workingDir.toString() + "/relation" @@ -343,11 +339,12 @@ public class PrepareInfoJobTest { "-isSparkSessionManaged", Boolean.FALSE.toString(), "-graphPath", getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/resultorganizationtest") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/resultorganizationtest") .getPath(), "-hive_metastore_uris", "", "-leavesPath", workingDir.toString() + "/currentIteration/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", 
"-childParentPath", workingDir.toString() + "/childParentOrg/", "-relationPath", workingDir.toString() + "/relation" @@ -355,7 +352,7 @@ public class PrepareInfoJobTest { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD tmp = sc - .textFile(workingDir.toString() + "/relation") + .textFile(workingDir.toString() + "/relation/result") .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); Dataset verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); @@ -373,11 +370,12 @@ public class PrepareInfoJobTest { "-isSparkSessionManaged", Boolean.FALSE.toString(), "-graphPath", getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/resultorganizationtest") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/resultorganizationtest") .getPath(), "-hive_metastore_uris", "", "-leavesPath", workingDir.toString() + "/currentIteration/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-childParentPath", workingDir.toString() + "/childParentOrg/", "-relationPath", workingDir.toString() + "/relation" @@ -507,11 +505,12 @@ public class PrepareInfoJobTest { "-isSparkSessionManaged", Boolean.FALSE.toString(), "-graphPath", getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/resultorganizationtest") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/resultorganizationtest") .getPath(), "-hive_metastore_uris", "", "-leavesPath", workingDir.toString() + "/currentIteration/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-childParentPath", workingDir.toString() + "/childParentOrg/", "-relationPath", workingDir.toString() + "/relation" @@ -534,11 +533,12 @@ public class PrepareInfoJobTest { "-isSparkSessionManaged", Boolean.FALSE.toString(), "-graphPath", getClass() .getResource( - 
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/childparenttest1") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/childparenttest1") .getPath(), "-hive_metastore_uris", "", "-leavesPath", workingDir.toString() + "/currentIteration/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-childParentPath", workingDir.toString() + "/childParentOrg/", "-relationPath", workingDir.toString() + "/relation" diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/SparkJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkJobTest.java similarity index 98% rename from dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/SparkJobTest.java rename to dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkJobTest.java index 7dd575b66..a4d8f83e3 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/SparkJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkJobTest.java @@ -1,7 +1,6 @@ -package eu.dnetlib.dhp.resulttoorganizationfromsemrel; +package eu.dnetlib.dhp.entitytoorganizationfromsemrel; -import static eu.dnetlib.dhp.PropagationConstant.isSparkSessionManaged; import static eu.dnetlib.dhp.PropagationConstant.readPath; import java.io.IOException; @@ -12,7 +11,6 @@ import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.ForeachFunction; import org.apache.spark.sql.SparkSession; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.Assertions; diff --git 
a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/StepActionsTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActionsTest.java similarity index 99% rename from dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/StepActionsTest.java rename to dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActionsTest.java index 5c715f3b9..77ed4dcbf 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/StepActionsTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActionsTest.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.resulttoorganizationfromsemrel; +package eu.dnetlib.dhp.entitytoorganizationfromsemrel; import java.io.IOException; import java.nio.file.Files; diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/childparenttest1/relation b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/childparenttest1/relation similarity index 100% rename from dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/childparenttest1/relation rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/childparenttest1/relation diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/childparenttest2/relation b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/childparenttest2/relation similarity index 100% rename from dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/childparenttest2/relation rename to 
dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/childparenttest2/relation diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/childParentOrg/childparent b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/childparent similarity index 100% rename from dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/childParentOrg/childparent rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/childparent diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/currentIteration/leaves b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/leaves similarity index 100% rename from dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/currentIteration/leaves rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/leaves diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/relation b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/relation similarity index 100% rename from dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/relation rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/relation diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/relsforiteration1/relation 
b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/relsforiteration1/relation similarity index 100% rename from dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/relsforiteration1/relation rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/relsforiteration1/relation diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/resultOrganization/resultorganization b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/resultorganization similarity index 100% rename from dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/resultOrganization/resultorganization rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/resultorganization diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/resultorganizationtest/relation b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/resultorganizationtest/relation similarity index 100% rename from dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/resultorganizationtest/relation rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/resultorganizationtest/relation From 97d72d41c31cc222f50b47a05d4993692892fb3c Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 31 May 2023 18:53:22 +0200 Subject: [PATCH 2/3] finalization of implementation and testing --- .../eu/dnetlib/dhp/PropagationConstant.java | 7 + .../PrepareInfo.java | 8 +- .../SparkResultToOrganizationFromSemRel.java | 50 +- .../StepActions.java | 103 +-- 
.../PrepareInfoJobTest.java | 166 +++++ .../SparkJobTest.java | 648 ++++++++++++++++-- .../StepActionsTest.java | 26 +- .../execstep/graph/project | 7 + .../execstep/graph/result | 7 + .../projectOrganization/projectorganization | 5 + .../execstep/relation | 14 - .../projectorganizationtest/relation | 7 + 12 files changed, 921 insertions(+), 127 deletions(-) create mode 100644 dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph/project create mode 100644 dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph/result create mode 100644 dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/projectorganization delete mode 100644 dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/relation create mode 100644 dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/projectorganizationtest/relation diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java index 53769c9fb..87528ef58 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java @@ -174,6 +174,13 @@ public class PropagationConstant { return newRelations; } + public static Relation getRelation(String source, String target, String rel_class){ + if (ModelConstants.HAS_PARTICIPANT.equals(rel_class)){ + return getParticipantRelation(source, target, rel_class); + }else + return getAffiliationRelation(source, target, rel_class); + } + public static Relation getParticipantRelation( String source, String target, diff --git 
a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java index 7ad9c4cee..971ef436f 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java @@ -49,7 +49,7 @@ public class PrepareInfo implements Serializable { // associate projects to all the participant orgs private static final String PROJECT_ORGANIZATION_QUERY = "SELECT source key, collect_set(target) as valueSet " + "FROM relation " + - "WHERE lower(relclass) = '" + ModelConstants.IS_PARTICIPANT.toLowerCase() + + "WHERE lower(relclass) = '" + ModelConstants.HAS_PARTICIPANT.toLowerCase() + "' and datainfo.deletedbyinference = false " + "GROUP BY source"; @@ -103,7 +103,7 @@ public class PrepareInfo implements Serializable { } private static void prepareInfo(SparkSession spark, String inputPath, String childParentOrganizationPath, - String currentIterationPath, String resultOrganizationPath, String resultProjectPath, String relationPath) { + String currentIterationPath, String resultOrganizationPath, String projectOrganizationPath, String relationPath) { Dataset relation = readPath(spark, inputPath + "/relation", Relation.class); relation.createOrReplaceTempView("relation"); @@ -129,7 +129,7 @@ public class PrepareInfo implements Serializable { .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") - .json(resultProjectPath); + .json(projectOrganizationPath); relation .filter( @@ -143,7 +143,7 @@ public class PrepareInfo implements Serializable { relation .filter( (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && - r.getRelClass().equals(ModelConstants.IS_PARTICIPANT)) + r.getRelClass().equals(ModelConstants.HAS_PARTICIPANT)) .write() .mode(SaveMode.Overwrite) 
.option("compression", "gzip") diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java index 19e55a905..dd32552ad 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java @@ -39,7 +39,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { .toString( SparkResultToOrganizationFromIstRepoJob.class .getResourceAsStream( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/input_propagation_parameter.json")); + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_propagation_parameter.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); @@ -129,7 +129,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { notReachedFirstParent); doPropagate( - spark, leavesPath, childParentPath, resultOrganizationPath, graphPath, + spark, leavesPath, childParentPath, resultOrganizationPath, projectOrganizationPath, graphPath, workingPath, outputPath, propagationCounter); } @@ -142,20 +142,20 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { StepActions .execStep( spark, graphPath + "/result", workingPath + NEW_RESULT_RELATION_PATH, - leavesPath, childParentPath, resultOrganizationPath); + leavesPath, childParentPath, resultOrganizationPath, ModelConstants.HAS_AUTHOR_INSTITUTION); addNewRelations(spark, workingPath + NEW_RESULT_RELATION_PATH, outputPath); StepActions .execStep( spark, graphPath + "/project", workingPath + NEW_PROJECT_RELATION_PATH, - leavesPath, childParentPath, projectOrganizationPath); + leavesPath, childParentPath, 
projectOrganizationPath, ModelConstants.HAS_PARTICIPANT); addNewRelations(spark, workingPath + NEW_PROJECT_RELATION_PATH, outputPath); } private static void doPropagate(SparkSession spark, String leavesPath, String childParentPath, - String resultOrganizationPath, String graphPath, String workingPath, String outputPath, + String resultOrganizationPath, String projectOrganizationPath, String graphPath, String workingPath, String outputPath, PropagationCounter propagationCounter) { int iteration = 0; long leavesCount; @@ -164,13 +164,18 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { iteration++; StepActions .execStep( - spark, graphPath, workingPath + NEW_RESULT_RELATION_PATH, - leavesPath, childParentPath, resultOrganizationPath); + spark, graphPath + "/result", workingPath + NEW_RESULT_RELATION_PATH, + leavesPath, childParentPath, resultOrganizationPath, ModelConstants.HAS_AUTHOR_INSTITUTION); + StepActions + .execStep( + spark, graphPath + "/project", workingPath + NEW_PROJECT_RELATION_PATH, + leavesPath, childParentPath, projectOrganizationPath, ModelConstants.HAS_PARTICIPANT); + StepActions .prepareForNextStep( - spark, workingPath + NEW_RESULT_RELATION_PATH, resultOrganizationPath, leavesPath, - childParentPath, workingPath + "/leaves", workingPath + "/resOrg"); - moveOutput(spark, workingPath, leavesPath, resultOrganizationPath); + spark, workingPath , resultOrganizationPath, projectOrganizationPath, leavesPath, + childParentPath, workingPath + "/leaves", workingPath + "/resOrg", workingPath + "/projOrg"); + moveOutput(spark, workingPath, leavesPath, resultOrganizationPath, projectOrganizationPath); leavesCount = readPath(spark, leavesPath, Leaves.class).count(); } while (leavesCount > 0 && iteration < MAX_ITERATION); @@ -199,6 +204,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { } addNewRelations(spark, workingPath + NEW_RESULT_RELATION_PATH, outputPath); + addNewRelations(spark, workingPath + 
NEW_PROJECT_RELATION_PATH, outputPath); } private static void moveOutput(SparkSession spark, String workingPath, String leavesPath, @@ -217,6 +223,28 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { } + private static void moveOutput(SparkSession spark, String workingPath, String leavesPath, + String resultOrganizationPath, String projectOrganizationPath) { + readPath(spark, workingPath + "/leaves", Leaves.class) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(leavesPath); + + readPath(spark, workingPath + "/resOrg", KeyValueSet.class) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(resultOrganizationPath); + + readPath(spark, workingPath + "/projOrg", KeyValueSet.class) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(projectOrganizationPath); + + } + private static void addNewRelations(SparkSession spark, String newRelationPath, String outputPath) { Dataset relation = readPath(spark, newRelationPath, Relation.class); @@ -237,7 +265,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { return Arrays .asList( r, getParticipantRelation( - r.getTarget(), r.getSource(), ModelConstants.HAS_PARTICIPANT)) + r.getTarget(), r.getSource(), ModelConstants.IS_PARTICIPANT)) .iterator(); } } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActions.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActions.java index 5b6c397cf..de5034d38 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActions.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActions.java @@ -3,6 +3,8 @@ package eu.dnetlib.dhp.entitytoorganizationfromsemrel; import static eu.dnetlib.dhp.PropagationConstant.*; import static 
eu.dnetlib.dhp.PropagationConstant.readPath; +import static eu.dnetlib.dhp.entitytoorganizationfromsemrel.SparkResultToOrganizationFromSemRel.NEW_PROJECT_RELATION_PATH; +import static eu.dnetlib.dhp.entitytoorganizationfromsemrel.SparkResultToOrganizationFromSemRel.NEW_RESULT_RELATION_PATH; import java.io.Serializable; import java.util.*; @@ -26,13 +28,14 @@ public class StepActions implements Serializable { public static void execStep(SparkSession spark, String graphPath, String newRelationPath, - String leavesPath, String chldParentOrgPath, String entityOrgPath) { + String leavesPath, String chldParentOrgPath, String entityOrgPath, String rel_class) { Dataset relationGraph = readPath(spark, graphPath, Relation.class); // select only the relation source target among those proposed by propagation that are not already existent + getNewRels( newRelationPath, relationGraph, - getPropagationRelation(spark, leavesPath, chldParentOrgPath, entityOrgPath, ModelConstants.HAS_AUTHOR_INSTITUTION)); + getPropagationRelation(spark, leavesPath, chldParentOrgPath, entityOrgPath, rel_class)); } @@ -43,16 +46,30 @@ public class StepActions implements Serializable { changeLeavesSet(spark, leavesPath, chldParentOrgPath, leavesOutputPath); // add the new relations obtained from propagation to the keyvalueset result organization - updateResultOrganization( + updateEntityOrganization( spark, resultOrgPath, readPath(spark, selectedRelsPath, Relation.class), orgOutputPath); } - private static void updateResultOrganization(SparkSession spark, String resultOrgPath, - Dataset selectedRels, String outputPath) { - Dataset resultOrg = readPath(spark, resultOrgPath, KeyValueSet.class); - resultOrg + public static void prepareForNextStep(SparkSession spark, String selectedRelsPath, String resultOrgPath, String projectOrgPath, + String leavesPath, String chldParentOrgPath, String leavesOutputPath, + String orgOutputPath, String outputProjectPath) { + // use of the parents as new leaves set + 
changeLeavesSet(spark, leavesPath, chldParentOrgPath, leavesOutputPath); + + // add the new relations obtained from propagation to the keyvalueset result organization + updateEntityOrganization( + spark, resultOrgPath, readPath(spark, selectedRelsPath + NEW_RESULT_RELATION_PATH, Relation.class), orgOutputPath); + + updateEntityOrganization( + spark, projectOrgPath, readPath(spark, selectedRelsPath + NEW_PROJECT_RELATION_PATH, Relation.class), outputProjectPath); + } + + private static void updateEntityOrganization(SparkSession spark, String entityOrgPath, + Dataset selectedRels, String outputPath) { + Dataset entityOrg = readPath(spark, entityOrgPath, KeyValueSet.class); + entityOrg .joinWith( - selectedRels, resultOrg + selectedRels, entityOrg .col("key") .equalTo(selectedRels.col("source")), "left") @@ -111,38 +128,45 @@ public class StepActions implements Serializable { // construction of the set) // if at least one relation in the set was not produced by propagation no new relation will be returned + relationDataset - .union(newRels) - .groupByKey((MapFunction) r -> r.getSource() + r.getTarget(), Encoders.STRING()) - .mapGroups((MapGroupsFunction) (k, it) -> { + .union(newRels) + .groupByKey((MapFunction) r -> r.getSource() + r.getTarget(), Encoders.STRING()) + .mapGroups((MapGroupsFunction) (k, it) -> { - ArrayList relationList = new ArrayList<>(); - relationList.add(it.next()); - it.forEachRemaining(rel -> relationList.add(rel)); + ArrayList relationList = new ArrayList<>(); + relationList.add(it.next()); + it.forEachRemaining(rel -> relationList.add(rel)); - if (relationList - .stream() - .filter( - rel -> !rel - .getDataInfo() - .getProvenanceaction() - .getClassid() - .equals(PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID)) - .count() > 0) { - return null; - } + if (relationList + .stream() + .filter( + rel -> !rel + .getDataInfo() + .getProvenanceaction() + .getClassid() + .equals(PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID) && 
!rel + .getDataInfo() + .getProvenanceaction() + .getClassid() + .equals(PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID)) + .count() > 0) { + return null; + } + + return new ObjectMapper().writeValueAsString(relationList.get(0)); + + }, Encoders.STRING()) + .filter(Objects::nonNull) + .map( + (MapFunction) r -> new ObjectMapper().readValue(r, Relation.class), + Encoders.bean(Relation.class)) + .write() + .mode(SaveMode.Append) + .option("compression", "gzip") + .json(newRelationPath); - return new ObjectMapper().writeValueAsString(relationList.get(0)); - }, Encoders.STRING()) - .filter(Objects::nonNull) - .map( - (MapFunction) r -> new ObjectMapper().readValue(r, Relation.class), - Encoders.bean(Relation.class)) - .write() - .mode(SaveMode.Append) - .option("compression", "gzip") - .json(newRelationPath); } @@ -172,20 +196,21 @@ public class StepActions implements Serializable { "ON leaves.value = cp.child " + "JOIN (" + "SELECT key as entityId, org " + - "FROM resultOrg " + + "FROM entityOrg " + "LATERAL VIEW explode (valueSet) ks as org ) as ro " + "ON leaves.value = ro.org " + - "GROUP BY resId") + "GROUP BY entityId") .as(Encoders.bean(KeyValueSet.class)); - // create new relations from result to organization for each result linked to a leaf + + // create new relations from entity to organization for each entity linked to a leaf return resultParent .flatMap( (FlatMapFunction) v -> v .getValueSet() .stream() .map( - orgId -> getAffiliationRelation( + orgId -> getRelation( v.getKey(), orgId, semantics)) diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfoJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfoJobTest.java index 3d7086739..f29e8d24a 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfoJobTest.java +++ 
b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfoJobTest.java @@ -361,6 +361,37 @@ public class PrepareInfoJobTest { } + @Test + public void relationProjectTest() throws Exception { + + PrepareInfo + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-graphPath", getClass() + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/projectorganizationtest") + .getPath(), + "-hive_metastore_uris", "", + "-leavesPath", workingDir.toString() + "/currentIteration/", + "-resultOrgPath", workingDir.toString() + "/resultOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", + "-childParentPath", workingDir.toString() + "/childParentOrg/", + "-relationPath", workingDir.toString() + "/relation" + + }); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/relation/project") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + + Dataset verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); + + Assertions.assertEquals(7, verificationDs.count()); + + } + @Test public void resultOrganizationTest1() throws Exception { @@ -496,6 +527,141 @@ public class PrepareInfoJobTest { } + @Test + public void projectOrganizationTest1() throws Exception { + + PrepareInfo + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-graphPath", getClass() + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/projectorganizationtest") + .getPath(), + "-hive_metastore_uris", "", + "-leavesPath", workingDir.toString() + "/currentIteration/", + "-resultOrgPath", workingDir.toString() + "/resultOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", + "-childParentPath", workingDir.toString() + "/childParentOrg/", + "-relationPath", workingDir.toString() + "/relation" + + 
}); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/projectOrganization/") + .map(item -> OBJECT_MAPPER.readValue(item, KeyValueSet.class)); + + Dataset verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(KeyValueSet.class)); + + Assertions.assertEquals(5, verificationDs.count()); + + Assertions + .assertEquals( + 2, verificationDs + .filter("key = '40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f'") + .collectAsList() + .get(0) + .getValueSet() + .size()); + Assertions + .assertTrue( + verificationDs + .filter("key = '40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f'") + .collectAsList() + .get(0) + .getValueSet() + .contains("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")); + Assertions + .assertTrue( + verificationDs + .filter("key = '40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f'") + .collectAsList() + .get(0) + .getValueSet() + .contains("20|pippo_wf_001::2899e571609779168222fdeb59cb916d")); + + Assertions + .assertEquals( + 2, verificationDs + .filter("key = '40|dedup_wf_001::2899e571609779168222fdeb59cb916d'") + .collectAsList() + .get(0) + .getValueSet() + .size()); + Assertions + .assertTrue( + verificationDs + .filter("key = '40|dedup_wf_001::2899e571609779168222fdeb59cb916d'") + .collectAsList() + .get(0) + .getValueSet() + .contains("20|doajarticles::396262ee936f3d3e26ff0e60bea6cae0")); + Assertions + .assertTrue( + verificationDs + .filter("key = '40|dedup_wf_001::2899e571609779168222fdeb59cb916d'") + .collectAsList() + .get(0) + .getValueSet() + .contains("20|pippo_wf_001::2899e571609779168222fdeb59cb916d")); + + Assertions + .assertEquals( + 1, verificationDs + .filter("key = '40|doajarticles::03748bcb5d754c951efec9700e18a56d'") + .collectAsList() + .get(0) + .getValueSet() + .size()); + Assertions + .assertTrue( + verificationDs + .filter("key = '40|doajarticles::03748bcb5d754c951efec9700e18a56d'") + .collectAsList() + .get(0) + 
.getValueSet() + .contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); + + Assertions + .assertEquals( + 1, verificationDs + .filter("key = '40|openaire____::ec653e804967133b9436fdd30d3ff51d'") + .collectAsList() + .get(0) + .getValueSet() + .size()); + Assertions + .assertTrue( + verificationDs + .filter("key = '40|openaire____::ec653e804967133b9436fdd30d3ff51d'") + .collectAsList() + .get(0) + .getValueSet() + .contains("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); + + Assertions + .assertEquals( + 1, verificationDs + .filter("key = '40|doajarticles::1cae0b82b56ccd97c2db1f698def7074'") + .collectAsList() + .get(0) + .getValueSet() + .size()); + Assertions + .assertTrue( + verificationDs + .filter("key = '40|doajarticles::1cae0b82b56ccd97c2db1f698def7074'") + .collectAsList() + .get(0) + .getValueSet() + .contains("20|opendoar____::a5fcb8eb25ebd6f7cd219e0fa1e6ddc1")); + + verificationDs + .foreach((ForeachFunction) v -> System.out.println(OBJECT_MAPPER.writeValueAsString(v))); + + } + @Test public void foundLeavesTest1() throws Exception { diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkJobTest.java index a4d8f83e3..eb4ade0da 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkJobTest.java @@ -65,25 +65,30 @@ public class SparkJobTest { } @Test - public void completeExecution() throws Exception { + public void completeResultExecution() throws Exception { final String graphPath = getClass() - .getResource("/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep") + .getResource("/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph") .getPath(); final String leavesPath = getClass() .getResource( - 
"/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/currentIteration/") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/") .getPath(); final String childParentPath = getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/childParentOrg/") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/") .getPath(); final String resultOrgPath = getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/resultOrganization/") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/") .getPath(); + final String projectOrgPath = getClass() + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/") + .getPath(); + readPath(spark, leavesPath, Leaves.class) .write() .option("compression", "gzip") @@ -94,6 +99,11 @@ public class SparkJobTest { .option("compression", "gzip") .json(workingDir.toString() + "/orgsInput"); + readPath(spark, projectOrgPath, KeyValueSet.class) + .write() + .option("compression", "gzip") + .json(workingDir.toString() + "/projectInput"); + SparkResultToOrganizationFromSemRel .main( @@ -104,95 +114,97 @@ public class SparkJobTest { "-outputPath", workingDir.toString() + "/finalrelation", "-leavesPath", workingDir.toString() + "/leavesInput", "-resultOrgPath", workingDir.toString() + "/orgsInput", + "-projectOrganizationPath", workingDir.toString() + "/projectInput", "-childParentPath", childParentPath, "-workingDir", workingDir.toString() }); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaRDD tmp = sc + JavaRDD temp = sc .textFile(workingDir.toString() + "/finalrelation") .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); - tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r))); + Assertions.assertEquals(36, temp.count()); - Assertions.assertEquals(18, tmp.count()); - tmp.foreach(r -> 
Assertions.assertEquals(ModelConstants.AFFILIATION, r.getSubRelType())); - tmp.foreach(r -> Assertions.assertEquals(ModelConstants.RESULT_ORGANIZATION, r.getRelType())); - tmp + JavaRDD result = temp.filter(r -> r.getSource().startsWith("50|") || r.getTarget().startsWith("50|")); + Assertions.assertEquals(18, result.count()); + result.foreach(r -> Assertions.assertEquals(ModelConstants.AFFILIATION, r.getSubRelType())); + result.foreach(r -> Assertions.assertEquals(ModelConstants.RESULT_ORGANIZATION, r.getRelType())); + result .foreach( r -> Assertions .assertEquals( PropagationConstant.PROPAGATION_DATA_INFO_TYPE, r.getDataInfo().getInferenceprovenance())); - tmp + result .foreach( r -> Assertions .assertEquals( PropagationConstant.PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID, r.getDataInfo().getProvenanceaction().getClassid())); - tmp + result .foreach( r -> Assertions .assertEquals( PropagationConstant.PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME, r.getDataInfo().getProvenanceaction().getClassname())); - tmp + result .foreach( r -> Assertions .assertEquals( "0.85", r.getDataInfo().getTrust())); - Assertions.assertEquals(9, tmp.filter(r -> r.getSource().substring(0, 3).equals("50|")).count()); - tmp + Assertions.assertEquals(9, result.filter(r -> r.getSource().substring(0, 3).equals("50|")).count()); + result .filter(r -> r.getSource().substring(0, 3).equals("50|")) .foreach(r -> Assertions.assertEquals(ModelConstants.HAS_AUTHOR_INSTITUTION, r.getRelClass())); Assertions .assertEquals( - 2, tmp.filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count()); + 2, result.filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count()); Assertions .assertEquals( - 3, tmp.filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count()); + 3, result.filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count()); 
Assertions .assertEquals( - 2, tmp.filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count()); + 2, result.filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count()); Assertions .assertEquals( - 1, tmp.filter(r -> r.getSource().equals("50|openaire____::ec653e804967133b9436fdd30d3ff51d")).count()); + 1, result.filter(r -> r.getSource().equals("50|openaire____::ec653e804967133b9436fdd30d3ff51d")).count()); Assertions .assertEquals( - 1, tmp.filter(r -> r.getSource().equals("50|doajarticles::03748bcb5d754c951efec9700e18a56d")).count()); + 1, result.filter(r -> r.getSource().equals("50|doajarticles::03748bcb5d754c951efec9700e18a56d")).count()); - Assertions.assertEquals(9, tmp.filter(r -> r.getSource().substring(0, 3).equals("20|")).count()); - tmp + Assertions.assertEquals(9, result.filter(r -> r.getSource().substring(0, 3).equals("20|")).count()); + result .filter(r -> r.getSource().substring(0, 3).equals("20|")) .foreach(r -> Assertions.assertEquals(ModelConstants.IS_AUTHOR_INSTITUTION_OF, r.getRelClass())); Assertions .assertEquals( - 1, tmp.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count()); + 1, result.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count()); Assertions .assertEquals( - 1, tmp.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count()); + 1, result.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count()); Assertions .assertEquals( - 2, tmp.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count()); + 2, result.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count()); Assertions .assertEquals( - 2, tmp.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")).count()); + 2, result.filter(r -> 
r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")).count()); Assertions .assertEquals( - 3, tmp.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")).count()); + 3, result.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")).count()); Assertions .assertTrue( - tmp + result .filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074")) .map(r -> r.getTarget()) .collect() .contains("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); Assertions .assertTrue( - tmp + result .filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074")) .map(r -> r.getTarget()) .collect() @@ -200,14 +212,14 @@ public class SparkJobTest { Assertions .assertTrue( - tmp + result .filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) .map(r -> r.getTarget()) .collect() .contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); Assertions .assertTrue( - tmp + result .filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) .map(r -> r.getTarget()) .collect() @@ -215,21 +227,21 @@ public class SparkJobTest { Assertions .assertTrue( - tmp + result .filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")) .map(r -> r.getTarget()) .collect() .contains("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")); Assertions .assertTrue( - tmp + result .filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")) .map(r -> r.getTarget()) .collect() .contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); Assertions .assertTrue( - tmp + result .filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")) .map(r -> r.getTarget()) .collect() @@ -237,7 +249,7 @@ public class SparkJobTest { Assertions .assertTrue( - tmp + result .filter(r -> 
r.getSource().equals("50|openaire____::ec653e804967133b9436fdd30d3ff51d")) .map(r -> r.getTarget()) .collect() @@ -245,7 +257,7 @@ public class SparkJobTest { Assertions .assertTrue( - tmp + result .filter(r -> r.getSource().equals("50|doajarticles::03748bcb5d754c951efec9700e18a56d")) .map(r -> r.getTarget()) .collect() @@ -253,14 +265,14 @@ public class SparkJobTest { Assertions .assertTrue( - tmp + result .filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")) .map(r -> r.getTarget()) .collect() .contains("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); Assertions .assertTrue( - tmp + result .filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")) .map(r -> r.getTarget()) .collect() @@ -268,14 +280,14 @@ public class SparkJobTest { Assertions .assertTrue( - tmp + result .filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) .map(r -> r.getTarget()) .collect() .contains("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")); Assertions .assertTrue( - tmp + result .filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) .map(r -> r.getTarget()) .collect() @@ -283,21 +295,21 @@ public class SparkJobTest { Assertions .assertTrue( - tmp + result .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")) .map(r -> r.getTarget()) .collect() .contains("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")); Assertions .assertTrue( - tmp + result .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")) .map(r -> r.getTarget()) .collect() .contains("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); Assertions .assertTrue( - tmp + result .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")) .map(r -> r.getTarget()) .collect() @@ -305,7 +317,7 @@ public class SparkJobTest { Assertions .assertTrue( - tmp + result .filter(r -> 
r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")) .map(r -> r.getTarget()) .collect() @@ -313,11 +325,555 @@ public class SparkJobTest { Assertions .assertTrue( - tmp + result .filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")) .map(r -> r.getTarget()) .collect() .contains("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); } + @Test + public void completeProjectExecution() throws Exception { + + final String graphPath = getClass() + .getResource("/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph") + .getPath(); + final String leavesPath = getClass() + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/") + .getPath(); + final String childParentPath = getClass() + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/") + .getPath(); + + final String resultOrgPath = getClass() + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/") + .getPath(); + + final String projectOrgPath = getClass() + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/") + .getPath(); + + readPath(spark, leavesPath, Leaves.class) + .write() + .option("compression", "gzip") + .json(workingDir.toString() + "/leavesInput"); + + readPath(spark, resultOrgPath, KeyValueSet.class) + .write() + .option("compression", "gzip") + .json(workingDir.toString() + "/orgsInput"); + + readPath(spark, projectOrgPath, KeyValueSet.class) + .write() + .option("compression", "gzip") + .json(workingDir.toString() + "/projectInput"); + + SparkResultToOrganizationFromSemRel + + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-relationPath", graphPath, + "-hive_metastore_uris", "", + "-outputPath", workingDir.toString() + "/finalrelation", + "-leavesPath", workingDir.toString() + "/leavesInput", + "-resultOrgPath", workingDir.toString() + "/orgsInput", + 
"-projectOrganizationPath", workingDir.toString() + "/projectInput", + "-childParentPath", childParentPath, + "-workingDir", workingDir.toString() + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD temp = sc + .textFile(workingDir.toString() + "/finalrelation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + + Assertions.assertEquals(36, temp.count()); + + JavaRDD project = temp.filter(r -> r.getSource().startsWith("40|") || r.getTarget().startsWith("40|")); + Assertions.assertEquals(18, project.count()); + + project.foreach(r -> Assertions.assertEquals(ModelConstants.PARTICIPATION, r.getSubRelType())); + project.foreach(r -> Assertions.assertEquals(ModelConstants.PROJECT_ORGANIZATION, r.getRelType())); + project + .foreach( + r -> Assertions + .assertEquals( + PropagationConstant.PROPAGATION_DATA_INFO_TYPE, r.getDataInfo().getInferenceprovenance())); + project + .foreach( + r -> Assertions + .assertEquals( + PropagationConstant.PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID, + r.getDataInfo().getProvenanceaction().getClassid())); + project + .foreach( + r -> Assertions + .assertEquals( + PropagationConstant.PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_NAME, + r.getDataInfo().getProvenanceaction().getClassname())); + project + .foreach( + r -> Assertions + .assertEquals( + "0.85", + r.getDataInfo().getTrust())); + + Assertions.assertEquals(9, project.filter(r -> r.getSource().substring(0, 3).equals("40|")).count()); + project + .filter(r -> r.getSource().substring(0, 3).equals("40|")) + .foreach(r -> Assertions.assertEquals(ModelConstants.HAS_PARTICIPANT, r.getRelClass())); + Assertions + .assertEquals( + 2, project.filter(r -> r.getSource().equals("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count()); + Assertions + .assertEquals( + 3, project.filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count()); + Assertions + 
.assertEquals( + 2, project.filter(r -> r.getSource().equals("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count()); + Assertions + .assertEquals( + 1, project.filter(r -> r.getSource().equals("40|openaire____::ec653e804967133b9436fdd30d3ff51d")).count()); + Assertions + .assertEquals( + 1, project.filter(r -> r.getSource().equals("40|doajarticles::03748bcb5d754c951efec9700e18a56d")).count()); + + Assertions.assertEquals(9, project.filter(r -> r.getSource().substring(0, 3).equals("20|")).count()); + project + .filter(r -> r.getSource().substring(0, 3).equals("20|")) + .foreach(r -> Assertions.assertEquals(ModelConstants.IS_PARTICIPANT, r.getRelClass())); + Assertions + .assertEquals( + 1, project.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count()); + Assertions + .assertEquals( + 1, project.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count()); + Assertions + .assertEquals( + 2, project.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count()); + Assertions + .assertEquals( + 2, project.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")).count()); + Assertions + .assertEquals( + 3, project.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")).count()); + + Assertions + .assertTrue( + project + .filter(r -> r.getSource().equals("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")) + .map(r -> r.getTarget()) + .collect() + .contains("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); + Assertions + .assertTrue( + project + .filter(r -> r.getSource().equals("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")) + .map(r -> r.getTarget()) + .collect() + .contains("20|openaire____::ec653e804967133b9436fdd30d3ff51d")); + + Assertions + .assertTrue( + project + .filter(r -> r.getSource().equals("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) + .map(r -> r.getTarget()) 
+ .collect() + .contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); + Assertions + .assertTrue( + project + .filter(r -> r.getSource().equals("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) + .map(r -> r.getTarget()) + .collect() + .contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d")); + + Assertions + .assertTrue( + project + .filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")) + .map(r -> r.getTarget()) + .collect() + .contains("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")); + Assertions + .assertTrue( + project + .filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")) + .map(r -> r.getTarget()) + .collect() + .contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); + Assertions + .assertTrue( + project + .filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")) + .map(r -> r.getTarget()) + .collect() + .contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d")); + + Assertions + .assertTrue( + project + .filter(r -> r.getSource().equals("40|openaire____::ec653e804967133b9436fdd30d3ff51d")) + .map(r -> r.getTarget()) + .collect() + .contains("20|openaire____::ec653e804967133b9436fdd30d3ff51d")); + + Assertions + .assertTrue( + project + .filter(r -> r.getSource().equals("40|doajarticles::03748bcb5d754c951efec9700e18a56d")) + .map(r -> r.getTarget()) + .collect() + .contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d")); + + Assertions + .assertTrue( + project + .filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")) + .map(r -> r.getTarget()) + .collect() + .contains("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); + Assertions + .assertTrue( + project + .filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")) + .map(r -> r.getTarget()) + .collect() + .contains("40|openaire____::ec653e804967133b9436fdd30d3ff51d")); + + Assertions + .assertTrue( + 
project + .filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) + .map(r -> r.getTarget()) + .collect() + .contains("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")); + Assertions + .assertTrue( + project + .filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) + .map(r -> r.getTarget()) + .collect() + .contains("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); + + Assertions + .assertTrue( + project + .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")) + .map(r -> r.getTarget()) + .collect() + .contains("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")); + Assertions + .assertTrue( + project + .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")) + .map(r -> r.getTarget()) + .collect() + .contains("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); + Assertions + .assertTrue( + project + .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")) + .map(r -> r.getTarget()) + .collect() + .contains("40|doajarticles::03748bcb5d754c951efec9700e18a56d")); + + Assertions + .assertTrue( + project + .filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")) + .map(r -> r.getTarget()) + .collect() + .contains("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")); + + Assertions + .assertTrue( + project + .filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")) + .map(r -> r.getTarget()) + .collect() + .contains("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); + } + + @Test + public void singleIterationExecution() throws Exception { + + final String graphPath = getClass() + .getResource("/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph") + .getPath(); + final String leavesPath = getClass() + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/") + .getPath(); + final String 
childParentPath = getClass() + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/") + .getPath(); + + final String resultOrgPath = getClass() + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/") + .getPath(); + + final String projectOrgPath = getClass() + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/") + .getPath(); + + readPath(spark, leavesPath, Leaves.class) + .write() + .option("compression", "gzip") + .json(workingDir.toString() + "/leavesInput"); + + readPath(spark, resultOrgPath, KeyValueSet.class) + .write() + .option("compression", "gzip") + .json(workingDir.toString() + "/orgsInput"); + + readPath(spark, projectOrgPath, KeyValueSet.class) + .write() + .option("compression", "gzip") + .json(workingDir.toString() + "/projectInput"); + + SparkResultToOrganizationFromSemRel + + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-relationPath", graphPath, + "-hive_metastore_uris", "", + "-outputPath", workingDir.toString() + "/finalrelation", + "-leavesPath", workingDir.toString() + "/leavesInput", + "-resultOrgPath", workingDir.toString() + "/orgsInput", + "-projectOrganizationPath", workingDir.toString() + "/projectInput", + "-childParentPath", childParentPath, + "-workingDir", workingDir.toString(), + "-iterations", "1" + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD temp = sc + .textFile(workingDir.toString() + "/finalrelation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + + Assertions.assertEquals(16, temp.count()); + + Assertions.assertEquals(4, temp.filter(r -> r.getSource().startsWith("50|")).count()); + Assertions.assertEquals(4, temp.filter(r -> r.getTarget().startsWith("50|")).count()); + Assertions.assertEquals(4, temp.filter(r -> r.getSource().startsWith("40|")).count()); + Assertions.assertEquals(4, 
temp.filter(r -> r.getTarget().startsWith("40|")).count()); + Assertions.assertEquals(8, temp.filter(r -> r.getSource().startsWith("20|")).count()); + Assertions.assertEquals(8, temp.filter(r -> r.getSource().startsWith("20|")).count()); + +// JavaRDD result = temp.filter(r -> r.getSource().startsWith("50|") || r.getTarget().startsWith("50|")); +// Assertions.assertEquals(18, result.count()); +// result.foreach(r -> Assertions.assertEquals(ModelConstants.AFFILIATION, r.getSubRelType())); +// result.foreach(r -> Assertions.assertEquals(ModelConstants.RESULT_ORGANIZATION, r.getRelType())); +// result +// .foreach( +// r -> Assertions +// .assertEquals( +// PropagationConstant.PROPAGATION_DATA_INFO_TYPE, r.getDataInfo().getInferenceprovenance())); +// result +// .foreach( +// r -> Assertions +// .assertEquals( +// PropagationConstant.PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID, +// r.getDataInfo().getProvenanceaction().getClassid())); +// result +// .foreach( +// r -> Assertions +// .assertEquals( +// PropagationConstant.PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME, +// r.getDataInfo().getProvenanceaction().getClassname())); +// result +// .foreach( +// r -> Assertions +// .assertEquals( +// "0.85", +// r.getDataInfo().getTrust())); +// +// Assertions.assertEquals(9, result.filter(r -> r.getSource().substring(0, 3).equals("50|")).count()); +// result +// .filter(r -> r.getSource().substring(0, 3).equals("50|")) +// .foreach(r -> Assertions.assertEquals(ModelConstants.HAS_AUTHOR_INSTITUTION, r.getRelClass())); +// Assertions +// .assertEquals( +// 2, result.filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count()); +// Assertions +// .assertEquals( +// 3, result.filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count()); +// Assertions +// .assertEquals( +// 2, result.filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count()); +// 
Assertions +// .assertEquals( +// 1, result.filter(r -> r.getSource().equals("50|openaire____::ec653e804967133b9436fdd30d3ff51d")).count()); +// Assertions +// .assertEquals( +// 1, result.filter(r -> r.getSource().equals("50|doajarticles::03748bcb5d754c951efec9700e18a56d")).count()); +// +// Assertions.assertEquals(9, result.filter(r -> r.getSource().substring(0, 3).equals("20|")).count()); +// result +// .filter(r -> r.getSource().substring(0, 3).equals("20|")) +// .foreach(r -> Assertions.assertEquals(ModelConstants.IS_AUTHOR_INSTITUTION_OF, r.getRelClass())); +// Assertions +// .assertEquals( +// 1, result.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count()); +// Assertions +// .assertEquals( +// 1, result.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count()); +// Assertions +// .assertEquals( +// 2, result.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count()); +// Assertions +// .assertEquals( +// 2, result.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")).count()); +// Assertions +// .assertEquals( +// 3, result.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")).count()); +// +// Assertions +// .assertTrue( +// result +// .filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074")) +// .map(r -> r.getTarget()) +// .collect() +// .contains("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); +// Assertions +// .assertTrue( +// result +// .filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074")) +// .map(r -> r.getTarget()) +// .collect() +// .contains("20|openaire____::ec653e804967133b9436fdd30d3ff51d")); +// +// Assertions +// .assertTrue( +// result +// .filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) +// .map(r -> r.getTarget()) +// .collect() +// 
.contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); +// Assertions +// .assertTrue( +// result +// .filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) +// .map(r -> r.getTarget()) +// .collect() +// .contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d")); +// +// Assertions +// .assertTrue( +// result +// .filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")) +// .map(r -> r.getTarget()) +// .collect() +// .contains("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")); +// Assertions +// .assertTrue( +// result +// .filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")) +// .map(r -> r.getTarget()) +// .collect() +// .contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); +// Assertions +// .assertTrue( +// result +// .filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")) +// .map(r -> r.getTarget()) +// .collect() +// .contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d")); +// +// Assertions +// .assertTrue( +// result +// .filter(r -> r.getSource().equals("50|openaire____::ec653e804967133b9436fdd30d3ff51d")) +// .map(r -> r.getTarget()) +// .collect() +// .contains("20|openaire____::ec653e804967133b9436fdd30d3ff51d")); +// +// Assertions +// .assertTrue( +// result +// .filter(r -> r.getSource().equals("50|doajarticles::03748bcb5d754c951efec9700e18a56d")) +// .map(r -> r.getTarget()) +// .collect() +// .contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d")); +// +// Assertions +// .assertTrue( +// result +// .filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")) +// .map(r -> r.getTarget()) +// .collect() +// .contains("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); +// Assertions +// .assertTrue( +// result +// .filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")) +// .map(r -> r.getTarget()) +// .collect() +// 
.contains("50|openaire____::ec653e804967133b9436fdd30d3ff51d")); +// +// Assertions +// .assertTrue( +// result +// .filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) +// .map(r -> r.getTarget()) +// .collect() +// .contains("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")); +// Assertions +// .assertTrue( +// result +// .filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) +// .map(r -> r.getTarget()) +// .collect() +// .contains("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); +// +// Assertions +// .assertTrue( +// result +// .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")) +// .map(r -> r.getTarget()) +// .collect() +// .contains("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")); +// Assertions +// .assertTrue( +// result +// .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")) +// .map(r -> r.getTarget()) +// .collect() +// .contains("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); +// Assertions +// .assertTrue( +// result +// .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")) +// .map(r -> r.getTarget()) +// .collect() +// .contains("50|doajarticles::03748bcb5d754c951efec9700e18a56d")); +// +// Assertions +// .assertTrue( +// result +// .filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")) +// .map(r -> r.getTarget()) +// .collect() +// .contains("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")); +// +// Assertions +// .assertTrue( +// result +// .filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")) +// .map(r -> r.getTarget()) +// .collect() +// .contains("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); + } } diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActionsTest.java 
b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActionsTest.java index 77ed4dcbf..7a71240b2 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActionsTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActionsTest.java @@ -73,21 +73,21 @@ public class StepActionsTest { .execStep( spark, getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph/result") .getPath(), workingDir.toString() + "/newRelationPath", getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/currentIteration/") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/") .getPath(), getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/childParentOrg/") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/") .getPath(), getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/resultOrganization/") - .getPath()); + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/") + .getPath(), ModelConstants.HAS_AUTHOR_INSTITUTION); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); @@ -203,19 +203,19 @@ public class StepActionsTest { spark, getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/relsforiteration1/") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/relsforiteration1/") .getPath(), getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/resultOrganization/") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/") .getPath(), getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/currentIteration/") + 
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/") .getPath(), getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/childParentOrg/") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/") .getPath(), workingDir.toString() + "/tempLeaves", workingDir.toString() + "/tempOrgs"); @@ -248,19 +248,19 @@ public class StepActionsTest { spark, getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/relsforiteration1/") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/relsforiteration1/") .getPath(), getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/resultOrganization/") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/") .getPath(), getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/currentIteration/") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/") .getPath(), getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/childParentOrg/") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/") .getPath(), workingDir.toString() + "/tempLeaves", workingDir.toString() + "/tempOrgs"); diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph/project b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph/project new file mode 100644 index 000000000..e8e35f555 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph/project @@ -0,0 +1,7 @@ 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|dedup_wf_001::2899e571609779168222fdeb59cb916d","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|pippo_wf_001::2899e571609779168222fdeb59cb916d","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|dedup_wf_001::2899e571609779168222fdeb59cb916d","subRelType":"provision","target":"20|pippo_wf_001::2899e571609779168222fdeb59cb916d","validated":false} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|dedup_wf_001::2899e571609779168222fdeb59cb916d","subRelType":"provision","target":"20|doajarticles::396262ee936f3d3e26ff0e60bea6cae0","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|doajarticles::03748bcb5d754c951efec9700e18a56d","subRelType":"provision","target":"20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|openaire____::ec653e804967133b9436fdd30d3ff51d","subRelType":"provision","target":"20|doajarticles::1cae0b82b56ccd97c2db1f698def7074","validated":false} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|doajarticles::1cae0b82b56ccd97c2db1f698def7074","subRelType":"provision","target":"20|opendoar____::a5fcb8eb25ebd6f7cd219e0fa1e6ddc1","validated":false} \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph/result b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph/result new file mode 100644 index 000000000..5aeabb71b --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph/result @@ -0,0 +1,7 @@ +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|dedup_wf_001::2899e571609779168222fdeb59cb916d"} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|pippo_wf_001::2899e571609779168222fdeb59cb916d"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|dedup_wf_001::2899e571609779168222fdeb59cb916d","subRelType":"provision","target":"20|pippo_wf_001::2899e571609779168222fdeb59cb916d"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|dedup_wf_001::2899e571609779168222fdeb59cb916d","subRelType":"provision","target":"20|doajarticles::396262ee936f3d3e26ff0e60bea6cae0"} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|doajarticles::03748bcb5d754c951efec9700e18a56d","subRelType":"provision","target":"20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|openaire____::ec653e804967133b9436fdd30d3ff51d","subRelType":"provision","target":"20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|doajarticles::1cae0b82b56ccd97c2db1f698def7074","subRelType":"provision","target":"20|opendoar____::a5fcb8eb25ebd6f7cd219e0fa1e6ddc1"} \ No newline at end of file diff --git 
a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/projectorganization b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/projectorganization new file mode 100644 index 000000000..81803f29d --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/projectorganization @@ -0,0 +1,5 @@ +{"key":"40|openaire____::ec653e804967133b9436fdd30d3ff51d","valueSet":["20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"]} +{"key":"40|doajarticles::03748bcb5d754c951efec9700e18a56d","valueSet":["20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"]} +{"key":"40|doajarticles::1cae0b82b56ccd97c2db1f698def7074","valueSet":["20|opendoar____::a5fcb8eb25ebd6f7cd219e0fa1e6ddc1"]} +{"key":"40|dedup_wf_001::2899e571609779168222fdeb59cb916d","valueSet":["20|pippo_wf_001::2899e571609779168222fdeb59cb916d","20|doajarticles::396262ee936f3d3e26ff0e60bea6cae0"]} +{"key":"40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","valueSet":["20|pippo_wf_001::2899e571609779168222fdeb59cb916d","20|dedup_wf_001::2899e571609779168222fdeb59cb916d"]} diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/relation b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/relation deleted file mode 100644 index db7db8fdd..000000000 --- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/relation +++ /dev/null @@ -1,14 +0,0 @@ 
-{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isparentof","relType":"datasourceOrganization","source":"20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|dedup_wf_001::2899e571609779168222fdeb59cb916d"} -{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isparentof","relType":"datasourceOrganization","source":"20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|pippo_wf_001::2899e571609779168222fdeb59cb916d"} -{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isparentof","relType":"datasourceOrganization","source":"20|dedup_wf_001::2899e571609779168222fdeb59cb916d","subRelType":"provision","target":"20|pippo_wf_001::2899e571609779168222fdeb59cb916d"} 
-{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isparentof","relType":"datasourceOrganization","source":"20|dedup_wf_001::2899e571609779168222fdeb59cb916d","subRelType":"provision","target":"20|doajarticles::396262ee936f3d3e26ff0e60bea6cae0"} -{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isparentof","relType":"datasourceOrganization","source":"20|doajarticles::03748bcb5d754c951efec9700e18a56d","subRelType":"provision","target":"20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"} -{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isparentof","relType":"datasourceOrganization","source":"20|openaire____::ec653e804967133b9436fdd30d3ff51d","subRelType":"provision","target":"20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"} 
-{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isparentof","relType":"datasourceOrganization","source":"20|doajarticles::1cae0b82b56ccd97c2db1f698def7074","subRelType":"provision","target":"20|opendoar____::a5fcb8eb25ebd6f7cd219e0fa1e6ddc1"} -{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|dedup_wf_001::2899e571609779168222fdeb59cb916d"} -{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|pippo_wf_001::2899e571609779168222fdeb59cb916d"} 
-{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|dedup_wf_001::2899e571609779168222fdeb59cb916d","subRelType":"provision","target":"20|pippo_wf_001::2899e571609779168222fdeb59cb916d"} -{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|dedup_wf_001::2899e571609779168222fdeb59cb916d","subRelType":"provision","target":"20|doajarticles::396262ee936f3d3e26ff0e60bea6cae0"} -{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|doajarticles::03748bcb5d754c951efec9700e18a56d","subRelType":"provision","target":"20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"} 
-{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|openaire____::ec653e804967133b9436fdd30d3ff51d","subRelType":"provision","target":"20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"} -{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasAuthorInstitution","relType":"datasourceOrganization","source":"50|doajarticles::1cae0b82b56ccd97c2db1f698def7074","subRelType":"provision","target":"20|opendoar____::a5fcb8eb25ebd6f7cd219e0fa1e6ddc1"} \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/projectorganizationtest/relation b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/projectorganizationtest/relation new file mode 100644 index 000000000..10d46b1cb --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/projectorganizationtest/relation @@ -0,0 +1,7 @@ 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|dedup_wf_001::2899e571609779168222fdeb59cb916d"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|pippo_wf_001::2899e571609779168222fdeb59cb916d"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|dedup_wf_001::2899e571609779168222fdeb59cb916d","subRelType":"provision","target":"20|pippo_wf_001::2899e571609779168222fdeb59cb916d"} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|dedup_wf_001::2899e571609779168222fdeb59cb916d","subRelType":"provision","target":"20|doajarticles::396262ee936f3d3e26ff0e60bea6cae0"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|doajarticles::03748bcb5d754c951efec9700e18a56d","subRelType":"provision","target":"20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|openaire____::ec653e804967133b9436fdd30d3ff51d","subRelType":"provision","target":"20|doajarticles::1cae0b82b56ccd97c2db1f698def7074"} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"hasParticipant","relType":"datasourceOrganization","source":"40|doajarticles::1cae0b82b56ccd97c2db1f698def7074","subRelType":"provision","target":"20|opendoar____::a5fcb8eb25ebd6f7cd219e0fa1e6ddc1"} \ No newline at end of file From daf4d7971b6b1a355b8930b4a2e1f4fc25efd380 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 31 May 2023 18:56:58 +0200 Subject: [PATCH 3/3] refactoring --- .../eu/dnetlib/dhp/PropagationConstant.java | 48 +- .../PrepareInfo.java | 50 +- .../SparkResultToOrganizationFromSemRel.java | 70 ++- .../StepActions.java | 88 ++- .../PrepareInfoJobTest.java | 246 ++++----- .../SparkJobTest.java | 509 +++++++++--------- .../StepActionsTest.java | 3 +- 7 files changed, 518 insertions(+), 496 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java index 87528ef58..053300696 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java @@ -174,37 +174,39 @@ public class PropagationConstant { return newRelations; } - public static Relation getRelation(String source, String target, String rel_class){ - if (ModelConstants.HAS_PARTICIPANT.equals(rel_class)){ + public static Relation getRelation(String source, String target, String rel_class) { + if (ModelConstants.HAS_PARTICIPANT.equals(rel_class)) { return getParticipantRelation(source, target, rel_class); - }else + } else 
return getAffiliationRelation(source, target, rel_class); } public static Relation getParticipantRelation( - String source, - String target, - String rel_class) { - return getRelation(source, target , - rel_class, - ModelConstants.PROJECT_ORGANIZATION, - ModelConstants.PARTICIPATION, - PROPAGATION_DATA_INFO_TYPE, - PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID, - PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_NAME); + String source, + String target, + String rel_class) { + return getRelation( + source, target, + rel_class, + ModelConstants.PROJECT_ORGANIZATION, + ModelConstants.PARTICIPATION, + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID, + PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_NAME); } public static Relation getAffiliationRelation( - String source, - String target, - String rel_class) { - return getRelation(source, target , - rel_class, - ModelConstants.RESULT_ORGANIZATION, - ModelConstants.AFFILIATION, - PROPAGATION_DATA_INFO_TYPE, - PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID, - PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME); + String source, + String target, + String rel_class) { + return getRelation( + source, target, + rel_class, + ModelConstants.RESULT_ORGANIZATION, + ModelConstants.AFFILIATION, + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID, + PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME); } public static Relation getRelation( diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java index 971ef436f..8d3432f06 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java +++ 
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java @@ -7,7 +7,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.io.Serializable; import java.util.*; -import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.*; @@ -15,6 +14,8 @@ import org.apache.spark.sql.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + import eu.dnetlib.dhp.KeyValueSet; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob; @@ -48,10 +49,10 @@ public class PrepareInfo implements Serializable { // associate projects to all the participant orgs private static final String PROJECT_ORGANIZATION_QUERY = "SELECT source key, collect_set(target) as valueSet " + - "FROM relation " + - "WHERE lower(relclass) = '" + ModelConstants.HAS_PARTICIPANT.toLowerCase() + - "' and datainfo.deletedbyinference = false " + - "GROUP BY source"; + "FROM relation " + + "WHERE lower(relclass) = '" + ModelConstants.HAS_PARTICIPANT.toLowerCase() + + "' and datainfo.deletedbyinference = false " + + "GROUP BY source"; public static void main(String[] args) throws Exception { @@ -98,12 +99,13 @@ public class PrepareInfo implements Serializable { childParentPath, leavesPath, resultOrganizationPath, - projectOrgPath, + projectOrgPath, relationPath)); } private static void prepareInfo(SparkSession spark, String inputPath, String childParentOrganizationPath, - String currentIterationPath, String resultOrganizationPath, String projectOrganizationPath, String relationPath) { + String currentIterationPath, String resultOrganizationPath, String projectOrganizationPath, + String relationPath) { Dataset relation = readPath(spark, inputPath + "/relation", 
Relation.class); relation.createOrReplaceTempView("relation"); @@ -124,30 +126,30 @@ public class PrepareInfo implements Serializable { .json(resultOrganizationPath); spark - .sql(PROJECT_ORGANIZATION_QUERY) - .as(Encoders.bean(KeyValueSet.class)) - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .json(projectOrganizationPath); + .sql(PROJECT_ORGANIZATION_QUERY) + .as(Encoders.bean(KeyValueSet.class)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(projectOrganizationPath); relation - .filter( - (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && - r.getRelClass().equals(ModelConstants.HAS_AUTHOR_INSTITUTION)) - .write() + .filter( + (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && + r.getRelClass().equals(ModelConstants.HAS_AUTHOR_INSTITUTION)) + .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") .json(relationPath + "/result"); relation - .filter( - (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && - r.getRelClass().equals(ModelConstants.HAS_PARTICIPANT)) - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .json(relationPath + "/project"); + .filter( + (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && + r.getRelClass().equals(ModelConstants.HAS_PARTICIPANT)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(relationPath + "/project"); Dataset children = spark .sql( diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java index dd32552ad..27e502aba 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java +++ 
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java @@ -92,7 +92,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { leavesPath, childParentPath, resultOrganizationPath, - projectOrganizationPath, + projectOrganizationPath, relationPath, workingPath, outputPath, @@ -147,15 +147,16 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { addNewRelations(spark, workingPath + NEW_RESULT_RELATION_PATH, outputPath); StepActions - .execStep( - spark, graphPath + "/project", workingPath + NEW_PROJECT_RELATION_PATH, - leavesPath, childParentPath, projectOrganizationPath, ModelConstants.HAS_PARTICIPANT); + .execStep( + spark, graphPath + "/project", workingPath + NEW_PROJECT_RELATION_PATH, + leavesPath, childParentPath, projectOrganizationPath, ModelConstants.HAS_PARTICIPANT); addNewRelations(spark, workingPath + NEW_PROJECT_RELATION_PATH, outputPath); } private static void doPropagate(SparkSession spark, String leavesPath, String childParentPath, - String resultOrganizationPath, String projectOrganizationPath, String graphPath, String workingPath, String outputPath, + String resultOrganizationPath, String projectOrganizationPath, String graphPath, String workingPath, + String outputPath, PropagationCounter propagationCounter) { int iteration = 0; long leavesCount; @@ -167,13 +168,13 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { spark, graphPath + "/result", workingPath + NEW_RESULT_RELATION_PATH, leavesPath, childParentPath, resultOrganizationPath, ModelConstants.HAS_AUTHOR_INSTITUTION); StepActions - .execStep( - spark, graphPath + "/project", workingPath + NEW_PROJECT_RELATION_PATH, - leavesPath, childParentPath, projectOrganizationPath, ModelConstants.HAS_PARTICIPANT); + .execStep( + spark, graphPath + "/project", workingPath + NEW_PROJECT_RELATION_PATH, + leavesPath, childParentPath, projectOrganizationPath, 
ModelConstants.HAS_PARTICIPANT); StepActions .prepareForNextStep( - spark, workingPath , resultOrganizationPath, projectOrganizationPath, leavesPath, + spark, workingPath, resultOrganizationPath, projectOrganizationPath, leavesPath, childParentPath, workingPath + "/leaves", workingPath + "/resOrg", workingPath + "/projOrg"); moveOutput(spark, workingPath, leavesPath, resultOrganizationPath, projectOrganizationPath); leavesCount = readPath(spark, leavesPath, Leaves.class).count(); @@ -224,24 +225,24 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { } private static void moveOutput(SparkSession spark, String workingPath, String leavesPath, - String resultOrganizationPath, String projectOrganizationPath) { + String resultOrganizationPath, String projectOrganizationPath) { readPath(spark, workingPath + "/leaves", Leaves.class) - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .json(leavesPath); + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(leavesPath); readPath(spark, workingPath + "/resOrg", KeyValueSet.class) - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .json(resultOrganizationPath); + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(resultOrganizationPath); readPath(spark, workingPath + "/projOrg", KeyValueSet.class) - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .json(projectOrganizationPath); + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(projectOrganizationPath); } @@ -253,25 +254,22 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { .mapGroups( (MapGroupsFunction) (k, it) -> it.next(), Encoders.bean(Relation.class)) .flatMap( - (FlatMapFunction) r -> - { - if(r.getSource().startsWith("50|")){ + (FlatMapFunction) r -> { + if (r.getSource().startsWith("50|")) { return Arrays - .asList( - r, getAffiliationRelation( - r.getTarget(), 
r.getSource(), ModelConstants.IS_AUTHOR_INSTITUTION_OF)) - .iterator(); - }else{ + .asList( + r, getAffiliationRelation( + r.getTarget(), r.getSource(), ModelConstants.IS_AUTHOR_INSTITUTION_OF)) + .iterator(); + } else { return Arrays - .asList( - r, getParticipantRelation( - r.getTarget(), r.getSource(), ModelConstants.IS_PARTICIPANT)) - .iterator(); + .asList( + r, getParticipantRelation( + r.getTarget(), r.getSource(), ModelConstants.IS_PARTICIPANT)) + .iterator(); } } - - , Encoders.bean(Relation.class)) .write() diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActions.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActions.java index de5034d38..386ea1a5c 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActions.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActions.java @@ -50,22 +50,25 @@ public class StepActions implements Serializable { spark, resultOrgPath, readPath(spark, selectedRelsPath, Relation.class), orgOutputPath); } - public static void prepareForNextStep(SparkSession spark, String selectedRelsPath, String resultOrgPath, String projectOrgPath, - String leavesPath, String chldParentOrgPath, String leavesOutputPath, - String orgOutputPath, String outputProjectPath) { + public static void prepareForNextStep(SparkSession spark, String selectedRelsPath, String resultOrgPath, + String projectOrgPath, + String leavesPath, String chldParentOrgPath, String leavesOutputPath, + String orgOutputPath, String outputProjectPath) { // use of the parents as new leaves set changeLeavesSet(spark, leavesPath, chldParentOrgPath, leavesOutputPath); // add the new relations obtained from propagation to the keyvalueset result organization updateEntityOrganization( - spark, resultOrgPath, readPath(spark, selectedRelsPath + NEW_RESULT_RELATION_PATH, 
Relation.class), orgOutputPath); + spark, resultOrgPath, readPath(spark, selectedRelsPath + NEW_RESULT_RELATION_PATH, Relation.class), + orgOutputPath); updateEntityOrganization( - spark, projectOrgPath, readPath(spark, selectedRelsPath + NEW_PROJECT_RELATION_PATH, Relation.class), outputProjectPath); + spark, projectOrgPath, readPath(spark, selectedRelsPath + NEW_PROJECT_RELATION_PATH, Relation.class), + outputProjectPath); } private static void updateEntityOrganization(SparkSession spark, String entityOrgPath, - Dataset selectedRels, String outputPath) { + Dataset selectedRels, String outputPath) { Dataset entityOrg = readPath(spark, entityOrgPath, KeyValueSet.class); entityOrg .joinWith( @@ -128,45 +131,43 @@ public class StepActions implements Serializable { // construction of the set) // if at least one relation in the set was not produced by propagation no new relation will be returned - relationDataset - .union(newRels) - .groupByKey((MapFunction) r -> r.getSource() + r.getTarget(), Encoders.STRING()) - .mapGroups((MapGroupsFunction) (k, it) -> { + .union(newRels) + .groupByKey((MapFunction) r -> r.getSource() + r.getTarget(), Encoders.STRING()) + .mapGroups((MapGroupsFunction) (k, it) -> { - ArrayList relationList = new ArrayList<>(); - relationList.add(it.next()); - it.forEachRemaining(rel -> relationList.add(rel)); + ArrayList relationList = new ArrayList<>(); + relationList.add(it.next()); + it.forEachRemaining(rel -> relationList.add(rel)); - if (relationList - .stream() - .filter( - rel -> !rel - .getDataInfo() - .getProvenanceaction() - .getClassid() - .equals(PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID) && !rel - .getDataInfo() - .getProvenanceaction() - .getClassid() - .equals(PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID)) - .count() > 0) { - return null; - } - - return new ObjectMapper().writeValueAsString(relationList.get(0)); - - }, Encoders.STRING()) - .filter(Objects::nonNull) - .map( - (MapFunction) r -> new 
ObjectMapper().readValue(r, Relation.class), - Encoders.bean(Relation.class)) - .write() - .mode(SaveMode.Append) - .option("compression", "gzip") - .json(newRelationPath); + if (relationList + .stream() + .filter( + rel -> !rel + .getDataInfo() + .getProvenanceaction() + .getClassid() + .equals(PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID) + && !rel + .getDataInfo() + .getProvenanceaction() + .getClassid() + .equals(PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID)) + .count() > 0) { + return null; + } + return new ObjectMapper().writeValueAsString(relationList.get(0)); + }, Encoders.STRING()) + .filter(Objects::nonNull) + .map( + (MapFunction) r -> new ObjectMapper().readValue(r, Relation.class), + Encoders.bean(Relation.class)) + .write() + .mode(SaveMode.Append) + .option("compression", "gzip") + .json(newRelationPath); } @@ -175,7 +176,7 @@ public class StepActions implements Serializable { String leavesPath, String chldParentOrgPath, String entityOrgPath, - String semantics) { + String semantics) { Dataset childParent = readPath(spark, chldParentOrgPath, KeyValueSet.class); Dataset entityOrg = readPath(spark, entityOrgPath, KeyValueSet.class); @@ -202,7 +203,6 @@ public class StepActions implements Serializable { "GROUP BY entityId") .as(Encoders.bean(KeyValueSet.class)); - // create new relations from entity to organization for each entity linked to a leaf return resultParent .flatMap( @@ -213,13 +213,11 @@ public class StepActions implements Serializable { orgId -> getRelation( v.getKey(), orgId, - semantics)) + semantics)) .collect(Collectors.toList()) .iterator(), Encoders.bean(Relation.class)); } - - } diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfoJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfoJobTest.java index f29e8d24a..7c9c2b97b 100644 --- 
a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfoJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfoJobTest.java @@ -77,7 +77,7 @@ public class PrepareInfoJobTest { "-hive_metastore_uris", "", "-leavesPath", workingDir.toString() + "/currentIteration/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/", - "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-childParentPath", workingDir.toString() + "/childParentOrg/", "-relationPath", workingDir.toString() + "/relation" @@ -223,7 +223,7 @@ public class PrepareInfoJobTest { "-hive_metastore_uris", "", "-leavesPath", workingDir.toString() + "/currentIteration/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/", - "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-childParentPath", workingDir.toString() + "/childParentOrg/", "-relationPath", workingDir.toString() + "/relation" @@ -344,7 +344,7 @@ public class PrepareInfoJobTest { "-hive_metastore_uris", "", "-leavesPath", workingDir.toString() + "/currentIteration/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/", - "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-childParentPath", workingDir.toString() + "/childParentOrg/", "-relationPath", workingDir.toString() + "/relation" @@ -365,26 +365,26 @@ public class PrepareInfoJobTest { public void relationProjectTest() throws Exception { PrepareInfo - .main( - new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-graphPath", getClass() - .getResource( - 
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/projectorganizationtest") - .getPath(), - "-hive_metastore_uris", "", - "-leavesPath", workingDir.toString() + "/currentIteration/", - "-resultOrgPath", workingDir.toString() + "/resultOrganization/", - "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", - "-childParentPath", workingDir.toString() + "/childParentOrg/", - "-relationPath", workingDir.toString() + "/relation" + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-graphPath", getClass() + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/projectorganizationtest") + .getPath(), + "-hive_metastore_uris", "", + "-leavesPath", workingDir.toString() + "/currentIteration/", + "-resultOrgPath", workingDir.toString() + "/resultOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", + "-childParentPath", workingDir.toString() + "/childParentOrg/", + "-relationPath", workingDir.toString() + "/relation" - }); + }); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD tmp = sc - .textFile(workingDir.toString() + "/relation/project") - .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + .textFile(workingDir.toString() + "/relation/project") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); Dataset verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); @@ -406,7 +406,7 @@ public class PrepareInfoJobTest { "-hive_metastore_uris", "", "-leavesPath", workingDir.toString() + "/currentIteration/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/", - "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-childParentPath", workingDir.toString() + "/childParentOrg/", "-relationPath", workingDir.toString() + "/relation" @@ -531,134 +531,134 @@ public class 
PrepareInfoJobTest { public void projectOrganizationTest1() throws Exception { PrepareInfo - .main( - new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-graphPath", getClass() - .getResource( - "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/projectorganizationtest") - .getPath(), - "-hive_metastore_uris", "", - "-leavesPath", workingDir.toString() + "/currentIteration/", - "-resultOrgPath", workingDir.toString() + "/resultOrganization/", - "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", - "-childParentPath", workingDir.toString() + "/childParentOrg/", - "-relationPath", workingDir.toString() + "/relation" + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-graphPath", getClass() + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/projectorganizationtest") + .getPath(), + "-hive_metastore_uris", "", + "-leavesPath", workingDir.toString() + "/currentIteration/", + "-resultOrgPath", workingDir.toString() + "/resultOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", + "-childParentPath", workingDir.toString() + "/childParentOrg/", + "-relationPath", workingDir.toString() + "/relation" - }); + }); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD tmp = sc - .textFile(workingDir.toString() + "/projectOrganization/") - .map(item -> OBJECT_MAPPER.readValue(item, KeyValueSet.class)); + .textFile(workingDir.toString() + "/projectOrganization/") + .map(item -> OBJECT_MAPPER.readValue(item, KeyValueSet.class)); Dataset verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(KeyValueSet.class)); Assertions.assertEquals(5, verificationDs.count()); Assertions - .assertEquals( - 2, verificationDs - .filter("key = '40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f'") - .collectAsList() - .get(0) - .getValueSet() - .size()); + .assertEquals( + 2, verificationDs + .filter("key = 
'40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f'") + .collectAsList() + .get(0) + .getValueSet() + .size()); Assertions - .assertTrue( - verificationDs - .filter("key = '40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f'") - .collectAsList() - .get(0) - .getValueSet() - .contains("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")); + .assertTrue( + verificationDs + .filter("key = '40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f'") + .collectAsList() + .get(0) + .getValueSet() + .contains("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")); Assertions - .assertTrue( - verificationDs - .filter("key = '40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f'") - .collectAsList() - .get(0) - .getValueSet() - .contains("20|pippo_wf_001::2899e571609779168222fdeb59cb916d")); + .assertTrue( + verificationDs + .filter("key = '40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f'") + .collectAsList() + .get(0) + .getValueSet() + .contains("20|pippo_wf_001::2899e571609779168222fdeb59cb916d")); Assertions - .assertEquals( - 2, verificationDs - .filter("key = '40|dedup_wf_001::2899e571609779168222fdeb59cb916d'") - .collectAsList() - .get(0) - .getValueSet() - .size()); + .assertEquals( + 2, verificationDs + .filter("key = '40|dedup_wf_001::2899e571609779168222fdeb59cb916d'") + .collectAsList() + .get(0) + .getValueSet() + .size()); Assertions - .assertTrue( - verificationDs - .filter("key = '40|dedup_wf_001::2899e571609779168222fdeb59cb916d'") - .collectAsList() - .get(0) - .getValueSet() - .contains("20|doajarticles::396262ee936f3d3e26ff0e60bea6cae0")); + .assertTrue( + verificationDs + .filter("key = '40|dedup_wf_001::2899e571609779168222fdeb59cb916d'") + .collectAsList() + .get(0) + .getValueSet() + .contains("20|doajarticles::396262ee936f3d3e26ff0e60bea6cae0")); Assertions - .assertTrue( - verificationDs - .filter("key = '40|dedup_wf_001::2899e571609779168222fdeb59cb916d'") - .collectAsList() - .get(0) - .getValueSet() - 
.contains("20|pippo_wf_001::2899e571609779168222fdeb59cb916d")); + .assertTrue( + verificationDs + .filter("key = '40|dedup_wf_001::2899e571609779168222fdeb59cb916d'") + .collectAsList() + .get(0) + .getValueSet() + .contains("20|pippo_wf_001::2899e571609779168222fdeb59cb916d")); Assertions - .assertEquals( - 1, verificationDs - .filter("key = '40|doajarticles::03748bcb5d754c951efec9700e18a56d'") - .collectAsList() - .get(0) - .getValueSet() - .size()); + .assertEquals( + 1, verificationDs + .filter("key = '40|doajarticles::03748bcb5d754c951efec9700e18a56d'") + .collectAsList() + .get(0) + .getValueSet() + .size()); Assertions - .assertTrue( - verificationDs - .filter("key = '40|doajarticles::03748bcb5d754c951efec9700e18a56d'") - .collectAsList() - .get(0) - .getValueSet() - .contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); + .assertTrue( + verificationDs + .filter("key = '40|doajarticles::03748bcb5d754c951efec9700e18a56d'") + .collectAsList() + .get(0) + .getValueSet() + .contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); Assertions - .assertEquals( - 1, verificationDs - .filter("key = '40|openaire____::ec653e804967133b9436fdd30d3ff51d'") - .collectAsList() - .get(0) - .getValueSet() - .size()); + .assertEquals( + 1, verificationDs + .filter("key = '40|openaire____::ec653e804967133b9436fdd30d3ff51d'") + .collectAsList() + .get(0) + .getValueSet() + .size()); Assertions - .assertTrue( - verificationDs - .filter("key = '40|openaire____::ec653e804967133b9436fdd30d3ff51d'") - .collectAsList() - .get(0) - .getValueSet() - .contains("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); + .assertTrue( + verificationDs + .filter("key = '40|openaire____::ec653e804967133b9436fdd30d3ff51d'") + .collectAsList() + .get(0) + .getValueSet() + .contains("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); Assertions - .assertEquals( - 1, verificationDs - .filter("key = '40|doajarticles::1cae0b82b56ccd97c2db1f698def7074'") - .collectAsList() - 
.get(0) - .getValueSet() - .size()); + .assertEquals( + 1, verificationDs + .filter("key = '40|doajarticles::1cae0b82b56ccd97c2db1f698def7074'") + .collectAsList() + .get(0) + .getValueSet() + .size()); Assertions - .assertTrue( - verificationDs - .filter("key = '40|doajarticles::1cae0b82b56ccd97c2db1f698def7074'") - .collectAsList() - .get(0) - .getValueSet() - .contains("20|opendoar____::a5fcb8eb25ebd6f7cd219e0fa1e6ddc1")); + .assertTrue( + verificationDs + .filter("key = '40|doajarticles::1cae0b82b56ccd97c2db1f698def7074'") + .collectAsList() + .get(0) + .getValueSet() + .contains("20|opendoar____::a5fcb8eb25ebd6f7cd219e0fa1e6ddc1")); verificationDs - .foreach((ForeachFunction) v -> System.out.println(OBJECT_MAPPER.writeValueAsString(v))); + .foreach((ForeachFunction) v -> System.out.println(OBJECT_MAPPER.writeValueAsString(v))); } @@ -676,7 +676,7 @@ public class PrepareInfoJobTest { "-hive_metastore_uris", "", "-leavesPath", workingDir.toString() + "/currentIteration/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/", - "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-childParentPath", workingDir.toString() + "/childParentOrg/", "-relationPath", workingDir.toString() + "/relation" @@ -704,7 +704,7 @@ public class PrepareInfoJobTest { "-hive_metastore_uris", "", "-leavesPath", workingDir.toString() + "/currentIteration/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/", - "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-childParentPath", workingDir.toString() + "/childParentOrg/", "-relationPath", workingDir.toString() + "/relation" diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkJobTest.java 
b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkJobTest.java index eb4ade0da..2e75c75ad 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkJobTest.java @@ -85,9 +85,9 @@ public class SparkJobTest { .getPath(); final String projectOrgPath = getClass() - .getResource( - "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/") - .getPath(); + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/") + .getPath(); readPath(spark, leavesPath, Leaves.class) .write() @@ -100,9 +100,9 @@ public class SparkJobTest { .json(workingDir.toString() + "/orgsInput"); readPath(spark, projectOrgPath, KeyValueSet.class) - .write() - .option("compression", "gzip") - .json(workingDir.toString() + "/projectInput"); + .write() + .option("compression", "gzip") + .json(workingDir.toString() + "/projectInput"); SparkResultToOrganizationFromSemRel @@ -114,7 +114,7 @@ public class SparkJobTest { "-outputPath", workingDir.toString() + "/finalrelation", "-leavesPath", workingDir.toString() + "/leavesInput", "-resultOrgPath", workingDir.toString() + "/orgsInput", - "-projectOrganizationPath", workingDir.toString() + "/projectInput", + "-projectOrganizationPath", workingDir.toString() + "/projectInput", "-childParentPath", childParentPath, "-workingDir", workingDir.toString() }); @@ -161,19 +161,24 @@ public class SparkJobTest { .foreach(r -> Assertions.assertEquals(ModelConstants.HAS_AUTHOR_INSTITUTION, r.getRelClass())); Assertions .assertEquals( - 2, result.filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count()); + 2, + result.filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count()); Assertions .assertEquals( - 3, result.filter(r -> 
r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count()); + 3, + result.filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count()); Assertions .assertEquals( - 2, result.filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count()); + 2, + result.filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count()); Assertions .assertEquals( - 1, result.filter(r -> r.getSource().equals("50|openaire____::ec653e804967133b9436fdd30d3ff51d")).count()); + 1, + result.filter(r -> r.getSource().equals("50|openaire____::ec653e804967133b9436fdd30d3ff51d")).count()); Assertions .assertEquals( - 1, result.filter(r -> r.getSource().equals("50|doajarticles::03748bcb5d754c951efec9700e18a56d")).count()); + 1, + result.filter(r -> r.getSource().equals("50|doajarticles::03748bcb5d754c951efec9700e18a56d")).count()); Assertions.assertEquals(9, result.filter(r -> r.getSource().substring(0, 3).equals("20|")).count()); result @@ -181,19 +186,24 @@ public class SparkJobTest { .foreach(r -> Assertions.assertEquals(ModelConstants.IS_AUTHOR_INSTITUTION_OF, r.getRelClass())); Assertions .assertEquals( - 1, result.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count()); + 1, + result.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count()); Assertions .assertEquals( - 1, result.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count()); + 1, + result.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count()); Assertions .assertEquals( - 2, result.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count()); + 2, + result.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count()); Assertions .assertEquals( - 2, result.filter(r -> 
r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")).count()); + 2, + result.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")).count()); Assertions .assertEquals( - 3, result.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")).count()); + 3, + result.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")).count()); Assertions .assertTrue( @@ -336,332 +346,343 @@ public class SparkJobTest { public void completeProjectExecution() throws Exception { final String graphPath = getClass() - .getResource("/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph") - .getPath(); + .getResource("/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph") + .getPath(); final String leavesPath = getClass() - .getResource( - "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/") - .getPath(); + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/") + .getPath(); final String childParentPath = getClass() - .getResource( - "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/") - .getPath(); + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/") + .getPath(); final String resultOrgPath = getClass() - .getResource( - "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/") - .getPath(); + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/") + .getPath(); final String projectOrgPath = getClass() - .getResource( - "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/") - .getPath(); + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/") + .getPath(); readPath(spark, leavesPath, Leaves.class) - .write() - .option("compression", "gzip") - .json(workingDir.toString() + "/leavesInput"); + .write() + 
.option("compression", "gzip") + .json(workingDir.toString() + "/leavesInput"); readPath(spark, resultOrgPath, KeyValueSet.class) - .write() - .option("compression", "gzip") - .json(workingDir.toString() + "/orgsInput"); + .write() + .option("compression", "gzip") + .json(workingDir.toString() + "/orgsInput"); readPath(spark, projectOrgPath, KeyValueSet.class) - .write() - .option("compression", "gzip") - .json(workingDir.toString() + "/projectInput"); + .write() + .option("compression", "gzip") + .json(workingDir.toString() + "/projectInput"); SparkResultToOrganizationFromSemRel - .main( - new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-relationPath", graphPath, - "-hive_metastore_uris", "", - "-outputPath", workingDir.toString() + "/finalrelation", - "-leavesPath", workingDir.toString() + "/leavesInput", - "-resultOrgPath", workingDir.toString() + "/orgsInput", - "-projectOrganizationPath", workingDir.toString() + "/projectInput", - "-childParentPath", childParentPath, - "-workingDir", workingDir.toString() - }); + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-relationPath", graphPath, + "-hive_metastore_uris", "", + "-outputPath", workingDir.toString() + "/finalrelation", + "-leavesPath", workingDir.toString() + "/leavesInput", + "-resultOrgPath", workingDir.toString() + "/orgsInput", + "-projectOrganizationPath", workingDir.toString() + "/projectInput", + "-childParentPath", childParentPath, + "-workingDir", workingDir.toString() + }); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD temp = sc - .textFile(workingDir.toString() + "/finalrelation") - .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + .textFile(workingDir.toString() + "/finalrelation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); Assertions.assertEquals(36, temp.count()); - JavaRDD project = temp.filter(r -> r.getSource().startsWith("40|") || 
r.getTarget().startsWith("40|")); + JavaRDD project = temp + .filter(r -> r.getSource().startsWith("40|") || r.getTarget().startsWith("40|")); Assertions.assertEquals(18, project.count()); project.foreach(r -> Assertions.assertEquals(ModelConstants.PARTICIPATION, r.getSubRelType())); project.foreach(r -> Assertions.assertEquals(ModelConstants.PROJECT_ORGANIZATION, r.getRelType())); project - .foreach( - r -> Assertions - .assertEquals( - PropagationConstant.PROPAGATION_DATA_INFO_TYPE, r.getDataInfo().getInferenceprovenance())); + .foreach( + r -> Assertions + .assertEquals( + PropagationConstant.PROPAGATION_DATA_INFO_TYPE, r.getDataInfo().getInferenceprovenance())); project - .foreach( - r -> Assertions - .assertEquals( - PropagationConstant.PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID, - r.getDataInfo().getProvenanceaction().getClassid())); + .foreach( + r -> Assertions + .assertEquals( + PropagationConstant.PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID, + r.getDataInfo().getProvenanceaction().getClassid())); project - .foreach( - r -> Assertions - .assertEquals( - PropagationConstant.PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_NAME, - r.getDataInfo().getProvenanceaction().getClassname())); + .foreach( + r -> Assertions + .assertEquals( + PropagationConstant.PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_NAME, + r.getDataInfo().getProvenanceaction().getClassname())); project - .foreach( - r -> Assertions - .assertEquals( - "0.85", - r.getDataInfo().getTrust())); + .foreach( + r -> Assertions + .assertEquals( + "0.85", + r.getDataInfo().getTrust())); Assertions.assertEquals(9, project.filter(r -> r.getSource().substring(0, 3).equals("40|")).count()); project - .filter(r -> r.getSource().substring(0, 3).equals("40|")) - .foreach(r -> Assertions.assertEquals(ModelConstants.HAS_PARTICIPANT, r.getRelClass())); + .filter(r -> r.getSource().substring(0, 3).equals("40|")) + .foreach(r -> 
Assertions.assertEquals(ModelConstants.HAS_PARTICIPANT, r.getRelClass())); Assertions - .assertEquals( - 2, project.filter(r -> r.getSource().equals("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count()); + .assertEquals( + 2, + project.filter(r -> r.getSource().equals("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count()); Assertions - .assertEquals( - 3, project.filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count()); + .assertEquals( + 3, + project.filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count()); Assertions - .assertEquals( - 2, project.filter(r -> r.getSource().equals("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count()); + .assertEquals( + 2, + project.filter(r -> r.getSource().equals("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count()); Assertions - .assertEquals( - 1, project.filter(r -> r.getSource().equals("40|openaire____::ec653e804967133b9436fdd30d3ff51d")).count()); + .assertEquals( + 1, + project.filter(r -> r.getSource().equals("40|openaire____::ec653e804967133b9436fdd30d3ff51d")).count()); Assertions - .assertEquals( - 1, project.filter(r -> r.getSource().equals("40|doajarticles::03748bcb5d754c951efec9700e18a56d")).count()); + .assertEquals( + 1, + project.filter(r -> r.getSource().equals("40|doajarticles::03748bcb5d754c951efec9700e18a56d")).count()); Assertions.assertEquals(9, project.filter(r -> r.getSource().substring(0, 3).equals("20|")).count()); project - .filter(r -> r.getSource().substring(0, 3).equals("20|")) - .foreach(r -> Assertions.assertEquals(ModelConstants.IS_PARTICIPANT, r.getRelClass())); + .filter(r -> r.getSource().substring(0, 3).equals("20|")) + .foreach(r -> Assertions.assertEquals(ModelConstants.IS_PARTICIPANT, r.getRelClass())); Assertions - .assertEquals( - 1, project.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count()); + .assertEquals( + 1, + project.filter(r 
-> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count()); Assertions - .assertEquals( - 1, project.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count()); + .assertEquals( + 1, + project.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count()); Assertions - .assertEquals( - 2, project.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count()); + .assertEquals( + 2, + project.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count()); Assertions - .assertEquals( - 2, project.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")).count()); + .assertEquals( + 2, + project.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")).count()); Assertions - .assertEquals( - 3, project.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")).count()); + .assertEquals( + 3, + project.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")).count()); Assertions - .assertTrue( - project - .filter(r -> r.getSource().equals("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")) - .map(r -> r.getTarget()) - .collect() - .contains("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); + .assertTrue( + project + .filter(r -> r.getSource().equals("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")) + .map(r -> r.getTarget()) + .collect() + .contains("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); Assertions - .assertTrue( - project - .filter(r -> r.getSource().equals("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")) - .map(r -> r.getTarget()) - .collect() - .contains("20|openaire____::ec653e804967133b9436fdd30d3ff51d")); + .assertTrue( + project + .filter(r -> r.getSource().equals("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")) + .map(r -> r.getTarget()) + .collect() 
+ .contains("20|openaire____::ec653e804967133b9436fdd30d3ff51d")); Assertions - .assertTrue( - project - .filter(r -> r.getSource().equals("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) - .map(r -> r.getTarget()) - .collect() - .contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); + .assertTrue( + project + .filter(r -> r.getSource().equals("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) + .map(r -> r.getTarget()) + .collect() + .contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); Assertions - .assertTrue( - project - .filter(r -> r.getSource().equals("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) - .map(r -> r.getTarget()) - .collect() - .contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d")); + .assertTrue( + project + .filter(r -> r.getSource().equals("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) + .map(r -> r.getTarget()) + .collect() + .contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d")); Assertions - .assertTrue( - project - .filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")) - .map(r -> r.getTarget()) - .collect() - .contains("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")); + .assertTrue( + project + .filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")) + .map(r -> r.getTarget()) + .collect() + .contains("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")); Assertions - .assertTrue( - project - .filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")) - .map(r -> r.getTarget()) - .collect() - .contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); + .assertTrue( + project + .filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")) + .map(r -> r.getTarget()) + .collect() + .contains("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); Assertions - .assertTrue( - project - .filter(r -> 
r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")) - .map(r -> r.getTarget()) - .collect() - .contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d")); + .assertTrue( + project + .filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")) + .map(r -> r.getTarget()) + .collect() + .contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d")); Assertions - .assertTrue( - project - .filter(r -> r.getSource().equals("40|openaire____::ec653e804967133b9436fdd30d3ff51d")) - .map(r -> r.getTarget()) - .collect() - .contains("20|openaire____::ec653e804967133b9436fdd30d3ff51d")); + .assertTrue( + project + .filter(r -> r.getSource().equals("40|openaire____::ec653e804967133b9436fdd30d3ff51d")) + .map(r -> r.getTarget()) + .collect() + .contains("20|openaire____::ec653e804967133b9436fdd30d3ff51d")); Assertions - .assertTrue( - project - .filter(r -> r.getSource().equals("40|doajarticles::03748bcb5d754c951efec9700e18a56d")) - .map(r -> r.getTarget()) - .collect() - .contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d")); + .assertTrue( + project + .filter(r -> r.getSource().equals("40|doajarticles::03748bcb5d754c951efec9700e18a56d")) + .map(r -> r.getTarget()) + .collect() + .contains("20|doajarticles::03748bcb5d754c951efec9700e18a56d")); Assertions - .assertTrue( - project - .filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")) - .map(r -> r.getTarget()) - .collect() - .contains("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); + .assertTrue( + project + .filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")) + .map(r -> r.getTarget()) + .collect() + .contains("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); Assertions - .assertTrue( - project - .filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")) - .map(r -> r.getTarget()) - .collect() - 
.contains("40|openaire____::ec653e804967133b9436fdd30d3ff51d")); + .assertTrue( + project + .filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")) + .map(r -> r.getTarget()) + .collect() + .contains("40|openaire____::ec653e804967133b9436fdd30d3ff51d")); Assertions - .assertTrue( - project - .filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) - .map(r -> r.getTarget()) - .collect() - .contains("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")); + .assertTrue( + project + .filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) + .map(r -> r.getTarget()) + .collect() + .contains("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")); Assertions - .assertTrue( - project - .filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) - .map(r -> r.getTarget()) - .collect() - .contains("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); + .assertTrue( + project + .filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")) + .map(r -> r.getTarget()) + .collect() + .contains("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); Assertions - .assertTrue( - project - .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")) - .map(r -> r.getTarget()) - .collect() - .contains("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")); + .assertTrue( + project + .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")) + .map(r -> r.getTarget()) + .collect() + .contains("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")); Assertions - .assertTrue( - project - .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")) - .map(r -> r.getTarget()) - .collect() - .contains("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); + .assertTrue( + project + .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")) 
+ .map(r -> r.getTarget()) + .collect() + .contains("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")); Assertions - .assertTrue( - project - .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")) - .map(r -> r.getTarget()) - .collect() - .contains("40|doajarticles::03748bcb5d754c951efec9700e18a56d")); + .assertTrue( + project + .filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")) + .map(r -> r.getTarget()) + .collect() + .contains("40|doajarticles::03748bcb5d754c951efec9700e18a56d")); Assertions - .assertTrue( - project - .filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")) - .map(r -> r.getTarget()) - .collect() - .contains("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")); + .assertTrue( + project + .filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")) + .map(r -> r.getTarget()) + .collect() + .contains("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")); Assertions - .assertTrue( - project - .filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")) - .map(r -> r.getTarget()) - .collect() - .contains("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); + .assertTrue( + project + .filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")) + .map(r -> r.getTarget()) + .collect() + .contains("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")); } @Test public void singleIterationExecution() throws Exception { final String graphPath = getClass() - .getResource("/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph") - .getPath(); + .getResource("/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/graph") + .getPath(); final String leavesPath = getClass() - .getResource( - "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/") - .getPath(); + .getResource( + 
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/") + .getPath(); final String childParentPath = getClass() - .getResource( - "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/") - .getPath(); + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/") + .getPath(); final String resultOrgPath = getClass() - .getResource( - "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/") - .getPath(); + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/") + .getPath(); final String projectOrgPath = getClass() - .getResource( - "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/") - .getPath(); + .getResource( + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/projectOrganization/") + .getPath(); readPath(spark, leavesPath, Leaves.class) - .write() - .option("compression", "gzip") - .json(workingDir.toString() + "/leavesInput"); + .write() + .option("compression", "gzip") + .json(workingDir.toString() + "/leavesInput"); readPath(spark, resultOrgPath, KeyValueSet.class) - .write() - .option("compression", "gzip") - .json(workingDir.toString() + "/orgsInput"); + .write() + .option("compression", "gzip") + .json(workingDir.toString() + "/orgsInput"); readPath(spark, projectOrgPath, KeyValueSet.class) - .write() - .option("compression", "gzip") - .json(workingDir.toString() + "/projectInput"); + .write() + .option("compression", "gzip") + .json(workingDir.toString() + "/projectInput"); SparkResultToOrganizationFromSemRel - .main( - new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-relationPath", graphPath, - "-hive_metastore_uris", "", - "-outputPath", workingDir.toString() + "/finalrelation", - "-leavesPath", workingDir.toString() + "/leavesInput", - "-resultOrgPath", workingDir.toString() + "/orgsInput", - "-projectOrganizationPath", workingDir.toString() + "/projectInput", - 
"-childParentPath", childParentPath, - "-workingDir", workingDir.toString(), - "-iterations", "1" - }); + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-relationPath", graphPath, + "-hive_metastore_uris", "", + "-outputPath", workingDir.toString() + "/finalrelation", + "-leavesPath", workingDir.toString() + "/leavesInput", + "-resultOrgPath", workingDir.toString() + "/orgsInput", + "-projectOrganizationPath", workingDir.toString() + "/projectInput", + "-childParentPath", childParentPath, + "-workingDir", workingDir.toString(), + "-iterations", "1" + }); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD temp = sc - .textFile(workingDir.toString() + "/finalrelation") - .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + .textFile(workingDir.toString() + "/finalrelation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); Assertions.assertEquals(16, temp.count()); diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActionsTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActionsTest.java index 7a71240b2..64339e3b7 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActionsTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActionsTest.java @@ -87,7 +87,8 @@ public class StepActionsTest { getClass() .getResource( "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/") - .getPath(), ModelConstants.HAS_AUTHOR_INSTITUTION); + .getPath(), + ModelConstants.HAS_AUTHOR_INSTITUTION); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());