From 0389b57ca7cf0ba54e8949e5cecec20c3991d8bd Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 31 May 2023 11:06:58 +0200 Subject: [PATCH] added propagation for project to organization --- .../eu/dnetlib/dhp/PropagationConstant.java | 31 +++++++++- .../Leaves.java | 2 +- .../PrepareInfo.java | 47 +++++++++++---- .../PropagationCounter.java | 2 +- .../SparkResultToOrganizationFromSemRel.java | 59 +++++++++++++------ .../StepActions.java | 30 ++++------ .../input_preparation_parameter.json | 6 ++ .../input_propagation_parameter.json | 7 ++- .../oozie_app/config-default.xml | 0 .../oozie_app/workflow.xml | 5 +- .../PrepareInfoJobTest.java | 28 ++++----- .../SparkJobTest.java | 4 +- .../StepActionsTest.java | 2 +- .../childparenttest1/relation | 0 .../childparenttest2/relation | 0 .../execstep/childParentOrg/childparent | 0 .../execstep/currentIteration/leaves | 0 .../execstep/relation | 0 .../execstep/relsforiteration1/relation | 0 .../resultOrganization/resultorganization | 0 .../resultorganizationtest/relation | 0 21 files changed, 153 insertions(+), 70 deletions(-) rename dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/Leaves.java (79%) rename dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/PrepareInfo.java (78%) rename dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/PropagationCounter.java (97%) rename dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/SparkResultToOrganizationFromSemRel.java (80%) rename dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/StepActions.java (89%) rename dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/input_preparation_parameter.json (87%) rename dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/input_propagation_parameter.json (90%) rename dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/oozie_app/config-default.xml (100%) rename dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/oozie_app/workflow.xml (96%) rename dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/PrepareInfoJobTest.java (94%) rename dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/SparkJobTest.java (98%) rename dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/StepActionsTest.java (99%) rename dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/childparenttest1/relation (100%) rename dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/childparenttest2/relation (100%) rename dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/execstep/childParentOrg/childparent (100%) rename dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/execstep/currentIteration/leaves (100%) rename dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/execstep/relation (100%) rename dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/execstep/relsforiteration1/relation (100%) rename dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/execstep/resultOrganization/resultorganization (100%) rename dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/{resulttoorganizationfromsemrel => entitytoorganizationfromsemrel}/resultorganizationtest/relation (100%) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java index 89bdf0982..53769c9fb 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java @@ -57,7 +57,10 @@ public class PropagationConstant { public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME = "Propagation of affiliation to result collected from datasources of type institutional repository"; public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID = "result:organization:semrel"; - public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME = "Propagation of affiliation to result through sematic relations"; + public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME = "Propagation of affiliation to result through semantic relations"; + + public static final String PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID = "project:organization:semrel"; + public static final String PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_NAME = "Propagation of participation to project through semantic relations"; public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID = "result:project:semrel"; public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME = "Propagation of result to project through semantic relation"; @@ -171,6 +174,32 @@ public class PropagationConstant { return newRelations; } + public static Relation getParticipantRelation( + String source, + String target, + String rel_class) { + return getRelation(source, target , + rel_class, + ModelConstants.PROJECT_ORGANIZATION, + ModelConstants.PARTICIPATION, + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_ID, + PROPAGATION_RELATION_PROJECT_ORGANIZATION_SEM_REL_CLASS_NAME); + } + + public static Relation getAffiliationRelation( + String source, + String target, + String rel_class) { + return getRelation(source, target , + rel_class, + ModelConstants.RESULT_ORGANIZATION, + ModelConstants.AFFILIATION, + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID, + PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME); + } + public static Relation getRelation( String source, String target, diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/Leaves.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/Leaves.java similarity index 79% rename from dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/Leaves.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/Leaves.java index 7984721e8..e010b54c0 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/Leaves.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/Leaves.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.resulttoorganizationfromsemrel; +package eu.dnetlib.dhp.entitytoorganizationfromsemrel; import java.io.Serializable; diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PrepareInfo.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java similarity index 78% rename from dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PrepareInfo.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java index 23909fd9a..7ad9c4cee 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PrepareInfo.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.resulttoorganizationfromsemrel; +package eu.dnetlib.dhp.entitytoorganizationfromsemrel; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; @@ -7,6 +7,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.io.Serializable; import java.util.*; +import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.*; @@ -14,8 +15,6 @@ import org.apache.spark.sql.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.databind.ObjectMapper; - import eu.dnetlib.dhp.KeyValueSet; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob; @@ -47,13 +46,20 @@ public class PrepareInfo implements Serializable { "' and datainfo.deletedbyinference = false " + "GROUP BY source"; + // associate projects to all the participant orgs + private static final String PROJECT_ORGANIZATION_QUERY = "SELECT source key, collect_set(target) as valueSet " + + "FROM relation " + + "WHERE lower(relclass) = '" + ModelConstants.IS_PARTICIPANT.toLowerCase() + + "' and datainfo.deletedbyinference = false " + + "GROUP BY source"; + public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils .toString( SparkResultToOrganizationFromIstRepoJob.class .getResourceAsStream( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/input_preparation_parameter.json")); + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_preparation_parameter.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); @@ -74,6 +80,9 @@ public class PrepareInfo implements Serializable { final String resultOrganizationPath = parser.get("resultOrgPath"); log.info("resultOrganizationPath: {}", resultOrganizationPath); + final String projectOrgPath = parser.get("projectOrganizationPath"); + log.info("projectOrgPath: {}", projectOrgPath); + final String relationPath = parser.get("relationPath"); log.info("relationPath: {}", relationPath); @@ -89,11 +98,12 @@ public class PrepareInfo implements Serializable { childParentPath, leavesPath, resultOrganizationPath, + projectOrgPath, relationPath)); } private static void prepareInfo(SparkSession spark, String inputPath, String childParentOrganizationPath, - String currentIterationPath, String resultOrganizationPath, String relationPath) { + String currentIterationPath, String resultOrganizationPath, String resultProjectPath, String relationPath) { Dataset relation = readPath(spark, inputPath + "/relation", Relation.class); relation.createOrReplaceTempView("relation"); @@ -113,14 +123,31 @@ public class PrepareInfo implements Serializable { .option("compression", "gzip") .json(resultOrganizationPath); + spark + .sql(PROJECT_ORGANIZATION_QUERY) + .as(Encoders.bean(KeyValueSet.class)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(resultProjectPath); + relation - .filter( - (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && - r.getRelClass().equals(ModelConstants.HAS_AUTHOR_INSTITUTION)) - .write() + .filter( + (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && + r.getRelClass().equals(ModelConstants.HAS_AUTHOR_INSTITUTION)) + .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") - .json(relationPath); + .json(relationPath + "/result"); + + relation + .filter( + (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && + r.getRelClass().equals(ModelConstants.IS_PARTICIPANT)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(relationPath + "/project"); Dataset children = spark .sql( diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PropagationCounter.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PropagationCounter.java similarity index 97% rename from dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PropagationCounter.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PropagationCounter.java index 788eff0e3..1c408d1c3 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PropagationCounter.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PropagationCounter.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.resulttoorganizationfromsemrel; +package eu.dnetlib.dhp.entitytoorganizationfromsemrel; import java.io.Serializable; diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java similarity index 80% rename from dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java index cfc69a8f0..19e55a905 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.resulttoorganizationfromsemrel; +package eu.dnetlib.dhp.entitytoorganizationfromsemrel; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; @@ -30,7 +30,8 @@ import eu.dnetlib.dhp.schema.oaf.Relation; public class SparkResultToOrganizationFromSemRel implements Serializable { private static final Logger log = LoggerFactory.getLogger(SparkResultToOrganizationFromSemRel.class); private static final int MAX_ITERATION = 5; - public static final String NEW_RELATION_PATH = "/newRelation"; + public static final String NEW_RESULT_RELATION_PATH = "/newResultRelation"; + public static final String NEW_PROJECT_RELATION_PATH = "/newProjectRelation"; public static void main(String[] args) throws Exception { @@ -62,6 +63,9 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { final String resultOrganizationPath = parser.get("resultOrgPath"); log.info("resultOrganizationPath: {}", resultOrganizationPath); + final String projectOrganizationPath = parser.get("projectOrganizationPath"); + log.info("projectOrganizationPath: {}", projectOrganizationPath); + final String workingPath = parser.get("workingDir"); log.info("workingPath: {}", workingPath); @@ -88,6 +92,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { leavesPath, childParentPath, resultOrganizationPath, + projectOrganizationPath, relationPath, workingPath, outputPath, @@ -98,13 +103,14 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { String leavesPath, String childParentPath, String resultOrganizationPath, + String projectOrganizationPath, String graphPath, String workingPath, String outputPath, int iterations) { if (iterations == 1) { doPropagateOnce( - spark, leavesPath, childParentPath, resultOrganizationPath, graphPath, + spark, leavesPath, childParentPath, resultOrganizationPath, projectOrganizationPath, graphPath, workingPath, outputPath); } else { @@ -130,15 +136,22 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { } private static void doPropagateOnce(SparkSession spark, String leavesPath, String childParentPath, - String resultOrganizationPath, String graphPath, String workingPath, + String resultOrganizationPath, String projectOrganizationPath, String graphPath, String workingPath, String outputPath) { StepActions .execStep( - spark, graphPath, workingPath + NEW_RELATION_PATH, + spark, graphPath + "/result", workingPath + NEW_RESULT_RELATION_PATH, leavesPath, childParentPath, resultOrganizationPath); - addNewRelations(spark, workingPath + NEW_RELATION_PATH, outputPath); + addNewRelations(spark, workingPath + NEW_RESULT_RELATION_PATH, outputPath); + + StepActions + .execStep( + spark, graphPath + "/project", workingPath + NEW_PROJECT_RELATION_PATH, + leavesPath, childParentPath, projectOrganizationPath); + + addNewRelations(spark, workingPath + NEW_PROJECT_RELATION_PATH, outputPath); } private static void doPropagate(SparkSession spark, String leavesPath, String childParentPath, @@ -151,11 +164,11 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { iteration++; StepActions .execStep( - spark, graphPath, workingPath + NEW_RELATION_PATH, + spark, graphPath, workingPath + NEW_RESULT_RELATION_PATH, leavesPath, childParentPath, resultOrganizationPath); StepActions .prepareForNextStep( - spark, workingPath + NEW_RELATION_PATH, resultOrganizationPath, leavesPath, + spark, workingPath + NEW_RESULT_RELATION_PATH, resultOrganizationPath, leavesPath, childParentPath, workingPath + "/leaves", workingPath + "/resOrg"); moveOutput(spark, workingPath, leavesPath, resultOrganizationPath); leavesCount = readPath(spark, leavesPath, Leaves.class).count(); @@ -185,7 +198,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { propagationCounter.getNotReachedFirstParent().add(1); } - addNewRelations(spark, workingPath + NEW_RELATION_PATH, outputPath); + addNewRelations(spark, workingPath + NEW_RESULT_RELATION_PATH, outputPath); } private static void moveOutput(SparkSession spark, String workingPath, String leavesPath, @@ -212,16 +225,24 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { .mapGroups( (MapGroupsFunction) (k, it) -> it.next(), Encoders.bean(Relation.class)) .flatMap( - (FlatMapFunction) r -> Arrays - .asList( - r, getRelation( - r.getTarget(), r.getSource(), ModelConstants.IS_AUTHOR_INSTITUTION_OF, - ModelConstants.RESULT_ORGANIZATION, - ModelConstants.AFFILIATION, - PROPAGATION_DATA_INFO_TYPE, - PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID, - PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME)) - .iterator() + (FlatMapFunction) r -> + { + if(r.getSource().startsWith("50|")){ + return Arrays + .asList( + r, getAffiliationRelation( + r.getTarget(), r.getSource(), ModelConstants.IS_AUTHOR_INSTITUTION_OF)) + .iterator(); + }else{ + return Arrays + .asList( + r, getParticipantRelation( + r.getTarget(), r.getSource(), ModelConstants.HAS_PARTICIPANT)) + .iterator(); + } + } + + , Encoders.bean(Relation.class)) .write() diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/StepActions.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActions.java similarity index 89% rename from dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/StepActions.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActions.java index 1adbbe60e..5b6c397cf 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/StepActions.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActions.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.resulttoorganizationfromsemrel; +package eu.dnetlib.dhp.entitytoorganizationfromsemrel; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.PropagationConstant.readPath; @@ -14,8 +14,6 @@ import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.jetbrains.annotations.NotNull; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; @@ -28,13 +26,13 @@ public class StepActions implements Serializable { public static void execStep(SparkSession spark, String graphPath, String newRelationPath, - String leavesPath, String chldParentOrgPath, String resultOrgPath) { + String leavesPath, String chldParentOrgPath, String entityOrgPath) { Dataset relationGraph = readPath(spark, graphPath, Relation.class); // select only the relation source target among those proposed by propagation that are not already existent getNewRels( newRelationPath, relationGraph, - getPropagationRelation(spark, leavesPath, chldParentOrgPath, resultOrgPath)); + getPropagationRelation(spark, leavesPath, chldParentOrgPath, entityOrgPath, ModelConstants.HAS_AUTHOR_INSTITUTION)); } @@ -152,19 +150,20 @@ public class StepActions implements Serializable { private static Dataset getPropagationRelation(SparkSession spark, String leavesPath, String chldParentOrgPath, - String resultOrgPath) { + String entityOrgPath, + String semantics) { Dataset childParent = readPath(spark, chldParentOrgPath, KeyValueSet.class); - Dataset resultOrg = readPath(spark, resultOrgPath, KeyValueSet.class); + Dataset entityOrg = readPath(spark, entityOrgPath, KeyValueSet.class); Dataset leaves = readPath(spark, leavesPath, Leaves.class); childParent.createOrReplaceTempView("childParent"); - resultOrg.createOrReplaceTempView("resultOrg"); + entityOrg.createOrReplaceTempView("entityOrg"); leaves.createOrReplaceTempView("leaves"); Dataset resultParent = spark .sql( - "SELECT resId as key, " + + "SELECT entityId as key, " + "collect_set(parent) valueSet " + "FROM (SELECT key as child, parent " + " FROM childParent " + @@ -172,7 +171,7 @@ public class StepActions implements Serializable { "JOIN leaves " + "ON leaves.value = cp.child " + "JOIN (" + - "SELECT key as resId, org " + + "SELECT key as entityId, org " + "FROM resultOrg " + "LATERAL VIEW explode (valueSet) ks as org ) as ro " + "ON leaves.value = ro.org " + @@ -186,19 +185,16 @@ public class StepActions implements Serializable { .getValueSet() .stream() .map( - orgId -> getRelation( + orgId -> getAffiliationRelation( v.getKey(), orgId, - ModelConstants.HAS_AUTHOR_INSTITUTION, - ModelConstants.RESULT_ORGANIZATION, - ModelConstants.AFFILIATION, - PROPAGATION_DATA_INFO_TYPE, - PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID, - PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME)) + semantics)) .collect(Collectors.toList()) .iterator(), Encoders.bean(Relation.class)); } + + } diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/input_preparation_parameter.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_preparation_parameter.json similarity index 87% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/input_preparation_parameter.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_preparation_parameter.json index c79bfe05d..b59937331 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/input_preparation_parameter.json +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_preparation_parameter.json @@ -40,5 +40,11 @@ "paramLongName": "relationPath", "paramDescription": "the path where to store the selected subset of relations", "paramRequired": false + }, + { + "paramName": "pop", + "paramLongName": "projectOrganizationPath", + "paramDescription": "the number of iterations to be computed", + "paramRequired": true } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/input_propagation_parameter.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_propagation_parameter.json similarity index 90% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/input_propagation_parameter.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_propagation_parameter.json index e09cd62fa..5a8597f38 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/input_propagation_parameter.json +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_propagation_parameter.json @@ -52,5 +52,10 @@ "paramLongName": "iterations", "paramDescription": "the number of iterations to be computed", "paramRequired": false - } + },{ + "paramName": "pop", + "paramLongName": "projectOrganizationPath", + "paramDescription": "the number of iterations to be computed", + "paramRequired": true +} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/oozie_app/config-default.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/oozie_app/workflow.xml similarity index 96% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/oozie_app/workflow.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/oozie_app/workflow.xml index 5ce2f5c06..ff6ec8f37 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/oozie_app/workflow.xml @@ -134,7 +134,7 @@ yarn cluster PrepareResultOrganizationAssociation - eu.dnetlib.dhp.resulttoorganizationfromsemrel.PrepareInfo + eu.dnetlib.dhp.entitytoorganizationfromsemrel.PrepareInfo dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -150,6 +150,7 @@ --leavesPath${workingDir}/preparedInfo/leavesPath --childParentPath${workingDir}/preparedInfo/childParentPath --resultOrgPath${workingDir}/preparedInfo/resultOrgPath + --projectOrganizationPath${workingDir}/preparedInfo/projectOrganizationPath --relationPath${workingDir}/preparedInfo/relation @@ -161,7 +162,7 @@ yarn cluster resultToOrganizationFromSemRel - eu.dnetlib.dhp.resulttoorganizationfromsemrel.SparkResultToOrganizationFromSemRel + eu.dnetlib.dhp.entitytoorganizationfromsemrel.SparkResultToOrganizationFromSemRel dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PrepareInfoJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfoJobTest.java similarity index 94% rename from dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PrepareInfoJobTest.java rename to dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfoJobTest.java index 2d2668db3..3d7086739 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PrepareInfoJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfoJobTest.java @@ -1,22 +1,17 @@ -package eu.dnetlib.dhp.resulttoorganizationfromsemrel; - -import static eu.dnetlib.dhp.PropagationConstant.readPath; +package eu.dnetlib.dhp.entitytoorganizationfromsemrel; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.List; import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.ForeachFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.Assertions; @@ -28,7 +23,6 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.KeyValueSet; -import eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob; import eu.dnetlib.dhp.schema.oaf.Relation; public class PrepareInfoJobTest { @@ -78,11 +72,12 @@ public class PrepareInfoJobTest { "-isSparkSessionManaged", Boolean.FALSE.toString(), "-graphPath", getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/childparenttest1") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/childparenttest1") .getPath(), "-hive_metastore_uris", "", "-leavesPath", workingDir.toString() + "/currentIteration/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-childParentPath", workingDir.toString() + "/childParentOrg/", "-relationPath", workingDir.toString() + "/relation" @@ -223,11 +218,12 @@ public class PrepareInfoJobTest { "-isSparkSessionManaged", Boolean.FALSE.toString(), "-graphPath", getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/childparenttest2") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/childparenttest2") .getPath(), "-hive_metastore_uris", "", "-leavesPath", workingDir.toString() + "/currentIteration/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-childParentPath", workingDir.toString() + "/childParentOrg/", "-relationPath", workingDir.toString() + "/relation" @@ -343,11 +339,12 @@ public class PrepareInfoJobTest { "-isSparkSessionManaged", Boolean.FALSE.toString(), "-graphPath", getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/resultorganizationtest") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/resultorganizationtest") .getPath(), "-hive_metastore_uris", "", "-leavesPath", workingDir.toString() + "/currentIteration/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-childParentPath", workingDir.toString() + "/childParentOrg/", "-relationPath", workingDir.toString() + "/relation" @@ -355,7 +352,7 @@ public class PrepareInfoJobTest { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD tmp = sc - .textFile(workingDir.toString() + "/relation") + .textFile(workingDir.toString() + "/relation/result") .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); Dataset verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); @@ -373,11 +370,12 @@ public class PrepareInfoJobTest { "-isSparkSessionManaged", Boolean.FALSE.toString(), "-graphPath", getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/resultorganizationtest") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/resultorganizationtest") .getPath(), "-hive_metastore_uris", "", "-leavesPath", workingDir.toString() + "/currentIteration/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-childParentPath", workingDir.toString() + "/childParentOrg/", "-relationPath", workingDir.toString() + "/relation" @@ -507,11 +505,12 @@ public class PrepareInfoJobTest { "-isSparkSessionManaged", Boolean.FALSE.toString(), "-graphPath", getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/resultorganizationtest") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/resultorganizationtest") .getPath(), "-hive_metastore_uris", "", "-leavesPath", workingDir.toString() + "/currentIteration/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-childParentPath", workingDir.toString() + "/childParentOrg/", "-relationPath", workingDir.toString() + "/relation" @@ -534,11 +533,12 @@ public class PrepareInfoJobTest { "-isSparkSessionManaged", Boolean.FALSE.toString(), "-graphPath", getClass() .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfromsemrel/childparenttest1") + "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/childparenttest1") .getPath(), "-hive_metastore_uris", "", "-leavesPath", workingDir.toString() + "/currentIteration/", "-resultOrgPath", workingDir.toString() + "/resultOrganization/", + "-projectOrganizationPath", workingDir.toString() + "/projectOrganization/", "-childParentPath", workingDir.toString() + "/childParentOrg/", "-relationPath", workingDir.toString() + "/relation" diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/SparkJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkJobTest.java similarity index 98% rename from dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/SparkJobTest.java rename to dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkJobTest.java index 7dd575b66..a4d8f83e3 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/SparkJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkJobTest.java @@ -1,7 +1,6 @@ -package eu.dnetlib.dhp.resulttoorganizationfromsemrel; +package eu.dnetlib.dhp.entitytoorganizationfromsemrel; -import static eu.dnetlib.dhp.PropagationConstant.isSparkSessionManaged; import static eu.dnetlib.dhp.PropagationConstant.readPath; import java.io.IOException; @@ -12,7 +11,6 @@ import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.ForeachFunction; import org.apache.spark.sql.SparkSession; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.Assertions; diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/StepActionsTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActionsTest.java similarity index 99% rename from dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/StepActionsTest.java rename to dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActionsTest.java index 5c715f3b9..77ed4dcbf 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/StepActionsTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActionsTest.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.resulttoorganizationfromsemrel; +package eu.dnetlib.dhp.entitytoorganizationfromsemrel; import java.io.IOException; import java.nio.file.Files; diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/childparenttest1/relation b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/childparenttest1/relation similarity index 100% rename from dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/childparenttest1/relation rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/childparenttest1/relation diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/childparenttest2/relation b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/childparenttest2/relation similarity index 100% rename from dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/childparenttest2/relation rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/childparenttest2/relation diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/childParentOrg/childparent b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/childparent similarity index 100% rename from dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/childParentOrg/childparent rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/childParentOrg/childparent diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/currentIteration/leaves b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/leaves similarity index 100% rename from dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/currentIteration/leaves rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/currentIteration/leaves diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/relation b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/relation similarity index 100% rename from dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/relation rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/relation diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/relsforiteration1/relation b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/relsforiteration1/relation similarity index 100% rename from dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/relsforiteration1/relation rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/relsforiteration1/relation diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/resultOrganization/resultorganization b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/resultorganization similarity index 100% rename from dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/execstep/resultOrganization/resultorganization rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/resultorganization diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/resultorganizationtest/relation b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/resultorganizationtest/relation similarity index 100% rename from dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/resultorganizationtest/relation rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/resultorganizationtest/relation