From b9d124bb7cac298304fa37a4e7a74afe30646a53 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 3 Nov 2021 13:55:37 +0100 Subject: [PATCH] [Enrichment: Propagation through parent-child relationships] Added counters, and changed constraint to verify if filtering out the relation (from classname = harvested to classid != propagation) --- .../PrepareInfo.java | 5 --- .../SparkResultToOrganizationFromSemRel.java | 31 +++---------------- .../StepActions.java | 6 ++-- .../oozie_app/workflow.xml | 2 +- 4 files changed, 8 insertions(+), 36 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PrepareInfo.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PrepareInfo.java index 71b032b2b..732ba2c2d 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PrepareInfo.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/PrepareInfo.java @@ -6,9 +6,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.io.Serializable; import java.util.*; -import java.util.stream.Collectors; -import org.apache.commons.collections.iterators.ArrayListIterator; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.*; @@ -17,15 +15,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import com.sun.tools.internal.ws.processor.model.Model; import eu.dnetlib.dhp.KeyValueSet; -import eu.dnetlib.dhp.PropagationConstant; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Relation; -import net.sf.saxon.expr.instruct.ForEach; import scala.Tuple2; /** diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java index e716f8d86..a900fe0d8 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java @@ -2,7 +2,7 @@ package eu.dnetlib.dhp.resulttoorganizationfromsemrel; import static eu.dnetlib.dhp.PropagationConstant.*; -import static eu.dnetlib.dhp.common.Constants.*; + import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import java.io.Serializable; @@ -22,12 +22,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.KeyValueSet; -import eu.dnetlib.dhp.PropagationConstant; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Relation; + public class SparkResultToOrganizationFromSemRel implements Serializable { private static final Logger log = LoggerFactory.getLogger(SparkResultToOrganizationFromSemRel.class); private static final int MAX_ITERATION = 5; @@ -113,7 +114,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { String resultOrganizationPath, String graphPath, String workingPath, String outputPath, PropagationCounter propagationCounter) { int iteration = 0; - long leavesCount = 0; + long leavesCount; do { iteration++; @@ -199,30 +200,6 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { .json(outputPath); } - // per ogni figlio nel file delle organizzazioni - // devo fare una roba iterativa che legge info da un file e le cambia via via - // passo 1: creo l'informazione iniale: organizzazioni che non hanno figli con almeno un padre - // ogni organizzazione punta alla lista di padri - // eseguo la propagazione dall'organizzazione figlio all'organizzazione padre - // ricerco nel dataset delle relazioni se l'organizzazione a cui ho propagato ha, a sua volta, dei padri - // e propago anche a quelli e cosi' via fino a che arrivo ad organizzazione senza padre - // organizationFile: - // f => {p1, p2, ..., pn} - // resultFile - // o => {r1, r2, ... rm} - - // supponiamo che f => {r1, r2} e che nessuno dei padri abbia gia' l'associazione con questi result - // quindi - // p1 => {r1, r2} - // p2 => {r1, r2} - // pn => {r1, r2} - - // mi serve un file con tutta la gerarchia per organizzazioni - // un file con le organizzazioni foglia da joinare con l'altro file - // un file con le associazioni organizzazione -> result forse meglio result -> organization - - // eseguo gli step fino a che ho foglie nel set - // quando non ne ho piu' creo relazioni doppio verso per le nuove propagate che ho introdotto } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/StepActions.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/StepActions.java index 2d2fd866d..affe1f56f 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/StepActions.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfromsemrel/StepActions.java @@ -131,11 +131,11 @@ public class StepActions implements Serializable { if (relationList .stream() .filter( - rel -> rel + rel -> !rel .getDataInfo() .getProvenanceaction() - .getClassname() - .equals(ModelConstants.HARVESTED)) + .getClassid() + .equals(PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID)) .count() > 0) { return null; } diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/oozie_app/workflow.xml index 32307e13d..e62ce0f5a 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfromsemrel/oozie_app/workflow.xml @@ -176,7 +176,7 @@ --graphPath${sourcePath} --outputPath${outputPath}/relation --leavesPath${workingDir}/preparedInfo/leavesPath - --leavesPath${workingDir}/preparedInfo/childParentPath + --childParentPath${workingDir}/preparedInfo/childParentPath --resultOrgPath${workingDir}/preparedInfo/resultOrgPath --hive_metastore_uris${hive_metastore_uris} --workingDir${workingDir}/working