From fe36895c53abd194a3dfc30b8edf4506ba8463e7 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 22 Jan 2021 11:55:10 +0100 Subject: [PATCH] added datasource blacklist for the organization to result propagation through institutional repositories --- .../PrepareResultInstRepoAssociation.java | 23 ++++++++++++++++--- .../input_prepareresultorg_parameters.json | 7 +++++- .../oozie_app/workflow.xml | 1 + 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java index fe5889c53..92c09fb28 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java @@ -22,6 +22,11 @@ import eu.dnetlib.dhp.schema.oaf.Datasource; import eu.dnetlib.dhp.schema.oaf.Organization; import eu.dnetlib.dhp.schema.oaf.Relation; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; + public class PrepareResultInstRepoAssociation { private static final Logger log = LoggerFactory.getLogger(PrepareResultInstRepoAssociation.class); @@ -51,6 +56,10 @@ public class PrepareResultInstRepoAssociation { final String alreadyLinkedPath = parser.get("alreadyLinkedPath"); log.info("alreadyLinkedPath {}: ", alreadyLinkedPath); + List blacklist = Optional.ofNullable(parser.get("blacklist")) + .map(v -> Arrays.asList(v.split(";"))) + .orElse(new ArrayList<>()); + SparkConf conf = new SparkConf(); conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); @@ -61,7 +70,7 @@ public class PrepareResultInstRepoAssociation { readNeededResources(spark, inputPath); removeOutputDir(spark, datasourceOrganizationPath); - prepareDatasourceOrganization(spark, datasourceOrganizationPath); + prepareDatasourceOrganization(spark, datasourceOrganizationPath, blacklist); removeOutputDir(spark, alreadyLinkedPath); prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath); @@ -80,7 +89,15 @@ public class PrepareResultInstRepoAssociation { } private static void prepareDatasourceOrganization( - SparkSession spark, String datasourceOrganizationPath) { + SparkSession spark, String datasourceOrganizationPath, List blacklist) { + String blacklisted = ""; + if(blacklist.size() > 0 ){ + blacklisted = " AND d.id != '" + blacklist.get(0) + "'"; + for (int i = 1; i < blacklist.size(); i++) { + blacklisted += " AND d.id != '" + blacklist.get(i) + "'"; + } + } + String query = "SELECT source datasourceId, target organizationId " + "FROM ( SELECT id " @@ -88,7 +105,7 @@ public class PrepareResultInstRepoAssociation { + "WHERE datasourcetype.classid = '" + INSTITUTIONAL_REPO_TYPE + "' " - + "AND datainfo.deletedbyinference = false ) d " + + "AND datainfo.deletedbyinference = false " + blacklisted + " ) d " + "JOIN ( SELECT source, target " + "FROM relation " + "WHERE lower(relclass) = '" diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json index c74496350..2f00bacae 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json @@ -28,5 +28,10 @@ "paramLongName": "isSparkSessionManaged", "paramDescription": "the path where prepared info have been stored", "paramRequired": false - } + },{ + "paramName": "bl", + "paramLongName": "blacklist", + "paramDescription": "institutional repositories that should not be considered for the propagation", + "paramRequired": false +} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml index 2fe9a4256..edfff8817 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml @@ -141,6 +141,7 @@ --hive_metastore_uris${hive_metastore_uris} --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --blacklist${blacklist}