From fe36895c53abd194a3dfc30b8edf4506ba8463e7 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 22 Jan 2021 11:55:10 +0100 Subject: [PATCH 1/4] added datasource blacklist for the organization to result propagation through institutional repositories --- .../PrepareResultInstRepoAssociation.java | 23 ++++++++++++++++--- .../input_prepareresultorg_parameters.json | 7 +++++- .../oozie_app/workflow.xml | 1 + 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java index fe5889c53f..92c09fb281 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java @@ -22,6 +22,11 @@ import eu.dnetlib.dhp.schema.oaf.Datasource; import eu.dnetlib.dhp.schema.oaf.Organization; import eu.dnetlib.dhp.schema.oaf.Relation; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; + public class PrepareResultInstRepoAssociation { private static final Logger log = LoggerFactory.getLogger(PrepareResultInstRepoAssociation.class); @@ -51,6 +56,10 @@ public class PrepareResultInstRepoAssociation { final String alreadyLinkedPath = parser.get("alreadyLinkedPath"); log.info("alreadyLinkedPath {}: ", alreadyLinkedPath); + List blacklist = Optional.ofNullable(parser.get("blacklist")) + .map(v -> Arrays.asList(v.split(";"))) + .orElse(new ArrayList<>()); + SparkConf conf = new SparkConf(); conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); @@ -61,7 +70,7 @@ public class PrepareResultInstRepoAssociation { readNeededResources(spark, inputPath); removeOutputDir(spark, datasourceOrganizationPath); - prepareDatasourceOrganization(spark, datasourceOrganizationPath); + prepareDatasourceOrganization(spark, datasourceOrganizationPath, blacklist); removeOutputDir(spark, alreadyLinkedPath); prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath); @@ -80,7 +89,15 @@ public class PrepareResultInstRepoAssociation { } private static void prepareDatasourceOrganization( - SparkSession spark, String datasourceOrganizationPath) { + SparkSession spark, String datasourceOrganizationPath, List blacklist) { + String blacklisted = ""; + if(blacklist.size() > 0 ){ + blacklisted = " AND d.id != '" + blacklist.get(0) + "'"; + for (int i = 1; i < blacklist.size(); i++) { + blacklisted += " AND d.id != '" + blacklist.get(i) + "'"; + } + } + String query = "SELECT source datasourceId, target organizationId " + "FROM ( SELECT id " @@ -88,7 +105,7 @@ public class PrepareResultInstRepoAssociation { + "WHERE datasourcetype.classid = '" + INSTITUTIONAL_REPO_TYPE + "' " - + "AND datainfo.deletedbyinference = false ) d " + + "AND datainfo.deletedbyinference = false " + blacklisted + " ) d " + "JOIN ( SELECT source, target " + "FROM relation " + "WHERE lower(relclass) = '" diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json index c744963503..2f00bacae3 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json @@ -28,5 +28,10 @@ "paramLongName": "isSparkSessionManaged", "paramDescription": "the path where prepared info have been stored", "paramRequired": false - } + },{ + "paramName": "bl", + "paramLongName": "blacklist", + "paramDescription": "institutional repositories that should not be considered for the propagation", + "paramRequired": false +} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml index 2fe9a4256b..edfff8817d 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml @@ -141,6 +141,7 @@ --hive_metastore_uris${hive_metastore_uris} --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --blacklist${blacklist} From 4ae6fba01d40f53ad09b7cf29768abce8a170374 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 23 Apr 2021 12:09:19 +0200 Subject: [PATCH 2/4] refactoring --- .../main/java/eu/dnetlib/dhp/schema/oaf/Relation.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java index 8825d71378..adfc6af95a 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java @@ -1,8 +1,6 @@ package eu.dnetlib.dhp.schema.oaf; -import eu.dnetlib.dhp.schema.common.ModelSupport; - import static com.google.common.base.Preconditions.checkArgument; import java.text.ParseException; @@ -10,6 +8,8 @@ import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; +import eu.dnetlib.dhp.schema.common.ModelSupport; + /** * Relation models any edge between two nodes in the OpenAIRE graph. It has a source id and a target id pointing to * graph node identifiers and it is further characterised by the semantic of the link through the fields relType, @@ -137,7 +137,10 @@ public class Relation extends Oaf { try { setValidationDate(ModelSupport.oldest(getValidationDate(), r.getValidationDate())); } catch (ParseException e) { - throw new IllegalArgumentException(String.format("invalid validation date format in relation [s:%s, t:%s]: %s", getSource(), getTarget(), getValidationDate())); + throw new IllegalArgumentException(String + .format( + "invalid validation date format in relation [s:%s, t:%s]: %s", getSource(), getTarget(), + getValidationDate())); } super.mergeFrom(r); From 72e5aa3b42ff8011d19d81eb11805ee8d5f594e1 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 23 Apr 2021 12:10:30 +0200 Subject: [PATCH 3/4] refactoring --- .../PrepareResultInstRepoAssociation.java | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java index 92c09fb281..a41399627a 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java @@ -4,6 +4,11 @@ package eu.dnetlib.dhp.resulttoorganizationfrominstrepo; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -22,11 +27,6 @@ import eu.dnetlib.dhp.schema.oaf.Datasource; import eu.dnetlib.dhp.schema.oaf.Organization; import eu.dnetlib.dhp.schema.oaf.Relation; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Optional; - public class PrepareResultInstRepoAssociation { private static final Logger log = LoggerFactory.getLogger(PrepareResultInstRepoAssociation.class); @@ -56,9 +56,10 @@ public class PrepareResultInstRepoAssociation { final String alreadyLinkedPath = parser.get("alreadyLinkedPath"); log.info("alreadyLinkedPath {}: ", alreadyLinkedPath); - List blacklist = Optional.ofNullable(parser.get("blacklist")) - .map(v -> Arrays.asList(v.split(";"))) - .orElse(new ArrayList<>()); + List blacklist = Optional + .ofNullable(parser.get("blacklist")) + .map(v -> Arrays.asList(v.split(";"))) + .orElse(new ArrayList<>()); SparkConf conf = new SparkConf(); conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); @@ -91,14 +92,13 @@ public class PrepareResultInstRepoAssociation { private static void prepareDatasourceOrganization( SparkSession spark, String datasourceOrganizationPath, List blacklist) { String blacklisted = ""; - if(blacklist.size() > 0 ){ + if (blacklist.size() > 0) { blacklisted = " AND d.id != '" + blacklist.get(0) + "'"; for (int i = 1; i < blacklist.size(); i++) { blacklisted += " AND d.id != '" + blacklist.get(i) + "'"; } } - String query = "SELECT source datasourceId, target organizationId " + "FROM ( SELECT id " + "FROM datasource " From dc0ad8d2e09450b40fc14f3caafbda861d5b25fc Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 20 May 2021 14:53:53 +0200 Subject: [PATCH 4/4] fixed issue related to change in the file name downloaded. Added sheet name as parameter and also a check if the name should change --- .../dhp/actionmanager/project/utils/EXCELParser.java | 10 +++++++--- .../dhp/actionmanager/project/utils/ReadExcel.java | 7 ++++--- .../dhp/actionmanager/project/oozie_app/workflow.xml | 1 + .../dnetlib/dhp/actionmanager/project/parameters.json | 5 +++++ 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java index 0f83499e4e..cc18c6f54c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java @@ -22,7 +22,7 @@ import org.apache.poi.xssf.usermodel.XSSFWorkbook; */ public class EXCELParser { - public List parse(InputStream file, String classForName) + public List parse(InputStream file, String classForName, String sheetName) throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException, InvalidFormatException { @@ -30,7 +30,11 @@ public class EXCELParser { OPCPackage pkg = OPCPackage.open(file); XSSFWorkbook wb = new XSSFWorkbook(pkg); - XSSFSheet sheet = wb.getSheet("cordisref-H2020topics"); + XSSFSheet sheet = wb.getSheet(sheetName); + + if(sheetName == null){ + throw new RuntimeException("Sheet name " + sheetName + " not present in current file"); + } List ret = new ArrayList<>(); @@ -49,7 +53,7 @@ public class EXCELParser { headers.add(dataFormatter.formatCellValue(cell)); } } else { - Class clazz = Class.forName("eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic"); + Class clazz = Class.forName(classForName); final Object cc = clazz.newInstance(); for (int i = 0; i < headers.size(); i++) { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java index 23b58f2a01..7644ba04c9 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java @@ -42,19 +42,20 @@ public class ReadExcel implements Closeable { final String hdfsPath = parser.get("hdfsPath"); final String hdfsNameNode = parser.get("hdfsNameNode"); final String classForName = parser.get("classForName"); + final String sheetName = parser.get("sheetName"); try (final ReadExcel readExcel = new ReadExcel(hdfsPath, hdfsNameNode, fileURL)) { log.info("Getting Excel file..."); - readExcel.execute(classForName); + readExcel.execute(classForName, sheetName); } } - public void execute(final String classForName) throws Exception { + public void execute(final String classForName, final String sheetName) throws Exception { EXCELParser excelParser = new EXCELParser(); excelParser - .parse(excelFile, classForName) + .parse(excelFile, classForName, sheetName) .stream() .forEach(p -> write(p)); diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/workflow.xml index c710c8b553..8ce5818851 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/workflow.xml @@ -65,6 +65,7 @@ --hdfsNameNode${nameNode} --fileURL${topicFileURL} --hdfsPath${workingDir}/topic + --sheetName${sheetName} --classForNameeu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/parameters.json index dd3de70f6e..b6c9c94b94 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/parameters.json @@ -23,6 +23,11 @@ "paramLongName" : "classForName", "paramDescription" : "the name of the class to deserialize the csv to", "paramRequired" : true +}, { + "paramName": "sn", + "paramLongName" : "sheetName", + "paramDescription" : "the name of the sheet in case the file is excel", + "paramRequired" : false }