diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseAnalyzerJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseAnalyzerJob.java index a758807fe..717c0f4a8 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseAnalyzerJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseAnalyzerJob.java @@ -42,6 +42,7 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.DbClient; import eu.dnetlib.dhp.common.aggregation.AggregatorReport; +import scala.Tuple2; public class BaseAnalyzerJob { @@ -77,6 +78,9 @@ public class BaseAnalyzerJob { final String opendoarPath = parser.get("opendoarPath"); log.info("opendoarPath {}: ", opendoarPath); + final String typesReportPath = parser.get("typesReportPath"); + log.info("typesReportPath {}: ", typesReportPath); + final int fromStep = Integer.parseInt(parser.get("fromStep")); log.info("fromStep {}: ", fromStep); @@ -115,10 +119,48 @@ public class BaseAnalyzerJob { log .info("\n**************************************\n* EXECUTING STEP 2: DONE\n**************************************"); } + + if (fromStep <= 3) { + log + .info("\n**************************************\n* EXECUTING STEP 3: Type Vocabulary Report\n**************************************"); + generateVocTypeReport(spark, outputPath, typesReportPath); + log + .info("\n**************************************\n* EXECUTING STEP 3: DONE\n**************************************"); + } }); } + private static void generateVocTypeReport(final SparkSession spark, + final String reportPath, + final String typesReportPath) { + spark + .read() + .parquet(reportPath) + .as(Encoders.bean(BaseRecordInfo.class)) + .flatMap(rec -> { + final List> list = new ArrayList<>(); + for (final String t1 : rec.getTypes()) { + if (t1.startsWith("TYPE_NORM:")) { + for (final String t2 : rec.getTypes()) { + if (t2.startsWith("TYPE:")) { + list + .add(new Tuple2<>(StringUtils.substringAfter(t1, "TYPE_NORM:").trim(), + StringUtils.substringAfter(t2, "TYPE:").trim())); + } + } + } + } + return list.iterator(); + }, Encoders.tuple(Encoders.STRING(), Encoders.STRING())) + .distinct() + .write() + .mode(SaveMode.Overwrite) + .format("parquet") + .save(typesReportPath); + + } + private static void generateOpenDoarReport(final SparkSession spark, final String reportPath, final String opendoarPath, diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/plugin/base/action_set_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/plugin/base/action_set_parameters.json index da453fe4c..9496bedb0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/plugin/base/action_set_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/plugin/base/action_set_parameters.json @@ -23,6 +23,12 @@ "paramDescription": "the path of the generated the OpenDOAR report", "paramRequired": true }, + { + "paramName": "t", + "paramLongName": "typesReportPath", + "paramDescription": "the path of the generated the types report", + "paramRequired": true + }, { "paramName": "f", "paramLongName": "fromStep", diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/plugin/base/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/plugin/base/oozie_app/workflow.xml index 419aacf7c..6f7e313ed 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/plugin/base/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/plugin/base/oozie_app/workflow.xml @@ -16,6 +16,10 @@ baseOpenDoarReportsPath path where to store the OpenDOAR reports + + baseTypesReportPath + path of the generated the types report + postgresURL the postgres URL to access to the database @@ -61,6 +65,7 @@ --dataPath${baseDataPath} --outputPath${baseReportsPath} --opendoarPath${baseOpenDoarReportsPath} + --typesReportPath${baseTypesReportPath} --postgresUrl${postgresURL} --postgresUser${postgresUser} --postgresPassword${postgresPassword}