reports for types

This commit is contained in:
Michele Artini 2024-03-01 11:43:37 +01:00
parent 71204a8056
commit 32f4d6f691
3 changed files with 53 additions and 0 deletions

View File

@ -42,6 +42,7 @@ import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import scala.Tuple2;
public class BaseAnalyzerJob {
@ -77,6 +78,9 @@ public class BaseAnalyzerJob {
final String opendoarPath = parser.get("opendoarPath");
log.info("opendoarPath {}: ", opendoarPath);
final String typesReportPath = parser.get("typesReportPath");
log.info("typesReportPath {}: ", typesReportPath);
final int fromStep = Integer.parseInt(parser.get("fromStep"));
log.info("fromStep {}: ", fromStep);
@ -115,10 +119,48 @@ public class BaseAnalyzerJob {
log
.info("\n**************************************\n* EXECUTING STEP 2: DONE\n**************************************");
}
if (fromStep <= 3) {
log
.info("\n**************************************\n* EXECUTING STEP 3: Type Vocabulary Report\n**************************************");
generateVocTypeReport(spark, outputPath, typesReportPath);
log
.info("\n**************************************\n* EXECUTING STEP 3: DONE\n**************************************");
}
});
}
/**
 * Step 3: builds the type-vocabulary report. Reads the previously generated
 * record report (parquet at reportPath), and for each record pairs every
 * normalized type value ("TYPE_NORM:" prefix) with every raw type value
 * ("TYPE:" prefix) found on the same record. The distinct pairs are saved
 * as a parquet dataset at typesReportPath (overwriting any previous run).
 *
 * @param spark           the active Spark session
 * @param reportPath      parquet path of the BaseRecordInfo report to read
 * @param typesReportPath parquet path where the (normalized, raw) type pairs are written
 */
private static void generateVocTypeReport(final SparkSession spark,
	final String reportPath,
	final String typesReportPath) {
	spark
		.read()
		.parquet(reportPath)
		.as(Encoders.bean(BaseRecordInfo.class))
		.flatMap(record -> {
			final List<Tuple2<String, String>> pairs = new ArrayList<>();
			for (final String norm : record.getTypes()) {
				// guard: only normalized type entries drive the pairing
				if (!norm.startsWith("TYPE_NORM:")) {
					continue;
				}
				// hoist the normalized value: it is constant across the inner loop
				final String normValue = StringUtils.substringAfter(norm, "TYPE_NORM:").trim();
				for (final String raw : record.getTypes()) {
					if (raw.startsWith("TYPE:")) {
						pairs.add(new Tuple2<>(normValue, StringUtils.substringAfter(raw, "TYPE:").trim()));
					}
				}
			}
			return pairs.iterator();
		}, Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
		.distinct()
		.write()
		.mode(SaveMode.Overwrite)
		.format("parquet")
		.save(typesReportPath);
}
private static void generateOpenDoarReport(final SparkSession spark,
final String reportPath,
final String opendoarPath,

View File

@ -23,6 +23,12 @@
"paramDescription": "the path of the generated OpenDOAR report",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "typesReportPath",
"paramDescription": "the path of the generated types report",
"paramRequired": true
},
{
"paramName": "f",
"paramLongName": "fromStep",

View File

@ -16,6 +16,10 @@
<name>baseOpenDoarReportsPath</name>
<description>path where to store the OpenDOAR reports</description>
</property>
<property>
<name>baseTypesReportPath</name>
<description>the path of the generated types report</description>
</property>
<property>
<name>postgresURL</name>
<description>the postgres URL to access to the database</description>
@ -61,6 +65,7 @@
<arg>--dataPath</arg><arg>${baseDataPath}</arg>
<arg>--outputPath</arg><arg>${baseReportsPath}</arg>
<arg>--opendoarPath</arg><arg>${baseOpenDoarReportsPath}</arg>
<arg>--typesReportPath</arg><arg>${baseTypesReportPath}</arg>
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>