reports for types
This commit is contained in:
parent
71204a8056
commit
32f4d6f691
|
@ -42,6 +42,7 @@ import org.slf4j.LoggerFactory;
|
|||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.DbClient;
|
||||
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class BaseAnalyzerJob {
|
||||
|
||||
|
@ -77,6 +78,9 @@ public class BaseAnalyzerJob {
|
|||
final String opendoarPath = parser.get("opendoarPath");
|
||||
log.info("opendoarPath {}: ", opendoarPath);
|
||||
|
||||
final String typesReportPath = parser.get("typesReportPath");
|
||||
log.info("typesReportPath {}: ", typesReportPath);
|
||||
|
||||
final int fromStep = Integer.parseInt(parser.get("fromStep"));
|
||||
log.info("fromStep {}: ", fromStep);
|
||||
|
||||
|
@ -115,10 +119,48 @@ public class BaseAnalyzerJob {
|
|||
log
|
||||
.info("\n**************************************\n* EXECUTING STEP 2: DONE\n**************************************");
|
||||
}
|
||||
|
||||
if (fromStep <= 3) {
|
||||
log
|
||||
.info("\n**************************************\n* EXECUTING STEP 3: Type Vocabulary Report\n**************************************");
|
||||
generateVocTypeReport(spark, outputPath, typesReportPath);
|
||||
log
|
||||
.info("\n**************************************\n* EXECUTING STEP 3: DONE\n**************************************");
|
||||
}
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static void generateVocTypeReport(final SparkSession spark,
|
||||
final String reportPath,
|
||||
final String typesReportPath) {
|
||||
spark
|
||||
.read()
|
||||
.parquet(reportPath)
|
||||
.as(Encoders.bean(BaseRecordInfo.class))
|
||||
.flatMap(rec -> {
|
||||
final List<Tuple2<String, String>> list = new ArrayList<>();
|
||||
for (final String t1 : rec.getTypes()) {
|
||||
if (t1.startsWith("TYPE_NORM:")) {
|
||||
for (final String t2 : rec.getTypes()) {
|
||||
if (t2.startsWith("TYPE:")) {
|
||||
list
|
||||
.add(new Tuple2<>(StringUtils.substringAfter(t1, "TYPE_NORM:").trim(),
|
||||
StringUtils.substringAfter(t2, "TYPE:").trim()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return list.iterator();
|
||||
}, Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
|
||||
.distinct()
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.format("parquet")
|
||||
.save(typesReportPath);
|
||||
|
||||
}
|
||||
|
||||
private static void generateOpenDoarReport(final SparkSession spark,
|
||||
final String reportPath,
|
||||
final String opendoarPath,
|
||||
|
|
|
@ -23,6 +23,12 @@
|
|||
"paramDescription": "the path of the generated OpenDOAR report",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "t",
|
||||
"paramLongName": "typesReportPath",
|
||||
"paramDescription": "the path of the generated types report",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "f",
|
||||
"paramLongName": "fromStep",
|
||||
|
|
|
@ -16,6 +16,10 @@
|
|||
<name>baseOpenDoarReportsPath</name>
|
||||
<description>path where to store the OpenDOAR reports</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>baseTypesReportPath</name>
|
||||
<description>path of the generated types report</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresURL</name>
|
||||
<description>the postgres URL to access the database</description>
|
||||
|
@ -61,6 +65,7 @@
|
|||
<arg>--dataPath</arg><arg>${baseDataPath}</arg>
|
||||
<arg>--outputPath</arg><arg>${baseReportsPath}</arg>
|
||||
<arg>--opendoarPath</arg><arg>${baseOpenDoarReportsPath}</arg>
|
||||
<arg>--typesReportPath</arg><arg>${baseTypesReportPath}</arg>
|
||||
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
|
||||
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
|
||||
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
|
||||
|
|
Loading…
Reference in New Issue