selection of the new plugin

Michele Artini 2024-03-04 10:43:40 +01:00
parent 9506d80ddc
commit db6f774394
4 changed files with 146 additions and 119 deletions

View File

@@ -19,6 +19,7 @@ import org.slf4j.LoggerFactory;
 import eu.dnetlib.dhp.aggregation.common.ReporterCallback;
 import eu.dnetlib.dhp.aggregation.common.ReportingJob;
 import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
+import eu.dnetlib.dhp.collection.plugin.base.BaseCollectorPlugin;
 import eu.dnetlib.dhp.collection.plugin.file.FileCollectorPlugin;
 import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin;
 import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin;
@@ -57,7 +58,7 @@ public class CollectorWorker extends ReportingJob {
     public void collect() throws UnknownCollectorPluginException, CollectorException, IOException {
-        final String outputPath = mdStoreVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
+        final String outputPath = this.mdStoreVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
         log.info("outputPath path is {}", outputPath);

         final CollectorPlugin plugin = getCollectorPlugin();
@@ -67,36 +68,36 @@ public class CollectorWorker extends ReportingJob {
         try (SequenceFile.Writer writer = SequenceFile
             .createWriter(
-                fileSystem.getConf(),
-                SequenceFile.Writer.file(new Path(outputPath)),
-                SequenceFile.Writer.keyClass(IntWritable.class),
-                SequenceFile.Writer.valueClass(Text.class),
+                this.fileSystem.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer
+                    .keyClass(IntWritable.class),
+                SequenceFile.Writer
+                    .valueClass(Text.class),
                 SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
             final IntWritable key = new IntWritable(counter.get());
             final Text value = new Text();
             plugin
-                .collect(api, report)
-                .forEach(
-                    content -> {
+                .collect(this.api, this.report)
+                .forEach(content -> {
                     key.set(counter.getAndIncrement());
                     value.set(content);
                     try {
                         writer.append(key, value);
-                    } catch (Throwable e) {
+                    } catch (final Throwable e) {
                         throw new RuntimeException(e);
                     }
                 });
-        } catch (Throwable e) {
-            report.put(e.getClass().getName(), e.getMessage());
+        } catch (final Throwable e) {
+            this.report.put(e.getClass().getName(), e.getMessage());
             throw new CollectorException(e);
         } finally {
             shutdown();
-            report.ongoing(counter.longValue(), counter.longValue());
+            this.report.ongoing(counter.longValue(), counter.longValue());
         }
     }

-    private void scheduleReport(AtomicInteger counter) {
+    private void scheduleReport(final AtomicInteger counter) {
         schedule(new ReporterCallback() {
             @Override
             public Long getCurrent() {
                 return counter.longValue();
@@ -111,31 +112,33 @@ public class CollectorWorker extends ReportingJob {
     private CollectorPlugin getCollectorPlugin() throws UnknownCollectorPluginException {
-        switch (CollectorPlugin.NAME.valueOf(api.getProtocol())) {
+        switch (CollectorPlugin.NAME.valueOf(this.api.getProtocol())) {
         case oai:
-            return new OaiCollectorPlugin(clientParams);
+            return new OaiCollectorPlugin(this.clientParams);
         case rest_json2xml:
-            return new RestCollectorPlugin(clientParams);
+            return new RestCollectorPlugin(this.clientParams);
         case file:
-            return new FileCollectorPlugin(fileSystem);
+            return new FileCollectorPlugin(this.fileSystem);
         case fileGzip:
-            return new FileGZipCollectorPlugin(fileSystem);
+            return new FileGZipCollectorPlugin(this.fileSystem);
+        case baseDump:
+            return new BaseCollectorPlugin(this.fileSystem);
         case other:
             final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
-                .ofNullable(api.getParams().get("other_plugin_type"))
+                .ofNullable(this.api.getParams().get("other_plugin_type"))
                .map(CollectorPlugin.NAME.OTHER_NAME::valueOf)
                .orElseThrow(() -> new IllegalArgumentException("invalid other_plugin_type"));
             switch (plugin) {
             case mdstore_mongodb_dump:
-                return new MongoDbDumpCollectorPlugin(fileSystem);
+                return new MongoDbDumpCollectorPlugin(this.fileSystem);
             case mdstore_mongodb:
                 return new MDStoreCollectorPlugin();
             default:
                 throw new UnknownCollectorPluginException("plugin is not managed: " + plugin);
             }
         default:
-            throw new UnknownCollectorPluginException("protocol is not managed: " + api.getProtocol());
+            throw new UnknownCollectorPluginException("protocol is not managed: " + this.api.getProtocol());
         }
     }
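
With the new case baseDump branch, the worker resolves the BASE plugin purely from the protocol string carried by the API descriptor. A minimal sketch of that dispatch, assuming the ApiDescriptor accessors available elsewhere in this module (not shown in this diff):

    // Sketch only, not part of the commit: the protocol string must match
    // the enum constant exactly, since Enum.valueOf is case-sensitive.
    final ApiDescriptor api = new ApiDescriptor(); // accessors assumed from context
    api.setProtocol("baseDump");
    final CollectorPlugin.NAME name = CollectorPlugin.NAME.valueOf(api.getProtocol());
    // name == CollectorPlugin.NAME.baseDump, so getCollectorPlugin()
    // returns new BaseCollectorPlugin(this.fileSystem)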

View File

@@ -10,7 +10,8 @@ import eu.dnetlib.dhp.common.collection.CollectorException;
 public interface CollectorPlugin {

     enum NAME {

-        oai, other, rest_json2xml, file, fileGzip;
+        oai, other, rest_json2xml, file, fileGzip, baseDump;

         public enum OTHER_NAME {
             mdstore_mongodb_dump, mdstore_mongodb
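
Note that valueOf is strict: a protocol string that does not exactly match one of these constants fails before the switch in CollectorWorker is even entered. A small sketch of that failure mode:

    // Sketch: Enum.valueOf throws for undeclared names, so a typo such as
    // "basedump" never reaches the "protocol is not managed" default branch.
    try {
        CollectorPlugin.NAME.valueOf("basedump"); // wrong case
    } catch (final IllegalArgumentException e) {
        // expected: "No enum constant ... NAME.basedump"
    }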

View File

@@ -52,7 +52,8 @@ public class BaseAnalyzerJob {
     public static void main(final String[] args) throws Exception {
         final String jsonConfiguration = IOUtils
-            .toString(BaseAnalyzerJob.class
+            .toString(
+                BaseAnalyzerJob.class
                 .getResourceAsStream("/eu/dnetlib/dhp/collection/plugin/base/action_set_parameters.json"));
         final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
@@ -98,34 +99,42 @@ public class BaseAnalyzerJob {
         runWithSparkSession(conf, isSparkSessionManaged, spark -> {
             if (fromStep <= 0) {
                 log
-                    .info("\n**************************************\n* EXECUTING STEP 0: LoadRecords\n**************************************");
+                    .info(
+                        "\n**************************************\n* EXECUTING STEP 0: LoadRecords\n**************************************");
                 loadRecords(inputPath, dataPath);
                 log
-                    .info("\n**************************************\n* EXECUTING STEP 0: DONE\n**************************************");
+                    .info(
+                        "\n**************************************\n* EXECUTING STEP 0: DONE\n**************************************");
             }
             if (fromStep <= 1) {
                 log
-                    .info("\n**************************************\n* EXECUTING STEP 1: Base Report\n**************************************");
+                    .info(
+                        "\n**************************************\n* EXECUTING STEP 1: Base Report\n**************************************");
                 generateReport(spark, dataPath, outputPath);
                 log
-                    .info("\n**************************************\n* EXECUTING STEP 1: DONE\n**************************************");
+                    .info(
+                        "\n**************************************\n* EXECUTING STEP 1: DONE\n**************************************");
             }
             if (fromStep <= 2) {
                 log
-                    .info("\n**************************************\n* EXECUTING STEP 2: OpenDOAR Report\n**************************************");
+                    .info(
+                        "\n**************************************\n* EXECUTING STEP 2: OpenDOAR Report\n**************************************");
                 generateOpenDoarReport(spark, outputPath, opendoarPath, loadOpenDoarStats(dbUrl, dbUser, dbPassword));
                 log
-                    .info("\n**************************************\n* EXECUTING STEP 2: DONE\n**************************************");
+                    .info(
+                        "\n**************************************\n* EXECUTING STEP 2: DONE\n**************************************");
             }
             if (fromStep <= 3) {
                 log
-                    .info("\n**************************************\n* EXECUTING STEP 3: Type Vocabulary Report\n**************************************");
+                    .info(
+                        "\n**************************************\n* EXECUTING STEP 3: Type Vocabulary Report\n**************************************");
                 generateVocTypeReport(spark, outputPath, typesReportPath);
                 log
-                    .info("\n**************************************\n* EXECUTING STEP 3: DONE\n**************************************");
+                    .info(
+                        "\n**************************************\n* EXECUTING STEP 3: DONE\n**************************************");
             }
         });
@@ -145,7 +154,8 @@ public class BaseAnalyzerJob {
             for (final String t2 : rec.getTypes()) {
                 if (t2.startsWith("TYPE:")) {
                     list
-                        .add(new Tuple2<>(StringUtils.substringAfter(t1, "TYPE_NORM:").trim(),
+                        .add(
+                            new Tuple2<>(StringUtils.substringAfter(t1, "TYPE_NORM:").trim(),
                                 StringUtils.substringAfter(t2, "TYPE:").trim()));
                 }
             }
@@ -196,8 +206,12 @@ public class BaseAnalyzerJob {
     }

     private static OpenDoarRepoStatus merge(final OpenDoarRepoStatus r1, final OpenDoarRepoStatus r2) {
-        if (r1 == null) { return r2; }
-        if (r2 == null) { return r1; }
+        if (r1 == null) {
+            return r2;
+        }
+        if (r2 == null) {
+            return r1;
+        }
         final OpenDoarRepoStatus r = new OpenDoarRepoStatus();
         r.setId(ObjectUtils.firstNonNull(r1.getId(), r2.getId()));
@@ -219,8 +233,10 @@ public class BaseAnalyzerJob {
         try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
             final String sql = IOUtils
-                .toString(BaseAnalyzerJob.class
-                    .getResourceAsStream("/eu/dnetlib/dhp/collection/plugin/base/sql/opendoar-aggregation-status.sql"));
+                .toString(
+                    BaseAnalyzerJob.class
+                        .getResourceAsStream(
+                            "/eu/dnetlib/dhp/collection/plugin/base/sql/opendoar-aggregation-status.sql"));
             dbClient.processResults(sql, row -> {
                 try {
@@ -264,9 +280,12 @@ public class BaseAnalyzerJob {
         final Text value = new Text();
         try (final SequenceFile.Writer writer = SequenceFile
-            .createWriter(fs.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer
-                .keyClass(LongWritable.class), SequenceFile.Writer
-                .valueClass(Text.class), SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
+            .createWriter(
+                fs.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer
+                    .keyClass(LongWritable.class),
+                SequenceFile.Writer
+                    .valueClass(Text.class),
+                SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
             final BaseCollectorIterator iteraror = new BaseCollectorIterator(fs, new Path(inputPath), report);
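
Both jobs persist records as DEFLATE block-compressed SequenceFiles, and the codec is recorded in the file header, so a reader only needs the path. A minimal sketch of consuming the file written above (fs and outputPath are assumed from the surrounding method):

    // Sketch: reading back the SequenceFile written by the loader step.
    try (SequenceFile.Reader reader = new SequenceFile.Reader(
        fs.getConf(), SequenceFile.Reader.file(new Path(outputPath)))) {
        final LongWritable key = new LongWritable();
        final Text value = new Text();
        while (reader.next(key, value)) {
            // each value holds one BASE record as text
        }
    }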

View File

@@ -59,7 +59,9 @@ public class BaseCollectorPlugin implements CollectorPlugin {
         log.info("dbPassword: {}", "***");

         try {
-            if (!this.fs.exists(filePath)) { throw new CollectorException("path does not exist: " + filePath); }
+            if (!this.fs.exists(filePath)) {
+                throw new CollectorException("path does not exist: " + filePath);
+            }
         } catch (final Throwable e) {
             throw new CollectorException(e);
         }
@@ -73,13 +75,15 @@ public class BaseCollectorPlugin implements CollectorPlugin {
             .filter(doc -> filterXml(doc, acceptedOpendoarIds, report));
     }

-    private Set<String> findAcceptedOpendoarIds(final String dbUrl, final String dbUser, final String dbPassword) throws CollectorException {
+    private Set<String> findAcceptedOpendoarIds(final String dbUrl, final String dbUser, final String dbPassword)
+        throws CollectorException {
         final Set<String> accepted = new HashSet<>();
         try (final DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
             final String sql = IOUtils
-                .toString(BaseAnalyzerJob.class
+                .toString(
+                    BaseAnalyzerJob.class
                     .getResourceAsStream("/eu/dnetlib/dhp/collection/plugin/base/sql/opendoar-accepted.sql"));
             dbClient.processResults(sql, row -> {
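
The hunk ends inside the processResults callback. Following the same pattern as the loadOpenDoarStats call in BaseAnalyzerJob above, the callback would typically accumulate one accepted id per row; a hedged sketch, assuming the row argument is a java.sql.ResultSet and that the SQL exposes an "id" column (both are assumptions, not shown in this diff):

    // Hypothetical continuation of the callback, for illustration only.
    dbClient.processResults(sql, row -> {
        try {
            accepted.add(row.getString("id")); // column name assumed
        } catch (final SQLException e) {
            throw new RuntimeException(e);
        }
    });
    return accepted;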