diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorPlugin.java
index 7bcc997d0..cd3ff86a0 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorPlugin.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorPlugin.java
@@ -16,8 +16,10 @@
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.dom4j.Document;
 import org.dom4j.DocumentException;
 import org.dom4j.DocumentHelper;
+import org.dom4j.Node;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -45,46 +47,54 @@ public class BaseCollectorPlugin implements CollectorPlugin {
 
 	public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
 		// get path to file
 		final Path filePath = Optional
-			.ofNullable(api.getBaseUrl())
-			.map(Path::new)
-			.orElseThrow(() -> new CollectorException("missing baseUrl"));
+			.ofNullable(api.getBaseUrl())
+			.map(Path::new)
+			.orElseThrow(() -> new CollectorException("missing baseUrl"));
 
 		final String dbUrl = api.getParams().get("dbUrl");
 		final String dbUser = api.getParams().get("dbUser");
 		final String dbPassword = api.getParams().get("dbPassword");
+		final String acceptedNormTypesString = api.getParams().get("acceptedNormTypes");
 
 		log.info("baseUrl: {}", filePath);
 		log.info("dbUrl: {}", dbUrl);
 		log.info("dbUser: {}", dbUser);
 		log.info("dbPassword: {}", "***");
+		log.info("acceptedNormTypes: {}", acceptedNormTypesString);
 
 		try {
-			if (!this.fs.exists(filePath)) {
-				throw new CollectorException("path does not exist: " + filePath);
-			}
+			if (!this.fs.exists(filePath)) { throw new CollectorException("path does not exist: " + filePath); }
 		} catch (final Throwable e) {
 			throw new CollectorException(e);
 		}
 
 		final Set<String> acceptedOpendoarIds = findAcceptedOpendoarIds(dbUrl, dbUser, dbPassword);
+		final Set<String> acceptedNormTypes = new HashSet<>();
+		if (StringUtils.isNotBlank(acceptedNormTypesString)) {
+			for (final String s : StringUtils.split(acceptedNormTypesString, ",")) {
+				if (StringUtils.isNotBlank(s)) {
+					acceptedNormTypes.add(s.trim());
+				}
+			}
+		}
+
 		final Iterator<String> iterator = new BaseCollectorIterator(this.fs, filePath, report);
 		final Spliterator<String> spliterator = Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED);
 
 		return StreamSupport
-			.stream(spliterator, false)
-			.filter(doc -> filterXml(doc, acceptedOpendoarIds, report));
+			.stream(spliterator, false)
+			.filter(doc -> filterXml(doc, acceptedOpendoarIds, acceptedNormTypes));
 	}
 
 	private Set<String> findAcceptedOpendoarIds(final String dbUrl, final String dbUser, final String dbPassword)
-		throws CollectorException {
+		throws CollectorException {
 		final Set<String> accepted = new HashSet<>();
 
 		try (final DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
 			final String sql = IOUtils
-				.toString(
-					BaseAnalyzerJob.class
-						.getResourceAsStream("/eu/dnetlib/dhp/collection/plugin/base/sql/opendoar-accepted.sql"));
+				.toString(BaseAnalyzerJob.class
+					.getResourceAsStream("/eu/dnetlib/dhp/collection/plugin/base/sql/opendoar-accepted.sql"));
 
 			dbClient.processResults(sql, row -> {
 				try {
@@ -107,13 +117,24 @@ public class BaseCollectorPlugin implements CollectorPlugin {
 		return accepted;
 	}
 
-	private boolean filterXml(final String xml, final Set<String> acceptedOpendoarIds, final AggregatorReport report) {
+	protected static boolean filterXml(final String xml,
+		final Set<String> acceptedOpendoarIds,
+		final Set<String> acceptedNormTypes) {
 		try {
-			final String id = DocumentHelper
-				.parseText(xml)
-				.valueOf("//*[local-name()='collection']/@opendoar_id")
-				.trim();
-			return (StringUtils.isNotBlank(id) && acceptedOpendoarIds.contains("opendoar____::" + id.trim()));
+
+			final Document doc = DocumentHelper.parseText(xml);
+
+			final String id = doc.valueOf("//*[local-name()='collection']/@opendoar_id").trim();
+
+			if (StringUtils.isBlank(id) || !acceptedOpendoarIds.contains("opendoar____::" + id)) { return false; }
+
+			if (acceptedNormTypes.isEmpty()) { return true; }
+
+			for (final Object s : doc.selectNodes("//*[local-name()='typenorm']")) {
+				if (acceptedNormTypes.contains(((Node) s).getText().trim())) { return true; }
+			}
+
+			return false;
 		} catch (final DocumentException e) {
 			log.error("Error parsing document", e);
 			throw new RuntimeException("Error parsing document", e);
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorPluginTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorPluginTest.java
new file mode 100644
index 000000000..306cc4303
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorPluginTest.java
@@ -0,0 +1,31 @@
+package eu.dnetlib.dhp.collection.plugin.base;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.junit.jupiter.api.Test;
+
+class BaseCollectorPluginTest {
+
+	@Test
+	void testFilterXml() throws Exception {
+		final String xml = IOUtils.toString(getClass().getResourceAsStream("record.xml"));
+
+		final Set<String> validIds = new HashSet<>(Arrays.asList("opendoar____::1234", "opendoar____::4567"));
+		final Set<String> validTypes = new HashSet<>(Arrays.asList("1", "121"));
+		final Set<String> validTypes2 = new HashSet<>(Arrays.asList("1", "11"));
+
+		assertTrue(BaseCollectorPlugin.filterXml(xml, validIds, validTypes));
+		assertTrue(BaseCollectorPlugin.filterXml(xml, validIds, new HashSet<>()));
+
+		assertFalse(BaseCollectorPlugin.filterXml(xml, new HashSet<>(), validTypes));
+		assertFalse(BaseCollectorPlugin.filterXml(xml, validIds, validTypes2));
+
+	}
+
+}