From 26e1baddedc95ee93ba7577ef34b29e9b09940ed Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 19 Sep 2022 11:19:10 +0200 Subject: [PATCH] added instance.url syntactical validation, avoid creating multiple duplicated URLs --- dhp-workflows/dhp-graph-mapper/pom.xml | 5 +++ .../raw/AbstractMdRecordToOafMapper.java | 15 +++++++ .../dhp/oa/graph/raw/OafToOafMapper.java | 35 +++++++++-------- .../dhp/oa/graph/raw/OdfToOafMapper.java | 39 ++++++++++++------- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 2 +- pom.xml | 6 +++ 6 files changed, 71 insertions(+), 31 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index 687f0de667..f579a7d2bf 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -57,6 +57,11 @@ commons-io + + commons-validator + commons-validator + + org.apache.spark spark-core_2.11 diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 5cfb22cb91..a8d09e4a7f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -10,9 +10,13 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT; import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; +import java.net.MalformedURLException; +import java.net.URL; import java.util.*; +import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.validator.routines.UrlValidator; import org.dom4j.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -617,4 +621,15 @@ public abstract class AbstractMdRecordToOafMapper { return res; } + protected Set validateUrl(Collection url) { + UrlValidator urlValidator = UrlValidator.getInstance(); + if (Objects.isNull(url)) { + return new HashSet<>(); + } + return url + .stream() + .filter(u -> urlValidator.isValid(u)) + .collect(Collectors.toCollection(HashSet::new)); + } + } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 9225e174d3..30f3935f5f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -159,22 +159,25 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); final List nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier")); - instance - .setUrl( - nodes - .stream() - .filter(n -> StringUtils.isNotBlank(n.getText())) - .map(n -> n.getText().trim()) - .filter(u -> u.startsWith("http")) - .map(s -> { - try { - return URLDecoder.decode(s, "UTF-8"); - } catch (Throwable t) { - return s; - } - }) - .distinct() - .collect(Collectors.toCollection(ArrayList::new))); + final List url = nodes + .stream() + .filter(n -> StringUtils.isNotBlank(n.getText())) + .map(n -> n.getText().trim()) + .filter(u -> u.startsWith("http")) + .map(s -> { + try { + return URLDecoder.decode(s, "UTF-8"); + } catch (Throwable t) { + return s; + } + }) + .distinct() + .collect(Collectors.toCollection(ArrayList::new)); + final Set validUrl = validateUrl(url); + if (!validUrl.isEmpty()) { + instance.setUrl(new ArrayList<>()); + instance.getUrl().addAll(validUrl); + } return Lists.newArrayList(instance); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index d6bfe67142..5781988e62 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -6,11 +6,14 @@ import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty; import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; +import java.net.URL; import java.net.URLDecoder; import java.util.*; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.validator.routines.UrlValidator; import org.dom4j.Document; import org.dom4j.Element; import org.dom4j.Node; @@ -171,23 +174,31 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) { url.add(trimAndDecodeUrl(((Node) o).getText().trim())); } - for (final Object o : doc - .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) { - url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim()); + + Set validUrl = validateUrl(url); + + if (validUrl.stream().noneMatch(s -> s.contains("doi.org"))) { + for (final Object o : doc + .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) { + validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim()); + } + for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) { + validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim()); + } } - for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) { - url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim()); + if (validUrl.stream().noneMatch(s -> s.contains("hdl.handle.net"))) { + for (final Object o : doc + .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) { + validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim()); + } + for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) { + validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim()); + } } - for (final Object o : doc - .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) { - url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim()); - } - for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) { - url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim()); - } - if (!url.isEmpty()) { + + if (!validUrl.isEmpty()) { instance.setUrl(new ArrayList<>()); - instance.getUrl().addAll(url); + instance.getUrl().addAll(validUrl); } return Arrays.asList(instance); } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 231d5b0ac4..64b68e6af1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -950,7 +950,7 @@ class MappersTest { @Test void testNotWellFormed() throws IOException { final String xml = IOUtils - .toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml"))); + .toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml"))); final List actual = new OafToOafMapper(vocs, false, true).processMdRecord(xml); assertNotNull(actual); assertTrue(actual.isEmpty()); diff --git a/pom.xml b/pom.xml index ab59e7be3d..a1b26966e1 100644 --- a/pom.xml +++ b/pom.xml @@ -200,6 +200,12 @@ ${dhp.commons.lang.version} + + commons-validator + commons-validator + 1.7 + + com.github.sisyphsu dateparser