diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml
index 687f0de667..f579a7d2bf 100644
--- a/dhp-workflows/dhp-graph-mapper/pom.xml
+++ b/dhp-workflows/dhp-graph-mapper/pom.xml
@@ -57,6 +57,11 @@
commons-io
+ <dependency>
+     <groupId>commons-validator</groupId>
+     <artifactId>commons-validator</artifactId>
+ </dependency>
+
org.apache.spark
spark-core_2.11
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
index 5cfb22cb91..a8d09e4a7f 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
@@ -10,9 +10,13 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
+import java.net.MalformedURLException;
+import java.net.URL;
import java.util.*;
+import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.validator.routines.UrlValidator;
import org.dom4j.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -617,4 +621,15 @@ public abstract class AbstractMdRecordToOafMapper {
return res;
}
+ /**
+  * Keeps only the URLs accepted by the shared Apache Commons {@link UrlValidator} instance.
+  *
+  * The result is a fresh, mutable set (callers append further URLs to it) that preserves the
+  * encounter order of the input, so the URL list built from it does not depend on hash ordering.
+  *
+  * @param url the candidate URLs; may be {@code null}
+  * @return the valid URLs without duplicates; empty, never {@code null}, when {@code url} is
+  *         {@code null} or contains no valid entry
+  */
+ protected Set<String> validateUrl(Collection<String> url) {
+     if (Objects.isNull(url)) {
+         return new LinkedHashSet<>();
+     }
+     // Only look up the validator once there is something to validate.
+     final UrlValidator urlValidator = UrlValidator.getInstance();
+     return url
+         .stream()
+         .filter(urlValidator::isValid)
+         .collect(Collectors.toCollection(LinkedHashSet::new));
+ }
+
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java
index 9225e174d3..30f3935f5f 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java
@@ -159,22 +159,25 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
final List nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
- instance
- .setUrl(
- nodes
- .stream()
- .filter(n -> StringUtils.isNotBlank(n.getText()))
- .map(n -> n.getText().trim())
- .filter(u -> u.startsWith("http"))
- .map(s -> {
- try {
- return URLDecoder.decode(s, "UTF-8");
- } catch (Throwable t) {
- return s;
- }
- })
- .distinct()
- .collect(Collectors.toCollection(ArrayList::new)));
+ final List url = nodes
+ .stream()
+ .filter(n -> StringUtils.isNotBlank(n.getText()))
+ .map(n -> n.getText().trim())
+ .filter(u -> u.startsWith("http"))
+ .map(s -> {
+ try {
+ return URLDecoder.decode(s, "UTF-8");
+ } catch (Throwable t) {
+ return s;
+ }
+ })
+ .distinct()
+ .collect(Collectors.toCollection(ArrayList::new));
+ final Set validUrl = validateUrl(url);
+ if (!validUrl.isEmpty()) {
+ instance.setUrl(new ArrayList<>());
+ instance.getUrl().addAll(validUrl);
+ }
return Lists.newArrayList(instance);
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
index d6bfe67142..5781988e62 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
@@ -6,11 +6,14 @@ import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
import java.io.UnsupportedEncodingException;
+import java.net.MalformedURLException;
+import java.net.URL;
import java.net.URLDecoder;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.validator.routines.UrlValidator;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.Node;
@@ -171,23 +174,31 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) {
url.add(trimAndDecodeUrl(((Node) o).getText().trim()));
}
- for (final Object o : doc
- .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) {
- url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
+
+ Set validUrl = validateUrl(url);
+
+ if (validUrl.stream().noneMatch(s -> s.contains("doi.org"))) {
+ for (final Object o : doc
+ .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) {
+ validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
+ }
+ for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) {
+ validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
+ }
}
- for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) {
- url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
+ if (validUrl.stream().noneMatch(s -> s.contains("hdl.handle.net"))) {
+ for (final Object o : doc
+ .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) {
+ validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
+ }
+ for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) {
+ validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
+ }
}
- for (final Object o : doc
- .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) {
- url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
- }
- for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) {
- url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
- }
- if (!url.isEmpty()) {
+
+ if (!validUrl.isEmpty()) {
instance.setUrl(new ArrayList<>());
- instance.getUrl().addAll(url);
+ instance.getUrl().addAll(validUrl);
}
return Arrays.asList(instance);
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
index 231d5b0ac4..64b68e6af1 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
@@ -950,7 +950,7 @@ class MappersTest {
@Test
void testNotWellFormed() throws IOException {
final String xml = IOUtils
- .toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml")));
+ .toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml")));
final List actual = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
assertNotNull(actual);
assertTrue(actual.isEmpty());
diff --git a/pom.xml b/pom.xml
index ab59e7be3d..a1b26966e1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -200,6 +200,12 @@
${dhp.commons.lang.version}
+ <dependency>
+     <groupId>commons-validator</groupId>
+     <artifactId>commons-validator</artifactId>
+     <version>1.7</version>
+ </dependency>
+
com.github.sisyphsu
dateparser