From cccb16900c15690cb1e22995674d7b8abbbc3010 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 23 Dec 2021 12:33:53 +0100 Subject: [PATCH] https://support.openaire.eu/issues/7330 normalising DOI urls --- .../oa/provision/utils/XmlRecordFactory.java | 17 +++++++++++ .../provision/IndexRecordTransformerTest.java | 29 +++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index af7c7b7c3..9a3f2188b 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -10,6 +10,7 @@ import java.io.IOException; import java.io.Serializable; import java.io.StringReader; import java.io.StringWriter; +import java.net.MalformedURLException; import java.net.URL; import java.util.*; import java.util.stream.Collectors; @@ -22,6 +23,7 @@ import javax.xml.transform.stream.StreamResult; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; +import org.apache.http.protocol.HTTP; import org.apache.spark.util.LongAccumulator; import org.dom4j.Document; import org.dom4j.DocumentException; @@ -55,6 +57,8 @@ public class XmlRecordFactory implements Serializable { * */ private static final long serialVersionUID = 2912912999272373172L; + public static final String DOI_ORG_AUTHORITY = "doi.org"; + public static final String HTTPS = "https"; private final Map accumulators; @@ -1269,6 +1273,7 @@ public class XmlRecordFactory implements Serializable { .getUrl() .stream() .filter(this::isValidUrl) + .map(XmlRecordFactory::normalizeDoiUrl) .collect(Collectors.toList())); return i; }) @@ -1285,6 +1290,18 @@ public class XmlRecordFactory implements Serializable { .map(this::mergeInstances); } + public static String normalizeDoiUrl(String url) { + if (url.contains(DOI_ORG_AUTHORITY)) { + try { + URL u = new URL(url); + return new URL(HTTPS, DOI_ORG_AUTHORITY, u.getFile()).toString(); + } catch (MalformedURLException e) { + e.printStackTrace(); + } + } + return url; + } + private boolean isValidUrl(String url) { try { new URL(url).toURI(); diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java index 64935e79d..86a155292 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java @@ -1,9 +1,14 @@ package eu.dnetlib.dhp.oa.provision; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Arrays; +import java.util.List; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; @@ -109,6 +114,30 @@ public class IndexRecordTransformerTest { testRecordTransformation(record); } + @Test + void testDoiUrlNormalization() throws MalformedURLException { + + // TODO add more test examples when needed + List urls = Arrays + .asList( + "https://dx.doi.org/10.1016/j.jas.2019.105013", + "http://dx.doi.org/10.13140/rg.2.2.26964.65927", + "https://dx.doi.org/10.13140/rg.2.2.26964.65927", + "http://dx.doi.org/10.1016/j.jas.2019.105013", + "http://hdl.handle.net/2072/369223", + "https://doi.org/10.1016/j.jas.2019.105013"); + + for (String url : urls) { + URL u = new URL(XmlRecordFactory.normalizeDoiUrl(url)); + if (url.contains(XmlRecordFactory.DOI_ORG_AUTHORITY)) { + assertEquals(XmlRecordFactory.HTTPS, u.getProtocol()); + assertEquals(XmlRecordFactory.DOI_ORG_AUTHORITY, u.getAuthority()); + } else { + assertEquals(url, u.toString()); + } + } + } + private void testRecordTransformation(final String record) throws IOException, TransformerException { final String fields = IOUtils.toString(getClass().getResourceAsStream("fields.xml")); final String xslt = IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl"));