forked from antonis.lempesis/dnet-hadoop
https://support.openaire.eu/issues/7330 normalising DOI urls
This commit is contained in:
parent
5c4fee3533
commit
cccb16900c
|
@ -10,6 +10,7 @@ import java.io.IOException;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.io.StringWriter;
|
import java.io.StringWriter;
|
||||||
|
import java.net.MalformedURLException;
|
||||||
import java.net.URL;
|
import java.net.URL;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
@ -22,6 +23,7 @@ import javax.xml.transform.stream.StreamResult;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.commons.lang3.tuple.ImmutablePair;
|
import org.apache.commons.lang3.tuple.ImmutablePair;
|
||||||
import org.apache.commons.lang3.tuple.Pair;
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
import org.apache.http.protocol.HTTP;
|
||||||
import org.apache.spark.util.LongAccumulator;
|
import org.apache.spark.util.LongAccumulator;
|
||||||
import org.dom4j.Document;
|
import org.dom4j.Document;
|
||||||
import org.dom4j.DocumentException;
|
import org.dom4j.DocumentException;
|
||||||
|
@ -55,6 +57,8 @@ public class XmlRecordFactory implements Serializable {
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
private static final long serialVersionUID = 2912912999272373172L;
|
private static final long serialVersionUID = 2912912999272373172L;
|
||||||
|
/** Host name of the DOI resolver; {@code normalizeDoiUrl} emits URLs on this host. */
public static final String DOI_ORG_AUTHORITY = "doi.org";
|
||||||
|
/** URL scheme used by {@code normalizeDoiUrl} for the URLs it rewrites. */
public static final String HTTPS = "https";
|
||||||
|
|
||||||
private final Map<String, LongAccumulator> accumulators;
|
private final Map<String, LongAccumulator> accumulators;
|
||||||
|
|
||||||
|
@ -1269,6 +1273,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
.getUrl()
|
.getUrl()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(this::isValidUrl)
|
.filter(this::isValidUrl)
|
||||||
|
.map(XmlRecordFactory::normalizeDoiUrl)
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
return i;
|
return i;
|
||||||
})
|
})
|
||||||
|
@ -1285,6 +1290,18 @@ public class XmlRecordFactory implements Serializable {
|
||||||
.map(this::mergeInstances);
|
.map(this::mergeInstances);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static String normalizeDoiUrl(String url) {
|
||||||
|
if (url.contains(DOI_ORG_AUTHORITY)) {
|
||||||
|
try {
|
||||||
|
URL u = new URL(url);
|
||||||
|
return new URL(HTTPS, DOI_ORG_AUTHORITY, u.getFile()).toString();
|
||||||
|
} catch (MalformedURLException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return url;
|
||||||
|
}
|
||||||
|
|
||||||
private boolean isValidUrl(String url) {
|
private boolean isValidUrl(String url) {
|
||||||
try {
|
try {
|
||||||
new URL(url).toURI();
|
new URL(url).toURI();
|
||||||
|
|
|
@ -1,9 +1,14 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.provision;
|
package eu.dnetlib.dhp.oa.provision;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.net.MalformedURLException;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import javax.xml.transform.Transformer;
|
import javax.xml.transform.Transformer;
|
||||||
import javax.xml.transform.TransformerException;
|
import javax.xml.transform.TransformerException;
|
||||||
|
@ -109,6 +114,30 @@ public class IndexRecordTransformerTest {
|
||||||
testRecordTransformation(record);
|
testRecordTransformation(record);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testDoiUrlNormalization() throws MalformedURLException {
|
||||||
|
|
||||||
|
// TODO add more test examples when needed
|
||||||
|
List<String> urls = Arrays
|
||||||
|
.asList(
|
||||||
|
"https://dx.doi.org/10.1016/j.jas.2019.105013",
|
||||||
|
"http://dx.doi.org/10.13140/rg.2.2.26964.65927",
|
||||||
|
"https://dx.doi.org/10.13140/rg.2.2.26964.65927",
|
||||||
|
"http://dx.doi.org/10.1016/j.jas.2019.105013",
|
||||||
|
"http://hdl.handle.net/2072/369223",
|
||||||
|
"https://doi.org/10.1016/j.jas.2019.105013");
|
||||||
|
|
||||||
|
for (String url : urls) {
|
||||||
|
URL u = new URL(XmlRecordFactory.normalizeDoiUrl(url));
|
||||||
|
if (url.contains(XmlRecordFactory.DOI_ORG_AUTHORITY)) {
|
||||||
|
assertEquals(XmlRecordFactory.HTTPS, u.getProtocol());
|
||||||
|
assertEquals(XmlRecordFactory.DOI_ORG_AUTHORITY, u.getAuthority());
|
||||||
|
} else {
|
||||||
|
assertEquals(url, u.toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private void testRecordTransformation(final String record) throws IOException, TransformerException {
|
private void testRecordTransformation(final String record) throws IOException, TransformerException {
|
||||||
final String fields = IOUtils.toString(getClass().getResourceAsStream("fields.xml"));
|
final String fields = IOUtils.toString(getClass().getResourceAsStream("fields.xml"));
|
||||||
final String xslt = IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl"));
|
final String xslt = IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl"));
|
||||||
|
|
Loading…
Reference in New Issue