Merge pull request 'Normalising DOI urls' (#177) from instance_group_by_url into beta

Reviewed-on: D-Net/dnet-hadoop#177
This commit is contained in:
Claudio Atzori 2021-12-23 12:40:17 +01:00
commit 278cf08421
2 changed files with 46 additions and 0 deletions

View File

@ -10,6 +10,7 @@ import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.io.StringReader; import java.io.StringReader;
import java.io.StringWriter; import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -22,6 +23,7 @@ import javax.xml.transform.stream.StreamResult;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.lang3.tuple.Pair;
import org.apache.http.protocol.HTTP;
import org.apache.spark.util.LongAccumulator; import org.apache.spark.util.LongAccumulator;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.DocumentException; import org.dom4j.DocumentException;
@ -55,6 +57,8 @@ public class XmlRecordFactory implements Serializable {
* *
*/ */
private static final long serialVersionUID = 2912912999272373172L; private static final long serialVersionUID = 2912912999272373172L;
/** Authority (host) of the DOI resolver; used by {@link #normalizeDoiUrl(String)} to recognise and rewrite DOI URLs. */
public static final String DOI_ORG_AUTHORITY = "doi.org";
/** Protocol that {@link #normalizeDoiUrl(String)} sets on the DOI URLs it rewrites. */
public static final String HTTPS = "https";
private final Map<String, LongAccumulator> accumulators; private final Map<String, LongAccumulator> accumulators;
@ -1269,6 +1273,7 @@ public class XmlRecordFactory implements Serializable {
.getUrl() .getUrl()
.stream() .stream()
.filter(this::isValidUrl) .filter(this::isValidUrl)
.map(XmlRecordFactory::normalizeDoiUrl)
.collect(Collectors.toList())); .collect(Collectors.toList()));
return i; return i;
}) })
@ -1285,6 +1290,18 @@ public class XmlRecordFactory implements Serializable {
.map(this::mergeInstances); .map(this::mergeInstances);
} }
/**
 * Rewrites a URL pointing at the DOI resolver so that it always uses the {@link #HTTPS} protocol and the
 * {@link #DOI_ORG_AUTHORITY} host, e.g. {@code http://dx.doi.org/10.1000/xyz} becomes
 * {@code https://doi.org/10.1000/xyz}. The path and query of the input are kept; a fragment, if present, is not
 * carried over because only {@link URL#getFile()} is copied.
 *
 * URLs whose host is neither {@code doi.org} nor one of its subdomains are returned unchanged, as are
 * {@code null} and strings that cannot be parsed as a URL.
 *
 * @param url the URL to normalise, may be {@code null}
 * @return the normalised DOI URL, or the input itself when it is not a DOI resolver URL
 */
public static String normalizeDoiUrl(String url) {
	if (url == null) {
		return null;
	}
	try {
		final URL u = new URL(url);
		final String host = u.getHost() == null ? "" : u.getHost().toLowerCase(Locale.ROOT);
		// Decide on the parsed host rather than on a substring of the whole URL: a plain contains() check
		// also matches "doi.org" inside a path, a query string or a look-alike host (e.g. notdoi.org) and
		// would then replace the host of an unrelated URL.
		if (host.equals(DOI_ORG_AUTHORITY) || host.endsWith("." + DOI_ORG_AUTHORITY)) {
			return new URL(HTTPS, DOI_ORG_AUTHORITY, u.getFile()).toString();
		}
	} catch (MalformedURLException ignored) {
		// Best effort: a string that is not a parseable URL is handed back untouched.
	}
	return url;
}
private boolean isValidUrl(String url) { private boolean isValidUrl(String url) {
try { try {
new URL(url).toURI(); new URL(url).toURI();

View File

@ -1,9 +1,14 @@
package eu.dnetlib.dhp.oa.provision; package eu.dnetlib.dhp.oa.provision;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNotNull;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.List;
import javax.xml.transform.Transformer; import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerException;
@ -109,6 +114,30 @@ public class IndexRecordTransformerTest {
testRecordTransformation(record); testRecordTransformation(record);
} }
/**
 * Checks that {@link XmlRecordFactory#normalizeDoiUrl(String)} rewrites DOI resolver URLs to the
 * {@code https://doi.org/...} form without altering the DOI path, and leaves other URLs unchanged.
 */
@Test
void testDoiUrlNormalization() throws MalformedURLException {

	// TODO add more test examples when needed
	List<String> urls = Arrays
		.asList(
			"https://dx.doi.org/10.1016/j.jas.2019.105013",
			"http://dx.doi.org/10.13140/rg.2.2.26964.65927",
			"https://dx.doi.org/10.13140/rg.2.2.26964.65927",
			"http://dx.doi.org/10.1016/j.jas.2019.105013",
			"http://hdl.handle.net/2072/369223",
			"https://doi.org/10.1016/j.jas.2019.105013");

	for (String url : urls) {
		URL original = new URL(url);
		URL u = new URL(XmlRecordFactory.normalizeDoiUrl(url));
		if (url.contains(XmlRecordFactory.DOI_ORG_AUTHORITY)) {
			assertEquals(XmlRecordFactory.HTTPS, u.getProtocol());
			assertEquals(XmlRecordFactory.DOI_ORG_AUTHORITY, u.getAuthority());
			// the DOI itself (path and query) must survive the rewrite, only scheme and host may change
			assertEquals(original.getFile(), u.getFile());
		} else {
			assertEquals(url, u.toString());
		}
	}
}
private void testRecordTransformation(final String record) throws IOException, TransformerException { private void testRecordTransformation(final String record) throws IOException, TransformerException {
final String fields = IOUtils.toString(getClass().getResourceAsStream("fields.xml")); final String fields = IOUtils.toString(getClass().getResourceAsStream("fields.xml"));
final String xslt = IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl")); final String xslt = IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl"));