forked from D-Net/dnet-hadoop
Merge pull request 'Normalising DOI urls' (#177) from instance_group_by_url into beta
Reviewed-on: D-Net/dnet-hadoop#177
This commit is contained in:
commit
278cf08421
|
@ -10,6 +10,7 @@ import java.io.IOException;
|
|||
import java.io.Serializable;
|
||||
import java.io.StringReader;
|
||||
import java.io.StringWriter;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
@ -22,6 +23,7 @@ import javax.xml.transform.stream.StreamResult;
|
|||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.lang3.tuple.ImmutablePair;
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import org.apache.http.protocol.HTTP;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
|
@ -55,6 +57,8 @@ public class XmlRecordFactory implements Serializable {
|
|||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 2912912999272373172L;
|
||||
public static final String DOI_ORG_AUTHORITY = "doi.org";
|
||||
public static final String HTTPS = "https";
|
||||
|
||||
private final Map<String, LongAccumulator> accumulators;
|
||||
|
||||
|
@ -1269,6 +1273,7 @@ public class XmlRecordFactory implements Serializable {
|
|||
.getUrl()
|
||||
.stream()
|
||||
.filter(this::isValidUrl)
|
||||
.map(XmlRecordFactory::normalizeDoiUrl)
|
||||
.collect(Collectors.toList()));
|
||||
return i;
|
||||
})
|
||||
|
@ -1285,6 +1290,18 @@ public class XmlRecordFactory implements Serializable {
|
|||
.map(this::mergeInstances);
|
||||
}
|
||||
|
||||
public static String normalizeDoiUrl(String url) {
|
||||
if (url.contains(DOI_ORG_AUTHORITY)) {
|
||||
try {
|
||||
URL u = new URL(url);
|
||||
return new URL(HTTPS, DOI_ORG_AUTHORITY, u.getFile()).toString();
|
||||
} catch (MalformedURLException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
return url;
|
||||
}
|
||||
|
||||
private boolean isValidUrl(String url) {
|
||||
try {
|
||||
new URL(url).toURI();
|
||||
|
|
|
@ -1,9 +1,14 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.provision;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerException;
|
||||
|
@ -109,6 +114,30 @@ public class IndexRecordTransformerTest {
|
|||
testRecordTransformation(record);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testDoiUrlNormalization() throws MalformedURLException {
|
||||
|
||||
// TODO add more test examples when needed
|
||||
List<String> urls = Arrays
|
||||
.asList(
|
||||
"https://dx.doi.org/10.1016/j.jas.2019.105013",
|
||||
"http://dx.doi.org/10.13140/rg.2.2.26964.65927",
|
||||
"https://dx.doi.org/10.13140/rg.2.2.26964.65927",
|
||||
"http://dx.doi.org/10.1016/j.jas.2019.105013",
|
||||
"http://hdl.handle.net/2072/369223",
|
||||
"https://doi.org/10.1016/j.jas.2019.105013");
|
||||
|
||||
for (String url : urls) {
|
||||
URL u = new URL(XmlRecordFactory.normalizeDoiUrl(url));
|
||||
if (url.contains(XmlRecordFactory.DOI_ORG_AUTHORITY)) {
|
||||
assertEquals(XmlRecordFactory.HTTPS, u.getProtocol());
|
||||
assertEquals(XmlRecordFactory.DOI_ORG_AUTHORITY, u.getAuthority());
|
||||
} else {
|
||||
assertEquals(url, u.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void testRecordTransformation(final String record) throws IOException, TransformerException {
|
||||
final String fields = IOUtils.toString(getClass().getResourceAsStream("fields.xml"));
|
||||
final String xslt = IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl"));
|
||||
|
|
Loading…
Reference in New Issue