forked from D-Net/dnet-hadoop
added instance.url syntactical validation, avoid creating multiple duplicated URLs
This commit is contained in:
parent
192215a18e
commit
26e1badded
|
@ -57,6 +57,11 @@
|
||||||
<artifactId>commons-io</artifactId>
|
<artifactId>commons-io</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>commons-validator</groupId>
|
||||||
|
<artifactId>commons-validator</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_2.11</artifactId>
|
||||||
|
|
|
@ -10,9 +10,13 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
|
||||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
||||||
|
|
||||||
|
import java.net.MalformedURLException;
|
||||||
|
import java.net.URL;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.commons.validator.routines.UrlValidator;
|
||||||
import org.dom4j.*;
|
import org.dom4j.*;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
@ -617,4 +621,15 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected Set<String> validateUrl(Collection<String> url) {
|
||||||
|
UrlValidator urlValidator = UrlValidator.getInstance();
|
||||||
|
if (Objects.isNull(url)) {
|
||||||
|
return new HashSet<>();
|
||||||
|
}
|
||||||
|
return url
|
||||||
|
.stream()
|
||||||
|
.filter(u -> urlValidator.isValid(u))
|
||||||
|
.collect(Collectors.toCollection(HashSet::new));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -159,22 +159,25 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
||||||
|
|
||||||
final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
|
final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
|
||||||
instance
|
final List<String> url = nodes
|
||||||
.setUrl(
|
.stream()
|
||||||
nodes
|
.filter(n -> StringUtils.isNotBlank(n.getText()))
|
||||||
.stream()
|
.map(n -> n.getText().trim())
|
||||||
.filter(n -> StringUtils.isNotBlank(n.getText()))
|
.filter(u -> u.startsWith("http"))
|
||||||
.map(n -> n.getText().trim())
|
.map(s -> {
|
||||||
.filter(u -> u.startsWith("http"))
|
try {
|
||||||
.map(s -> {
|
return URLDecoder.decode(s, "UTF-8");
|
||||||
try {
|
} catch (Throwable t) {
|
||||||
return URLDecoder.decode(s, "UTF-8");
|
return s;
|
||||||
} catch (Throwable t) {
|
}
|
||||||
return s;
|
})
|
||||||
}
|
.distinct()
|
||||||
})
|
.collect(Collectors.toCollection(ArrayList::new));
|
||||||
.distinct()
|
final Set<String> validUrl = validateUrl(url);
|
||||||
.collect(Collectors.toCollection(ArrayList::new)));
|
if (!validUrl.isEmpty()) {
|
||||||
|
instance.setUrl(new ArrayList<>());
|
||||||
|
instance.getUrl().addAll(validUrl);
|
||||||
|
}
|
||||||
|
|
||||||
return Lists.newArrayList(instance);
|
return Lists.newArrayList(instance);
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,11 +6,14 @@ import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
||||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
|
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
|
||||||
|
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
|
import java.net.MalformedURLException;
|
||||||
|
import java.net.URL;
|
||||||
import java.net.URLDecoder;
|
import java.net.URLDecoder;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.commons.validator.routines.UrlValidator;
|
||||||
import org.dom4j.Document;
|
import org.dom4j.Document;
|
||||||
import org.dom4j.Element;
|
import org.dom4j.Element;
|
||||||
import org.dom4j.Node;
|
import org.dom4j.Node;
|
||||||
|
@ -171,23 +174,31 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) {
|
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) {
|
||||||
url.add(trimAndDecodeUrl(((Node) o).getText().trim()));
|
url.add(trimAndDecodeUrl(((Node) o).getText().trim()));
|
||||||
}
|
}
|
||||||
for (final Object o : doc
|
|
||||||
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) {
|
Set<String> validUrl = validateUrl(url);
|
||||||
url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
|
|
||||||
|
if (validUrl.stream().noneMatch(s -> s.contains("doi.org"))) {
|
||||||
|
for (final Object o : doc
|
||||||
|
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) {
|
||||||
|
validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
|
||||||
|
}
|
||||||
|
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) {
|
||||||
|
validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) {
|
if (validUrl.stream().noneMatch(s -> s.contains("hdl.handle.net"))) {
|
||||||
url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
|
for (final Object o : doc
|
||||||
|
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) {
|
||||||
|
validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
|
||||||
|
}
|
||||||
|
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) {
|
||||||
|
validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for (final Object o : doc
|
|
||||||
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) {
|
if (!validUrl.isEmpty()) {
|
||||||
url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
|
|
||||||
}
|
|
||||||
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) {
|
|
||||||
url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
|
|
||||||
}
|
|
||||||
if (!url.isEmpty()) {
|
|
||||||
instance.setUrl(new ArrayList<>());
|
instance.setUrl(new ArrayList<>());
|
||||||
instance.getUrl().addAll(url);
|
instance.getUrl().addAll(validUrl);
|
||||||
}
|
}
|
||||||
return Arrays.asList(instance);
|
return Arrays.asList(instance);
|
||||||
}
|
}
|
||||||
|
|
|
@ -950,7 +950,7 @@ class MappersTest {
|
||||||
@Test
|
@Test
|
||||||
void testNotWellFormed() throws IOException {
|
void testNotWellFormed() throws IOException {
|
||||||
final String xml = IOUtils
|
final String xml = IOUtils
|
||||||
.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml")));
|
.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml")));
|
||||||
final List<Oaf> actual = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
|
final List<Oaf> actual = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||||
assertNotNull(actual);
|
assertNotNull(actual);
|
||||||
assertTrue(actual.isEmpty());
|
assertTrue(actual.isEmpty());
|
||||||
|
|
6
pom.xml
6
pom.xml
|
@ -200,6 +200,12 @@
|
||||||
<version>${dhp.commons.lang.version}</version>
|
<version>${dhp.commons.lang.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>commons-validator</groupId>
|
||||||
|
<artifactId>commons-validator</artifactId>
|
||||||
|
<version>1.7</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.github.sisyphsu</groupId>
|
<groupId>com.github.sisyphsu</groupId>
|
||||||
<artifactId>dateparser</artifactId>
|
<artifactId>dateparser</artifactId>
|
||||||
|
|
Loading…
Reference in New Issue