forked from D-Net/dnet-hadoop
Added syntactic validation of instance.url; avoid creating duplicate URLs
This commit is contained in:
parent
192215a18e
commit
26e1badded
|
@ -57,6 +57,11 @@
|
|||
<artifactId>commons-io</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>commons-validator</groupId>
|
||||
<artifactId>commons-validator</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
|
|
|
@ -10,9 +10,13 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
|
|||
import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.validator.routines.UrlValidator;
|
||||
import org.dom4j.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@ -617,4 +621,15 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
return res;
|
||||
}
|
||||
|
||||
protected Set<String> validateUrl(Collection<String> url) {
|
||||
UrlValidator urlValidator = UrlValidator.getInstance();
|
||||
if (Objects.isNull(url)) {
|
||||
return new HashSet<>();
|
||||
}
|
||||
return url
|
||||
.stream()
|
||||
.filter(u -> urlValidator.isValid(u))
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -159,22 +159,25 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
||||
|
||||
final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
|
||||
instance
|
||||
.setUrl(
|
||||
nodes
|
||||
.stream()
|
||||
.filter(n -> StringUtils.isNotBlank(n.getText()))
|
||||
.map(n -> n.getText().trim())
|
||||
.filter(u -> u.startsWith("http"))
|
||||
.map(s -> {
|
||||
try {
|
||||
return URLDecoder.decode(s, "UTF-8");
|
||||
} catch (Throwable t) {
|
||||
return s;
|
||||
}
|
||||
})
|
||||
.distinct()
|
||||
.collect(Collectors.toCollection(ArrayList::new)));
|
||||
final List<String> url = nodes
|
||||
.stream()
|
||||
.filter(n -> StringUtils.isNotBlank(n.getText()))
|
||||
.map(n -> n.getText().trim())
|
||||
.filter(u -> u.startsWith("http"))
|
||||
.map(s -> {
|
||||
try {
|
||||
return URLDecoder.decode(s, "UTF-8");
|
||||
} catch (Throwable t) {
|
||||
return s;
|
||||
}
|
||||
})
|
||||
.distinct()
|
||||
.collect(Collectors.toCollection(ArrayList::new));
|
||||
final Set<String> validUrl = validateUrl(url);
|
||||
if (!validUrl.isEmpty()) {
|
||||
instance.setUrl(new ArrayList<>());
|
||||
instance.getUrl().addAll(validUrl);
|
||||
}
|
||||
|
||||
return Lists.newArrayList(instance);
|
||||
}
|
||||
|
|
|
@ -6,11 +6,14 @@ import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
|||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
|
||||
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.net.URLDecoder;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.validator.routines.UrlValidator;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Element;
|
||||
import org.dom4j.Node;
|
||||
|
@ -171,23 +174,31 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) {
|
||||
url.add(trimAndDecodeUrl(((Node) o).getText().trim()));
|
||||
}
|
||||
for (final Object o : doc
|
||||
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) {
|
||||
url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
|
||||
|
||||
Set<String> validUrl = validateUrl(url);
|
||||
|
||||
if (validUrl.stream().noneMatch(s -> s.contains("doi.org"))) {
|
||||
for (final Object o : doc
|
||||
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) {
|
||||
validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
|
||||
}
|
||||
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) {
|
||||
validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
|
||||
}
|
||||
}
|
||||
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) {
|
||||
url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
|
||||
if (validUrl.stream().noneMatch(s -> s.contains("hdl.handle.net"))) {
|
||||
for (final Object o : doc
|
||||
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) {
|
||||
validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
|
||||
}
|
||||
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) {
|
||||
validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
|
||||
}
|
||||
}
|
||||
for (final Object o : doc
|
||||
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) {
|
||||
url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
|
||||
}
|
||||
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) {
|
||||
url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
|
||||
}
|
||||
if (!url.isEmpty()) {
|
||||
|
||||
if (!validUrl.isEmpty()) {
|
||||
instance.setUrl(new ArrayList<>());
|
||||
instance.getUrl().addAll(url);
|
||||
instance.getUrl().addAll(validUrl);
|
||||
}
|
||||
return Arrays.asList(instance);
|
||||
}
|
||||
|
|
|
@ -950,7 +950,7 @@ class MappersTest {
|
|||
@Test
|
||||
void testNotWellFormed() throws IOException {
|
||||
final String xml = IOUtils
|
||||
.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml")));
|
||||
.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml")));
|
||||
final List<Oaf> actual = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
assertNotNull(actual);
|
||||
assertTrue(actual.isEmpty());
|
||||
|
|
6
pom.xml
6
pom.xml
|
@ -200,6 +200,12 @@
|
|||
<version>${dhp.commons.lang.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>commons-validator</groupId>
|
||||
<artifactId>commons-validator</artifactId>
|
||||
<version>1.7</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.github.sisyphsu</groupId>
|
||||
<artifactId>dateparser</artifactId>
|
||||
|
|
Loading…
Reference in New Issue