added instance.url syntactical validation, avoid creating multiple duplicated URLs

This commit is contained in:
Claudio Atzori 2022-09-19 11:19:10 +02:00
parent 192215a18e
commit 26e1badded
6 changed files with 71 additions and 31 deletions

View File

@ -57,6 +57,11 @@
<artifactId>commons-io</artifactId>
</dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>

View File

@ -10,9 +10,13 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.validator.routines.UrlValidator;
import org.dom4j.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -617,4 +621,15 @@ public abstract class AbstractMdRecordToOafMapper {
return res;
}
protected Set<String> validateUrl(Collection<String> url) {
UrlValidator urlValidator = UrlValidator.getInstance();
if (Objects.isNull(url)) {
return new HashSet<>();
}
return url
.stream()
.filter(u -> urlValidator.isValid(u))
.collect(Collectors.toCollection(HashSet::new));
}
}

View File

@ -159,22 +159,25 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
instance
.setUrl(
nodes
.stream()
.filter(n -> StringUtils.isNotBlank(n.getText()))
.map(n -> n.getText().trim())
.filter(u -> u.startsWith("http"))
.map(s -> {
try {
return URLDecoder.decode(s, "UTF-8");
} catch (Throwable t) {
return s;
}
})
.distinct()
.collect(Collectors.toCollection(ArrayList::new)));
final List<String> url = nodes
.stream()
.filter(n -> StringUtils.isNotBlank(n.getText()))
.map(n -> n.getText().trim())
.filter(u -> u.startsWith("http"))
.map(s -> {
try {
return URLDecoder.decode(s, "UTF-8");
} catch (Throwable t) {
return s;
}
})
.distinct()
.collect(Collectors.toCollection(ArrayList::new));
final Set<String> validUrl = validateUrl(url);
if (!validUrl.isEmpty()) {
instance.setUrl(new ArrayList<>());
instance.getUrl().addAll(validUrl);
}
return Lists.newArrayList(instance);
}

View File

@ -6,11 +6,14 @@ import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.validator.routines.UrlValidator;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.Node;
@ -171,23 +174,31 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) {
url.add(trimAndDecodeUrl(((Node) o).getText().trim()));
}
for (final Object o : doc
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) {
url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
Set<String> validUrl = validateUrl(url);
if (validUrl.stream().noneMatch(s -> s.contains("doi.org"))) {
for (final Object o : doc
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) {
validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
}
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) {
validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
}
}
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) {
url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
if (validUrl.stream().noneMatch(s -> s.contains("hdl.handle.net"))) {
for (final Object o : doc
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) {
validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
}
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) {
validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
}
}
for (final Object o : doc
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) {
url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
}
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) {
url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
}
if (!url.isEmpty()) {
if (!validUrl.isEmpty()) {
instance.setUrl(new ArrayList<>());
instance.getUrl().addAll(url);
instance.getUrl().addAll(validUrl);
}
return Arrays.asList(instance);
}

View File

@ -950,7 +950,7 @@ class MappersTest {
@Test
void testNotWellFormed() throws IOException {
final String xml = IOUtils
.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml")));
.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml")));
final List<Oaf> actual = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
assertNotNull(actual);
assertTrue(actual.isEmpty());

View File

@ -200,6 +200,12 @@
<version>${dhp.commons.lang.version}</version>
</dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
<version>1.7</version>
</dependency>
<dependency>
<groupId>com.github.sisyphsu</groupId>
<artifactId>dateparser</artifactId>