added instance.url syntactical validation, avoid creating multiple duplicated URLs

This commit is contained in:
Claudio Atzori 2022-09-19 11:19:10 +02:00
parent 192215a18e
commit 26e1badded
6 changed files with 71 additions and 31 deletions

View File

@ -57,6 +57,11 @@
<artifactId>commons-io</artifactId> <artifactId>commons-io</artifactId>
</dependency> </dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
</dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId> <artifactId>spark-core_2.11</artifactId>

View File

@ -10,9 +10,13 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN; import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*; import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.validator.routines.UrlValidator;
import org.dom4j.*; import org.dom4j.*;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -617,4 +621,15 @@ public abstract class AbstractMdRecordToOafMapper {
return res; return res;
} }
/**
 * Keeps only the syntactically valid URLs among the given candidates.
 *
 * <p>Each candidate is checked with the default {@link UrlValidator} instance;
 * entries the validator rejects are dropped. Duplicates are collapsed and the
 * order in which the surviving URLs were first encountered is preserved, so the
 * resulting URL list does not depend on hash ordering.
 *
 * @param url the candidate URLs, may be {@code null}
 * @return a new, mutable set with the valid URLs in encounter order; empty
 *         (never {@code null}) when {@code url} is {@code null} or contains no valid URL
 */
protected Set<String> validateUrl(Collection<String> url) {
if (Objects.isNull(url)) {
// Callers add further entries to the result, so hand back a mutable set.
return new LinkedHashSet<>();
}
final UrlValidator urlValidator = UrlValidator.getInstance();
return url
.stream()
.filter(urlValidator::isValid)
// LinkedHashSet: de-duplicates like HashSet but keeps the source order.
.collect(Collectors.toCollection(LinkedHashSet::new));
}
} }

View File

@ -159,22 +159,25 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier")); final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
instance final List<String> url = nodes
.setUrl( .stream()
nodes .filter(n -> StringUtils.isNotBlank(n.getText()))
.stream() .map(n -> n.getText().trim())
.filter(n -> StringUtils.isNotBlank(n.getText())) .filter(u -> u.startsWith("http"))
.map(n -> n.getText().trim()) .map(s -> {
.filter(u -> u.startsWith("http")) try {
.map(s -> { return URLDecoder.decode(s, "UTF-8");
try { } catch (Throwable t) {
return URLDecoder.decode(s, "UTF-8"); return s;
} catch (Throwable t) { }
return s; })
} .distinct()
}) .collect(Collectors.toCollection(ArrayList::new));
.distinct() final Set<String> validUrl = validateUrl(url);
.collect(Collectors.toCollection(ArrayList::new))); if (!validUrl.isEmpty()) {
instance.setUrl(new ArrayList<>());
instance.getUrl().addAll(validUrl);
}
return Lists.newArrayList(instance); return Lists.newArrayList(instance);
} }

View File

@ -6,11 +6,14 @@ import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder; import java.net.URLDecoder;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.validator.routines.UrlValidator;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.Element; import org.dom4j.Element;
import org.dom4j.Node; import org.dom4j.Node;
@ -171,23 +174,31 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) { for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) {
url.add(trimAndDecodeUrl(((Node) o).getText().trim())); url.add(trimAndDecodeUrl(((Node) o).getText().trim()));
} }
for (final Object o : doc
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) { Set<String> validUrl = validateUrl(url);
url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
if (validUrl.stream().noneMatch(s -> s.contains("doi.org"))) {
for (final Object o : doc
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) {
validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
}
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) {
validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
}
} }
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) { if (validUrl.stream().noneMatch(s -> s.contains("hdl.handle.net"))) {
url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim()); for (final Object o : doc
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) {
validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
}
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) {
validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
}
} }
for (final Object o : doc
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) { if (!validUrl.isEmpty()) {
url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
}
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) {
url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
}
if (!url.isEmpty()) {
instance.setUrl(new ArrayList<>()); instance.setUrl(new ArrayList<>());
instance.getUrl().addAll(url); instance.getUrl().addAll(validUrl);
} }
return Arrays.asList(instance); return Arrays.asList(instance);
} }

View File

@ -950,7 +950,7 @@ class MappersTest {
@Test @Test
void testNotWellFormed() throws IOException { void testNotWellFormed() throws IOException {
final String xml = IOUtils final String xml = IOUtils
.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml"))); .toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml")));
final List<Oaf> actual = new OafToOafMapper(vocs, false, true).processMdRecord(xml); final List<Oaf> actual = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
assertNotNull(actual); assertNotNull(actual);
assertTrue(actual.isEmpty()); assertTrue(actual.isEmpty());

View File

@ -200,6 +200,12 @@
<version>${dhp.commons.lang.version}</version> <version>${dhp.commons.lang.version}</version>
</dependency> </dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
<version>1.7</version>
</dependency>
<dependency> <dependency>
<groupId>com.github.sisyphsu</groupId> <groupId>com.github.sisyphsu</groupId>
<artifactId>dateparser</artifactId> <artifactId>dateparser</artifactId>