Added syntactic validation of instance.url; avoid creating duplicate URLs

This commit is contained in:
Claudio Atzori 2022-09-19 11:19:10 +02:00
parent 192215a18e
commit 26e1badded
6 changed files with 71 additions and 31 deletions

View File

@ -57,6 +57,11 @@
<artifactId>commons-io</artifactId> <artifactId>commons-io</artifactId>
</dependency> </dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
</dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId> <artifactId>spark-core_2.11</artifactId>

View File

@ -10,9 +10,13 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN; import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*; import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.validator.routines.UrlValidator;
import org.dom4j.*; import org.dom4j.*;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -617,4 +621,15 @@ public abstract class AbstractMdRecordToOafMapper {
return res; return res;
} }
/**
 * Keeps only the syntactically valid URLs among the given candidates, dropping duplicates.
 *
 * Validity is decided by the shared default {@link UrlValidator} instance. The result
 * preserves the encounter order of the input (first occurrence wins), so that callers
 * copying it into a list obtain a stable, source-driven ordering instead of hash order.
 *
 * @param url the candidate URLs; may be {@code null}
 * @return a new, mutable set containing the distinct valid URLs in encounter order;
 *         empty (never {@code null}) when {@code url} is {@code null} or holds no valid URL.
 *         Callers may safely add further entries to the returned set.
 */
protected Set<String> validateUrl(Collection<String> url) {
	// Guard first: nothing to validate, no need to look up the validator.
	if (Objects.isNull(url)) {
		return new LinkedHashSet<>();
	}
	final UrlValidator urlValidator = UrlValidator.getInstance();
	return url
		.stream()
		.filter(urlValidator::isValid)
		// LinkedHashSet: de-duplicates like HashSet but keeps the order URLs were found in.
		.collect(Collectors.toCollection(LinkedHashSet::new));
}
} }

View File

@ -159,9 +159,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier")); final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
instance final List<String> url = nodes
.setUrl(
nodes
.stream() .stream()
.filter(n -> StringUtils.isNotBlank(n.getText())) .filter(n -> StringUtils.isNotBlank(n.getText()))
.map(n -> n.getText().trim()) .map(n -> n.getText().trim())
@ -174,7 +172,12 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
} }
}) })
.distinct() .distinct()
.collect(Collectors.toCollection(ArrayList::new))); .collect(Collectors.toCollection(ArrayList::new));
final Set<String> validUrl = validateUrl(url);
if (!validUrl.isEmpty()) {
instance.setUrl(new ArrayList<>());
instance.getUrl().addAll(validUrl);
}
return Lists.newArrayList(instance); return Lists.newArrayList(instance);
} }

View File

@ -6,11 +6,14 @@ import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder; import java.net.URLDecoder;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.validator.routines.UrlValidator;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.Element; import org.dom4j.Element;
import org.dom4j.Node; import org.dom4j.Node;
@ -171,23 +174,31 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) { for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) {
url.add(trimAndDecodeUrl(((Node) o).getText().trim())); url.add(trimAndDecodeUrl(((Node) o).getText().trim()));
} }
Set<String> validUrl = validateUrl(url);
if (validUrl.stream().noneMatch(s -> s.contains("doi.org"))) {
for (final Object o : doc for (final Object o : doc
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) { .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) {
url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim()); validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
} }
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) { for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) {
url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim()); validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
} }
}
if (validUrl.stream().noneMatch(s -> s.contains("hdl.handle.net"))) {
for (final Object o : doc for (final Object o : doc
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) { .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) {
url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim()); validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
} }
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) { for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) {
url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim()); validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
} }
if (!url.isEmpty()) { }
if (!validUrl.isEmpty()) {
instance.setUrl(new ArrayList<>()); instance.setUrl(new ArrayList<>());
instance.getUrl().addAll(url); instance.getUrl().addAll(validUrl);
} }
return Arrays.asList(instance); return Arrays.asList(instance);
} }

View File

@ -200,6 +200,12 @@
<version>${dhp.commons.lang.version}</version> <version>${dhp.commons.lang.version}</version>
</dependency> </dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
<version>1.7</version>
</dependency>
<dependency> <dependency>
<groupId>com.github.sisyphsu</groupId> <groupId>com.github.sisyphsu</groupId>
<artifactId>dateparser</artifactId> <artifactId>dateparser</artifactId>