Added syntactic validation of instance.url; avoid creating duplicate URLs

This commit is contained in:
Claudio Atzori 2022-09-19 11:19:10 +02:00
parent 192215a18e
commit 26e1badded
6 changed files with 71 additions and 31 deletions

View File

@ -57,6 +57,11 @@
<artifactId>commons-io</artifactId>
</dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>

View File

@ -10,9 +10,13 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.validator.routines.UrlValidator;
import org.dom4j.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -617,4 +621,15 @@ public abstract class AbstractMdRecordToOafMapper {
return res;
}
/**
 * Keeps only the syntactically valid URLs from the given collection, dropping duplicates.
 *
 * <p>Validity is decided by {@link UrlValidator#isValid(String)} on the validator returned by
 * {@link UrlValidator#getInstance()}; entries it rejects are silently discarded.
 *
 * <p>The result preserves the encounter order of the input (first occurrence wins), so the
 * URL list built from it keeps the order in which the URLs appeared in the source record
 * instead of an arbitrary hash order.
 *
 * @param url the candidate URLs; may be {@code null}
 * @return a new, mutable set with the valid URLs in encounter order; empty (never
 *         {@code null}) when {@code url} is {@code null} or contains no valid entry
 */
protected Set<String> validateUrl(Collection<String> url) {
	// The returned set must stay mutable: callers add further URLs to it.
	if (Objects.isNull(url)) {
		return new LinkedHashSet<>();
	}
	final UrlValidator urlValidator = UrlValidator.getInstance();
	return url
		.stream()
		.filter(urlValidator::isValid)
		.collect(Collectors.toCollection(LinkedHashSet::new));
}
}

View File

@ -159,9 +159,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
instance
.setUrl(
nodes
final List<String> url = nodes
.stream()
.filter(n -> StringUtils.isNotBlank(n.getText()))
.map(n -> n.getText().trim())
@ -174,7 +172,12 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
}
})
.distinct()
.collect(Collectors.toCollection(ArrayList::new)));
.collect(Collectors.toCollection(ArrayList::new));
final Set<String> validUrl = validateUrl(url);
if (!validUrl.isEmpty()) {
instance.setUrl(new ArrayList<>());
instance.getUrl().addAll(validUrl);
}
return Lists.newArrayList(instance);
}

View File

@ -6,11 +6,14 @@ import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.validator.routines.UrlValidator;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.Node;
@ -171,23 +174,31 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) {
url.add(trimAndDecodeUrl(((Node) o).getText().trim()));
}
Set<String> validUrl = validateUrl(url);
if (validUrl.stream().noneMatch(s -> s.contains("doi.org"))) {
for (final Object o : doc
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) {
url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
}
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) {
url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
}
}
if (validUrl.stream().noneMatch(s -> s.contains("hdl.handle.net"))) {
for (final Object o : doc
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) {
url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
}
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) {
url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
}
if (!url.isEmpty()) {
}
if (!validUrl.isEmpty()) {
instance.setUrl(new ArrayList<>());
instance.getUrl().addAll(url);
instance.getUrl().addAll(validUrl);
}
return Arrays.asList(instance);
}

View File

@ -200,6 +200,12 @@
<version>${dhp.commons.lang.version}</version>
</dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
<version>1.7</version>
</dependency>
<dependency>
<groupId>com.github.sisyphsu</groupId>
<artifactId>dateparser</artifactId>