[aggregator graph] added validation for URLs mapped from oaf:fulltext

This commit is contained in:
Claudio Atzori 2023-05-26 11:33:42 +02:00
parent a235d2a24a
commit e45777e7e1
5 changed files with 38 additions and 18 deletions

View File

@ -5,8 +5,6 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.createOpenaireId; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.createOpenaireId;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -17,7 +15,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.dhp.common.Constants; import eu.dnetlib.dhp.common.Constants;
@ -27,12 +24,13 @@ import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
public abstract class AbstractMdRecordToOafMapper { public abstract class AbstractMdRecordToOafMapper {
protected final VocabularyGroup vocs; protected final VocabularyGroup vocs;
protected static final UrlValidator URL_VALIDATOR = UrlValidator.getInstance();
private final boolean invisible; private final boolean invisible;
private final boolean shouldHashId; private final boolean shouldHashId;
@ -393,7 +391,7 @@ public abstract class AbstractMdRecordToOafMapper {
r.setPublisher(preparePublisher(doc, info)); r.setPublisher(preparePublisher(doc, info));
r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info)); r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info));
r.setSource(prepareSources(doc, info)); r.setSource(prepareSources(doc, info));
r.setFulltext(prepareListFields(doc, "//oaf:fulltext", info)); r.setFulltext(prepareListURL(doc, "//oaf:fulltext", info));
r.setFormat(prepareFormats(doc, info)); r.setFormat(prepareFormats(doc, info));
r.setContributor(prepareContributors(doc, info)); r.setContributor(prepareContributors(doc, info));
r.setResourcetype(prepareResourceType(doc, info)); r.setResourcetype(prepareResourceType(doc, info));
@ -672,6 +670,14 @@ public abstract class AbstractMdRecordToOafMapper {
qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust);
} }
/**
 * Builds the list of fields for the URL values found at {@code xpath}.
 *
 * <p>The strings returned by {@code prepareListString(node, xpath)} are filtered through
 * {@code URL_VALIDATOR.isValid}; only the accepted ones are handed, together with {@code info},
 * to {@code listFields}.
 *
 * @param node  the node the XPath expression is evaluated against
 * @param xpath the XPath expression selecting the candidate URL values
 * @param info  the data info passed along to {@code listFields}
 * @return the result of {@code listFields} for the values that passed validation
 */
protected List<Field<String>> prepareListURL(final Node node, final String xpath, final DataInfo info) {
	final List<String> validUrls = prepareListString(node, xpath)
		.stream()
		.filter(URL_VALIDATOR::isValid)
		.collect(Collectors.toList());
	return listFields(info, validUrls);
}
protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) { protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) {
return field(node.valueOf(xpath), info); return field(node.valueOf(xpath), info);
} }
@ -695,13 +701,13 @@ public abstract class AbstractMdRecordToOafMapper {
} }
/**
 * Keeps only the entries of {@code url} that the URL validator accepts.
 *
 * <p>A {@code null} collection yields a new, empty {@link HashSet}; otherwise the result is a new
 * {@link HashSet} holding every entry for which {@code isValid} returned {@code true}.
 *
 * <p>NOTE(review): this listing shows the pre-commit and post-commit text of each line side by
 * side. The earlier form obtained a local validator via {@code UrlValidator.getInstance()}; the
 * later form filters through the shared {@code URL_VALIDATOR} constant instead.
 *
 * @param url the candidate URLs, may be {@code null}
 * @return a new set with the entries that passed validation, never {@code null}
 */
protected Set<String> validateUrl(Collection<String> url) { protected Set<String> validateUrl(Collection<String> url) {
UrlValidator urlValidator = UrlValidator.getInstance();
if (Objects.isNull(url)) { if (Objects.isNull(url)) {
return new HashSet<>(); return new HashSet<>();
} }
return url return url
.stream() .stream()
.filter(u -> urlValidator.isValid(u)) .filter(URL_VALIDATOR::isValid)
.collect(Collectors.toCollection(HashSet::new)); .collect(Collectors.toCollection(HashSet::new));
} }

View File

@ -140,7 +140,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info); final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info);
final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom); final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
final Set<StructuredProperty> pids = pid.stream().collect(Collectors.toCollection(HashSet::new)); final Set<StructuredProperty> pids = new HashSet<>(pid);
instance instance
.setAlternateIdentifier( .setAlternateIdentifier(
@ -158,6 +158,12 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
instance instance
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
prepareListURL(doc, "//oaf:fulltext", info)
.stream()
.findFirst()
.map(Field::getValue)
.ifPresent(instance::setFulltext);
final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier")); final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
final List<String> url = nodes final List<String> url = nodes
.stream() .stream()

View File

@ -144,7 +144,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info); final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info);
final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom); final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
final Set<StructuredProperty> pids = pid.stream().collect(Collectors.toCollection(HashSet::new)); final Set<StructuredProperty> pids = new HashSet<>(pid);
instance instance
.setAlternateIdentifier( .setAlternateIdentifier(
@ -161,6 +161,11 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
instance instance
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
prepareListURL(doc, "//oaf:fulltext", info)
.stream()
.findFirst()
.map(Field::getValue)
.ifPresent(instance::setFulltext);
final Set<String> url = new HashSet<>(); final Set<String> url = new HashSet<>();
for (final Object o : doc for (final Object o : doc

View File

@ -27,7 +27,6 @@ import eu.dnetlib.dhp.common.Constants;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.PidType; import eu.dnetlib.dhp.schema.oaf.utils.PidType;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -52,7 +51,7 @@ class MappersTest {
} }
@Test @Test
void testPublication() throws IOException, DocumentException { void testPublication() throws IOException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_record.xml"))); final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_record.xml")));
@ -112,13 +111,17 @@ class MappersTest {
assertNotNull(i.getAccessright()); assertNotNull(i.getAccessright());
assertEquals("OPEN", i.getAccessright().getClassid()); assertEquals("OPEN", i.getAccessright().getClassid());
}); });
assertEquals("0001", p.getInstance().get(0).getRefereed().getClassid()); final Instance instance = p.getInstance().get(0);
assertNotNull(p.getInstance().get(0).getPid()); assertEquals("0001", instance.getRefereed().getClassid());
assertTrue(p.getInstance().get(0).getPid().isEmpty()); assertNotNull(instance.getPid());
assertTrue(instance.getPid().isEmpty());
assertTrue(!p.getInstance().get(0).getAlternateIdentifier().isEmpty()); assertFalse(instance.getAlternateIdentifier().isEmpty());
assertEquals("doi", p.getInstance().get(0).getAlternateIdentifier().get(0).getQualifier().getClassid()); assertEquals("doi", instance.getAlternateIdentifier().get(0).getQualifier().getClassid());
assertEquals("10.3897/oneeco.2.e13718", p.getInstance().get(0).getAlternateIdentifier().get(0).getValue()); assertEquals("10.3897/oneeco.2.e13718", instance.getAlternateIdentifier().get(0).getValue());
assertNotNull(instance.getFulltext());
assertEquals("https://oneecosystem.pensoft.net/article/13718/", instance.getFulltext());
assertNotNull(p.getBestaccessright()); assertNotNull(p.getBestaccessright());
assertEquals("OPEN", p.getBestaccessright().getClassid()); assertEquals("OPEN", p.getBestaccessright().getClassid());

View File

@ -807,7 +807,7 @@
<mockito-core.version>3.3.3</mockito-core.version> <mockito-core.version>3.3.3</mockito-core.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version> <mongodb.driver.version>3.4.2</mongodb.driver.version>
<vtd.version>[2.12,3.0)</vtd.version> <vtd.version>[2.12,3.0)</vtd.version>
<dhp-schemas.version>[3.16.0]</dhp-schemas.version> <dhp-schemas.version>[3.17.1-SNAPSHOT]</dhp-schemas.version>
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version> <dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version> <dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version> <dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>