From e45777e7e12256669ce541ad50b6820c0d945cec Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 26 May 2023 11:33:42 +0200 Subject: [PATCH] [aggregator graph] added validation for URLs mapped from oaf:fulltext --- .../raw/AbstractMdRecordToOafMapper.java | 20 ++++++++++++------- .../dhp/oa/graph/raw/OafToOafMapper.java | 8 +++++++- .../dhp/oa/graph/raw/OdfToOafMapper.java | 7 ++++++- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 19 ++++++++++-------- pom.xml | 2 +- 5 files changed, 38 insertions(+), 18 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 504c9dd4c..efb860d44 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -5,8 +5,6 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.createOpenaireId; -import java.net.MalformedURLException; -import java.net.URL; import java.util.*; import java.util.stream.Collectors; @@ -17,7 +15,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.collect.Lists; -import com.google.common.collect.Maps; import com.google.common.collect.Sets; import eu.dnetlib.dhp.common.Constants; @@ -27,12 +24,13 @@ import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; -import eu.dnetlib.dhp.schema.oaf.utils.PidType; public abstract class AbstractMdRecordToOafMapper { protected final VocabularyGroup vocs; + protected static final UrlValidator URL_VALIDATOR = UrlValidator.getInstance(); + private final boolean invisible; private final boolean shouldHashId; @@ -393,7 +391,7 @@ public abstract class AbstractMdRecordToOafMapper { r.setPublisher(preparePublisher(doc, info)); r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info)); r.setSource(prepareSources(doc, info)); - r.setFulltext(prepareListFields(doc, "//oaf:fulltext", info)); + r.setFulltext(prepareListURL(doc, "//oaf:fulltext", info)); r.setFormat(prepareFormats(doc, info)); r.setContributor(prepareContributors(doc, info)); r.setResourcetype(prepareResourceType(doc, info)); @@ -672,6 +670,14 @@ public abstract class AbstractMdRecordToOafMapper { qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); } + protected List> prepareListURL(final Node node, final String xpath, final DataInfo info) { + return listFields( + info, prepareListString(node, xpath) + .stream() + .filter(URL_VALIDATOR::isValid) + .collect(Collectors.toList())); + } + protected Field prepareField(final Node node, final String xpath, final DataInfo info) { return field(node.valueOf(xpath), info); } @@ -695,13 +701,13 @@ public abstract class AbstractMdRecordToOafMapper { } protected Set validateUrl(Collection url) { - UrlValidator urlValidator = UrlValidator.getInstance(); + if (Objects.isNull(url)) { return new HashSet<>(); } return url .stream() - .filter(u -> urlValidator.isValid(u)) + .filter(URL_VALIDATOR::isValid) .collect(Collectors.toCollection(HashSet::new)); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 30f3935f5..2271a0fff 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -140,7 +140,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final List alternateIdentifier = prepareResultPids(doc, info); final List pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom); - final Set pids = pid.stream().collect(Collectors.toCollection(HashSet::new)); + final Set pids = new HashSet<>(pid); instance .setAlternateIdentifier( @@ -158,6 +158,12 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { instance .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); + prepareListURL(doc, "//oaf:fulltext", info) + .stream() + .findFirst() + .map(Field::getValue) + .ifPresent(instance::setFulltext); + final List nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier")); final List url = nodes .stream() diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 39c77bd37..1faa2fe9b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -144,7 +144,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final List alternateIdentifier = prepareResultPids(doc, info); final List pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom); - final Set pids = pid.stream().collect(Collectors.toCollection(HashSet::new)); + final Set pids = new HashSet<>(pid); instance .setAlternateIdentifier( @@ -161,6 +161,11 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); instance .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); + prepareListURL(doc, "//oaf:fulltext", info) + .stream() + .findFirst() + .map(Field::getValue) + .ifPresent(instance::setFulltext); final Set url = new HashSet<>(); for (final Object o : doc diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index a5a277470..894ed33f7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -27,7 +27,6 @@ import eu.dnetlib.dhp.common.Constants; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.PidType; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @@ -52,7 +51,7 @@ class MappersTest { } @Test - void testPublication() throws IOException, DocumentException { + void testPublication() throws IOException { final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_record.xml"))); @@ -112,13 +111,17 @@ class MappersTest { assertNotNull(i.getAccessright()); assertEquals("OPEN", i.getAccessright().getClassid()); }); - assertEquals("0001", p.getInstance().get(0).getRefereed().getClassid()); - assertNotNull(p.getInstance().get(0).getPid()); - assertTrue(p.getInstance().get(0).getPid().isEmpty()); + final Instance instance = p.getInstance().get(0); + assertEquals("0001", instance.getRefereed().getClassid()); + assertNotNull(instance.getPid()); + assertTrue(instance.getPid().isEmpty()); - assertTrue(!p.getInstance().get(0).getAlternateIdentifier().isEmpty()); - assertEquals("doi", p.getInstance().get(0).getAlternateIdentifier().get(0).getQualifier().getClassid()); - assertEquals("10.3897/oneeco.2.e13718", p.getInstance().get(0).getAlternateIdentifier().get(0).getValue()); + assertFalse(instance.getAlternateIdentifier().isEmpty()); + assertEquals("doi", instance.getAlternateIdentifier().get(0).getQualifier().getClassid()); + assertEquals("10.3897/oneeco.2.e13718", instance.getAlternateIdentifier().get(0).getValue()); + + assertNotNull(instance.getFulltext()); + assertEquals("https://oneecosystem.pensoft.net/article/13718/", instance.getFulltext()); assertNotNull(p.getBestaccessright()); assertEquals("OPEN", p.getBestaccessright().getClassid()); diff --git a/pom.xml b/pom.xml index 8a9f1dea8..63d871039 100644 --- a/pom.xml +++ b/pom.xml @@ -807,7 +807,7 @@ 3.3.3 3.4.2 [2.12,3.0) - [3.16.0] + [3.17.1-SNAPSHOT] [4.0.3] [6.0.5] [3.1.6]