Merge pull request '[aggregator graph] validation for URLs from oaf:fulltext' (#298) from fulltext_url_validation into beta

Reviewed-on: D-Net/dnet-hadoop#298
This commit is contained in:
Claudio Atzori 2023-06-12 09:55:35 +02:00
commit daa21ddbb5
5 changed files with 38 additions and 18 deletions

View File

@ -5,8 +5,6 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.createOpenaireId;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import java.util.stream.Collectors;
@ -17,7 +15,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.common.Constants;
@ -27,12 +24,13 @@ import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
public abstract class AbstractMdRecordToOafMapper {
protected final VocabularyGroup vocs;
protected static final UrlValidator URL_VALIDATOR = UrlValidator.getInstance();
private final boolean invisible;
private final boolean shouldHashId;
@ -393,7 +391,7 @@ public abstract class AbstractMdRecordToOafMapper {
r.setPublisher(preparePublisher(doc, info));
r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info));
r.setSource(prepareSources(doc, info));
r.setFulltext(prepareListFields(doc, "//oaf:fulltext", info));
r.setFulltext(prepareListURL(doc, "//oaf:fulltext", info));
r.setFormat(prepareFormats(doc, info));
r.setContributor(prepareContributors(doc, info));
r.setResourcetype(prepareResourceType(doc, info));
@ -672,6 +670,14 @@ public abstract class AbstractMdRecordToOafMapper {
qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust);
}
/**
 * Builds a list of fields from the string values selected by an XPath expression,
 * discarding every value that {@code URL_VALIDATOR} does not accept.
 *
 * @param node  the node the XPath expression is evaluated against
 * @param xpath the XPath expression selecting the candidate URL strings
 * @param info  the data info passed along to {@code listFields}
 * @return the fields built by {@code listFields} from the values that passed validation
 */
protected List<Field<String>> prepareListURL(final Node node, final String xpath, final DataInfo info) {
	// Validate first, so that only accepted values are turned into fields.
	final List<String> validUrls = prepareListString(node, xpath)
		.stream()
		.filter(URL_VALIDATOR::isValid)
		.collect(Collectors.toList());
	return listFields(info, validUrls);
}
/**
 * Evaluates an XPath expression on the given node and wraps the resulting string in a field.
 *
 * @param node  the node the XPath expression is evaluated against
 * @param xpath the XPath expression whose string value is extracted
 * @param info  the data info passed along to {@code field}
 * @return the field built by {@code field} from the extracted value
 */
protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) {
	final String value = node.valueOf(xpath);
	return field(value, info);
}
@ -695,13 +701,13 @@ public abstract class AbstractMdRecordToOafMapper {
}
/**
 * Filters a collection of candidate URLs, keeping only those accepted by {@code URL_VALIDATOR}.
 *
 * @param url the candidate URL strings; may be {@code null}
 * @return a new mutable {@link HashSet} holding the accepted values; empty when {@code url} is {@code null}
 */
protected Set<String> validateUrl(Collection<String> url) {
	if (Objects.isNull(url)) {
		return new HashSet<>();
	}
	// One pass through the shared validator is enough: filtering a second time with an
	// equivalent validator instance cannot change the result.
	return url
		.stream()
		.filter(URL_VALIDATOR::isValid)
		.collect(Collectors.toCollection(HashSet::new));
}

View File

@ -140,7 +140,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info);
final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
final Set<StructuredProperty> pids = pid.stream().collect(Collectors.toCollection(HashSet::new));
final Set<StructuredProperty> pids = new HashSet<>(pid);
instance
.setAlternateIdentifier(
@ -158,6 +158,12 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
instance
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
prepareListURL(doc, "//oaf:fulltext", info)
.stream()
.findFirst()
.map(Field::getValue)
.ifPresent(instance::setFulltext);
final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
final List<String> url = nodes
.stream()

View File

@ -144,7 +144,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info);
final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
final Set<StructuredProperty> pids = pid.stream().collect(Collectors.toCollection(HashSet::new));
final Set<StructuredProperty> pids = new HashSet<>(pid);
instance
.setAlternateIdentifier(
@ -161,6 +161,11 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
instance
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
prepareListURL(doc, "//oaf:fulltext", info)
.stream()
.findFirst()
.map(Field::getValue)
.ifPresent(instance::setFulltext);
final Set<String> url = new HashSet<>();
for (final Object o : doc

View File

@ -27,7 +27,6 @@ import eu.dnetlib.dhp.common.Constants;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -52,7 +51,7 @@ class MappersTest {
}
@Test
void testPublication() throws IOException, DocumentException {
void testPublication() throws IOException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_record.xml")));
@ -112,13 +111,17 @@ class MappersTest {
assertNotNull(i.getAccessright());
assertEquals("OPEN", i.getAccessright().getClassid());
});
assertEquals("0001", p.getInstance().get(0).getRefereed().getClassid());
assertNotNull(p.getInstance().get(0).getPid());
assertTrue(p.getInstance().get(0).getPid().isEmpty());
final Instance instance = p.getInstance().get(0);
assertEquals("0001", instance.getRefereed().getClassid());
assertNotNull(instance.getPid());
assertTrue(instance.getPid().isEmpty());
assertTrue(!p.getInstance().get(0).getAlternateIdentifier().isEmpty());
assertEquals("doi", p.getInstance().get(0).getAlternateIdentifier().get(0).getQualifier().getClassid());
assertEquals("10.3897/oneeco.2.e13718", p.getInstance().get(0).getAlternateIdentifier().get(0).getValue());
assertFalse(instance.getAlternateIdentifier().isEmpty());
assertEquals("doi", instance.getAlternateIdentifier().get(0).getQualifier().getClassid());
assertEquals("10.3897/oneeco.2.e13718", instance.getAlternateIdentifier().get(0).getValue());
assertNotNull(instance.getFulltext());
assertEquals("https://oneecosystem.pensoft.net/article/13718/", instance.getFulltext());
assertNotNull(p.getBestaccessright());
assertEquals("OPEN", p.getBestaccessright().getClassid());

View File

@ -807,7 +807,7 @@
<mockito-core.version>3.3.3</mockito-core.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version>
<vtd.version>[2.12,3.0)</vtd.version>
<dhp-schemas.version>[3.16.0]</dhp-schemas.version>
<dhp-schemas.version>[3.17.1-SNAPSHOT]</dhp-schemas.version>
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>