forked from D-Net/dnet-hadoop
Merge pull request '[aggregator graph] validation for URLs from oaf:fulltext' (#298) from fulltext_url_validation into beta
Reviewed-on: D-Net/dnet-hadoop#298
This commit is contained in:
commit
daa21ddbb5
|
@ -5,8 +5,6 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
|||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.createOpenaireId;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
|
@ -17,7 +15,6 @@ import org.slf4j.Logger;
|
|||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.dhp.common.Constants;
|
||||
|
@ -27,12 +24,13 @@ import eu.dnetlib.dhp.schema.common.ModelSupport;
|
|||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||
|
||||
public abstract class AbstractMdRecordToOafMapper {
|
||||
|
||||
protected final VocabularyGroup vocs;
|
||||
|
||||
protected static final UrlValidator URL_VALIDATOR = UrlValidator.getInstance();
|
||||
|
||||
private final boolean invisible;
|
||||
|
||||
private final boolean shouldHashId;
|
||||
|
@ -393,7 +391,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
r.setPublisher(preparePublisher(doc, info));
|
||||
r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info));
|
||||
r.setSource(prepareSources(doc, info));
|
||||
r.setFulltext(prepareListFields(doc, "//oaf:fulltext", info));
|
||||
r.setFulltext(prepareListURL(doc, "//oaf:fulltext", info));
|
||||
r.setFormat(prepareFormats(doc, info));
|
||||
r.setContributor(prepareContributors(doc, info));
|
||||
r.setResourcetype(prepareResourceType(doc, info));
|
||||
|
@ -672,6 +670,14 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust);
|
||||
}
|
||||
|
||||
protected List<Field<String>> prepareListURL(final Node node, final String xpath, final DataInfo info) {
|
||||
return listFields(
|
||||
info, prepareListString(node, xpath)
|
||||
.stream()
|
||||
.filter(URL_VALIDATOR::isValid)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) {
|
||||
return field(node.valueOf(xpath), info);
|
||||
}
|
||||
|
@ -695,13 +701,13 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
}
|
||||
|
||||
protected Set<String> validateUrl(Collection<String> url) {
|
||||
UrlValidator urlValidator = UrlValidator.getInstance();
|
||||
|
||||
if (Objects.isNull(url)) {
|
||||
return new HashSet<>();
|
||||
}
|
||||
return url
|
||||
.stream()
|
||||
.filter(u -> urlValidator.isValid(u))
|
||||
.filter(URL_VALIDATOR::isValid)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
|
||||
|
|
|
@ -140,7 +140,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info);
|
||||
final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
|
||||
|
||||
final Set<StructuredProperty> pids = pid.stream().collect(Collectors.toCollection(HashSet::new));
|
||||
final Set<StructuredProperty> pids = new HashSet<>(pid);
|
||||
|
||||
instance
|
||||
.setAlternateIdentifier(
|
||||
|
@ -158,6 +158,12 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
instance
|
||||
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
||||
|
||||
prepareListURL(doc, "//oaf:fulltext", info)
|
||||
.stream()
|
||||
.findFirst()
|
||||
.map(Field::getValue)
|
||||
.ifPresent(instance::setFulltext);
|
||||
|
||||
final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
|
||||
final List<String> url = nodes
|
||||
.stream()
|
||||
|
|
|
@ -144,7 +144,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info);
|
||||
final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
|
||||
|
||||
final Set<StructuredProperty> pids = pid.stream().collect(Collectors.toCollection(HashSet::new));
|
||||
final Set<StructuredProperty> pids = new HashSet<>(pid);
|
||||
|
||||
instance
|
||||
.setAlternateIdentifier(
|
||||
|
@ -161,6 +161,11 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
||||
instance
|
||||
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
||||
prepareListURL(doc, "//oaf:fulltext", info)
|
||||
.stream()
|
||||
.findFirst()
|
||||
.map(Field::getValue)
|
||||
.ifPresent(instance::setFulltext);
|
||||
|
||||
final Set<String> url = new HashSet<>();
|
||||
for (final Object o : doc
|
||||
|
|
|
@ -27,7 +27,6 @@ import eu.dnetlib.dhp.common.Constants;
|
|||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
@ -52,7 +51,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testPublication() throws IOException, DocumentException {
|
||||
void testPublication() throws IOException {
|
||||
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_record.xml")));
|
||||
|
||||
|
@ -112,13 +111,17 @@ class MappersTest {
|
|||
assertNotNull(i.getAccessright());
|
||||
assertEquals("OPEN", i.getAccessright().getClassid());
|
||||
});
|
||||
assertEquals("0001", p.getInstance().get(0).getRefereed().getClassid());
|
||||
assertNotNull(p.getInstance().get(0).getPid());
|
||||
assertTrue(p.getInstance().get(0).getPid().isEmpty());
|
||||
final Instance instance = p.getInstance().get(0);
|
||||
assertEquals("0001", instance.getRefereed().getClassid());
|
||||
assertNotNull(instance.getPid());
|
||||
assertTrue(instance.getPid().isEmpty());
|
||||
|
||||
assertTrue(!p.getInstance().get(0).getAlternateIdentifier().isEmpty());
|
||||
assertEquals("doi", p.getInstance().get(0).getAlternateIdentifier().get(0).getQualifier().getClassid());
|
||||
assertEquals("10.3897/oneeco.2.e13718", p.getInstance().get(0).getAlternateIdentifier().get(0).getValue());
|
||||
assertFalse(instance.getAlternateIdentifier().isEmpty());
|
||||
assertEquals("doi", instance.getAlternateIdentifier().get(0).getQualifier().getClassid());
|
||||
assertEquals("10.3897/oneeco.2.e13718", instance.getAlternateIdentifier().get(0).getValue());
|
||||
|
||||
assertNotNull(instance.getFulltext());
|
||||
assertEquals("https://oneecosystem.pensoft.net/article/13718/", instance.getFulltext());
|
||||
|
||||
assertNotNull(p.getBestaccessright());
|
||||
assertEquals("OPEN", p.getBestaccessright().getClassid());
|
||||
|
|
2
pom.xml
2
pom.xml
|
@ -807,7 +807,7 @@
|
|||
<mockito-core.version>3.3.3</mockito-core.version>
|
||||
<mongodb.driver.version>3.4.2</mongodb.driver.version>
|
||||
<vtd.version>[2.12,3.0)</vtd.version>
|
||||
<dhp-schemas.version>[3.16.0]</dhp-schemas.version>
|
||||
<dhp-schemas.version>[3.17.1-SNAPSHOT]</dhp-schemas.version>
|
||||
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
|
||||
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
|
||||
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
|
||||
|
|
Loading…
Reference in New Issue