forked from D-Net/dnet-hadoop
[aggregator graph] added validation for URLs mapped from oaf:fulltext
This commit is contained in:
parent
a235d2a24a
commit
e45777e7e1
|
@ -5,8 +5,6 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
||||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.createOpenaireId;
|
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.createOpenaireId;
|
||||||
|
|
||||||
import java.net.MalformedURLException;
|
|
||||||
import java.net.URL;
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
@ -17,7 +15,6 @@ import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Maps;
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.common.Constants;
|
import eu.dnetlib.dhp.common.Constants;
|
||||||
|
@ -27,12 +24,13 @@ import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
|
||||||
|
|
||||||
public abstract class AbstractMdRecordToOafMapper {
|
public abstract class AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
protected final VocabularyGroup vocs;
|
protected final VocabularyGroup vocs;
|
||||||
|
|
||||||
|
protected static final UrlValidator URL_VALIDATOR = UrlValidator.getInstance();
|
||||||
|
|
||||||
private final boolean invisible;
|
private final boolean invisible;
|
||||||
|
|
||||||
private final boolean shouldHashId;
|
private final boolean shouldHashId;
|
||||||
|
@ -393,7 +391,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
r.setPublisher(preparePublisher(doc, info));
|
r.setPublisher(preparePublisher(doc, info));
|
||||||
r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info));
|
r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info));
|
||||||
r.setSource(prepareSources(doc, info));
|
r.setSource(prepareSources(doc, info));
|
||||||
r.setFulltext(prepareListFields(doc, "//oaf:fulltext", info));
|
r.setFulltext(prepareListURL(doc, "//oaf:fulltext", info));
|
||||||
r.setFormat(prepareFormats(doc, info));
|
r.setFormat(prepareFormats(doc, info));
|
||||||
r.setContributor(prepareContributors(doc, info));
|
r.setContributor(prepareContributors(doc, info));
|
||||||
r.setResourcetype(prepareResourceType(doc, info));
|
r.setResourcetype(prepareResourceType(doc, info));
|
||||||
|
@ -672,6 +670,14 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust);
|
qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds a list of fields from the strings that {@code prepareListString(node, xpath)} returns,
 * keeping only the values for which {@code URL_VALIDATOR.isValid} is true.
 *
 * @param node  node handed to {@code prepareListString} together with {@code xpath}
 * @param xpath expression handed to {@code prepareListString}
 * @param info  passed unchanged to {@code listFields}
 * @return the result of {@code listFields} applied to the values that passed validation
 */
protected List<Field<String>> prepareListURL(final Node node, final String xpath, final DataInfo info) {
|
||||||
|
return listFields(
|
||||||
|
info, prepareListString(node, xpath)
|
||||||
|
.stream()
|
||||||
|
.filter(URL_VALIDATOR::isValid)
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
}
|
||||||
|
|
||||||
protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) {
|
protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) {
|
||||||
return field(node.valueOf(xpath), info);
|
return field(node.valueOf(xpath), info);
|
||||||
}
|
}
|
||||||
|
@ -695,13 +701,13 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Filters a collection of strings down to those accepted by the URL validator.
 *
 * @param url candidate values; may be {@code null}
 * @return a new, empty {@code HashSet} when {@code url} is {@code null}; otherwise a new
 *         {@code HashSet} holding the elements for which the validator's {@code isValid} is true
 */
protected Set<String> validateUrl(Collection<String> url) {
|
protected Set<String> validateUrl(Collection<String> url) {
|
||||||
UrlValidator urlValidator = UrlValidator.getInstance();
|
|
||||||
if (Objects.isNull(url)) {
|
if (Objects.isNull(url)) {
|
||||||
return new HashSet<>();
|
return new HashSet<>();
|
||||||
}
|
}
|
||||||
return url
|
return url
|
||||||
.stream()
|
.stream()
|
||||||
.filter(u -> urlValidator.isValid(u))
|
.filter(URL_VALIDATOR::isValid)
|
||||||
.collect(Collectors.toCollection(HashSet::new));
|
.collect(Collectors.toCollection(HashSet::new));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -140,7 +140,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info);
|
final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info);
|
||||||
final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
|
final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
|
||||||
|
|
||||||
final Set<StructuredProperty> pids = pid.stream().collect(Collectors.toCollection(HashSet::new));
|
final Set<StructuredProperty> pids = new HashSet<>(pid);
|
||||||
|
|
||||||
instance
|
instance
|
||||||
.setAlternateIdentifier(
|
.setAlternateIdentifier(
|
||||||
|
@ -158,6 +158,12 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
instance
|
instance
|
||||||
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
||||||
|
|
||||||
|
prepareListURL(doc, "//oaf:fulltext", info)
|
||||||
|
.stream()
|
||||||
|
.findFirst()
|
||||||
|
.map(Field::getValue)
|
||||||
|
.ifPresent(instance::setFulltext);
|
||||||
|
|
||||||
final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
|
final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
|
||||||
final List<String> url = nodes
|
final List<String> url = nodes
|
||||||
.stream()
|
.stream()
|
||||||
|
|
|
@ -144,7 +144,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info);
|
final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info);
|
||||||
final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
|
final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
|
||||||
|
|
||||||
final Set<StructuredProperty> pids = pid.stream().collect(Collectors.toCollection(HashSet::new));
|
final Set<StructuredProperty> pids = new HashSet<>(pid);
|
||||||
|
|
||||||
instance
|
instance
|
||||||
.setAlternateIdentifier(
|
.setAlternateIdentifier(
|
||||||
|
@ -161,6 +161,11 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
||||||
instance
|
instance
|
||||||
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
||||||
|
prepareListURL(doc, "//oaf:fulltext", info)
|
||||||
|
.stream()
|
||||||
|
.findFirst()
|
||||||
|
.map(Field::getValue)
|
||||||
|
.ifPresent(instance::setFulltext);
|
||||||
|
|
||||||
final Set<String> url = new HashSet<>();
|
final Set<String> url = new HashSet<>();
|
||||||
for (final Object o : doc
|
for (final Object o : doc
|
||||||
|
|
|
@ -27,7 +27,6 @@ import eu.dnetlib.dhp.common.Constants;
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
@ -52,7 +51,7 @@ class MappersTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testPublication() throws IOException, DocumentException {
|
void testPublication() throws IOException {
|
||||||
|
|
||||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_record.xml")));
|
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_record.xml")));
|
||||||
|
|
||||||
|
@ -112,13 +111,17 @@ class MappersTest {
|
||||||
assertNotNull(i.getAccessright());
|
assertNotNull(i.getAccessright());
|
||||||
assertEquals("OPEN", i.getAccessright().getClassid());
|
assertEquals("OPEN", i.getAccessright().getClassid());
|
||||||
});
|
});
|
||||||
assertEquals("0001", p.getInstance().get(0).getRefereed().getClassid());
|
final Instance instance = p.getInstance().get(0);
|
||||||
assertNotNull(p.getInstance().get(0).getPid());
|
assertEquals("0001", instance.getRefereed().getClassid());
|
||||||
assertTrue(p.getInstance().get(0).getPid().isEmpty());
|
assertNotNull(instance.getPid());
|
||||||
|
assertTrue(instance.getPid().isEmpty());
|
||||||
|
|
||||||
assertTrue(!p.getInstance().get(0).getAlternateIdentifier().isEmpty());
|
assertFalse(instance.getAlternateIdentifier().isEmpty());
|
||||||
assertEquals("doi", p.getInstance().get(0).getAlternateIdentifier().get(0).getQualifier().getClassid());
|
assertEquals("doi", instance.getAlternateIdentifier().get(0).getQualifier().getClassid());
|
||||||
assertEquals("10.3897/oneeco.2.e13718", p.getInstance().get(0).getAlternateIdentifier().get(0).getValue());
|
assertEquals("10.3897/oneeco.2.e13718", instance.getAlternateIdentifier().get(0).getValue());
|
||||||
|
|
||||||
|
assertNotNull(instance.getFulltext());
|
||||||
|
assertEquals("https://oneecosystem.pensoft.net/article/13718/", instance.getFulltext());
|
||||||
|
|
||||||
assertNotNull(p.getBestaccessright());
|
assertNotNull(p.getBestaccessright());
|
||||||
assertEquals("OPEN", p.getBestaccessright().getClassid());
|
assertEquals("OPEN", p.getBestaccessright().getClassid());
|
||||||
|
|
2
pom.xml
2
pom.xml
|
@ -807,7 +807,7 @@
|
||||||
<mockito-core.version>3.3.3</mockito-core.version>
|
<mockito-core.version>3.3.3</mockito-core.version>
|
||||||
<mongodb.driver.version>3.4.2</mongodb.driver.version>
|
<mongodb.driver.version>3.4.2</mongodb.driver.version>
|
||||||
<vtd.version>[2.12,3.0)</vtd.version>
|
<vtd.version>[2.12,3.0)</vtd.version>
|
||||||
<dhp-schemas.version>[3.16.0]</dhp-schemas.version>
|
<dhp-schemas.version>[3.17.1-SNAPSHOT]</dhp-schemas.version>
|
||||||
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
|
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
|
||||||
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
|
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
|
||||||
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
|
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
|
||||||
|
|
Loading…
Reference in New Issue