ProvisionConstants moved as ModelHardLimits in dhp-common and applied to truncate long abstracts (len > 150000). Further filtering for empty PID values

This commit is contained in:
Claudio Atzori 2020-10-30 10:56:42 +01:00
parent 266bf1a221
commit 58f28296ea
8 changed files with 75 additions and 32 deletions

View File

@ -1,14 +1,14 @@
package eu.dnetlib.dhp.oa.provision; package eu.dnetlib.dhp.schema.oaf;
public class ProvisionConstants { public class ModelHardLimits {
public static final int MAX_EXTERNAL_ENTITIES = 50; public static final int MAX_EXTERNAL_ENTITIES = 50;
public static final int MAX_AUTHORS = 200; public static final int MAX_AUTHORS = 200;
public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000; public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
public static final int MAX_TITLE_LENGTH = 5000; public static final int MAX_TITLE_LENGTH = 5000;
public static final int MAX_TITLES = 10; public static final int MAX_TITLES = 10;
public static final int MAX_ABSTRACT_LENGTH = 100000; public static final int MAX_ABSTRACT_LENGTH = 150000;
public static final int MAX_INSTANCES = 10; public static final int MAX_INSTANCES = 10;
} }

View File

@ -3,6 +3,7 @@ package eu.dnetlib.dhp.schema.oaf.utils;
import java.io.Serializable; import java.io.Serializable;
import java.util.Objects; import java.util.Objects;
import java.util.Optional;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
@ -15,12 +16,21 @@ import eu.dnetlib.dhp.utils.DHPUtils;
*/ */
public class IdentifierFactory implements Serializable { public class IdentifierFactory implements Serializable {
public static final String DOI_URL_PREFIX = "^http(s?):\\/\\/(dx\\.)?doi\\.org\\/";
public static final String ID_SEPARATOR = "::"; public static final String ID_SEPARATOR = "::";
public static final String ID_PREFIX_SEPARATOR = "|"; public static final String ID_PREFIX_SEPARATOR = "|";
public final static String ID_REGEX = "^[0-9][0-9]\\" + ID_PREFIX_SEPARATOR + ".{12}" + ID_SEPARATOR public final static String ID_REGEX = "^[0-9][0-9]\\" + ID_PREFIX_SEPARATOR + ".{12}" + ID_SEPARATOR
+ "[a-zA-Z0-9]{32}$"; + "[a-zA-Z0-9]{32}$";
public static final int ID_PREFIX_LEN = 12; public static final int ID_PREFIX_LEN = 12;
/**
* Creates an identifier from the most relevant PID (if available) in the given entity T. Returns entity.id
* when no PID is available
* @param entity the entity providing PIDs and a default ID.
* @param <T> the specific entity type. Currently Organization and Result subclasses are supported.
* @return an identifier from the most relevant PID, entity.id otherwise
*/
public static <T extends OafEntity> String createIdentifier(T entity) { public static <T extends OafEntity> String createIdentifier(T entity) {
if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) { if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) {
@ -32,12 +42,33 @@ public class IdentifierFactory implements Serializable {
.stream() .stream()
.filter(s -> Objects.nonNull(s.getQualifier())) .filter(s -> Objects.nonNull(s.getQualifier()))
.filter(s -> PidType.isValid(s.getQualifier().getClassid())) .filter(s -> PidType.isValid(s.getQualifier().getClassid()))
.filter(s -> StringUtils.isNotBlank(StringUtils.trim(s.getValue())))
.min(new PidComparator<>(entity)) .min(new PidComparator<>(entity))
.map(s -> idFromPid(entity, s)) .map(s -> idFromPid(entity, s))
.map(IdentifierFactory::verifyIdSyntax) .map(IdentifierFactory::verifyIdSyntax)
.orElseGet(entity::getId); .orElseGet(entity::getId);
} }
/**
* Utility method that normalises PID values on a per-type basis.
* @param pid the PID whose value will be normalised.
* @return the PID containing the normalised value.
*/
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
String value = Optional
.ofNullable(pid.getValue())
.map(String::trim)
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
switch (pid.getQualifier().getClassid()) {
// TODO add cleaning for more PID types as needed
case "doi":
pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX, ""));
break;
}
return pid;
}
private static String verifyIdSyntax(String s) { private static String verifyIdSyntax(String s) {
if (StringUtils.isBlank(s) || !s.matches(ID_REGEX)) { if (StringUtils.isBlank(s) || !s.matches(ID_REGEX)) {
throw new RuntimeException(String.format("malformed id: '%s'", s)); throw new RuntimeException(String.format("malformed id: '%s'", s));
@ -52,15 +83,10 @@ public class IdentifierFactory implements Serializable {
.append(ID_PREFIX_SEPARATOR) .append(ID_PREFIX_SEPARATOR)
.append(createPrefix(s.getQualifier().getClassid())) .append(createPrefix(s.getQualifier().getClassid()))
.append(ID_SEPARATOR) .append(ID_SEPARATOR)
.append(DHPUtils.md5(normalizePidValue(s.getValue()))) .append(DHPUtils.md5(normalizePidValue(s).getValue()))
.toString(); .toString();
} }
private static String normalizePidValue(String value) {
// TODO more aggressive cleaning? keep only alphanum and punctuation?
return value.toLowerCase().replaceAll(" ", "");
}
// create the prefix (length = 12) // create the prefix (length = 12)
private static String createPrefix(String pidType) { private static String createPrefix(String pidType) {
StringBuilder prefix = new StringBuilder(StringUtils.left(pidType, ID_PREFIX_LEN)); StringBuilder prefix = new StringBuilder(StringUtils.left(pidType, ID_PREFIX_LEN));

View File

@ -561,4 +561,5 @@ public abstract class AbstractMdRecordToOafMapper {
} }
return res; return res;
} }
} }

View File

@ -116,17 +116,9 @@ public class GenerateEntitiesApplication {
private static Oaf merge(final Oaf o1, final Oaf o2) { private static Oaf merge(final Oaf o1, final Oaf o2) {
if (ModelSupport.isSubClass(o1, OafEntity.class)) { if (ModelSupport.isSubClass(o1, OafEntity.class)) {
if (ModelSupport.isSubClass(o1, Result.class)) { if (ModelSupport.isSubClass(o1, Result.class)) {
if (ModelSupport.isSubClass(o1, Publication.class)) {
((Publication) o1).mergeFrom((Publication) o2); // We cannot further specify the result type as different result types might share the same ID
} else if (ModelSupport.isSubClass(o1, Dataset.class)) { ((Result) o1).mergeFrom((Result) o2);
((Dataset) o1).mergeFrom((Dataset) o2);
} else if (ModelSupport.isSubClass(o1, Software.class)) {
((Software) o1).mergeFrom((Software) o2);
} else if (ModelSupport.isSubClass(o1, OtherResearchProduct.class)) {
((OtherResearchProduct) o1).mergeFrom((OtherResearchProduct) o2);
} else {
throw new RuntimeException("invalid Result subtype:" + o1.getClass().getCanonicalName());
}
} else if (ModelSupport.isSubClass(o1, Datasource.class)) { } else if (ModelSupport.isSubClass(o1, Datasource.class)) {
((Datasource) o1).mergeFrom((Datasource) o2); ((Datasource) o1).mergeFrom((Datasource) o2);
} else if (ModelSupport.isSubClass(o1, Organization.class)) { } else if (ModelSupport.isSubClass(o1, Organization.class)) {

View File

@ -8,6 +8,7 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -20,6 +21,7 @@ import com.google.common.collect.Lists;
import eu.dnetlib.dhp.common.PacePerson; import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
public class OafToOafMapper extends AbstractMdRecordToOafMapper { public class OafToOafMapper extends AbstractMdRecordToOafMapper {
@ -85,7 +87,13 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
@Override @Override
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) { protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:description", info); return prepareListFields(doc, "//dc:description", info)
.stream()
.map(d -> {
d.setValue(StringUtils.left(d.getValue(), ModelHardLimits.MAX_ABSTRACT_LENGTH));
return d;
})
.collect(Collectors.toList());
} }
@Override @Override
@ -283,6 +291,10 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
@Override @Override
protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) { protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) {
return prepareListStructPropsWithValidQualifier( return prepareListStructPropsWithValidQualifier(
doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info); doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info)
.stream()
.map(IdentifierFactory::normalizePidValue)
.collect(Collectors.toList());
} }
} }

View File

@ -11,6 +11,7 @@ import java.util.Arrays;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document; import org.dom4j.Document;
@ -22,6 +23,7 @@ import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
public class OdfToOafMapper extends AbstractMdRecordToOafMapper { public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
@ -191,7 +193,13 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
@Override @Override
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) { protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info); return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info)
.stream()
.map(d -> {
d.setValue(StringUtils.left(d.getValue(), ModelHardLimits.MAX_ABSTRACT_LENGTH));
return d;
})
.collect(Collectors.toList());
} }
@Override @Override
@ -371,7 +379,11 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
doc, doc,
"//datacite:alternateIdentifier[@alternateIdentifierType != 'URL' and @alternateIdentifierType != 'landingPage']", "//datacite:alternateIdentifier[@alternateIdentifierType != 'URL' and @alternateIdentifierType != 'landingPage']",
"@alternateIdentifierType", DNET_PID_TYPES, info)); "@alternateIdentifierType", DNET_PID_TYPES, info));
return Lists.newArrayList(res);
return res
.stream()
.map(IdentifierFactory::normalizePidValue)
.collect(Collectors.toList());
} }
} }

View File

@ -164,7 +164,7 @@ public class CreateRelatedEntitiesJob_phase1 {
if (result.getTitle() != null && !result.getTitle().isEmpty()) { if (result.getTitle() != null && !result.getTitle().isEmpty()) {
final StructuredProperty title = result.getTitle().stream().findFirst().get(); final StructuredProperty title = result.getTitle().stream().findFirst().get();
title.setValue(StringUtils.left(title.getValue(), ProvisionConstants.MAX_TITLE_LENGTH)); title.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
re.setTitle(title); re.setTitle(title);
} }
@ -178,7 +178,7 @@ public class CreateRelatedEntitiesJob_phase1 {
.getInstance() .getInstance()
.stream() .stream()
.filter(Objects::nonNull) .filter(Objects::nonNull)
.limit(ProvisionConstants.MAX_INSTANCES) .limit(ModelHardLimits.MAX_INSTANCES)
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }

View File

@ -240,15 +240,15 @@ public class CreateRelatedEntitiesJob_phase2 {
List<ExternalReference> refs = r List<ExternalReference> refs = r
.getExternalReference() .getExternalReference()
.stream() .stream()
.limit(ProvisionConstants.MAX_EXTERNAL_ENTITIES) .limit(ModelHardLimits.MAX_EXTERNAL_ENTITIES)
.collect(Collectors.toList()); .collect(Collectors.toList());
r.setExternalReference(refs); r.setExternalReference(refs);
} }
if (r.getAuthor() != null) { if (r.getAuthor() != null) {
List<Author> authors = Lists.newArrayList(); List<Author> authors = Lists.newArrayList();
for (Author a : r.getAuthor()) { for (Author a : r.getAuthor()) {
a.setFullname(StringUtils.left(a.getFullname(), ProvisionConstants.MAX_AUTHOR_FULLNAME_LENGTH)); a.setFullname(StringUtils.left(a.getFullname(), ModelHardLimits.MAX_AUTHOR_FULLNAME_LENGTH));
if (authors.size() < ProvisionConstants.MAX_AUTHORS || hasORCID(a)) { if (authors.size() < ModelHardLimits.MAX_AUTHORS || hasORCID(a)) {
authors.add(a); authors.add(a);
} }
} }
@ -260,7 +260,7 @@ public class CreateRelatedEntitiesJob_phase2 {
.stream() .stream()
.filter(Objects::nonNull) .filter(Objects::nonNull)
.map(d -> { .map(d -> {
d.setValue(StringUtils.left(d.getValue(), ProvisionConstants.MAX_ABSTRACT_LENGTH)); d.setValue(StringUtils.left(d.getValue(), ModelHardLimits.MAX_ABSTRACT_LENGTH));
return d; return d;
}) })
.collect(Collectors.toList()); .collect(Collectors.toList());
@ -272,10 +272,10 @@ public class CreateRelatedEntitiesJob_phase2 {
.stream() .stream()
.filter(Objects::nonNull) .filter(Objects::nonNull)
.map(t -> { .map(t -> {
t.setValue(StringUtils.left(t.getValue(), ProvisionConstants.MAX_TITLE_LENGTH)); t.setValue(StringUtils.left(t.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
return t; return t;
}) })
.limit(ProvisionConstants.MAX_TITLES) .limit(ModelHardLimits.MAX_TITLES)
.collect(Collectors.toList()); .collect(Collectors.toList());
r.setTitle(titles); r.setTitle(titles);
} }