From 58f28296ea2a8eb2ed1e2c9c68f9a35107a19da1 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 30 Oct 2020 10:56:42 +0100 Subject: [PATCH] ProvisionConstants moved as ModelHardLimits in dhp-common and applied to truncate long abstracts (len > 150000). Further filtering for empty PID values --- .../dhp/schema/oaf/ModelHardLimits.java | 6 +-- .../schema/oaf/utils/IdentifierFactory.java | 38 ++++++++++++++++--- .../raw/AbstractMdRecordToOafMapper.java | 1 + .../raw/GenerateEntitiesApplication.java | 14 ++----- .../dhp/oa/graph/raw/OafToOafMapper.java | 16 +++++++- .../dhp/oa/graph/raw/OdfToOafMapper.java | 16 +++++++- .../CreateRelatedEntitiesJob_phase1.java | 4 +- .../CreateRelatedEntitiesJob_phase2.java | 12 +++--- 8 files changed, 75 insertions(+), 32 deletions(-) rename dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java => dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ModelHardLimits.java (70%) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ModelHardLimits.java similarity index 70% rename from dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java rename to dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ModelHardLimits.java index 9bc3706cdd..16fdc3760d 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ModelHardLimits.java @@ -1,14 +1,14 @@ -package eu.dnetlib.dhp.oa.provision; +package eu.dnetlib.dhp.schema.oaf; -public class ProvisionConstants { +public class ModelHardLimits { public static final int MAX_EXTERNAL_ENTITIES = 50; public static final int MAX_AUTHORS = 200; public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000; public static final int MAX_TITLE_LENGTH = 5000; public static final int MAX_TITLES = 10; - public static final int MAX_ABSTRACT_LENGTH = 100000; + public static final int MAX_ABSTRACT_LENGTH = 150000; public static final int MAX_INSTANCES = 10; } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java index 183f689048..3059459d66 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.schema.oaf.utils; import java.io.Serializable; import java.util.Objects; +import java.util.Optional; import org.apache.commons.lang.StringUtils; @@ -15,12 +16,21 @@ import eu.dnetlib.dhp.utils.DHPUtils; */ public class IdentifierFactory implements Serializable { + public static final String DOI_URL_PREFIX = "^http(s?):\\/\\/(dx\\.)?doi\\.org\\/"; + public static final String ID_SEPARATOR = "::"; public static final String ID_PREFIX_SEPARATOR = "|"; public final static String ID_REGEX = "^[0-9][0-9]\\" + ID_PREFIX_SEPARATOR + ".{12}" + ID_SEPARATOR + "[a-zA-Z0-9]{32}$"; public static final int ID_PREFIX_LEN = 12; + /** + * Creates an identifier from the most relevant PID (if available) in the given entity T. Returns entity.id + * when no PID is available + * @param entity the entity providing PIDs and a default ID. + * @param the specific entity type. Currently Organization and Result subclasses are supported. + * @return an identifier from the most relevant PID, entity.id otherwise + */ public static String createIdentifier(T entity) { if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) { @@ -32,12 +42,33 @@ public class IdentifierFactory implements Serializable { .stream() .filter(s -> Objects.nonNull(s.getQualifier())) .filter(s -> PidType.isValid(s.getQualifier().getClassid())) + .filter(s -> StringUtils.isNotBlank(StringUtils.trim(s.getValue()))) .min(new PidComparator<>(entity)) .map(s -> idFromPid(entity, s)) .map(IdentifierFactory::verifyIdSyntax) .orElseGet(entity::getId); } + /** + * Utility method that normalises PID values on a per-type basis. + * @param pid the PID whose value will be normalised. + * @return the PID containing the normalised value. + */ + public static StructuredProperty normalizePidValue(StructuredProperty pid) { + String value = Optional + .ofNullable(pid.getValue()) + .map(String::trim) + .orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty")); + switch (pid.getQualifier().getClassid()) { + + // TODO add cleaning for more PID types as needed + case "doi": + pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX, "")); + break; + } + return pid; + } + private static String verifyIdSyntax(String s) { if (StringUtils.isBlank(s) || !s.matches(ID_REGEX)) { throw new RuntimeException(String.format("malformed id: '%s'", s)); @@ -52,15 +83,10 @@ public class IdentifierFactory implements Serializable { .append(ID_PREFIX_SEPARATOR) .append(createPrefix(s.getQualifier().getClassid())) .append(ID_SEPARATOR) - .append(DHPUtils.md5(normalizePidValue(s.getValue()))) + .append(DHPUtils.md5(normalizePidValue(s).getValue())) .toString(); } - private static String normalizePidValue(String value) { - // TODO more aggressive cleaning? keep only alphanum and punctuation? - return value.toLowerCase().replaceAll(" ", ""); - } - // create the prefix (length = 12) private static String createPrefix(String pidType) { StringBuilder prefix = new StringBuilder(StringUtils.left(pidType, ID_PREFIX_LEN)); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 37c479e5f1..57e0aa49e7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -561,4 +561,5 @@ public abstract class AbstractMdRecordToOafMapper { } return res; } + } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index 7b13e500e7..2dbe4eb836 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -116,17 +116,9 @@ public class GenerateEntitiesApplication { private static Oaf merge(final Oaf o1, final Oaf o2) { if (ModelSupport.isSubClass(o1, OafEntity.class)) { if (ModelSupport.isSubClass(o1, Result.class)) { - if (ModelSupport.isSubClass(o1, Publication.class)) { - ((Publication) o1).mergeFrom((Publication) o2); - } else if (ModelSupport.isSubClass(o1, Dataset.class)) { - ((Dataset) o1).mergeFrom((Dataset) o2); - } else if (ModelSupport.isSubClass(o1, Software.class)) { - ((Software) o1).mergeFrom((Software) o2); - } else if (ModelSupport.isSubClass(o1, OtherResearchProduct.class)) { - ((OtherResearchProduct) o1).mergeFrom((OtherResearchProduct) o2); - } else { - throw new RuntimeException("invalid Result subtype:" + o1.getClass().getCanonicalName()); - } + + // We cannot further specify the result type as different result types might share the same ID + ((Result) o1).mergeFrom((Result) o2); } else if (ModelSupport.isSubClass(o1, Datasource.class)) { ((Datasource) o1).mergeFrom((Datasource) o2); } else if (ModelSupport.isSubClass(o1, Organization.class)) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 543d22eeb7..4813a202b1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -8,6 +8,7 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import java.util.ArrayList; import java.util.List; +import java.util.Optional; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; @@ -20,6 +21,7 @@ import com.google.common.collect.Lists; import eu.dnetlib.dhp.common.PacePerson; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; public class OafToOafMapper extends AbstractMdRecordToOafMapper { @@ -85,7 +87,13 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { @Override protected List> prepareDescriptions(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//dc:description", info); + return prepareListFields(doc, "//dc:description", info) + .stream() + .map(d -> { + d.setValue(StringUtils.left(d.getValue(), ModelHardLimits.MAX_ABSTRACT_LENGTH)); + return d; + }) + .collect(Collectors.toList()); } @Override @@ -283,6 +291,10 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { @Override protected List prepareResultPids(final Document doc, final DataInfo info) { return prepareListStructPropsWithValidQualifier( - doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info); + doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info) + .stream() + .map(IdentifierFactory::normalizePidValue) + .collect(Collectors.toList()); } + } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 0319cb89e1..d819de1cbe 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -11,6 +11,7 @@ import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; @@ -22,6 +23,7 @@ import eu.dnetlib.dhp.common.PacePerson; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @@ -191,7 +193,13 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @Override protected List> prepareDescriptions(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info); + return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info) + .stream() + .map(d -> { + d.setValue(StringUtils.left(d.getValue(), ModelHardLimits.MAX_ABSTRACT_LENGTH)); + return d; + }) + .collect(Collectors.toList()); } @Override @@ -371,7 +379,11 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { doc, "//datacite:alternateIdentifier[@alternateIdentifierType != 'URL' and @alternateIdentifierType != 'landingPage']", "@alternateIdentifierType", DNET_PID_TYPES, info)); - return Lists.newArrayList(res); + + return res + .stream() + .map(IdentifierFactory::normalizePidValue) + .collect(Collectors.toList()); } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java index b08e593f73..d404850ebe 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java @@ -164,7 +164,7 @@ public class CreateRelatedEntitiesJob_phase1 { if (result.getTitle() != null && !result.getTitle().isEmpty()) { final StructuredProperty title = result.getTitle().stream().findFirst().get(); - title.setValue(StringUtils.left(title.getValue(), ProvisionConstants.MAX_TITLE_LENGTH)); + title.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH)); re.setTitle(title); } @@ -178,7 +178,7 @@ public class CreateRelatedEntitiesJob_phase1 { .getInstance() .stream() .filter(Objects::nonNull) - .limit(ProvisionConstants.MAX_INSTANCES) + .limit(ModelHardLimits.MAX_INSTANCES) .collect(Collectors.toList())); } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java index 7e175121e5..e32fe020b9 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java @@ -240,15 +240,15 @@ public class CreateRelatedEntitiesJob_phase2 { List refs = r .getExternalReference() .stream() - .limit(ProvisionConstants.MAX_EXTERNAL_ENTITIES) + .limit(ModelHardLimits.MAX_EXTERNAL_ENTITIES) .collect(Collectors.toList()); r.setExternalReference(refs); } if (r.getAuthor() != null) { List authors = Lists.newArrayList(); for (Author a : r.getAuthor()) { - a.setFullname(StringUtils.left(a.getFullname(), ProvisionConstants.MAX_AUTHOR_FULLNAME_LENGTH)); - if (authors.size() < ProvisionConstants.MAX_AUTHORS || hasORCID(a)) { + a.setFullname(StringUtils.left(a.getFullname(), ModelHardLimits.MAX_AUTHOR_FULLNAME_LENGTH)); + if (authors.size() < ModelHardLimits.MAX_AUTHORS || hasORCID(a)) { authors.add(a); } } @@ -260,7 +260,7 @@ public class CreateRelatedEntitiesJob_phase2 { .stream() .filter(Objects::nonNull) .map(d -> { - d.setValue(StringUtils.left(d.getValue(), ProvisionConstants.MAX_ABSTRACT_LENGTH)); + d.setValue(StringUtils.left(d.getValue(), ModelHardLimits.MAX_ABSTRACT_LENGTH)); return d; }) .collect(Collectors.toList()); @@ -272,10 +272,10 @@ public class CreateRelatedEntitiesJob_phase2 { .stream() .filter(Objects::nonNull) .map(t -> { - t.setValue(StringUtils.left(t.getValue(), ProvisionConstants.MAX_TITLE_LENGTH)); + t.setValue(StringUtils.left(t.getValue(), ModelHardLimits.MAX_TITLE_LENGTH)); return t; }) - .limit(ProvisionConstants.MAX_TITLES) + .limit(ModelHardLimits.MAX_TITLES) .collect(Collectors.toList()); r.setTitle(titles); }