forked from antonis.lempesis/dnet-hadoop
ProvisionConstants moved to dhp-common as ModelHardLimits and applied to truncate long abstracts (length > 150000 characters). Added further filtering of empty PID values.
This commit is contained in:
parent
266bf1a221
commit
58f28296ea
|
@ -1,14 +1,14 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.provision;
|
||||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
public class ProvisionConstants {
|
||||
public class ModelHardLimits {
|
||||
|
||||
public static final int MAX_EXTERNAL_ENTITIES = 50;
|
||||
public static final int MAX_AUTHORS = 200;
|
||||
public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
|
||||
public static final int MAX_TITLE_LENGTH = 5000;
|
||||
public static final int MAX_TITLES = 10;
|
||||
public static final int MAX_ABSTRACT_LENGTH = 100000;
|
||||
public static final int MAX_ABSTRACT_LENGTH = 150000;
|
||||
public static final int MAX_INSTANCES = 10;
|
||||
|
||||
}
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.schema.oaf.utils;
|
|||
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
|
@ -15,12 +16,21 @@ import eu.dnetlib.dhp.utils.DHPUtils;
|
|||
*/
|
||||
public class IdentifierFactory implements Serializable {
|
||||
|
||||
public static final String DOI_URL_PREFIX = "^http(s?):\\/\\/(dx\\.)?doi\\.org\\/";
|
||||
|
||||
public static final String ID_SEPARATOR = "::";
|
||||
public static final String ID_PREFIX_SEPARATOR = "|";
|
||||
public final static String ID_REGEX = "^[0-9][0-9]\\" + ID_PREFIX_SEPARATOR + ".{12}" + ID_SEPARATOR
|
||||
+ "[a-zA-Z0-9]{32}$";
|
||||
public static final int ID_PREFIX_LEN = 12;
|
||||
|
||||
/**
|
||||
* Creates an identifier from the most relevant PID (if available) in the given entity T. Returns entity.id
|
||||
* when no PID is available
|
||||
* @param entity the entity providing PIDs and a default ID.
|
||||
* @param <T> the specific entity type. Currently Organization and Result subclasses are supported.
|
||||
* @return an identifier from the most relevant PID, entity.id otherwise
|
||||
*/
|
||||
public static <T extends OafEntity> String createIdentifier(T entity) {
|
||||
|
||||
if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) {
|
||||
|
@ -32,12 +42,33 @@ public class IdentifierFactory implements Serializable {
|
|||
.stream()
|
||||
.filter(s -> Objects.nonNull(s.getQualifier()))
|
||||
.filter(s -> PidType.isValid(s.getQualifier().getClassid()))
|
||||
.filter(s -> StringUtils.isNotBlank(StringUtils.trim(s.getValue())))
|
||||
.min(new PidComparator<>(entity))
|
||||
.map(s -> idFromPid(entity, s))
|
||||
.map(IdentifierFactory::verifyIdSyntax)
|
||||
.orElseGet(entity::getId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility method that normalises PID values on a per-type basis.
|
||||
* @param pid the PID whose value will be normalised.
|
||||
* @return the PID containing the normalised value.
|
||||
*/
|
||||
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
|
||||
String value = Optional
|
||||
.ofNullable(pid.getValue())
|
||||
.map(String::trim)
|
||||
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
|
||||
switch (pid.getQualifier().getClassid()) {
|
||||
|
||||
// TODO add cleaning for more PID types as needed
|
||||
case "doi":
|
||||
pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX, ""));
|
||||
break;
|
||||
}
|
||||
return pid;
|
||||
}
|
||||
|
||||
private static String verifyIdSyntax(String s) {
|
||||
if (StringUtils.isBlank(s) || !s.matches(ID_REGEX)) {
|
||||
throw new RuntimeException(String.format("malformed id: '%s'", s));
|
||||
|
@ -52,15 +83,10 @@ public class IdentifierFactory implements Serializable {
|
|||
.append(ID_PREFIX_SEPARATOR)
|
||||
.append(createPrefix(s.getQualifier().getClassid()))
|
||||
.append(ID_SEPARATOR)
|
||||
.append(DHPUtils.md5(normalizePidValue(s.getValue())))
|
||||
.append(DHPUtils.md5(normalizePidValue(s).getValue()))
|
||||
.toString();
|
||||
}
|
||||
|
||||
private static String normalizePidValue(String value) {
|
||||
// TODO more aggressive cleaning? keep only alphanum and punctuation?
|
||||
return value.toLowerCase().replaceAll(" ", "");
|
||||
}
|
||||
|
||||
// create the prefix (length = 12)
|
||||
private static String createPrefix(String pidType) {
|
||||
StringBuilder prefix = new StringBuilder(StringUtils.left(pidType, ID_PREFIX_LEN));
|
||||
|
|
|
@ -561,4 +561,5 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -116,17 +116,9 @@ public class GenerateEntitiesApplication {
|
|||
private static Oaf merge(final Oaf o1, final Oaf o2) {
|
||||
if (ModelSupport.isSubClass(o1, OafEntity.class)) {
|
||||
if (ModelSupport.isSubClass(o1, Result.class)) {
|
||||
if (ModelSupport.isSubClass(o1, Publication.class)) {
|
||||
((Publication) o1).mergeFrom((Publication) o2);
|
||||
} else if (ModelSupport.isSubClass(o1, Dataset.class)) {
|
||||
((Dataset) o1).mergeFrom((Dataset) o2);
|
||||
} else if (ModelSupport.isSubClass(o1, Software.class)) {
|
||||
((Software) o1).mergeFrom((Software) o2);
|
||||
} else if (ModelSupport.isSubClass(o1, OtherResearchProduct.class)) {
|
||||
((OtherResearchProduct) o1).mergeFrom((OtherResearchProduct) o2);
|
||||
} else {
|
||||
throw new RuntimeException("invalid Result subtype:" + o1.getClass().getCanonicalName());
|
||||
}
|
||||
|
||||
// We cannot further specify the result type as different result types might share the same ID
|
||||
((Result) o1).mergeFrom((Result) o2);
|
||||
} else if (ModelSupport.isSubClass(o1, Datasource.class)) {
|
||||
((Datasource) o1).mergeFrom((Datasource) o2);
|
||||
} else if (ModelSupport.isSubClass(o1, Organization.class)) {
|
||||
|
|
|
@ -8,6 +8,7 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
|||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
@ -20,6 +21,7 @@ import com.google.common.collect.Lists;
|
|||
import eu.dnetlib.dhp.common.PacePerson;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
|
||||
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||
|
||||
|
@ -85,7 +87,13 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
|
||||
@Override
|
||||
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:description", info);
|
||||
return prepareListFields(doc, "//dc:description", info)
|
||||
.stream()
|
||||
.map(d -> {
|
||||
d.setValue(StringUtils.left(d.getValue(), ModelHardLimits.MAX_ABSTRACT_LENGTH));
|
||||
return d;
|
||||
})
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -283,6 +291,10 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
@Override
|
||||
protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) {
|
||||
return prepareListStructPropsWithValidQualifier(
|
||||
doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info);
|
||||
doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info)
|
||||
.stream()
|
||||
.map(IdentifierFactory::normalizePidValue)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -11,6 +11,7 @@ import java.util.Arrays;
|
|||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.dom4j.Document;
|
||||
|
@ -22,6 +23,7 @@ import eu.dnetlib.dhp.common.PacePerson;
|
|||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
|
||||
public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||
|
||||
|
@ -191,7 +193,13 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
|
||||
@Override
|
||||
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info);
|
||||
return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info)
|
||||
.stream()
|
||||
.map(d -> {
|
||||
d.setValue(StringUtils.left(d.getValue(), ModelHardLimits.MAX_ABSTRACT_LENGTH));
|
||||
return d;
|
||||
})
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -371,7 +379,11 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
doc,
|
||||
"//datacite:alternateIdentifier[@alternateIdentifierType != 'URL' and @alternateIdentifierType != 'landingPage']",
|
||||
"@alternateIdentifierType", DNET_PID_TYPES, info));
|
||||
return Lists.newArrayList(res);
|
||||
|
||||
return res
|
||||
.stream()
|
||||
.map(IdentifierFactory::normalizePidValue)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -164,7 +164,7 @@ public class CreateRelatedEntitiesJob_phase1 {
|
|||
|
||||
if (result.getTitle() != null && !result.getTitle().isEmpty()) {
|
||||
final StructuredProperty title = result.getTitle().stream().findFirst().get();
|
||||
title.setValue(StringUtils.left(title.getValue(), ProvisionConstants.MAX_TITLE_LENGTH));
|
||||
title.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
|
||||
re.setTitle(title);
|
||||
}
|
||||
|
||||
|
@ -178,7 +178,7 @@ public class CreateRelatedEntitiesJob_phase1 {
|
|||
.getInstance()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.limit(ProvisionConstants.MAX_INSTANCES)
|
||||
.limit(ModelHardLimits.MAX_INSTANCES)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
|
|
|
@ -240,15 +240,15 @@ public class CreateRelatedEntitiesJob_phase2 {
|
|||
List<ExternalReference> refs = r
|
||||
.getExternalReference()
|
||||
.stream()
|
||||
.limit(ProvisionConstants.MAX_EXTERNAL_ENTITIES)
|
||||
.limit(ModelHardLimits.MAX_EXTERNAL_ENTITIES)
|
||||
.collect(Collectors.toList());
|
||||
r.setExternalReference(refs);
|
||||
}
|
||||
if (r.getAuthor() != null) {
|
||||
List<Author> authors = Lists.newArrayList();
|
||||
for (Author a : r.getAuthor()) {
|
||||
a.setFullname(StringUtils.left(a.getFullname(), ProvisionConstants.MAX_AUTHOR_FULLNAME_LENGTH));
|
||||
if (authors.size() < ProvisionConstants.MAX_AUTHORS || hasORCID(a)) {
|
||||
a.setFullname(StringUtils.left(a.getFullname(), ModelHardLimits.MAX_AUTHOR_FULLNAME_LENGTH));
|
||||
if (authors.size() < ModelHardLimits.MAX_AUTHORS || hasORCID(a)) {
|
||||
authors.add(a);
|
||||
}
|
||||
}
|
||||
|
@ -260,7 +260,7 @@ public class CreateRelatedEntitiesJob_phase2 {
|
|||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.map(d -> {
|
||||
d.setValue(StringUtils.left(d.getValue(), ProvisionConstants.MAX_ABSTRACT_LENGTH));
|
||||
d.setValue(StringUtils.left(d.getValue(), ModelHardLimits.MAX_ABSTRACT_LENGTH));
|
||||
return d;
|
||||
})
|
||||
.collect(Collectors.toList());
|
||||
|
@ -272,10 +272,10 @@ public class CreateRelatedEntitiesJob_phase2 {
|
|||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.map(t -> {
|
||||
t.setValue(StringUtils.left(t.getValue(), ProvisionConstants.MAX_TITLE_LENGTH));
|
||||
t.setValue(StringUtils.left(t.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
|
||||
return t;
|
||||
})
|
||||
.limit(ProvisionConstants.MAX_TITLES)
|
||||
.limit(ModelHardLimits.MAX_TITLES)
|
||||
.collect(Collectors.toList());
|
||||
r.setTitle(titles);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue