2020-10-01 10:50:15 +02:00
|
|
|
|
|
|
|
package eu.dnetlib.dhp.oa.graph.clean;
|
|
|
|
|
2020-11-24 18:40:25 +01:00
|
|
|
import java.util.*;
|
2020-10-01 12:50:40 +02:00
|
|
|
import java.util.function.Function;
|
2020-10-01 10:50:15 +02:00
|
|
|
import java.util.stream.Collectors;
|
|
|
|
|
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
|
|
2020-10-02 09:43:24 +02:00
|
|
|
import com.clearspring.analytics.util.Lists;
|
|
|
|
|
2020-10-01 10:50:15 +02:00
|
|
|
import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper;
|
|
|
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.*;
|
|
|
|
|
|
|
|
public class CleaningFunctions {
|
|
|
|
|
2021-01-22 14:16:33 +01:00
|
|
|
public static final String DOI_PREFIX_REGEX = "^10\\.";
|
2020-12-23 12:22:48 +01:00
|
|
|
|
2020-12-21 11:40:17 +01:00
|
|
|
public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
|
2020-12-23 12:22:48 +01:00
|
|
|
public static final int ORCID_LEN = 19;
|
|
|
|
|
2020-11-27 09:00:04 +01:00
|
|
|
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
2020-11-17 12:27:06 +01:00
|
|
|
|
|
|
|
public static final Set<String> PID_BLACKLIST = new HashSet<>();
|
|
|
|
|
|
|
|
static {
|
|
|
|
PID_BLACKLIST.add("none");
|
|
|
|
PID_BLACKLIST.add("na");
|
|
|
|
}
|
2020-10-01 12:50:40 +02:00
|
|
|
|
2020-10-01 10:50:15 +02:00
|
|
|
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
|
|
|
if (value instanceof Datasource) {
|
|
|
|
// nothing to clean here
|
|
|
|
} else if (value instanceof Project) {
|
|
|
|
// nothing to clean here
|
|
|
|
} else if (value instanceof Organization) {
|
|
|
|
Organization o = (Organization) value;
|
|
|
|
if (Objects.nonNull(o.getCountry())) {
|
|
|
|
fixVocabName(o.getCountry(), ModelConstants.DNET_COUNTRY_TYPE);
|
|
|
|
}
|
|
|
|
} else if (value instanceof Relation) {
|
|
|
|
// nothing to clean here
|
|
|
|
} else if (value instanceof Result) {
|
|
|
|
|
|
|
|
Result r = (Result) value;
|
|
|
|
|
|
|
|
fixVocabName(r.getLanguage(), ModelConstants.DNET_LANGUAGES);
|
|
|
|
fixVocabName(r.getResourcetype(), ModelConstants.DNET_DATA_CITE_RESOURCE);
|
|
|
|
fixVocabName(r.getBestaccessright(), ModelConstants.DNET_ACCESS_MODES);
|
|
|
|
|
|
|
|
if (Objects.nonNull(r.getSubject())) {
|
|
|
|
r.getSubject().forEach(s -> fixVocabName(s.getQualifier(), ModelConstants.DNET_SUBJECT_TYPOLOGIES));
|
|
|
|
}
|
|
|
|
if (Objects.nonNull(r.getInstance())) {
|
|
|
|
for (Instance i : r.getInstance()) {
|
|
|
|
fixVocabName(i.getAccessright(), ModelConstants.DNET_ACCESS_MODES);
|
|
|
|
fixVocabName(i.getRefereed(), ModelConstants.DNET_REVIEW_LEVELS);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (Objects.nonNull(r.getAuthor())) {
|
2021-01-25 16:54:53 +01:00
|
|
|
r
|
|
|
|
.getAuthor()
|
|
|
|
.stream()
|
|
|
|
.filter(Objects::nonNull)
|
|
|
|
.forEach(a -> {
|
|
|
|
if (Objects.nonNull(a.getPid())) {
|
|
|
|
a
|
|
|
|
.getPid()
|
|
|
|
.stream()
|
|
|
|
.filter(Objects::nonNull)
|
|
|
|
.forEach(p -> fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES));
|
|
|
|
}
|
|
|
|
});
|
2020-10-01 10:50:15 +02:00
|
|
|
}
|
|
|
|
if (value instanceof Publication) {
|
|
|
|
|
|
|
|
} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
|
|
|
|
|
|
|
|
} else if (value instanceof OtherResearchProduct) {
|
|
|
|
|
|
|
|
} else if (value instanceof Software) {
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return value;
|
|
|
|
}
|
|
|
|
|
2020-11-24 18:40:25 +01:00
|
|
|
public static <T extends Oaf> T fixDefaults(T value) {
|
2020-10-01 10:50:15 +02:00
|
|
|
if (value instanceof Datasource) {
|
|
|
|
// nothing to clean here
|
|
|
|
} else if (value instanceof Project) {
|
|
|
|
// nothing to clean here
|
|
|
|
} else if (value instanceof Organization) {
|
|
|
|
Organization o = (Organization) value;
|
|
|
|
if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
|
2020-12-23 16:59:52 +01:00
|
|
|
o.setCountry(ModelConstants.UNKNOWN_COUNTRY);
|
2020-10-01 10:50:15 +02:00
|
|
|
}
|
|
|
|
} else if (value instanceof Relation) {
|
|
|
|
// nothing to clean here
|
|
|
|
} else if (value instanceof Result) {
|
|
|
|
|
|
|
|
Result r = (Result) value;
|
|
|
|
if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) {
|
|
|
|
r.setPublisher(null);
|
|
|
|
}
|
|
|
|
if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) {
|
|
|
|
r
|
|
|
|
.setLanguage(
|
|
|
|
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
|
|
|
|
}
|
|
|
|
if (Objects.nonNull(r.getSubject())) {
|
|
|
|
r
|
|
|
|
.setSubject(
|
|
|
|
r
|
|
|
|
.getSubject()
|
|
|
|
.stream()
|
|
|
|
.filter(Objects::nonNull)
|
|
|
|
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
|
|
|
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
|
|
|
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
2020-11-27 09:00:04 +01:00
|
|
|
.map(CleaningFunctions::cleanValue)
|
2020-11-24 18:40:25 +01:00
|
|
|
.collect(Collectors.toList()));
|
|
|
|
}
|
|
|
|
if (Objects.nonNull(r.getTitle())) {
|
|
|
|
r
|
|
|
|
.setTitle(
|
|
|
|
r
|
|
|
|
.getTitle()
|
|
|
|
.stream()
|
|
|
|
.filter(Objects::nonNull)
|
|
|
|
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
2020-11-27 09:00:04 +01:00
|
|
|
.map(CleaningFunctions::cleanValue)
|
2020-11-24 18:40:25 +01:00
|
|
|
.collect(Collectors.toList()));
|
|
|
|
}
|
|
|
|
if (Objects.nonNull(r.getDescription())) {
|
|
|
|
r
|
|
|
|
.setDescription(
|
|
|
|
r
|
|
|
|
.getDescription()
|
|
|
|
.stream()
|
|
|
|
.filter(Objects::nonNull)
|
|
|
|
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
2020-11-27 09:00:04 +01:00
|
|
|
.map(CleaningFunctions::cleanValue)
|
2020-10-01 10:50:15 +02:00
|
|
|
.collect(Collectors.toList()));
|
|
|
|
}
|
2020-11-03 11:52:10 +01:00
|
|
|
if (Objects.nonNull(r.getPid())) {
|
|
|
|
r
|
2020-11-06 17:12:31 +01:00
|
|
|
.setPid(
|
|
|
|
r
|
|
|
|
.getPid()
|
|
|
|
.stream()
|
|
|
|
.filter(Objects::nonNull)
|
|
|
|
.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
|
2020-11-17 12:27:06 +01:00
|
|
|
.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
|
2020-11-06 17:12:31 +01:00
|
|
|
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
|
|
|
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
2020-11-24 18:40:25 +01:00
|
|
|
.map(CleaningFunctions::normalizePidValue)
|
2020-11-06 17:12:31 +01:00
|
|
|
.collect(Collectors.toList()));
|
2020-11-03 11:52:10 +01:00
|
|
|
}
|
2020-10-01 10:50:15 +02:00
|
|
|
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
|
|
|
|
r
|
|
|
|
.setResourcetype(
|
2020-12-23 12:22:48 +01:00
|
|
|
qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
|
2020-10-01 10:50:15 +02:00
|
|
|
}
|
|
|
|
if (Objects.nonNull(r.getInstance())) {
|
|
|
|
for (Instance i : r.getInstance()) {
|
|
|
|
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
|
2020-12-23 12:22:48 +01:00
|
|
|
i
|
|
|
|
.setAccessright(
|
|
|
|
qualifier(ModelConstants.UNKNOWN, "not available", ModelConstants.DNET_ACCESS_MODES));
|
2020-10-01 10:50:15 +02:00
|
|
|
}
|
|
|
|
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
|
|
|
|
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
|
|
|
|
}
|
|
|
|
if (Objects.isNull(i.getRefereed())) {
|
|
|
|
i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
|
|
|
|
Qualifier bestaccessrights = AbstractMdRecordToOafMapper.createBestAccessRights(r.getInstance());
|
|
|
|
if (Objects.isNull(bestaccessrights)) {
|
|
|
|
r
|
|
|
|
.setBestaccessright(
|
2020-12-23 12:22:48 +01:00
|
|
|
qualifier(ModelConstants.UNKNOWN, "not available", ModelConstants.DNET_ACCESS_MODES));
|
2020-10-01 10:50:15 +02:00
|
|
|
} else {
|
|
|
|
r.setBestaccessright(bestaccessrights);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (Objects.nonNull(r.getAuthor())) {
|
|
|
|
boolean nullRank = r
|
|
|
|
.getAuthor()
|
|
|
|
.stream()
|
|
|
|
.anyMatch(a -> Objects.isNull(a.getRank()));
|
|
|
|
if (nullRank) {
|
|
|
|
int i = 1;
|
|
|
|
for (Author author : r.getAuthor()) {
|
|
|
|
author.setRank(i++);
|
|
|
|
}
|
|
|
|
}
|
2020-12-02 10:44:05 +01:00
|
|
|
|
2020-10-02 09:43:24 +02:00
|
|
|
for (Author a : r.getAuthor()) {
|
2020-10-01 12:50:40 +02:00
|
|
|
if (Objects.isNull(a.getPid())) {
|
|
|
|
a.setPid(Lists.newArrayList());
|
|
|
|
} else {
|
2020-10-02 09:43:24 +02:00
|
|
|
a
|
|
|
|
.setPid(
|
|
|
|
a
|
|
|
|
.getPid()
|
|
|
|
.stream()
|
2021-01-25 16:54:53 +01:00
|
|
|
.filter(Objects::nonNull)
|
2020-10-02 09:43:24 +02:00
|
|
|
.filter(p -> Objects.nonNull(p.getQualifier()))
|
|
|
|
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
|
|
|
.map(p -> {
|
2020-12-02 10:44:05 +01:00
|
|
|
// hack to distinguish orcid from orcid_pending
|
|
|
|
String pidProvenance = Optional
|
2020-12-02 11:23:49 +01:00
|
|
|
.ofNullable(p.getDataInfo())
|
|
|
|
.map(
|
|
|
|
d -> Optional
|
|
|
|
.ofNullable(d.getProvenanceaction())
|
|
|
|
.map(Qualifier::getClassid)
|
|
|
|
.orElse(""))
|
|
|
|
.orElse("");
|
2020-12-21 11:40:17 +01:00
|
|
|
if (p
|
|
|
|
.getQualifier()
|
|
|
|
.getClassid()
|
|
|
|
.toLowerCase()
|
|
|
|
.contains(ModelConstants.ORCID)) {
|
|
|
|
if (pidProvenance
|
|
|
|
.equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) {
|
|
|
|
p.getQualifier().setClassid(ModelConstants.ORCID);
|
|
|
|
} else {
|
|
|
|
p.getQualifier().setClassid(ModelConstants.ORCID_PENDING);
|
|
|
|
}
|
|
|
|
final String orcid = p
|
|
|
|
.getValue()
|
|
|
|
.trim()
|
|
|
|
.toLowerCase()
|
|
|
|
.replaceAll(ORCID_CLEANING_REGEX, "$1-$2-$3-$4");
|
2020-12-23 12:22:48 +01:00
|
|
|
if (orcid.length() == ORCID_LEN) {
|
2020-12-21 11:40:17 +01:00
|
|
|
p.setValue(orcid);
|
|
|
|
} else {
|
|
|
|
p.setValue("");
|
|
|
|
}
|
2020-12-02 10:44:05 +01:00
|
|
|
}
|
2020-10-02 09:43:24 +02:00
|
|
|
return p;
|
|
|
|
})
|
2020-12-21 11:40:17 +01:00
|
|
|
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
2020-10-02 09:43:24 +02:00
|
|
|
.collect(
|
|
|
|
Collectors
|
|
|
|
.toMap(
|
2020-12-02 10:44:05 +01:00
|
|
|
p -> p.getQualifier().getClassid() + p.getValue(),
|
2020-12-02 11:23:49 +01:00
|
|
|
Function.identity(),
|
|
|
|
(p1, p2) -> p1,
|
2020-10-02 09:43:24 +02:00
|
|
|
LinkedHashMap::new))
|
|
|
|
.values()
|
|
|
|
.stream()
|
|
|
|
.collect(Collectors.toList()));
|
2020-10-01 12:50:40 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-10-01 10:50:15 +02:00
|
|
|
}
|
|
|
|
if (value instanceof Publication) {
|
|
|
|
|
|
|
|
} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
|
|
|
|
|
|
|
|
} else if (value instanceof OtherResearchProduct) {
|
|
|
|
|
|
|
|
} else if (value instanceof Software) {
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return value;
|
|
|
|
}
|
|
|
|
|
2020-11-27 09:00:04 +01:00
|
|
|
protected static StructuredProperty cleanValue(StructuredProperty s) {
|
|
|
|
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
|
2020-11-24 18:40:25 +01:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2020-11-27 09:00:04 +01:00
|
|
|
protected static Field<String> cleanValue(Field<String> s) {
|
|
|
|
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
|
2020-11-24 18:40:25 +01:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2020-10-01 10:50:15 +02:00
|
|
|
// HELPERS
|
|
|
|
|
|
|
|
private static void fixVocabName(Qualifier q, String vocabularyName) {
|
|
|
|
if (Objects.nonNull(q) && StringUtils.isBlank(q.getSchemeid())) {
|
|
|
|
q.setSchemeid(vocabularyName);
|
|
|
|
q.setSchemename(vocabularyName);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private static Qualifier qualifier(String classid, String classname, String scheme) {
|
|
|
|
return OafMapperUtils
|
|
|
|
.qualifier(
|
|
|
|
classid, classname, scheme, scheme);
|
|
|
|
}
|
|
|
|
|
2020-11-24 18:40:25 +01:00
|
|
|
/**
|
|
|
|
* Utility method that normalises PID values on a per-type basis.
|
|
|
|
* @param pid the PID whose value will be normalised.
|
|
|
|
* @return the PID containing the normalised value.
|
|
|
|
*/
|
|
|
|
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
|
|
|
|
String value = Optional
|
|
|
|
.ofNullable(pid.getValue())
|
|
|
|
.map(String::trim)
|
|
|
|
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
|
|
|
|
switch (pid.getQualifier().getClassid()) {
|
|
|
|
|
|
|
|
// TODO add cleaning for more PID types as needed
|
|
|
|
case "doi":
|
2021-01-22 14:16:33 +01:00
|
|
|
pid.setValue(value.toLowerCase().replaceAll(DOI_PREFIX_REGEX, "10."));
|
2020-11-24 18:40:25 +01:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
return pid;
|
|
|
|
}
|
|
|
|
|
2020-10-01 10:50:15 +02:00
|
|
|
}
|