Cleanig workflow: remove newlines from titles, descriptions, subjects

This commit is contained in:
Claudio Atzori 2020-11-24 18:40:25 +01:00
parent d48f388fb2
commit eeebd5a920
1 changed files with 58 additions and 9 deletions

View File

@ -1,10 +1,7 @@
package eu.dnetlib.dhp.oa.graph.clean; package eu.dnetlib.dhp.oa.graph.clean;
import java.util.HashSet; import java.util.*;
import java.util.LinkedHashMap;
import java.util.Objects;
import java.util.Set;
import java.util.function.Function; import java.util.function.Function;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -18,7 +15,9 @@ import eu.dnetlib.dhp.schema.oaf.*;
public class CleaningFunctions { public class CleaningFunctions {
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/"; public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
public static final String NEWLINES = "(?:\\n|\\r)";
public static final Set<String> PID_BLACKLIST = new HashSet<>(); public static final Set<String> PID_BLACKLIST = new HashSet<>();
@ -79,7 +78,7 @@ public class CleaningFunctions {
return value; return value;
} }
protected static <T extends Oaf> T fixDefaults(T value) { public static <T extends Oaf> T fixDefaults(T value) {
if (value instanceof Datasource) { if (value instanceof Datasource) {
// nothing to clean here // nothing to clean here
} else if (value instanceof Project) { } else if (value instanceof Project) {
@ -112,6 +111,29 @@ public class CleaningFunctions {
.filter(sp -> StringUtils.isNotBlank(sp.getValue())) .filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.filter(sp -> Objects.nonNull(sp.getQualifier())) .filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(CleaningFunctions::removeNewLines)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getTitle())) {
r
.setTitle(
r
.getTitle()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.map(CleaningFunctions::removeNewLines)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getDescription())) {
r
.setDescription(
r
.getDescription()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.map(CleaningFunctions::removeNewLines)
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
if (Objects.nonNull(r.getPid())) { if (Objects.nonNull(r.getPid())) {
@ -125,10 +147,7 @@ public class CleaningFunctions {
.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase())) .filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
.filter(sp -> Objects.nonNull(sp.getQualifier())) .filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(sp -> { .map(CleaningFunctions::normalizePidValue)
sp.setValue(StringUtils.trim(sp.getValue()));
return sp;
})
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) { if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
@ -211,6 +230,16 @@ public class CleaningFunctions {
return value; return value;
} }
protected static StructuredProperty removeNewLines(StructuredProperty s) {
s.setValue(s.getValue().replaceAll(NEWLINES, " "));
return s;
}
protected static Field<String> removeNewLines(Field<String> s) {
s.setValue(s.getValue().replaceAll(NEWLINES, " "));
return s;
}
// HELPERS // HELPERS
private static void fixVocabName(Qualifier q, String vocabularyName) { private static void fixVocabName(Qualifier q, String vocabularyName) {
@ -226,4 +255,24 @@ public class CleaningFunctions {
classid, classname, scheme, scheme); classid, classname, scheme, scheme);
} }
/**
* Utility method that normalises PID values on a per-type basis.
* @param pid the PID whose value will be normalised.
* @return the PID containing the normalised value.
*/
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
String value = Optional
.ofNullable(pid.getValue())
.map(String::trim)
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
switch (pid.getQualifier().getClassid()) {
// TODO add cleaning for more PID types as needed
case "doi":
pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX_REGEX, ""));
break;
}
return pid;
}
} }