forked from antonis.lempesis/dnet-hadoop
Cleanig workflow: remove newlines from titles, descriptions, subjects
This commit is contained in:
parent
d48f388fb2
commit
eeebd5a920
|
@ -1,10 +1,7 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.clean;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.*;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
|
@ -18,7 +15,9 @@ import eu.dnetlib.dhp.schema.oaf.*;
|
|||
|
||||
public class CleaningFunctions {
|
||||
|
||||
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
|
||||
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
|
||||
public static final String NEWLINES = "(?:\\n|\\r)";
|
||||
|
||||
public static final Set<String> PID_BLACKLIST = new HashSet<>();
|
||||
|
||||
|
@ -79,7 +78,7 @@ public class CleaningFunctions {
|
|||
return value;
|
||||
}
|
||||
|
||||
protected static <T extends Oaf> T fixDefaults(T value) {
|
||||
public static <T extends Oaf> T fixDefaults(T value) {
|
||||
if (value instanceof Datasource) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Project) {
|
||||
|
@ -112,6 +111,29 @@ public class CleaningFunctions {
|
|||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||
.map(CleaningFunctions::removeNewLines)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.nonNull(r.getTitle())) {
|
||||
r
|
||||
.setTitle(
|
||||
r
|
||||
.getTitle()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||
.map(CleaningFunctions::removeNewLines)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.nonNull(r.getDescription())) {
|
||||
r
|
||||
.setDescription(
|
||||
r
|
||||
.getDescription()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||
.map(CleaningFunctions::removeNewLines)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.nonNull(r.getPid())) {
|
||||
|
@ -125,10 +147,7 @@ public class CleaningFunctions {
|
|||
.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
|
||||
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||
.map(sp -> {
|
||||
sp.setValue(StringUtils.trim(sp.getValue()));
|
||||
return sp;
|
||||
})
|
||||
.map(CleaningFunctions::normalizePidValue)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
|
||||
|
@ -211,6 +230,16 @@ public class CleaningFunctions {
|
|||
return value;
|
||||
}
|
||||
|
||||
protected static StructuredProperty removeNewLines(StructuredProperty s) {
|
||||
s.setValue(s.getValue().replaceAll(NEWLINES, " "));
|
||||
return s;
|
||||
}
|
||||
|
||||
protected static Field<String> removeNewLines(Field<String> s) {
|
||||
s.setValue(s.getValue().replaceAll(NEWLINES, " "));
|
||||
return s;
|
||||
}
|
||||
|
||||
// HELPERS
|
||||
|
||||
private static void fixVocabName(Qualifier q, String vocabularyName) {
|
||||
|
@ -226,4 +255,24 @@ public class CleaningFunctions {
|
|||
classid, classname, scheme, scheme);
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility method that normalises PID values on a per-type basis.
|
||||
* @param pid the PID whose value will be normalised.
|
||||
* @return the PID containing the normalised value.
|
||||
*/
|
||||
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
|
||||
String value = Optional
|
||||
.ofNullable(pid.getValue())
|
||||
.map(String::trim)
|
||||
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
|
||||
switch (pid.getQualifier().getClassid()) {
|
||||
|
||||
// TODO add cleaning for more PID types as needed
|
||||
case "doi":
|
||||
pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX_REGEX, ""));
|
||||
break;
|
||||
}
|
||||
return pid;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue