dump of the results related to at least one project #61

Merged
claudio.atzori merged 51 commits from miriam.baglioni/dnet-hadoop:dump into master 2020-12-09 17:22:57 +01:00
1 changed files with 58 additions and 9 deletions
Showing only changes of commit 6750e33d69 - Show all commits

View File

@ -1,10 +1,7 @@
package eu.dnetlib.dhp.oa.graph.clean;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Objects;
import java.util.Set;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
@ -18,7 +15,9 @@ import eu.dnetlib.dhp.schema.oaf.*;
public class CleaningFunctions {
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
public static final String NEWLINES = "(?:\\n|\\r)";
public static final Set<String> PID_BLACKLIST = new HashSet<>();
@ -79,7 +78,7 @@ public class CleaningFunctions {
return value;
}
protected static <T extends Oaf> T fixDefaults(T value) {
public static <T extends Oaf> T fixDefaults(T value) {
if (value instanceof Datasource) {
// nothing to clean here
} else if (value instanceof Project) {
@ -112,6 +111,29 @@ public class CleaningFunctions {
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(CleaningFunctions::removeNewLines)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getTitle())) {
r
.setTitle(
r
.getTitle()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.map(CleaningFunctions::removeNewLines)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getDescription())) {
r
.setDescription(
r
.getDescription()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.map(CleaningFunctions::removeNewLines)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getPid())) {
@ -125,10 +147,7 @@ public class CleaningFunctions {
.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(sp -> {
sp.setValue(StringUtils.trim(sp.getValue()));
return sp;
})
.map(CleaningFunctions::normalizePidValue)
.collect(Collectors.toList()));
}
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
@ -211,6 +230,16 @@ public class CleaningFunctions {
return value;
}
protected static StructuredProperty removeNewLines(StructuredProperty s) {
s.setValue(s.getValue().replaceAll(NEWLINES, " "));
return s;
}
protected static Field<String> removeNewLines(Field<String> s) {
s.setValue(s.getValue().replaceAll(NEWLINES, " "));
return s;
}
// HELPERS
private static void fixVocabName(Qualifier q, String vocabularyName) {
@ -226,4 +255,24 @@ public class CleaningFunctions {
classid, classname, scheme, scheme);
}
/**
* Utility method that normalises PID values on a per-type basis.
* @param pid the PID whose value will be normalised.
* @return the PID containing the normalised value.
*/
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
String value = Optional
.ofNullable(pid.getValue())
.map(String::trim)
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
switch (pid.getQualifier().getClassid()) {
// TODO add cleaning for more PID types as needed
case "doi":
pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX_REGEX, ""));
break;
}
return pid;
}
}