1
0
Fork 0

cleaning texts to remove tab characters too

This commit is contained in:
Claudio Atzori 2020-11-27 09:00:04 +01:00
parent a104d2b6ad
commit e731a7658d
1 changed files with 8 additions and 8 deletions

View File

@ -17,7 +17,7 @@ public class CleaningFunctions {
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/"; public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/"; public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
public static final String NEWLINES = "(?:\\n|\\r)"; public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
public static final Set<String> PID_BLACKLIST = new HashSet<>(); public static final Set<String> PID_BLACKLIST = new HashSet<>();
@ -111,7 +111,7 @@ public class CleaningFunctions {
.filter(sp -> StringUtils.isNotBlank(sp.getValue())) .filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.filter(sp -> Objects.nonNull(sp.getQualifier())) .filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(CleaningFunctions::removeNewLines) .map(CleaningFunctions::cleanValue)
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
if (Objects.nonNull(r.getTitle())) { if (Objects.nonNull(r.getTitle())) {
@ -122,7 +122,7 @@ public class CleaningFunctions {
.stream() .stream()
.filter(Objects::nonNull) .filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue())) .filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.map(CleaningFunctions::removeNewLines) .map(CleaningFunctions::cleanValue)
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
if (Objects.nonNull(r.getDescription())) { if (Objects.nonNull(r.getDescription())) {
@ -133,7 +133,7 @@ public class CleaningFunctions {
.stream() .stream()
.filter(Objects::nonNull) .filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue())) .filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.map(CleaningFunctions::removeNewLines) .map(CleaningFunctions::cleanValue)
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
if (Objects.nonNull(r.getPid())) { if (Objects.nonNull(r.getPid())) {
@ -230,13 +230,13 @@ public class CleaningFunctions {
return value; return value;
} }
protected static StructuredProperty removeNewLines(StructuredProperty s) { protected static StructuredProperty cleanValue(StructuredProperty s) {
s.setValue(s.getValue().replaceAll(NEWLINES, " ")); s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s; return s;
} }
protected static Field<String> removeNewLines(Field<String> s) { protected static Field<String> cleanValue(Field<String> s) {
s.setValue(s.getValue().replaceAll(NEWLINES, " ")); s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s; return s;
} }