forked from D-Net/dnet-hadoop
cleaning texts to remove tab characters too
This commit is contained in:
parent
a104d2b6ad
commit
e731a7658d
|
@ -17,7 +17,7 @@ public class CleaningFunctions {
|
||||||
|
|
||||||
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
|
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
|
||||||
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
|
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
|
||||||
public static final String NEWLINES = "(?:\\n|\\r)";
|
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
||||||
|
|
||||||
public static final Set<String> PID_BLACKLIST = new HashSet<>();
|
public static final Set<String> PID_BLACKLIST = new HashSet<>();
|
||||||
|
|
||||||
|
@ -111,7 +111,7 @@ public class CleaningFunctions {
|
||||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||||
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
||||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||||
.map(CleaningFunctions::removeNewLines)
|
.map(CleaningFunctions::cleanValue)
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
if (Objects.nonNull(r.getTitle())) {
|
if (Objects.nonNull(r.getTitle())) {
|
||||||
|
@ -122,7 +122,7 @@ public class CleaningFunctions {
|
||||||
.stream()
|
.stream()
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||||
.map(CleaningFunctions::removeNewLines)
|
.map(CleaningFunctions::cleanValue)
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
if (Objects.nonNull(r.getDescription())) {
|
if (Objects.nonNull(r.getDescription())) {
|
||||||
|
@ -133,7 +133,7 @@ public class CleaningFunctions {
|
||||||
.stream()
|
.stream()
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||||
.map(CleaningFunctions::removeNewLines)
|
.map(CleaningFunctions::cleanValue)
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
if (Objects.nonNull(r.getPid())) {
|
if (Objects.nonNull(r.getPid())) {
|
||||||
|
@ -230,13 +230,13 @@ public class CleaningFunctions {
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static StructuredProperty removeNewLines(StructuredProperty s) {
|
protected static StructuredProperty cleanValue(StructuredProperty s) {
|
||||||
s.setValue(s.getValue().replaceAll(NEWLINES, " "));
|
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static Field<String> removeNewLines(Field<String> s) {
|
protected static Field<String> cleanValue(Field<String> s) {
|
||||||
s.setValue(s.getValue().replaceAll(NEWLINES, " "));
|
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue