Cleaning result titles: keep only the 10 longest titles, each truncated to at most 500 characters
This commit is contained in:
parent
29a2a29666
commit
0b5f2ebd89
|
@ -472,23 +472,12 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
.stream()
|
.stream()
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||||
.filter(
|
.filter(GraphCleaningFunctions::checkTestTitle)
|
||||||
sp -> {
|
|
||||||
final String title = sp
|
|
||||||
.getValue()
|
|
||||||
.toLowerCase();
|
|
||||||
final String decoded = Unidecode.decode(title);
|
|
||||||
|
|
||||||
if (StringUtils.contains(decoded, TITLE_TEST)) {
|
|
||||||
return decoded
|
|
||||||
.replaceAll(TITLE_FILTER_REGEX, "")
|
|
||||||
.length() > TITLE_FILTER_RESIDUAL_LENGTH;
|
|
||||||
}
|
|
||||||
return !decoded
|
|
||||||
.replaceAll("\\W|\\d", "")
|
|
||||||
.isEmpty();
|
|
||||||
})
|
|
||||||
.map(GraphCleaningFunctions::cleanValue)
|
.map(GraphCleaningFunctions::cleanValue)
|
||||||
|
.sorted(
|
||||||
|
Comparator.comparingInt((StructuredProperty t) -> t.getValue().length()).reversed())
|
||||||
|
.limit(ModelHardLimits.MAX_TITLES)
|
||||||
|
.map(GraphCleaningFunctions::shortenTitles)
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
if (Objects.nonNull(r.getFormat())) {
|
if (Objects.nonNull(r.getFormat())) {
|
||||||
|
@ -815,6 +804,29 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static StructuredProperty shortenTitles(StructuredProperty title) {
|
||||||
|
if (title.getValue().length() > ModelHardLimits.MAX_TITLE_LENGTH) {
|
||||||
|
title.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
|
||||||
|
}
|
||||||
|
return title;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean checkTestTitle(StructuredProperty sp) {
|
||||||
|
final String title = sp
|
||||||
|
.getValue()
|
||||||
|
.toLowerCase();
|
||||||
|
final String decoded = Unidecode.decode(title);
|
||||||
|
|
||||||
|
if (StringUtils.contains(decoded, TITLE_TEST)) {
|
||||||
|
return decoded
|
||||||
|
.replaceAll(TITLE_FILTER_REGEX, "")
|
||||||
|
.length() > TITLE_FILTER_RESIDUAL_LENGTH;
|
||||||
|
}
|
||||||
|
return !decoded
|
||||||
|
.replaceAll("\\W|\\d", "")
|
||||||
|
.isEmpty();
|
||||||
|
}
|
||||||
|
|
||||||
private static Author cleanupAuthor(Author author) {
|
private static Author cleanupAuthor(Author author) {
|
||||||
if (StringUtils.isNotBlank(author.getFullname())) {
|
if (StringUtils.isNotBlank(author.getFullname())) {
|
||||||
author
|
author
|
||||||
|
|
|
@ -20,7 +20,7 @@ public class ModelHardLimits {
|
||||||
public static final int MAX_AUTHORS = 200;
|
public static final int MAX_AUTHORS = 200;
|
||||||
public static final int MAX_RELATED_AUTHORS = 20;
|
public static final int MAX_RELATED_AUTHORS = 20;
|
||||||
public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
|
public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
|
||||||
public static final int MAX_TITLE_LENGTH = 5000;
|
public static final int MAX_TITLE_LENGTH = 500;
|
||||||
public static final int MAX_TITLES = 10;
|
public static final int MAX_TITLES = 10;
|
||||||
public static final int MAX_ABSTRACTS = 10;
|
public static final int MAX_ABSTRACTS = 10;
|
||||||
public static final int MAX_ABSTRACT_LENGTH = 150000;
|
public static final int MAX_ABSTRACT_LENGTH = 150000;
|
||||||
|
|
Loading…
Reference in New Issue