cleaning result titles: keep only the 10 longest, each truncated to at most 500 characters
This commit is contained in:
parent
29a2a29666
commit
0b5f2ebd89
|
@ -472,23 +472,12 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||
.filter(
|
||||
sp -> {
|
||||
final String title = sp
|
||||
.getValue()
|
||||
.toLowerCase();
|
||||
final String decoded = Unidecode.decode(title);
|
||||
|
||||
if (StringUtils.contains(decoded, TITLE_TEST)) {
|
||||
return decoded
|
||||
.replaceAll(TITLE_FILTER_REGEX, "")
|
||||
.length() > TITLE_FILTER_RESIDUAL_LENGTH;
|
||||
}
|
||||
return !decoded
|
||||
.replaceAll("\\W|\\d", "")
|
||||
.isEmpty();
|
||||
})
|
||||
.filter(GraphCleaningFunctions::checkTestTitle)
|
||||
.map(GraphCleaningFunctions::cleanValue)
|
||||
.sorted(
|
||||
Comparator.comparingInt((StructuredProperty t) -> t.getValue().length()).reversed())
|
||||
.limit(ModelHardLimits.MAX_TITLES)
|
||||
.map(GraphCleaningFunctions::shortenTitles)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.nonNull(r.getFormat())) {
|
||||
|
@ -815,6 +804,29 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
return value;
|
||||
}
|
||||
|
||||
private static StructuredProperty shortenTitles(StructuredProperty title) {
|
||||
if (title.getValue().length() > ModelHardLimits.MAX_TITLE_LENGTH) {
|
||||
title.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
|
||||
}
|
||||
return title;
|
||||
}
|
||||
|
||||
private static boolean checkTestTitle(StructuredProperty sp) {
|
||||
final String title = sp
|
||||
.getValue()
|
||||
.toLowerCase();
|
||||
final String decoded = Unidecode.decode(title);
|
||||
|
||||
if (StringUtils.contains(decoded, TITLE_TEST)) {
|
||||
return decoded
|
||||
.replaceAll(TITLE_FILTER_REGEX, "")
|
||||
.length() > TITLE_FILTER_RESIDUAL_LENGTH;
|
||||
}
|
||||
return !decoded
|
||||
.replaceAll("\\W|\\d", "")
|
||||
.isEmpty();
|
||||
}
|
||||
|
||||
private static Author cleanupAuthor(Author author) {
|
||||
if (StringUtils.isNotBlank(author.getFullname())) {
|
||||
author
|
||||
|
|
|
@ -20,7 +20,7 @@ public class ModelHardLimits {
|
|||
public static final int MAX_AUTHORS = 200;
|
||||
public static final int MAX_RELATED_AUTHORS = 20;
|
||||
public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
|
||||
public static final int MAX_TITLE_LENGTH = 5000;
|
||||
public static final int MAX_TITLE_LENGTH = 500;
|
||||
public static final int MAX_TITLES = 10;
|
||||
public static final int MAX_ABSTRACTS = 10;
|
||||
public static final int MAX_ABSTRACT_LENGTH = 150000;
|
||||
|
|
Loading…
Reference in New Issue