Cleaning result titles: keep only the 10 longest titles, each truncated to at most 500 characters

This commit is contained in:
Claudio Atzori 2024-12-12 14:43:14 +01:00
parent 29a2a29666
commit 0b5f2ebd89
2 changed files with 29 additions and 17 deletions

View File

@ -472,23 +472,12 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.filter(
sp -> {
final String title = sp
.getValue()
.toLowerCase();
final String decoded = Unidecode.decode(title);
if (StringUtils.contains(decoded, TITLE_TEST)) {
return decoded
.replaceAll(TITLE_FILTER_REGEX, "")
.length() > TITLE_FILTER_RESIDUAL_LENGTH;
}
return !decoded
.replaceAll("\\W|\\d", "")
.isEmpty();
})
.filter(GraphCleaningFunctions::checkTestTitle)
.map(GraphCleaningFunctions::cleanValue)
.sorted(
Comparator.comparingInt((StructuredProperty t) -> t.getValue().length()).reversed())
.limit(ModelHardLimits.MAX_TITLES)
.map(GraphCleaningFunctions::shortenTitles)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getFormat())) {
@ -815,6 +804,29 @@ public class GraphCleaningFunctions extends CleaningFunctions {
return value;
}
/**
 * Caps the length of a title value at {@link ModelHardLimits#MAX_TITLE_LENGTH} UTF-16 code units.
 * <p>
 * The given property is modified in place (via {@code setValue}) and returned, so that the method can be
 * used as a {@code map} step in a stream pipeline. Values that are {@code null} or already short enough are
 * left untouched.
 *
 * @param title the title to shorten; must not be {@code null}
 * @return the same {@code title} instance, whose value is at most {@code MAX_TITLE_LENGTH} code units long
 */
private static StructuredProperty shortenTitles(StructuredProperty title) {
	final String value = title.getValue();
	if (value == null || value.length() <= ModelHardLimits.MAX_TITLE_LENGTH) {
		return title;
	}

	int end = ModelHardLimits.MAX_TITLE_LENGTH;
	// Never cut between the two halves of a surrogate pair: a dangling high surrogate is not a valid
	// character and is typically turned into '?' (or rejected) when the string is encoded.
	// value.charAt(end) is safe here because value.length() > end.
	if (end > 0
		&& Character.isHighSurrogate(value.charAt(end - 1))
		&& Character.isLowSurrogate(value.charAt(end))) {
		end--;
	}

	title.setValue(value.substring(0, end));
	return title;
}
/**
 * Stream predicate telling whether a title carries enough content to be kept.
 * <p>
 * The title value is lower-cased and passed through {@code Unidecode.decode} (presumably an ASCII
 * transliteration - confirm against the Unidecode library), then:
 * <ul>
 * <li>if the decoded text contains {@code TITLE_TEST}, the title is kept only when the text left after
 * removing every match of {@code TITLE_FILTER_REGEX} is longer than {@code TITLE_FILTER_RESIDUAL_LENGTH};</li>
 * <li>otherwise the title is kept when at least one character survives the removal of all non-word
 * characters and digits (regex <code>\W|\d</code>).</li>
 * </ul>
 *
 * @param sp the title to check; both the property and its value must be non-null
 * @return {@code true} when the title should be kept, {@code false} when it should be discarded
 */
private static boolean checkTestTitle(StructuredProperty sp) {
	// Locale.ROOT makes the lower-casing (and therefore the outcome of the filter) independent of the
	// default locale of the JVM running the job.
	final String title = sp.getValue().toLowerCase(java.util.Locale.ROOT);
	final String decoded = Unidecode.decode(title);

	if (StringUtils.contains(decoded, TITLE_TEST)) {
		final String residual = decoded.replaceAll(TITLE_FILTER_REGEX, "");
		return residual.length() > TITLE_FILTER_RESIDUAL_LENGTH;
	}

	// keep the title only if something other than punctuation, whitespace and digits is left
	final String wordCharsOnly = decoded.replaceAll("\\W|\\d", "");
	return !wordCharsOnly.isEmpty();
}
private static Author cleanupAuthor(Author author) {
if (StringUtils.isNotBlank(author.getFullname())) {
author

View File

@ -20,7 +20,7 @@ public class ModelHardLimits {
public static final int MAX_AUTHORS = 200;
public static final int MAX_RELATED_AUTHORS = 20;
public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
public static final int MAX_TITLE_LENGTH = 5000;
public static final int MAX_TITLE_LENGTH = 500;
public static final int MAX_TITLES = 10;
public static final int MAX_ABSTRACTS = 10;
public static final int MAX_ABSTRACT_LENGTH = 150000;