From 49f897ef29721af14960de650e187a71cd485d99 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 16 Nov 2021 15:24:23 +0100 Subject: [PATCH] [cleaning wf] fixed regex used to spot garbage in result titles; adjusted threshold for filtering titles --- .../dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index d8b1cded8..43413b311 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -27,8 +27,8 @@ public class GraphCleaningFunctions extends CleaningFunctions { public static final int ORCID_LEN = 19; public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*"; - public static final String TITLE_FILTER_REGEX = "[.*test.*\\W\\d]"; - public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10; + public static final String TITLE_FILTER_REGEX = "(test)|\\W|\\d"; + public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5; public static T fixVocabularyNames(T value) { if (value instanceof Datasource) {