[cleaning workflow] fixed the methodology used to rule out invalid result titles, based on https://support.openaire.eu/issues/7206

This commit is contained in:
Claudio Atzori 2021-11-17 14:17:22 +01:00
parent 49f897ef29
commit 82a4e4efae
3 changed files with 95 additions and 7 deletions

View File

@ -27,7 +27,10 @@ public class GraphCleaningFunctions extends CleaningFunctions {
public static final int ORCID_LEN = 19; public static final int ORCID_LEN = 19;
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*"; public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
public static final String TITLE_FILTER_REGEX = "(test)|\\W|\\d";
public static final String TITLE_TEST = "test";
public static final String TITLE_FILTER_REGEX = String.format("(%s)|\\W|\\d", TITLE_TEST);
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5; public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
public static <T extends Oaf> T fixVocabularyNames(T value) { public static <T extends Oaf> T fixVocabularyNames(T value) {
@ -195,10 +198,16 @@ public class GraphCleaningFunctions extends CleaningFunctions {
final String title = sp final String title = sp
.getValue() .getValue()
.toLowerCase(); .toLowerCase();
final String residual = Unidecode final String decoded = Unidecode.decode(title);
.decode(title)
.replaceAll(TITLE_FILTER_REGEX, ""); if (StringUtils.contains(decoded, TITLE_TEST)) {
return residual.length() > TITLE_FILTER_RESIDUAL_LENGTH; return decoded
.replaceAll(TITLE_FILTER_REGEX, "")
.length() > TITLE_FILTER_RESIDUAL_LENGTH;
}
return !decoded
.replaceAll("\\W|\\d", "")
.isEmpty();
}) })
.map(GraphCleaningFunctions::cleanValue) .map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList())); .collect(Collectors.toList()));

View File

@ -8,6 +8,7 @@ import java.io.IOException;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
@ -137,9 +138,21 @@ public class GraphCleaningFunctionsTest {
.stream() .stream()
.anyMatch(s -> s.getValue().equals("10.1009/qwerty"))); .anyMatch(s -> s.getValue().equals("10.1009/qwerty")));
assertEquals(5, p_out.getTitle().size());
Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out); Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out);
assertEquals(1, p_cleaned.getTitle().size()); assertEquals(3, p_cleaned.getTitle().size());
List<String> titles = p_cleaned
.getTitle()
.stream()
.map(StructuredProperty::getValue)
.collect(Collectors.toList());
assertTrue(titles.contains("omic"));
assertTrue(
titles.contains("Optical response of strained- and unstrained-silicon cold-electron bolometers test"));
assertTrue(titles.contains("「マキャベリ的知性と心の理論の進化論」 リチャード・バーン, アンドリュー・ホワイトゥン 編/藤田和生, 山下博志, 友永雅巳 監訳"));
assertEquals("CLOSED", p_cleaned.getBestaccessright().getClassid()); assertEquals("CLOSED", p_cleaned.getBestaccessright().getClassid());
assertNull(p_out.getPublisher()); assertNull(p_out.getPublisher());

View File

@ -864,7 +864,7 @@
"schemeid": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title",
"schemename": "dnet:dataCite_title" "schemename": "dnet:dataCite_title"
}, },
"value": "Optical response of strained- and unstrained-silicon cold-electron bolometers" "value": "Optical response of strained- and unstrained-silicon cold-electron bolometers test"
}, },
{ {
"dataInfo": { "dataInfo": {
@ -887,6 +887,72 @@
"schemename": "dnet:dataCite_title" "schemename": "dnet:dataCite_title"
}, },
"value": "test test 123 test" "value": "test test 123 test"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "main title",
"classname": "main title",
"schemeid": "dnet:dataCite_title",
"schemename": "dnet:dataCite_title"
},
"value": "omic"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "main title",
"classname": "main title",
"schemeid": "dnet:dataCite_title",
"schemename": "dnet:dataCite_title"
},
"value": "「マキャベリ的知性と心の理論の進化論」 リチャード・バーン, アンドリュー・ホワイトゥン 編/藤田和生, 山下博志, 友永雅巳 監訳"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "main title",
"classname": "main title",
"schemeid": "dnet:dataCite_title",
"schemename": "dnet:dataCite_title"
},
"value": "-"
} }
] ]
} }